From 6afe1a1fe8ff83f6ac2726b04665e76ba7b14f3e Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Thu, 13 Mar 2008 23:52:49 +0100
Subject: PM: Remove legacy PM

AFAICT pm_send_all is a nop when noone uses pm_register...

Hmm.. can we just force CONFIG_PM_LEGACY=n, and see what happens?

Or maybe this is better idea? It may break build somewhere, but it
should be easy to fix... (it builds here, i386 and x86-64).

Signed-off-by: Pavel Machek <pavel@suse.cz>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/Kconfig  |  10 ---
 kernel/power/Makefile |   1 -
 kernel/power/pm.c     | 205 --------------------------------------------------
 3 files changed, 216 deletions(-)
 delete mode 100644 kernel/power/pm.c

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6233f3b4ae66..b45da40e8d25 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,16 +19,6 @@ config PM
 	  will issue the hlt instruction if nothing is to be done, thereby
 	  sending the processor to sleep and saving power.
 
-config PM_LEGACY
-	bool "Legacy Power Management API (DEPRECATED)"
-	depends on PM
-	default n
-	---help---
-	   Support for pm_register() and friends.  This old API is obsoleted
-	   by the driver model.
-
-	   If unsure, say N.
-
 config PM_DEBUG
 	bool "Power Management Debug Support"
 	depends on PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f7dfff28ecdb..597823b5b700 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,6 @@ EXTRA_CFLAGS	+=	-DDEBUG
 endif
 
 obj-y				:= main.o
-obj-$(CONFIG_PM_LEGACY)		+= pm.o
 obj-$(CONFIG_PM_SLEEP)		+= process.o console.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o disk.o snapshot.o swap.o user.o
 
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
deleted file mode 100644
index 60c73fa670d5..000000000000
--- a/kernel/power/pm.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- *  pm.c - Power management interface
- *
- *  Copyright (C) 2000 Andrew Henroid
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/pm.h>
-#include <linux/pm_legacy.h>
-#include <linux/interrupt.h>
-#include <linux/mutex.h>
-
-/*
- *	Locking notes:
- *		pm_devs_lock can be a semaphore providing pm ops are not called
- *	from an interrupt handler (already a bad idea so no change here). Each
- *	change must be protected so that an unlink of an entry doesn't clash
- *	with a pm send - which is permitted to sleep in the current architecture
- *
- *	Module unloads clashing with pm events now work out safely, the module 
- *	unload path will block until the event has been sent. It may well block
- *	until a resume but that will be fine.
- */
- 
-static DEFINE_MUTEX(pm_devs_lock);
-static LIST_HEAD(pm_devs);
-
-/**
- *	pm_register - register a device with power management
- *	@type: device type 
- *	@id: device ID
- *	@callback: callback function
- *
- *	Add a device to the list of devices that wish to be notified about
- *	power management events. A &pm_dev structure is returned on success,
- *	on failure the return is %NULL.
- *
- *      The callback function will be called in process context and
- *      it may sleep.
- */
- 
-struct pm_dev *pm_register(pm_dev_t type,
-			   unsigned long id,
-			   pm_callback callback)
-{
-	struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
-	if (dev) {
-		dev->type = type;
-		dev->id = id;
-		dev->callback = callback;
-
-		mutex_lock(&pm_devs_lock);
-		list_add(&dev->entry, &pm_devs);
-		mutex_unlock(&pm_devs_lock);
-	}
-	return dev;
-}
-
-/**
- *	pm_send - send request to a single device
- *	@dev: device to send to
- *	@rqst: power management request
- *	@data: data for the callback
- *
- *	Issue a power management request to a given device. The 
- *	%PM_SUSPEND and %PM_RESUME events are handled specially. The
- *	data field must hold the intended next state. No call is made
- *	if the state matches.
- *
- *	BUGS: what stops two power management requests occurring in parallel
- *	and conflicting.
- *
- *	WARNING: Calling pm_send directly is not generally recommended, in
- *	particular there is no locking against the pm_dev going away. The
- *	caller must maintain all needed locking or have 'inside knowledge'
- *	on the safety. Also remember that this function is not locked against
- *	pm_unregister. This means that you must handle SMP races on callback
- *	execution and unload yourself.
- */
- 
-static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
-{
-	int status = 0;
-	unsigned long prev_state, next_state;
-
-	if (in_interrupt())
-		BUG();
-
-	switch (rqst) {
-	case PM_SUSPEND:
-	case PM_RESUME:
-		prev_state = dev->state;
-		next_state = (unsigned long) data;
-		if (prev_state != next_state) {
-			if (dev->callback)
-				status = (*dev->callback)(dev, rqst, data);
-			if (!status) {
-				dev->state = next_state;
-				dev->prev_state = prev_state;
-			}
-		}
-		else {
-			dev->prev_state = prev_state;
-		}
-		break;
-	default:
-		if (dev->callback)
-			status = (*dev->callback)(dev, rqst, data);
-		break;
-	}
-	return status;
-}
-
-/*
- * Undo incomplete request
- */
-static void pm_undo_all(struct pm_dev *last)
-{
-	struct list_head *entry = last->entry.prev;
-	while (entry != &pm_devs) {
-		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
-		if (dev->state != dev->prev_state) {
-			/* previous state was zero (running) resume or
-			 * previous state was non-zero (suspended) suspend
-			 */
-			pm_request_t undo = (dev->prev_state
-					     ? PM_SUSPEND:PM_RESUME);
-			pm_send(dev, undo, (void*) dev->prev_state);
-		}
-		entry = entry->prev;
-	}
-}
-
-/**
- *	pm_send_all - send request to all managed devices
- *	@rqst: power management request
- *	@data: data for the callback
- *
- *	Issue a power management request to a all devices. The 
- *	%PM_SUSPEND events are handled specially. Any device is 
- *	permitted to fail a suspend by returning a non zero (error)
- *	value from its callback function. If any device vetoes a 
- *	suspend request then all other devices that have suspended 
- *	during the processing of this request are restored to their
- *	previous state.
- *
- *	WARNING:  This function takes the pm_devs_lock. The lock is not dropped until
- *	the callbacks have completed. This prevents races against pm locking
- *	functions, races against module unload pm_unregister code. It does
- *	mean however that you must not issue pm_ functions within the callback
- *	or you will deadlock and users will hate you.
- *
- *	Zero is returned on success. If a suspend fails then the status
- *	from the device that vetoes the suspend is returned.
- *
- *	BUGS: what stops two power management requests occurring in parallel
- *	and conflicting.
- */
- 
-int pm_send_all(pm_request_t rqst, void *data)
-{
-	struct list_head *entry;
-	
-	mutex_lock(&pm_devs_lock);
-	entry = pm_devs.next;
-	while (entry != &pm_devs) {
-		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
-		if (dev->callback) {
-			int status = pm_send(dev, rqst, data);
-			if (status) {
-				/* return devices to previous state on
-				 * failed suspend request
-				 */
-				if (rqst == PM_SUSPEND)
-					pm_undo_all(dev);
-				mutex_unlock(&pm_devs_lock);
-				return status;
-			}
-		}
-		entry = entry->next;
-	}
-	mutex_unlock(&pm_devs_lock);
-	return 0;
-}
-
-EXPORT_SYMBOL(pm_register);
-EXPORT_SYMBOL(pm_send_all);
-
-- 
cgit v1.3-14-g43fede


From 436c405c7d19455a71f42c9bec5fd5e028f1eb4e Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 18 Apr 2008 10:01:04 -0400
Subject: Audit: end printk with newline

A couple of audit printk statements did not have a newline.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 56e56ed594a8..d7249fcdc442 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1596,7 +1596,7 @@ static inline void handle_one(const struct inode *inode)
 	if (likely(put_tree_ref(context, chunk)))
 		return;
 	if (unlikely(!grow_tree_refs(context))) {
-		printk(KERN_WARNING "out of memory, audit has lost a tree reference");
+		printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
 		audit_set_auditable(context);
 		audit_put_chunk(chunk);
 		unroll_tree_refs(context, p, count);
@@ -1656,7 +1656,7 @@ retry:
 		}
 		/* too bad */
 		printk(KERN_WARNING
-			"out of memory, audit has lost a tree reference");
+			"out of memory, audit has lost a tree reference\n");
 		unroll_tree_refs(context, p, count);
 		audit_set_auditable(context);
 		return;
@@ -1752,13 +1752,13 @@ static int audit_inc_name_count(struct audit_context *context,
 	if (context->name_count >= AUDIT_NAMES) {
 		if (inode)
 			printk(KERN_DEBUG "name_count maxed, losing inode data: "
-			       "dev=%02x:%02x, inode=%lu",
+			       "dev=%02x:%02x, inode=%lu\n",
 			       MAJOR(inode->i_sb->s_dev),
 			       MINOR(inode->i_sb->s_dev),
 			       inode->i_ino);
 
 		else
-			printk(KERN_DEBUG "name_count maxed, losing inode data");
+			printk(KERN_DEBUG "name_count maxed, losing inode data\n");
 		return 1;
 	}
 	context->name_count++;
-- 
cgit v1.3-14-g43fede


From 2532386f480eefbdd67b48be55fb4fb3e5a6081c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 18 Apr 2008 10:09:25 -0400
Subject: Audit: collect sessionid in netlink messages

Previously I added sessionid output to all audit messages where it was
available but we still didn't know the sessionid of the sender of
netlink messages.  This patch adds that information to netlink messages
so we can audit who sent netlink messages.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/char/tty_audit.c          |  7 +---
 include/linux/audit.h             |  3 +-
 include/linux/netlink.h           |  1 +
 include/linux/tty.h               |  4 +--
 include/net/netlabel.h            |  1 +
 include/net/xfrm.h                | 23 +++++++------
 kernel/audit.c                    | 72 ++++++++++++++++++++++-----------------
 kernel/auditfilter.c              | 16 +++++----
 net/key/af_key.c                  | 17 ++++++---
 net/netlabel/netlabel_unlabeled.c |  1 +
 net/netlabel/netlabel_user.c      |  4 ++-
 net/netlabel/netlabel_user.h      |  1 +
 net/netlink/af_netlink.c          |  1 +
 net/xfrm/xfrm_policy.c            | 12 ++++---
 net/xfrm/xfrm_state.c             | 13 ++++---
 net/xfrm/xfrm_user.c              | 41 +++++++++++++++-------
 security/smack/smackfs.c          |  2 ++
 17 files changed, 132 insertions(+), 87 deletions(-)

(limited to 'kernel')

diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c
index 7722466e052f..9739bbfc8f70 100644
--- a/drivers/char/tty_audit.c
+++ b/drivers/char/tty_audit.c
@@ -151,14 +151,9 @@ void tty_audit_fork(struct signal_struct *sig)
 /**
  *	tty_audit_push_task	-	Flush task's pending audit data
  */
-void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid)
+void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid, u32 sessionid)
 {
 	struct tty_audit_buf *buf;
-	/* FIXME I think this is correct.  Check against netlink once that is
-	 * I really need to read this code more closely.  But that's for
-	 * another patch.
-	 */
-	unsigned int sessionid = audit_get_sessionid(tsk);
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	buf = tsk->signal->tty_audit_buf;
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 4ccb048cae1d..25f6ae30dd4b 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -569,7 +569,8 @@ extern int		    audit_update_lsm_rules(void);
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 extern int audit_filter_type(int type);
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
-			 void *data, size_t datasz, uid_t loginuid, u32 sid);
+				void *data, size_t datasz, uid_t loginuid,
+				u32 sessionid, u32 sid);
 extern int audit_enabled;
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index fb0713b6ffaf..bec1062a25a1 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -166,6 +166,7 @@ struct netlink_skb_parms
 	__u32			dst_group;
 	kernel_cap_t		eff_cap;
 	__u32			loginuid;	/* Login (audit) uid */
+	__u32			sessionid;	/* Session id (audit) */
 	__u32			sid;		/* SELinux security id */
 };
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index dd8e08fe8855..430624504ca0 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -351,7 +351,7 @@ extern void tty_audit_add_data(struct tty_struct *tty, unsigned char *data,
 extern void tty_audit_exit(void);
 extern void tty_audit_fork(struct signal_struct *sig);
 extern void tty_audit_push(struct tty_struct *tty);
-extern void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid);
+extern void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid, u32 sessionid);
 extern void tty_audit_opening(void);
 #else
 static inline void tty_audit_add_data(struct tty_struct *tty,
@@ -367,7 +367,7 @@ static inline void tty_audit_fork(struct signal_struct *sig)
 static inline void tty_audit_push(struct tty_struct *tty)
 {
 }
-static inline void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid)
+static inline void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid, u32 sessionid)
 {
 }
 static inline void tty_audit_opening(void)
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 5e53a85b5ca1..e4d2d6baa983 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -103,6 +103,7 @@ struct cipso_v4_doi;
 struct netlbl_audit {
 	u32 secid;
 	uid_t loginuid;
+	u32 sessionid;
 };
 
 /*
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index baa9f372cfd1..d1350bcccb03 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -597,8 +597,9 @@ struct xfrm_spi_skb_cb {
 /* Audit Information */
 struct xfrm_audit
 {
-	u32	loginuid;
 	u32	secid;
+	uid_t	loginuid;
+	u32	sessionid;
 };
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -616,13 +617,13 @@ static inline struct audit_buffer *xfrm_audit_start(const char *op)
 	return audit_buf;
 }
 
-static inline void xfrm_audit_helper_usrinfo(u32 auid, u32 secid,
+static inline void xfrm_audit_helper_usrinfo(uid_t auid, u32 ses, u32 secid,
 					     struct audit_buffer *audit_buf)
 {
 	char *secctx;
 	u32 secctx_len;
 
-	audit_log_format(audit_buf, " auid=%u", auid);
+	audit_log_format(audit_buf, " auid=%u ses=%u", auid, ses);
 	if (secid != 0 &&
 	    security_secid_to_secctx(secid, &secctx, &secctx_len) == 0) {
 		audit_log_format(audit_buf, " subj=%s", secctx);
@@ -632,13 +633,13 @@ static inline void xfrm_audit_helper_usrinfo(u32 auid, u32 secid,
 }
 
 extern void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
-				  u32 auid, u32 secid);
+				  u32 auid, u32 ses, u32 secid);
 extern void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
-				  u32 auid, u32 secid);
+				  u32 auid, u32 ses, u32 secid);
 extern void xfrm_audit_state_add(struct xfrm_state *x, int result,
-				 u32 auid, u32 secid);
+				 u32 auid, u32 ses, u32 secid);
 extern void xfrm_audit_state_delete(struct xfrm_state *x, int result,
-				    u32 auid, u32 secid);
+				    u32 auid, u32 ses, u32 secid);
 extern void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
 					     struct sk_buff *skb);
 extern void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family);
@@ -647,10 +648,10 @@ extern void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
 extern void xfrm_audit_state_icvfail(struct xfrm_state *x,
 				     struct sk_buff *skb, u8 proto);
 #else
-#define xfrm_audit_policy_add(x, r, a, s)	do { ; } while (0)
-#define xfrm_audit_policy_delete(x, r, a, s)	do { ; } while (0)
-#define xfrm_audit_state_add(x, r, a, s)	do { ; } while (0)
-#define xfrm_audit_state_delete(x, r, a, s)	do { ; } while (0)
+#define xfrm_audit_policy_add(x, r, a, se, s)	do { ; } while (0)
+#define xfrm_audit_policy_delete(x, r, a, se, s)	do { ; } while (0)
+#define xfrm_audit_state_add(x, r, a, se, s)	do { ; } while (0)
+#define xfrm_audit_state_delete(x, r, a, se, s)	do { ; } while (0)
 #define xfrm_audit_state_replay_overflow(x, s)	do { ; } while (0)
 #define xfrm_audit_state_notfound_simple(s, f)	do { ; } while (0)
 #define xfrm_audit_state_notfound(s, f, sp, sq)	do { ; } while (0)
diff --git a/kernel/audit.c b/kernel/audit.c
index a7b16086d36f..ad6d1abfa1d2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -252,14 +252,15 @@ void audit_log_lost(const char *message)
 }
 
 static int audit_log_config_change(char *function_name, int new, int old,
-				   uid_t loginuid, u32 sid, int allow_changes)
+				   uid_t loginuid, u32 sessionid, u32 sid,
+				   int allow_changes)
 {
 	struct audit_buffer *ab;
 	int rc = 0;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-	audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new,
-			 old, loginuid);
+	audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
+			 old, loginuid, sessionid);
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
@@ -279,7 +280,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
 }
 
 static int audit_do_config_change(char *function_name, int *to_change,
-				  int new, uid_t loginuid, u32 sid)
+				  int new, uid_t loginuid, u32 sessionid,
+				  u32 sid)
 {
 	int allow_changes, rc = 0, old = *to_change;
 
@@ -290,8 +292,8 @@ static int audit_do_config_change(char *function_name, int *to_change,
 		allow_changes = 1;
 
 	if (audit_enabled != AUDIT_OFF) {
-		rc = audit_log_config_change(function_name, new, old,
-					     loginuid, sid, allow_changes);
+		rc = audit_log_config_change(function_name, new, old, loginuid,
+					     sessionid, sid, allow_changes);
 		if (rc)
 			allow_changes = 0;
 	}
@@ -305,26 +307,28 @@ static int audit_do_config_change(char *function_name, int *to_change,
 	return rc;
 }
 
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
+static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
+				u32 sid)
 {
 	return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
-				      limit, loginuid, sid);
+				      limit, loginuid, sessionid, sid);
 }
 
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
+static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
+				   u32 sid)
 {
 	return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
-				      limit, loginuid, sid);
+				      limit, loginuid, sessionid, sid);
 }
 
-static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
+static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	int rc;
 	if (state < AUDIT_OFF || state > AUDIT_LOCKED)
 		return -EINVAL;
 
 	rc =  audit_do_config_change("audit_enabled", &audit_enabled, state,
-				     loginuid, sid);
+				     loginuid, sessionid, sid);
 
 	if (!rc)
 		audit_ever_enabled |= !!state;
@@ -332,7 +336,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 	return rc;
 }
 
-static int audit_set_failure(int state, uid_t loginuid, u32 sid)
+static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	if (state != AUDIT_FAIL_SILENT
 	    && state != AUDIT_FAIL_PRINTK
@@ -340,7 +344,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 		return -EINVAL;
 
 	return audit_do_config_change("audit_failure", &audit_failure, state,
-				      loginuid, sid);
+				      loginuid, sessionid, sid);
 }
 
 static int kauditd_thread(void *dummy)
@@ -385,7 +389,7 @@ static int kauditd_thread(void *dummy)
 	return 0;
 }
 
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
+static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
 {
 	struct task_struct *tsk;
 	int err;
@@ -404,7 +408,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
 	if (err)
 		goto out;
 
-	tty_audit_push_task(tsk, loginuid);
+	tty_audit_push_task(tsk, loginuid, sessionid);
 out:
 	read_unlock(&tasklist_lock);
 	return err;
@@ -534,7 +538,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 }
 
 static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
-				     u32 pid, u32 uid, uid_t auid, u32 sid)
+				     u32 pid, u32 uid, uid_t auid, u32 ses,
+				     u32 sid)
 {
 	int rc = 0;
 	char *ctx = NULL;
@@ -546,8 +551,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
 	}
 
 	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
-	audit_log_format(*ab, "user pid=%d uid=%u auid=%u",
-			 pid, uid, auid);
+	audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+			 pid, uid, auid, ses);
 	if (sid) {
 		rc = security_secid_to_secctx(sid, &ctx, &len);
 		if (rc)
@@ -570,6 +575,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
+	u32			sessionid;
 	struct audit_sig_info   *sig_data;
 	char			*ctx = NULL;
 	u32			len;
@@ -591,6 +597,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	pid  = NETLINK_CREDS(skb)->pid;
 	uid  = NETLINK_CREDS(skb)->uid;
 	loginuid = NETLINK_CB(skb).loginuid;
+	sessionid = NETLINK_CB(skb).sessionid;
 	sid  = NETLINK_CB(skb).sid;
 	seq  = nlh->nlmsg_seq;
 	data = NLMSG_DATA(nlh);
@@ -613,12 +620,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		status_get   = (struct audit_status *)data;
 		if (status_get->mask & AUDIT_STATUS_ENABLED) {
 			err = audit_set_enabled(status_get->enabled,
-							loginuid, sid);
+						loginuid, sessionid, sid);
 			if (err < 0) return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_FAILURE) {
 			err = audit_set_failure(status_get->failure,
-							 loginuid, sid);
+						loginuid, sessionid, sid);
 			if (err < 0) return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_PID) {
@@ -627,17 +634,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			if (audit_enabled != AUDIT_OFF)
 				audit_log_config_change("audit_pid", new_pid,
 							audit_pid, loginuid,
-							sid, 1);
+							sessionid, sid, 1);
 
 			audit_pid = new_pid;
 			audit_nlk_pid = NETLINK_CB(skb).pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
 			err = audit_set_rate_limit(status_get->rate_limit,
-							 loginuid, sid);
+						   loginuid, sessionid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
 			err = audit_set_backlog_limit(status_get->backlog_limit,
-							loginuid, sid);
+						      loginuid, sessionid, sid);
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -649,12 +656,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (err == 1) {
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
-				err = audit_prepare_user_tty(pid, loginuid);
+				err = audit_prepare_user_tty(pid, loginuid,
+							     sessionid);
 				if (err)
 					break;
 			}
 			audit_log_common_recv_msg(&ab, msg_type, pid, uid,
-						  loginuid, sid);
+						  loginuid, sessionid, sid);
 
 			if (msg_type != AUDIT_USER_TTY)
 				audit_log_format(ab, " msg='%.1024s'",
@@ -677,7 +685,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
 			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-						  uid, loginuid, sid);
+						  uid, loginuid, sessionid, sid);
 
 			audit_log_format(ab, " audit_enabled=%d res=0",
 					 audit_enabled);
@@ -688,7 +696,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_LIST:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
-					   loginuid, sid);
+					   loginuid, sessionid, sid);
 		break;
 	case AUDIT_ADD_RULE:
 	case AUDIT_DEL_RULE:
@@ -696,7 +704,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
 			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-						  uid, loginuid, sid);
+						  uid, loginuid, sessionid, sid);
 
 			audit_log_format(ab, " audit_enabled=%d res=0",
 					 audit_enabled);
@@ -707,13 +715,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_LIST_RULES:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
-					   loginuid, sid);
+					   loginuid, sessionid, sid);
 		break;
 	case AUDIT_TRIM:
 		audit_trim_trees();
 
 		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-					  uid, loginuid, sid);
+					  uid, loginuid, sessionid, sid);
 
 		audit_log_format(ab, " op=trim res=1");
 		audit_log_end(ab);
@@ -745,7 +753,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		err = audit_tag_tree(old, new);
 
 		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-					  uid, loginuid, sid);
+					  uid, loginuid, sessionid, sid);
 
 		audit_log_format(ab, " op=make_equiv old=");
 		audit_log_untrustedstring(ab, old);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 28fef6bf8534..af3ae91c47b1 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1500,8 +1500,9 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
 }
 
 /* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
-				  struct audit_krule *rule, int res)
+static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
+				  char *action, struct audit_krule *rule,
+				  int res)
 {
 	struct audit_buffer *ab;
 
@@ -1511,7 +1512,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
 	if (!ab)
 		return;
-	audit_log_format(ab, "auid=%u", loginuid);
+	audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
@@ -1543,7 +1544,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
  * @sid: SE Linux Security ID of sender
  */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-			 size_t datasz, uid_t loginuid, u32 sid)
+			 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	struct task_struct *tsk;
 	struct audit_netlink_list *dest;
@@ -1590,7 +1591,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_add_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err);
+		audit_log_rule_change(loginuid, sessionid, sid, "add",
+				      &entry->rule, !err);
 
 		if (err)
 			audit_free_rule(entry);
@@ -1606,8 +1608,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_del_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		audit_log_rule_change(loginuid, sid, "remove", &entry->rule,
-				      !err);
+		audit_log_rule_change(loginuid, sessionid, sid, "remove",
+				      &entry->rule, !err);
 
 		audit_free_rule(entry);
 		break;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 2403a31fe0f6..9e7236ff6bcc 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1498,7 +1498,8 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 		err = xfrm_state_update(x);
 
 	xfrm_audit_state_add(x, err ? 0 : 1,
-			     audit_get_loginuid(current), 0);
+			     audit_get_loginuid(current),
+			     audit_get_sessionid(current), 0);
 
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
@@ -1552,7 +1553,8 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	km_state_notify(x, &c);
 out:
 	xfrm_audit_state_delete(x, err ? 0 : 1,
-			       audit_get_loginuid(current), 0);
+				audit_get_loginuid(current),
+				audit_get_sessionid(current), 0);
 	xfrm_state_put(x);
 
 	return err;
@@ -1728,6 +1730,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hd
 		return -EINVAL;
 
 	audit_info.loginuid = audit_get_loginuid(current);
+	audit_info.sessionid = audit_get_sessionid(current);
 	audit_info.secid = 0;
 	err = xfrm_state_flush(proto, &audit_info);
 	if (err)
@@ -2324,7 +2327,8 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 				 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
 
 	xfrm_audit_policy_add(xp, err ? 0 : 1,
-			     audit_get_loginuid(current), 0);
+			      audit_get_loginuid(current),
+			      audit_get_sessionid(current), 0);
 
 	if (err)
 		goto out;
@@ -2406,7 +2410,8 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 		return -ENOENT;
 
 	xfrm_audit_policy_delete(xp, err ? 0 : 1,
-				audit_get_loginuid(current), 0);
+				 audit_get_loginuid(current),
+				 audit_get_sessionid(current), 0);
 
 	if (err)
 		goto out;
@@ -2667,7 +2672,8 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 
 	if (delete) {
 		xfrm_audit_policy_delete(xp, err ? 0 : 1,
-				audit_get_loginuid(current), 0);
+				audit_get_loginuid(current),
+				audit_get_sessionid(current), 0);
 
 		if (err)
 			goto out;
@@ -2767,6 +2773,7 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	int err;
 
 	audit_info.loginuid = audit_get_loginuid(current);
+	audit_info.sessionid = audit_get_sessionid(current);
 	audit_info.secid = 0;
 	err = xfrm_policy_flush(XFRM_POLICY_TYPE_MAIN, &audit_info);
 	if (err)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index d282ad1570a7..0099da5b2591 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1780,6 +1780,7 @@ int __init netlbl_unlabel_defconf(void)
 	 * messages so don't worry to much about these values. */
 	security_task_getsecid(current, &audit_info.secid);
 	audit_info.loginuid = 0;
+	audit_info.sessionid = 0;
 
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 	if (entry == NULL)
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index b17d4203806e..68706b4e3bf8 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -107,7 +107,9 @@ struct audit_buffer *netlbl_audit_start_common(int type,
 	if (audit_buf == NULL)
 		return NULL;
 
-	audit_log_format(audit_buf, "netlabel: auid=%u", audit_info->loginuid);
+	audit_log_format(audit_buf, "netlabel: auid=%u ses=%u",
+			 audit_info->loginuid,
+			 audit_info->sessionid);
 
 	if (audit_info->secid != 0 &&
 	    security_secid_to_secctx(audit_info->secid,
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
index 6d7f4ab46c2b..6caef8b20611 100644
--- a/net/netlabel/netlabel_user.h
+++ b/net/netlabel/netlabel_user.h
@@ -51,6 +51,7 @@ static inline void netlbl_netlink_auditinfo(struct sk_buff *skb,
 {
 	audit_info->secid = NETLINK_CB(skb).sid;
 	audit_info->loginuid = NETLINK_CB(skb).loginuid;
+	audit_info->sessionid = NETLINK_CB(skb).sessionid;
 }
 
 /* NetLabel NETLINK I/O functions */
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 46f3e44bb83a..9b97f8006c9c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1248,6 +1248,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	NETLINK_CB(skb).pid	= nlk->pid;
 	NETLINK_CB(skb).dst_group = dst_group;
 	NETLINK_CB(skb).loginuid = audit_get_loginuid(current);
+	NETLINK_CB(skb).sessionid = audit_get_sessionid(current);
 	security_task_getsecid(current, &(NETLINK_CB(skb).sid));
 	memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index e0c0390613c0..cae9fd815543 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -762,6 +762,7 @@ xfrm_policy_flush_secctx_check(u8 type, struct xfrm_audit *audit_info)
 			if (err) {
 				xfrm_audit_policy_delete(pol, 0,
 							 audit_info->loginuid,
+							 audit_info->sessionid,
 							 audit_info->secid);
 				return err;
 			}
@@ -777,6 +778,7 @@ xfrm_policy_flush_secctx_check(u8 type, struct xfrm_audit *audit_info)
 				if (err) {
 					xfrm_audit_policy_delete(pol, 0,
 							audit_info->loginuid,
+							audit_info->sessionid,
 							audit_info->secid);
 					return err;
 				}
@@ -819,6 +821,7 @@ int xfrm_policy_flush(u8 type, struct xfrm_audit *audit_info)
 			write_unlock_bh(&xfrm_policy_lock);
 
 			xfrm_audit_policy_delete(pol, 1, audit_info->loginuid,
+						 audit_info->sessionid,
 						 audit_info->secid);
 
 			xfrm_policy_kill(pol);
@@ -841,6 +844,7 @@ int xfrm_policy_flush(u8 type, struct xfrm_audit *audit_info)
 
 				xfrm_audit_policy_delete(pol, 1,
 							 audit_info->loginuid,
+							 audit_info->sessionid,
 							 audit_info->secid);
 				xfrm_policy_kill(pol);
 				killed++;
@@ -2472,14 +2476,14 @@ static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
 }
 
 void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
-			   u32 auid, u32 secid)
+			   uid_t auid, u32 sessionid, u32 secid)
 {
 	struct audit_buffer *audit_buf;
 
 	audit_buf = xfrm_audit_start("SPD-add");
 	if (audit_buf == NULL)
 		return;
-	xfrm_audit_helper_usrinfo(auid, secid, audit_buf);
+	xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf);
 	audit_log_format(audit_buf, " res=%u", result);
 	xfrm_audit_common_policyinfo(xp, audit_buf);
 	audit_log_end(audit_buf);
@@ -2487,14 +2491,14 @@ void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
 EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);
 
 void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
-			      u32 auid, u32 secid)
+			      uid_t auid, u32 sessionid, u32 secid)
 {
 	struct audit_buffer *audit_buf;
 
 	audit_buf = xfrm_audit_start("SPD-delete");
 	if (audit_buf == NULL)
 		return;
-	xfrm_audit_helper_usrinfo(auid, secid, audit_buf);
+	xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf);
 	audit_log_format(audit_buf, " res=%u", result);
 	xfrm_audit_common_policyinfo(xp, audit_buf);
 	audit_log_end(audit_buf);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5dcc10b93c86..c3f5f70934ec 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -496,7 +496,8 @@ expired:
 		km_state_expired(x, 1, 0);
 
 	xfrm_audit_state_delete(x, err ? 0 : 1,
-				audit_get_loginuid(current), 0);
+				audit_get_loginuid(current),
+				audit_get_sessionid(current), 0);
 
 out:
 	spin_unlock(&x->lock);
@@ -603,6 +604,7 @@ xfrm_state_flush_secctx_check(u8 proto, struct xfrm_audit *audit_info)
 			   (err = security_xfrm_state_delete(x)) != 0) {
 				xfrm_audit_state_delete(x, 0,
 							audit_info->loginuid,
+							audit_info->sessionid,
 							audit_info->secid);
 				return err;
 			}
@@ -641,6 +643,7 @@ restart:
 				err = xfrm_state_delete(x);
 				xfrm_audit_state_delete(x, err ? 0 : 1,
 							audit_info->loginuid,
+							audit_info->sessionid,
 							audit_info->secid);
 				xfrm_state_put(x);
 
@@ -2123,14 +2126,14 @@ static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family,
 }
 
 void xfrm_audit_state_add(struct xfrm_state *x, int result,
-			  u32 auid, u32 secid)
+			  uid_t auid, u32 sessionid, u32 secid)
 {
 	struct audit_buffer *audit_buf;
 
 	audit_buf = xfrm_audit_start("SAD-add");
 	if (audit_buf == NULL)
 		return;
-	xfrm_audit_helper_usrinfo(auid, secid, audit_buf);
+	xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf);
 	xfrm_audit_helper_sainfo(x, audit_buf);
 	audit_log_format(audit_buf, " res=%u", result);
 	audit_log_end(audit_buf);
@@ -2138,14 +2141,14 @@ void xfrm_audit_state_add(struct xfrm_state *x, int result,
 EXPORT_SYMBOL_GPL(xfrm_audit_state_add);
 
 void xfrm_audit_state_delete(struct xfrm_state *x, int result,
-			     u32 auid, u32 secid)
+			     uid_t auid, u32 sessionid, u32 secid)
 {
 	struct audit_buffer *audit_buf;
 
 	audit_buf = xfrm_audit_start("SAD-delete");
 	if (audit_buf == NULL)
 		return;
-	xfrm_audit_helper_usrinfo(auid, secid, audit_buf);
+	xfrm_audit_helper_usrinfo(auid, sessionid, secid, audit_buf);
 	xfrm_audit_helper_sainfo(x, audit_buf);
 	audit_log_format(audit_buf, " res=%u", result);
 	audit_log_end(audit_buf);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 22a30ae582a2..a1b0fbe3ea35 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -407,6 +407,9 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct xfrm_state *x;
 	int err;
 	struct km_event c;
+	uid_t loginuid = NETLINK_CB(skb).loginuid;
+	u32 sessionid = NETLINK_CB(skb).sessionid;
+	u32 sid = NETLINK_CB(skb).sid;
 
 	err = verify_newsa_info(p, attrs);
 	if (err)
@@ -422,8 +425,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 	else
 		err = xfrm_state_update(x);
 
-	xfrm_audit_state_add(x, err ? 0 : 1, NETLINK_CB(skb).loginuid,
-			     NETLINK_CB(skb).sid);
+	xfrm_audit_state_add(x, err ? 0 : 1, loginuid, sessionid, sid);
 
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
@@ -478,6 +480,9 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err = -ESRCH;
 	struct km_event c;
 	struct xfrm_usersa_id *p = nlmsg_data(nlh);
+	uid_t loginuid = NETLINK_CB(skb).loginuid;
+	u32 sessionid = NETLINK_CB(skb).sessionid;
+	u32 sid = NETLINK_CB(skb).sid;
 
 	x = xfrm_user_state_lookup(p, attrs, &err);
 	if (x == NULL)
@@ -502,8 +507,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 	km_state_notify(x, &c);
 
 out:
-	xfrm_audit_state_delete(x, err ? 0 : 1, NETLINK_CB(skb).loginuid,
-				NETLINK_CB(skb).sid);
+	xfrm_audit_state_delete(x, err ? 0 : 1, loginuid, sessionid, sid);
 	xfrm_state_put(x);
 	return err;
 }
@@ -1123,6 +1127,9 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct km_event c;
 	int err;
 	int excl;
+	uid_t loginuid = NETLINK_CB(skb).loginuid;
+	u32 sessionid = NETLINK_CB(skb).sessionid;
+	u32 sid = NETLINK_CB(skb).sid;
 
 	err = verify_newpolicy_info(p);
 	if (err)
@@ -1141,8 +1148,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	 * a type XFRM_MSG_UPDPOLICY - JHS */
 	excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
 	err = xfrm_policy_insert(p->dir, xp, excl);
-	xfrm_audit_policy_add(xp, err ? 0 : 1, NETLINK_CB(skb).loginuid,
-			      NETLINK_CB(skb).sid);
+	xfrm_audit_policy_add(xp, err ? 0 : 1, loginuid, sessionid, sid);
 
 	if (err) {
 		security_xfrm_policy_free(xp->security);
@@ -1371,9 +1377,12 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 					    NETLINK_CB(skb).pid);
 		}
 	} else {
-		xfrm_audit_policy_delete(xp, err ? 0 : 1,
-					 NETLINK_CB(skb).loginuid,
-					 NETLINK_CB(skb).sid);
+		uid_t loginuid = NETLINK_CB(skb).loginuid;
+		u32 sessionid = NETLINK_CB(skb).sessionid;
+		u32 sid = NETLINK_CB(skb).sid;
+
+		xfrm_audit_policy_delete(xp, err ? 0 : 1, loginuid, sessionid,
+					 sid);
 
 		if (err != 0)
 			goto out;
@@ -1399,6 +1408,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err;
 
 	audit_info.loginuid = NETLINK_CB(skb).loginuid;
+	audit_info.sessionid = NETLINK_CB(skb).sessionid;
 	audit_info.secid = NETLINK_CB(skb).sid;
 	err = xfrm_state_flush(p->proto, &audit_info);
 	if (err)
@@ -1546,6 +1556,7 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return err;
 
 	audit_info.loginuid = NETLINK_CB(skb).loginuid;
+	audit_info.sessionid = NETLINK_CB(skb).sessionid;
 	audit_info.secid = NETLINK_CB(skb).sid;
 	err = xfrm_policy_flush(type, &audit_info);
 	if (err)
@@ -1604,9 +1615,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	read_unlock(&xp->lock);
 	err = 0;
 	if (up->hard) {
+		uid_t loginuid = NETLINK_CB(skb).loginuid;
+		uid_t sessionid = NETLINK_CB(skb).sessionid;
+		u32 sid = NETLINK_CB(skb).sid;
 		xfrm_policy_delete(xp, p->dir);
-		xfrm_audit_policy_delete(xp, 1, NETLINK_CB(skb).loginuid,
-					 NETLINK_CB(skb).sid);
+		xfrm_audit_policy_delete(xp, 1, loginuid, sessionid, sid);
 
 	} else {
 		// reset the timers here?
@@ -1640,9 +1653,11 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	km_state_expired(x, ue->hard, current->pid);
 
 	if (ue->hard) {
+		uid_t loginuid = NETLINK_CB(skb).loginuid;
+		uid_t sessionid = NETLINK_CB(skb).sessionid;
+		u32 sid = NETLINK_CB(skb).sid;
 		__xfrm_state_delete(x);
-		xfrm_audit_state_delete(x, 1, NETLINK_CB(skb).loginuid,
-					NETLINK_CB(skb).sid);
+		xfrm_audit_state_delete(x, 1, loginuid, sessionid, sid);
 	}
 	err = 0;
 out:
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 6ba283783b70..5d1bee0fa513 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -324,6 +324,7 @@ void smk_cipso_doi(void)
 	struct netlbl_audit audit_info;
 
 	audit_info.loginuid = audit_get_loginuid(current);
+	audit_info.sessionid = audit_get_sessionid(current);
 	audit_info.secid = smack_to_secid(current->security);
 
 	rc = netlbl_cfg_map_del(NULL, &audit_info);
@@ -356,6 +357,7 @@ void smk_unlbl_ambient(char *oldambient)
 	struct netlbl_audit audit_info;
 
 	audit_info.loginuid = audit_get_loginuid(current);
+	audit_info.sessionid = audit_get_sessionid(current);
 	audit_info.secid = smack_to_secid(current->security);
 
 	if (oldambient != NULL) {
-- 
cgit v1.3-14-g43fede


From f3d357b092956959563398b59ef2fdd10aea387d Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 18 Apr 2008 10:02:28 -0400
Subject: Audit: save audit_backlog_limit audit messages in case auditd comes
 back

This patch causes the kernel audit subsystem to store up to
audit_backlog_limit messages for use by auditd if it ever appears
sometime in the future in userspace.  This is useful to collect audit
messages during bootup and even when auditd is stopped.  This is NOT a
reliable mechanism, it does not ever call audit_panic, nor should it.
audit_log_lost()/audit_panic() are called during the normal delivery
mechanism.  The messages are still sent to printk/syslog as usual and if
too many messages appear to be queued they will be silently discarded.

I liked doing it by default, but this patch only uses the queue in
question if it was booted with audit=1 or if the kernel was built
enabling audit by default.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 102 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 81 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index ad6d1abfa1d2..fee9052eb5cf 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,6 +126,8 @@ static int	   audit_freelist_count;
 static LIST_HEAD(audit_freelist);
 
 static struct sk_buff_head audit_skb_queue;
+/* queue of skbs to send to auditd when/if it comes back */
+static struct sk_buff_head audit_skb_hold_queue;
 static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
@@ -347,30 +349,83 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
 				      loginuid, sessionid, sid);
 }
 
+/*
+ * Queue skbs to be sent to auditd when/if it comes back.  These skbs should
+ * already have been sent via prink/syslog and so if these messages are dropped
+ * it is not a huge concern since we already passed the audit_log_lost()
+ * notification and stuff.  This is just nice to get audit messages during
+ * boot before auditd is running or messages generated while auditd is stopped.
+ * This only holds messages is audit_default is set, aka booting with audit=1
+ * or building your kernel that way.
+ */
+static void audit_hold_skb(struct sk_buff *skb)
+{
+	if (audit_default &&
+	    skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+		skb_queue_tail(&audit_skb_hold_queue, skb);
+	else
+		kfree_skb(skb);
+}
+
+static void kauditd_send_skb(struct sk_buff *skb)
+{
+	int err;
+	/* take a reference in case we can't send it and we want to hold it */
+	skb_get(skb);
+	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+	if (err < 0) {
+		BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+		audit_log_lost("auditd dissapeared\n");
+		audit_pid = 0;
+		/* we might get lucky and get this in the next auditd */
+		audit_hold_skb(skb);
+	} else
+		/* drop the extra reference if sent ok */
+		kfree_skb(skb);
+}
+
 static int kauditd_thread(void *dummy)
 {
 	struct sk_buff *skb;
 
 	set_freezable();
 	while (!kthread_should_stop()) {
+		/*
+		 * if auditd just started drain the queue of messages already
+		 * sent to syslog/printk.  remember loss here is ok.  we already
+		 * called audit_log_lost() if it didn't go out normally.  so the
+		 * race between the skb_dequeue and the next check for audit_pid
+		 * doesn't matter.
+		 *
+		 * if you ever find kauditd to be too slow we can get a perf win
+		 * by doing our own locking and keeping better track if there
+		 * are messages in this queue.  I don't see the need now, but
+		 * in 5 years when I want to play with this again I'll see this
+		 * note and still have no friggin idea what i'm thinking today.
+		 */
+		if (audit_default && audit_pid) {
+			skb = skb_dequeue(&audit_skb_hold_queue);
+			if (unlikely(skb)) {
+				while (skb && audit_pid) {
+					kauditd_send_skb(skb);
+					skb = skb_dequeue(&audit_skb_hold_queue);
+				}
+			}
+		}
+
 		skb = skb_dequeue(&audit_skb_queue);
 		wake_up(&audit_backlog_wait);
 		if (skb) {
-			if (audit_pid) {
-				int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
-				if (err < 0) {
-					BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
-					printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-					audit_log_lost("auditd dissapeared\n");
-					audit_pid = 0;
-				}
-			} else {
+			if (audit_pid)
+				kauditd_send_skb(skb);
+			else {
 				if (printk_ratelimit())
-					printk(KERN_NOTICE "%s\n", skb->data +
-						NLMSG_SPACE(0));
+					printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
 				else
 					audit_log_lost("printk limit exceeded\n");
-				kfree_skb(skb);
+
+				audit_hold_skb(skb);
 			}
 		} else {
 			DECLARE_WAITQUEUE(wait, current);
@@ -885,6 +940,7 @@ static int __init audit_init(void)
 		audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 
 	skb_queue_head_init(&audit_skb_queue);
+	skb_queue_head_init(&audit_skb_hold_queue);
 	audit_initialized = 1;
 	audit_enabled = audit_default;
 	audit_ever_enabled |= !!audit_default;
@@ -1363,19 +1419,23 @@ void audit_log_end(struct audit_buffer *ab)
 		audit_log_lost("rate limit exceeded");
 	} else {
 		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+		nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
 		if (audit_pid) {
-			nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
 			skb_queue_tail(&audit_skb_queue, ab->skb);
-			ab->skb = NULL;
 			wake_up_interruptible(&kauditd_wait);
-		} else if (nlh->nlmsg_type != AUDIT_EOE) {
-			if (printk_ratelimit()) {
-				printk(KERN_NOTICE "type=%d %s\n",
-					nlh->nlmsg_type,
-					ab->skb->data + NLMSG_SPACE(0));
-			} else
-				audit_log_lost("printk limit exceeded\n");
+		} else {
+			if (nlh->nlmsg_type != AUDIT_EOE) {
+				if (printk_ratelimit()) {
+					printk(KERN_NOTICE "type=%d %s\n",
+						nlh->nlmsg_type,
+						ab->skb->data + NLMSG_SPACE(0));
+				} else
+					audit_log_lost("printk limit exceeded\n");
+			}
+			audit_hold_skb(ab->skb);
 		}
+		ab->skb = NULL;
 	}
 	audit_buffer_free(ab);
 }
-- 
cgit v1.3-14-g43fede


From f09ac9db2aafe36fde9ebd63c8c5d776f6e7bd41 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 18 Apr 2008 10:11:04 -0400
Subject: Audit: stop deadlock from signals under load

A deadlock is possible between kauditd and auditd under load if auditd
receives a signal.  When auditd receives a signal it sends a netlink
message to the kernel asking for information about the sender of the
signal.  In that same context the audit system will attempt to send a
netlink message back to the userspace auditd.  If kauditd has already
filled the socket buffer (see netlink_attachskb()) auditd will now put
itself to sleep waiting for room to send the message.  Since auditd is
responsible for draining that socket we have a deadlock.  The fix, since
the response from the kernel does not need to be synchronous is to send
the signal information back to auditd in a separate thread.  And thus
auditd can continue to drain the audit queue normally.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index fee9052eb5cf..520583d8ca18 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -156,6 +156,11 @@ struct audit_buffer {
 	gfp_t		     gfp_mask;
 };
 
+struct audit_reply {
+	int pid;
+	struct sk_buff *skb;
+};
+
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 {
 	if (ab) {
@@ -528,6 +533,19 @@ nlmsg_failure:			/* Used by NLMSG_PUT */
 	return NULL;
 }
 
+static int audit_send_reply_thread(void *arg)
+{
+	struct audit_reply *reply = (struct audit_reply *)arg;
+
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	/* Ignore failure. It'll only happen if the sender goes away,
+	   because our timeout is set to infinite. */
+	netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+	kfree(reply);
+	return 0;
+}
 /**
  * audit_send_reply - send an audit reply message via netlink
  * @pid: process id to send reply to
@@ -544,14 +562,26 @@ nlmsg_failure:			/* Used by NLMSG_PUT */
 void audit_send_reply(int pid, int seq, int type, int done, int multi,
 		      void *payload, int size)
 {
-	struct sk_buff	*skb;
+	struct sk_buff *skb;
+	struct task_struct *tsk;
+	struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
+					    GFP_KERNEL);
+
+	if (!reply)
+		return;
+
 	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
 		return;
-	/* Ignore failure. It'll only happen if the sender goes away,
-	   because our timeout is set to infinite. */
-	netlink_unicast(audit_sock, skb, pid, 0);
-	return;
+
+	reply->pid = pid;
+	reply->skb = skb;
+
+	tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
+	if (IS_ERR(tsk)) {
+		kfree(reply);
+		kfree_skb(skb);
+	}
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From b556f8ad58c6e9f8f485c8cef7546e3fc82c382a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 18 Apr 2008 10:12:59 -0400
Subject: Audit: standardize string audit interfaces

This patch standardized the string auditing interfaces.  No userspace
changes will be visible and this is all just cleanup and consistancy
work.  We have the following string audit interfaces to use:

void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len);

void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n);
void audit_log_string(struct audit_buffer *ab, const char *buf);

void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n);
void audit_log_untrustedstring(struct audit_buffer *ab, const char *string);

This may be the first step to possibly fixing some of the issues that
people have with the string output from the kernel audit system.  But we
still don't have an agreed upon solution to that problem.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/char/tty_audit.c |  2 +-
 include/linux/audit.h    | 22 ++++++++++++++--------
 kernel/audit.c           | 19 +++++++++----------
 kernel/auditsc.c         |  8 ++++----
 security/selinux/avc.c   |  2 +-
 5 files changed, 29 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c
index 9739bbfc8f70..caeedd12d494 100644
--- a/drivers/char/tty_audit.c
+++ b/drivers/char/tty_audit.c
@@ -92,7 +92,7 @@ static void tty_audit_buf_push(struct task_struct *tsk, uid_t loginuid,
 		get_task_comm(name, tsk);
 		audit_log_untrustedstring(ab, name);
 		audit_log_format(ab, " data=");
-		audit_log_n_untrustedstring(ab, buf->valid, buf->data);
+		audit_log_n_untrustedstring(ab, buf->data, buf->valid);
 		audit_log_end(ab);
 	}
 	buf->valid = 0;
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 25f6ae30dd4b..f938335af75e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -549,16 +549,20 @@ extern void		    audit_log_format(struct audit_buffer *ab,
 					     const char *fmt, ...)
 			    __attribute__((format(printf,2,3)));
 extern void		    audit_log_end(struct audit_buffer *ab);
-extern void		    audit_log_hex(struct audit_buffer *ab,
-					  const unsigned char *buf,
-					  size_t len);
 extern int		    audit_string_contains_control(const char *string,
 							  size_t len);
+extern void		    audit_log_n_hex(struct audit_buffer *ab,
+					  const unsigned char *buf,
+					  size_t len);
+extern void		    audit_log_n_string(struct audit_buffer *ab,
+					       const char *buf,
+					       size_t n);
+#define audit_log_string(a,b) audit_log_n_string(a, b, strlen(b));
+extern void		    audit_log_n_untrustedstring(struct audit_buffer *ab,
+							const char *string,
+							size_t n);
 extern void		    audit_log_untrustedstring(struct audit_buffer *ab,
 						      const char *string);
-extern void		    audit_log_n_untrustedstring(struct audit_buffer *ab,
-							size_t n,
-							const char *string);
 extern void		    audit_log_d_path(struct audit_buffer *ab,
 					     const char *prefix,
 					     struct path *path);
@@ -578,9 +582,11 @@ extern int audit_enabled;
 #define audit_log_vformat(b,f,a) do { ; } while (0)
 #define audit_log_format(b,f,...) do { ; } while (0)
 #define audit_log_end(b) do { ; } while (0)
-#define audit_log_hex(a,b,l) do { ; } while (0)
-#define audit_log_untrustedstring(a,s) do { ; } while (0)
+#define audit_log_n_hex(a,b,l) do { ; } while (0)
+#define audit_log_n_string(a,c,l) do { ; } while (0)
+#define audit_log_string(a,c) do { ; } while (0)
 #define audit_log_n_untrustedstring(a,n,s) do { ; } while (0)
+#define audit_log_untrustedstring(a,s) do { ; } while (0)
 #define audit_log_d_path(b, p, d) do { ; } while (0)
 #define audit_enabled 0
 #endif
diff --git a/kernel/audit.c b/kernel/audit.c
index 520583d8ca18..5b9ad3dda885 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -757,8 +757,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 				audit_log_format(ab, " msg=");
 				size = nlmsg_len(nlh);
-				audit_log_n_untrustedstring(ab, size,
-							    data);
+				audit_log_n_untrustedstring(ab, data, size);
 			}
 			audit_set_pid(ab, pid);
 			audit_log_end(ab);
@@ -1293,7 +1292,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
  * This function will take the passed buf and convert it into a string of
  * ascii hex digits. The new string is placed onto the skb.
  */
-void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
+void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
 		size_t len)
 {
 	int i, avail, new_len;
@@ -1329,8 +1328,8 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
  * Format a string of no more than slen characters into the audit buffer,
  * enclosed in quote marks.
  */
-static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
-			       const char *string)
+void audit_log_n_string(struct audit_buffer *ab, const char *string,
+			size_t slen)
 {
 	int avail, new_len;
 	unsigned char *ptr;
@@ -1386,13 +1385,13 @@ int audit_string_contains_control(const char *string, size_t len)
  * The caller specifies the number of characters in the string to log, which may
  * or may not be the entire string.
  */
-void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
-				 const char *string)
+void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
+				 size_t len)
 {
 	if (audit_string_contains_control(string, len))
-		audit_log_hex(ab, string, len);
+		audit_log_n_hex(ab, string, len);
 	else
-		audit_log_n_string(ab, len, string);
+		audit_log_n_string(ab, string, len);
 }
 
 /**
@@ -1405,7 +1404,7 @@ void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
  */
 void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 {
-	audit_log_n_untrustedstring(ab, strlen(string), string);
+	audit_log_n_untrustedstring(ab, string, strlen(string));
 }
 
 /* This is a helper-function to print the escaped d_path */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d7249fcdc442..0072b1d8b258 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1095,7 +1095,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 			audit_log_format(*ab, "[%d]", i);
 		audit_log_format(*ab, "=");
 		if (has_cntl)
-			audit_log_hex(*ab, buf, to_send);
+			audit_log_n_hex(*ab, buf, to_send);
 		else
 			audit_log_format(*ab, "\"%s\"", buf);
 		audit_log_format(*ab, "\n");
@@ -1307,7 +1307,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			struct audit_aux_data_sockaddr *axs = (void *)aux;
 
 			audit_log_format(ab, "saddr=");
-			audit_log_hex(ab, axs->a, axs->len);
+			audit_log_n_hex(ab, axs->a, axs->len);
 			break; }
 
 		case AUDIT_FD_PAIR: {
@@ -1371,8 +1371,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			default:
 				/* log the name's directory component */
 				audit_log_format(ab, " name=");
-				audit_log_n_untrustedstring(ab, n->name_len,
-							    n->name);
+				audit_log_n_untrustedstring(ab, n->name,
+							    n->name_len);
 			}
 		} else
 			audit_log_format(ab, " name=(null)");
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 95a8ef4a5073..114b4b4c97b2 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -646,7 +646,7 @@ void avc_audit(u32 ssid, u32 tsid,
 					if (*p)
 						audit_log_untrustedstring(ab, p);
 					else
-						audit_log_hex(ab, p, len);
+						audit_log_n_hex(ab, p, len);
 					break;
 				}
 			}
-- 
cgit v1.3-14-g43fede


From c782f242f0602edf848355d41e3676753c2280c8 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sun, 27 Apr 2008 02:39:17 -0700
Subject: [PATCH 1/2] audit: move extern declarations to audit.h

Leave audit_sig_{uid|pid|sid} protected by #ifdef CONFIG_AUDITSYSCALL.

Noticed by sparse:
kernel/audit.c:73:6: warning: symbol 'audit_ever_enabled' was not declared. Should it be static?
kernel/audit.c:100:8: warning: symbol 'audit_sig_uid' was not declared. Should it be static?
kernel/audit.c:101:8: warning: symbol 'audit_sig_pid' was not declared. Should it be static?
kernel/audit.c:102:6: warning: symbol 'audit_sig_sid' was not declared. Should it be static?
kernel/audit.c:117:23: warning: symbol 'audit_ih' was not declared. Should it be static?
kernel/auditfilter.c:78:18: warning: symbol 'audit_filter_list' was not declared. Should it be static?

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.h       | 13 +++++++++++++
 kernel/auditfilter.c |  5 -----
 kernel/auditsc.c     |  6 ------
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.h b/kernel/audit.h
index 3cfc54ee3e1f..9d6717412fec 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,6 +74,11 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
+#ifdef CONFIG_AUDIT
+extern int audit_enabled;
+extern int audit_ever_enabled;
+#endif
+
 extern int audit_pid;
 
 #define AUDIT_INODE_BUCKETS	32
@@ -104,6 +109,9 @@ struct audit_netlink_list {
 int audit_send_list(void *);
 
 struct inotify_watch;
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
 extern void audit_free_parent(struct inotify_watch *);
 extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
 				const char *, struct inode *);
@@ -111,6 +119,7 @@ extern int selinux_audit_rule_update(void);
 
 extern struct mutex audit_filter_mutex;
 extern void audit_free_rule_rcu(struct rcu_head *);
+extern struct list_head audit_filter_list[];
 
 #ifdef CONFIG_AUDIT_TREE
 extern struct audit_chunk *audit_tree_lookup(const struct inode *);
@@ -137,6 +146,10 @@ extern void audit_put_tree(struct audit_tree *);
 
 extern char *audit_unpack_string(void **, size_t *, size_t);
 
+extern pid_t audit_sig_pid;
+extern uid_t audit_sig_uid;
+extern u32 audit_sig_sid;
+
 #ifdef CONFIG_AUDITSYSCALL
 extern int __audit_signal_info(int sig, struct task_struct *t);
 static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index af3ae91c47b1..bcf1fb7c7f32 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -89,14 +89,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 
 DEFINE_MUTEX(audit_filter_mutex);
 
-/* Inotify handle */
-extern struct inotify_handle *audit_ih;
-
 /* Inotify events we care about. */
 #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
 
-extern int audit_enabled;
-
 void audit_free_parent(struct inotify_watch *i_watch)
 {
 	struct audit_parent *parent;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 0072b1d8b258..e128adcb33c2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -68,9 +68,6 @@
 
 #include "audit.h"
 
-extern struct list_head audit_filter_list[];
-extern int audit_ever_enabled;
-
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname(). */
 #define AUDIT_NAMES    20
@@ -2361,9 +2358,6 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	struct audit_aux_data_pids *axp;
 	struct task_struct *tsk = current;
 	struct audit_context *ctx = tsk->audit_context;
-	extern pid_t audit_sig_pid;
-	extern uid_t audit_sig_uid;
-	extern u32 audit_sig_sid;
 
 	if (audit_pid && t->tgid == audit_pid) {
 		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
-- 
cgit v1.3-14-g43fede


From 7719e437fac119e57b17588bab3a8e39ff9d22eb Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sun, 27 Apr 2008 02:39:56 -0700
Subject: [PATCH 2/2] audit: fix sparse shadowed variable warnings

Use msglen as the identifier.
kernel/audit.c:724:10: warning: symbol 'len' shadows an earlier one
kernel/audit.c:575:8: originally declared here

Don't use ino_f to check the inode field at the end of the functions.
kernel/auditfilter.c:429:22: warning: symbol 'f' shadows an earlier one
kernel/auditfilter.c:420:21: originally declared here
kernel/auditfilter.c:542:22: warning: symbol 'f' shadows an earlier one
kernel/auditfilter.c:529:21: originally declared here

i always used as a counter for a for loop and initialized to zero before
use.  Eliminate the inner i variables.
kernel/auditsc.c:1295:8: warning: symbol 'i' shadows an earlier one
kernel/auditsc.c:1152:6: originally declared here
kernel/auditsc.c:1320:7: warning: symbol 'i' shadows an earlier one
kernel/auditsc.c:1152:6: originally declared here

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c       | 10 +++++-----
 kernel/auditfilter.c | 16 ++++++++--------
 kernel/auditsc.c     |  2 --
 3 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 5b9ad3dda885..f4799eb6977a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -813,21 +813,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_MAKE_EQUIV: {
 		void *bufp = data;
 		u32 sizes[2];
-		size_t len = nlmsg_len(nlh);
+		size_t msglen = nlmsg_len(nlh);
 		char *old, *new;
 
 		err = -EINVAL;
-		if (len < 2 * sizeof(u32))
+		if (msglen < 2 * sizeof(u32))
 			break;
 		memcpy(sizes, bufp, 2 * sizeof(u32));
 		bufp += 2 * sizeof(u32);
-		len -= 2 * sizeof(u32);
-		old = audit_unpack_string(&bufp, &len, sizes[0]);
+		msglen -= 2 * sizeof(u32);
+		old = audit_unpack_string(&bufp, &msglen, sizes[0]);
 		if (IS_ERR(old)) {
 			err = PTR_ERR(old);
 			break;
 		}
-		new = audit_unpack_string(&bufp, &len, sizes[1]);
+		new = audit_unpack_string(&bufp, &msglen, sizes[1]);
 		if (IS_ERR(new)) {
 			err = PTR_ERR(new);
 			kfree(old);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index bcf1fb7c7f32..7c3450d063fe 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -417,7 +417,7 @@ exit_err:
 static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 {
 	struct audit_entry *entry;
-	struct audit_field *f;
+	struct audit_field *ino_f;
 	int err = 0;
 	int i;
 
@@ -499,9 +499,9 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		}
 	}
 
-	f = entry->rule.inode_f;
-	if (f) {
-		switch(f->op) {
+	ino_f = entry->rule.inode_f;
+	if (ino_f) {
+		switch(ino_f->op) {
 		case AUDIT_NOT_EQUAL:
 			entry->rule.inode_f = NULL;
 		case AUDIT_EQUAL:
@@ -526,7 +526,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 {
 	int err = 0;
 	struct audit_entry *entry;
-	struct audit_field *f;
+	struct audit_field *ino_f;
 	void *bufp;
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
@@ -654,9 +654,9 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		}
 	}
 
-	f = entry->rule.inode_f;
-	if (f) {
-		switch(f->op) {
+	ino_f = entry->rule.inode_f;
+	if (ino_f) {
+		switch(ino_f->op) {
 		case AUDIT_NOT_EQUAL:
 			entry->rule.inode_f = NULL;
 		case AUDIT_EQUAL:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e128adcb33c2..091409996577 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1293,7 +1293,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			break; }
 
 		case AUDIT_SOCKETCALL: {
-			int i;
 			struct audit_aux_data_socketcall *axs = (void *)aux;
 			audit_log_format(ab, "nargs=%d", axs->nargs);
 			for (i=0; i<axs->nargs; i++)
@@ -1318,7 +1317,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 	for (aux = context->aux_pids; aux; aux = aux->next) {
 		struct audit_aux_data_pids *axs = (void *)aux;
-		int i;
 
 		for (i = 0; i < axs->pid_count; i++)
 			if (audit_log_pid_context(context, axs->target_pid[i],
-- 
cgit v1.3-14-g43fede


From 4a761b8c1d7a3a4ee7ccf92ce255d986f601e067 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 18 Apr 2008 13:30:15 -0700
Subject: [patch 2/2] Use find_task_by_vpid in audit code

The pid to lookup a task by is passed inside audit code via netlink message.

Thanks to Denis Lunev, netlink packets are now (since 2.6.24) _always_
processed in the context of the sending task.  So this is correct to lookup
the task with find_task_by_vpid() here.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index f4799eb6977a..b7d3709cc452 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -455,7 +455,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
 	int err;
 
 	read_lock(&tasklist_lock);
-	tsk = find_task_by_pid(pid);
+	tsk = find_task_by_vpid(pid);
 	err = -ESRCH;
 	if (!tsk)
 		goto out;
@@ -871,7 +871,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		struct task_struct *tsk;
 
 		read_lock(&tasklist_lock);
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (!tsk)
 			err = -ESRCH;
 		else {
@@ -894,7 +894,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (s->enabled != 0 && s->enabled != 1)
 			return -EINVAL;
 		read_lock(&tasklist_lock);
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (!tsk)
 			err = -ESRCH;
 		else {
-- 
cgit v1.3-14-g43fede


From 8b67dca9420474623709e00d72a066068a502b20 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 28 Apr 2008 04:15:49 -0400
Subject: [PATCH] new predicate - AUDIT_FILETYPE

Argument is S_IF... | <index>, where index is normally 0 or 1.
Triggers if chosen element of ctx->names[] is present and the
mode of object in question matches the upper bits of argument.
I.e. for things like "is the argument of that chmod a directory",
etc.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  1 +
 kernel/auditfilter.c  |  8 ++++++++
 kernel/auditsc.c      | 16 ++++++++++++++++
 3 files changed, 25 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index dcd5395b400d..63c3bb98558f 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -209,6 +209,7 @@
 #define AUDIT_WATCH	105
 #define AUDIT_PERM	106
 #define AUDIT_DIR	107
+#define AUDIT_FILETYPE	108
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c3450d063fe..9435d9392df5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -478,6 +478,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 			if (f->val & ~15)
 				goto exit_free;
 			break;
+		case AUDIT_FILETYPE:
+			if ((f->val & ~S_IFMT) > S_IFMT)
+				goto exit_free;
+			break;
 		case AUDIT_INODE:
 			err = audit_to_inode(&entry->rule, f);
 			if (err)
@@ -649,6 +653,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			if (f->val & ~15)
 				goto exit_free;
 			break;
+		case AUDIT_FILETYPE:
+			if ((f->val & ~S_IFMT) > S_IFMT)
+				goto exit_free;
+			break;
 		default:
 			goto exit_free;
 		}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 091409996577..c10e7aae04d7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -280,6 +280,19 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
 	}
 }
 
+static int audit_match_filetype(struct audit_context *ctx, int which)
+{
+	unsigned index = which & ~S_IFMT;
+	mode_t mode = which & S_IFMT;
+	if (index >= ctx->name_count)
+		return 0;
+	if (ctx->names[index].ino == -1)
+		return 0;
+	if ((ctx->names[index].mode ^ mode) & S_IFMT)
+		return 0;
+	return 1;
+}
+
 /*
  * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
  * ->first_trees points to its beginning, ->trees - to the current end of data.
@@ -589,6 +602,9 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_PERM:
 			result = audit_match_perm(ctx, f->val);
 			break;
+		case AUDIT_FILETYPE:
+			result = audit_match_filetype(ctx, f->val);
+			break;
 		}
 
 		if (!result)
-- 
cgit v1.3-14-g43fede


From 0c96c5979a522c3323c30a078a70120e29b5bdbc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 28 Apr 2008 09:23:24 +0200
Subject: hrtimer: raise softirq unlocked to avoid circular lock dependency

The scheduler hrtimer bits in 2.6.25 introduced a circular lock
dependency in a rare code path:

=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.25-sched-devel.git-x86-latest.git #19
-------------------------------------------------------
X/2980 is trying to acquire lock:
 (&rq->rq_lock_key#2){++..}, at: [<ffffffff80230146>] task_rq_lock+0x56/0xa0

but task is already holding lock:
 (&cpu_base->lock){++..}, at: [<ffffffff80257ae1>] lock_hrtimer_base+0x31/0x60

which lock already depends on the new lock.

The scenario which leads to this is:

posix-timer signal is delivered
 -> posix-timer is rearmed
    timer is already expired in hrtimer_enqueue()
     -> softirq is raised

To prevent this we need to move the raise of the softirq out of the
base->lock protected code path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/hrtimer.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e379ef0e9c20..dea4c9124ac8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -590,7 +590,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 			list_add_tail(&timer->cb_entry,
 				      &base->cpu_base->cb_pending);
 			timer->state = HRTIMER_STATE_PENDING;
-			raise_softirq(HRTIMER_SOFTIRQ);
 			return 1;
 		default:
 			BUG();
@@ -633,6 +632,11 @@ static int hrtimer_switch_to_hres(void)
 	return 1;
 }
 
+static inline void hrtimer_raise_softirq(void)
+{
+	raise_softirq(HRTIMER_SOFTIRQ);
+}
+
 #else
 
 static inline int hrtimer_hres_active(void) { return 0; }
@@ -651,6 +655,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
 {
 	return 0;
 }
+static inline void hrtimer_raise_softirq(void) { }
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
@@ -850,7 +855,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
-	int ret;
+	int ret, raise;
 
 	base = lock_hrtimer_base(timer, &flags);
 
@@ -884,8 +889,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 	enqueue_hrtimer(timer, new_base,
 			new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
 
+	/*
+	 * The timer may be expired and moved to the cb_pending
+	 * list. We can not raise the softirq with base lock held due
+	 * to a possible deadlock with runqueue lock.
+	 */
+	raise = timer->state == HRTIMER_STATE_PENDING;
+
 	unlock_hrtimer_base(timer, &flags);
 
+	if (raise)
+		hrtimer_raise_softirq();
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
-- 
cgit v1.3-14-g43fede


From 9d04d9280c4bbf6950b70b705bc4ace41de65615 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Mon, 28 Apr 2008 13:57:19 -0700
Subject: ptrace: conditionalize compat_ptrace_request

My recent additions to compat_ptrace_request made it mandatory
for CONFIG_COMPAT arch's to define copy_siginfo_from_user32.
This broke some builds, though they all really should get cleaned
up in that way.

Since all the arch's that actually call compat_ptrace_request have
now been cleaned up to use the generic compat_sys_ptrace, we can
avoid the build problems on the crufty arch's by changing the
conditionals on the definition.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 67e392ed5496..dac4b4e57293 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
 	return (copied == sizeof(data)) ? 0 : -EIO;
 }
 
-#ifdef CONFIG_COMPAT
+#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE
 #include <linux/compat.h>
 
 int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -667,7 +667,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 	return ret;
 }
 
-#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
 asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 				  compat_long_t addr, compat_long_t data)
 {
@@ -710,6 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 	unlock_kernel();
 	return ret;
 }
-#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
-
-#endif	/* CONFIG_COMPAT */
+#endif	/* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */
-- 
cgit v1.3-14-g43fede


From b331d259b1147f82d692f3b866e036017cbde8fe Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 28 Apr 2008 14:13:19 -0700
Subject: kernel: fix integer as NULL pointer warnings

kernel/cpuset.c:1268:52: warning: Using plain integer as NULL pointer
kernel/pid_namespace.c:95:24: warning: Using plain integer as NULL pointer

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Reviewed-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c        | 3 ++-
 kernel/pid_namespace.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 024888bb9814..48a976c52cf5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1265,7 +1265,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
 		return -E2BIG;
 
 	/* +1 for nul-terminator */
-	if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
+	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
+	if (!buffer)
 		return -ENOMEM;
 
 	if (copy_from_user(buffer, userbuf, nbytes)) {
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d792b66d854..5ca37fa50beb 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level)
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
 
 	for (i = 1; i < PIDMAP_ENTRIES; i++) {
-		ns->pidmap[i].page = 0;
+		ns->pidmap[i].page = NULL;
 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
 	}
 
-- 
cgit v1.3-14-g43fede


From c3270e577c18b3d0e984c3371493205a4807db9d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@comcast.ne>
Date: Thu, 24 Apr 2008 12:52:20 +0200
Subject: relay: fix splice problem

Splice isn't always incrementing the ppos correctly, which broke
relay splice.

Signed-off-by: Tom Zanussi <zanussi@comcast.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c    | 2 +-
 kernel/relay.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/fs/splice.c b/fs/splice.c
index eeb1a86a7014..633f58ebfb72 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1075,7 +1075,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
 	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
 	if (ret > 0)
-		*ppos += ret;
+		*ppos = sd.pos;
 
 	return ret;
 }
diff --git a/kernel/relay.c b/kernel/relay.c
index d6204a485818..dc873fba90d2 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1162,7 +1162,7 @@ static ssize_t relay_file_splice_read(struct file *in,
 	ret = 0;
 	spliced = 0;
 
-	while (len) {
+	while (len && !spliced) {
 		ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
 		if (ret < 0)
 			break;
-- 
cgit v1.3-14-g43fede


From 95b570c9cef3b12356454c7112571b7e406b4b51 Mon Sep 17 00:00:00 2001
From: Nur Hussein <nurhussein@gmail.com>
Date: Tue, 29 Apr 2008 00:58:39 -0700
Subject: Taint kernel after WARN_ON(condition)

The kernel is sent to tainted within the warn_on_slowpath() function, and
whenever a warning occurs the new taint flag 'W' is set.  This is useful to
know if a warning occurred before a BUG by preserving the warning as a flag
in the taint state.

This does not work on architectures where WARN_ON has its own definition.
These archs are:
	1. s390
	2. superh
	3. avr32
	4. parisc

The maintainers of these architectures have been added in the Cc: list
in this email to alert them to the situation.

The documentation in oops-tracing.txt has been updated to include the
new flag.

Signed-off-by: Nur Hussein <nurhussein@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/oops-tracing.txt | 4 ++++
 include/linux/kernel.h         | 1 +
 kernel/panic.c                 | 8 ++++++--
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt
index 7f60dfe642ca..b152e81da592 100644
--- a/Documentation/oops-tracing.txt
+++ b/Documentation/oops-tracing.txt
@@ -253,6 +253,10 @@ characters, each representing a particular tainted value.
 
   8: 'D' if the kernel has died recently, i.e. there was an OOPS or BUG.
 
+  9: 'A' if the ACPI table has been overridden.
+
+ 10: 'W' if a warning has previously been issued by the kernel.
+
 The primary reason for the 'Tainted: ' string is to tell kernel
 debuggers if this is a clean kernel or if anything unusual has
 occurred.  Tainting is permanent: even if an offending module is
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cd6d02cf854d..28caa53dd1f7 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -255,6 +255,7 @@ extern enum system_states {
 #define TAINT_USER			(1<<6)
 #define TAINT_DIE			(1<<7)
 #define TAINT_OVERRIDDEN_ACPI_TABLE	(1<<8)
+#define TAINT_WARN			(1<<9)
 
 extern void dump_stack(void) __cold;
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 24af9f8bac99..425567f45b9f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic);
  *  'M' - System experienced a machine check exception.
  *  'B' - System has hit bad_page.
  *  'U' - Userspace-defined naughtiness.
+ *  'A' - ACPI table overridden.
+ *  'W' - Taint on warning.
  *
  *	The string is overwritten by the next call to print_taint().
  */
@@ -161,7 +163,7 @@ const char *print_tainted(void)
 {
 	static char buf[20];
 	if (tainted) {
-		snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c",
+		snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c",
 			tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
 			tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
 			tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -170,7 +172,8 @@ const char *print_tainted(void)
 			tainted & TAINT_BAD_PAGE ? 'B' : ' ',
 			tainted & TAINT_USER ? 'U' : ' ',
 			tainted & TAINT_DIE ? 'D' : ' ',
-			tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ');
+			tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ',
+			tainted & TAINT_WARN ? 'W' : ' ');
 	}
 	else
 		snprintf(buf, sizeof(buf), "Not tainted");
@@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line)
 	print_modules();
 	dump_stack();
 	print_oops_end_marker();
+	add_taint(TAINT_WARN);
 }
 EXPORT_SYMBOL(warn_on_slowpath);
 #endif
-- 
cgit v1.3-14-g43fede


From 679c9cd4acc2cf2872171813752eab3320273339 Mon Sep 17 00:00:00 2001
From: Sripathi Kodi <sripathik@in.ibm.com>
Date: Tue, 29 Apr 2008 00:58:42 -0700
Subject: add RUSAGE_THREAD

Add the RUSAGE_THREAD option for the getrusage system call.  This is
essentially Roland's patch from http://lkml.org/lkml/2008/1/18/589, but the
line about RUSAGE_LWP line has been removed, as suggested by Ulrich and
Christoph.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/resource.h |  1 +
 kernel/sys.c             | 31 ++++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/resource.h b/include/linux/resource.h
index ae13db714742..aaa423a6f3d9 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -19,6 +19,7 @@ struct task_struct;
 #define	RUSAGE_SELF	0
 #define	RUSAGE_CHILDREN	(-1)
 #define RUSAGE_BOTH	(-2)		/* sys_wait4() uses this */
+#define	RUSAGE_THREAD	1		/* only the calling thread */
 
 struct	rusage {
 	struct timeval ru_utime;	/* user time used */
diff --git a/kernel/sys.c b/kernel/sys.c
index f2a451366953..e423d0d9e6ff 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1545,6 +1545,19 @@ out:
  *
  */
 
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
+				     cputime_t *utimep, cputime_t *stimep)
+{
+	*utimep = cputime_add(*utimep, t->utime);
+	*stimep = cputime_add(*stimep, t->stime);
+	r->ru_nvcsw += t->nvcsw;
+	r->ru_nivcsw += t->nivcsw;
+	r->ru_minflt += t->min_flt;
+	r->ru_majflt += t->maj_flt;
+	r->ru_inblock += task_io_get_inblock(t);
+	r->ru_oublock += task_io_get_oublock(t);
+}
+
 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
@@ -1554,6 +1567,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	memset((char *) r, 0, sizeof *r);
 	utime = stime = cputime_zero;
 
+	if (who == RUSAGE_THREAD) {
+		accumulate_thread_rusage(p, r, &utime, &stime);
+		goto out;
+	}
+
 	rcu_read_lock();
 	if (!lock_task_sighand(p, &flags)) {
 		rcu_read_unlock();
@@ -1586,14 +1604,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			r->ru_oublock += p->signal->oublock;
 			t = p;
 			do {
-				utime = cputime_add(utime, t->utime);
-				stime = cputime_add(stime, t->stime);
-				r->ru_nvcsw += t->nvcsw;
-				r->ru_nivcsw += t->nivcsw;
-				r->ru_minflt += t->min_flt;
-				r->ru_majflt += t->maj_flt;
-				r->ru_inblock += task_io_get_inblock(t);
-				r->ru_oublock += task_io_get_oublock(t);
+				accumulate_thread_rusage(t, r, &utime, &stime);
 				t = next_thread(t);
 			} while (t != p);
 			break;
@@ -1605,6 +1616,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	unlock_task_sighand(p, &flags);
 	rcu_read_unlock();
 
+out:
 	cputime_to_timeval(utime, &r->ru_utime);
 	cputime_to_timeval(stime, &r->ru_stime);
 }
@@ -1618,7 +1630,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 
 asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
 {
-	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
+	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+	    who != RUSAGE_THREAD)
 		return -EINVAL;
 	return getrusage(current, who, ru);
 }
-- 
cgit v1.3-14-g43fede


From 9647155ffbce9dffed8a9a4768c8994334b609db Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Tue, 29 Apr 2008 00:58:48 -0700
Subject: cpu: fix section mismatch warning in unregister_cpu_notifier

Fix following warning:
WARNING: vmlinux.o(.text+0x75f4e): Section mismatch in reference from the function unregister_cpu_notifier() to the variable .cpuinit.data:cpu_chain

We know that unregister_cpu_notifier is using HOTPLUG_CPU
stuff - so ignore these references.
Annotating unregister_cpu_notifier had been another option
but this caused far more warnings since not all callers were
annotated __cpuinit.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2011ad8d2697..da31165fd298 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -149,7 +149,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 
 EXPORT_SYMBOL(register_cpu_notifier);
 
-void unregister_cpu_notifier(struct notifier_block *nb)
+void __ref unregister_cpu_notifier(struct notifier_block *nb)
 {
 	cpu_maps_update_begin();
 	raw_notifier_chain_unregister(&cpu_chain, nb);
-- 
cgit v1.3-14-g43fede


From 514a20a5da99aef8e667cc395841a5c4e5f9e8c1 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Tue, 29 Apr 2008 00:58:50 -0700
Subject: cpu: fix section mismatch warnings in *cpu_down

Fix following warnings:
WARNING: vmlinux.o(.text+0x75c8d): Section mismatch in reference from the function take_cpu_down() to the variable .cpuinit.data:cpu_chain
WARNING: vmlinux.o(.text+0x75d2a): Section mismatch in reference from the function _cpu_down() to the variable .cpuinit.data:cpu_chain
WARNING: vmlinux.o(.text+0x75d4d): Section mismatch in reference from the function _cpu_down() to the variable .cpuinit.data:cpu_chain
WARNING: vmlinux.o(.text+0x75de4): Section mismatch in reference from the function _cpu_down() to the variable .cpuinit.data:cpu_chain
WARNING: vmlinux.o(.text+0x75e33): Section mismatch in reference from the function _cpu_down() to the variable .cpuinit.data:cpu_chain

cpu_down is only used from code surrounded by HOTPLUG_CPU so any references to
__cpuinit is OK.

Add a few __ref to tech modpost to ignore the references.

This is just papering over the fact that the cpu hotplug code is fragile with
respect to use of HOTPLUG_CPU and in many cases rely on __cpuinit to get rid
of code when HOTPLUG_CPU is not enabled.  For now this is the least invasive
change.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index da31165fd298..306844ed58f7 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -180,7 +180,7 @@ struct take_cpu_down_param {
 };
 
 /* Take this CPU down. */
-static int take_cpu_down(void *_param)
+static int __ref take_cpu_down(void *_param)
 {
 	struct take_cpu_down_param *param = _param;
 	int err;
@@ -199,7 +199,7 @@ static int take_cpu_down(void *_param)
 }
 
 /* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu, int tasks_frozen)
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
 	struct task_struct *p;
@@ -274,7 +274,7 @@ out_release:
 	return err;
 }
 
-int cpu_down(unsigned int cpu)
+int __ref cpu_down(unsigned int cpu)
 {
 	int err = 0;
 
-- 
cgit v1.3-14-g43fede


From f7b16c108fd044adc422ff21b5d6c16022462fd0 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Tue, 29 Apr 2008 00:58:51 -0700
Subject: cpu: fix section mismatch warning in reference to
 register_cpu_notifier

Fix following warnings:
WARNING: vmlinux.o(.text+0xc60): Section mismatch in reference from the function kvm_init() to the function .cpuinit.text:register_cpu_notifier()
WARNING: vmlinux.o(.text+0x33869a): Section mismatch in reference from the function xfs_icsb_init_counters() to the function .cpuinit.text:register_cpu_notifier()
WARNING: vmlinux.o(.text+0x5556a1): Section mismatch in reference from the function acpi_processor_install_hotplug_notify() to the function .cpuinit.text:register_cpu_notifier()
WARNING: vmlinux.o(.text+0xfe6b28): Section mismatch in reference from the function cpufreq_register_driver() to the function .cpuinit.text:register_cpu_notifier()

register_cpu_notifier() are only really defined when HOTPLUG_CPU is enabled.
So references to the function are OK.

Annotate it with __ref so we do not get warnings from callers and do not get
warnings for the functions/data used by register_cpu_notifier().

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 306844ed58f7..f8f9468d17d7 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -136,7 +136,7 @@ static void cpu_hotplug_done(void)
 	mutex_unlock(&cpu_hotplug.lock);
 }
 /* Need to know about CPUs going up/down? */
-int __cpuinit register_cpu_notifier(struct notifier_block *nb)
+int __ref register_cpu_notifier(struct notifier_block *nb)
 {
 	int ret;
 	cpu_maps_update_begin();
-- 
cgit v1.3-14-g43fede


From cbd9b67bd3883dff0ef4b8ec9229d315a9ba38f0 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Tue, 29 Apr 2008 00:59:23 -0700
Subject: kthread: call wake_up_process() without the lock being held

From the POV of synchronization, there should be no need to call
wake_up_process() with the 'kthread_create_lock' being held.

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kthread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 92cf6930ab51..ac72eea48339 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -144,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 
 	spin_lock(&kthread_create_lock);
 	list_add_tail(&create.list, &kthread_create_list);
-	wake_up_process(kthreadd_task);
 	spin_unlock(&kthread_create_lock);
 
+	wake_up_process(kthreadd_task);
 	wait_for_completion(&create.done);
 
 	if (!IS_ERR(create.result)) {
-- 
cgit v1.3-14-g43fede


From 1aeb272cf09f9e2cbc62163b9f37a9b4d1c7e81d Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Tue, 29 Apr 2008 00:59:25 -0700
Subject: kernel: explicitly include required header files under kernel/

Following an experimental deletion of the unnecessary directive

 #include <linux/slab.h>

from the header file <linux/percpu.h>, these files under kernel/ were exposed
as needing to include one of <linux/slab.h> or <linux/gfp.h>, so explicit
includes were added where necessary.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/devres.c     | 1 +
 kernel/irq/manage.c     | 1 +
 kernel/marker.c         | 1 +
 kernel/ns_cgroup.c      | 1 +
 kernel/rcutorture.c     | 1 +
 kernel/res_counter.c    | 1 +
 kernel/time.c           | 1 +
 kernel/user_namespace.c | 1 +
 kernel/utsname.c        | 1 +
 9 files changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6d9204f3a370..38a25b8d8bff 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/device.h>
+#include <linux/gfp.h>
 
 /*
  * Device resource management aware IRQ request/free implementation.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a01464287..46e4ad1723f0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
+#include <linux/slab.h>
 
 #include "internals.h"
 
diff --git a/kernel/marker.c b/kernel/marker.c
index 005b95954593..139260e5460c 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -23,6 +23,7 @@
 #include <linux/rcupdate.h>
 #include <linux/marker.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 extern struct marker __start___markers[];
 extern struct marker __stop___markers[];
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index aead4d69f62b..18df038d7cd5 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 struct ns_cgroup {
 	struct cgroup_subsys_state css;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 47894f919d4e..33acc424667e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -45,6 +45,7 @@
 #include <linux/byteorder/swabb.h>
 #include <linux/stat.h>
 #include <linux/srcu.h>
+#include <linux/slab.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index efbfc0fc232f..a508c2769463 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/parser.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/res_counter.h>
 #include <linux/uaccess.h>
 
diff --git a/kernel/time.c b/kernel/time.c
index 35d373a98782..86729042e4cd 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,6 +35,7 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4c9006275df7..2731ba80e30b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/version.h>
 #include <linux/nsproxy.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
 
 /*
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 816d7b24fa03..64d398f12444 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/version.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 /*
  * Clone a new ns copying an original utsname, setting refcount to 1
-- 
cgit v1.3-14-g43fede


From 5f97a5a8799b8d7d0afdb9d68a50a4e0e8298a05 Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Tue, 29 Apr 2008 00:59:43 -0700
Subject: isolate ratelimit from printk.c for other use

Due to the rcupreempt.h WARN_ON trigged, I got 2G syslog file.  For some
serious complaining of kernel, we need repeat the warnings, so here I isolate
the ratelimit part of printk.c to a standalone file.

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h |  1 +
 kernel/printk.c        | 26 +------------------------
 lib/Makefile           |  2 +-
 lib/ratelimit.c        | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 54 insertions(+), 26 deletions(-)
 create mode 100644 lib/ratelimit.c

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 28caa53dd1f7..ad5d05efcd1a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -188,6 +188,7 @@ extern int log_buf_copy(char *dest, int idx, int len);
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int printk_ratelimit(void);
+extern int __ratelimit(int ratelimit_jiffies, int ratelimit_burst);
 extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst);
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
diff --git a/kernel/printk.c b/kernel/printk.c
index bdd4ea8c3f2b..d3f9c0f788bf 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1287,31 +1287,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
  */
 int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 {
-	static DEFINE_SPINLOCK(ratelimit_lock);
-	static unsigned toks = 10 * 5 * HZ;
-	static unsigned long last_msg;
-	static int missed;
-	unsigned long flags;
-	unsigned long now = jiffies;
-
-	spin_lock_irqsave(&ratelimit_lock, flags);
-	toks += now - last_msg;
-	last_msg = now;
-	if (toks > (ratelimit_burst * ratelimit_jiffies))
-		toks = ratelimit_burst * ratelimit_jiffies;
-	if (toks >= ratelimit_jiffies) {
-		int lost = missed;
-
-		missed = 0;
-		toks -= ratelimit_jiffies;
-		spin_unlock_irqrestore(&ratelimit_lock, flags);
-		if (lost)
-			printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
-		return 1;
-	}
-	missed++;
-	spin_unlock_irqrestore(&ratelimit_lock, flags);
-	return 0;
+	return __ratelimit(ratelimit_jiffies, ratelimit_burst);
 }
 EXPORT_SYMBOL(__printk_ratelimit);
 
diff --git a/lib/Makefile b/lib/Makefile
index 2d7001b7f5a4..0ae4eb047aac 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -6,7 +6,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o prio_heap.o
+	 proportions.o prio_heap.o ratelimit.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
new file mode 100644
index 000000000000..485e3040dcd4
--- /dev/null
+++ b/lib/ratelimit.c
@@ -0,0 +1,51 @@
+/*
+ * ratelimit.c - Do something with rate limit.
+ *
+ * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+
+/*
+ * __ratelimit - rate limiting
+ * @ratelimit_jiffies: minimum time in jiffies between two callbacks
+ * @ratelimit_burst: number of callbacks we do before ratelimiting
+ *
+ * This enforces a rate limit: not more than @ratelimit_burst callbacks
+ * in every ratelimit_jiffies
+ */
+int __ratelimit(int ratelimit_jiffies, int ratelimit_burst)
+{
+	static DEFINE_SPINLOCK(ratelimit_lock);
+	static unsigned toks = 10 * 5 * HZ;
+	static unsigned long last_msg;
+	static int missed;
+	unsigned long flags;
+	unsigned long now = jiffies;
+
+	spin_lock_irqsave(&ratelimit_lock, flags);
+	toks += now - last_msg;
+	last_msg = now;
+	if (toks > (ratelimit_burst * ratelimit_jiffies))
+		toks = ratelimit_burst * ratelimit_jiffies;
+	if (toks >= ratelimit_jiffies) {
+		int lost = missed;
+
+		missed = 0;
+		toks -= ratelimit_jiffies;
+		spin_unlock_irqrestore(&ratelimit_lock, flags);
+		if (lost)
+			printk(KERN_WARNING "%s: %d messages suppressed\n",
+				__func__, lost);
+		return 1;
+	}
+	missed++;
+	spin_unlock_irqrestore(&ratelimit_lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(__ratelimit);
-- 
cgit v1.3-14-g43fede


From 6a3fd92e73fffd9e583650c56ad9558afe51dc5c Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 29 Apr 2008 00:59:52 -0700
Subject: eCryptfs: make key module subsystem respect namespaces

Make eCryptfs key module subsystem respect namespaces.

Since I will be removing the netlink interface in a future patch, I just made
changes to the netlink.c code so that it will not break the build.  With my
recent patches, the kernel module currently defaults to the device handle
interface rather than the netlink interface.

[akpm@linux-foundation.org: export free_user_ns()]
Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/ecryptfs_kernel.h | 25 ++++++++-----
 fs/ecryptfs/messaging.c       | 81 ++++++++++++++++++++++++++++++++-----------
 fs/ecryptfs/miscdev.c         | 68 +++++++++++++++++++++++-------------
 fs/ecryptfs/netlink.c         | 25 ++++++++-----
 kernel/user_namespace.c       |  1 +
 5 files changed, 136 insertions(+), 64 deletions(-)

(limited to 'kernel')

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 72e117706a68..951ee33a022d 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/scatterlist.h>
 #include <linux/hash.h>
+#include <linux/nsproxy.h>
 
 /* Version verification for shared data structures w/ userspace */
 #define ECRYPTFS_VERSION_MAJOR 0x00
@@ -410,8 +411,9 @@ struct ecryptfs_daemon {
 #define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008
 	u32 flags;
 	u32 num_queued_msg_ctx;
-	pid_t pid;
+	struct pid *pid;
 	uid_t euid;
+	struct user_namespace *user_ns;
 	struct task_struct *task;
 	struct mutex mux;
 	struct list_head msg_ctx_out_queue;
@@ -610,10 +612,13 @@ int
 ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		  size_t size, int flags);
 int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
-int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid);
-int ecryptfs_process_quit(uid_t uid, pid_t pid);
-int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t uid,
-			      pid_t pid, u32 seq);
+int ecryptfs_process_helo(unsigned int transport, uid_t euid,
+			  struct user_namespace *user_ns, struct pid *pid);
+int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
+			  struct pid *pid);
+int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
+			      struct user_namespace *user_ns, struct pid *pid,
+			      u32 seq);
 int ecryptfs_send_message(unsigned int transport, char *data, int data_len,
 			  struct ecryptfs_msg_ctx **msg_ctx);
 int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
@@ -623,13 +628,13 @@ void ecryptfs_release_messaging(unsigned int transport);
 
 int ecryptfs_send_netlink(char *data, int data_len,
 			  struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
-			  u16 msg_flags, pid_t daemon_pid);
+			  u16 msg_flags, struct pid *daemon_pid);
 int ecryptfs_init_netlink(void);
 void ecryptfs_release_netlink(void);
 
 int ecryptfs_send_connector(char *data, int data_len,
 			    struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
-			    u16 msg_flags, pid_t daemon_pid);
+			    u16 msg_flags, struct pid *daemon_pid);
 int ecryptfs_init_connector(void);
 void ecryptfs_release_connector(void);
 void
@@ -672,7 +677,8 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 				     struct inode *ecryptfs_inode);
 struct page *ecryptfs_get_locked_page(struct file *file, loff_t index);
 int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
-int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid);
+int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
+				 struct user_namespace *user_ns);
 int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
 				 size_t *length_size);
 int ecryptfs_write_packet_length(char *dest, size_t size,
@@ -684,6 +690,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
 			  u16 msg_flags, struct ecryptfs_daemon *daemon);
 void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);
 int
-ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, pid_t pid);
+ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
+		      struct user_namespace *user_ns, struct pid *pid);
 
 #endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index c6038bd60897..1b5c20058acb 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,8 @@
  * 02111-1307, USA.
  */
 #include <linux/sched.h>
+#include <linux/user_namespace.h>
+#include <linux/nsproxy.h>
 #include "ecryptfs_kernel.h"
 
 static LIST_HEAD(ecryptfs_msg_ctx_free_list);
@@ -103,6 +105,7 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)
 /**
  * ecryptfs_find_daemon_by_euid
  * @euid: The effective user id which maps to the desired daemon id
+ * @user_ns: The namespace in which @euid applies
  * @daemon: If return value is zero, points to the desired daemon pointer
  *
  * Must be called with ecryptfs_daemon_hash_mux held.
@@ -111,7 +114,8 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)
  *
  * Returns zero if the user id exists in the list; non-zero otherwise.
  */
-int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid)
+int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
+				 struct user_namespace *user_ns)
 {
 	struct hlist_node *elem;
 	int rc;
@@ -119,7 +123,7 @@ int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid)
 	hlist_for_each_entry(*daemon, elem,
 			     &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)],
 			     euid_chain) {
-		if ((*daemon)->euid == euid) {
+		if ((*daemon)->euid == euid && (*daemon)->user_ns == user_ns) {
 			rc = 0;
 			goto out;
 		}
@@ -186,6 +190,7 @@ out:
  * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
  * @daemon: Pointer to set to newly allocated daemon struct
  * @euid: Effective user id for the daemon
+ * @user_ns: The namespace in which @euid applies
  * @pid: Process id for the daemon
  *
  * Must be called ceremoniously while in possession of
@@ -194,7 +199,8 @@ out:
  * Returns zero on success; non-zero otherwise
  */
 int
-ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, pid_t pid)
+ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
+		      struct user_namespace *user_ns, struct pid *pid)
 {
 	int rc = 0;
 
@@ -206,7 +212,8 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, pid_t pid)
 		goto out;
 	}
 	(*daemon)->euid = euid;
-	(*daemon)->pid = pid;
+	(*daemon)->user_ns = get_user_ns(user_ns);
+	(*daemon)->pid = get_pid(pid);
 	(*daemon)->task = current;
 	mutex_init(&(*daemon)->mux);
 	INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue);
@@ -222,6 +229,7 @@ out:
  * ecryptfs_process_helo
  * @transport: The underlying transport (netlink, etc.)
  * @euid: The user ID owner of the message
+ * @user_ns: The namespace in which @euid applies
  * @pid: The process ID for the userspace program that sent the
  *       message
  *
@@ -231,32 +239,33 @@ out:
  * Returns zero after adding a new daemon to the hash list;
  * non-zero otherwise.
  */
-int ecryptfs_process_helo(unsigned int transport, uid_t euid, pid_t pid)
+int ecryptfs_process_helo(unsigned int transport, uid_t euid,
+			  struct user_namespace *user_ns, struct pid *pid)
 {
 	struct ecryptfs_daemon *new_daemon;
 	struct ecryptfs_daemon *old_daemon;
 	int rc;
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid);
+	rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns);
 	if (rc != 0) {
 		printk(KERN_WARNING "Received request from user [%d] "
-		       "to register daemon [%d]; unregistering daemon "
-		       "[%d]\n", euid, pid, old_daemon->pid);
+		       "to register daemon [0x%p]; unregistering daemon "
+		       "[0x%p]\n", euid, pid, old_daemon->pid);
 		rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT,
 					       old_daemon);
 		if (rc)
 			printk(KERN_WARNING "Failed to send QUIT "
-			       "message to daemon [%d]; rc = [%d]\n",
+			       "message to daemon [0x%p]; rc = [%d]\n",
 			       old_daemon->pid, rc);
 		hlist_del(&old_daemon->euid_chain);
 		kfree(old_daemon);
 	}
-	rc = ecryptfs_spawn_daemon(&new_daemon, euid, pid);
+	rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid);
 	if (rc)
 		printk(KERN_ERR "%s: The gods are displeased with this attempt "
-		       "to create a new daemon object for euid [%d]; pid [%d]; "
-		       "rc = [%d]\n", __func__, euid, pid, rc);
+		       "to create a new daemon object for euid [%d]; pid "
+		       "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc);
 	mutex_unlock(&ecryptfs_daemon_hash_mux);
 	return rc;
 }
@@ -277,7 +286,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
 	    || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) {
 		rc = -EBUSY;
 		printk(KERN_WARNING "%s: Attempt to destroy daemon with pid "
-		       "[%d], but it is in the midst of a read or a poll\n",
+		       "[0x%p], but it is in the midst of a read or a poll\n",
 		       __func__, daemon->pid);
 		mutex_unlock(&daemon->mux);
 		goto out;
@@ -293,6 +302,10 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
 	hlist_del(&daemon->euid_chain);
 	if (daemon->task)
 		wake_up_process(daemon->task);
+	if (daemon->pid)
+		put_pid(daemon->pid);
+	if (daemon->user_ns)
+		put_user_ns(daemon->user_ns);
 	mutex_unlock(&daemon->mux);
 	memset(daemon, 0, sizeof(*daemon));
 	kfree(daemon);
@@ -303,6 +316,7 @@ out:
 /**
  * ecryptfs_process_quit
  * @euid: The user ID owner of the message
+ * @user_ns: The namespace in which @euid applies
  * @pid: The process ID for the userspace program that sent the
  *       message
  *
@@ -310,17 +324,18 @@ out:
  * it is the registered that is requesting the deletion. Returns zero
  * after deleting the desired daemon; non-zero otherwise.
  */
-int ecryptfs_process_quit(uid_t euid, pid_t pid)
+int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
+			  struct pid *pid)
 {
 	struct ecryptfs_daemon *daemon;
 	int rc;
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = ecryptfs_find_daemon_by_euid(&daemon, euid);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, euid, user_ns);
 	if (rc || !daemon) {
 		rc = -EINVAL;
 		printk(KERN_ERR "Received request from user [%d] to "
-		       "unregister unrecognized daemon [%d]\n", euid, pid);
+		       "unregister unrecognized daemon [0x%p]\n", euid, pid);
 		goto out_unlock;
 	}
 	rc = ecryptfs_exorcise_daemon(daemon);
@@ -354,11 +369,14 @@ out_unlock:
  * Returns zero on success; non-zero otherwise
  */
 int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
-			      pid_t pid, u32 seq)
+			      struct user_namespace *user_ns, struct pid *pid,
+			      u32 seq)
 {
 	struct ecryptfs_daemon *daemon;
 	struct ecryptfs_msg_ctx *msg_ctx;
 	size_t msg_size;
+	struct nsproxy *nsproxy;
+	struct user_namespace *current_user_ns;
 	int rc;
 
 	if (msg->index >= ecryptfs_message_buf_len) {
@@ -372,12 +390,25 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 	msg_ctx = &ecryptfs_msg_ctx_arr[msg->index];
 	mutex_lock(&msg_ctx->mux);
 	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = ecryptfs_find_daemon_by_euid(&daemon, msg_ctx->task->euid);
+	rcu_read_lock();
+	nsproxy = task_nsproxy(msg_ctx->task);
+	if (nsproxy == NULL) {
+		rc = -EBADMSG;
+		printk(KERN_ERR "%s: Receiving process is a zombie. Dropping "
+		       "message.\n", __func__);
+		rcu_read_unlock();
+		mutex_unlock(&ecryptfs_daemon_hash_mux);
+		goto wake_up;
+	}
+	current_user_ns = nsproxy->user_ns;
+	rc = ecryptfs_find_daemon_by_euid(&daemon, msg_ctx->task->euid,
+					  current_user_ns);
+	rcu_read_unlock();
 	mutex_unlock(&ecryptfs_daemon_hash_mux);
 	if (rc) {
 		rc = -EBADMSG;
 		printk(KERN_WARNING "%s: User [%d] received a "
-		       "message response from process [%d] but does "
+		       "message response from process [0x%p] but does "
 		       "not have a registered daemon\n", __func__,
 		       msg_ctx->task->euid, pid);
 		goto wake_up;
@@ -389,10 +420,17 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 		       euid, msg_ctx->task->euid);
 		goto unlock;
 	}
+	if (current_user_ns != user_ns) {
+		rc = -EBADMSG;
+		printk(KERN_WARNING "%s: Received message from user_ns "
+		       "[0x%p]; expected message from user_ns [0x%p]\n",
+		       __func__, user_ns, nsproxy->user_ns);
+		goto unlock;
+	}
 	if (daemon->pid != pid) {
 		rc = -EBADMSG;
 		printk(KERN_ERR "%s: User [%d] sent a message response "
-		       "from an unrecognized process [%d]\n",
+		       "from an unrecognized process [0x%p]\n",
 		       __func__, msg_ctx->task->euid, pid);
 		goto unlock;
 	}
@@ -446,7 +484,8 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
 	struct ecryptfs_daemon *daemon;
 	int rc;
 
-	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid,
+					  current->nsproxy->user_ns);
 	if (rc || !daemon) {
 		rc = -ENOTCONN;
 		printk(KERN_ERR "%s: User [%d] does not have a daemon "
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 0c559731ae34..788995efd1d3 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -46,7 +46,8 @@ ecryptfs_miscdev_poll(struct file *file, poll_table *pt)
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
 	/* TODO: Just use file->private_data? */
-	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid,
+					  current->nsproxy->user_ns);
 	BUG_ON(rc || !daemon);
 	mutex_lock(&daemon->mux);
 	mutex_unlock(&ecryptfs_daemon_hash_mux);
@@ -92,10 +93,12 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
 		       "count; rc = [%d]\n", __func__, rc);
 		goto out_unlock_daemon_list;
 	}
-	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid,
+					  current->nsproxy->user_ns);
 	if (rc || !daemon) {
 		rc = ecryptfs_spawn_daemon(&daemon, current->euid,
-					   current->pid);
+					   current->nsproxy->user_ns,
+					   task_pid(current));
 		if (rc) {
 			printk(KERN_ERR "%s: Error attempting to spawn daemon; "
 			       "rc = [%d]\n", __func__, rc);
@@ -103,18 +106,18 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
 		}
 	}
 	mutex_lock(&daemon->mux);
-	if (daemon->pid != current->pid) {
+	if (daemon->pid != task_pid(current)) {
 		rc = -EINVAL;
-		printk(KERN_ERR "%s: pid [%d] has registered with euid [%d], "
-		       "but pid [%d] has attempted to open the handle "
+		printk(KERN_ERR "%s: pid [0x%p] has registered with euid [%d], "
+		       "but pid [0x%p] has attempted to open the handle "
 		       "instead\n", __func__, daemon->pid, daemon->euid,
-		       current->pid);
+		       task_pid(current));
 		goto out_unlock_daemon;
 	}
 	if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) {
 		rc = -EBUSY;
 		printk(KERN_ERR "%s: Miscellaneous device handle may only be "
-		       "opened once per daemon; pid [%d] already has this "
+		       "opened once per daemon; pid [0x%p] already has this "
 		       "handle open\n", __func__, daemon->pid);
 		goto out_unlock_daemon;
 	}
@@ -147,10 +150,11 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file)
 	int rc;
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid,
+					  current->nsproxy->user_ns);
 	BUG_ON(rc || !daemon);
 	mutex_lock(&daemon->mux);
-	BUG_ON(daemon->pid != current->pid);
+	BUG_ON(daemon->pid != task_pid(current));
 	BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN));
 	daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN;
 	atomic_dec(&ecryptfs_num_miscdev_opens);
@@ -247,7 +251,8 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
 	/* TODO: Just use file->private_data? */
-	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid,
+					  current->nsproxy->user_ns);
 	BUG_ON(rc || !daemon);
 	mutex_lock(&daemon->mux);
 	if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
@@ -285,7 +290,8 @@ check_list:
 		goto check_list;
 	}
 	BUG_ON(current->euid != daemon->euid);
-	BUG_ON(current->pid != daemon->pid);
+	BUG_ON(current->nsproxy->user_ns != daemon->user_ns);
+	BUG_ON(task_pid(current) != daemon->pid);
 	msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue,
 				   struct ecryptfs_msg_ctx, daemon_out_list);
 	BUG_ON(!msg_ctx);
@@ -355,15 +361,18 @@ out_unlock_daemon:
 /**
  * ecryptfs_miscdev_helo
  * @euid: effective user id of miscdevess sending helo packet
+ * @user_ns: The namespace in which @euid applies
  * @pid: miscdevess id of miscdevess sending helo packet
  *
  * Returns zero on success; non-zero otherwise
  */
-static int ecryptfs_miscdev_helo(uid_t uid, pid_t pid)
+static int ecryptfs_miscdev_helo(uid_t euid, struct user_namespace *user_ns,
+				 struct pid *pid)
 {
 	int rc;
 
-	rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, uid, pid);
+	rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns,
+				   pid);
 	if (rc)
 		printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc);
 	return rc;
@@ -372,15 +381,17 @@ static int ecryptfs_miscdev_helo(uid_t uid, pid_t pid)
 /**
  * ecryptfs_miscdev_quit
  * @euid: effective user id of miscdevess sending quit packet
+ * @user_ns: The namespace in which @euid applies
  * @pid: miscdevess id of miscdevess sending quit packet
  *
  * Returns zero on success; non-zero otherwise
  */
-static int ecryptfs_miscdev_quit(uid_t euid, pid_t pid)
+static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns,
+				 struct pid *pid)
 {
 	int rc;
 
-	rc = ecryptfs_process_quit(euid, pid);
+	rc = ecryptfs_process_quit(euid, user_ns, pid);
 	if (rc)
 		printk(KERN_WARNING
 		       "Error processing QUIT message; rc = [%d]\n", rc);
@@ -392,13 +403,15 @@ static int ecryptfs_miscdev_quit(uid_t euid, pid_t pid)
  * @data: Bytes comprising struct ecryptfs_message
  * @data_size: sizeof(struct ecryptfs_message) + data len
  * @euid: Effective user id of miscdevess sending the miscdev response
+ * @user_ns: The namespace in which @euid applies
  * @pid: Miscdevess id of miscdevess sending the miscdev response
  * @seq: Sequence number for miscdev response packet
  *
  * Returns zero on success; non-zero otherwise
  */
 static int ecryptfs_miscdev_response(char *data, size_t data_size,
-					  uid_t euid, pid_t pid, u32 seq)
+				     uid_t euid, struct user_namespace *user_ns,
+				     struct pid *pid, u32 seq)
 {
 	struct ecryptfs_message *msg = (struct ecryptfs_message *)data;
 	int rc;
@@ -410,7 +423,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
 		rc = -EINVAL;
 		goto out;
 	}
-	rc = ecryptfs_process_response(msg, euid, pid, seq);
+	rc = ecryptfs_process_response(msg, euid, user_ns, pid, seq);
 	if (rc)
 		printk(KERN_ERR
 		       "Error processing response message; rc = [%d]\n", rc);
@@ -491,27 +504,32 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 		}
 		rc = ecryptfs_miscdev_response(&data[i], packet_size,
 					       current->euid,
-					       current->pid, seq);
+					       current->nsproxy->user_ns,
+					       task_pid(current), seq);
 		if (rc)
 			printk(KERN_WARNING "%s: Failed to deliver miscdev "
 			       "response to requesting operation; rc = [%d]\n",
 			       __func__, rc);
 		break;
 	case ECRYPTFS_MSG_HELO:
-		rc = ecryptfs_miscdev_helo(current->euid, current->pid);
+		rc = ecryptfs_miscdev_helo(current->euid,
+					   current->nsproxy->user_ns,
+					   task_pid(current));
 		if (rc) {
 			printk(KERN_ERR "%s: Error attempting to process "
-			       "helo from pid [%d]; rc = [%d]\n", __func__,
-			       current->pid, rc);
+			       "helo from pid [0x%p]; rc = [%d]\n", __func__,
+			       task_pid(current), rc);
 			goto out_free;
 		}
 		break;
 	case ECRYPTFS_MSG_QUIT:
-		rc = ecryptfs_miscdev_quit(current->euid, current->pid);
+		rc = ecryptfs_miscdev_quit(current->euid,
+					   current->nsproxy->user_ns,
+					   task_pid(current));
 		if (rc) {
 			printk(KERN_ERR "%s: Error attempting to process "
-			       "quit from pid [%d]; rc = [%d]\n", __func__,
-			       current->pid, rc);
+			       "quit from pid [0x%p]; rc = [%d]\n", __func__,
+			       task_pid(current), rc);
 			goto out_free;
 		}
 		break;
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index eb70f69d705d..e0abad62b395 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -45,7 +45,7 @@ static struct sock *ecryptfs_nl_sock;
  */
 int ecryptfs_send_netlink(char *data, int data_len,
 			  struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
-			  u16 msg_flags, pid_t daemon_pid)
+			  u16 msg_flags, struct pid *daemon_pid)
 {
 	struct sk_buff *skb;
 	struct nlmsghdr *nlh;
@@ -60,7 +60,7 @@ int ecryptfs_send_netlink(char *data, int data_len,
 		ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n");
 		goto out;
 	}
-	nlh = NLMSG_PUT(skb, daemon_pid, msg_ctx ? msg_ctx->counter : 0,
+	nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? msg_ctx->counter : 0,
 			msg_type, payload_len);
 	nlh->nlmsg_flags = msg_flags;
 	if (msg_ctx && payload_len) {
@@ -69,7 +69,7 @@ int ecryptfs_send_netlink(char *data, int data_len,
 		msg->data_len = data_len;
 		memcpy(msg->data, data, data_len);
 	}
-	rc = netlink_unicast(ecryptfs_nl_sock, skb, daemon_pid, 0);
+	rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0);
 	if (rc < 0) {
 		ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink "
 				"message; rc = [%d]\n", rc);
@@ -99,6 +99,7 @@ static int ecryptfs_process_nl_response(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh = nlmsg_hdr(skb);
 	struct ecryptfs_message *msg = NLMSG_DATA(nlh);
+	struct pid *pid;
 	int rc;
 
 	if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) {
@@ -107,8 +108,10 @@ static int ecryptfs_process_nl_response(struct sk_buff *skb)
 				"incorrectly specified data length\n");
 		goto out;
 	}
-	rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid,
-				       NETLINK_CREDS(skb)->pid, nlh->nlmsg_seq);
+	pid = find_get_pid(NETLINK_CREDS(skb)->pid);
+	rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL,
+				       pid, nlh->nlmsg_seq);
+	put_pid(pid);
 	if (rc)
 		printk(KERN_ERR
 		       "Error processing response message; rc = [%d]\n", rc);
@@ -126,11 +129,13 @@ out:
  */
 static int ecryptfs_process_nl_helo(struct sk_buff *skb)
 {
+	struct pid *pid;
 	int rc;
 
+	pid = find_get_pid(NETLINK_CREDS(skb)->pid);
 	rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK,
-				   NETLINK_CREDS(skb)->uid,
-				   NETLINK_CREDS(skb)->pid);
+				   NETLINK_CREDS(skb)->uid, NULL, pid);
+	put_pid(pid);
 	if (rc)
 		printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc);
 	return rc;
@@ -147,10 +152,12 @@ static int ecryptfs_process_nl_helo(struct sk_buff *skb)
  */
 static int ecryptfs_process_nl_quit(struct sk_buff *skb)
 {
+	struct pid *pid;
 	int rc;
 
-	rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid,
-				   NETLINK_CREDS(skb)->pid);
+	pid = find_get_pid(NETLINK_CREDS(skb)->pid);
+	rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid);
+	put_pid(pid);
 	if (rc)
 		printk(KERN_WARNING
 		       "Error processing QUIT message; rc = [%d]\n", rc);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2731ba80e30b..a9ab0596de44 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -74,3 +74,4 @@ void free_user_ns(struct kref *kref)
 	release_uids(ns);
 	kfree(ns);
 }
+EXPORT_SYMBOL(free_user_ns);
-- 
cgit v1.3-14-g43fede


From 3df91fe30a1547af7e794c6e8cca76f4932c6ad7 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 29 Apr 2008 00:59:54 -0700
Subject: make cgroup_enable_task_cg_lists() static

Make the needlessly global cgroup_enable_task_cg_lists() static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6d8de051382b..e7da66efc9fc 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1715,7 +1715,7 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
  * The tasklist_lock is not held here, as do_each_thread() and
  * while_each_thread() are protected by RCU.
  */
-void cgroup_enable_task_cg_lists(void)
+static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 	write_lock(&css_set_lock);
-- 
cgit v1.3-14-g43fede


From 4fe91d518e4958af7edebbeb112a3272b2be232d Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Tue, 29 Apr 2008 00:59:55 -0700
Subject: cgroup: fix sparse warning of shadow symbol in cgroup.c

Fix a code warning: symbol 'p' shadows an earlier one

This is a reincarnation of Harvey Harrison's patch:
	cpuset: sparse warnings in cpuset.c

Independently, Cliff Wickman moved the affected code,
from kernel/cpuset.c to kernel/cgroup.c, in his patch:
	cpusets: update_cpumask revision

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Cc: Cliff Wickman <cpw@sgi.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e7da66efc9fc..068f58da855a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1913,14 +1913,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 
 	if (heap->size) {
 		for (i = 0; i < heap->size; i++) {
-			struct task_struct *p = heap->ptrs[i];
+			struct task_struct *q = heap->ptrs[i];
 			if (i == 0) {
-				latest_time = p->start_time;
-				latest_task = p;
+				latest_time = q->start_time;
+				latest_task = q;
 			}
 			/* Process the task per the caller's callback */
-			scan->process_task(p, scan);
-			put_task_struct(p);
+			scan->process_task(q, scan);
+			put_task_struct(q);
 		}
 		/*
 		 * If we had to process any tasks at all, scan again
-- 
cgit v1.3-14-g43fede


From 3ff31d0cca38b3c20e88a022bf38c4f7c98492f0 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 29 Apr 2008 00:59:55 -0700
Subject: cgroups: kernel/ns_cgroup.c should #include <linux/nsproxy.h>

Every file should include the headers containing the externs its global
functions (in this case for ns_cgroup_clone()).

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ns_cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 18df038d7cd5..48d7ed6fc3a4 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -8,6 +8,7 @@
 #include <linux/cgroup.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/nsproxy.h>
 
 struct ns_cgroup {
 	struct cgroup_subsys_state css;
-- 
cgit v1.3-14-g43fede


From f4c753b7eacc277e506066abdda351cbc1cf8e6a Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 00:59:56 -0700
Subject: CGroup API files: rename read/write_uint methods to read_write_u64

Several people have justifiably complained that the "_uint" suffix is
inappropriate for functions that handle u64 values, so this patch just renames
all these functions and their users to have the suffic _u64.

[peterz@infradead.org: build fix]
Signed-off-by: Paul Menage <menage@google.com>
Cc: "Li Zefan" <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "YAMAMOTO Takashi" <yamamoto@valinux.co.jp>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  8 ++++----
 kernel/cgroup.c        | 32 ++++++++++++++++----------------
 kernel/cgroup_debug.c  |  8 ++++----
 kernel/sched.c         | 16 ++++++++--------
 4 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a6a6035a4e1e..058371c5d360 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -190,20 +190,20 @@ struct cftype {
 			 struct file *file,
 			 char __user *buf, size_t nbytes, loff_t *ppos);
 	/*
-	 * read_uint() is a shortcut for the common case of returning a
+	 * read_u64() is a shortcut for the common case of returning a
 	 * single integer. Use it in place of read()
 	 */
-	u64 (*read_uint) (struct cgroup *cgrp, struct cftype *cft);
+	u64 (*read_u64) (struct cgroup *cgrp, struct cftype *cft);
 	ssize_t (*write) (struct cgroup *cgrp, struct cftype *cft,
 			  struct file *file,
 			  const char __user *buf, size_t nbytes, loff_t *ppos);
 
 	/*
-	 * write_uint() is a shortcut for the common case of accepting
+	 * write_u64() is a shortcut for the common case of accepting
 	 * a single integer (as parsed by simple_strtoull) from
 	 * userspace. Use in place of write(); return 0 or error.
 	 */
-	int (*write_uint) (struct cgroup *cgrp, struct cftype *cft, u64 val);
+	int (*write_u64) (struct cgroup *cgrp, struct cftype *cft, u64 val);
 
 	int (*release) (struct inode *inode, struct file *file);
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 068f58da855a..0bd79a81666a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1311,10 +1311,10 @@ enum cgroup_filetype {
 	FILE_RELEASE_AGENT,
 };
 
-static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
-				 struct file *file,
-				 const char __user *userbuf,
-				 size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft,
+				struct file *file,
+				const char __user *userbuf,
+				size_t nbytes, loff_t *unused_ppos)
 {
 	char buffer[64];
 	int retval = 0;
@@ -1338,7 +1338,7 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
 		return -EINVAL;
 
 	/* Pass to subsystem */
-	retval = cft->write_uint(cgrp, cft, val);
+	retval = cft->write_u64(cgrp, cft, val);
 	if (!retval)
 		retval = nbytes;
 	return retval;
@@ -1419,18 +1419,18 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 		return -ENODEV;
 	if (cft->write)
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
-	if (cft->write_uint)
-		return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->write_u64)
+		return cgroup_write_u64(cgrp, cft, file, buf, nbytes, ppos);
 	return -EINVAL;
 }
 
-static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft,
-				   struct file *file,
-				   char __user *buf, size_t nbytes,
-				   loff_t *ppos)
+static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
+			       struct file *file,
+			       char __user *buf, size_t nbytes,
+			       loff_t *ppos)
 {
 	char tmp[64];
-	u64 val = cft->read_uint(cgrp, cft);
+	u64 val = cft->read_u64(cgrp, cft);
 	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
 
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
@@ -1490,8 +1490,8 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 
 	if (cft->read)
 		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
-	if (cft->read_uint)
-		return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->read_u64)
+		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
 	return -EINVAL;
 }
 
@@ -2158,14 +2158,14 @@ static struct cftype files[] = {
 
 	{
 		.name = "notify_on_release",
-		.read_uint = cgroup_read_notify_on_release,
+		.read_u64 = cgroup_read_notify_on_release,
 		.write = cgroup_common_file_write,
 		.private = FILE_NOTIFY_ON_RELEASE,
 	},
 
 	{
 		.name = "releasable",
-		.read_uint = cgroup_read_releasable,
+		.read_u64 = cgroup_read_releasable,
 		.private = FILE_RELEASABLE,
 	}
 };
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index 37301e877cb0..cbb7a26f4ea3 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -65,21 +65,21 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
 static struct cftype files[] =  {
 	{
 		.name = "cgroup_refcount",
-		.read_uint = cgroup_refcount_read,
+		.read_u64 = cgroup_refcount_read,
 	},
 	{
 		.name = "taskcount",
-		.read_uint = taskcount_read,
+		.read_u64 = taskcount_read,
 	},
 
 	{
 		.name = "current_css_set",
-		.read_uint = current_css_set_read,
+		.read_u64 = current_css_set_read,
 	},
 
 	{
 		.name = "current_css_set_refcount",
-		.read_uint = current_css_set_refcount_read,
+		.read_u64 = current_css_set_refcount_read,
 	},
 };
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 740fb409e5bb..2528fbd974b4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9057,13 +9057,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
 {
 	return sched_group_set_shares(cgroup_tg(cgrp), shareval);
 }
 
-static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
 
@@ -9133,8 +9133,8 @@ static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
-		.read_uint = cpu_shares_read_uint,
-		.write_uint = cpu_shares_write_uint,
+		.read_u64 = cpu_shares_read_u64,
+		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -9145,8 +9145,8 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "rt_period_us",
-		.read_uint = cpu_rt_period_read_uint,
-		.write_uint = cpu_rt_period_write_uint,
+		.read_u64 = cpu_rt_period_read_uint,
+		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
 };
@@ -9277,8 +9277,8 @@ out:
 static struct cftype files[] = {
 	{
 		.name = "usage",
-		.read_uint = cpuusage_read,
-		.write_uint = cpuusage_write,
+		.read_u64 = cpuusage_read,
+		.write_u64 = cpuusage_write,
 	},
 };
 
-- 
cgit v1.3-14-g43fede


From 2c7eabf37647dd459d555e76954b4de87be2321f Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 00:59:58 -0700
Subject: CGroup API files: add res_counter_read_u64()

Adds a function for returning the value of a resource counter member, in a
form suitable for use in a cgroup read_u64 control file method.

Signed-off-by: Paul Menage <menage@google.com>
Cc: "Li Zefan" <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "YAMAMOTO Takashi" <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/res_counter.h | 5 ++++-
 kernel/res_counter.c        | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 61363ce896d5..8cb1ecd420a9 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -39,8 +39,9 @@ struct res_counter {
 	spinlock_t lock;
 };
 
-/*
+/**
  * Helpers to interact with userspace
+ * res_counter_read_u64() - returns the value of the specified member.
  * res_counter_read/_write - put/get the specified fields from the
  * res_counter struct to/from the user
  *
@@ -51,6 +52,8 @@ struct res_counter {
  * @pos:     and the offset.
  */
 
+u64 res_counter_read_u64(struct res_counter *counter, int member);
+
 ssize_t res_counter_read(struct res_counter *counter, int member,
 		const char __user *buf, size_t nbytes, loff_t *pos,
 		int (*read_strategy)(unsigned long long val, char *s));
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index a508c2769463..70587657dda3 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -93,6 +93,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
 			pos, buf, s - buf);
 }
 
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+	return *res_counter_member(counter, member);
+}
+
 ssize_t res_counter_write(struct res_counter *counter, int member,
 		const char __user *userbuf, size_t nbytes, loff_t *pos,
 		int (*write_strategy)(char *st_buf, unsigned long long *val))
-- 
cgit v1.3-14-g43fede


From b7269dfc826fbf554c9e6a9eaa4e6ff95fa08656 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 00:59:59 -0700
Subject: CGroup API files: strip all trailing whitespace in cgroup_write_u64

This removes the need for people to remember to pass the -n flag to echo when
writing values to cgroup control files.

Signed-off-by: Paul Menage <menage@google.com>
Cc: "Li Zefan" <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "YAMAMOTO Takashi" <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0bd79a81666a..57afdde871ac 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1329,10 +1329,7 @@ static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft,
 		return -EFAULT;
 
 	buffer[nbytes] = 0;     /* nul-terminate */
-
-	/* strip newline if necessary */
-	if (nbytes && (buffer[nbytes-1] == '\n'))
-		buffer[nbytes-1] = 0;
+	strstrip(buffer);
 	val = simple_strtoull(buffer, &end, 0);
 	if (*end)
 		return -EINVAL;
-- 
cgit v1.3-14-g43fede


From 700fe1ab99240c1a9c4d155e2a0612a1b044bb69 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 01:00:00 -0700
Subject: CGroup API files: update cpusets to use cgroup structured file API

Many of the cpusets control files are simple integer values, which don't
require the overhead of memory allocations for reads and writes.

Move the handlers for these control files into cpuset_read_u64() and
cpuset_write_u64().

[akpm@linux-foundation.org: ad dmissing `break']
Signed-off-by: Paul Menage <menage@google.com>
Cc: "Li Zefan" <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "YAMAMOTO Takashi" <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 160 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 83 insertions(+), 77 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 48a976c52cf5..832004935ca7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1023,19 +1023,6 @@ int current_cpuset_is_being_rebound(void)
 	return task_cs(current) == cpuset_being_rebound;
 }
 
-/*
- * Call with cgroup_mutex held.
- */
-
-static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
-{
-	if (simple_strtoul(buf, NULL, 10) != 0)
-		cpuset_memory_pressure_enabled = 1;
-	else
-		cpuset_memory_pressure_enabled = 0;
-	return 0;
-}
-
 static int update_relax_domain_level(struct cpuset *cs, char *buf)
 {
 	int val = simple_strtol(buf, NULL, 10);
@@ -1063,15 +1050,13 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
  * Call with cgroup_mutex held.
  */
 
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+		       int turning_on)
 {
-	int turning_on;
 	struct cpuset trialcs;
 	int err;
 	int cpus_nonempty, balance_flag_changed;
 
-	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
-
 	trialcs = *cs;
 	if (turning_on)
 		set_bit(bit, &trialcs.flags);
@@ -1289,46 +1274,68 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
 	case FILE_MEMLIST:
 		retval = update_nodemask(cs, buffer);
 		break;
+	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+		retval = update_relax_domain_level(cs, buffer);
+		break;
+	default:
+		retval = -EINVAL;
+		goto out2;
+	}
+
+	if (retval == 0)
+		retval = nbytes;
+out2:
+	cgroup_unlock();
+out1:
+	kfree(buffer);
+	return retval;
+}
+
+static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	int retval = 0;
+	struct cpuset *cs = cgroup_cs(cgrp);
+	cpuset_filetype_t type = cft->private;
+
+	cgroup_lock();
+
+	if (cgroup_is_removed(cgrp)) {
+		cgroup_unlock();
+		return -ENODEV;
+	}
+
+	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
-		retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
 		break;
 	case FILE_MEM_EXCLUSIVE:
-		retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
 		break;
 	case FILE_SCHED_LOAD_BALANCE:
-		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
-		break;
-	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
-		retval = update_relax_domain_level(cs, buffer);
+		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
 		break;
 	case FILE_MEMORY_MIGRATE:
-		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
 		break;
 	case FILE_MEMORY_PRESSURE_ENABLED:
-		retval = update_memory_pressure_enabled(cs, buffer);
+		cpuset_memory_pressure_enabled = !!val;
 		break;
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
 	case FILE_SPREAD_PAGE:
-		retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+		retval = update_flag(CS_SPREAD_PAGE, cs, val);
 		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	case FILE_SPREAD_SLAB:
-		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+		retval = update_flag(CS_SPREAD_SLAB, cs, val);
 		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	default:
 		retval = -EINVAL;
-		goto out2;
+		break;
 	}
-
-	if (retval == 0)
-		retval = nbytes;
-out2:
 	cgroup_unlock();
-out1:
-	kfree(buffer);
 	return retval;
 }
 
@@ -1390,33 +1397,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
 	case FILE_MEMLIST:
 		s += cpuset_sprintf_memlist(s, cs);
 		break;
-	case FILE_CPU_EXCLUSIVE:
-		*s++ = is_cpu_exclusive(cs) ? '1' : '0';
-		break;
-	case FILE_MEM_EXCLUSIVE:
-		*s++ = is_mem_exclusive(cs) ? '1' : '0';
-		break;
-	case FILE_SCHED_LOAD_BALANCE:
-		*s++ = is_sched_load_balance(cs) ? '1' : '0';
-		break;
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		s += sprintf(s, "%d", cs->relax_domain_level);
 		break;
-	case FILE_MEMORY_MIGRATE:
-		*s++ = is_memory_migrate(cs) ? '1' : '0';
-		break;
-	case FILE_MEMORY_PRESSURE_ENABLED:
-		*s++ = cpuset_memory_pressure_enabled ? '1' : '0';
-		break;
-	case FILE_MEMORY_PRESSURE:
-		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
-		break;
-	case FILE_SPREAD_PAGE:
-		*s++ = is_spread_page(cs) ? '1' : '0';
-		break;
-	case FILE_SPREAD_SLAB:
-		*s++ = is_spread_slab(cs) ? '1' : '0';
-		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1429,8 +1412,31 @@ out:
 	return retval;
 }
 
-
-
+static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+	struct cpuset *cs = cgroup_cs(cont);
+	cpuset_filetype_t type = cft->private;
+	switch (type) {
+	case FILE_CPU_EXCLUSIVE:
+		return is_cpu_exclusive(cs);
+	case FILE_MEM_EXCLUSIVE:
+		return is_mem_exclusive(cs);
+	case FILE_SCHED_LOAD_BALANCE:
+		return is_sched_load_balance(cs);
+	case FILE_MEMORY_MIGRATE:
+		return is_memory_migrate(cs);
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		return cpuset_memory_pressure_enabled;
+	case FILE_MEMORY_PRESSURE:
+		return fmeter_getrate(&cs->fmeter);
+	case FILE_SPREAD_PAGE:
+		return is_spread_page(cs);
+	case FILE_SPREAD_SLAB:
+		return is_spread_slab(cs);
+	default:
+		BUG();
+	}
+}
 
 
 /*
@@ -1453,22 +1459,22 @@ static struct cftype cft_mems = {
 
 static struct cftype cft_cpu_exclusive = {
 	.name = "cpu_exclusive",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_CPU_EXCLUSIVE,
 };
 
 static struct cftype cft_mem_exclusive = {
 	.name = "mem_exclusive",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_MEM_EXCLUSIVE,
 };
 
 static struct cftype cft_sched_load_balance = {
 	.name = "sched_load_balance",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_SCHED_LOAD_BALANCE,
 };
 
@@ -1481,36 +1487,36 @@ static struct cftype cft_sched_relax_domain_level = {
 
 static struct cftype cft_memory_migrate = {
 	.name = "memory_migrate",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_MEMORY_MIGRATE,
 };
 
 static struct cftype cft_memory_pressure_enabled = {
 	.name = "memory_pressure_enabled",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_MEMORY_PRESSURE_ENABLED,
 };
 
 static struct cftype cft_memory_pressure = {
 	.name = "memory_pressure",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_MEMORY_PRESSURE,
 };
 
 static struct cftype cft_spread_page = {
 	.name = "memory_spread_page",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_SPREAD_PAGE,
 };
 
 static struct cftype cft_spread_slab = {
 	.name = "memory_spread_slab",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
+	.read_u64 = cpuset_read_u64,
+	.write_u64 = cpuset_write_u64,
 	.private = FILE_SPREAD_SLAB,
 };
 
@@ -1643,7 +1649,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 	cpuset_update_task_memory_state();
 
 	if (is_sched_load_balance(cs))
-		update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
+		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
 	number_of_cpusets--;
 	kfree(cs);
-- 
cgit v1.3-14-g43fede


From 9179656961adcea3c25403365597e486d851ac5e Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 01:00:01 -0700
Subject: CGroup API files: add cgroup map data type

Adds a new type of supported control file representation, a map from strings
to u64 values.

Each map entry is printed as a line in a similar format to /proc/vmstat, i.e.
"$key $value\n"

Signed-off-by: Paul Menage <menage@google.com>
Cc: "Li Zefan" <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "YAMAMOTO Takashi" <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 19 ++++++++++++++++++
 kernel/cgroup.c        | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 058371c5d360..0a3ab670dd2f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -166,6 +166,16 @@ struct css_set {
 
 };
 
+/*
+ * cgroup_map_cb is an abstract callback API for reporting map-valued
+ * control files
+ */
+
+struct cgroup_map_cb {
+	int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
+	void *state;
+};
+
 /* struct cftype:
  *
  * The files in the cgroup filesystem mostly have a very simple read/write
@@ -194,6 +204,15 @@ struct cftype {
 	 * single integer. Use it in place of read()
 	 */
 	u64 (*read_u64) (struct cgroup *cgrp, struct cftype *cft);
+	/*
+	 * read_map() is used for defining a map of key/value
+	 * pairs. It should call cb->fill(cb, key, value) for each
+	 * entry. The key/value pairs (and their ordering) should not
+	 * change between reboots.
+	 */
+	int (*read_map) (struct cgroup *cont, struct cftype *cft,
+			 struct cgroup_map_cb *cb);
+
 	ssize_t (*write) (struct cgroup *cgrp, struct cftype *cft,
 			  struct file *file,
 			  const char __user *buf, size_t nbytes, loff_t *ppos);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 57afdde871ac..693bcc03188b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1492,6 +1492,46 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 	return -EINVAL;
 }
 
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+struct cgroup_seqfile_state {
+	struct cftype *cft;
+	struct cgroup *cgroup;
+};
+
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+	struct seq_file *sf = cb->state;
+	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+	struct cgroup_seqfile_state *state = m->private;
+	struct cftype *cft = state->cft;
+	struct cgroup_map_cb cb = {
+		.fill = cgroup_map_add,
+		.state = m,
+	};
+	return cft->read_map(state->cgroup, cft, &cb);
+}
+
+int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	kfree(seq->private);
+	return single_release(inode, file);
+}
+
+static struct file_operations cgroup_seqfile_operations = {
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = cgroup_seqfile_release,
+};
+
 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
 	int err;
@@ -1504,7 +1544,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 	cft = __d_cft(file->f_dentry);
 	if (!cft)
 		return -ENODEV;
-	if (cft->open)
+	if (cft->read_map) {
+		struct cgroup_seqfile_state *state =
+			kzalloc(sizeof(*state), GFP_USER);
+		if (!state)
+			return -ENOMEM;
+		state->cft = cft;
+		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+		file->f_op = &cgroup_seqfile_operations;
+		err = single_open(file, cgroup_seqfile_show, state);
+		if (err < 0)
+			kfree(state);
+	} else if (cft->open)
 		err = cft->open(inode, file);
 	else
 		err = 0;
-- 
cgit v1.3-14-g43fede


From 3116f0e3df0a67ad56f15dd4c5f6cefb04bb4a98 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 01:00:04 -0700
Subject: CGroup API files: move "releasable" to cgroup_debug subsystem

The "releasable" control file provided by the cgroup framework exports the
state of a per-cgroup flag that's related to the notify-on-release feature.
This isn't really generally useful, unless you're trying to debug this
particular feature of cgroups.

This patch moves the "releasable" file to the cgroup_debug subsystem.

Signed-off-by: Paul Menage <menage@google.com>
Cc: "Li Zefan" <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "YAMAMOTO Takashi" <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 11 +++++++++++
 kernel/cgroup.c        | 23 -----------------------
 kernel/cgroup_debug.c  | 12 +++++++++++-
 3 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0a3ab670dd2f..b40fd5ee9a76 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -88,6 +88,17 @@ static inline void css_put(struct cgroup_subsys_state *css)
 		__css_put(css);
 }
 
+/* bits in struct cgroup flags field */
+enum {
+	/* Control Group is dead */
+	CGRP_REMOVED,
+	/* Control Group has previously had a child cgroup or a task,
+	 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
+	CGRP_RELEASABLE,
+	/* Control Group requires release notifications to userspace */
+	CGRP_NOTIFY_ON_RELEASE,
+};
+
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 693bcc03188b..b5ef0c4772f7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -119,17 +119,6 @@ static int root_count;
  */
 static int need_forkexit_callback;
 
-/* bits in struct cgroup flags field */
-enum {
-	/* Control Group is dead */
-	CGRP_REMOVED,
-	/* Control Group has previously had a child cgroup or a task,
-	 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
-	CGRP_RELEASABLE,
-	/* Control Group requires release notifications to userspace */
-	CGRP_NOTIFY_ON_RELEASE,
-};
-
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -1307,7 +1296,6 @@ enum cgroup_filetype {
 	FILE_DIR,
 	FILE_TASKLIST,
 	FILE_NOTIFY_ON_RELEASE,
-	FILE_RELEASABLE,
 	FILE_RELEASE_AGENT,
 };
 
@@ -2186,11 +2174,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
 	return notify_on_release(cgrp);
 }
 
-static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
-{
-	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
-
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -2210,12 +2193,6 @@ static struct cftype files[] = {
 		.write = cgroup_common_file_write,
 		.private = FILE_NOTIFY_ON_RELEASE,
 	},
-
-	{
-		.name = "releasable",
-		.read_u64 = cgroup_read_releasable,
-		.private = FILE_RELEASABLE,
-	}
 };
 
 static struct cftype cft_release_agent = {
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index cbb7a26f4ea3..c3dc3aba4c02 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -1,5 +1,5 @@
 /*
- * kernel/ccontainer_debug.c - Example cgroup subsystem that
+ * kernel/cgroup_debug.c - Example cgroup subsystem that
  * exposes debug info
  *
  * Copyright (C) Google Inc, 2007
@@ -62,6 +62,11 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
 	return count;
 }
 
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
+
 static struct cftype files[] =  {
 	{
 		.name = "cgroup_refcount",
@@ -81,6 +86,11 @@ static struct cftype files[] =  {
 		.name = "current_css_set_refcount",
 		.read_u64 = current_css_set_refcount_read,
 	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	}
 };
 
 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-- 
cgit v1.3-14-g43fede


From e73d2c61d1fcbd3621688ae457b49509c8d4c601 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 01:00:06 -0700
Subject: CGroups _s64 files: add cgroups read_s64/write_s64 file methods

These patches add cgroups read_s64 and write_s64 control file methods (the
signed equivalent of read_u64/write_u64) and use them to implement the
cpu.rt_runtime_us control file in the CFS cgroup subsystem.

This patch:

These are the signed equivalents of the read_u64/write_u64 methods

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  8 ++++++++
 kernel/cgroup.c        | 38 ++++++++++++++++++++++++++++----------
 2 files changed, 36 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b40fd5ee9a76..785a01cfb49d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -215,6 +215,10 @@ struct cftype {
 	 * single integer. Use it in place of read()
 	 */
 	u64 (*read_u64) (struct cgroup *cgrp, struct cftype *cft);
+	/*
+	 * read_s64() is a signed version of read_u64()
+	 */
+	s64 (*read_s64) (struct cgroup *cgrp, struct cftype *cft);
 	/*
 	 * read_map() is used for defining a map of key/value
 	 * pairs. It should call cb->fill(cb, key, value) for each
@@ -234,6 +238,10 @@ struct cftype {
 	 * userspace. Use in place of write(); return 0 or error.
 	 */
 	int (*write_u64) (struct cgroup *cgrp, struct cftype *cft, u64 val);
+	/*
+	 * write_s64() is a signed version of write_u64()
+	 */
+	int (*write_s64) (struct cgroup *cgrp, struct cftype *cft, s64 val);
 
 	int (*release) (struct inode *inode, struct file *file);
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b5ef0c4772f7..bd6122ccc0ba 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1299,14 +1299,13 @@ enum cgroup_filetype {
 	FILE_RELEASE_AGENT,
 };
 
-static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft,
+static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 				struct file *file,
 				const char __user *userbuf,
 				size_t nbytes, loff_t *unused_ppos)
 {
 	char buffer[64];
 	int retval = 0;
-	u64 val;
 	char *end;
 
 	if (!nbytes)
@@ -1318,12 +1317,17 @@ static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft,
 
 	buffer[nbytes] = 0;     /* nul-terminate */
 	strstrip(buffer);
-	val = simple_strtoull(buffer, &end, 0);
-	if (*end)
-		return -EINVAL;
-
-	/* Pass to subsystem */
-	retval = cft->write_u64(cgrp, cft, val);
+	if (cft->write_u64) {
+		u64 val = simple_strtoull(buffer, &end, 0);
+		if (*end)
+			return -EINVAL;
+		retval = cft->write_u64(cgrp, cft, val);
+	} else {
+		s64 val = simple_strtoll(buffer, &end, 0);
+		if (*end)
+			return -EINVAL;
+		retval = cft->write_s64(cgrp, cft, val);
+	}
 	if (!retval)
 		retval = nbytes;
 	return retval;
@@ -1404,8 +1408,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 		return -ENODEV;
 	if (cft->write)
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
-	if (cft->write_u64)
-		return cgroup_write_u64(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->write_u64 || cft->write_s64)
+		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
 	return -EINVAL;
 }
 
@@ -1421,6 +1425,18 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
+static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
+			       struct file *file,
+			       char __user *buf, size_t nbytes,
+			       loff_t *ppos)
+{
+	char tmp[64];
+	s64 val = cft->read_s64(cgrp, cft);
+	int len = sprintf(tmp, "%lld\n", (long long) val);
+
+	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+
 static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
 					  struct cftype *cft,
 					  struct file *file,
@@ -1477,6 +1493,8 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
 	if (cft->read_u64)
 		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->read_s64)
+		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
 	return -EINVAL;
 }
 
-- 
cgit v1.3-14-g43fede


From 06ecb27cfbf53ac2c7e397aa1619a6f9a98c5896 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 01:00:06 -0700
Subject: CGroups _s64 files: use read_s64/write_s64 in CFS cgroup for
 rt_runtime file

This removes some filesystem boilerplate from the CFS cgroup subsystem.

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 46 ++++++----------------------------------------
 1 file changed, 6 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2528fbd974b4..e2f7f5acc807 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9073,48 +9073,14 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
-				struct file *file,
-				const char __user *userbuf,
-				size_t nbytes, loff_t *unused_ppos)
+				s64 val)
 {
-	char buffer[64];
-	int retval = 0;
-	s64 val;
-	char *end;
-
-	if (!nbytes)
-		return -EINVAL;
-	if (nbytes >= sizeof(buffer))
-		return -E2BIG;
-	if (copy_from_user(buffer, userbuf, nbytes))
-		return -EFAULT;
-
-	buffer[nbytes] = 0;     /* nul-terminate */
-
-	/* strip newline if necessary */
-	if (nbytes && (buffer[nbytes-1] == '\n'))
-		buffer[nbytes-1] = 0;
-	val = simple_strtoll(buffer, &end, 0);
-	if (*end)
-		return -EINVAL;
-
-	/* Pass to subsystem */
-	retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
-	if (!retval)
-		retval = nbytes;
-	return retval;
+	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
 }
 
-static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
-				   struct file *file,
-				   char __user *buf, size_t nbytes,
-				   loff_t *ppos)
+static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
 {
-	char tmp[64];
-	long val = sched_group_rt_runtime(cgroup_tg(cgrp));
-	int len = sprintf(tmp, "%ld\n", val);
-
-	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+	return sched_group_rt_runtime(cgroup_tg(cgrp));
 }
 
 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
@@ -9140,8 +9106,8 @@ static struct cftype cpu_files[] = {
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
-		.read = cpu_rt_runtime_read,
-		.write = cpu_rt_runtime_write,
+		.read_s64 = cpu_rt_runtime_read,
+		.write_s64 = cpu_rt_runtime_write,
 	},
 	{
 		.name = "rt_period_us",
-- 
cgit v1.3-14-g43fede


From 06a119204d3e1e67d393e996ed987b0df7998381 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Apr 2008 01:00:07 -0700
Subject: cgroup: annotate cgroup_init_subsys with __init

It is called by cgroup_init() and cgroup_init_early() only, which are
annotated with __init.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bd6122ccc0ba..97ab04c3fcf5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2444,7 +2444,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	return 0;
 }
 
-static void cgroup_init_subsys(struct cgroup_subsys *ss)
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 	struct list_head *l;
-- 
cgit v1.3-14-g43fede


From 46ae220bea40bd1cf4abec2d5cdfb4f9396c7115 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Apr 2008 01:00:08 -0700
Subject: cgroup: switch to proc_create()

There is a race between create_proc_entry() and the assignment of file ops.
proc_create() is invented to fix it.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 97ab04c3fcf5..436e26f4d624 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2545,7 +2545,6 @@ int __init cgroup_init(void)
 {
 	int err;
 	int i;
-	struct proc_dir_entry *entry;
 
 	err = bdi_init(&cgroup_backing_dev_info);
 	if (err)
@@ -2561,9 +2560,7 @@ int __init cgroup_init(void)
 	if (err < 0)
 		goto out;
 
-	entry = create_proc_entry("cgroups", 0, NULL);
-	if (entry)
-		entry->proc_fops = &proc_cgroupstats_operations;
+	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
 
 out:
 	if (err)
-- 
cgit v1.3-14-g43fede


From d447ea2f30ec60370ddb99a668e5ac12995f043d Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 29 Apr 2008 01:00:08 -0700
Subject: cgroups: add the trigger callback to struct cftype

Trigger callback can be used to receive a kick-up from the user space.  The
string written is ignored.

The cftype->private is used for multiplexing events.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Paul Menage <menage@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 8 ++++++++
 kernel/cgroup.c        | 4 ++++
 2 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 785a01cfb49d..2d1d151258cf 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -243,6 +243,14 @@ struct cftype {
 	 */
 	int (*write_s64) (struct cgroup *cgrp, struct cftype *cft, s64 val);
 
+	/*
+	 * trigger() callback can be used to get some kick from the
+	 * userspace, when the actual string written is not important
+	 * at all. The private field can be used to determine the
+	 * kick type for multiplexing.
+	 */
+	int (*trigger)(struct cgroup *cgrp, unsigned int event);
+
 	int (*release) (struct inode *inode, struct file *file);
 };
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 436e26f4d624..7c8cc5141877 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1410,6 +1410,10 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
 	if (cft->write_u64 || cft->write_s64)
 		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->trigger) {
+		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+		return ret ? ret : nbytes;
+	}
 	return -EINVAL;
 }
 
-- 
cgit v1.3-14-g43fede


From 472b1053f3c319cc60bfb2a0bb062fed77a93eb6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Apr 2008 01:00:11 -0700
Subject: cgroups: use a hash table for css_set finding

When we attach a process to a different cgroup, the css_set linked-list will
be run through to find a suitable existing css_set to use.  This patch
implements a hash table for better performance.

The following benchmarks have been tested:

For N in 1, 5, 10, 50, 100, 500, 1000, create N cgroups with one sleeping
task in each, and then move an additional task through each cgroup in
turn.

Here is a test result:

N	Loop	orig - Time(s)	hash - Time(s)
----------------------------------------------
1	10000	1.201231728	1.196311177
5	2000	1.065743872	1.040566424
10	1000	0.991054735	0.986876440
50	200	0.976554203	0.969608733
100	100	0.998504680	0.969218270
500	20	1.157347764	0.962602963
1000	10	1.619521852	1.085140172

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  7 +++++-
 kernel/cgroup.c        | 59 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 53 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2d1d151258cf..f585b7cde87b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -155,6 +155,12 @@ struct css_set {
 	 */
 	struct list_head list;
 
+	/*
+	 * List running through all cgroup groups in the same hash
+	 * slot. Protected by css_set_lock
+	 */
+	struct hlist_node hlist;
+
 	/*
 	 * List running through all tasks using this cgroup
 	 * group. Protected by css_set_lock
@@ -174,7 +180,6 @@ struct css_set {
 	 * during subsystem registration (at boot time).
 	 */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
-
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7c8cc5141877..c447c29f8749 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -44,6 +44,7 @@
 #include <linux/kmod.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
+#include <linux/hash.h>
 
 #include <asm/atomic.h>
 
@@ -193,6 +194,27 @@ static struct cg_cgroup_link init_css_set_link;
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;
 
+/* hash table for cgroup groups. This improves the performance to
+ * find an existing css_set */
+#define CSS_SET_HASH_BITS	7
+#define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
+static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+
+static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+{
+	int i;
+	int index;
+	unsigned long tmp = 0UL;
+
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		tmp += (unsigned long)css[i];
+	tmp = (tmp >> 16) ^ tmp;
+
+	index = hash_long(tmp, CSS_SET_HASH_BITS);
+
+	return &css_set_table[index];
+}
+
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
@@ -219,6 +241,7 @@ static int use_task_css_set_links;
 static void unlink_css_set(struct css_set *cg)
 {
 	write_lock(&css_set_lock);
+	hlist_del(&cg->hlist);
 	list_del(&cg->list);
 	css_set_count--;
 	while (!list_empty(&cg->cg_links)) {
@@ -284,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg)
 /*
  * find_existing_css_set() is a helper for
  * find_css_set(), and checks to see whether an existing
- * css_set is suitable. This currently walks a linked-list for
- * simplicity; a later patch will use a hash table for better
- * performance
+ * css_set is suitable.
  *
  * oldcg: the cgroup group that we're using before the cgroup
  * transition
@@ -303,7 +324,9 @@ static struct css_set *find_existing_css_set(
 {
 	int i;
 	struct cgroupfs_root *root = cgrp->root;
-	struct list_head *l = &init_css_set.list;
+	struct hlist_head *hhead;
+	struct hlist_node *node;
+	struct css_set *cg;
 
 	/* Built the set of subsystem state objects that we want to
 	 * see in the new css_set */
@@ -320,18 +343,13 @@ static struct css_set *find_existing_css_set(
 		}
 	}
 
-	/* Look through existing cgroup groups to find one to reuse */
-	do {
-		struct css_set *cg =
-			list_entry(l, struct css_set, list);
-
+	hhead = css_set_hash(template);
+	hlist_for_each_entry(cg, node, hhead, hlist) {
 		if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
 			/* All subsystems matched */
 			return cg;
 		}
-		/* Try the next cgroup group */
-		l = l->next;
-	} while (l != &init_css_set.list);
+	}
 
 	/* No existing cgroup group matched */
 	return NULL;
@@ -393,6 +411,8 @@ static struct css_set *find_css_set(
 	struct list_head tmp_cg_links;
 	struct cg_cgroup_link *link;
 
+	struct hlist_head *hhead;
+
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
 	write_lock(&css_set_lock);
@@ -417,6 +437,7 @@ static struct css_set *find_css_set(
 	kref_init(&res->ref);
 	INIT_LIST_HEAD(&res->cg_links);
 	INIT_LIST_HEAD(&res->tasks);
+	INIT_HLIST_NODE(&res->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
@@ -459,6 +480,11 @@ static struct css_set *find_css_set(
 	/* Link this cgroup group into the list */
 	list_add(&res->list, &init_css_set.list);
 	css_set_count++;
+
+	/* Add this cgroup group to the hash table */
+	hhead = css_set_hash(res->subsys);
+	hlist_add_head(&res->hlist, hhead);
+
 	write_unlock(&css_set_lock);
 
 	return res;
@@ -2508,6 +2534,7 @@ int __init cgroup_init_early(void)
 	INIT_LIST_HEAD(&init_css_set.list);
 	INIT_LIST_HEAD(&init_css_set.cg_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
+	INIT_HLIST_NODE(&init_css_set.hlist);
 	css_set_count = 1;
 	init_cgroup_root(&rootnode);
 	list_add(&rootnode.root_list, &roots);
@@ -2520,6 +2547,9 @@ int __init cgroup_init_early(void)
 	list_add(&init_css_set_link.cg_link_list,
 		 &init_css_set.cg_links);
 
+	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
+		INIT_HLIST_HEAD(&css_set_table[i]);
+
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 
@@ -2549,6 +2579,7 @@ int __init cgroup_init(void)
 {
 	int err;
 	int i;
+	struct hlist_head *hhead;
 
 	err = bdi_init(&cgroup_backing_dev_info);
 	if (err)
@@ -2560,6 +2591,10 @@ int __init cgroup_init(void)
 			cgroup_init_subsys(ss);
 	}
 
+	/* Add init_css_set to the hash table */
+	hhead = css_set_hash(init_css_set.subsys);
+	hlist_add_head(&init_css_set.hlist, hhead);
+
 	err = register_filesystem(&cgroup_fs_type);
 	if (err < 0)
 		goto out;
-- 
cgit v1.3-14-g43fede


From e8d55fdeb882cfcb5e8db5a5ce16edfba78aafc5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Apr 2008 01:00:13 -0700
Subject: cgroups: simplify init_subsys()

We are at system boot and there is only 1 cgroup group (i,e, init_css_set), so
we don't need to run through the css_set linked list.  Neither do we need to
run through the task list, since no processes have been created yet.

Also referring to a comment in cgroup.h:

struct css_set
{
	...
	/*
	 * Set of subsystem states, one for each subsystem. This array
	 * is immutable after creation apart from the init_css_set
	 * during subsystem registration (at boot time).
	 */
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
}

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/cgroups.txt |  3 +--
 kernel/cgroup.c           | 35 +++++++++--------------------------
 2 files changed, 10 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
index 31d12e21ff8a..c298a6690e0d 100644
--- a/Documentation/cgroups.txt
+++ b/Documentation/cgroups.txt
@@ -500,8 +500,7 @@ post-attachment activity that requires memory allocations or blocking.
 
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
 
-Called when a task is forked into a cgroup. Also called during
-registration for all existing tasks.
+Called when a task is forked into a cgroup.
 
 void exit(struct cgroup_subsys *ss, struct task_struct *task)
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c447c29f8749..b893c8c94858 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2477,7 +2477,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
-	struct list_head *l;
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
@@ -2488,35 +2487,19 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	BUG_ON(IS_ERR(css));
 	init_cgroup_css(css, ss, dummytop);
 
-	/* Update all cgroup groups to contain a subsys
+	/* Update the init_css_set to contain a subsys
 	 * pointer to this state - since the subsystem is
-	 * newly registered, all tasks and hence all cgroup
-	 * groups are in the subsystem's top cgroup. */
-	write_lock(&css_set_lock);
-	l = &init_css_set.list;
-	do {
-		struct css_set *cg =
-			list_entry(l, struct css_set, list);
-		cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
-		l = l->next;
-	} while (l != &init_css_set.list);
-	write_unlock(&css_set_lock);
-
- 	/* If this subsystem requested that it be notified with fork
- 	 * events, we should send it one now for every process in the
- 	 * system */
-	if (ss->fork) {
-		struct task_struct *g, *p;
-
-		read_lock(&tasklist_lock);
-		do_each_thread(g, p) {
-			ss->fork(ss, p);
-		} while_each_thread(g, p);
-		read_unlock(&tasklist_lock);
-	}
+	 * newly registered, all tasks and hence the
+	 * init_css_set is in the subsystem's top cgroup. */
+	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
 
 	need_forkexit_callback |= ss->fork || ss->exit;
 
+	/* At system boot, before all subsystems have been
+	 * registered, no tasks have been forked, so we don't
+	 * need to invoke fork callbacks here. */
+	BUG_ON(!list_empty(&init_task.tasks));
+
 	ss->active = 1;
 }
 
-- 
cgit v1.3-14-g43fede


From 28fd5dfc12bde391981dfdcf20755952b6e916af Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Apr 2008 01:00:13 -0700
Subject: cgroups: remove the css_set linked-list

Now we can run through the hash table instead of running through the
linked-list.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  6 ------
 kernel/cgroup.c        | 40 ++++++++++++++++++++--------------------
 2 files changed, 20 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f585b7cde87b..d58a958444ab 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -149,12 +149,6 @@ struct css_set {
 	/* Reference count */
 	struct kref ref;
 
-	/*
-	 * List running through all cgroup groups. Protected by
-	 * css_set_lock
-	 */
-	struct list_head list;
-
 	/*
 	 * List running through all cgroup groups in the same hash
 	 * slot. Protected by css_set_lock
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b893c8c94858..aeceb8868981 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -242,7 +242,6 @@ static void unlink_css_set(struct css_set *cg)
 {
 	write_lock(&css_set_lock);
 	hlist_del(&cg->hlist);
-	list_del(&cg->list);
 	css_set_count--;
 	while (!list_empty(&cg->cg_links)) {
 		struct cg_cgroup_link *link;
@@ -477,8 +476,6 @@ static struct css_set *find_css_set(
 
 	BUG_ON(!list_empty(&tmp_cg_links));
 
-	/* Link this cgroup group into the list */
-	list_add(&res->list, &init_css_set.list);
 	css_set_count++;
 
 	/* Add this cgroup group to the hash table */
@@ -963,7 +960,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *root;
-	struct list_head tmp_cg_links, *l;
+	struct list_head tmp_cg_links;
 	INIT_LIST_HEAD(&tmp_cg_links);
 
 	/* First find the desired set of subsystems */
@@ -1005,6 +1002,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		/* New superblock */
 		struct cgroup *cgrp = &root->top_cgroup;
 		struct inode *inode;
+		int i;
 
 		BUG_ON(sb->s_root != NULL);
 
@@ -1049,22 +1047,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
-		l = &init_css_set.list;
-		do {
+		for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+			struct hlist_head *hhead = &css_set_table[i];
+			struct hlist_node *node;
 			struct css_set *cg;
-			struct cg_cgroup_link *link;
-			cg = list_entry(l, struct css_set, list);
-			BUG_ON(list_empty(&tmp_cg_links));
-			link = list_entry(tmp_cg_links.next,
-					  struct cg_cgroup_link,
-					  cgrp_link_list);
-			list_del(&link->cgrp_link_list);
-			link->cg = cg;
-			list_add(&link->cgrp_link_list,
-				 &root->top_cgroup.css_sets);
-			list_add(&link->cg_link_list, &cg->cg_links);
-			l = l->next;
-		} while (l != &init_css_set.list);
+
+			hlist_for_each_entry(cg, node, hhead, hlist) {
+				struct cg_cgroup_link *link;
+
+				BUG_ON(list_empty(&tmp_cg_links));
+				link = list_entry(tmp_cg_links.next,
+						  struct cg_cgroup_link,
+						  cgrp_link_list);
+				list_del(&link->cgrp_link_list);
+				link->cg = cg;
+				list_add(&link->cgrp_link_list,
+					 &root->top_cgroup.css_sets);
+				list_add(&link->cg_link_list, &cg->cg_links);
+			}
+		}
 		write_unlock(&css_set_lock);
 
 		free_cg_links(&tmp_cg_links);
@@ -2514,7 +2515,6 @@ int __init cgroup_init_early(void)
 	int i;
 	kref_init(&init_css_set.ref);
 	kref_get(&init_css_set.ref);
-	INIT_LIST_HEAD(&init_css_set.list);
 	INIT_LIST_HEAD(&init_css_set.cg_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
 	INIT_HLIST_NODE(&init_css_set.hlist);
-- 
cgit v1.3-14-g43fede


From 29486df325e1fe6e1764afcb19e3370804c2b002 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Tue, 29 Apr 2008 01:00:14 -0700
Subject: cgroups: introduce cft->read_seq()

Introduce a read_seq() helper in cftype, which uses seq_file to print out
lists.  Use it in the devices cgroup.  Also split devices.allow into two
files, so now devices.deny and devices.allow are the ones to use to manipulate
the whitelist, while devices.list outputs the cgroup's current whitelist.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h   |  6 ++++
 kernel/cgroup.c          | 15 ++++++----
 security/device_cgroup.c | 74 +++++++++++++++---------------------------------
 3 files changed, 38 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d58a958444ab..095248082b7e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -226,6 +226,12 @@ struct cftype {
 	 */
 	int (*read_map) (struct cgroup *cont, struct cftype *cft,
 			 struct cgroup_map_cb *cb);
+	/*
+	 * read_seq_string() is used for outputting a simple sequence
+	 * using seqfile.
+	 */
+	int (*read_seq_string) (struct cgroup *cont, struct cftype *cft,
+			 struct seq_file *m);
 
 	ssize_t (*write) (struct cgroup *cgrp, struct cftype *cft,
 			  struct file *file,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index aeceb8868981..abc433772e5a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1549,11 +1549,14 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 {
 	struct cgroup_seqfile_state *state = m->private;
 	struct cftype *cft = state->cft;
-	struct cgroup_map_cb cb = {
-		.fill = cgroup_map_add,
-		.state = m,
-	};
-	return cft->read_map(state->cgroup, cft, &cb);
+	if (cft->read_map) {
+		struct cgroup_map_cb cb = {
+			.fill = cgroup_map_add,
+			.state = m,
+		};
+		return cft->read_map(state->cgroup, cft, &cb);
+	}
+	return cft->read_seq_string(state->cgroup, cft, m);
 }
 
 int cgroup_seqfile_release(struct inode *inode, struct file *file)
@@ -1581,7 +1584,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 	cft = __d_cft(file->f_dentry);
 	if (!cft)
 		return -ENODEV;
-	if (cft->read_map) {
+	if (cft->read_map || cft->read_seq_string) {
 		struct cgroup_seqfile_state *state =
 			kzalloc(sizeof(*state), GFP_USER);
 		if (!state)
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 4237b19e8fb3..4ea583689eec 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -9,6 +9,7 @@
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/uaccess.h>
+#include <linux/seq_file.h>
 
 #define ACC_MKNOD 1
 #define ACC_READ  2
@@ -201,11 +202,15 @@ static void devcgroup_destroy(struct cgroup_subsys *ss,
 
 #define DEVCG_ALLOW 1
 #define DEVCG_DENY 2
+#define DEVCG_LIST 3
+
+#define MAJMINLEN 10
+#define ACCLEN 4
 
 static void set_access(char *acc, short access)
 {
 	int idx = 0;
-	memset(acc, 0, 4);
+	memset(acc, 0, ACCLEN);
 	if (access & ACC_READ)
 		acc[idx++] = 'r';
 	if (access & ACC_WRITE)
@@ -225,70 +230,33 @@ static char type_to_char(short type)
 	return 'X';
 }
 
-static void set_majmin(char *str, int len, unsigned m)
+static void set_majmin(char *str, unsigned m)
 {
-	memset(str, 0, len);
+	memset(str, 0, MAJMINLEN);
 	if (m == ~0)
 		sprintf(str, "*");
 	else
-		snprintf(str, len, "%d", m);
+		snprintf(str, MAJMINLEN, "%d", m);
 }
 
-static char *print_whitelist(struct dev_cgroup *devcgroup, int *len)
+static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				struct seq_file *m)
 {
-	char *buf, *s, acc[4];
+	struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
 	struct dev_whitelist_item *wh;
-	int ret;
-	int count = 0;
-	char maj[10], min[10];
-
-	buf = kmalloc(4096, GFP_KERNEL);
-	if (!buf)
-		return ERR_PTR(-ENOMEM);
-	s = buf;
-	*s = '\0';
-	*len = 0;
+	char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
 
 	spin_lock(&devcgroup->lock);
 	list_for_each_entry(wh, &devcgroup->whitelist, list) {
 		set_access(acc, wh->access);
-		set_majmin(maj, 10, wh->major);
-		set_majmin(min, 10, wh->minor);
-		ret = snprintf(s, 4095-(s-buf), "%c %s:%s %s\n",
-			type_to_char(wh->type), maj, min, acc);
-		if (s+ret >= buf+4095) {
-			kfree(buf);
-			buf = ERR_PTR(-ENOMEM);
-			break;
-		}
-		s += ret;
-		*len += ret;
-		count++;
+		set_majmin(maj, wh->major);
+		set_majmin(min, wh->minor);
+		seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type),
+			   maj, min, acc);
 	}
 	spin_unlock(&devcgroup->lock);
 
-	return buf;
-}
-
-static ssize_t devcgroup_access_read(struct cgroup *cgroup,
-			struct cftype *cft, struct file *file,
-			char __user *userbuf, size_t nbytes, loff_t *ppos)
-{
-	struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
-	int filetype = cft->private;
-	char *buffer;
-	int uninitialized_var(len);
-	int retval;
-
-	if (filetype != DEVCG_ALLOW)
-		return -EINVAL;
-	buffer = print_whitelist(devcgroup, &len);
-	if (IS_ERR(buffer))
-		return PTR_ERR(buffer);
-
-	retval = simple_read_from_buffer(userbuf, nbytes, ppos, buffer, len);
-	kfree(buffer);
-	return retval;
+	return 0;
 }
 
 /*
@@ -501,7 +469,6 @@ out1:
 static struct cftype dev_cgroup_files[] = {
 	{
 		.name = "allow",
-		.read = devcgroup_access_read,
 		.write  = devcgroup_access_write,
 		.private = DEVCG_ALLOW,
 	},
@@ -510,6 +477,11 @@ static struct cftype dev_cgroup_files[] = {
 		.write = devcgroup_access_write,
 		.private = DEVCG_DENY,
 	},
+	{
+		.name = "list",
+		.read_seq_string = devcgroup_seq_read,
+		.private = DEVCG_LIST,
+	},
 };
 
 static int devcgroup_populate(struct cgroup_subsys *ss,
-- 
cgit v1.3-14-g43fede


From cf475ad28ac35cc9ba612d67158f29b73b38b05d Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Tue, 29 Apr 2008 01:00:16 -0700
Subject: cgroups: add an owner to the mm_struct

Remove the mem_cgroup member from mm_struct and instead adds an owner.

This approach was suggested by Paul Menage.  The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined.  It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.

A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.

This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes.  The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.

I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.

This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.

After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                  |  1 +
 include/linux/cgroup.h     | 15 +++++++++
 include/linux/memcontrol.h | 16 ++-------
 include/linux/mm_types.h   |  5 +--
 include/linux/sched.h      | 13 ++++++++
 init/Kconfig               |  7 ++++
 init/main.c                |  1 +
 kernel/cgroup.c            | 30 +++++++++++++++++
 kernel/exit.c              | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c              | 11 ++++--
 mm/memcontrol.c            | 28 +++-------------
 11 files changed, 169 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 7768453dc986..711bc45d789c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -735,6 +735,7 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
+	mm_update_next_owner(mm);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 095248082b7e..e155aa78d859 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -305,6 +305,12 @@ struct cgroup_subsys {
 			struct cgroup *cgrp);
 	void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
+	/*
+	 * This routine is called with the task_lock of mm->owner held
+	 */
+	void (*mm_owner_changed)(struct cgroup_subsys *ss,
+					struct cgroup *old,
+					struct cgroup *new);
 	int subsys_id;
 	int active;
 	int disabled;
@@ -390,4 +396,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 
 #endif /* !CONFIG_CGROUPS */
 
+#ifdef CONFIG_MM_OWNER
+extern void
+cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new);
+#else /* !CONFIG_MM_OWNER */
+static inline void
+cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+}
+#endif /* CONFIG_MM_OWNER */
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8b1c4295848b..e6608776bc96 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,9 +27,6 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
-extern void mm_free_cgroup(struct mm_struct *mm);
-
 #define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
 
 extern struct page_cgroup *page_get_page_cgroup(struct page *page);
@@ -48,8 +45,10 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 
+extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+
 #define mm_match_cgroup(mm, cgroup)	\
-	((cgroup) == rcu_dereference((mm)->mem_cgroup))
+	((cgroup) == mem_cgroup_from_task((mm)->owner))
 
 extern int mem_cgroup_prepare_migration(struct page *page);
 extern void mem_cgroup_end_migration(struct page *page);
@@ -73,15 +72,6 @@ extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
 				struct zone *zone, int priority);
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline void mm_init_cgroup(struct mm_struct *mm,
-					struct task_struct *p)
-{
-}
-
-static inline void mm_free_cgroup(struct mm_struct *mm)
-{
-}
-
 static inline void page_reset_bad_cgroup(struct page *page)
 {
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e2bae8dde35a..bc97bd54f606 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -225,8 +225,9 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;	/* aio lock */
 	struct kioctx		*ioctx_list;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	struct mem_cgroup *mem_cgroup;
+#ifdef CONFIG_MM_OWNER
+	struct task_struct *owner;	/* The thread group leader that */
+					/* owns the mm_struct.		*/
 #endif
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 024d72b47a0c..1d02babdb2c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2148,6 +2148,19 @@ static inline void migration_init(void)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+#ifdef CONFIG_MM_OWNER
+extern void mm_update_next_owner(struct mm_struct *mm);
+extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
+#else
+static inline void mm_update_next_owner(struct mm_struct *mm)
+{
+}
+
+static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+}
+#endif /* CONFIG_MM_OWNER */
+
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a3457926342a..98fa96eac415 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -378,9 +378,13 @@ config RESOURCE_COUNTERS
           infrastructure that works with cgroups
 	depends on CGROUPS
 
+config MM_OWNER
+	bool
+
 config CGROUP_MEM_RES_CTLR
 	bool "Memory Resource Controller for Control Groups"
 	depends on CGROUPS && RESOURCE_COUNTERS
+	select MM_OWNER
 	help
 	  Provides a memory resource controller that manages both page cache and
 	  RSS memory.
@@ -393,6 +397,9 @@ config CGROUP_MEM_RES_CTLR
 	  Only enable when you're ok with these trade offs and really
 	  sure you need the memory resource controller.
 
+	  This config option also selects MM_OWNER config option, which
+	  could in turn add some fork/exit overhead.
+
 config SYSFS_DEPRECATED
 	bool
 
diff --git a/init/main.c b/init/main.c
index 1116d2f40cc1..c62c98f381f2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -559,6 +559,7 @@ asmlinkage void __init start_kernel(void)
 	printk(KERN_NOTICE);
 	printk(linux_banner);
 	setup_arch(&command_line);
+	mm_init_owner(&init_mm, &init_task);
 	setup_command_line(command_line);
 	unwind_setup();
 	setup_per_cpu_areas();
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index abc433772e5a..b9d467d83fc1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -119,6 +119,7 @@ static int root_count;
  * be called.
  */
 static int need_forkexit_callback;
+static int need_mm_owner_callback __read_mostly;
 
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -2498,6 +2499,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
 
 	need_forkexit_callback |= ss->fork || ss->exit;
+	need_mm_owner_callback |= !!ss->mm_owner_changed;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -2748,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
 	}
 }
 
+#ifdef CONFIG_MM_OWNER
+/**
+ * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
+ * @p: the new owner
+ *
+ * Called on every change to mm->owner. mm_init_owner() does not
+ * invoke this routine, since it assigns the mm->owner the first time
+ * and does not change it.
+ */
+void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+	struct cgroup *oldcgrp, *newcgrp;
+
+	if (need_mm_owner_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			oldcgrp = task_cgroup(old, ss->subsys_id);
+			newcgrp = task_cgroup(new, ss->subsys_id);
+			if (oldcgrp == newcgrp)
+				continue;
+			if (ss->mm_owner_changed)
+				ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+		}
+	}
+}
+#endif /* CONFIG_MM_OWNER */
+
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a9d98c641ac..ae0f2c4e452b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk)
 
 EXPORT_SYMBOL_GPL(exit_fs);
 
+#ifdef CONFIG_MM_OWNER
+/*
+ * Task p is exiting and it owned mm, lets find a new owner for it
+ */
+static inline int
+mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
+{
+	/*
+	 * If there are other users of the mm and the owner (us) is exiting
+	 * we need to find a new owner to take on the responsibility.
+	 */
+	if (!mm)
+		return 0;
+	if (atomic_read(&mm->mm_users) <= 1)
+		return 0;
+	if (mm->owner != p)
+		return 0;
+	return 1;
+}
+
+void mm_update_next_owner(struct mm_struct *mm)
+{
+	struct task_struct *c, *g, *p = current;
+
+retry:
+	if (!mm_need_new_owner(mm, p))
+		return;
+
+	read_lock(&tasklist_lock);
+	/*
+	 * Search in the children
+	 */
+	list_for_each_entry(c, &p->children, sibling) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	}
+
+	/*
+	 * Search in the siblings
+	 */
+	list_for_each_entry(c, &p->parent->children, sibling) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	}
+
+	/*
+	 * Search through everything else. We should not get
+	 * here often
+	 */
+	do_each_thread(g, c) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	} while_each_thread(g, c);
+
+	read_unlock(&tasklist_lock);
+	return;
+
+assign_new_owner:
+	BUG_ON(c == p);
+	get_task_struct(c);
+	/*
+	 * The task_lock protects c->mm from changing.
+	 * We always want mm->owner->mm == mm
+	 */
+	task_lock(c);
+	/*
+	 * Delay read_unlock() till we have the task_lock()
+	 * to ensure that c does not slip away underneath us
+	 */
+	read_unlock(&tasklist_lock);
+	if (c->mm != mm) {
+		task_unlock(c);
+		put_task_struct(c);
+		goto retry;
+	}
+	cgroup_mm_owner_callbacks(mm->owner, c);
+	mm->owner = c;
+	task_unlock(c);
+	put_task_struct(c);
+}
+#endif /* CONFIG_MM_OWNER */
+
 /*
  * Turn us into a lazy TLB process if we
  * aren't already..
@@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk)
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
 	task_unlock(tsk);
+	mm_update_next_owner(mm);
 	mmput(mm);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 6067e429f281..156db96ff754 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	mm->ioctx_list = NULL;
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
-	mm_init_cgroup(mm, p);
+	mm_init_owner(mm, p);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
 		return mm;
 	}
 
-	mm_free_cgroup(mm);
 	free_mm(mm);
 	return NULL;
 }
@@ -438,7 +437,6 @@ void mmput(struct mm_struct *mm)
 			spin_unlock(&mmlist_lock);
 		}
 		put_swap_token(mm);
-		mm_free_cgroup(mm);
 		mmdrop(mm);
 	}
 }
@@ -982,6 +980,13 @@ static void rt_mutex_init_task(struct task_struct *p)
 #endif
 }
 
+#ifdef CONFIG_MM_OWNER
+void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+	mm->owner = p;
+}
+#endif /* CONFIG_MM_OWNER */
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d12795cc7622..49d80814798b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -236,26 +236,12 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 				css);
 }
 
-static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 				struct mem_cgroup, css);
 }
 
-void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
-{
-	struct mem_cgroup *mem;
-
-	mem = mem_cgroup_from_task(p);
-	css_get(&mem->css);
-	mm->mem_cgroup = mem;
-}
-
-void mm_free_cgroup(struct mm_struct *mm)
-{
-	css_put(&mm->mem_cgroup->css);
-}
-
 static inline int page_cgroup_locked(struct page *page)
 {
 	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
@@ -476,6 +462,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
 
+	BUG_ON(!mem_cont);
 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
 	if (active)
 		src = &mz->active_list;
@@ -574,7 +561,7 @@ retry:
 		mm = &init_mm;
 
 	rcu_read_lock();
-	mem = rcu_dereference(mm->mem_cgroup);
+	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 	/*
 	 * For every charge from the cgroup, increment reference count
 	 */
@@ -985,10 +972,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	struct mem_cgroup *mem;
 	int node;
 
-	if (unlikely((cont->parent) == NULL)) {
+	if (unlikely((cont->parent) == NULL))
 		mem = &init_mem_cgroup;
-		init_mm.mem_cgroup = mem;
-	} else
+	else
 		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
 
 	if (mem == NULL)
@@ -1067,10 +1053,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 	if (!thread_group_leader(p))
 		goto out;
 
-	css_get(&mem->css);
-	rcu_assign_pointer(mm->mem_cgroup, mem);
-	css_put(&old_mem->css);
-
 out:
 	mmput(mm);
 }
-- 
cgit v1.3-14-g43fede


From c84872e168d10926acd2dee975d19172eef79252 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 29 Apr 2008 01:00:17 -0700
Subject: memcgroup: add the max_usage member on the res_counter

This field is the maximal value of the usage one since the counter creation
(or since the latest reset).

To reset this to the usage value simply write anything to the appropriate
cgroup file.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/res_counter.h | 14 ++++++++++++++
 kernel/res_counter.c        |  4 ++++
 mm/memcontrol.c             | 17 +++++++++++++++++
 3 files changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 8cb1ecd420a9..df8085acba16 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -24,6 +24,10 @@ struct res_counter {
 	 * the current resource consumption level
 	 */
 	unsigned long long usage;
+	/*
+	 * the maximal value of the usage from the counter creation
+	 */
+	unsigned long long max_usage;
 	/*
 	 * the limit that usage cannot exceed
 	 */
@@ -67,6 +71,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
 
 enum {
 	RES_USAGE,
+	RES_MAX_USAGE,
 	RES_LIMIT,
 	RES_FAILCNT,
 };
@@ -127,4 +132,13 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt)
 	return ret;
 }
 
+static inline void res_counter_reset_max(struct res_counter *cnt)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cnt->lock, flags);
+	cnt->max_usage = cnt->usage;
+	spin_unlock_irqrestore(&cnt->lock, flags);
+}
+
 #endif
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 70587657dda3..d3c61b4ebef2 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -28,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 	}
 
 	counter->usage += val;
+	if (counter->usage > counter->max_usage)
+		counter->max_usage = counter->usage;
 	return 0;
 }
 
@@ -66,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member)
 	switch (member) {
 	case RES_USAGE:
 		return &counter->usage;
+	case RES_MAX_USAGE:
+		return &counter->max_usage;
 	case RES_LIMIT:
 		return &counter->limit;
 	case RES_FAILCNT:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 49d80814798b..350a14da6525 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -855,6 +855,17 @@ static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 				mem_cgroup_write_strategy);
 }
 
+static ssize_t mem_cgroup_max_reset(struct cgroup *cont, struct cftype *cft,
+				struct file *file, const char __user *userbuf,
+				size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *mem;
+
+	mem = mem_cgroup_from_cont(cont);
+	res_counter_reset_max(&mem->res);
+	return nbytes;
+}
+
 static ssize_t mem_force_empty_write(struct cgroup *cont,
 				struct cftype *cft, struct file *file,
 				const char __user *userbuf,
@@ -909,6 +920,12 @@ static struct cftype mem_cgroup_files[] = {
 		.private = RES_USAGE,
 		.read_u64 = mem_cgroup_read,
 	},
+	{
+		.name = "max_usage_in_bytes",
+		.private = RES_MAX_USAGE,
+		.write = mem_cgroup_max_reset,
+		.read_u64 = mem_cgroup_read,
+	},
 	{
 		.name = "limit_in_bytes",
 		.private = RES_LIMIT,
-- 
cgit v1.3-14-g43fede


From 9e0c914cabc6d75d2eafdff00671a2ad683a5e3c Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 29 Apr 2008 01:00:25 -0700
Subject: kernel/cpuset.c: make 3 functions static

Make the following needlessly global functions static:

- cpuset_test_cpumask()
- cpuset_change_cpumask()
- cpuset_do_move_task()

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 832004935ca7..b5571272132c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -735,7 +735,8 @@ static inline int started_after(void *p1, void *p2)
  * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
  * words, if its mask is not equal to its cpuset's mask).
  */
-int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
+static int cpuset_test_cpumask(struct task_struct *tsk,
+			       struct cgroup_scanner *scan)
 {
 	return !cpus_equal(tsk->cpus_allowed,
 			(cgroup_cs(scan->cg))->cpus_allowed);
@@ -752,7 +753,8 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
  * We don't need to re-check for the cgroup/cpuset membership, since we're
  * holding cgroup_lock() at this point.
  */
-void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
+static void cpuset_change_cpumask(struct task_struct *tsk,
+				  struct cgroup_scanner *scan)
 {
 	set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
 }
@@ -1714,7 +1716,8 @@ int __init cpuset_init(void)
  * Called by cgroup_scan_tasks() for each task in a cgroup.
  * Return nonzero to stop the walk through the tasks.
  */
-void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+static void cpuset_do_move_task(struct task_struct *tsk,
+				struct cgroup_scanner *scan)
 {
 	struct cpuset_hotplug_scanner *chsp;
 
-- 
cgit v1.3-14-g43fede


From addf2c739d9015d3e9c0500b58a3af051cd58ea7 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 01:00:26 -0700
Subject: Cpuset hardwall flag: switch cpusets to use the bulk
 cgroup_add_files() API

Currently the cpusets mem_exclusive flag is overloaded to mean both
"no-overlapping" and "no GFP_KERNEL allocations outside this cpuset".

These patches add a new mem_hardwall flag with just the allocation restriction
part of the mem_exclusive semantics, without breaking backwards-compatibility
for those who continue to use just mem_exclusive.  Additionally, the cgroup
control file registration for cpusets is cleaned up to reduce boilerplate.

This patch:

This change tidies up the cpusets control file definitions, and reduces the
amount of boilerplate required to add/change control files in the future.

Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 166 +++++++++++++++++++++++++-------------------------------
 1 file changed, 75 insertions(+), 91 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b5571272132c..fe5407ca2f1e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1445,53 +1445,76 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
  * for the common functions, 'private' gives the type of file
  */
 
-static struct cftype cft_cpus = {
-	.name = "cpus",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_CPULIST,
-};
-
-static struct cftype cft_mems = {
-	.name = "mems",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_MEMLIST,
-};
-
-static struct cftype cft_cpu_exclusive = {
-	.name = "cpu_exclusive",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_CPU_EXCLUSIVE,
-};
-
-static struct cftype cft_mem_exclusive = {
-	.name = "mem_exclusive",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_MEM_EXCLUSIVE,
-};
-
-static struct cftype cft_sched_load_balance = {
-	.name = "sched_load_balance",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_SCHED_LOAD_BALANCE,
-};
-
-static struct cftype cft_sched_relax_domain_level = {
-	.name = "sched_relax_domain_level",
-	.read = cpuset_common_file_read,
-	.write = cpuset_common_file_write,
-	.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
-};
-
-static struct cftype cft_memory_migrate = {
-	.name = "memory_migrate",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_MEMORY_MIGRATE,
+static struct cftype files[] = {
+	{
+		.name = "cpus",
+		.read = cpuset_common_file_read,
+		.write = cpuset_common_file_write,
+		.private = FILE_CPULIST,
+	},
+
+	{
+		.name = "mems",
+		.read = cpuset_common_file_read,
+		.write = cpuset_common_file_write,
+		.private = FILE_MEMLIST,
+	},
+
+	{
+		.name = "cpu_exclusive",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_CPU_EXCLUSIVE,
+	},
+
+	{
+		.name = "mem_exclusive",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEM_EXCLUSIVE,
+	},
+
+	{
+		.name = "sched_load_balance",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_SCHED_LOAD_BALANCE,
+	},
+
+	{
+		.name = "sched_relax_domain_level",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+	},
+
+	{
+		.name = "memory_migrate",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEMORY_MIGRATE,
+	},
+
+	{
+		.name = "memory_pressure",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEMORY_PRESSURE,
+	},
+
+	{
+		.name = "memory_spread_page",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_SPREAD_PAGE,
+	},
+
+	{
+		.name = "memory_spread_slab",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_SPREAD_SLAB,
+	},
 };
 
 static struct cftype cft_memory_pressure_enabled = {
@@ -1501,57 +1524,18 @@ static struct cftype cft_memory_pressure_enabled = {
 	.private = FILE_MEMORY_PRESSURE_ENABLED,
 };
 
-static struct cftype cft_memory_pressure = {
-	.name = "memory_pressure",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_MEMORY_PRESSURE,
-};
-
-static struct cftype cft_spread_page = {
-	.name = "memory_spread_page",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_SPREAD_PAGE,
-};
-
-static struct cftype cft_spread_slab = {
-	.name = "memory_spread_slab",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_SPREAD_SLAB,
-};
-
 static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	int err;
 
-	if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss,
-					&cft_sched_relax_domain_level)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
-		return err;
-	if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
+	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+	if (err)
 		return err;
 	/* memory_pressure_enabled is in root cpuset only */
-	if (err == 0 && !cont->parent)
+	if (!cont->parent)
 		err = cgroup_add_file(cont, ss,
-					 &cft_memory_pressure_enabled);
-	return 0;
+				      &cft_memory_pressure_enabled);
+	return err;
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 786083667e0ced85ce17c4c0b6c57a9f47c5b9f2 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Tue, 29 Apr 2008 01:00:26 -0700
Subject: Cpuset hardwall flag: add a mem_hardwall flag to cpusets

This flag provides the hardwalling properties of mem_exclusive, without
enforcing the exclusivity.  Either mem_hardwall or mem_exclusive is sufficient
to prevent GFP_KERNEL allocations from passing outside the cpuset's assigned
nodes.

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/cpusets.txt | 26 +++++++++++++------------
 kernel/cpuset.c           | 48 +++++++++++++++++++++++++++++++----------------
 2 files changed, 46 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index aa854b9b18cd..fb7b361e6eea 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -171,6 +171,7 @@ files describing that cpuset:
  - memory_migrate flag: if set, move pages to cpusets nodes
  - cpu_exclusive flag: is cpu placement exclusive?
  - mem_exclusive flag: is memory placement exclusive?
+ - mem_hardwall flag:  is memory allocation hardwalled
  - memory_pressure: measure of how much paging pressure in cpuset
 
 In addition, the root cpuset only has the following file:
@@ -222,17 +223,18 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
 a direct ancestor or descendent, may share any of the same CPUs or
 Memory Nodes.
 
-A cpuset that is mem_exclusive restricts kernel allocations for
-page, buffer and other data commonly shared by the kernel across
-multiple users.  All cpusets, whether mem_exclusive or not, restrict
-allocations of memory for user space.  This enables configuring a
-system so that several independent jobs can share common kernel data,
-such as file system pages, while isolating each jobs user allocation in
-its own cpuset.  To do this, construct a large mem_exclusive cpuset to
-hold all the jobs, and construct child, non-mem_exclusive cpusets for
-each individual job.  Only a small amount of typical kernel memory,
-such as requests from interrupt handlers, is allowed to be taken
-outside even a mem_exclusive cpuset.
+A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users.  All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space.  This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset.  To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
 
 
 1.5 What is memory_pressure ?
@@ -707,7 +709,7 @@ Now you want to do something with this cpuset.
 
 In this directory you can find several files:
 # ls
-cpus  cpu_exclusive  mems  mem_exclusive  tasks
+cpus  cpu_exclusive  mems  mem_exclusive mem_hardwall  tasks
 
 Reading them will give you information about the state of this cpuset:
 the CPUs and Memory Nodes it can use, the processes that are using
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fe5407ca2f1e..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
 typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
+	CS_MEM_HARDWALL,
 	CS_MEMORY_MIGRATE,
 	CS_SCHED_LOAD_BALANCE,
 	CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
 	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+	return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
 static inline int is_sched_load_balance(const struct cpuset *cs)
 {
 	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -1042,12 +1048,9 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
 
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
- * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_SCHED_LOAD_BALANCE,
- *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
- *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
- * cs:	the cpuset to update
- * buf:	the buffer where we read the 0 or 1
+ * bit:		the bit to update (see cpuset_flagbits_t)
+ * cs:		the cpuset to update
+ * turning_on: 	whether the flag is being set or cleared
  *
  * Call with cgroup_mutex held.
  */
@@ -1228,6 +1231,7 @@ typedef enum {
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
+	FILE_MEM_HARDWALL,
 	FILE_SCHED_LOAD_BALANCE,
 	FILE_SCHED_RELAX_DOMAIN_LEVEL,
 	FILE_MEMORY_PRESSURE_ENABLED,
@@ -1313,6 +1317,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	case FILE_MEM_EXCLUSIVE:
 		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
 		break;
+	case FILE_MEM_HARDWALL:
+		retval = update_flag(CS_MEM_HARDWALL, cs, val);
+		break;
 	case FILE_SCHED_LOAD_BALANCE:
 		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
 		break;
@@ -1423,6 +1430,8 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
 		return is_cpu_exclusive(cs);
 	case FILE_MEM_EXCLUSIVE:
 		return is_mem_exclusive(cs);
+	case FILE_MEM_HARDWALL:
+		return is_mem_hardwall(cs);
 	case FILE_SCHED_LOAD_BALANCE:
 		return is_sched_load_balance(cs);
 	case FILE_MEMORY_MIGRATE:
@@ -1474,6 +1483,13 @@ static struct cftype files[] = {
 		.private = FILE_MEM_EXCLUSIVE,
 	},
 
+	{
+		.name = "mem_hardwall",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEM_HARDWALL,
+	},
+
 	{
 		.name = "sched_load_balance",
 		.read_u64 = cpuset_read_u64,
@@ -1963,14 +1979,14 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 }
 
 /*
- * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset.  Call holding callback_mutex.
- * If no ancestor is mem_exclusive (an unusual configuration), then
- * returns the root cpuset.
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset.  Call holding
+ * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
  */
-static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 {
-	while (!is_mem_exclusive(cs) && cs->parent)
+	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
 		cs = cs->parent;
 	return cs;
 }
@@ -1984,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * __GFP_THISNODE is set, yes, we can always allocate.  If zone
  * z's node is in our tasks mems_allowed, yes.  If it's not a
  * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * hardwalled cpuset ancestor to this tasks cpuset, yes.
  * If the task has been OOM killed and has access to memory reserves
  * as specified by the TIF_MEMDIE flag, yes.
  * Otherwise, no.
@@ -2007,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * and do not allow allocations outside the current tasks cpuset
  * unless the task has been OOM killed as is marked TIF_MEMDIE.
  * GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest enclosing mem_exclusive ancestor cpuset.
+ * nearest enclosing hardwalled ancestor cpuset.
  *
  * Scanning up parent cpusets requires callback_mutex.  The
  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2030,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  *	in_interrupt - any node ok (current task context irrelevant)
  *	GFP_ATOMIC   - any node ok
  *	TIF_MEMDIE   - any node ok
- *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  *
  * Rule:
@@ -2067,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 	mutex_lock(&callback_mutex);
 
 	task_lock(current);
-	cs = nearest_exclusive_ancestor(task_cs(current));
+	cs = nearest_hardwall_ancestor(task_cs(current));
 	task_unlock(current);
 
 	allowed = node_isset(node, cs->mems_allowed);
-- 
cgit v1.3-14-g43fede


From 00dfcaf748f46de89efe41baa298b5cf9adda67e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 29 Apr 2008 01:00:27 -0700
Subject: workqueues: shrink cpu_populated_map when CPU dies

When cpu_populated_map was introduced, it was supposed that cwq->thread can
survive after CPU_DEAD, that is why we never shrink cpu_populated_map.

This is not very nice, we can safely remove the already dead CPU from the map.
 The only required change is that destroy_workqueue() must hold the hotplug
lock until it destroys all cwq->thread's, to protect the cpu_populated_map.
We could make the local copy of cpu mask and drop the lock, but
sizeof(cpumask_t) may be very large.

Also, fix the comment near queue_work().  Unless _cpu_down() happens we do
guarantee the cpu-affinity of the work_struct, and we have users which rely on
this.

[akpm@linux-foundation.org: repair comment]
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 00ff4d08e370..1ad0ee489cd1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
  *
  * Returns 0 if @work was already on a queue, non-zero otherwise.
  *
- * We queue the work to the CPU it was submitted, but there is no
- * guarantee that it will be processed by that CPU.
+ * We queue the work to the CPU on which it was submitted, but if the CPU dies
+ * it can be processed by another CPU.
  */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
@@ -815,12 +815,12 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	spin_lock(&workqueue_lock);
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
-	put_online_cpus();
 
 	for_each_cpu_mask(cpu, *cpu_map) {
 		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 		cleanup_workqueue_thread(cwq, cpu);
 	}
+	put_online_cpus();
 
 	free_percpu(wq->cpu_wq);
 	kfree(wq);
@@ -838,7 +838,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	action &= ~CPU_TASKS_FROZEN;
 
 	switch (action) {
-
 	case CPU_UP_PREPARE:
 		cpu_set(cpu, cpu_populated_map);
 	}
@@ -866,6 +865,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		}
 	}
 
+	switch (action) {
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		cpu_clear(cpu, cpu_populated_map);
+	}
+
 	return NOTIFY_OK;
 }
 
-- 
cgit v1.3-14-g43fede


From 1e35eaa2d86419470f3f3aed9acd85b8addff25c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 29 Apr 2008 01:00:28 -0700
Subject: cleanup_workqueue_thread: remove the unneeded "cpu" parameter

cleanup_workqueue_thread() doesn't need the second argument, remove it.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ad0ee489cd1..7db251a959c5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -772,7 +772,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
-static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
 	/*
 	 * Our caller is either destroy_workqueue() or CPU_DEAD,
@@ -808,7 +808,6 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 void destroy_workqueue(struct workqueue_struct *wq)
 {
 	const cpumask_t *cpu_map = wq_cpu_map(wq);
-	struct cpu_workqueue_struct *cwq;
 	int cpu;
 
 	get_online_cpus();
@@ -816,10 +815,8 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask(cpu, *cpu_map) {
-		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
-		cleanup_workqueue_thread(cwq, cpu);
-	}
+	for_each_cpu_mask(cpu, *cpu_map)
+		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
 	put_online_cpus();
 
 	free_percpu(wq->cpu_wq);
@@ -860,7 +857,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		case CPU_UP_CANCELED:
 			start_workqueue_thread(cwq, -1);
 		case CPU_DEAD:
-			cleanup_workqueue_thread(cwq, cpu);
+			cleanup_workqueue_thread(cwq);
 			break;
 		}
 	}
-- 
cgit v1.3-14-g43fede


From d2ba7e2ae206e9ab24e8937d99d0d5513bfd08e5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 29 Apr 2008 01:00:29 -0700
Subject: simplify cpu_hotplug_begin()/put_online_cpus()

cpu_hotplug_begin() must be always called under cpu_add_remove_lock, this
means that only one process can be cpu_hotplug.active_writer.  So we don't
need the cpu_hotplug.writer_queue, we can wake up the ->active_writer
directly.

Also, fix the comment.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index f8f9468d17d7..a98f6ab16ecd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -33,17 +33,13 @@ static struct {
 	 * an ongoing cpu hotplug operation.
 	 */
 	int refcount;
-	wait_queue_head_t writer_queue;
 } cpu_hotplug;
 
-#define writer_exists() (cpu_hotplug.active_writer != NULL)
-
 void __init cpu_hotplug_init(void)
 {
 	cpu_hotplug.active_writer = NULL;
 	mutex_init(&cpu_hotplug.lock);
 	cpu_hotplug.refcount = 0;
-	init_waitqueue_head(&cpu_hotplug.writer_queue);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -65,11 +61,8 @@ void put_online_cpus(void)
 	if (cpu_hotplug.active_writer == current)
 		return;
 	mutex_lock(&cpu_hotplug.lock);
-	cpu_hotplug.refcount--;
-
-	if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
-		wake_up(&cpu_hotplug.writer_queue);
-
+	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
+		wake_up_process(cpu_hotplug.active_writer);
 	mutex_unlock(&cpu_hotplug.lock);
 
 }
@@ -98,8 +91,8 @@ void cpu_maps_update_done(void)
  * Note that during a cpu-hotplug operation, the new readers, if any,
  * will be blocked by the cpu_hotplug.lock
  *
- * Since cpu_maps_update_begin is always called after invoking
- * cpu_maps_update_begin, we can be sure that only one writer is active.
+ * Since cpu_hotplug_begin() is always called after invoking
+ * cpu_maps_update_begin(), we can be sure that only one writer is active.
  *
  * Note that theoretically, there is a possibility of a livelock:
  * - Refcount goes to zero, last reader wakes up the sleeping
@@ -115,19 +108,16 @@ void cpu_maps_update_done(void)
  */
 static void cpu_hotplug_begin(void)
 {
-	DECLARE_WAITQUEUE(wait, current);
-
-	mutex_lock(&cpu_hotplug.lock);
-
 	cpu_hotplug.active_writer = current;
-	add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
-	while (cpu_hotplug.refcount) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+
+	for (;;) {
+		mutex_lock(&cpu_hotplug.lock);
+		if (likely(!cpu_hotplug.refcount))
+			break;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&cpu_hotplug.lock);
 		schedule();
-		mutex_lock(&cpu_hotplug.lock);
 	}
-	remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
 }
 
 static void cpu_hotplug_done(void)
-- 
cgit v1.3-14-g43fede


From 6546bc4279241e8fa432de1bb63a4f6f791fd669 Mon Sep 17 00:00:00 2001
From: Nadia Derbey <Nadia.Derbey@bull.net>
Date: Tue, 29 Apr 2008 01:00:45 -0700
Subject: ipc: re-enable msgmni automatic recomputing msgmni if set to negative

The enhancement as asked for by Yasunori: if msgmni is set to a negative
value, register it back into the ipcns notifier chain.

A new interface has been added to the notification mechanism:
notifier_chain_cond_register() registers a notifier block only if not already
registered.  With that new interface we avoid taking care of the states
changes in procfs.

Signed-off-by: Nadia Derbey <Nadia.Derbey@bull.net>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Mingming Cao <cmm@us.ibm.com>
Cc: Pierre Peiffer <pierre.peiffer@bull.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ipc_namespace.h |  1 +
 include/linux/notifier.h      |  4 ++++
 ipc/ipc_sysctl.c              | 45 +++++++++++++++++++++++++++++++++----------
 ipc/ipcns_notifier.c          |  9 +++++++++
 kernel/notifier.c             | 38 ++++++++++++++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index c3b1da9e5feb..ea6c18a8b0d4 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -52,6 +52,7 @@ extern atomic_t nr_ipc_ns;
 #define INIT_IPC_NS(ns)		.ns		= &init_ipc_ns,
 
 extern int register_ipcns_notifier(struct ipc_namespace *);
+extern int cond_register_ipcns_notifier(struct ipc_namespace *);
 extern int unregister_ipcns_notifier(struct ipc_namespace *);
 extern int ipcns_notify(unsigned long);
 
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 20dfed590183..0ff6224d172a 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -121,6 +121,10 @@ extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
 extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
 		struct notifier_block *nb);
 
+extern int blocking_notifier_chain_cond_register(
+		struct blocking_notifier_head *nh,
+		struct notifier_block *nb);
+
 extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
 		struct notifier_block *nb);
 extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index d12ff5cd2a0b..d3497465cc0a 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -15,6 +15,8 @@
 #include <linux/sysctl.h>
 #include <linux/uaccess.h>
 #include <linux/ipc_namespace.h>
+#include <linux/msg.h>
+#include "util.h"
 
 static void *get_ipc(ctl_table *table)
 {
@@ -24,6 +26,27 @@ static void *get_ipc(ctl_table *table)
 	return which;
 }
 
+/*
+ * Routine that is called when a tunable has successfully been changed by
+ * hand and it has a callback routine registered on the ipc namespace notifier
+ * chain: we don't want such tunables to be recomputed anymore upon memory
+ * add/remove or ipc namespace creation/removal.
+ * They can come back to a recomputable state by being set to a <0 value.
+ */
+static void tunable_set_callback(int val)
+{
+	if (val >= 0)
+		unregister_ipcns_notifier(current->nsproxy->ipc_ns);
+	else {
+		/*
+		 * Re-enable automatic recomputing only if not already
+		 * enabled.
+		 */
+		recompute_msgmni(current->nsproxy->ipc_ns);
+		cond_register_ipcns_notifier(current->nsproxy->ipc_ns);
+	}
+}
+
 #ifdef CONFIG_PROC_FS
 static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
 	void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -38,17 +61,17 @@ static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
 static int proc_ipc_callback_dointvec(ctl_table *table, int write,
 	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct ctl_table ipc_table;
 	size_t lenp_bef = *lenp;
 	int rc;
 
-	rc = proc_ipc_dointvec(table, write, filp, buffer, lenp, ppos);
+	memcpy(&ipc_table, table, sizeof(ipc_table));
+	ipc_table.data = get_ipc(table);
+
+	rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
 
 	if (write && !rc && lenp_bef == *lenp)
-		/*
-		 * Tunable has successfully been changed from userland:
-		 * disable its automatic recomputing.
-		 */
-		unregister_ipcns_notifier(current->nsproxy->ipc_ns);
+		tunable_set_callback(*((int *)(ipc_table.data)));
 
 	return rc;
 }
@@ -119,12 +142,14 @@ static int sysctl_ipc_registered_data(ctl_table *table, int __user *name,
 	rc = sysctl_ipc_data(table, name, nlen, oldval, oldlenp, newval,
 		newlen);
 
-	if (newval && newlen && rc > 0)
+	if (newval && newlen && rc > 0) {
 		/*
-		 * Tunable has successfully been changed from userland:
-		 * disable its automatic recomputing.
+		 * Tunable has successfully been changed from userland
 		 */
-		unregister_ipcns_notifier(current->nsproxy->ipc_ns);
+		int *data = get_ipc(table);
+
+		tunable_set_callback(*data);
+	}
 
 	return rc;
 }
diff --git a/ipc/ipcns_notifier.c b/ipc/ipcns_notifier.c
index c7974609defc..70ff09183f7b 100644
--- a/ipc/ipcns_notifier.c
+++ b/ipc/ipcns_notifier.c
@@ -61,6 +61,15 @@ int register_ipcns_notifier(struct ipc_namespace *ns)
 	return blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb);
 }
 
+int cond_register_ipcns_notifier(struct ipc_namespace *ns)
+{
+	memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb));
+	ns->ipcns_nb.notifier_call = ipcns_callback;
+	ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI;
+	return blocking_notifier_chain_cond_register(&ipcns_chain,
+							&ns->ipcns_nb);
+}
+
 int unregister_ipcns_notifier(struct ipc_namespace *ns)
 {
 	return blocking_notifier_chain_unregister(&ipcns_chain,
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 643360d1bb14..823be11584ef 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl,
 	return 0;
 }
 
+static int notifier_chain_cond_register(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	while ((*nl) != NULL) {
+		if ((*nl) == n)
+			return 0;
+		if (n->priority > (*nl)->priority)
+			break;
+		nl = &((*nl)->next);
+	}
+	n->next = *nl;
+	rcu_assign_pointer(*nl, n);
+	return 0;
+}
+
 static int notifier_chain_unregister(struct notifier_block **nl,
 		struct notifier_block *n)
 {
@@ -204,6 +219,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
 }
 EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
 
+/**
+ *	blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to a blocking notifier chain, only if not already
+ *	present in the chain.
+ *	Must be called in process context.
+ *
+ *	Currently always returns zero.
+ */
+int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
+		struct notifier_block *n)
+{
+	int ret;
+
+	down_write(&nh->rwsem);
+	ret = notifier_chain_cond_register(&nh->head, n);
+	up_write(&nh->rwsem);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
+
 /**
  *	blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
  *	@nh: Pointer to head of the blocking notifier chain
-- 
cgit v1.3-14-g43fede


From 9edff4ab1f8d82675277a04e359d0ed8bf14a7b7 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 29 Apr 2008 01:00:57 -0700
Subject: ipc: sysvsem: implement sys_unshare(CLONE_SYSVSEM)

sys_unshare(CLONE_NEWIPC) doesn't handle the undo lists properly, this can
cause a kernel memory corruption.  CLONE_NEWIPC must detach from the existing
undo lists.

Fix, part 1: add support for sys_unshare(CLONE_SYSVSEM)

The original reason to not support it was the potential (inevitable?)
confusion due to the fact that sys_unshare(CLONE_SYSVSEM) has the
inverse meaning of clone(CLONE_SYSVSEM).

Our two most reasonable options then appear to be (1) fully support
CLONE_SYSVSEM, or (2) continue to refuse explicit CLONE_SYSVSEM,
but always do it anyway on unshare(CLONE_SYSVSEM).  This patch does
(1).

Changelog:
	Apr 16: SEH: switch to Manfred's alternative patch which
		removes the unshare_semundo() function which
		always refused CLONE_SYSVSEM.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Pierre Peiffer <peifferp@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/sem.c     |  1 +
 kernel/fork.c | 29 +++++++++++------------------
 2 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/ipc/sem.c b/ipc/sem.c
index d56d3ab6bb8a..e9418df5ff3e 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1250,6 +1250,7 @@ void exit_sem(struct task_struct *tsk)
 	undo_list = tsk->sysvsem.undo_list;
 	if (!undo_list)
 		return;
+	tsk->sysvsem.undo_list = NULL;
 
 	if (!atomic_dec_and_test(&undo_list->refcnt))
 		return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 156db96ff754..01666979beac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1668,18 +1668,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
 	return 0;
 }
 
-/*
- * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
- * supported yet
- */
-static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
-{
-	if (unshare_flags & CLONE_SYSVSEM)
-		return -EINVAL;
-
-	return 0;
-}
-
 /*
  * unshare allows a process to 'unshare' part of the process
  * context which was originally shared using clone.  copy_*
@@ -1695,8 +1683,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	struct sighand_struct *new_sigh = NULL;
 	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
-	struct sem_undo_list *new_ulist = NULL;
 	struct nsproxy *new_nsproxy = NULL;
+	int do_sysvsem = 0;
 
 	check_unshare_flags(&unshare_flags);
 
@@ -1708,6 +1696,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 				CLONE_NEWNET))
 		goto bad_unshare_out;
 
+	if (unshare_flags & CLONE_SYSVSEM)
+		do_sysvsem = 1;
 	if ((err = unshare_thread(unshare_flags)))
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1718,13 +1708,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_cleanup_sigh;
 	if ((err = unshare_fd(unshare_flags, &new_fd)))
 		goto bad_unshare_cleanup_vm;
-	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
-		goto bad_unshare_cleanup_fd;
 	if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
 			new_fs)))
-		goto bad_unshare_cleanup_semundo;
+		goto bad_unshare_cleanup_fd;
 
-	if (new_fs ||  new_mm || new_fd || new_ulist || new_nsproxy) {
+	if (new_fs ||  new_mm || new_fd || do_sysvsem || new_nsproxy) {
+		if (do_sysvsem) {
+			/*
+			 * CLONE_SYSVSEM is equivalent to sys_exit().
+			 */
+			exit_sem(current);
+		}
 
 		if (new_nsproxy) {
 			switch_task_namespaces(current, new_nsproxy);
@@ -1760,7 +1754,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	if (new_nsproxy)
 		put_nsproxy(new_nsproxy);
 
-bad_unshare_cleanup_semundo:
 bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);
-- 
cgit v1.3-14-g43fede


From 6013f67fc1a4c7fa5bcab2d39c1eaa3e260c7ac1 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 29 Apr 2008 01:00:59 -0700
Subject: ipc: sysvsem: force unshare(CLONE_SYSVSEM) when CLONE_NEWIPC

sys_unshare(CLONE_NEWIPC) doesn't handle the undo lists properly, this can
cause a kernel memory corruption.  CLONE_NEWIPC must detach from the existing
undo lists.

Fix, part 2: perform an implicit CLONE_SYSVSEM in CLONE_NEWIPC.  CLONE_NEWIPC
creates a new IPC namespace, the task cannot access the existing semaphore
arrays after the unshare syscall.  Thus the task can/must detach from the
existing undo list entries, too.

This fixes the kernel corruption, because it makes it impossible that
undo records from two different namespaces are in sysvsem.undo_list.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Pierre Peiffer <peifferp@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 01666979beac..de5c16c6b6ec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1696,7 +1696,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 				CLONE_NEWNET))
 		goto bad_unshare_out;
 
-	if (unshare_flags & CLONE_SYSVSEM)
+	/*
+	 * CLONE_NEWIPC must also detach from the undolist: after switching
+	 * to a new ipc namespace, the semaphore arrays from the old
+	 * namespace are unreachable.
+	 */
+	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
 		do_sysvsem = 1;
 	if ((err = unshare_thread(unshare_flags)))
 		goto bad_unshare_out;
-- 
cgit v1.3-14-g43fede


From 02fdb36ae7f55db7757b623acd27a62d5000d755 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Tue, 29 Apr 2008 01:01:00 -0700
Subject: ipc: sysvsem: refuse clone(CLONE_SYSVSEM|CLONE_NEWIPC)

CLONE_NEWIPC|CLONE_SYSVSEM interaction isn't handled properly.  This can cause
a kernel memory corruption.  CLONE_NEWIPC must detach from the existing undo
lists.

Fix, part 3: refuse clone(CLONE_SYSVSEM|CLONE_NEWIPC).

With unshare, specifying CLONE_SYSVSEM means unshare the sysvsem.  So it seems
reasonable that CLONE_NEWIPC without CLONE_SYSVSEM would just imply
CLONE_SYSVSEM.

However with clone, specifying CLONE_SYSVSEM means *share* the sysvsem.  So
calling clone(CLONE_SYSVSEM|CLONE_NEWIPC) is explicitly asking for something
we can't allow.  So return -EINVAL in that case.

[akpm@linux-foundation.org: cleanups]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Pierre Peiffer <peifferp@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/nsproxy.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5d332cf8c63..adc785146a1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 		goto out;
 	}
 
+	/*
+	 * CLONE_NEWIPC must detach from the undolist: after switching
+	 * to a new ipc namespace, the semaphore arrays from the old
+	 * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
+	 * means share undolist with parent, so we must forbid using
+	 * it along with CLONE_NEWIPC.
+	 */
+	if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
+		err = -EINVAL;
+		goto out;
+	}
+
 	new_ns = create_new_namespaces(flags, tsk, tsk->fs);
 	if (IS_ERR(new_ns)) {
 		err = PTR_ERR(new_ns);
-- 
cgit v1.3-14-g43fede


From 69664cf16af4f31cd54d77948a4baf9c7e0ca7b9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Apr 2008 01:01:31 -0700
Subject: keys: don't generate user and user session keyrings unless they're
 accessed

Don't generate the per-UID user and user session keyrings unless they're
explicitly accessed.  This solves a problem during a login process whereby
set*uid() is called before the SELinux PAM module, resulting in the per-UID
keyrings having the wrong security labels.

This also cures the problem of multiple per-UID keyrings sometimes appearing
due to PAM modules (including pam_keyinit) setuiding and causing user_structs
to come into and go out of existence whilst the session keyring pins the user
keyring.  This is achieved by first searching for extant per-UID keyrings
before inventing new ones.

The serial bound argument is also dropped from find_keyring_by_name() as it's
not currently made use of (setting it to 0 disables the feature).

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: <kwc@citi.umich.edu>
Cc: <arunsr@cse.iitk.ac.in>
Cc: <dwalsh@redhat.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/key.h          |   8 ---
 kernel/user.c                |  15 ++---
 security/keys/internal.h     |   4 +-
 security/keys/key.c          |  45 +-------------
 security/keys/keyring.c      |  19 +++---
 security/keys/process_keys.c | 142 +++++++++++++++++++++++++------------------
 security/selinux/hooks.c     |   8 ---
 7 files changed, 96 insertions(+), 145 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/key.h b/include/linux/key.h
index 8b0bd3393abc..2effd031a817 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -268,9 +268,6 @@ extern struct key *key_lookup(key_serial_t id);
 /*
  * the userspace interface
  */
-extern struct key root_user_keyring, root_session_keyring;
-extern int alloc_uid_keyring(struct user_struct *user,
-			     struct task_struct *ctx);
 extern void switch_uid_keyring(struct user_struct *new_user);
 extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk);
 extern int copy_thread_group_keys(struct task_struct *tsk);
@@ -299,7 +296,6 @@ extern void key_init(void);
 #define make_key_ref(k, p)			({ NULL; })
 #define key_ref_to_ptr(k)		({ NULL; })
 #define is_key_possessed(k)		0
-#define alloc_uid_keyring(u,c)		0
 #define switch_uid_keyring(u)		do { } while(0)
 #define __install_session_keyring(t, k)	({ NULL; })
 #define copy_keys(f,t)			0
@@ -312,10 +308,6 @@ extern void key_init(void);
 #define key_fsgid_changed(t)		do { } while(0)
 #define key_init()			do { } while(0)
 
-/* Initial keyrings */
-extern struct key root_user_keyring;
-extern struct key root_session_keyring;
-
 #endif /* CONFIG_KEYS */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_KEY_H */
diff --git a/kernel/user.c b/kernel/user.c
index debce602bfdd..aefbbfa3159f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -53,10 +53,6 @@ struct user_struct root_user = {
 	.files		= ATOMIC_INIT(0),
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
-#ifdef CONFIG_KEYS
-	.uid_keyring	= &root_user_keyring,
-	.session_keyring = &root_session_keyring,
-#endif
 #ifdef CONFIG_USER_SCHED
 	.tg		= &init_task_group,
 #endif
@@ -420,12 +416,12 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 		new->mq_bytes = 0;
 #endif
 		new->locked_shm = 0;
-
-		if (alloc_uid_keyring(new, current) < 0)
-			goto out_free_user;
+#ifdef CONFIG_KEYS
+		new->uid_keyring = new->session_keyring = NULL;
+#endif
 
 		if (sched_create_user(new) < 0)
-			goto out_put_keys;
+			goto out_free_user;
 
 		if (uids_user_create(new))
 			goto out_destoy_sched;
@@ -459,9 +455,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 
 out_destoy_sched:
 	sched_destroy_user(new);
-out_put_keys:
-	key_put(new->uid_keyring);
-	key_put(new->session_keyring);
 out_free_user:
 	kmem_cache_free(uid_cachep, new);
 out_unlock:
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 6361d3736dbc..2ab38854c47f 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -77,8 +77,6 @@ extern struct mutex key_construction_mutex;
 extern wait_queue_head_t request_key_conswq;
 
 
-extern void keyring_publish_name(struct key *keyring);
-
 extern int __key_link(struct key *keyring, struct key *key);
 
 extern key_ref_t __keyring_search_one(key_ref_t keyring_ref,
@@ -102,7 +100,7 @@ extern key_ref_t search_process_keyrings(struct key_type *type,
 					 key_match_func_t match,
 					 struct task_struct *tsk);
 
-extern struct key *find_keyring_by_name(const char *name, key_serial_t bound);
+extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check);
 
 extern int install_thread_keyring(struct task_struct *tsk);
 extern int install_process_keyring(struct task_struct *tsk);
diff --git a/security/keys/key.c b/security/keys/key.c
index d98c61953be6..46f125aa7fa3 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -1,6 +1,6 @@
 /* Basic authentication token and access key management
  *
- * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -137,36 +137,6 @@ void key_user_put(struct key_user *user)
 
 } /* end key_user_put() */
 
-/*****************************************************************************/
-/*
- * insert a key with a fixed serial number
- */
-static void __init __key_insert_serial(struct key *key)
-{
-	struct rb_node *parent, **p;
-	struct key *xkey;
-
-	parent = NULL;
-	p = &key_serial_tree.rb_node;
-
-	while (*p) {
-		parent = *p;
-		xkey = rb_entry(parent, struct key, serial_node);
-
-		if (key->serial < xkey->serial)
-			p = &(*p)->rb_left;
-		else if (key->serial > xkey->serial)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	/* we've found a suitable hole - arrange for this key to occupy it */
-	rb_link_node(&key->serial_node, parent, p);
-	rb_insert_color(&key->serial_node, &key_serial_tree);
-
-} /* end __key_insert_serial() */
-
 /*****************************************************************************/
 /*
  * assign a key the next unique serial number
@@ -1020,17 +990,4 @@ void __init key_init(void)
 	rb_insert_color(&root_key_user.node,
 			&key_user_tree);
 
-	/* record root's user standard keyrings */
-	key_check(&root_user_keyring);
-	key_check(&root_session_keyring);
-
-	__key_insert_serial(&root_user_keyring);
-	__key_insert_serial(&root_session_keyring);
-
-	keyring_publish_name(&root_user_keyring);
-	keyring_publish_name(&root_session_keyring);
-
-	/* link the two root keyrings together */
-	key_link(&root_session_keyring, &root_user_keyring);
-
 } /* end key_init() */
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 70f0c313c888..a9ab8affc092 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -1,6 +1,6 @@
-/* keyring.c: keyring handling
+/* Keyring handling
  *
- * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -79,7 +79,7 @@ static DECLARE_RWSEM(keyring_serialise_link_sem);
  * publish the name of a keyring so that it can be found by name (if it has
  * one)
  */
-void keyring_publish_name(struct key *keyring)
+static void keyring_publish_name(struct key *keyring)
 {
 	int bucket;
 
@@ -516,10 +516,9 @@ key_ref_t __keyring_search_one(key_ref_t keyring_ref,
 /*
  * find a keyring with the specified name
  * - all named keyrings are searched
- * - only find keyrings with search permission for the process
- * - only find keyrings with a serial number greater than the one specified
+ * - normally only finds keyrings with search permission for the current process
  */
-struct key *find_keyring_by_name(const char *name, key_serial_t bound)
+struct key *find_keyring_by_name(const char *name, bool skip_perm_check)
 {
 	struct key *keyring;
 	int bucket;
@@ -545,15 +544,11 @@ struct key *find_keyring_by_name(const char *name, key_serial_t bound)
 			if (strcmp(keyring->description, name) != 0)
 				continue;
 
-			if (key_permission(make_key_ref(keyring, 0),
+			if (!skip_perm_check &&
+			    key_permission(make_key_ref(keyring, 0),
 					   KEY_SEARCH) < 0)
 				continue;
 
-			/* found a potential candidate, but we still need to
-			 * check the serial number */
-			if (keyring->serial <= bound)
-				continue;
-
 			/* we've got a match */
 			atomic_inc(&keyring->usage);
 			read_unlock(&keyring_name_lock);
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index c886a2bb792a..5be6d018759a 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -1,6 +1,6 @@
-/* process_keys.c: management of a process's keyrings
+/* Management of a process's keyrings
  *
- * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -23,6 +23,9 @@
 /* session keyring create vs join semaphore */
 static DEFINE_MUTEX(key_session_mutex);
 
+/* user keyring creation semaphore */
+static DEFINE_MUTEX(key_user_keyring_mutex);
+
 /* the root user's tracking struct */
 struct key_user root_key_user = {
 	.usage		= ATOMIC_INIT(3),
@@ -33,78 +36,84 @@ struct key_user root_key_user = {
 	.uid		= 0,
 };
 
-/* the root user's UID keyring */
-struct key root_user_keyring = {
-	.usage		= ATOMIC_INIT(1),
-	.serial		= 2,
-	.type		= &key_type_keyring,
-	.user		= &root_key_user,
-	.sem		= __RWSEM_INITIALIZER(root_user_keyring.sem),
-	.perm		= (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
-	.flags		= 1 << KEY_FLAG_INSTANTIATED,
-	.description	= "_uid.0",
-#ifdef KEY_DEBUGGING
-	.magic		= KEY_DEBUG_MAGIC,
-#endif
-};
-
-/* the root user's default session keyring */
-struct key root_session_keyring = {
-	.usage		= ATOMIC_INIT(1),
-	.serial		= 1,
-	.type		= &key_type_keyring,
-	.user		= &root_key_user,
-	.sem		= __RWSEM_INITIALIZER(root_session_keyring.sem),
-	.perm		= (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
-	.flags		= 1 << KEY_FLAG_INSTANTIATED,
-	.description	= "_uid_ses.0",
-#ifdef KEY_DEBUGGING
-	.magic		= KEY_DEBUG_MAGIC,
-#endif
-};
-
 /*****************************************************************************/
 /*
- * allocate the keyrings to be associated with a UID
+ * install user and user session keyrings for a particular UID
  */
-int alloc_uid_keyring(struct user_struct *user,
-		      struct task_struct *ctx)
+static int install_user_keyrings(struct task_struct *tsk)
 {
+	struct user_struct *user = tsk->user;
 	struct key *uid_keyring, *session_keyring;
 	char buf[20];
 	int ret;
 
-	/* concoct a default session keyring */
-	sprintf(buf, "_uid_ses.%u", user->uid);
+	kenter("%p{%u}", user, user->uid);
 
-	session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx,
-					KEY_ALLOC_IN_QUOTA, NULL);
-	if (IS_ERR(session_keyring)) {
-		ret = PTR_ERR(session_keyring);
-		goto error;
+	if (user->uid_keyring) {
+		kleave(" = 0 [exist]");
+		return 0;
 	}
 
-	/* and a UID specific keyring, pointed to by the default session
-	 * keyring */
-	sprintf(buf, "_uid.%u", user->uid);
+	mutex_lock(&key_user_keyring_mutex);
+	ret = 0;
 
-	uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx,
-				    KEY_ALLOC_IN_QUOTA, session_keyring);
-	if (IS_ERR(uid_keyring)) {
-		key_put(session_keyring);
-		ret = PTR_ERR(uid_keyring);
-		goto error;
+	if (!user->uid_keyring) {
+		/* get the UID-specific keyring
+		 * - there may be one in existence already as it may have been
+		 *   pinned by a session, but the user_struct pointing to it
+		 *   may have been destroyed by setuid */
+		sprintf(buf, "_uid.%u", user->uid);
+
+		uid_keyring = find_keyring_by_name(buf, true);
+		if (IS_ERR(uid_keyring)) {
+			uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1,
+						    tsk, KEY_ALLOC_IN_QUOTA,
+						    NULL);
+			if (IS_ERR(uid_keyring)) {
+				ret = PTR_ERR(uid_keyring);
+				goto error;
+			}
+		}
+
+		/* get a default session keyring (which might also exist
+		 * already) */
+		sprintf(buf, "_uid_ses.%u", user->uid);
+
+		session_keyring = find_keyring_by_name(buf, true);
+		if (IS_ERR(session_keyring)) {
+			session_keyring =
+				keyring_alloc(buf, user->uid, (gid_t) -1,
+					      tsk, KEY_ALLOC_IN_QUOTA, NULL);
+			if (IS_ERR(session_keyring)) {
+				ret = PTR_ERR(session_keyring);
+				goto error_release;
+			}
+
+			/* we install a link from the user session keyring to
+			 * the user keyring */
+			ret = key_link(session_keyring, uid_keyring);
+			if (ret < 0)
+				goto error_release_both;
+		}
+
+		/* install the keyrings */
+		user->uid_keyring = uid_keyring;
+		user->session_keyring = session_keyring;
 	}
 
-	/* install the keyrings */
-	user->uid_keyring = uid_keyring;
-	user->session_keyring = session_keyring;
-	ret = 0;
+	mutex_unlock(&key_user_keyring_mutex);
+	kleave(" = 0");
+	return 0;
 
+error_release_both:
+	key_put(session_keyring);
+error_release:
+	key_put(uid_keyring);
 error:
+	mutex_unlock(&key_user_keyring_mutex);
+	kleave(" = %d", ret);
 	return ret;
-
-} /* end alloc_uid_keyring() */
+}
 
 /*****************************************************************************/
 /*
@@ -481,7 +490,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 		}
 	}
 	/* or search the user-session keyring */
-	else {
+	else if (context->user->session_keyring) {
 		key_ref = keyring_search_aux(
 			make_key_ref(context->user->session_keyring, 1),
 			context, type, description, match);
@@ -614,6 +623,9 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id,
 		if (!context->signal->session_keyring) {
 			/* always install a session keyring upon access if one
 			 * doesn't exist yet */
+			ret = install_user_keyrings(context);
+			if (ret < 0)
+				goto error;
 			ret = install_session_keyring(
 				context, context->user->session_keyring);
 			if (ret < 0)
@@ -628,12 +640,24 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id,
 		break;
 
 	case KEY_SPEC_USER_KEYRING:
+		if (!context->user->uid_keyring) {
+			ret = install_user_keyrings(context);
+			if (ret < 0)
+				goto error;
+		}
+
 		key = context->user->uid_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
 
 	case KEY_SPEC_USER_SESSION_KEYRING:
+		if (!context->user->session_keyring) {
+			ret = install_user_keyrings(context);
+			if (ret < 0)
+				goto error;
+		}
+
 		key = context->user->session_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
@@ -744,7 +768,7 @@ long join_session_keyring(const char *name)
 	mutex_lock(&key_session_mutex);
 
 	/* look for an existing keyring of this name */
-	keyring = find_keyring_by_name(name, 0);
+	keyring = find_keyring_by_name(name, false);
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
 		keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk,
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 838d1e5e63a1..4e4de98941ae 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -5551,14 +5551,6 @@ static __init int selinux_init(void)
 	else
 		printk(KERN_DEBUG "SELinux:  Starting in permissive mode\n");
 
-#ifdef CONFIG_KEYS
-	/* Add security information to initial keyrings */
-	selinux_key_alloc(&root_user_keyring, current,
-			  KEY_ALLOC_NOT_IN_QUOTA);
-	selinux_key_alloc(&root_session_keyring, current,
-			  KEY_ALLOC_NOT_IN_QUOTA);
-#endif
-
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 0b77f5bfb45c13e1e5142374f9d6ca75292252a4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Apr 2008 01:01:32 -0700
Subject: keys: make the keyring quotas controllable through /proc/sys

Make the keyring quotas controllable through /proc/sys files:

 (*) /proc/sys/kernel/keys/root_maxkeys
     /proc/sys/kernel/keys/root_maxbytes

     Maximum number of keys that root may have and the maximum total number of
     bytes of data that root may have stored in those keys.

 (*) /proc/sys/kernel/keys/maxkeys
     /proc/sys/kernel/keys/maxbytes

     Maximum number of keys that each non-root user may have and the maximum
     total number of bytes of data that each of those users may have stored in
     their keys.

Also increase the quotas as a number of people have been complaining that it's
not big enough.  I'm not sure that it's big enough now either, but on the
other hand, it can now be set in /etc/sysctl.conf.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: <kwc@citi.umich.edu>
Cc: <arunsr@cse.iitk.ac.in>
Cc: <dwalsh@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/keys.txt   | 24 ++++++++++++++++++++++-
 include/linux/key.h      |  5 +++++
 kernel/sysctl.c          |  9 +++++++++
 security/keys/Makefile   |  1 +
 security/keys/internal.h | 14 ++++++++++----
 security/keys/key.c      | 23 +++++++++++++++++-----
 security/keys/keyctl.c   | 12 +++++++++---
 security/keys/proc.c     |  9 ++++++---
 security/keys/sysctl.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 131 insertions(+), 16 deletions(-)
 create mode 100644 security/keys/sysctl.c

(limited to 'kernel')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index be424b02437d..d5c7a57d1700 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -170,7 +170,8 @@ The key service provides a number of features besides keys:
      amount of description and payload space that can be consumed.
 
      The user can view information on this and other statistics through procfs
-     files.
+     files.  The root user may also alter the quota limits through sysctl files
+     (see the section "New procfs files").
 
      Process-specific and thread-specific keyrings are not counted towards a
      user's quota.
@@ -329,6 +330,27 @@ about the status of the key service:
 	<bytes>/<max>		Key size quota
 
 
+Four new sysctl files have been added also for the purpose of controlling the
+quota limits on keys:
+
+ (*) /proc/sys/kernel/keys/root_maxkeys
+     /proc/sys/kernel/keys/root_maxbytes
+
+     These files hold the maximum number of keys that root may have and the
+     maximum total number of bytes of data that root may have stored in those
+     keys.
+
+ (*) /proc/sys/kernel/keys/maxkeys
+     /proc/sys/kernel/keys/maxbytes
+
+     These files hold the maximum number of keys that each non-root user may
+     have and the maximum total number of bytes of data that each of those
+     users may have stored in their keys.
+
+Root may alter these by writing each new limit as a decimal number string to
+the appropriate file.
+
+
 ===============================
 USERSPACE SYSTEM CALL INTERFACE
 ===============================
diff --git a/include/linux/key.h b/include/linux/key.h
index 2effd031a817..ad02d9cfe170 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -19,6 +19,7 @@
 #include <linux/list.h>
 #include <linux/rbtree.h>
 #include <linux/rcupdate.h>
+#include <linux/sysctl.h>
 #include <asm/atomic.h>
 
 #ifdef __KERNEL__
@@ -265,6 +266,10 @@ extern struct key *key_lookup(key_serial_t id);
 
 #define key_serial(key) ((key) ? (key)->serial : 0)
 
+#ifdef CONFIG_SYSCTL
+extern ctl_table key_sysctls[];
+#endif
+
 /*
  * the userspace interface
  */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fd3364827ccf..0a1d2733cf41 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -38,6 +38,7 @@
 #include <linux/writeback.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
+#include <linux/key.h>
 #include <linux/times.h>
 #include <linux/limits.h>
 #include <linux/dcache.h>
@@ -809,6 +810,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
 	},
+#ifdef CONFIG_KEYS
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "keys",
+		.mode		= 0555,
+		.child		= key_sysctls,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/security/keys/Makefile b/security/keys/Makefile
index 5145adfb6a05..747a464943af 100644
--- a/security/keys/Makefile
+++ b/security/keys/Makefile
@@ -14,3 +14,4 @@ obj-y := \
 
 obj-$(CONFIG_KEYS_COMPAT) += compat.o
 obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 2ab38854c47f..8c05587f5018 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -57,10 +57,6 @@ struct key_user {
 	int			qnbytes;	/* number of bytes allocated to this user */
 };
 
-#define KEYQUOTA_MAX_KEYS	100
-#define KEYQUOTA_MAX_BYTES	10000
-#define KEYQUOTA_LINK_BYTES	4		/* a link in a keyring is worth 4 bytes */
-
 extern struct rb_root	key_user_tree;
 extern spinlock_t	key_user_lock;
 extern struct key_user	root_key_user;
@@ -68,6 +64,16 @@ extern struct key_user	root_key_user;
 extern struct key_user *key_user_lookup(uid_t uid);
 extern void key_user_put(struct key_user *user);
 
+/*
+ * key quota limits
+ * - root has its own separate limits to everyone else
+ */
+extern unsigned key_quota_root_maxkeys;
+extern unsigned key_quota_root_maxbytes;
+extern unsigned key_quota_maxkeys;
+extern unsigned key_quota_maxbytes;
+
+#define KEYQUOTA_LINK_BYTES	4		/* a link in a keyring is worth 4 bytes */
 
 
 extern struct rb_root key_serial_tree;
diff --git a/security/keys/key.c b/security/keys/key.c
index 46f125aa7fa3..14948cf83ef6 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -27,6 +27,11 @@ DEFINE_SPINLOCK(key_serial_lock);
 struct rb_root	key_user_tree; /* tree of quota records indexed by UID */
 DEFINE_SPINLOCK(key_user_lock);
 
+unsigned int key_quota_root_maxkeys = 200;	/* root's key count quota */
+unsigned int key_quota_root_maxbytes = 20000;	/* root's key space quota */
+unsigned int key_quota_maxkeys = 200;		/* general key count quota */
+unsigned int key_quota_maxbytes = 20000;	/* general key space quota */
+
 static LIST_HEAD(key_types_list);
 static DECLARE_RWSEM(key_types_sem);
 
@@ -236,11 +241,16 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	/* check that the user's quota permits allocation of another key and
 	 * its description */
 	if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) {
+		unsigned maxkeys = (uid == 0) ?
+			key_quota_root_maxkeys : key_quota_maxkeys;
+		unsigned maxbytes = (uid == 0) ?
+			key_quota_root_maxbytes : key_quota_maxbytes;
+
 		spin_lock(&user->lock);
 		if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) {
-			if (user->qnkeys + 1 >= KEYQUOTA_MAX_KEYS ||
-			    user->qnbytes + quotalen >= KEYQUOTA_MAX_BYTES
-			    )
+			if (user->qnkeys + 1 >= maxkeys ||
+			    user->qnbytes + quotalen >= maxbytes ||
+			    user->qnbytes + quotalen < user->qnbytes)
 				goto no_quota;
 		}
 
@@ -345,11 +355,14 @@ int key_payload_reserve(struct key *key, size_t datalen)
 
 	/* contemplate the quota adjustment */
 	if (delta != 0 && test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) {
+		unsigned maxbytes = (key->user->uid == 0) ?
+			key_quota_root_maxbytes : key_quota_maxbytes;
+
 		spin_lock(&key->user->lock);
 
 		if (delta > 0 &&
-		    key->user->qnbytes + delta > KEYQUOTA_MAX_BYTES
-		    ) {
+		    (key->user->qnbytes + delta >= maxbytes ||
+		     key->user->qnbytes + delta < key->user->qnbytes)) {
 			ret = -EDQUOT;
 		}
 		else {
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 993be634a5ef..acc9c89e40a8 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -731,10 +731,16 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid)
 
 		/* transfer the quota burden to the new user */
 		if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) {
+			unsigned maxkeys = (uid == 0) ?
+				key_quota_root_maxkeys : key_quota_maxkeys;
+			unsigned maxbytes = (uid == 0) ?
+				key_quota_root_maxbytes : key_quota_maxbytes;
+
 			spin_lock(&newowner->lock);
-			if (newowner->qnkeys + 1 >= KEYQUOTA_MAX_KEYS ||
-			    newowner->qnbytes + key->quotalen >=
-			    KEYQUOTA_MAX_BYTES)
+			if (newowner->qnkeys + 1 >= maxkeys ||
+			    newowner->qnbytes + key->quotalen >= maxbytes ||
+			    newowner->qnbytes + key->quotalen <
+			    newowner->qnbytes)
 				goto quota_overrun;
 
 			newowner->qnkeys++;
diff --git a/security/keys/proc.c b/security/keys/proc.c
index e54679b848cf..f619170da760 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -242,6 +242,10 @@ static int proc_key_users_show(struct seq_file *m, void *v)
 {
 	struct rb_node *_p = v;
 	struct key_user *user = rb_entry(_p, struct key_user, node);
+	unsigned maxkeys = (user->uid == 0) ?
+		key_quota_root_maxkeys : key_quota_maxkeys;
+	unsigned maxbytes = (user->uid == 0) ?
+		key_quota_root_maxbytes : key_quota_maxbytes;
 
 	seq_printf(m, "%5u: %5d %d/%d %d/%d %d/%d\n",
 		   user->uid,
@@ -249,10 +253,9 @@ static int proc_key_users_show(struct seq_file *m, void *v)
 		   atomic_read(&user->nkeys),
 		   atomic_read(&user->nikeys),
 		   user->qnkeys,
-		   KEYQUOTA_MAX_KEYS,
+		   maxkeys,
 		   user->qnbytes,
-		   KEYQUOTA_MAX_BYTES
-		   );
+		   maxbytes);
 
 	return 0;
 
diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c
new file mode 100644
index 000000000000..b611d493c2d8
--- /dev/null
+++ b/security/keys/sysctl.c
@@ -0,0 +1,50 @@
+/* Key management controls
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/key.h>
+#include <linux/sysctl.h>
+#include "internal.h"
+
+ctl_table key_sysctls[] = {
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "maxkeys",
+		.data = &key_quota_maxkeys,
+		.maxlen = sizeof(unsigned),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec,
+	},
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "maxbytes",
+		.data = &key_quota_maxbytes,
+		.maxlen = sizeof(unsigned),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec,
+	},
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "root_maxkeys",
+		.data = &key_quota_root_maxkeys,
+		.maxlen = sizeof(unsigned),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec,
+	},
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "root_maxbytes",
+		.data = &key_quota_root_maxbytes,
+		.maxlen = sizeof(unsigned),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
-- 
cgit v1.3-14-g43fede


From 925d1c401fa6cfd0df5d2e37da8981494ccdec07 Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Tue, 29 Apr 2008 01:01:36 -0700
Subject: procfs task exe symlink

The kernel implements readlink of /proc/pid/exe by getting the file from
the first executable VMA.  Then the path to the file is reconstructed and
reported as the result.

Because of the VMA walk the code is slightly different on nommu systems.
This patch avoids separate /proc/pid/exe code on nommu systems.  Instead of
walking the VMAs to find the first executable file-backed VMA we store a
reference to the exec'd file in the mm_struct.

That reference would prevent the filesystem holding the executable file
from being unmounted even after unmapping the VMAs.  So we track the number
of VM_EXECUTABLE VMAs and drop the new reference when the last one is
unmapped.  This avoids pinning the mounted filesystem.

[akpm@linux-foundation.org: improve comments]
[yamamoto@valinux.co.jp: fix dup_mmap]
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: David Howells <dhowells@redhat.com>
Cc:"Eric W. Biederman" <ebiederm@xmission.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_flat.c         |  3 +-
 fs/exec.c                |  2 ++
 fs/proc/base.c           | 75 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/internal.h       |  1 -
 fs/proc/task_mmu.c       | 34 ----------------------
 fs/proc/task_nommu.c     | 34 ----------------------
 include/linux/mm.h       | 13 +++++++++
 include/linux/mm_types.h |  6 ++++
 include/linux/proc_fs.h  | 20 ++++++++++++-
 kernel/fork.c            |  3 ++
 mm/mmap.c                | 24 +++++++++++++---
 mm/nommu.c               | 23 +++++++++++----
 12 files changed, 157 insertions(+), 81 deletions(-)

(limited to 'kernel')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index c12cc362fd3b..3b40d45a3a16 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -531,7 +531,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 		DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
 
 		down_write(&current->mm->mmap_sem);
-		textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE, 0);
+		textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
+				  MAP_PRIVATE|MAP_EXECUTABLE, 0);
 		up_write(&current->mm->mmap_sem);
 		if (!textpos  || textpos >= (unsigned long) -4096) {
 			if (!textpos)
diff --git a/fs/exec.c b/fs/exec.c
index 711bc45d789c..a13883903ee9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -964,6 +964,8 @@ int flush_old_exec(struct linux_binprm * bprm)
 	if (retval)
 		goto out;
 
+	set_mm_exe_file(bprm->mm, bprm->file);
+
 	/*
 	 * Release all of the old mmap stuff
 	 */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c5e412a00b17..b48ddb119945 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1181,6 +1181,81 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas--;
+	if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
+		fput(mm->exe_file);
+		mm->exe_file = NULL;
+	}
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+	if (new_exe_file)
+		get_file(new_exe_file);
+	if (mm->exe_file)
+		fput(mm->exe_file);
+	mm->exe_file = new_exe_file;
+	mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+	struct file *exe_file;
+
+	/* We need mmap_sem to protect against races with removal of
+	 * VM_EXECUTABLE vmas */
+	down_read(&mm->mmap_sem);
+	exe_file = mm->exe_file;
+	if (exe_file)
+		get_file(exe_file);
+	up_read(&mm->mmap_sem);
+	return exe_file;
+}
+
+void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+	/* It's safe to write the exe_file pointer without exe_file_lock because
+	 * this is called during fork when the task is not yet in /proc */
+	newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
+static int proc_exe_link(struct inode *inode, struct path *exe_path)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	struct file *exe_file;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ENOENT;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		return -ENOENT;
+	exe_file = get_mm_exe_file(mm);
+	mmput(mm);
+	if (exe_file) {
+		*exe_path = exe_file->f_path;
+		path_get(&exe_file->f_path);
+		fput(exe_file);
+		return 0;
+	} else
+		return -ENOENT;
+}
+
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index bc72f5c8c47d..45abb9803988 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -48,7 +48,6 @@ extern int maps_protect;
 
 extern void create_seq_entry(char *name, mode_t mode,
 				const struct file_operations *f);
-extern int proc_exe_link(struct inode *, struct path *);
 extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *task);
 extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7415eeb7cc3a..e2b8e769f510 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -75,40 +75,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	return mm->total_vm;
 }
 
-int proc_exe_link(struct inode *inode, struct path *path)
-{
-	struct vm_area_struct * vma;
-	int result = -ENOENT;
-	struct task_struct *task = get_proc_task(inode);
-	struct mm_struct * mm = NULL;
-
-	if (task) {
-		mm = get_task_mm(task);
-		put_task_struct(task);
-	}
-	if (!mm)
-		goto out;
-	down_read(&mm->mmap_sem);
-
-	vma = mm->mmap;
-	while (vma) {
-		if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
-			break;
-		vma = vma->vm_next;
-	}
-
-	if (vma) {
-		*path = vma->vm_file->f_path;
-		path_get(&vma->vm_file->f_path);
-		result = 0;
-	}
-
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-out:
-	return result;
-}
-
 static void pad_len_spaces(struct seq_file *m, int len)
 {
 	len = 25 + sizeof(void*) * 6 - len;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8011528518bd..4b733f108455 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -103,40 +103,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	return size;
 }
 
-int proc_exe_link(struct inode *inode, struct path *path)
-{
-	struct vm_list_struct *vml;
-	struct vm_area_struct *vma;
-	struct task_struct *task = get_proc_task(inode);
-	struct mm_struct *mm = get_task_mm(task);
-	int result = -ENOENT;
-
-	if (!mm)
-		goto out;
-	down_read(&mm->mmap_sem);
-
-	vml = mm->context.vmlist;
-	vma = NULL;
-	while (vml) {
-		if ((vml->vma->vm_flags & VM_EXECUTABLE) && vml->vma->vm_file) {
-			vma = vml->vma;
-			break;
-		}
-		vml = vml->next;
-	}
-
-	if (vma) {
-		*path = vma->vm_file->f_path;
-		path_get(&vma->vm_file->f_path);
-		result = 0;
-	}
-
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-out:
-	return result;
-}
-
 /*
  * display mapping lines for a particular process's /proc/pid/maps
  */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fef602d82722..c31a9cd2a30e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1066,6 +1066,19 @@ extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
 extern void exit_mmap(struct mm_struct *);
+
+#ifdef CONFIG_PROC_FS
+/* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */
+extern void added_exe_file_vma(struct mm_struct *mm);
+extern void removed_exe_file_vma(struct mm_struct *mm);
+#else
+static inline void added_exe_file_vma(struct mm_struct *mm)
+{}
+
+static inline void removed_exe_file_vma(struct mm_struct *mm)
+{}
+#endif /* CONFIG_PROC_FS */
+
 extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
 extern int install_special_mapping(struct mm_struct *mm,
 				   unsigned long addr, unsigned long len,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bc97bd54f606..eb7c16cc9559 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -229,6 +229,12 @@ struct mm_struct {
 	struct task_struct *owner;	/* The thread group leader that */
 					/* owns the mm_struct.		*/
 #endif
+
+#ifdef CONFIG_PROC_FS
+	/* store ref to file /proc/<pid>/exe symlink points to */
+	struct file *exe_file;
+	unsigned long num_exe_file_vmas;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 9b6c935f69cf..65f2299b772b 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -9,7 +9,6 @@
 
 struct net;
 struct completion;
-
 /*
  * The proc filesystem constants/structures
  */
@@ -206,6 +205,12 @@ extern void proc_net_remove(struct net *net, const char *name);
 extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
 	struct proc_dir_entry *parent);
 
+/* While the {get|set|dup}_mm_exe_file functions are for mm_structs, they are
+ * only needed to implement /proc/<pid>|self/exe so we define them here. */
+extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
+extern struct file *get_mm_exe_file(struct mm_struct *mm);
+extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
+
 #else
 
 #define proc_root_driver NULL
@@ -255,6 +260,19 @@ static inline void pid_ns_release_proc(struct pid_namespace *ns)
 {
 }
 
+static inline void set_mm_exe_file(struct mm_struct *mm,
+				   struct file *new_exe_file)
+{}
+
+static inline struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+	return NULL;
+}
+
+static inline void dup_mm_exe_file(struct mm_struct *oldmm,
+	       			   struct mm_struct *newmm)
+{}
+
 #endif /* CONFIG_PROC_FS */
 
 #if !defined(CONFIG_PROC_KCORE)
diff --git a/kernel/fork.c b/kernel/fork.c
index de5c16c6b6ec..068ffe007529 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -431,6 +431,7 @@ void mmput(struct mm_struct *mm)
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		exit_mmap(mm);
+		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
 			spin_lock(&mmlist_lock);
 			list_del(&mm->mmlist);
@@ -543,6 +544,8 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
 
+	dup_mm_exe_file(oldmm, mm);
+
 	err = dup_mmap(mm, oldmm);
 	if (err)
 		goto free_pt;
diff --git a/mm/mmap.c b/mm/mmap.c
index 677d184b0d42..fac66337da2a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -230,8 +230,11 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (vma->vm_file)
+	if (vma->vm_file) {
 		fput(vma->vm_file);
+		if (vma->vm_flags & VM_EXECUTABLE)
+			removed_exe_file_vma(vma->vm_mm);
+	}
 	mpol_put(vma_policy(vma));
 	kmem_cache_free(vm_area_cachep, vma);
 	return next;
@@ -623,8 +626,11 @@ again:			remove_next = 1 + (end > next->vm_end);
 		spin_unlock(&mapping->i_mmap_lock);
 
 	if (remove_next) {
-		if (file)
+		if (file) {
 			fput(file);
+			if (next->vm_flags & VM_EXECUTABLE)
+				removed_exe_file_vma(mm);
+		}
 		mm->map_count--;
 		mpol_put(vma_policy(next));
 		kmem_cache_free(vm_area_cachep, next);
@@ -1154,6 +1160,8 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
+		if (vm_flags & VM_EXECUTABLE)
+			added_exe_file_vma(mm);
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
 		if (error)
@@ -1185,6 +1193,8 @@ munmap_back:
 		mpol_put(vma_policy(vma));
 		kmem_cache_free(vm_area_cachep, vma);
 		fput(file);
+		if (vm_flags & VM_EXECUTABLE)
+			removed_exe_file_vma(mm);
 	} else {
 		vma_link(mm, vma, prev, rb_link, rb_parent);
 		file = vma->vm_file;
@@ -1817,8 +1827,11 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	}
 	vma_set_policy(new, pol);
 
-	if (new->vm_file)
+	if (new->vm_file) {
 		get_file(new->vm_file);
+		if (vma->vm_flags & VM_EXECUTABLE)
+			added_exe_file_vma(mm);
+	}
 
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
@@ -2135,8 +2148,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			new_vma->vm_start = addr;
 			new_vma->vm_end = addr + len;
 			new_vma->vm_pgoff = pgoff;
-			if (new_vma->vm_file)
+			if (new_vma->vm_file) {
 				get_file(new_vma->vm_file);
+				if (vma->vm_flags & VM_EXECUTABLE)
+					added_exe_file_vma(mm);
+			}
 			if (new_vma->vm_ops && new_vma->vm_ops->open)
 				new_vma->vm_ops->open(new_vma);
 			vma_link(mm, new_vma, prev, rb_link, rb_parent);
diff --git a/mm/nommu.c b/mm/nommu.c
index 1d32fe89d57b..ef8c62cec697 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -966,8 +966,13 @@ unsigned long do_mmap_pgoff(struct file *file,
 
 	INIT_LIST_HEAD(&vma->anon_vma_node);
 	atomic_set(&vma->vm_usage, 1);
-	if (file)
+	if (file) {
 		get_file(file);
+		if (vm_flags & VM_EXECUTABLE) {
+			added_exe_file_vma(current->mm);
+			vma->vm_mm = current->mm;
+		}
+	}
 	vma->vm_file	= file;
 	vma->vm_flags	= vm_flags;
 	vma->vm_start	= addr;
@@ -1022,8 +1027,11 @@ unsigned long do_mmap_pgoff(struct file *file,
 	up_write(&nommu_vma_sem);
 	kfree(vml);
 	if (vma) {
-		if (vma->vm_file)
+		if (vma->vm_file) {
 			fput(vma->vm_file);
+			if (vma->vm_flags & VM_EXECUTABLE)
+				removed_exe_file_vma(vma->vm_mm);
+		}
 		kfree(vma);
 	}
 	return ret;
@@ -1053,7 +1061,7 @@ EXPORT_SYMBOL(do_mmap_pgoff);
 /*
  * handle mapping disposal for uClinux
  */
-static void put_vma(struct vm_area_struct *vma)
+static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	if (vma) {
 		down_write(&nommu_vma_sem);
@@ -1075,8 +1083,11 @@ static void put_vma(struct vm_area_struct *vma)
 			realalloc -= kobjsize(vma);
 			askedalloc -= sizeof(*vma);
 
-			if (vma->vm_file)
+			if (vma->vm_file) {
 				fput(vma->vm_file);
+				if (vma->vm_flags & VM_EXECUTABLE)
+					removed_exe_file_vma(mm);
+			}
 			kfree(vma);
 		}
 
@@ -1113,7 +1124,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
  found:
 	vml = *parent;
 
-	put_vma(vml->vma);
+	put_vma(mm, vml->vma);
 
 	*parent = vml->next;
 	realalloc -= kobjsize(vml);
@@ -1158,7 +1169,7 @@ void exit_mmap(struct mm_struct * mm)
 
 		while ((tmp = mm->context.vmlist)) {
 			mm->context.vmlist = tmp->next;
-			put_vma(tmp->vma);
+			put_vma(mm, tmp->vma);
 
 			realalloc -= kobjsize(tmp);
 			askedalloc -= sizeof(*tmp);
-- 
cgit v1.3-14-g43fede


From c74c120a21d87b0b6925ada5830d8cac21e852d9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 29 Apr 2008 01:01:44 -0700
Subject: proc: remove proc_root from drivers

Remove proc_root export.  Creation and removal works well if parent PDE is
supplied as NULL -- it worked always that way.

So, one useless export removed and consistency added, some drivers created
PDEs with &proc_root as parent but removed them as NULL and so on.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/atags.c                 |  2 +-
 arch/m68k/mac/iop.c                     |  2 +-
 arch/mips/basler/excite/excite_procfs.c |  2 +-
 arch/um/kernel/exitcode.c               |  2 +-
 arch/um/kernel/process.c                |  2 +-
 arch/x86/kernel/cpu/mtrr/if.c           |  2 +-
 drivers/char/ip2/ip2main.c              |  4 ++--
 drivers/mca/mca-proc.c                  |  2 +-
 drivers/misc/hdpuftrs/hdpu_cpustate.c   |  2 +-
 drivers/misc/hdpuftrs/hdpu_nexus.c      | 12 ++++++------
 drivers/s390/block/dasd_proc.c          |  6 +++---
 drivers/s390/char/tape_proc.c           |  4 ++--
 drivers/s390/cio/blacklist.c            |  2 +-
 drivers/s390/cio/qdio.c                 |  4 ++--
 drivers/scsi/megaraid.c                 |  6 +++---
 drivers/video/clps711xfb.c              |  2 +-
 fs/proc/internal.h                      |  1 +
 fs/proc/proc_misc.c                     |  2 +-
 fs/proc/root.c                          |  1 -
 include/linux/proc_fs.h                 |  3 ---
 kernel/configs.c                        |  5 ++---
 sound/core/info.c                       |  4 ++--
 22 files changed, 34 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/atags.c b/arch/arm/kernel/atags.c
index e2e934c38080..64c420805e6f 100644
--- a/arch/arm/kernel/atags.c
+++ b/arch/arm/kernel/atags.c
@@ -35,7 +35,7 @@ create_proc_entries(void)
 {
 	struct proc_dir_entry* tags_entry;
 
-	tags_entry = create_proc_read_entry("atags", 0400, &proc_root, read_buffer, &tags_buffer);
+	tags_entry = create_proc_read_entry("atags", 0400, NULL, read_buffer, &tags_buffer);
 	if (!tags_entry)
 		return -ENOMEM;
 
diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c
index 5b2799eb96a6..092d4b3d8f76 100644
--- a/arch/m68k/mac/iop.c
+++ b/arch/m68k/mac/iop.c
@@ -302,7 +302,7 @@ void __init iop_init(void)
 
 #if 0	/* Crashing in 2.4 now, not yet sure why.   --jmt */
 #ifdef CONFIG_PROC_FS
-	create_proc_info_entry("mac_iop", 0, &proc_root, iop_get_proc_info);
+	create_proc_info_entry("mac_iop", 0, NULL, iop_get_proc_info);
 #endif
 #endif
 }
diff --git a/arch/mips/basler/excite/excite_procfs.c b/arch/mips/basler/excite/excite_procfs.c
index 9ee67a95f6b9..6c08b386fdad 100644
--- a/arch/mips/basler/excite/excite_procfs.c
+++ b/arch/mips/basler/excite/excite_procfs.c
@@ -65,7 +65,7 @@ excite_bootrom_read(char *page, char **start, off_t off, int count,
 void excite_procfs_init(void)
 {
 	/* Create & populate /proc/excite */
-	struct proc_dir_entry * const pdir = proc_mkdir("excite", &proc_root);
+	struct proc_dir_entry * const pdir = proc_mkdir("excite", NULL);
 	if (pdir) {
 		struct proc_dir_entry * e;
 
diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c
index 984f80e668ca..6540d2c9fbb7 100644
--- a/arch/um/kernel/exitcode.c
+++ b/arch/um/kernel/exitcode.c
@@ -59,7 +59,7 @@ static int make_proc_exitcode(void)
 {
 	struct proc_dir_entry *ent;
 
-	ent = create_proc_entry("exitcode", 0600, &proc_root);
+	ent = create_proc_entry("exitcode", 0600, NULL);
 	if (ent == NULL) {
 		printk(KERN_WARNING "make_proc_exitcode : Failed to register "
 		       "/proc/exitcode\n");
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index e8cb9ff183e9..83603cfbde81 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -364,7 +364,7 @@ int __init make_proc_sysemu(void)
 	if (!sysemu_supported)
 		return 0;
 
-	ent = create_proc_entry("sysemu", 0600, &proc_root);
+	ent = create_proc_entry("sysemu", 0600, NULL);
 
 	if (ent == NULL)
 	{
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 1960f1985e5e..84c480bb3715 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -424,7 +424,7 @@ static int __init mtrr_if_init(void)
 		return -ENODEV;
 
 	proc_root_mtrr =
-		proc_create("mtrr", S_IWUSR | S_IRUGO, &proc_root, &mtrr_fops);
+		proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
 
 	if (proc_root_mtrr)
 		proc_root_mtrr->owner = THIS_MODULE;
diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c
index b1d6cad84282..a784f5e22ee9 100644
--- a/drivers/char/ip2/ip2main.c
+++ b/drivers/char/ip2/ip2main.c
@@ -423,7 +423,7 @@ cleanup_module(void)
 	}
 	put_tty_driver(ip2_tty_driver);
 	unregister_chrdev(IP2_IPL_MAJOR, pcIpl);
-	remove_proc_entry("ip2mem", &proc_root);
+	remove_proc_entry("ip2mem", NULL);
 
 	// free memory
 	for (i = 0; i < IP2_MAX_BOARDS; i++) {
@@ -695,7 +695,7 @@ ip2_loadmain(int *iop, int *irqp, unsigned char *firmware, int firmsize)
 		}
 	}
 	/* Register the read_procmem thing */
-	if (!create_proc_info_entry("ip2mem",0,&proc_root,ip2_read_procmem)) {
+	if (!create_proc_info_entry("ip2mem",0,NULL,ip2_read_procmem)) {
 		printk(KERN_ERR "IP2: failed to register read_procmem\n");
 	} else {
 
diff --git a/drivers/mca/mca-proc.c b/drivers/mca/mca-proc.c
index 33d5e0820cc5..81ea0d377bf4 100644
--- a/drivers/mca/mca-proc.c
+++ b/drivers/mca/mca-proc.c
@@ -183,7 +183,7 @@ void __init mca_do_proc_init(void)
 	struct proc_dir_entry* node = NULL;
 	struct mca_device *mca_dev;
 
-	proc_mca = proc_mkdir("mca", &proc_root);
+	proc_mca = proc_mkdir("mca", NULL);
 	create_proc_read_entry("pos",0,proc_mca,get_mca_info,NULL);
 	create_proc_read_entry("machine",0,proc_mca,get_mca_machine_info,NULL);
 
diff --git a/drivers/misc/hdpuftrs/hdpu_cpustate.c b/drivers/misc/hdpuftrs/hdpu_cpustate.c
index 302e92418bbe..154155c9b638 100644
--- a/drivers/misc/hdpuftrs/hdpu_cpustate.c
+++ b/drivers/misc/hdpuftrs/hdpu_cpustate.c
@@ -210,7 +210,7 @@ static int hdpu_cpustate_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	proc_de = create_proc_entry("sky_cpustate", 0666, &proc_root);
+	proc_de = create_proc_entry("sky_cpustate", 0666, NULL);
 	if (!proc_de) {
 		printk(KERN_WARNING "sky_cpustate: "
 		       "Unable to create proc entry\n");
diff --git a/drivers/misc/hdpuftrs/hdpu_nexus.c b/drivers/misc/hdpuftrs/hdpu_nexus.c
index 2fa36f7a6eb3..e92b7efccc72 100644
--- a/drivers/misc/hdpuftrs/hdpu_nexus.c
+++ b/drivers/misc/hdpuftrs/hdpu_nexus.c
@@ -102,8 +102,8 @@ static int hdpu_nexus_probe(struct platform_device *pdev)
 		printk(KERN_ERR "sky_nexus: Could not map slot id\n");
 	}
 
-	hdpu_slot_id = create_proc_entry("sky_slot_id", 0666, &proc_root);
-	if (!hdpu_slot_id) {
+	hdpu_slot_id = create_proc_entry("sky_slot_id", 0666, NULL);
+	if (!hdpu_slot_id)
 		printk(KERN_WARNING "sky_nexus: "
 		       "Unable to create proc dir entry: sky_slot_id\n");
 	} else {
@@ -111,8 +111,8 @@ static int hdpu_nexus_probe(struct platform_device *pdev)
 		hdpu_slot_id->owner = THIS_MODULE;
 	}
 
-	hdpu_chassis_id = create_proc_entry("sky_chassis_id", 0666, &proc_root);
-	if (!hdpu_chassis_id) {
+	hdpu_chassis_id = create_proc_entry("sky_chassis_id", 0666, NULL);
+	if (!hdpu_chassis_id)
 		printk(KERN_WARNING "sky_nexus: "
 		       "Unable to create proc dir entry: sky_chassis_id\n");
 	} else {
@@ -128,8 +128,8 @@ static int hdpu_nexus_remove(struct platform_device *pdev)
 	slot_id = -1;
 	chassis_id = -1;
 
-	remove_proc_entry("sky_slot_id", &proc_root);
-	remove_proc_entry("sky_chassis_id", &proc_root);
+	remove_proc_entry("sky_slot_id", NULL);
+	remove_proc_entry("sky_chassis_id", NULL);
 
 	hdpu_slot_id = 0;
 	hdpu_chassis_id = 0;
diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c
index 556063e8f7a9..8ae9406b10ad 100644
--- a/drivers/s390/block/dasd_proc.c
+++ b/drivers/s390/block/dasd_proc.c
@@ -311,7 +311,7 @@ out_error:
 int
 dasd_proc_init(void)
 {
-	dasd_proc_root_entry = proc_mkdir("dasd", &proc_root);
+	dasd_proc_root_entry = proc_mkdir("dasd", NULL);
 	if (!dasd_proc_root_entry)
 		goto out_nodasd;
 	dasd_proc_root_entry->owner = THIS_MODULE;
@@ -335,7 +335,7 @@ dasd_proc_init(void)
  out_nostatistics:
 	remove_proc_entry("devices", dasd_proc_root_entry);
  out_nodevices:
-	remove_proc_entry("dasd", &proc_root);
+	remove_proc_entry("dasd", NULL);
  out_nodasd:
 	return -ENOENT;
 }
@@ -345,5 +345,5 @@ dasd_proc_exit(void)
 {
 	remove_proc_entry("devices", dasd_proc_root_entry);
 	remove_proc_entry("statistics", dasd_proc_root_entry);
-	remove_proc_entry("dasd", &proc_root);
+	remove_proc_entry("dasd", NULL);
 }
diff --git a/drivers/s390/char/tape_proc.c b/drivers/s390/char/tape_proc.c
index c9b96d51b28f..0c39636b2174 100644
--- a/drivers/s390/char/tape_proc.c
+++ b/drivers/s390/char/tape_proc.c
@@ -125,7 +125,7 @@ tape_proc_init(void)
 {
 	tape_proc_devices =
 		create_proc_entry ("tapedevices", S_IFREG | S_IRUGO | S_IWUSR,
-				   &proc_root);
+				   NULL);
 	if (tape_proc_devices == NULL) {
 		PRINT_WARN("tape: Cannot register procfs entry tapedevices\n");
 		return;
@@ -141,5 +141,5 @@ void
 tape_proc_cleanup(void)
 {
 	if (tape_proc_devices != NULL)
-		remove_proc_entry ("tapedevices", &proc_root);
+		remove_proc_entry ("tapedevices", NULL);
 }
diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index e8597ec92247..ef33d5df2229 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -375,7 +375,7 @@ cio_ignore_proc_init (void)
 	struct proc_dir_entry *entry;
 
 	entry = create_proc_entry ("cio_ignore", S_IFREG | S_IRUGO | S_IWUSR,
-				   &proc_root);
+				   NULL);
 	if (!entry)
 		return -ENOENT;
 
diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c
index 10aa1e780801..43876e287370 100644
--- a/drivers/s390/cio/qdio.c
+++ b/drivers/s390/cio/qdio.c
@@ -3632,7 +3632,7 @@ qdio_add_procfs_entry(void)
 {
         proc_perf_file_registration=0;
 	qdio_perf_proc_file=create_proc_entry(QDIO_PERF,
-					      S_IFREG|0444,&proc_root);
+					      S_IFREG|0444,NULL);
 	if (qdio_perf_proc_file) {
 		qdio_perf_proc_file->read_proc=&qdio_perf_procfile_read;
 	} else proc_perf_file_registration=-1;
@@ -3647,7 +3647,7 @@ static void
 qdio_remove_procfs_entry(void)
 {
         if (!proc_perf_file_registration) /* means if it went ok earlier */
-		remove_proc_entry(QDIO_PERF,&proc_root);
+		remove_proc_entry(QDIO_PERF,NULL);
 }
 
 /**
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index b135a1ed4b2c..18551aaf5e09 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -4996,7 +4996,7 @@ static int __init megaraid_init(void)
 		max_mbox_busy_wait = MBOX_BUSY_WAIT;
 
 #ifdef CONFIG_PROC_FS
-	mega_proc_dir_entry = proc_mkdir("megaraid", &proc_root);
+	mega_proc_dir_entry = proc_mkdir("megaraid", NULL);
 	if (!mega_proc_dir_entry) {
 		printk(KERN_WARNING
 				"megaraid: failed to create megaraid root\n");
@@ -5005,7 +5005,7 @@ static int __init megaraid_init(void)
 	error = pci_register_driver(&megaraid_pci_driver);
 	if (error) {
 #ifdef CONFIG_PROC_FS
-		remove_proc_entry("megaraid", &proc_root);
+		remove_proc_entry("megaraid", NULL);
 #endif
 		return error;
 	}
@@ -5035,7 +5035,7 @@ static void __exit megaraid_exit(void)
 	pci_unregister_driver(&megaraid_pci_driver);
 
 #ifdef CONFIG_PROC_FS
-	remove_proc_entry("megaraid", &proc_root);
+	remove_proc_entry("megaraid", NULL);
 #endif
 }
 
diff --git a/drivers/video/clps711xfb.c b/drivers/video/clps711xfb.c
index 17b5267f44d7..9f8a389dc7ae 100644
--- a/drivers/video/clps711xfb.c
+++ b/drivers/video/clps711xfb.c
@@ -381,7 +381,7 @@ int __init clps711xfb_init(void)
 
 	/* Register the /proc entries. */
 	clps7111fb_backlight_proc_entry = create_proc_entry("backlight", 0444,
-		&proc_root);
+		NULL);
 	if (clps7111fb_backlight_proc_entry == NULL) {
 		printk("Couldn't create the /proc entry for the backlight.\n");
 		return -EINVAL;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index b1d6df671edf..28cbca805905 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -11,6 +11,7 @@
 
 #include <linux/proc_fs.h>
 
+extern struct proc_dir_entry proc_root;
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
 #else
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index c4137bb94a9e..48bcf20cec2f 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -854,7 +854,7 @@ void __init proc_misc_init(void)
 
 	/* And now for trickier ones */
 #ifdef CONFIG_PRINTK
-	proc_create("kmsg", S_IRUSR, &proc_root, &proc_kmsg_operations);
+	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
 #endif
 	proc_create("locks", 0, NULL, &proc_locks_operations);
 	proc_create("devices", 0, NULL, &proc_devinfo_operations);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5e93e9b0124e..c741b45a5503 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -232,4 +232,3 @@ EXPORT_SYMBOL(proc_mkdir);
 EXPORT_SYMBOL(create_proc_entry);
 EXPORT_SYMBOL(proc_create);
 EXPORT_SYMBOL(remove_proc_entry);
-EXPORT_SYMBOL(proc_root);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index f56205cbebc0..2183ffdc5489 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -96,7 +96,6 @@ struct vmcore {
 
 #ifdef CONFIG_PROC_FS
 
-extern struct proc_dir_entry proc_root;
 extern struct proc_dir_entry *proc_root_kcore;
 
 extern spinlock_t proc_subdir_lock;
@@ -243,8 +242,6 @@ struct tty_driver;
 static inline void proc_tty_register_driver(struct tty_driver *driver) {};
 static inline void proc_tty_unregister_driver(struct tty_driver *driver) {};
 
-extern struct proc_dir_entry proc_root;
-
 static inline int pid_ns_prepare_proc(struct pid_namespace *ns)
 {
 	return 0;
diff --git a/kernel/configs.c b/kernel/configs.c
index e84d3f9c6c7b..d3a4b82a8a96 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,8 +79,7 @@ static int __init ikconfig_init(void)
 	struct proc_dir_entry *entry;
 
 	/* create the current config file */
-	entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
-				  &proc_root);
+	entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL);
 	if (!entry)
 		return -ENOMEM;
 
@@ -95,7 +94,7 @@ static int __init ikconfig_init(void)
 
 static void __exit ikconfig_cleanup(void)
 {
-	remove_proc_entry("config.gz", &proc_root);
+	remove_proc_entry("config.gz", NULL);
 }
 
 module_init(ikconfig_init);
diff --git a/sound/core/info.c b/sound/core/info.c
index 9977ec2eace3..cb5ead3e202d 100644
--- a/sound/core/info.c
+++ b/sound/core/info.c
@@ -544,7 +544,7 @@ int __init snd_info_init(void)
 {
 	struct proc_dir_entry *p;
 
-	p = snd_create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, &proc_root);
+	p = snd_create_proc_entry("asound", S_IFDIR | S_IRUGO | S_IXUGO, NULL);
 	if (p == NULL)
 		return -ENOMEM;
 	snd_proc_root = p;
@@ -594,7 +594,7 @@ int __exit snd_info_done(void)
 #ifdef CONFIG_SND_OSSEMUL
 		snd_info_free_entry(snd_oss_root);
 #endif
-		snd_remove_proc_entry(&proc_root, snd_proc_root);
+		snd_remove_proc_entry(NULL, snd_proc_root);
 	}
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From c33fff0afbef4f0467c99e3f47ee7e98ae78c77e Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 29 Apr 2008 01:02:31 -0700
Subject: kernel: use non-racy method for proc entries creation

Use proc_create()/proc_create_data() to make sure that ->proc_fops and ->data
be setup before gluing PDE to main tree.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/configs.c          |  4 ++--
 kernel/dma.c              |  7 +------
 kernel/kallsyms.c         |  6 +-----
 kernel/latencytop.c       |  9 +--------
 kernel/lockdep_proc.c     | 16 ++++------------
 kernel/profile.c          |  4 ++--
 kernel/resource.c         | 10 ++--------
 kernel/sched_debug.c      |  5 +----
 kernel/time/timer_list.c  |  5 +----
 kernel/time/timer_stats.c |  5 +----
 10 files changed, 16 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/kernel/configs.c b/kernel/configs.c
index d3a4b82a8a96..4c345210ed8c 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,11 +79,11 @@ static int __init ikconfig_init(void)
 	struct proc_dir_entry *entry;
 
 	/* create the current config file */
-	entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL);
+	entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
+			    &ikconfig_file_ops);
 	if (!entry)
 		return -ENOMEM;
 
-	entry->proc_fops = &ikconfig_file_ops;
 	entry->size = kernel_config_data_size;
 
 	return 0;
diff --git a/kernel/dma.c b/kernel/dma.c
index 6a82bb716dac..d2c60a822790 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = {
 
 static int __init proc_dma_init(void)
 {
-	struct proc_dir_entry *e;
-
-	e = create_proc_entry("dma", 0, NULL);
-	if (e)
-		e->proc_fops = &proc_dma_operations;
-
+	proc_create("dma", 0, NULL, &proc_dma_operations);
 	return 0;
 }
 
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f091d13def00..6fc0040f3e3a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = {
 
 static int __init kallsyms_init(void)
 {
-	struct proc_dir_entry *entry;
-
-	entry = create_proc_entry("kallsyms", 0444, NULL);
-	if (entry)
-		entry->proc_fops = &kallsyms_operations;
+	proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
 	return 0;
 }
 __initcall(kallsyms_init);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 7c74dab0d21b..5e7b45c56923 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -233,14 +233,7 @@ static struct file_operations lstats_fops = {
 
 static int __init init_lstats_procfs(void)
 {
-	struct proc_dir_entry *pe;
-
-	pe = create_proc_entry("latency_stats", 0644, NULL);
-	if (!pe)
-		return -ENOMEM;
-
-	pe->proc_fops = &lstats_fops;
-
+	proc_create("latency_stats", 0644, NULL, &lstats_fops);
 	return 0;
 }
 __initcall(init_lstats_procfs);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8a135bd163c2..dc5d29648d85 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = {
 
 static int __init lockdep_proc_init(void)
 {
-	struct proc_dir_entry *entry;
-
-	entry = create_proc_entry("lockdep", S_IRUSR, NULL);
-	if (entry)
-		entry->proc_fops = &proc_lockdep_operations;
-
-	entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
-	if (entry)
-		entry->proc_fops = &proc_lockdep_stats_operations;
+	proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+	proc_create("lockdep_stats", S_IRUSR, NULL,
+		    &proc_lockdep_stats_operations);
 
 #ifdef CONFIG_LOCK_STAT
-	entry = create_proc_entry("lock_stat", S_IRUSR, NULL);
-	if (entry)
-		entry->proc_fops = &proc_lock_stat_operations;
+	proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
 #endif
 
 	return 0;
diff --git a/kernel/profile.c b/kernel/profile.c
index 606d7387265c..ae7ead82cbc9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -587,10 +587,10 @@ static int __init create_proc_profile(void)
 		return 0;
 	if (create_hash_tables())
 		return -1;
-	entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
+	entry = proc_create("profile", S_IWUSR | S_IRUGO,
+			    NULL, &proc_profile_operations);
 	if (!entry)
 		return 0;
-	entry->proc_fops = &proc_profile_operations;
 	entry->size = (1+prof_len) * sizeof(atomic_t);
 	hotcpu_notifier(profile_cpu_callback, 0);
 	return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index cee12cc47cab..74af2d7cb5a1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = {
 
 static int __init ioresources_init(void)
 {
-	struct proc_dir_entry *entry;
-
-	entry = create_proc_entry("ioports", 0, NULL);
-	if (entry)
-		entry->proc_fops = &proc_ioports_operations;
-	entry = create_proc_entry("iomem", 0, NULL);
-	if (entry)
-		entry->proc_fops = &proc_iomem_operations;
+	proc_create("ioports", 0, NULL, &proc_ioports_operations);
+	proc_create("iomem", 0, NULL, &proc_iomem_operations);
 	return 0;
 }
 __initcall(ioresources_init);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index f3f4af4b8b0f..8a9498e7c831 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -277,12 +277,9 @@ static int __init init_sched_debug_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = create_proc_entry("sched_debug", 0644, NULL);
+	pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
 	if (!pe)
 		return -ENOMEM;
-
-	pe->proc_fops = &sched_debug_fops;
-
 	return 0;
 }
 
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 67fe8fc21fb1..a40e20fd0001 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = create_proc_entry("timer_list", 0644, NULL);
+	pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
 	if (!pe)
 		return -ENOMEM;
-
-	pe->proc_fops = &timer_list_fops;
-
 	return 0;
 }
 __initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 417da8c5bc72..c994530d166d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = create_proc_entry("timer_stats", 0644, NULL);
+	pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
 	if (!pe)
 		return -ENOMEM;
-
-	pe->proc_fops = &tstats_fops;
-
 	return 0;
 }
 __initcall(init_tstats_procfs);
-- 
cgit v1.3-14-g43fede


From 88f458e4b91348b2e892c72977b5f665d7f374da Mon Sep 17 00:00:00 2001
From: Holger Schurig <hs4233@mail.mn-solutions.de>
Date: Tue, 29 Apr 2008 01:02:36 -0700
Subject: sysctl: allow embedded targets to disable sysctl_check.c

Disable sysctl_check.c for embedded targets. This saves about about 11 kB
in .text and another 11 kB in .data on a PXA255 embedded platform.

Signed-off-by: Holger Schurig <hs4233@mail.mn-solutions.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig    | 11 +++++++++++
 kernel/Makefile |  2 +-
 kernel/sysctl.c | 10 ++++++++--
 3 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index 98fa96eac415..3e7b257fc05f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -550,6 +550,17 @@ config SYSCTL_SYSCALL
 
 	  If unsure say Y here.
 
+config SYSCTL_SYSCALL_CHECK
+	bool "Sysctl checks" if EMBEDDED
+	depends on SYSCTL_SYSCALL
+	default y
+	---help---
+	  sys_sysctl uses binary paths that have been found challenging
+	  to properly maintain and use. This enables checks that help
+	  you to keep things correct.
+
+	  If unsure say Y here.
+
 config KALLSYMS
 	 bool "Load all symbols for debugging/ksymoops" if EMBEDDED
 	 default y
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c5f081132a4..188c43223f52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o
 
-obj-$(CONFIG_SYSCTL) += sysctl_check.o
+obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0a1d2733cf41..1cdfe942d160 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1592,9 +1592,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 
 static __init int sysctl_init(void)
 {
-	int err;
 	sysctl_set_parent(NULL, root_table);
-	err = sysctl_check_table(current->nsproxy, root_table);
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+	{
+		int err;
+		err = sysctl_check_table(current->nsproxy, root_table);
+	}
+#endif
 	return 0;
 }
 
@@ -1721,10 +1725,12 @@ struct ctl_table_header *__register_sysctl_paths(
 	header->unregistering = NULL;
 	header->root = root;
 	sysctl_set_parent(NULL, header->ctl_table);
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
 	if (sysctl_check_table(namespaces, header->ctl_table)) {
 		kfree(header);
 		return NULL;
 	}
+#endif
 	spin_lock(&sysctl_lock);
 	header_list = lookup_header_list(root, namespaces);
 	list_add_tail(&header->ctl_entry, header_list);
-- 
cgit v1.3-14-g43fede


From 2c4c7155f25192da3511a6c911db4d08102d36c4 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 29 Apr 2008 01:02:41 -0700
Subject: sysctl: clean from unneeded extern and forward declarations

The do_sysctl_strategy isn't used outside kernel/sysctl.c, so this can be
static and without a prototype in header.

Besides, move this one and parse_table() above their callers and drop the
forward declarations of the latter call.

One more "besides" - fix two checkpatch warnings: space before a ( and an
extra space at the end of a line.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@sw.ru>
Cc: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h |   5 --
 kernel/sysctl.c        | 144 +++++++++++++++++++++++--------------------------
 2 files changed, 68 insertions(+), 81 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 5432b34a1e51..39eafd8f97a3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -981,11 +981,6 @@ extern int do_sysctl (int __user *name, int nlen,
 		      void __user *oldval, size_t __user *oldlenp,
 		      void __user *newval, size_t newlen);
 
-extern int do_sysctl_strategy (struct ctl_table *table,
-			       int __user *name, int nlen,
-			       void __user *oldval, size_t __user *oldlenp,
-			       void __user *newval, size_t newlen);
-
 extern ctl_handler sysctl_data;
 extern ctl_handler sysctl_string;
 extern ctl_handler sysctl_intvec;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1cdfe942d160..874e813e40c8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -145,12 +145,6 @@ extern int no_unaligned_warning;
 extern int max_lock_depth;
 #endif
 
-#ifdef CONFIG_SYSCTL_SYSCALL
-static int parse_table(int __user *, int, void __user *, size_t __user *,
-		void __user *, size_t, struct ctl_table *);
-#endif
-
-
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -1439,6 +1433,74 @@ void register_sysctl_root(struct ctl_table_root *root)
 }
 
 #ifdef CONFIG_SYSCTL_SYSCALL
+/* Perform the actual read/write of a sysctl table entry. */
+static int do_sysctl_strategy(struct ctl_table *table,
+			int __user *name, int nlen,
+			void __user *oldval, size_t __user *oldlenp,
+			void __user *newval, size_t newlen)
+{
+	int op = 0, rc;
+
+	if (oldval)
+		op |= 004;
+	if (newval)
+		op |= 002;
+	if (sysctl_perm(table, op))
+		return -EPERM;
+
+	if (table->strategy) {
+		rc = table->strategy(table, name, nlen, oldval, oldlenp,
+				     newval, newlen);
+		if (rc < 0)
+			return rc;
+		if (rc > 0)
+			return 0;
+	}
+
+	/* If there is no strategy routine, or if the strategy returns
+	 * zero, proceed with automatic r/w */
+	if (table->data && table->maxlen) {
+		rc = sysctl_data(table, name, nlen, oldval, oldlenp,
+				 newval, newlen);
+		if (rc < 0)
+			return rc;
+	}
+	return 0;
+}
+
+static int parse_table(int __user *name, int nlen,
+		       void __user *oldval, size_t __user *oldlenp,
+		       void __user *newval, size_t newlen,
+		       struct ctl_table *table)
+{
+	int n;
+repeat:
+	if (!nlen)
+		return -ENOTDIR;
+	if (get_user(n, name))
+		return -EFAULT;
+	for ( ; table->ctl_name || table->procname; table++) {
+		if (!table->ctl_name)
+			continue;
+		if (n == table->ctl_name) {
+			int error;
+			if (table->child) {
+				if (sysctl_perm(table, 001))
+					return -EPERM;
+				name++;
+				nlen--;
+				table = table->child;
+				goto repeat;
+			}
+			error = do_sysctl_strategy(table, name, nlen,
+						   oldval, oldlenp,
+						   newval, newlen);
+			return error;
+		}
+	}
+	return -ENOTDIR;
+}
+
 int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
 	       void __user *newval, size_t newlen)
 {
@@ -1511,76 +1573,6 @@ int sysctl_perm(struct ctl_table *table, int op)
 	return test_perm(table->mode, op);
 }
 
-#ifdef CONFIG_SYSCTL_SYSCALL
-static int parse_table(int __user *name, int nlen,
-		       void __user *oldval, size_t __user *oldlenp,
-		       void __user *newval, size_t newlen,
-		       struct ctl_table *table)
-{
-	int n;
-repeat:
-	if (!nlen)
-		return -ENOTDIR;
-	if (get_user(n, name))
-		return -EFAULT;
-	for ( ; table->ctl_name || table->procname; table++) {
-		if (!table->ctl_name)
-			continue;
-		if (n == table->ctl_name) {
-			int error;
-			if (table->child) {
-				if (sysctl_perm(table, 001))
-					return -EPERM;
-				name++;
-				nlen--;
-				table = table->child;
-				goto repeat;
-			}
-			error = do_sysctl_strategy(table, name, nlen,
-						   oldval, oldlenp,
-						   newval, newlen);
-			return error;
-		}
-	}
-	return -ENOTDIR;
-}
-
-/* Perform the actual read/write of a sysctl table entry. */
-int do_sysctl_strategy (struct ctl_table *table,
-			int __user *name, int nlen,
-			void __user *oldval, size_t __user *oldlenp,
-			void __user *newval, size_t newlen)
-{
-	int op = 0, rc;
-
-	if (oldval)
-		op |= 004;
-	if (newval) 
-		op |= 002;
-	if (sysctl_perm(table, op))
-		return -EPERM;
-
-	if (table->strategy) {
-		rc = table->strategy(table, name, nlen, oldval, oldlenp,
-				     newval, newlen);
-		if (rc < 0)
-			return rc;
-		if (rc > 0)
-			return 0;
-	}
-
-	/* If there is no strategy routine, or if the strategy returns
-	 * zero, proceed with automatic r/w */
-	if (table->data && table->maxlen) {
-		rc = sysctl_data(table, name, nlen, oldval, oldlenp,
-				 newval, newlen);
-		if (rc < 0)
-			return rc;
-	}
-	return 0;
-}
-#endif /* CONFIG_SYSCTL_SYSCALL */
-
 static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 {
 	for (; table->ctl_name || table->procname; table++) {
-- 
cgit v1.3-14-g43fede


From d7321cd62470b70d2717dae5a963e7a8fabff4d5 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 29 Apr 2008 01:02:44 -0700
Subject: sysctl: add the ->permissions callback on the ctl_table_root

When reading from/writing to some table, a root, which this table came from,
may affect this table's permissions, depending on who is working with the
table.

The core hunk is at the bottom of this patch.  All the rest is just pushing
the ctl_table_root argument up to the sysctl_perm() function.

This will be mostly (only?) used in the net sysctls.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@sw.ru>
Cc: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c  |  4 ++--
 include/linux/sysctl.h |  7 ++++++-
 kernel/sysctl.c        | 25 ++++++++++++++++++-------
 3 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5e31585292a0..5acc001d49f6 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,7 +190,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	 * and won't be until we finish.
 	 */
 	error = -EPERM;
-	if (sysctl_perm(table, write ? MAY_WRITE : MAY_READ))
+	if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
 		goto out;
 
 	/* careful: calling conventions are nasty here */
@@ -388,7 +388,7 @@ static int proc_sys_permission(struct inode *inode, int mask, struct nameidata *
 		goto out;
 
 	/* Use the permissions on the sysctl table entry */
-	error = sysctl_perm(table, mask);
+	error = sysctl_perm(head->root, table, mask);
 out:
 	sysctl_head_finish(head);
 	return error;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 39eafd8f97a3..24141b4d1a11 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -945,11 +945,14 @@ enum
 /* For the /proc/sys support */
 struct ctl_table;
 struct nsproxy;
+struct ctl_table_root;
+
 extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev);
 extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
 						struct ctl_table_header *prev);
 extern void sysctl_head_finish(struct ctl_table_header *prev);
-extern int sysctl_perm(struct ctl_table *table, int op);
+extern int sysctl_perm(struct ctl_table_root *root,
+		struct ctl_table *table, int op);
 
 typedef struct ctl_table ctl_table;
 
@@ -1049,6 +1052,8 @@ struct ctl_table_root {
 	struct list_head header_list;
 	struct list_head *(*lookup)(struct ctl_table_root *root,
 					   struct nsproxy *namespaces);
+	int (*permissions)(struct ctl_table_root *root,
+			struct nsproxy *namespaces, struct ctl_table *table);
 };
 
 /* struct ctl_table_header is used to maintain dynamic lists of
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 874e813e40c8..d7ffdc59816a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1434,7 +1434,8 @@ void register_sysctl_root(struct ctl_table_root *root)
 
 #ifdef CONFIG_SYSCTL_SYSCALL
 /* Perform the actual read/write of a sysctl table entry. */
-static int do_sysctl_strategy(struct ctl_table *table,
+static int do_sysctl_strategy(struct ctl_table_root *root,
+			struct ctl_table *table,
 			int __user *name, int nlen,
 			void __user *oldval, size_t __user *oldlenp,
 			void __user *newval, size_t newlen)
@@ -1445,7 +1446,7 @@ static int do_sysctl_strategy(struct ctl_table *table,
 		op |= 004;
 	if (newval)
 		op |= 002;
-	if (sysctl_perm(table, op))
+	if (sysctl_perm(root, table, op))
 		return -EPERM;
 
 	if (table->strategy) {
@@ -1471,6 +1472,7 @@ static int do_sysctl_strategy(struct ctl_table *table,
 static int parse_table(int __user *name, int nlen,
 		       void __user *oldval, size_t __user *oldlenp,
 		       void __user *newval, size_t newlen,
+		       struct ctl_table_root *root,
 		       struct ctl_table *table)
 {
 	int n;
@@ -1485,14 +1487,14 @@ repeat:
 		if (n == table->ctl_name) {
 			int error;
 			if (table->child) {
-				if (sysctl_perm(table, 001))
+				if (sysctl_perm(root, table, 001))
 					return -EPERM;
 				name++;
 				nlen--;
 				table = table->child;
 				goto repeat;
 			}
-			error = do_sysctl_strategy(table, name, nlen,
+			error = do_sysctl_strategy(root, table, name, nlen,
 						   oldval, oldlenp,
 						   newval, newlen);
 			return error;
@@ -1518,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 	for (head = sysctl_head_next(NULL); head;
 			head = sysctl_head_next(head)) {
 		error = parse_table(name, nlen, oldval, oldlenp, 
-					newval, newlen, head->ctl_table);
+					newval, newlen,
+					head->root, head->ctl_table);
 		if (error != -ENOTDIR) {
 			sysctl_head_finish(head);
 			break;
@@ -1564,13 +1567,21 @@ static int test_perm(int mode, int op)
 	return -EACCES;
 }
 
-int sysctl_perm(struct ctl_table *table, int op)
+int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
 {
 	int error;
+	int mode;
+
 	error = security_sysctl(table, op);
 	if (error)
 		return error;
-	return test_perm(table->mode, op);
+
+	if (root->permissions)
+		mode = root->permissions(root, current->nsproxy, table);
+	else
+		mode = table->mode;
+
+	return test_perm(mode, op);
 }
 
 static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
-- 
cgit v1.3-14-g43fede


From 801678c5a3b4c79236970bcca27c733f5559e0d1 Mon Sep 17 00:00:00 2001
From: Hirofumi Nakagawa <hnakagawa@miraclelinux.com>
Date: Tue, 29 Apr 2008 01:03:09 -0700
Subject: Remove duplicated unlikely() in IS_ERR()

Some drivers have duplicated unlikely() macros.  IS_ERR() already has
unlikely() in itself.

This patch cleans up such pointless code.

Signed-off-by: Hirofumi Nakagawa <hnakagawa@miraclelinux.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Jeff Garzik <jeff@garzik.org>
Cc: Paul Clements <paul.clements@steeleye.com>
Cc: Richard Purdie <rpurdie@rpsys.net>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: David Brownell <david-b@pacbell.net>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Takashi Iwai <tiwai@suse.de>
Acked-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/nbd.c             |  2 +-
 drivers/leds/led-class.c        |  2 +-
 drivers/message/i2o/i2o_block.c |  2 +-
 drivers/net/myri10ge/myri10ge.c |  2 +-
 drivers/net/tg3.c               |  2 +-
 drivers/rtc/rtc-bfin.c          |  2 +-
 drivers/scsi/scsi_scan.c        |  2 +-
 fs/aio.c                        |  2 +-
 fs/ecryptfs/inode.c             |  2 +-
 fs/inotify_user.c               |  2 +-
 fs/ntfs/mft.c                   |  6 +++---
 kernel/auditfilter.c            | 10 +++++-----
 net/core/dev.c                  |  2 +-
 net/ipv4/af_inet.c              |  2 +-
 net/netfilter/nf_queue.c        |  2 +-
 net/xfrm/xfrm_output.c          |  2 +-
 sound/sh/aica.c                 |  2 +-
 17 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a3da198ca429..bdba282f15e4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -339,7 +339,7 @@ static struct request *nbd_read_stat(struct nbd_device *lo)
 	}
 
 	req = nbd_find_request(lo, *(struct request **)reply.handle);
-	if (unlikely(IS_ERR(req))) {
+	if (IS_ERR(req)) {
 		result = PTR_ERR(req);
 		if (result != -ENOENT)
 			goto harderror;
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index ac05a928f764..b3c54be74556 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -105,7 +105,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
 
 	led_cdev->dev = device_create(leds_class, parent, 0, "%s",
 					    led_cdev->name);
-	if (unlikely(IS_ERR(led_cdev->dev)))
+	if (IS_ERR(led_cdev->dev))
 		return PTR_ERR(led_cdev->dev);
 
 	dev_set_drvdata(led_cdev->dev, led_cdev);
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index a95314897402..81483de8c0fd 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -371,7 +371,7 @@ static int i2o_block_prep_req_fn(struct request_queue *q, struct request *req)
 	/* connect the i2o_block_request to the request */
 	if (!req->special) {
 		ireq = i2o_block_request_alloc();
-		if (unlikely(IS_ERR(ireq))) {
+		if (IS_ERR(ireq)) {
 			osm_debug("unable to allocate i2o_block_request!\n");
 			return BLKPREP_DEFER;
 		}
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index cead81e80f0c..ef63c8d2bd7e 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -2437,7 +2437,7 @@ static int myri10ge_sw_tso(struct sk_buff *skb, struct net_device *dev)
 	int status;
 
 	segs = skb_gso_segment(skb, dev->features & ~NETIF_F_TSO6);
-	if (unlikely(IS_ERR(segs)))
+	if (IS_ERR(segs))
 		goto drop;
 
 	while (segs) {
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index e3f74c9f78bd..b66c75e3b8a1 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -4361,7 +4361,7 @@ static int tg3_tso_bug(struct tg3 *tp, struct sk_buff *skb)
 	}
 
 	segs = skb_gso_segment(skb, tp->dev->features & ~NETIF_F_TSO);
-	if (unlikely(IS_ERR(segs)))
+	if (IS_ERR(segs))
 		goto tg3_tso_bug_end;
 
 	do {
diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c
index 4f28045d9ef2..8624f55d0560 100644
--- a/drivers/rtc/rtc-bfin.c
+++ b/drivers/rtc/rtc-bfin.c
@@ -419,7 +419,7 @@ static int __devinit bfin_rtc_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	rtc->rtc_dev = rtc_device_register(pdev->name, &pdev->dev, &bfin_rtc_ops, THIS_MODULE);
-	if (unlikely(IS_ERR(rtc))) {
+	if (IS_ERR(rtc)) {
 		ret = PTR_ERR(rtc->rtc_dev);
 		goto err;
 	}
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index fcd7455ffc39..a00eee6f7be9 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -1828,7 +1828,7 @@ void scsi_scan_host(struct Scsi_Host *shost)
 	}
 
 	p = kthread_run(do_scan_async, data, "scsi_scan_%d", shost->host_no);
-	if (unlikely(IS_ERR(p)))
+	if (IS_ERR(p))
 		do_scan_async(data);
 }
 EXPORT_SYMBOL(scsi_scan_host);
diff --git a/fs/aio.c b/fs/aio.c
index 81c01290939b..a175ad650b66 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1604,7 +1604,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		 * event using the eventfd_signal() function.
 		 */
 		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
-		if (unlikely(IS_ERR(req->ki_eventfd))) {
+		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			goto out_put_req;
 		}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1623ebf25e51..0a1397335a8e 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -111,7 +111,7 @@ ecryptfs_do_create(struct inode *directory_inode,
 
 	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	if (unlikely(IS_ERR(lower_dir_dentry))) {
+	if (IS_ERR(lower_dir_dentry)) {
 		ecryptfs_printk(KERN_ERR, "Error locking directory of "
 				"dentry\n");
 		rc = PTR_ERR(lower_dir_dentry);
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 7b94a1e3c015..6676c06bb7c1 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -598,7 +598,7 @@ asmlinkage long sys_inotify_init(void)
 	}
 
 	ih = inotify_init(&inotify_user_ops);
-	if (unlikely(IS_ERR(ih))) {
+	if (IS_ERR(ih)) {
 		ret = PTR_ERR(ih);
 		goto out_free_dev;
 	}
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 2ad5c8b104b9..790defb847e7 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1191,7 +1191,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
 		if (size) {
 			page = ntfs_map_page(mftbmp_mapping,
 					ofs >> PAGE_CACHE_SHIFT);
-			if (unlikely(IS_ERR(page))) {
+			if (IS_ERR(page)) {
 				ntfs_error(vol->sb, "Failed to read mft "
 						"bitmap, aborting.");
 				return PTR_ERR(page);
@@ -2118,7 +2118,7 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
 	}
 	/* Read, map, and pin the page containing the mft record. */
 	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (unlikely(IS_ERR(page))) {
+	if (IS_ERR(page)) {
 		ntfs_error(vol->sb, "Failed to map page containing mft record "
 				"to format 0x%llx.", (long long)mft_no);
 		return PTR_ERR(page);
@@ -2519,7 +2519,7 @@ mft_rec_already_initialized:
 	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 	/* Read, map, and pin the page containing the mft record. */
 	page = ntfs_map_page(vol->mft_ino->i_mapping, index);
-	if (unlikely(IS_ERR(page))) {
+	if (IS_ERR(page)) {
 		ntfs_error(vol->sb, "Failed to map page containing allocated "
 				"mft record 0x%llx.", (long long)bit);
 		err = PTR_ERR(page);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 28fef6bf8534..13430176b3c9 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -272,7 +272,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
 		return -EINVAL;
 
 	watch = audit_init_watch(path);
-	if (unlikely(IS_ERR(watch)))
+	if (IS_ERR(watch))
 		return PTR_ERR(watch);
 
 	audit_get_watch(watch);
@@ -848,7 +848,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
 		return ERR_PTR(-ENOMEM);
 
 	new = audit_init_watch(path);
-	if (unlikely(IS_ERR(new))) {
+	if (IS_ERR(new)) {
 		kfree(path);
 		goto out;
 	}
@@ -989,7 +989,7 @@ static void audit_update_watch(struct audit_parent *parent,
 			audit_set_auditable(current->audit_context);
 
 		nwatch = audit_dupe_watch(owatch);
-		if (unlikely(IS_ERR(nwatch))) {
+		if (IS_ERR(nwatch)) {
 			mutex_unlock(&audit_filter_mutex);
 			audit_panic("error updating watch, skipping");
 			return;
@@ -1004,7 +1004,7 @@ static void audit_update_watch(struct audit_parent *parent,
 			list_del_rcu(&oentry->list);
 
 			nentry = audit_dupe_rule(&oentry->rule, nwatch);
-			if (unlikely(IS_ERR(nentry)))
+			if (IS_ERR(nentry))
 				audit_panic("error updating watch, removing");
 			else {
 				int h = audit_hash_ino((u32)ino);
@@ -1785,7 +1785,7 @@ int audit_update_lsm_rules(void)
 			watch = entry->rule.watch;
 			tree = entry->rule.tree;
 			nentry = audit_dupe_rule(&entry->rule, watch);
-			if (unlikely(IS_ERR(nentry))) {
+			if (IS_ERR(nentry)) {
 				/* save the first error encountered for the
 				 * return value */
 				if (!err)
diff --git a/net/core/dev.c b/net/core/dev.c
index e1df1ab3e04a..ed49da592051 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1524,7 +1524,7 @@ static int dev_gso_segment(struct sk_buff *skb)
 	if (!segs)
 		return 0;
 
-	if (unlikely(IS_ERR(segs)))
+	if (IS_ERR(segs))
 		return PTR_ERR(segs);
 
 	skb->next = segs;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f2b5270efdaa..24eca23c2db3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1234,7 +1234,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
 		segs = ops->gso_segment(skb, features);
 	rcu_read_unlock();
 
-	if (!segs || unlikely(IS_ERR(segs)))
+	if (!segs || IS_ERR(segs))
 		goto out;
 
 	skb = segs;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index bbd26893c0c4..582ec3efc8a5 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -214,7 +214,7 @@ int nf_queue(struct sk_buff *skb,
 
 	segs = skb_gso_segment(skb, 0);
 	kfree_skb(skb);
-	if (unlikely(IS_ERR(segs)))
+	if (IS_ERR(segs))
 		return 1;
 
 	do {
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 2519129c6d21..09cd9c0c2d80 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -150,7 +150,7 @@ static int xfrm_output_gso(struct sk_buff *skb)
 
 	segs = skb_gso_segment(skb, 0);
 	kfree_skb(skb);
-	if (unlikely(IS_ERR(segs)))
+	if (IS_ERR(segs))
 		return PTR_ERR(segs);
 
 	do {
diff --git a/sound/sh/aica.c b/sound/sh/aica.c
index d49417bf78c6..9ca113326143 100644
--- a/sound/sh/aica.c
+++ b/sound/sh/aica.c
@@ -663,7 +663,7 @@ static int __init aica_init(void)
 		return err;
 	pd = platform_device_register_simple(SND_AICA_DRIVER, -1,
 					     aica_memory_space, 2);
-	if (unlikely(IS_ERR(pd))) {
+	if (IS_ERR(pd)) {
 		platform_driver_unregister(&snd_aica_driver);
 		return PTR_ERR(pd);
 	}
-- 
cgit v1.3-14-g43fede


From 68ab3d883a2df13f4b93a923bae3a287cbee29d3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 29 Apr 2008 01:03:46 -0700
Subject: relayfs: support larger relay buffer

Use vmalloc() and memset() instead of kcalloc() to allocate a page* array when
the array size is bigger than one page.  This enables relayfs to support
bigger relay buffers than 64MB on 4k-page system, 512MB on 16k-page system.

[akpm@linux-foundation.org: cleanup]
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: David Wilder <dwilder@us.ibm.com>
Reviewed-by: Tom Zanussi <zanussi@comcast.net>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index d6204a485818..bc24dcdc570f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = {
 	.close = relay_file_mmap_close,
 };
 
+/*
+ * allocate an array of pointers of struct page
+ */
+static struct page **relay_alloc_page_array(unsigned int n_pages)
+{
+	struct page **array;
+	size_t pa_size = n_pages * sizeof(struct page *);
+
+	if (pa_size > PAGE_SIZE) {
+		array = vmalloc(pa_size);
+		if (array)
+			memset(array, 0, pa_size);
+	} else {
+		array = kzalloc(pa_size, GFP_KERNEL);
+	}
+	return array;
+}
+
+/*
+ * free an array of pointers of struct page
+ */
+static void relay_free_page_array(struct page **array)
+{
+	if (is_vmalloc_addr(array))
+		vfree(array);
+	else
+		kfree(array);
+}
+
 /**
  *	relay_mmap_buf: - mmap channel buffer to process address space
  *	@buf: relay channel buffer
@@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 	*size = PAGE_ALIGN(*size);
 	n_pages = *size >> PAGE_SHIFT;
 
-	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+	buf->page_array = relay_alloc_page_array(n_pages);
 	if (!buf->page_array)
 		return NULL;
 
@@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 depopulate:
 	for (j = 0; j < i; j++)
 		__free_page(buf->page_array[j]);
-	kfree(buf->page_array);
+	relay_free_page_array(buf->page_array);
 	return NULL;
 }
 
@@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
 		vunmap(buf->start);
 		for (i = 0; i < buf->page_count; i++)
 			__free_page(buf->page_array[i]);
-		kfree(buf->page_array);
+		relay_free_page_array(buf->page_array);
 	}
 	chan->buf[buf->cpu] = NULL;
 	kfree(buf->padding);
-- 
cgit v1.3-14-g43fede


From 37487a56523d402e25650da16c337acf4cecd13d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 29 Apr 2008 01:03:49 -0700
Subject: Add kbuild.h that contains common definitions for kbuild users

The same definitions are used for the bounds logic and the asm-offsets.h
generation by kbuild.  Put them into include/linux/kbuild.h file.

Also add a new feature

	COMMENT("text")

which can be used to insert lines of ocmments into asm-offsets.h and
bounds.h.

Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Jay Estabrook <jay.estabrook@hp.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Chris Zankel <chris@zankel.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Bryan Wu <bryan.wu@analog.com>
Cc: Mike Frysinger <vapier.adi@gmail.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Greg Ungerer <gerg@uclinux.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Miles Bader <miles@gnu.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kbuild.h | 15 +++++++++++++++
 kernel/bounds.c        |  6 +-----
 2 files changed, 16 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/kbuild.h

(limited to 'kernel')

diff --git a/include/linux/kbuild.h b/include/linux/kbuild.h
new file mode 100644
index 000000000000..22a72198c14b
--- /dev/null
+++ b/include/linux/kbuild.h
@@ -0,0 +1,15 @@
+#ifndef __LINUX_KBUILD_H
+#define __LINUX_KBUILD_H
+
+#define DEFINE(sym, val) \
+        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define BLANK() asm volatile("\n->" : : )
+
+#define OFFSET(sym, str, mem) \
+	DEFINE(sym, offsetof(struct str, mem))
+
+#define COMMENT(x) \
+	asm volatile("\n->#" x)
+
+#endif
diff --git a/kernel/bounds.c b/kernel/bounds.c
index c3c55544db2f..3c5301381837 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -8,11 +8,7 @@
 /* Include headers that define the enum constants of interest */
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
-
-#define DEFINE(sym, val) \
-        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : : )
+#include <linux/kbuild.h>
 
 void foo(void)
 {
-- 
cgit v1.3-14-g43fede


From e1401c6bbb289d154eb0d0c292cc9f8259e4af73 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:52:34 -0700
Subject: signals: remove unused variable from send_signal()

This function doesn't change the ret's value and thus always returns 0, with a
single exception of returning -EAGAIN explicitly.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 64ad0ed15992..f5f3b8a61bee 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -661,7 +661,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			struct sigpending *signals)
 {
 	struct sigqueue * q = NULL;
-	int ret = 0;
 
 	/*
 	 * Deliver the signal to listening signalfds. This must be called
@@ -719,7 +718,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 
 out_set:
 	sigaddset(&signals->signal, sig);
-	return ret;
+	return 0;
 }
 
 #define LEGACY_QUEUE(sigptr, sig) \
-- 
cgit v1.3-14-g43fede


From af7fff9c13d56657dc328c75590f401c99bcecd9 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:52:34 -0700
Subject: signals: turn LEGACY_QUEUE macro into static inline function

This makes the code more readable, due to less brackets and small letters in
name.

I also move it above the send_signal() as a preparation for the 3rd patch.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f5f3b8a61bee..772aa011dad8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -657,6 +657,11 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 	}
 }
 
+static inline int legacy_queue(struct sigpending *signals, int sig)
+{
+	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
+}
+
 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			struct sigpending *signals)
 {
@@ -721,9 +726,6 @@ out_set:
 	return 0;
 }
 
-#define LEGACY_QUEUE(sigptr, sig) \
-	(((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
-
 int print_fatal_signals;
 
 static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -771,7 +773,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	/* Support queueing exactly one non-rt signal, so that we
 	   can get more detailed information about the cause of
 	   the signal. */
-	if (LEGACY_QUEUE(&t->pending, sig))
+	if (legacy_queue(&t->pending, sig))
 		goto out;
 
 	ret = send_signal(sig, info, t, &t->pending);
@@ -932,7 +934,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	if (sig_ignored(p, sig))
 		return ret;
 
-	if (LEGACY_QUEUE(&p->signal->shared_pending, sig))
+	if (legacy_queue(&p->signal->shared_pending, sig))
 		/* This is a non-RT signal and we already have one queued.  */
 		return ret;
 
-- 
cgit v1.3-14-g43fede


From 2acb024d5524eda305523c1d6061fe5ef1949165 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:52:35 -0700
Subject: signals: consolidate checking for ignored/legacy signals

Two callers for send_signal() - the specific_send_sig_info and the
__group_send_sig_info - both check for sig to be ignored or already queued.

Move these checks into send_signal() and make it return 1 to indicate that the
signal is dropped, but there's no error in this.

Besides, merge comments and spell-check them.

[oleg@tv-sign.ru: simplifications]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 772aa011dad8..fb8ffd468854 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -667,6 +667,14 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 {
 	struct sigqueue * q = NULL;
 
+	/*
+	 * Short-circuit ignored signals and support queuing
+	 * exactly one non-rt signal, so that we can get more
+	 * detailed information about the cause of the signal.
+	 */
+	if (sig_ignored(t, sig) || legacy_queue(signals, sig))
+		return 0;
+
 	/*
 	 * Deliver the signal to listening signalfds. This must be called
 	 * with the sighand lock held.
@@ -723,7 +731,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 
 out_set:
 	sigaddset(&signals->signal, sig);
-	return 0;
+	return 1;
 }
 
 int print_fatal_signals;
@@ -761,26 +769,18 @@ __setup("print-fatal-signals=", setup_print_fatal_signals);
 static int
 specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 {
-	int ret = 0;
+	int ret;
 
 	BUG_ON(!irqs_disabled());
 	assert_spin_locked(&t->sighand->siglock);
 
-	/* Short-circuit ignored signals.  */
-	if (sig_ignored(t, sig))
-		goto out;
-
-	/* Support queueing exactly one non-rt signal, so that we
-	   can get more detailed information about the cause of
-	   the signal. */
-	if (legacy_queue(&t->pending, sig))
-		goto out;
-
 	ret = send_signal(sig, info, t, &t->pending);
-	if (!ret && !sigismember(&t->blocked, sig))
+	if (ret <= 0)
+		return ret;
+
+	if (!sigismember(&t->blocked, sig))
 		signal_wake_up(t, sig == SIGKILL);
-out:
-	return ret;
+	return 0;
 }
 
 /*
@@ -925,26 +925,18 @@ __group_complete_signal(int sig, struct task_struct *p)
 int
 __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-	int ret = 0;
+	int ret;
 
 	assert_spin_locked(&p->sighand->siglock);
 	handle_stop_signal(sig, p);
 
-	/* Short-circuit ignored signals.  */
-	if (sig_ignored(p, sig))
-		return ret;
-
-	if (legacy_queue(&p->signal->shared_pending, sig))
-		/* This is a non-RT signal and we already have one queued.  */
-		return ret;
-
 	/*
 	 * Put this signal on the shared-pending queue, or fail with EAGAIN.
 	 * We always use the shared queue for process-wide signals,
 	 * to avoid several races.
 	 */
 	ret = send_signal(sig, info, p, &p->signal->shared_pending);
-	if (unlikely(ret))
+	if (ret <= 0)
 		return ret;
 
 	__group_complete_signal(sig, p);
-- 
cgit v1.3-14-g43fede


From 573cf9ad72c13750e86c91de43477e9dfb440523 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:36 -0700
Subject: signals: do_signal_stop(): use signal_group_exit()

do_signal_stop() needs signal_group_exit() but checks sig->group_exit_task.
 This (optimization) is correct, SIGNAL_STOP_DEQUEUED and SIGNAL_GROUP_EXIT
are mutually exclusive, but looks confusing.  Use signal_group_exit(), this
is not fastpath, the code clarity is more important.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index fb8ffd468854..29aca40be33f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1717,7 +1717,7 @@ static int do_signal_stop(int signr)
 		struct task_struct *t;
 
 		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
-		    unlikely(sig->group_exit_task))
+		    unlikely(signal_group_exit(sig)))
 			return 0;
 		/*
 		 * There is no group stop already in progress.
-- 
cgit v1.3-14-g43fede


From bfc4b0890af566940de6e7aeb4b5faf46d3c3513 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:36 -0700
Subject: signals: do_group_exit(): use signal_group_exit() more consistently

do_group_exit() checks SIGNAL_GROUP_EXIT to avoid taking sighand->siglock.
Since ed5d2cac114202fe2978a9cbcab8f5032796d538 exec() doesn't set this
flag, we should use signal_group_exit().

This is not needed for correctness, but can speedup the multithreaded exec
and makes the code more consistent.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ae0f2c4e452b..6d019aa8522e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1115,12 +1115,13 @@ asmlinkage long sys_exit(int error_code)
 NORET_TYPE void
 do_group_exit(int exit_code)
 {
+	struct signal_struct *sig = current->signal;
+
 	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 
-	if (current->signal->flags & SIGNAL_GROUP_EXIT)
-		exit_code = current->signal->group_exit_code;
+	if (signal_group_exit(sig))
+		exit_code = sig->group_exit_code;
 	else if (!thread_group_empty(current)) {
-		struct signal_struct *const sig = current->signal;
 		struct sighand_struct *const sighand = current->sighand;
 		spin_lock_irq(&sighand->siglock);
 		if (signal_group_exit(sig))
-- 
cgit v1.3-14-g43fede


From 1406f2d321bae5ac5ff729dcb773336d9c05ec74 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:37 -0700
Subject: lock_task_sighand: add rcu lock/unlock

Most of the callers of lock_task_sighand() doesn't actually need rcu_lock().
lock_task_sighand() needs it only to safely play with tsk->sighand, it can
take the lock itself.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 29aca40be33f..4a45bac2c632 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -971,13 +971,11 @@ int __fatal_signal_pending(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(__fatal_signal_pending);
 
-/*
- * Must be called under rcu_read_lock() or with tasklist_lock read-held.
- */
 struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
 {
 	struct sighand_struct *sighand;
 
+	rcu_read_lock();
 	for (;;) {
 		sighand = rcu_dereference(tsk->sighand);
 		if (unlikely(sighand == NULL))
@@ -988,6 +986,7 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
 			break;
 		spin_unlock_irqrestore(&sighand->siglock, *flags);
 	}
+	rcu_read_unlock();
 
 	return sighand;
 }
-- 
cgit v1.3-14-g43fede


From d6cf723a142f63ccb92272bc0e9bfffd3c3a5cac Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:38 -0700
Subject: k_getrusage: don't take rcu_read_lock()

Just a trivial example, more to come.

k_getrusage() holds rcu_read_lock() because it was previously required by
lock_task_sighand().  Unneeded now.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index e423d0d9e6ff..47c30a20b554 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1572,11 +1572,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 		goto out;
 	}
 
-	rcu_read_lock();
-	if (!lock_task_sighand(p, &flags)) {
-		rcu_read_unlock();
+	if (!lock_task_sighand(p, &flags))
 		return;
-	}
 
 	switch (who) {
 		case RUSAGE_BOTH:
@@ -1612,9 +1609,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 		default:
 			BUG();
 	}
-
 	unlock_task_sighand(p, &flags);
-	rcu_read_unlock();
 
 out:
 	cputime_to_timeval(utime, &r->ru_utime);
-- 
cgit v1.3-14-g43fede


From 93585eeaf3d42d608cd7232e7420c93fb676bba1 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:52:39 -0700
Subject: signals: consolidate checks for whether or not to ignore a signal

Both sig_ignored() and do_sigaction() check for signr to be explicitly or
implicitly ignored.  Introduce a helper for them.

This patch is aimed to help handling signals by pid namespace's init, and was
derived from one of Oleg's patches
https://lists.linux-foundation.org/pipermail/containers/2007-December/009308.html
so, if he doesn't mind, he should be considered as an author.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 4a45bac2c632..24ee53b7f60c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,11 +39,19 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
+static int __sig_ignored(struct task_struct *t, int sig)
+{
+	void __user *handler;
+
+	/* Is it explicitly or implicitly ignored? */
+
+	handler = t->sighand->action[sig - 1].sa.sa_handler;
+	return handler == SIG_IGN ||
+		(handler == SIG_DFL && sig_kernel_ignore(sig));
+}
 
 static int sig_ignored(struct task_struct *t, int sig)
 {
-	void __user * handler;
-
 	/*
 	 * Tracers always want to know about signals..
 	 */
@@ -58,10 +66,7 @@ static int sig_ignored(struct task_struct *t, int sig)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	/* Is it explicitly or implicitly ignored? */
-	handler = t->sighand->action[sig-1].sa.sa_handler;
-	return   handler == SIG_IGN ||
-		(handler == SIG_DFL && sig_kernel_ignore(sig));
+	return __sig_ignored(t, sig);
 }
 
 /*
@@ -2331,13 +2336,14 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
 
 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
+	struct task_struct *t = current;
 	struct k_sigaction *k;
 	sigset_t mask;
 
 	if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
 		return -EINVAL;
 
-	k = &current->sighand->action[sig-1];
+	k = &t->sighand->action[sig-1];
 
 	spin_lock_irq(&current->sighand->siglock);
 	if (oact)
@@ -2358,9 +2364,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 		 *   (for example, SIGCHLD), shall cause the pending signal to
 		 *   be discarded, whether or not it is blocked"
 		 */
-		if (act->sa.sa_handler == SIG_IGN ||
-		   (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
-			struct task_struct *t = current;
+		if (__sig_ignored(t, sig)) {
 			sigemptyset(&mask);
 			sigaddset(&mask, sig);
 			rm_from_queue_full(&mask, &t->signal->shared_pending);
-- 
cgit v1.3-14-g43fede


From c5363d03637885310f1101b95cbbd26d067b4c8d Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:52:40 -0700
Subject: signals: clean dequeue_signal from excess checks and assignments

The signr variable may be declared without initialization - it is set ro the
return value from __dequeue_signal() right at the function beginning.

Besides, after recalc_sigpending() two checks for signr to be not 0 may be
merged into one.  Both if-s become easier to read.

Thanks to Oleg for pointing out mistakes in the first version of this patch.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 24ee53b7f60c..6610a95506b3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -377,7 +377,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
  */
 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 {
-	int signr = 0;
+	int signr;
 
 	/* We only dequeue private signals from ourselves, we don't let
 	 * signalfd steal them
@@ -410,8 +410,12 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 			}
 		}
 	}
+
 	recalc_sigpending();
-	if (signr && unlikely(sig_kernel_stop(signr))) {
+	if (!signr)
+		return 0;
+
+	if (unlikely(sig_kernel_stop(signr))) {
 		/*
 		 * Set a marker that we have dequeued a stop signal.  Our
 		 * caller might release the siglock and then the pending
@@ -427,9 +431,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
 			tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
 	}
-	if (signr &&
-	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
-	     info->si_sys_private) {
+	if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
 		/*
 		 * Release the siglock to ensure proper locking order
 		 * of timer locks outside of siglocks.  Note, we leave
-- 
cgit v1.3-14-g43fede


From 9e3bd6c3fb2334be171e69b432039cd18bce4458 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:52:41 -0700
Subject: signals: consolidate send_sigqueue and send_group_sigqueue

Both functions do the same thing after proper locking, but with
different sigpending structs, so move the common code into a helper.

After this we have 4 places that look very similar: send_sigqueue: calls
do_send_sigqueue and signal_wakeup send_group_sigqueue: calls
do_send_sigqueue and __group_complete_signal __group_send_sig_info:
calls send_signal and __group_complete_signal specific_send_sig_info:
calls send_signal and signal_wakeup

Besides, send_signal performs actions similar to do_send_sigqueue's
and __group_complete_signal - to signal_wakeup.

It looks like they can be consolidated gracefully.

Oleg said:

  Personally, I think this change is very good.  But send_sigqueue() and
  send_group_sigqueue() have a very subtle difference which I was never able
  to understand.

  Let's suppose that sigqueue is already queued, and the signal is ignored
  (the latter means we should re-schedule cpu timer or handle overrruns).  In
  that case send_sigqueue() returns 0, but send_group_sigqueue() returns 1.

  I think this is not the problem (in fact, I think this patch makes the
  behaviour more correct), but I hope Thomas can take a look and confirm.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 86 +++++++++++++++++++--------------------------------------
 1 file changed, 29 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 6610a95506b3..f9a52c721274 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1290,10 +1290,33 @@ void sigqueue_free(struct sigqueue *q)
 	__sigqueue_free(q);
 }
 
+static int do_send_sigqueue(int sig, struct sigqueue *q, struct task_struct *t,
+		struct sigpending *pending)
+{
+	if (unlikely(!list_empty(&q->list))) {
+		/*
+		 * If an SI_TIMER entry is already queue just increment
+		 * the overrun count.
+		 */
+
+		BUG_ON(q->info.si_code != SI_TIMER);
+		q->info.si_overrun++;
+		return 0;
+	}
+
+	if (sig_ignored(t, sig))
+		return 1;
+
+	signalfd_notify(t, sig);
+	list_add_tail(&q->list, &pending->list);
+	sigaddset(&pending->signal, sig);
+	return 0;
+}
+
 int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 {
 	unsigned long flags;
-	int ret = 0;
+	int ret = -1;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
 
@@ -1307,37 +1330,14 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	 */
 	rcu_read_lock();
 
-	if (!likely(lock_task_sighand(p, &flags))) {
-		ret = -1;
+	if (!likely(lock_task_sighand(p, &flags)))
 		goto out_err;
-	}
 
-	if (unlikely(!list_empty(&q->list))) {
-		/*
-		 * If an SI_TIMER entry is already queue just increment
-		 * the overrun count.
-		 */
-		BUG_ON(q->info.si_code != SI_TIMER);
-		q->info.si_overrun++;
-		goto out;
-	}
-	/* Short-circuit ignored signals.  */
-	if (sig_ignored(p, sig)) {
-		ret = 1;
-		goto out;
-	}
-	/*
-	 * Deliver the signal to listening signalfds. This must be called
-	 * with the sighand lock held.
-	 */
-	signalfd_notify(p, sig);
+	ret = do_send_sigqueue(sig, q, p, &p->pending);
 
-	list_add_tail(&q->list, &p->pending.list);
-	sigaddset(&p->pending.signal, sig);
 	if (!sigismember(&p->blocked, sig))
 		signal_wake_up(p, sig == SIGKILL);
 
-out:
 	unlock_task_sighand(p, &flags);
 out_err:
 	rcu_read_unlock();
@@ -1349,7 +1349,7 @@ int
 send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 {
 	unsigned long flags;
-	int ret = 0;
+	int ret;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
 
@@ -1358,38 +1358,10 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 	handle_stop_signal(sig, p);
 
-	/* Short-circuit ignored signals.  */
-	if (sig_ignored(p, sig)) {
-		ret = 1;
-		goto out;
-	}
-
-	if (unlikely(!list_empty(&q->list))) {
-		/*
-		 * If an SI_TIMER entry is already queue just increment
-		 * the overrun count.  Other uses should not try to
-		 * send the signal multiple times.
-		 */
-		BUG_ON(q->info.si_code != SI_TIMER);
-		q->info.si_overrun++;
-		goto out;
-	} 
-	/*
-	 * Deliver the signal to listening signalfds. This must be called
-	 * with the sighand lock held.
-	 */
-	signalfd_notify(p, sig);
-
-	/*
-	 * Put this signal on the shared-pending queue.
-	 * We always use the shared queue for process-wide signals,
-	 * to avoid several races.
-	 */
-	list_add_tail(&q->list, &p->signal->shared_pending.list);
-	sigaddset(&p->signal->shared_pending.signal, sig);
+	ret = do_send_sigqueue(sig, q, p, &p->signal->shared_pending);
 
 	__group_complete_signal(sig, p);
-out:
+
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	read_unlock(&tasklist_lock);
 	return ret;
-- 
cgit v1.3-14-g43fede


From 3b5e9e53c6f31b5a5a0f5c43707503c62bdefa46 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:42 -0700
Subject: signals: cleanup security_task_kill() usage/implementation

Every implementation of ->task_kill() does nothing when the signal comes from
the kernel.  This is correct, but means that check_kill_permission() should
call security_task_kill() only for SI_FROMUSER() case, and we can remove the
same check from ->task_kill() implementations.

(sadly, check_kill_permission() is the last user of signal->session/__session
 but we can't s/task_session_nr/task_session/ here).

NOTE: Eric W.  Biederman pointed out cap_task_kill() should die, and I think
he is very right.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: David Quigley <dpquigl@tycho.nsa.gov>
Cc: Eric Paris <eparis@redhat.com>
Cc: Harald Welte <laforge@gnumonks.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c            | 27 ++++++++++++++-------------
 security/selinux/hooks.c   |  3 ---
 security/smack/smack_lsm.c |  9 ---------
 3 files changed, 14 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f9a52c721274..91d57f89f5a5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -533,22 +533,23 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
 static int check_kill_permission(int sig, struct siginfo *info,
 				 struct task_struct *t)
 {
-	int error = -EINVAL;
+	int error;
+
 	if (!valid_signal(sig))
-		return error;
+		return -EINVAL;
 
-	if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) {
-		error = audit_signal_info(sig, t); /* Let audit system see the signal */
-		if (error)
-			return error;
-		error = -EPERM;
-		if (((sig != SIGCONT) ||
-			(task_session_nr(current) != task_session_nr(t)))
-		    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
-		    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
-		    && !capable(CAP_KILL))
+	if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
+		return 0;
+
+	error = audit_signal_info(sig, t); /* Let audit system see the signal */
+	if (error)
 		return error;
-	}
+
+	if (((sig != SIGCONT) || (task_session_nr(current) != task_session_nr(t)))
+	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
+	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
+	    && !capable(CAP_KILL))
+		return -EPERM;
 
 	return security_task_kill(t, info, sig, 0);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 85a220465a8f..1b50a6ebc55f 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3286,9 +3286,6 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info,
 	if (rc)
 		return rc;
 
-	if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
-		return 0;
-
 	if (!sig)
 		perm = PROCESS__SIGNULL; /* null signal; existence test */
 	else
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index fe0ae1bf1650..b5c8f9237008 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1130,15 +1130,6 @@ static int smack_task_movememory(struct task_struct *p)
 static int smack_task_kill(struct task_struct *p, struct siginfo *info,
 			   int sig, u32 secid)
 {
-	/*
-	 * Special cases where signals really ought to go through
-	 * in spite of policy. Stephen Smalley suggests it may
-	 * make sense to change the caller so that it doesn't
-	 * bother with the LSM hook in these cases.
-	 */
-	if (info != SEND_SIG_NOINFO &&
-	    (is_si_special(info) || SI_FROMKERNEL(info)))
-		return 0;
 	/*
 	 * Sending a signal requires that the sender
 	 * can write the receiver.
-- 
cgit v1.3-14-g43fede


From e442055193e4584218006e616c9bdce0c5e9ae5c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:44 -0700
Subject: signals: re-assign CLD_CONTINUED notification from the sender to
 reciever

Based on discussion with Jiri and Roland.

In short: currently handle_stop_signal(SIGCONT, p) sends the notification to
p->parent, with this patch p itself notifies its parent when it becomes
running.

handle_stop_signal(SIGCONT) has to drop ->siglock temporary in order to notify
the parent with do_notify_parent_cldstop().  This leads to multiple problems:

	- as Jiri Kosina pointed out, the stopped task can resume without
	  actually seeing SIGCONT which may have a handler.

	- we race with another sig_kernel_stop() signal which may come in
	  that window.

	- we race with sig_fatal() signals which may set SIGNAL_GROUP_EXIT
	  in that window.

	- we can't avoid taking tasklist_lock() while sending SIGCONT.

With this patch handle_stop_signal() just sets the new SIGNAL_CLD_CONTINUED
flag in p->signal->flags and returns.  The notification is sent by the first
task which returns from finish_stop() (there should be at least one) or any
other signalled thread from get_signal_to_deliver().

This is a user-visible change.  Say, currently kill(SIGCONT, stopped_child)
can't return without seeing SIGCHLD, with this patch SIGCHLD can be delayed
unpredictably.  Another difference is that if the child is ptraced by another
process, CLD_CONTINUED may be delivered to ->real_parent after ptrace_detach()
while currently it always goes to the tracer which doesn't actually need this
notification.  Hopefully not a problem.

The patch asks for the futher obvious cleanups, I'll send them separately.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  6 ++++++
 kernel/signal.c       | 29 +++++++++++++++++++----------
 2 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1d02babdb2c7..ef5615270342 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -554,6 +554,12 @@ struct signal_struct {
 #define SIGNAL_STOP_DEQUEUED	0x00000002 /* stop signal dequeued */
 #define SIGNAL_STOP_CONTINUED	0x00000004 /* SIGCONT since WCONTINUED reap */
 #define SIGNAL_GROUP_EXIT	0x00000008 /* group exit in progress */
+/*
+ * Pending notifications to parent.
+ */
+#define SIGNAL_CLD_STOPPED	0x00000010
+#define SIGNAL_CLD_CONTINUED	0x00000020
+#define SIGNAL_CLD_MASK		(SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
 
 /* If true, all threads except ->group_exit_task have pending SIGKILL */
 static inline int signal_group_exit(const struct signal_struct *sig)
diff --git a/kernel/signal.c b/kernel/signal.c
index 91d57f89f5a5..115c04f3f143 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -603,10 +603,8 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			 * the SIGCHLD was pending on entry to this kill.
 			 */
 			p->signal->group_stop_count = 0;
-			p->signal->flags = SIGNAL_STOP_CONTINUED;
-			spin_unlock(&p->sighand->siglock);
-			do_notify_parent_cldstop(p, CLD_STOPPED);
-			spin_lock(&p->sighand->siglock);
+			p->signal->flags = SIGNAL_STOP_CONTINUED |
+						SIGNAL_CLD_STOPPED;
 		}
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
 		t = p;
@@ -643,25 +641,23 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			 * We were in fact stopped, and are now continued.
 			 * Notify the parent with CLD_CONTINUED.
 			 */
-			p->signal->flags = SIGNAL_STOP_CONTINUED;
+			p->signal->flags = SIGNAL_STOP_CONTINUED |
+						SIGNAL_CLD_CONTINUED;
 			p->signal->group_exit_code = 0;
-			spin_unlock(&p->sighand->siglock);
-			do_notify_parent_cldstop(p, CLD_CONTINUED);
-			spin_lock(&p->sighand->siglock);
 		} else {
 			/*
 			 * We are not stopped, but there could be a stop
 			 * signal in the middle of being processed after
 			 * being removed from the queue.  Clear that too.
 			 */
-			p->signal->flags = 0;
+			p->signal->flags &= ~SIGNAL_STOP_DEQUEUED;
 		}
 	} else if (sig == SIGKILL) {
 		/*
 		 * Make sure that any pending stop signal already dequeued
 		 * is undone by the wakeup for SIGKILL.
 		 */
-		p->signal->flags = 0;
+		p->signal->flags &= ~SIGNAL_STOP_DEQUEUED;
 	}
 }
 
@@ -1784,6 +1780,19 @@ relock:
 	try_to_freeze();
 
 	spin_lock_irq(&current->sighand->siglock);
+
+	if (unlikely(current->signal->flags & SIGNAL_CLD_MASK)) {
+		int why = (current->signal->flags & SIGNAL_STOP_CONTINUED)
+				? CLD_CONTINUED : CLD_STOPPED;
+		current->signal->flags &= ~SIGNAL_CLD_MASK;
+		spin_unlock_irq(&current->sighand->siglock);
+
+		read_lock(&tasklist_lock);
+		do_notify_parent_cldstop(current->group_leader, why);
+		read_unlock(&tasklist_lock);
+		goto relock;
+	}
+
 	for (;;) {
 		struct k_sigaction *ka;
 
-- 
cgit v1.3-14-g43fede


From 6ca25b551309eb1b1b41f83414a92f7472e0b23d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:45 -0700
Subject: kill_pid_info: don't take now unneeded tasklist_lock

Previously handle_stop_signal(SIGCONT) could drop ->siglock.  That is why
kill_pid_info(SIGCONT) takes tasklist_lock to make sure the target task can't
go away after unlock.  Not needed now.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/signal.h | 2 --
 kernel/signal.c        | 7 +------
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 42d2e0a948f4..84f997f8aa53 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -362,8 +362,6 @@ int unhandled_signal(struct task_struct *tsk, int sig);
 #define sig_kernel_stop(sig) \
 	(((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_STOP_MASK))
 
-#define sig_needs_tasklist(sig)	((sig) == SIGCONT)
-
 #define sig_user_defined(t, signr) \
 	(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) &&	\
 	 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
diff --git a/kernel/signal.c b/kernel/signal.c
index 115c04f3f143..ce53ab19c21d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1039,9 +1039,6 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
 	struct task_struct *p;
 
 	rcu_read_lock();
-	if (unlikely(sig_needs_tasklist(sig)))
-		read_lock(&tasklist_lock);
-
 retry:
 	p = pid_task(pid, PIDTYPE_PID);
 	if (p) {
@@ -1055,10 +1052,8 @@ retry:
 			 */
 			goto retry;
 	}
-
-	if (unlikely(sig_needs_tasklist(sig)))
-		read_unlock(&tasklist_lock);
 	rcu_read_unlock();
+
 	return error;
 }
 
-- 
cgit v1.3-14-g43fede


From fc321d2e60d6f4eee17206612d0b50519f526daf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:46 -0700
Subject: handle_stop_signal: unify partial/full stop handling

Now that handle_stop_signal() doesn't drop ->siglock, we can't see both
->group_stop_count && SIGNAL_STOP_STOPPED.  Merge two "if" branches.

As Roland pointed out, we never actually needed 2 do_notify_parent_cldstop()
calls.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 45 +++++++++++++++++++--------------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ce53ab19c21d..dee8cc927a63 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -585,33 +585,16 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			t = next_thread(t);
 		} while (t != p);
 	} else if (sig == SIGCONT) {
+		unsigned int why;
 		/*
 		 * Remove all stop signals from all queues,
 		 * and wake all threads.
 		 */
-		if (unlikely(p->signal->group_stop_count > 0)) {
-			/*
-			 * There was a group stop in progress.  We'll
-			 * pretend it finished before we got here.  We are
-			 * obliged to report it to the parent: if the
-			 * SIGSTOP happened "after" this SIGCONT, then it
-			 * would have cleared this pending SIGCONT.  If it
-			 * happened "before" this SIGCONT, then the parent
-			 * got the SIGCHLD about the stop finishing before
-			 * the continue happened.  We do the notification
-			 * now, and it's as if the stop had finished and
-			 * the SIGCHLD was pending on entry to this kill.
-			 */
-			p->signal->group_stop_count = 0;
-			p->signal->flags = SIGNAL_STOP_CONTINUED |
-						SIGNAL_CLD_STOPPED;
-		}
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
 		t = p;
 		do {
 			unsigned int state;
 			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
-			
 			/*
 			 * If there is a handler for SIGCONT, we must make
 			 * sure that no thread returns to user mode before
@@ -621,7 +604,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			 * running the handler.  With the TIF_SIGPENDING
 			 * flag set, the thread will pause and acquire the
 			 * siglock that we hold now and until we've queued
-			 * the pending signal. 
+			 * the pending signal.
 			 *
 			 * Wake up the stopped thread _after_ setting
 			 * TIF_SIGPENDING
@@ -636,13 +619,23 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			t = next_thread(t);
 		} while (t != p);
 
-		if (p->signal->flags & SIGNAL_STOP_STOPPED) {
-			/*
-			 * We were in fact stopped, and are now continued.
-			 * Notify the parent with CLD_CONTINUED.
-			 */
-			p->signal->flags = SIGNAL_STOP_CONTINUED |
-						SIGNAL_CLD_CONTINUED;
+		/*
+		 * Notify the parent with CLD_CONTINUED if we were stopped.
+		 *
+		 * If we were in the middle of a group stop, we pretend it
+		 * was already finished, and then continued. Since SIGCHLD
+		 * doesn't queue we report only CLD_STOPPED, as if the next
+		 * CLD_CONTINUED was dropped.
+		 */
+		why = 0;
+		if (p->signal->flags & SIGNAL_STOP_STOPPED)
+			why |= SIGNAL_CLD_CONTINUED;
+		else if (p->signal->group_stop_count)
+			why |= SIGNAL_CLD_STOPPED;
+
+		if (why) {
+			p->signal->flags = why | SIGNAL_STOP_CONTINUED;
+			p->signal->group_stop_count = 0;
 			p->signal->group_exit_code = 0;
 		} else {
 			/*
-- 
cgit v1.3-14-g43fede


From ad16a4606939ce1bedb79c87e412467be803e990 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:46 -0700
Subject: handle_stop_signal: use the cached p->signal value

Cache the value of p->signal, and change the code to use while_each_thread()
helper.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index dee8cc927a63..b266fa46402a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -566,9 +566,10 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
  */
 static void handle_stop_signal(int sig, struct task_struct *p)
 {
+	struct signal_struct *signal = p->signal;
 	struct task_struct *t;
 
-	if (p->signal->flags & SIGNAL_GROUP_EXIT)
+	if (signal->flags & SIGNAL_GROUP_EXIT)
 		/*
 		 * The process is in the middle of dying already.
 		 */
@@ -578,19 +579,18 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 		/*
 		 * This is a stop signal.  Remove SIGCONT from all queues.
 		 */
-		rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending);
+		rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
 		t = p;
 		do {
 			rm_from_queue(sigmask(SIGCONT), &t->pending);
-			t = next_thread(t);
-		} while (t != p);
+		} while_each_thread(p, t);
 	} else if (sig == SIGCONT) {
 		unsigned int why;
 		/*
 		 * Remove all stop signals from all queues,
 		 * and wake all threads.
 		 */
-		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
+		rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
 		t = p;
 		do {
 			unsigned int state;
@@ -615,9 +615,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 				state |= TASK_INTERRUPTIBLE;
 			}
 			wake_up_state(t, state);
-
-			t = next_thread(t);
-		} while (t != p);
+		} while_each_thread(p, t);
 
 		/*
 		 * Notify the parent with CLD_CONTINUED if we were stopped.
@@ -628,29 +626,29 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 		 * CLD_CONTINUED was dropped.
 		 */
 		why = 0;
-		if (p->signal->flags & SIGNAL_STOP_STOPPED)
+		if (signal->flags & SIGNAL_STOP_STOPPED)
 			why |= SIGNAL_CLD_CONTINUED;
-		else if (p->signal->group_stop_count)
+		else if (signal->group_stop_count)
 			why |= SIGNAL_CLD_STOPPED;
 
 		if (why) {
-			p->signal->flags = why | SIGNAL_STOP_CONTINUED;
-			p->signal->group_stop_count = 0;
-			p->signal->group_exit_code = 0;
+			signal->flags = why | SIGNAL_STOP_CONTINUED;
+			signal->group_stop_count = 0;
+			signal->group_exit_code = 0;
 		} else {
 			/*
 			 * We are not stopped, but there could be a stop
 			 * signal in the middle of being processed after
 			 * being removed from the queue.  Clear that too.
 			 */
-			p->signal->flags &= ~SIGNAL_STOP_DEQUEUED;
+			signal->flags &= ~SIGNAL_STOP_DEQUEUED;
 		}
 	} else if (sig == SIGKILL) {
 		/*
 		 * Make sure that any pending stop signal already dequeued
 		 * is undone by the wakeup for SIGKILL.
 		 */
-		p->signal->flags &= ~SIGNAL_STOP_DEQUEUED;
+		signal->flags &= ~SIGNAL_STOP_DEQUEUED;
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From f6b76d4fb0039e077824be85ed4ac94e96beef86 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:47 -0700
Subject: get_signal_to_deliver: use the cached ->signal/sighand values

Cache the values of current->signal/sighand.  Shrinks .text a bit and makes
the code more readable.  Also, remove "sigset_t *mask", it is pointless
because in fact we save the constant offset.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index b266fa46402a..f92e6298930c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1753,8 +1753,9 @@ static int ptrace_signal(int signr, siginfo_t *info,
 int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 			  struct pt_regs *regs, void *cookie)
 {
-	sigset_t *mask = &current->blocked;
-	int signr = 0;
+	struct sighand_struct *sighand = current->sighand;
+	struct signal_struct *signal = current->signal;
+	int signr;
 
 relock:
 	/*
@@ -1765,13 +1766,13 @@ relock:
 	 */
 	try_to_freeze();
 
-	spin_lock_irq(&current->sighand->siglock);
+	spin_lock_irq(&sighand->siglock);
 
-	if (unlikely(current->signal->flags & SIGNAL_CLD_MASK)) {
-		int why = (current->signal->flags & SIGNAL_STOP_CONTINUED)
+	if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
+		int why = (signal->flags & SIGNAL_STOP_CONTINUED)
 				? CLD_CONTINUED : CLD_STOPPED;
-		current->signal->flags &= ~SIGNAL_CLD_MASK;
-		spin_unlock_irq(&current->sighand->siglock);
+		signal->flags &= ~SIGNAL_CLD_MASK;
+		spin_unlock_irq(&sighand->siglock);
 
 		read_lock(&tasklist_lock);
 		do_notify_parent_cldstop(current->group_leader, why);
@@ -1782,12 +1783,11 @@ relock:
 	for (;;) {
 		struct k_sigaction *ka;
 
-		if (unlikely(current->signal->group_stop_count > 0) &&
+		if (unlikely(signal->group_stop_count > 0) &&
 		    do_signal_stop(0))
 			goto relock;
 
-		signr = dequeue_signal(current, mask, info);
-
+		signr = dequeue_signal(current, &current->blocked, info);
 		if (!signr)
 			break; /* will return 0 */
 
@@ -1797,7 +1797,7 @@ relock:
 				continue;
 		}
 
-		ka = &current->sighand->action[signr-1];
+		ka = &sighand->action[signr-1];
 		if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
 			continue;
 		if (ka->sa.sa_handler != SIG_DFL) {
@@ -1834,14 +1834,14 @@ relock:
 			 * We need to check for that and bail out if necessary.
 			 */
 			if (signr != SIGSTOP) {
-				spin_unlock_irq(&current->sighand->siglock);
+				spin_unlock_irq(&sighand->siglock);
 
 				/* signals can be posted during this window */
 
 				if (is_current_pgrp_orphaned())
 					goto relock;
 
-				spin_lock_irq(&current->sighand->siglock);
+				spin_lock_irq(&sighand->siglock);
 			}
 
 			if (likely(do_signal_stop(signr))) {
@@ -1856,7 +1856,7 @@ relock:
 			continue;
 		}
 
-		spin_unlock_irq(&current->sighand->siglock);
+		spin_unlock_irq(&sighand->siglock);
 
 		/*
 		 * Anything else is fatal, maybe with a core dump.
@@ -1882,7 +1882,7 @@ relock:
 		do_group_exit(signr);
 		/* NOTREACHED */
 	}
-	spin_unlock_irq(&current->sighand->siglock);
+	spin_unlock_irq(&sighand->siglock);
 	return signr;
 }
 
-- 
cgit v1.3-14-g43fede


From 5c193e8871b76f3bf8ed1e31f7af7c70890ebc4f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:48 -0700
Subject: signals: send_sigqueue: don't take rcu lock

lock_task_sighand() was changed, send_sigqueue() doesn't need rcu_read_lock()
any longer.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f92e6298930c..0a8b0aece80d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1311,8 +1311,6 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	 * We return -1, when the task is marked exiting, so
 	 * posix_timer_event can redirect it to the group leader
 	 */
-	rcu_read_lock();
-
 	if (!likely(lock_task_sighand(p, &flags)))
 		goto out_err;
 
@@ -1323,8 +1321,6 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 
 	unlock_task_sighand(p, &flags);
 out_err:
-	rcu_read_unlock();
-
 	return ret;
 }
 
-- 
cgit v1.3-14-g43fede


From 5fc894bb4fb1de8373d1d5fb6db19204a16859e8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:48 -0700
Subject: signals: send_sigqueue: don't forget about handle_stop_signal()

send_group_sigqueue() calls handle_stop_signal(), send_sigqueue() doesn't.
This is not consistent and in fact I'd say this is (minor) bug.

Move handle_stop_signal() from send_group_sigqueue() to do_send_sigqueue(),
the latter is called by send_sigqueue() too.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 0a8b0aece80d..8259262eaa60 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1274,8 +1274,10 @@ void sigqueue_free(struct sigqueue *q)
 }
 
 static int do_send_sigqueue(int sig, struct sigqueue *q, struct task_struct *t,
-		struct sigpending *pending)
+				struct sigpending *pending)
 {
+	handle_stop_signal(sig, t);
+
 	if (unlikely(!list_empty(&q->list))) {
 		/*
 		 * If an SI_TIMER entry is already queue just increment
@@ -1335,7 +1337,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	read_lock(&tasklist_lock);
 	/* Since it_lock is held, p->sighand cannot be NULL. */
 	spin_lock_irqsave(&p->sighand->siglock, flags);
-	handle_stop_signal(sig, p);
 
 	ret = do_send_sigqueue(sig, q, p, &p->signal->shared_pending);
 
-- 
cgit v1.3-14-g43fede


From f8c5b5c06f63fe9aaebefbf9f0b79909066b1b6c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:49 -0700
Subject: signals: __group_complete_signal: cache the value of p->signal

Cosmetic, cache p->signal to make the code a bit more readable.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8259262eaa60..2a06f2441805 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -842,6 +842,7 @@ static inline int wants_signal(int sig, struct task_struct *p)
 static void
 __group_complete_signal(int sig, struct task_struct *p)
 {
+	struct signal_struct *signal = p->signal;
 	struct task_struct *t;
 
 	/*
@@ -862,14 +863,14 @@ __group_complete_signal(int sig, struct task_struct *p)
 		/*
 		 * Otherwise try to find a suitable thread.
 		 */
-		t = p->signal->curr_target;
+		t = signal->curr_target;
 		if (t == NULL)
 			/* restart balancing at this thread */
-			t = p->signal->curr_target = p;
+			t = signal->curr_target = p;
 
 		while (!wants_signal(sig, t)) {
 			t = next_thread(t);
-			if (t == p->signal->curr_target)
+			if (t == signal->curr_target)
 				/*
 				 * No thread needs to be woken.
 				 * Any eligible threads will see
@@ -877,14 +878,14 @@ __group_complete_signal(int sig, struct task_struct *p)
 				 */
 				return;
 		}
-		p->signal->curr_target = t;
+		signal->curr_target = t;
 	}
 
 	/*
 	 * Found a killable thread.  If the signal will be fatal,
 	 * then start taking the whole group down immediately.
 	 */
-	if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) &&
+	if (sig_fatal(p, sig) && !(signal->flags & SIGNAL_GROUP_EXIT) &&
 	    !sigismember(&t->real_blocked, sig) &&
 	    (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
 		/*
@@ -897,9 +898,9 @@ __group_complete_signal(int sig, struct task_struct *p)
 			 * running and doing things after a slower
 			 * thread has the fatal signal pending.
 			 */
-			p->signal->flags = SIGNAL_GROUP_EXIT;
-			p->signal->group_exit_code = sig;
-			p->signal->group_stop_count = 0;
+			signal->flags = SIGNAL_GROUP_EXIT;
+			signal->group_exit_code = sig;
+			signal->group_stop_count = 0;
 			t = p;
 			do {
 				sigaddset(&t->pending.signal, SIGKILL);
-- 
cgit v1.3-14-g43fede


From c99fcf28b87d8cab592db7571e3164f5cb54c5b3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:49 -0700
Subject: signals: send_group_sigqueue: don't take tasklist_lock

handle_stop_signal() was changed, now send_group_sigqueue() doesn't need
tasklist_lock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 2a06f2441805..db442c59219e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1335,7 +1335,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
 
-	read_lock(&tasklist_lock);
 	/* Since it_lock is held, p->sighand cannot be NULL. */
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 
@@ -1344,7 +1343,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	__group_complete_signal(sig, p);
 
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	read_unlock(&tasklist_lock);
+
 	return ret;
 }
 
-- 
cgit v1.3-14-g43fede


From 6e65acba7ca8169e38ab55d62d52f29a75fb141f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:50 -0700
Subject: signals: move handle_stop_signal() into send_signal()

Move handle_stop_signal() into send_signal().  This factors out a couple of
callsites and allows us to do further unifications.

Also, with this change specific_send_sig_info() does handle_stop_signal().
Not that this is really important, we never send STOP/CONT via send_sig() and
friends, but still this looks more consistent.

The only (afaics) special case is get_signal_to_deliver().  If the traced task
dequeues SIGCONT, it can re-send it to itself after ptrace_stop() if the
signal was blocked by debugger.  In that case handle_stop_signal() is
unnecessary, but hopefully not a problem.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index db442c59219e..b3dedf1f9323 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -660,8 +660,10 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			struct sigpending *signals)
 {
-	struct sigqueue * q = NULL;
+	struct sigqueue *q;
 
+	assert_spin_locked(&t->sighand->siglock);
+	handle_stop_signal(sig, t);
 	/*
 	 * Short-circuit ignored signals and support queuing
 	 * exactly one non-rt signal, so that we can get more
@@ -766,9 +768,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 {
 	int ret;
 
-	BUG_ON(!irqs_disabled());
-	assert_spin_locked(&t->sighand->siglock);
-
 	ret = send_signal(sig, info, t, &t->pending);
 	if (ret <= 0)
 		return ret;
@@ -923,9 +922,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	int ret;
 
-	assert_spin_locked(&p->sighand->siglock);
-	handle_stop_signal(sig, p);
-
 	/*
 	 * Put this signal on the shared-pending queue, or fail with EAGAIN.
 	 * We always use the shared queue for process-wide signals,
@@ -2241,7 +2237,6 @@ static int do_tkill(int tgid, int pid, int sig)
 		 */
 		if (!error && sig && p->sighand) {
 			spin_lock_irq(&p->sighand->siglock);
-			handle_stop_signal(sig, p);
 			error = specific_send_sig_info(sig, &info, p);
 			spin_unlock_irq(&p->sighand->siglock);
 		}
-- 
cgit v1.3-14-g43fede


From 3547ff3aefbe092ca35506c60c02e2d17a4f2199 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:51 -0700
Subject: signals: do_tkill: don't use tasklist_lock

Convert do_tkill() to use rcu_read_lock() + lock_task_sighand() to avoid
taking tasklist lock.

Note that we don't return an error if lock_task_sighand() fails, we pretend
the task dies after receiving the signal.  Otherwise, we should fight with the
nasty races with mt-exec without having any advantage.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index b3dedf1f9323..13371d17358d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2219,6 +2219,7 @@ static int do_tkill(int tgid, int pid, int sig)
 	int error;
 	struct siginfo info;
 	struct task_struct *p;
+	unsigned long flags;
 
 	error = -ESRCH;
 	info.si_signo = sig;
@@ -2227,21 +2228,24 @@ static int do_tkill(int tgid, int pid, int sig)
 	info.si_pid = task_tgid_vnr(current);
 	info.si_uid = current->uid;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_task_by_vpid(pid);
 	if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
 		error = check_kill_permission(sig, &info, p);
 		/*
 		 * The null signal is a permissions and process existence
 		 * probe.  No signal is actually delivered.
+		 *
+		 * If lock_task_sighand() fails we pretend the task dies
+		 * after receiving the signal. The window is tiny, and the
+		 * signal is private anyway.
 		 */
-		if (!error && sig && p->sighand) {
-			spin_lock_irq(&p->sighand->siglock);
+		if (!error && sig && lock_task_sighand(p, &flags)) {
 			error = specific_send_sig_info(sig, &info, p);
-			spin_unlock_irq(&p->sighand->siglock);
+			unlock_task_sighand(p, &flags);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return error;
 }
-- 
cgit v1.3-14-g43fede


From 08d2c30ce98d274137f12b0a9b9c74137455922c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:51 -0700
Subject: signals: send_sig_info: don't take tasklist_lock

The comment in send_sig_info() is wrong, tasklist_lock can't help.

The caller must ensure the task can't go away, otherwise ->sighand can be NULL
even before we take the lock.

p->sighand could be changed by exec(), but I can't imagine how it is possible
to prevent exit(), but not exec().

Since the things seem to work, I assume all callers are correct.  However,
drm_vbl_send_signals() looks broken.  block_all_signals() which is solely used
by drm is definitely broken.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 13371d17358d..17859f0d8411 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1138,8 +1138,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
  */
 
 /*
- * These two are the most common entry points.  They send a signal
- * just to the specific thread.
+ * The caller must ensure the task can't exit.
  */
 int
 send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
@@ -1154,17 +1153,9 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	if (!valid_signal(sig))
 		return -EINVAL;
 
-	/*
-	 * We need the tasklist lock even for the specific
-	 * thread case (when we don't need to follow the group
-	 * lists) in order to avoid races with "p->sighand"
-	 * going away or changing from under us.
-	 */
-	read_lock(&tasklist_lock);  
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 	ret = specific_send_sig_info(sig, info, p);
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	read_unlock(&tasklist_lock);
 	return ret;
 }
 
-- 
cgit v1.3-14-g43fede


From db51aeccd7097ce19a522a4c5ff91c320f870e2b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:52 -0700
Subject: signals: microoptimize the usage of ->curr_target

Suggested by Roland McGrath.

Initialize signal->curr_target in copy_signal().  This way ->curr_target is
never == NULL, we can kill the check in __group_complete_signal's hot path.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c   | 2 +-
 kernel/signal.c | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 068ffe007529..2bb675af4de3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -892,7 +892,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->group_exit_code = 0;
 	sig->group_exit_task = NULL;
 	sig->group_stop_count = 0;
-	sig->curr_target = NULL;
+	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 17859f0d8411..0298bd3d431b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -863,10 +863,6 @@ __group_complete_signal(int sig, struct task_struct *p)
 		 * Otherwise try to find a suitable thread.
 		 */
 		t = signal->curr_target;
-		if (t == NULL)
-			/* restart balancing at this thread */
-			t = signal->curr_target = p;
-
 		while (!wants_signal(sig, t)) {
 			t = next_thread(t);
 			if (t == signal->curr_target)
-- 
cgit v1.3-14-g43fede


From 71f11dc025055cb2ef9226424f26b3287efadd26 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:53 -0700
Subject: signals: move the definition of __group_complete_signal() up

Move the unchanged definition of __group_complete_signal() so that send_signal
can see it.  To simplify the reading of the next patches.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 192 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 96 insertions(+), 96 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 0298bd3d431b..3479a118ba1c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -652,6 +652,102 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 	}
 }
 
+/*
+ * Test if P wants to take SIG.  After we've checked all threads with this,
+ * it's equivalent to finding no threads not blocking SIG.  Any threads not
+ * blocking SIG were ruled out because they are not running and already
+ * have pending signals.  Such threads will dequeue from the shared queue
+ * as soon as they're available, so putting the signal on the shared queue
+ * will be equivalent to sending it to one such thread.
+ */
+static inline int wants_signal(int sig, struct task_struct *p)
+{
+	if (sigismember(&p->blocked, sig))
+		return 0;
+	if (p->flags & PF_EXITING)
+		return 0;
+	if (sig == SIGKILL)
+		return 1;
+	if (task_is_stopped_or_traced(p))
+		return 0;
+	return task_curr(p) || !signal_pending(p);
+}
+
+static void
+__group_complete_signal(int sig, struct task_struct *p)
+{
+	struct signal_struct *signal = p->signal;
+	struct task_struct *t;
+
+	/*
+	 * Now find a thread we can wake up to take the signal off the queue.
+	 *
+	 * If the main thread wants the signal, it gets first crack.
+	 * Probably the least surprising to the average bear.
+	 */
+	if (wants_signal(sig, p))
+		t = p;
+	else if (thread_group_empty(p))
+		/*
+		 * There is just one thread and it does not need to be woken.
+		 * It will dequeue unblocked signals before it runs again.
+		 */
+		return;
+	else {
+		/*
+		 * Otherwise try to find a suitable thread.
+		 */
+		t = signal->curr_target;
+		while (!wants_signal(sig, t)) {
+			t = next_thread(t);
+			if (t == signal->curr_target)
+				/*
+				 * No thread needs to be woken.
+				 * Any eligible threads will see
+				 * the signal in the queue soon.
+				 */
+				return;
+		}
+		signal->curr_target = t;
+	}
+
+	/*
+	 * Found a killable thread.  If the signal will be fatal,
+	 * then start taking the whole group down immediately.
+	 */
+	if (sig_fatal(p, sig) && !(signal->flags & SIGNAL_GROUP_EXIT) &&
+	    !sigismember(&t->real_blocked, sig) &&
+	    (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
+		/*
+		 * This signal will be fatal to the whole group.
+		 */
+		if (!sig_kernel_coredump(sig)) {
+			/*
+			 * Start a group exit and wake everybody up.
+			 * This way we don't have other threads
+			 * running and doing things after a slower
+			 * thread has the fatal signal pending.
+			 */
+			signal->flags = SIGNAL_GROUP_EXIT;
+			signal->group_exit_code = sig;
+			signal->group_stop_count = 0;
+			t = p;
+			do {
+				sigaddset(&t->pending.signal, SIGKILL);
+				signal_wake_up(t, 1);
+			} while_each_thread(p, t);
+			return;
+		}
+	}
+
+	/*
+	 * The signal is already in the shared-pending queue.
+	 * Tell the chosen thread to wake up and dequeue it.
+	 */
+	signal_wake_up(t, sig == SIGKILL);
+	return;
+}
+
 static inline int legacy_queue(struct sigpending *signals, int sig)
 {
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
@@ -817,102 +913,6 @@ force_sig_specific(int sig, struct task_struct *t)
 	force_sig_info(sig, SEND_SIG_FORCED, t);
 }
 
-/*
- * Test if P wants to take SIG.  After we've checked all threads with this,
- * it's equivalent to finding no threads not blocking SIG.  Any threads not
- * blocking SIG were ruled out because they are not running and already
- * have pending signals.  Such threads will dequeue from the shared queue
- * as soon as they're available, so putting the signal on the shared queue
- * will be equivalent to sending it to one such thread.
- */
-static inline int wants_signal(int sig, struct task_struct *p)
-{
-	if (sigismember(&p->blocked, sig))
-		return 0;
-	if (p->flags & PF_EXITING)
-		return 0;
-	if (sig == SIGKILL)
-		return 1;
-	if (task_is_stopped_or_traced(p))
-		return 0;
-	return task_curr(p) || !signal_pending(p);
-}
-
-static void
-__group_complete_signal(int sig, struct task_struct *p)
-{
-	struct signal_struct *signal = p->signal;
-	struct task_struct *t;
-
-	/*
-	 * Now find a thread we can wake up to take the signal off the queue.
-	 *
-	 * If the main thread wants the signal, it gets first crack.
-	 * Probably the least surprising to the average bear.
-	 */
-	if (wants_signal(sig, p))
-		t = p;
-	else if (thread_group_empty(p))
-		/*
-		 * There is just one thread and it does not need to be woken.
-		 * It will dequeue unblocked signals before it runs again.
-		 */
-		return;
-	else {
-		/*
-		 * Otherwise try to find a suitable thread.
-		 */
-		t = signal->curr_target;
-		while (!wants_signal(sig, t)) {
-			t = next_thread(t);
-			if (t == signal->curr_target)
-				/*
-				 * No thread needs to be woken.
-				 * Any eligible threads will see
-				 * the signal in the queue soon.
-				 */
-				return;
-		}
-		signal->curr_target = t;
-	}
-
-	/*
-	 * Found a killable thread.  If the signal will be fatal,
-	 * then start taking the whole group down immediately.
-	 */
-	if (sig_fatal(p, sig) && !(signal->flags & SIGNAL_GROUP_EXIT) &&
-	    !sigismember(&t->real_blocked, sig) &&
-	    (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
-		/*
-		 * This signal will be fatal to the whole group.
-		 */
-		if (!sig_kernel_coredump(sig)) {
-			/*
-			 * Start a group exit and wake everybody up.
-			 * This way we don't have other threads
-			 * running and doing things after a slower
-			 * thread has the fatal signal pending.
-			 */
-			signal->flags = SIGNAL_GROUP_EXIT;
-			signal->group_exit_code = sig;
-			signal->group_stop_count = 0;
-			t = p;
-			do {
-				sigaddset(&t->pending.signal, SIGKILL);
-				signal_wake_up(t, 1);
-			} while_each_thread(p, t);
-			return;
-		}
-	}
-
-	/*
-	 * The signal is already in the shared-pending queue.
-	 * Tell the chosen thread to wake up and dequeue it.
-	 */
-	signal_wake_up(t, sig == SIGKILL);
-	return;
-}
-
 int
 __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-- 
cgit v1.3-14-g43fede


From 2ca3515aa57224edf0151e05a8c9f21a76bf5957 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:54 -0700
Subject: signals: change send_signal/do_send_sigqueue to take "boolean group"
 parameter

send_signal() is used either with ->pending or with ->signal->shared_pending.
Change it to take "int group" instead, this argument will be re-used later.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 3479a118ba1c..f2fc3a9ea8fc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -754,18 +754,21 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
 }
 
 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
-			struct sigpending *signals)
+			int group)
 {
+	struct sigpending *pending;
 	struct sigqueue *q;
 
 	assert_spin_locked(&t->sighand->siglock);
 	handle_stop_signal(sig, t);
+
+	pending = group ? &t->signal->shared_pending : &t->pending;
 	/*
 	 * Short-circuit ignored signals and support queuing
 	 * exactly one non-rt signal, so that we can get more
 	 * detailed information about the cause of the signal.
 	 */
-	if (sig_ignored(t, sig) || legacy_queue(signals, sig))
+	if (sig_ignored(t, sig) || legacy_queue(pending, sig))
 		return 0;
 
 	/*
@@ -793,7 +796,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 					     (is_si_special(info) ||
 					      info->si_code >= 0)));
 	if (q) {
-		list_add_tail(&q->list, &signals->list);
+		list_add_tail(&q->list, &pending->list);
 		switch ((unsigned long) info) {
 		case (unsigned long) SEND_SIG_NOINFO:
 			q->info.si_signo = sig;
@@ -823,7 +826,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	}
 
 out_set:
-	sigaddset(&signals->signal, sig);
+	sigaddset(&pending->signal, sig);
 	return 1;
 }
 
@@ -864,7 +867,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 {
 	int ret;
 
-	ret = send_signal(sig, info, t, &t->pending);
+	ret = send_signal(sig, info, t, 0);
 	if (ret <= 0)
 		return ret;
 
@@ -923,7 +926,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	 * We always use the shared queue for process-wide signals,
 	 * to avoid several races.
 	 */
-	ret = send_signal(sig, info, p, &p->signal->shared_pending);
+	ret = send_signal(sig, info, p, 1);
 	if (ret <= 0)
 		return ret;
 
@@ -1258,8 +1261,10 @@ void sigqueue_free(struct sigqueue *q)
 }
 
 static int do_send_sigqueue(int sig, struct sigqueue *q, struct task_struct *t,
-				struct sigpending *pending)
+				int group)
 {
+	struct sigpending *pending;
+
 	handle_stop_signal(sig, t);
 
 	if (unlikely(!list_empty(&q->list))) {
@@ -1277,8 +1282,10 @@ static int do_send_sigqueue(int sig, struct sigqueue *q, struct task_struct *t,
 		return 1;
 
 	signalfd_notify(t, sig);
+	pending = group ? &t->signal->shared_pending : &t->pending;
 	list_add_tail(&q->list, &pending->list);
 	sigaddset(&pending->signal, sig);
+
 	return 0;
 }
 
@@ -1300,7 +1307,7 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	if (!likely(lock_task_sighand(p, &flags)))
 		goto out_err;
 
-	ret = do_send_sigqueue(sig, q, p, &p->pending);
+	ret = do_send_sigqueue(sig, q, p, 0);
 
 	if (!sigismember(&p->blocked, sig))
 		signal_wake_up(p, sig == SIGKILL);
@@ -1321,7 +1328,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	/* Since it_lock is held, p->sighand cannot be NULL. */
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 
-	ret = do_send_sigqueue(sig, q, p, &p->signal->shared_pending);
+	ret = do_send_sigqueue(sig, q, p, 1);
 
 	__group_complete_signal(sig, p);
 
-- 
cgit v1.3-14-g43fede


From 5fcd835bf8c2cde06404559b1904e2f1dfcb4567 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:55 -0700
Subject: signals: use __group_complete_signal() for the specific signals too

Based on Pavel Emelyanov's suggestion.

Rename __group_complete_signal() to complete_signal() and use it to process
the specific signals too.  To do this we simply add the "int group" argument.

This allows us to greatly simply the signal-sending code and adds a useful
behaviour change.  We can avoid the unneeded wakeups for the private signals
because wants_signal() is more clever than sigismember(blocked), but more
importantly we now take into account the fatal specific signals too.

The latter allows us to kill some subtle checks in handle_stop_signal() and
makes the specific/group signal's behaviour more consistent.  For example,
currently sigtimedwait(FATAL_SIGNAL) behaves differently depending on was the
signal sent by kill() or tkill() if the signal was not blocked.

And.  This allows us to tweak/fix the behaviour when the specific signal is
sent to the dying/dead ->group_leader.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f2fc3a9ea8fc..fc1cb03c241c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -673,8 +673,7 @@ static inline int wants_signal(int sig, struct task_struct *p)
 	return task_curr(p) || !signal_pending(p);
 }
 
-static void
-__group_complete_signal(int sig, struct task_struct *p)
+static void complete_signal(int sig, struct task_struct *p, int group)
 {
 	struct signal_struct *signal = p->signal;
 	struct task_struct *t;
@@ -687,7 +686,7 @@ __group_complete_signal(int sig, struct task_struct *p)
 	 */
 	if (wants_signal(sig, p))
 		t = p;
-	else if (thread_group_empty(p))
+	else if (!group || thread_group_empty(p))
 		/*
 		 * There is just one thread and it does not need to be woken.
 		 * It will dequeue unblocked signals before it runs again.
@@ -871,8 +870,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	if (ret <= 0)
 		return ret;
 
-	if (!sigismember(&t->blocked, sig))
-		signal_wake_up(t, sig == SIGKILL);
+	complete_signal(sig, t, 0);
 	return 0;
 }
 
@@ -930,7 +928,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	if (ret <= 0)
 		return ret;
 
-	__group_complete_signal(sig, p);
+	complete_signal(sig, p, 1);
 	return 0;
 }
 
@@ -1309,8 +1307,7 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 
 	ret = do_send_sigqueue(sig, q, p, 0);
 
-	if (!sigismember(&p->blocked, sig))
-		signal_wake_up(p, sig == SIGKILL);
+	complete_signal(sig, p, 0);
 
 	unlock_task_sighand(p, &flags);
 out_err:
@@ -1330,7 +1327,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 
 	ret = do_send_sigqueue(sig, q, p, 1);
 
-	__group_complete_signal(sig, p);
+	complete_signal(sig, p, 1);
 
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 
-- 
cgit v1.3-14-g43fede


From 4cd4b6d4e0372075f846feb85aea016cbdbfec4c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:52:55 -0700
Subject: signals: fold complete_signal() into send_signal/do_send_sigqueue

Factor out complete_signal() callsites.  This change completely unifies the
helpers sending the specific/group signals.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 46 +++++++++++-----------------------------------
 1 file changed, 11 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index fc1cb03c241c..87424f7a4f3d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -826,7 +826,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 
 out_set:
 	sigaddset(&pending->signal, sig);
-	return 1;
+	complete_signal(sig, t, group);
+	return 0;
 }
 
 int print_fatal_signals;
@@ -861,17 +862,16 @@ static int __init setup_print_fatal_signals(char *str)
 
 __setup("print-fatal-signals=", setup_print_fatal_signals);
 
+int
+__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+	return send_signal(sig, info, p, 1);
+}
+
 static int
 specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 {
-	int ret;
-
-	ret = send_signal(sig, info, t, 0);
-	if (ret <= 0)
-		return ret;
-
-	complete_signal(sig, t, 0);
-	return 0;
+	return send_signal(sig, info, t, 0);
 }
 
 /*
@@ -914,24 +914,6 @@ force_sig_specific(int sig, struct task_struct *t)
 	force_sig_info(sig, SEND_SIG_FORCED, t);
 }
 
-int
-__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
-{
-	int ret;
-
-	/*
-	 * Put this signal on the shared-pending queue, or fail with EAGAIN.
-	 * We always use the shared queue for process-wide signals,
-	 * to avoid several races.
-	 */
-	ret = send_signal(sig, info, p, 1);
-	if (ret <= 0)
-		return ret;
-
-	complete_signal(sig, p, 1);
-	return 0;
-}
-
 /*
  * Nuke all other threads in the group.
  */
@@ -1263,6 +1245,7 @@ static int do_send_sigqueue(int sig, struct sigqueue *q, struct task_struct *t,
 {
 	struct sigpending *pending;
 
+	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
 	handle_stop_signal(sig, t);
 
 	if (unlikely(!list_empty(&q->list))) {
@@ -1283,6 +1266,7 @@ static int do_send_sigqueue(int sig, struct sigqueue *q, struct task_struct *t,
 	pending = group ? &t->signal->shared_pending : &t->pending;
 	list_add_tail(&q->list, &pending->list);
 	sigaddset(&pending->signal, sig);
+	complete_signal(sig, t, group);
 
 	return 0;
 }
@@ -1292,8 +1276,6 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	unsigned long flags;
 	int ret = -1;
 
-	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-
 	/*
 	 * The rcu based delayed sighand destroy makes it possible to
 	 * run this without tasklist lock held. The task struct itself
@@ -1307,8 +1289,6 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 
 	ret = do_send_sigqueue(sig, q, p, 0);
 
-	complete_signal(sig, p, 0);
-
 	unlock_task_sighand(p, &flags);
 out_err:
 	return ret;
@@ -1320,15 +1300,11 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	unsigned long flags;
 	int ret;
 
-	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-
 	/* Since it_lock is held, p->sighand cannot be NULL. */
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 
 	ret = do_send_sigqueue(sig, q, p, 1);
 
-	complete_signal(sig, p, 1);
-
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 
 	return ret;
-- 
cgit v1.3-14-g43fede


From e62e6650e99a3dffcd0bf0d063cd818fbc13fa95 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:56 -0700
Subject: signals: unify send_sigqueue/send_group_sigqueue completely

Suggested by Pavel Emelyanov.

send_sigqueue/send_group_sigqueue are only differ in how they lock ->siglock.
Unify them.  send_group_sigqueue() uses spin_lock() because it knows the task
can't exit, but in that case lock_task_sighand() can't fail and doesn't hurt.

Note that the "sig" argument is ignored, it is always equal to ->si_signo.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 58 +++++++++++++++++++++------------------------------------
 1 file changed, 21 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 87424f7a4f3d..367c6662b12f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1240,14 +1240,27 @@ void sigqueue_free(struct sigqueue *q)
 	__sigqueue_free(q);
 }
 
-static int do_send_sigqueue(int sig, struct sigqueue *q, struct task_struct *t,
+static int do_send_sigqueue(struct sigqueue *q, struct task_struct *t,
 				int group)
 {
+	int sig = q->info.si_signo;
 	struct sigpending *pending;
+	unsigned long flags;
+	int ret;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+
+	ret = -1;
+	if (!likely(lock_task_sighand(t, &flags)))
+		goto ret;
+
 	handle_stop_signal(sig, t);
 
+	ret = 1;
+	if (sig_ignored(t, sig))
+		goto out;
+
+	ret = 0;
 	if (unlikely(!list_empty(&q->list))) {
 		/*
 		 * If an SI_TIMER entry is already queue just increment
@@ -1256,58 +1269,29 @@ static int do_send_sigqueue(int sig, struct sigqueue *q, struct task_struct *t,
 
 		BUG_ON(q->info.si_code != SI_TIMER);
 		q->info.si_overrun++;
-		return 0;
+		goto out;
 	}
 
-	if (sig_ignored(t, sig))
-		return 1;
-
 	signalfd_notify(t, sig);
 	pending = group ? &t->signal->shared_pending : &t->pending;
 	list_add_tail(&q->list, &pending->list);
 	sigaddset(&pending->signal, sig);
 	complete_signal(sig, t, group);
-
-	return 0;
+out:
+	unlock_task_sighand(t, &flags);
+ret:
+	return ret;
 }
 
 int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 {
-	unsigned long flags;
-	int ret = -1;
-
-	/*
-	 * The rcu based delayed sighand destroy makes it possible to
-	 * run this without tasklist lock held. The task struct itself
-	 * cannot go away as create_timer did get_task_struct().
-	 *
-	 * We return -1, when the task is marked exiting, so
-	 * posix_timer_event can redirect it to the group leader
-	 */
-	if (!likely(lock_task_sighand(p, &flags)))
-		goto out_err;
-
-	ret = do_send_sigqueue(sig, q, p, 0);
-
-	unlock_task_sighand(p, &flags);
-out_err:
-	return ret;
+	return do_send_sigqueue(q, p, 0);
 }
 
 int
 send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 {
-	unsigned long flags;
-	int ret;
-
-	/* Since it_lock is held, p->sighand cannot be NULL. */
-	spin_lock_irqsave(&p->sighand->siglock, flags);
-
-	ret = do_send_sigqueue(sig, q, p, 1);
-
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-
-	return ret;
+	return do_send_sigqueue(q, p, 1);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From ac5c215383f43a106ba4ef298126bf78c126f5e9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:57 -0700
Subject: signals: join send_sigqueue() with send_group_sigqueue()

We export send_sigqueue() and send_group_sigqueue() for the only user,
posix_timer_event().  This is a bit silly, because both are just trivial
helpers on top of do_send_sigqueue() and because the we pass the unused
.si_signo parameter.

Kill them both, rename do_send_sigqueue() to send_sigqueue(), and export it.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  3 +--
 kernel/posix-timers.c |  6 ++----
 kernel/signal.c       | 15 +--------------
 3 files changed, 4 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ef5615270342..0917b3df12d5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1751,8 +1751,7 @@ extern void zap_other_threads(struct task_struct *p);
 extern int kill_proc(pid_t, int, int);
 extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
-extern int send_sigqueue(int, struct sigqueue *,  struct task_struct *);
-extern int send_group_sigqueue(int, struct sigqueue *,  struct task_struct *);
+extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 8476956ffd92..dbd8398ddb0b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -310,8 +310,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
 
 	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
 		struct task_struct *leader;
-		int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
-					timr->it_process);
+		int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
 
 		if (likely(ret >= 0))
 			return ret;
@@ -322,8 +321,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
 		timr->it_process = leader;
 	}
 
-	return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
-				   timr->it_process);
+	return send_sigqueue(timr->sigq, timr->it_process, 1);
 }
 EXPORT_SYMBOL_GPL(posix_timer_event);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 367c6662b12f..d52a1fe921fa 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1240,8 +1240,7 @@ void sigqueue_free(struct sigqueue *q)
 	__sigqueue_free(q);
 }
 
-static int do_send_sigqueue(struct sigqueue *q, struct task_struct *t,
-				int group)
+int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
 {
 	int sig = q->info.si_signo;
 	struct sigpending *pending;
@@ -1266,7 +1265,6 @@ static int do_send_sigqueue(struct sigqueue *q, struct task_struct *t,
 		 * If an SI_TIMER entry is already queue just increment
 		 * the overrun count.
 		 */
-
 		BUG_ON(q->info.si_code != SI_TIMER);
 		q->info.si_overrun++;
 		goto out;
@@ -1283,17 +1281,6 @@ ret:
 	return ret;
 }
 
-int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
-{
-	return do_send_sigqueue(q, p, 0);
-}
-
-int
-send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
-{
-	return do_send_sigqueue(q, p, 1);
-}
-
 /*
  * Wake up any threads in the parent blocked in wait* syscalls.
  */
-- 
cgit v1.3-14-g43fede


From 34c8f07b9ac499a807918eda377193a55f64f8df Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:58 -0700
Subject: signals: handle_stop_signal: don't worry about SIGKILL

handle_stop_signal() clears SIGNAL_STOP_DEQUEUED when sig == SIGKILL.  Remove
this nasty special case.  It was needed to prevent the race with group stop
and exit caused by thread-specific SIGKILL.  Now that we use complete_signal()
for private signals too this is not needed, complete_signal() will notice
SIGKILL and abort the soon-to-begin group stop.

Except: the target thread is dead (has PF_EXITING).  But in that case we
should not just clear SIGNAL_STOP_DEQUEUED and nothing more.  We should either
kill the whole thread group, or silently ignore the signal.

I suspect we are not right wrt zombie leaders, but this is another issue which
and should be fixed separately.  Note that this check can't abort the group
stop if it was already started/finished, this check only adds a subtle side
effect if we race with the thread which has already dequeued sig_kernel_stop()
signal and temporary released ->siglock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index d52a1fe921fa..0a873279393c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -643,12 +643,6 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			 */
 			signal->flags &= ~SIGNAL_STOP_DEQUEUED;
 		}
-	} else if (sig == SIGKILL) {
-		/*
-		 * Make sure that any pending stop signal already dequeued
-		 * is undone by the wakeup for SIGKILL.
-		 */
-		signal->flags &= ~SIGNAL_STOP_DEQUEUED;
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From 2dce81bff28dceb2153c901883a56f278d91db65 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:58 -0700
Subject: signals: cleanup the usage of print_fatal_signal()

Move the callsite of print_fatal_signal() down, under "if
(sig_kernel_coredump(signr))", so we don't need to check signr != SIGKILL.

We are only interested in the sig_kernel_coredump() signals anyway, and due to
the previous changes we almost never can see other fatal signals here except
SIGKILL.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 0a873279393c..0db1d93c4d68 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1787,9 +1787,10 @@ relock:
 		 * Anything else is fatal, maybe with a core dump.
 		 */
 		current->flags |= PF_SIGNALED;
-		if ((signr != SIGKILL) && print_fatal_signals)
-			print_fatal_signal(regs, signr);
+
 		if (sig_kernel_coredump(signr)) {
+			if (print_fatal_signals)
+				print_fatal_signal(regs, signr);
 			/*
 			 * If it was able to dump core, this kills all
 			 * other threads in the group and synchronizes with
-- 
cgit v1.3-14-g43fede


From 7e695a5ef5c1c768d7feb75cc61e42f13d763623 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:52:59 -0700
Subject: signals: fold sig_ignored() into handle_stop_signal()

Rename handle_stop_signal() to prepare_signal(), make it return a boolean, and
move the callsites of sig_ignored() into it.

No functional changes for now.  But it would be nice to factor out the "should
we drop this signal" checks as much as possible, before we try to fix the bugs
with the sub-namespace init's signals (actually the global /sbin/init has some
problems with signals too).

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 0db1d93c4d68..359c4de7c772 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -558,24 +558,25 @@ static int check_kill_permission(int sig, struct siginfo *info,
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
 
 /*
- * Handle magic process-wide effects of stop/continue signals.
- * Unlike the signal actions, these happen immediately at signal-generation
+ * Handle magic process-wide effects of stop/continue signals. Unlike
+ * the signal actions, these happen immediately at signal-generation
  * time regardless of blocking, ignoring, or handling.  This does the
  * actual continuing for SIGCONT, but not the actual stopping for stop
- * signals.  The process stop is done as a signal action for SIG_DFL.
+ * signals. The process stop is done as a signal action for SIG_DFL.
+ *
+ * Returns true if the signal should be actually delivered, otherwise
+ * it should be dropped.
  */
-static void handle_stop_signal(int sig, struct task_struct *p)
+static int prepare_signal(int sig, struct task_struct *p)
 {
 	struct signal_struct *signal = p->signal;
 	struct task_struct *t;
 
-	if (signal->flags & SIGNAL_GROUP_EXIT)
+	if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
 		/*
-		 * The process is in the middle of dying already.
+		 * The process is in the middle of dying, nothing to do.
 		 */
-		return;
-
-	if (sig_kernel_stop(sig)) {
+	} else if (sig_kernel_stop(sig)) {
 		/*
 		 * This is a stop signal.  Remove SIGCONT from all queues.
 		 */
@@ -644,6 +645,8 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			signal->flags &= ~SIGNAL_STOP_DEQUEUED;
 		}
 	}
+
+	return !sig_ignored(p, sig);
 }
 
 /*
@@ -753,7 +756,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	struct sigqueue *q;
 
 	assert_spin_locked(&t->sighand->siglock);
-	handle_stop_signal(sig, t);
+	if (!prepare_signal(sig, t))
+		return 0;
 
 	pending = group ? &t->signal->shared_pending : &t->pending;
 	/*
@@ -761,7 +765,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	 * exactly one non-rt signal, so that we can get more
 	 * detailed information about the cause of the signal.
 	 */
-	if (sig_ignored(t, sig) || legacy_queue(pending, sig))
+	if (legacy_queue(pending, sig))
 		return 0;
 
 	/*
@@ -1247,10 +1251,8 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
 	if (!likely(lock_task_sighand(t, &flags)))
 		goto ret;
 
-	handle_stop_signal(sig, t);
-
-	ret = 1;
-	if (sig_ignored(t, sig))
+	ret = 1; /* the signal is ignored */
+	if (!prepare_signal(sig, t))
 		goto out;
 
 	ret = 0;
-- 
cgit v1.3-14-g43fede


From 021e1ae3d85a76ce962a300c96813f04ae50c87c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:00 -0700
Subject: signals: document CLD_CONTINUED notification mechanics

A couple of small comments about how CLD_CONTINUED notification works.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 359c4de7c772..8423867f7d8f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -633,6 +633,11 @@ static int prepare_signal(int sig, struct task_struct *p)
 			why |= SIGNAL_CLD_STOPPED;
 
 		if (why) {
+			/*
+			 * The first thread which returns from finish_stop()
+			 * will take ->siglock, notice SIGNAL_CLD_MASK, and
+			 * notify its parent. See get_signal_to_deliver().
+			 */
 			signal->flags = why | SIGNAL_STOP_CONTINUED;
 			signal->group_stop_count = 0;
 			signal->group_exit_code = 0;
@@ -1694,7 +1699,11 @@ relock:
 	try_to_freeze();
 
 	spin_lock_irq(&sighand->siglock);
-
+	/*
+	 * Every stopped thread goes here after wakeup. Check to see if
+	 * we should notify the parent, prepare_signal(SIGCONT) encodes
+	 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
+	 */
 	if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
 		int why = (signal->flags & SIGNAL_STOP_CONTINUED)
 				? CLD_CONTINUED : CLD_STOPPED;
-- 
cgit v1.3-14-g43fede


From 53c30337f2c61aff6eecf2a446e839641172f9bd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:00 -0700
Subject: signals: send_signal: be paranoid about signalfd_notify()

send_signal() shouldn't call signalfd_notify() if it then fails with -EAGAIN.
Harmless, just a paranoid cleanup.

Also remove the comment.  It is obsolete, signalfd_notify() was simplified and
does a simple wakeup.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8423867f7d8f..251cc13720bd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -772,13 +772,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	 */
 	if (legacy_queue(pending, sig))
 		return 0;
-
-	/*
-	 * Deliver the signal to listening signalfds. This must be called
-	 * with the sighand lock held.
-	 */
-	signalfd_notify(t, sig);
-
 	/*
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
 	 * or SIGKILL.
@@ -828,6 +821,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	}
 
 out_set:
+	signalfd_notify(t, sig);
 	sigaddset(&pending->signal, sig);
 	complete_signal(sig, t, group);
 	return 0;
-- 
cgit v1.3-14-g43fede


From 2e2ba22ea4fd4bb85f0fa37c521066db6775cbef Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:01 -0700
Subject: signals: check_kill_permission: check session under tasklist_lock

This wasn't documented, but as Atsushi Tsuji pointed out
check_kill_permission() needs tasklist_lock for task_session_nr().  I missed
this fact when removed tasklist from the callers.

Change check_kill_permission() to take tasklist_lock for the SIGCONT case.
Re-order security checks so that we take tasklist_lock only if/when it is
actually needed.  This is a minimal fix for now, tasklist will be removed
later.

Also change the code to use task_session() instead of task_session_nr().

Also, remove the SIGCONT check from cap_task_kill(), it is bogus (and the
whole function is bogus.  Serge, Eric, why it is still alive?).

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Atsushi Tsuji <a-tsuji@bk.jp.nec.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 251cc13720bd..24be82c0aae3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -533,6 +533,7 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
 static int check_kill_permission(int sig, struct siginfo *info,
 				 struct task_struct *t)
 {
+	struct pid *sid;
 	int error;
 
 	if (!valid_signal(sig))
@@ -545,11 +546,24 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (error)
 		return error;
 
-	if (((sig != SIGCONT) || (task_session_nr(current) != task_session_nr(t)))
-	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
-	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
-	    && !capable(CAP_KILL))
-		return -EPERM;
+	if ((current->euid ^ t->suid) && (current->euid ^ t->uid) &&
+	    (current->uid  ^ t->suid) && (current->uid  ^ t->uid) &&
+	    !capable(CAP_KILL)) {
+		switch (sig) {
+		case SIGCONT:
+			read_lock(&tasklist_lock);
+			sid = task_session(t);
+			read_unlock(&tasklist_lock);
+			/*
+			 * We don't return the error if sid == NULL. The
+			 * task was unhashed, the caller must notice this.
+			 */
+			if (!sid || sid == task_session(current))
+				break;
+		default:
+			return -EPERM;
+		}
+	}
 
 	return security_task_kill(t, info, sig, 0);
 }
-- 
cgit v1.3-14-g43fede


From 193191035ad6268db9f561e81e3474b8be89a5ba Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:02 -0700
Subject: signals: check_kill_permission: remove tasklist_lock

Now that task_session() can't return a false NULL, check_kill_permission()
doesn't need tasklist_lock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 24be82c0aae3..02ef3548aeb0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -551,9 +551,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	    !capable(CAP_KILL)) {
 		switch (sig) {
 		case SIGCONT:
-			read_lock(&tasklist_lock);
 			sid = task_session(t);
-			read_unlock(&tasklist_lock);
 			/*
 			 * We don't return the error if sid == NULL. The
 			 * task was unhashed, the caller must notice this.
-- 
cgit v1.3-14-g43fede


From fae5fa44f1fd079ffbed8e0add929dd7bbd1347f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:03 -0700
Subject: signals: fix /sbin/init protection from unwanted signals

The global init has a lot of long standing problems with the unhandled fatal
signals.

	- The "is_global_init(current)" check in get_signal_to_deliver()
	  protects only the main thread. Sub-thread can dequee the fatal
	  signal and shutdown the whole thread group except the main thread.
	  If it dequeues SIGSTOP /sbin/init will be stopped, this is not
	  right too. Note that we can't use is_global_init(->group_leader),
	  this breaks exec and this can't solve other problems we have.

	- Even if afterwards ignored, the fatal signals sets SIGNAL_GROUP_EXIT
	  on delivery. This breaks exec, has other bad implications, and this
	  is just wrong.

Introduce the new SIGNAL_UNKILLABLE flag to fix these problems.  It also helps
to solve some other problems addressed by the subsequent patches.

Currently we use this flag for the global init only, but it could also be used
by kthreads and (perhaps) by the sub-namespace inits.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 2 ++
 init/main.c           | 2 ++
 kernel/signal.c       | 9 ++++++---
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0917b3df12d5..fe970cdca83c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -561,6 +561,8 @@ struct signal_struct {
 #define SIGNAL_CLD_CONTINUED	0x00000020
 #define SIGNAL_CLD_MASK		(SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
 
+#define SIGNAL_UNKILLABLE	0x00000040 /* for init: ignore fatal signals */
+
 /* If true, all threads except ->group_exit_task have pending SIGKILL */
 static inline int signal_group_exit(const struct signal_struct *sig)
 {
diff --git a/init/main.c b/init/main.c
index 624266b524d4..1f4406477f83 100644
--- a/init/main.c
+++ b/init/main.c
@@ -802,6 +802,8 @@ static int noinline init_post(void)
 	(void) sys_dup(0);
 	(void) sys_dup(0);
 
+	current->signal->flags |= SIGNAL_UNKILLABLE;
+
 	if (ramdisk_execute_command) {
 		run_init_process(ramdisk_execute_command);
 		printk(KERN_WARNING "Failed to execute %s\n",
diff --git a/kernel/signal.c b/kernel/signal.c
index 02ef3548aeb0..646a8765696a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -728,7 +728,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	 * Found a killable thread.  If the signal will be fatal,
 	 * then start taking the whole group down immediately.
 	 */
-	if (sig_fatal(p, sig) && !(signal->flags & SIGNAL_GROUP_EXIT) &&
+	if (sig_fatal(p, sig) &&
+	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
 	    !sigismember(&t->real_blocked, sig) &&
 	    (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
 		/*
@@ -1615,7 +1616,8 @@ static int do_signal_stop(int signr)
 	} else {
 		struct task_struct *t;
 
-		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
+		if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
+					 != SIGNAL_STOP_DEQUEUED) ||
 		    unlikely(signal_group_exit(sig)))
 			return 0;
 		/*
@@ -1761,7 +1763,8 @@ relock:
 		/*
 		 * Global init gets no signals it doesn't want.
 		 */
-		if (is_global_init(current))
+		if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
+		    !signal_group_exit(signal))
 			continue;
 
 		if (sig_kernel_stop(signr)) {
-- 
cgit v1.3-14-g43fede


From 80fe728d593e3a048a56610de932919f7d6d968a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:05 -0700
Subject: signals: allow the kernel to actually kill /sbin/init

Currently the buggy /sbin/init hangs if SIGSEGV/etc happens.  The kernel sends
the signal, init dequeues it and ignores, returns from the exception, repeats
the faulting instruction, and so on forever.

Imho, such a behaviour is not good.  I think that the explicit loud death of
the buggy /sbin/init is better than the silent hang.

Change force_sig_info() to clear SIGNAL_UNKILLABLE when the task should be
really killed.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 646a8765696a..9ac737e53df1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -892,7 +892,8 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
  * since we do not want to have a signal handler that was blocked
  * be invoked when user space had explicitly blocked it.
  *
- * We don't want to have recursive SIGSEGV's etc, for example.
+ * We don't want to have recursive SIGSEGV's etc, for example,
+ * that is why we also clear SIGNAL_UNKILLABLE.
  */
 int
 force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
@@ -912,6 +913,8 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 			recalc_sigpending_and_wake(t);
 		}
 	}
+	if (action->sa.sa_handler == SIG_DFL)
+		t->signal->flags &= ~SIGNAL_UNKILLABLE;
 	ret = specific_send_sig_info(sig, info, t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 
-- 
cgit v1.3-14-g43fede


From 4e4c22c71144c1b2e22c257ec6cf08ccb5be1165 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Apr 2008 00:53:06 -0700
Subject: signals: add set_restore_sigmask

This adds the set_restore_sigmask() inline in <linux/thread_info.h> and
replaces every set_thread_flag(TIF_RESTORE_SIGMASK) with a call to it.  No
change, but abstracts the details of the flag protocol from all the calls.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c                 |  6 +++---
 fs/eventpoll.c              |  3 +--
 fs/select.c                 |  4 ++--
 include/linux/thread_info.h | 15 ++++++++++++++-
 kernel/compat.c             |  3 +--
 kernel/signal.c             |  2 +-
 6 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/fs/compat.c b/fs/compat.c
index 2ce4456aad30..9964d542ae9e 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1720,7 +1720,7 @@ sticky:
 		if (sigmask) {
 			memcpy(&current->saved_sigmask, &sigsaved,
 					sizeof(sigsaved));
-			set_thread_flag(TIF_RESTORE_SIGMASK);
+			set_restore_sigmask();
 		}
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -1791,7 +1791,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		if (sigmask) {
 			memcpy(&current->saved_sigmask, &sigsaved,
 				sizeof(sigsaved));
-			set_thread_flag(TIF_RESTORE_SIGMASK);
+			set_restore_sigmask();
 		}
 		ret = -ERESTARTNOHAND;
 	} else if (sigmask)
@@ -2117,7 +2117,7 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
 		if (err == -EINTR) {
 			memcpy(&current->saved_sigmask, &sigsaved,
 			       sizeof(sigsaved));
-			set_thread_flag(TIF_RESTORE_SIGMASK);
+			set_restore_sigmask();
 		} else
 			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 	}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0d237182d721..71af2fc0041e 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1279,7 +1279,7 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
 		if (error == -EINTR) {
 			memcpy(&current->saved_sigmask, &sigsaved,
 			       sizeof(sigsaved));
-			set_thread_flag(TIF_RESTORE_SIGMASK);
+			set_restore_sigmask();
 		} else
 			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 	}
@@ -1309,4 +1309,3 @@ static int __init eventpoll_init(void)
 	return 0;
 }
 fs_initcall(eventpoll_init);
-
diff --git a/fs/select.c b/fs/select.c
index 00f58c5c7e05..32ce2b32fad1 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -498,7 +498,7 @@ sticky:
 		if (sigmask) {
 			memcpy(&current->saved_sigmask, &sigsaved,
 					sizeof(sigsaved));
-			set_thread_flag(TIF_RESTORE_SIGMASK);
+			set_restore_sigmask();
 		}
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -805,7 +805,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		if (sigmask) {
 			memcpy(&current->saved_sigmask, &sigsaved,
 					sizeof(sigsaved));
-			set_thread_flag(TIF_RESTORE_SIGMASK);
+			set_restore_sigmask();
 		}
 		ret = -ERESTARTNOHAND;
 	} else if (sigmask)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index accd7bad35b0..43d8162c696e 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -92,6 +92,19 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 #define set_need_resched()	set_thread_flag(TIF_NEED_RESCHED)
 #define clear_need_resched()	clear_thread_flag(TIF_NEED_RESCHED)
 
-#endif
+#ifdef TIF_RESTORE_SIGMASK
+/**
+ * set_restore_sigmask() - make sure saved_sigmask processing gets done
+ *
+ * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code
+ * will run before returning to user mode, to process the flag.
+ */
+static inline void set_restore_sigmask(void)
+{
+	set_thread_flag(TIF_RESTORE_SIGMASK);
+}
+#endif	/* TIF_RESTORE_SIGMASK */
+
+#endif	/* __KERNEL__ */
 
 #endif /* _LINUX_THREAD_INFO_H */
diff --git a/kernel/compat.c b/kernel/compat.c
index e1ef04870c2a..4a856a3643bb 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -898,7 +898,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 
 	current->state = TASK_INTERRUPTIBLE;
 	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
+	set_restore_sigmask();
 	return -ERESTARTNOHAND;
 }
 #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
@@ -1080,4 +1080,3 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
 
 	return 0;
 }
-
diff --git a/kernel/signal.c b/kernel/signal.c
index 9ac737e53df1..72bb4f51f963 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2541,7 +2541,7 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
 
 	current->state = TASK_INTERRUPTIBLE;
 	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
+	set_restore_sigmask();
 	return -ERESTARTNOHAND;
 }
 #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
-- 
cgit v1.3-14-g43fede


From d839fd4d2e95a5fbc4d50aa9d17eed6a5f2094e6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:11 -0700
Subject: ptrace: introduce task_detached() helper

exit.c has numerous "->exit_signal == -1" comparisons, this check is subtle
and deserves a helper.  Imho makes the code more parseable for humans.  At
least it's surely more greppable.

Also, a couple of whitespace cleanups. No functional changes.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 6d019aa8522e..4035d391a0d3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,11 @@
 
 static void exit_mm(struct task_struct * tsk);
 
+static inline int task_detached(struct task_struct *p)
+{
+	return p->exit_signal == -1;
+}
+
 static void __unhash_process(struct task_struct *p)
 {
 	nr_threads--;
@@ -160,7 +165,7 @@ repeat:
 	zap_leader = 0;
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
-		BUG_ON(leader->exit_signal == -1);
+		BUG_ON(task_detached(leader));
 		do_notify_parent(leader, leader->exit_signal);
 		/*
 		 * If we were the last child thread and the leader has
@@ -170,7 +175,7 @@ repeat:
 		 * do_notify_parent() will have marked it self-reaping in
 		 * that case.
 		 */
-		zap_leader = (leader->exit_signal == -1);
+		zap_leader = task_detached(leader);
 	}
 
 	write_unlock_irq(&tasklist_lock);
@@ -721,14 +726,14 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 		return;
 
 	/* We don't want people slaying init.  */
-	if (p->exit_signal != -1)
+	if (!task_detached(p))
 		p->exit_signal = SIGCHLD;
 
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
 	if (!traced && p->exit_state == EXIT_ZOMBIE &&
-	    p->exit_signal != -1 && thread_group_empty(p))
+	    !task_detached(p) && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
 	kill_orphaned_pgrp(p, father);
@@ -781,18 +786,18 @@ static void forget_original_parent(struct task_struct *father)
 		} else {
 			/* reparent ptraced task to its real parent */
 			__ptrace_unlink (p);
-			if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
+			if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
 			    thread_group_empty(p))
 				do_notify_parent(p, p->exit_signal);
 		}
 
 		/*
-		 * if the ptraced child is a zombie with exit_signal == -1
-		 * we must collect it before we exit, or it will remain
-		 * zombie forever since we prevented it from self-reap itself
-		 * while it was being traced by us, to be able to see it in wait4.
+		 * if the ptraced child is a detached zombie we must collect
+		 * it before we exit, or it will remain zombie forever since
+		 * we prevented it from self-reap itself while it was being
+		 * traced by us, to be able to see it in wait4.
 		 */
-		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
+		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
 			list_add(&p->ptrace_list, &ptrace_dead);
 	}
 
@@ -849,26 +854,26 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * we have changed execution domain as these two values started
 	 * the same after a fork.
 	 */
-	if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
+	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
 	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
-	     tsk->self_exec_id != tsk->parent_exec_id)
-	    && !capable(CAP_KILL))
+	     tsk->self_exec_id != tsk->parent_exec_id) &&
+	    !capable(CAP_KILL))
 		tsk->exit_signal = SIGCHLD;
 
-
 	/* If something other than our normal parent is ptracing us, then
 	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
 	 * only has special meaning to our real parent.
 	 */
-	if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
-		int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
+	if (!task_detached(tsk) && thread_group_empty(tsk)) {
+		int signal = (tsk->parent == tsk->real_parent)
+				? tsk->exit_signal : SIGCHLD;
 		do_notify_parent(tsk, signal);
 	} else if (tsk->ptrace) {
 		do_notify_parent(tsk, SIGCHLD);
 	}
 
 	state = EXIT_ZOMBIE;
-	if (tsk->exit_signal == -1 && likely(!tsk->ptrace))
+	if (task_detached(tsk) && likely(!tsk->ptrace))
 		state = EXIT_DEAD;
 	tsk->exit_state = state;
 
@@ -1173,7 +1178,7 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 	 * Do not consider detached threads that are
 	 * not ptraced:
 	 */
-	if (p->exit_signal == -1 && !p->ptrace)
+	if (task_detached(p) && !p->ptrace)
 		return 0;
 
 	/* Wait for all children (clone and not) if __WALL is set;
@@ -1365,9 +1370,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 		 * If it's still not detached after that, don't release
 		 * it now.
 		 */
-		if (p->exit_signal != -1) {
+		if (!task_detached(p)) {
 			do_notify_parent(p, p->exit_signal);
-			if (p->exit_signal != -1) {
+			if (!task_detached(p)) {
 				p->exit_state = EXIT_ZOMBIE;
 				p = NULL;
 			}
-- 
cgit v1.3-14-g43fede


From 376e1d2531860358c8a79fecf5f4f42994d03c4d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:12 -0700
Subject: reparent_thread: use same_thread_group()

Trivial, use same_thread_group() in reparent_thread().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 4035d391a0d3..413c81ec858e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -722,7 +722,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
 	 */
-	if (p->real_parent->group_leader == father->group_leader)
+	if (same_thread_group(p->real_parent, father))
 		return;
 
 	/* We don't want people slaying init.  */
-- 
cgit v1.3-14-g43fede


From 2800d8d19e51414403df8144eaa214bb03400b87 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:12 -0700
Subject: document de_thread() with exit_notify() connection

Add a couple of small comments, it is not easy to see what this code does.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c     | 2 +-
 kernel/exit.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 8fccc276d40d..9f9f931ef949 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -798,7 +798,7 @@ static int de_thread(struct task_struct *tsk)
 	if (!thread_group_leader(tsk)) {
 		leader = tsk->group_leader;
 
-		sig->notify_count = -1;
+		sig->notify_count = -1;	/* for exit_notify() */
 		for (;;) {
 			write_lock_irq(&tasklist_lock);
 			if (likely(leader->exit_state))
diff --git a/kernel/exit.c b/kernel/exit.c
index 413c81ec858e..879ed6e1c883 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -877,6 +877,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		state = EXIT_DEAD;
 	tsk->exit_state = state;
 
+	/* mt-exec, de_thread() is waiting for us */
 	if (thread_group_leader(tsk) &&
 	    tsk->signal->notify_count < 0 &&
 	    tsk->signal->group_exit_task)
-- 
cgit v1.3-14-g43fede


From 53b6f9fbd3b63af14b4f6268e8b5b80d178d05bc Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:13 -0700
Subject: ptrace: introduce ptrace_reparented() helper

Add another trivial helper for the sake of grep.  It also auto-documents the
fact that ->parent != real_parent implies ->ptrace.

No functional changes.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 4 ++++
 kernel/exit.c          | 9 ++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ebe0c17039cf..f98501ba557e 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -98,6 +98,10 @@ extern void ptrace_untrace(struct task_struct *child);
 extern int ptrace_may_attach(struct task_struct *task);
 extern int __ptrace_may_attach(struct task_struct *task);
 
+static inline int ptrace_reparented(struct task_struct *child)
+{
+	return child->real_parent != child->parent;
+}
 static inline void ptrace_link(struct task_struct *child,
 			       struct task_struct *new_parent)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index 879ed6e1c883..0da2921b1e7f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -698,7 +698,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	if (unlikely(traced)) {
 		/* Preserve ptrace links if someone else is tracing this child.  */
 		list_del_init(&p->ptrace_list);
-		if (p->parent != p->real_parent)
+		if (ptrace_reparented(p))
 			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
 	} else {
 		/* If this child is being traced, then we're the one tracing it
@@ -865,8 +865,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * only has special meaning to our real parent.
 	 */
 	if (!task_detached(tsk) && thread_group_empty(tsk)) {
-		int signal = (tsk->parent == tsk->real_parent)
-				? tsk->exit_signal : SIGCHLD;
+		int signal = ptrace_reparented(tsk) ?
+				SIGCHLD : tsk->exit_signal;
 		do_notify_parent(tsk, signal);
 	} else if (tsk->ptrace) {
 		do_notify_parent(tsk, SIGCHLD);
@@ -1269,8 +1269,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 		return 0;
 	}
 
-	/* traced means p->ptrace, but not vice versa */
-	traced = (p->real_parent != p->parent);
+	traced = ptrace_reparented(p);
 
 	if (likely(!traced)) {
 		struct signal_struct *psig;
-- 
cgit v1.3-14-g43fede


From 68cb94786630b34196713794a2880ade17fca887 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:14 -0700
Subject: ptrace: __ptrace_unlink: use the ptrace_reparented() helper

Currently __ptrace_unlink() checks list_empty(->ptrace_list) to figure out
whether the child was reparented.  Change the code to use ptrace_reparented()
to make this check more explicit and consistent.

No functional changes.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dac4b4e57293..ce66d66881fd 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -73,7 +73,7 @@ void __ptrace_unlink(struct task_struct *child)
 	BUG_ON(!child->ptrace);
 
 	child->ptrace = 0;
-	if (!list_empty(&child->ptrace_list)) {
+	if (ptrace_reparented(child)) {
 		list_del_init(&child->ptrace_list);
 		remove_parent(child);
 		child->parent = child->real_parent;
-- 
cgit v1.3-14-g43fede


From 33e9fc7d01269737cd5a3b6de1db9d0e796ab708 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:14 -0700
Subject: ptrace: ptrace_attach: use send_sig_info() instead
 force_sig_specific()

Nobody can block/ignore SIGSTOP, no need to use force_sig_specific() in
ptrace_attach.  Use the "regular" send_sig_info().

With this patch stracing of /sbin/init doesn't clear its SIGNAL_UNKILLABLE,
but not that this makes ptracing of init safe.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ce66d66881fd..5f8d452e8111 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -208,8 +208,7 @@ repeat:
 
 	__ptrace_link(task, current);
 
-	force_sig_specific(SIGSTOP, task);
-
+	send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
 bad:
 	write_unlock_irqrestore(&tasklist_lock, flags);
 	task_unlock(task);
-- 
cgit v1.3-14-g43fede


From 00cd5c37afd5f431ac186dd131705048c0a11fdb Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:53:15 -0700
Subject: ptrace: permit ptracing of /sbin/init

Afaics, currently there are no kernel problems with ptracing init, it can't
lose SIGNAL_UNKILLABLE flag and be killed/stopped by accident.

The ability to strace/debug init can be very useful if you try to figure out
why it does not work as expected.

However, admin should know what he does, "gdb /sbin/init 1" stops init, it
can't reap orphaned zombies or take care of /etc/inittab until continued.  It
is even possible to crash init (and thus the whole system) if you wish,
ptracer has full control.

See also the long discussion: http://marc.info/?t=120628018600001

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5f8d452e8111..dcc199c43a12 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -168,8 +168,6 @@ int ptrace_attach(struct task_struct *task)
 	audit_ptrace(task);
 
 	retval = -EPERM;
-	if (task->pid <= 1)
-		goto out;
 	if (same_thread_group(task, current))
 		goto out;
 
@@ -521,12 +519,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
 {
 	struct task_struct *child;
 
-	/*
-	 * Tracing init is not allowed.
-	 */
-	if (pid == 1)
-		return ERR_PTR(-EPERM);
-
 	read_lock(&tasklist_lock);
 	child = find_task_by_vpid(pid);
 	if (child)
-- 
cgit v1.3-14-g43fede


From f34d7a5b7010b82fe97da95496b9971435530062 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Wed, 30 Apr 2008 00:54:13 -0700
Subject: tty: The big operations rework

- Operations are now a shared const function block as with most other Linux
  objects

- Introduce wrappers for some optional functions to get consistent behaviour

- Wrap put_char which used to be patched by the tty layer

- Document which functions are needed/optional

- Make put_char report success/fail

- Cache the driver->ops pointer in the tty as tty->ops

- Remove various surplus lock calls we no longer need

- Remove proc_write method as noted by Alexey Dobriyan

- Introduce some missing sanity checks where certain driver/ldisc
  combinations would oops as they didn't check needed methods were present

[akpm@linux-foundation.org: fix fs/compat_ioctl.c build]
[akpm@linux-foundation.org: fix isicom]
[akpm@linux-foundation.org: fix arch/ia64/hp/sim/simserial.c build]
[akpm@linux-foundation.org: fix kgdb]
Signed-off-by: Alan Cox <alan@redhat.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/hp/sim/simserial.c         |  11 +-
 drivers/bluetooth/hci_ldisc.c        |  13 +-
 drivers/char/ip2/ip2main.c           |  12 +-
 drivers/char/isicom.c                |  15 +-
 drivers/char/keyboard.c              |   2 +-
 drivers/char/n_hdlc.c                |  11 +-
 drivers/char/n_r3964.c               |  17 +--
 drivers/char/n_tty.c                 | 103 ++++++-------
 drivers/char/tty_io.c                | 174 +++++++++-------------
 drivers/char/tty_ioctl.c             |  62 +++++---
 drivers/input/serio/serport.c        |   2 +-
 drivers/isdn/gigaset/ser-gigaset.c   |  15 +-
 drivers/net/hamradio/6pack.c         |  36 ++---
 drivers/net/hamradio/mkiss.c         |  15 +-
 drivers/net/irda/irtty-sir.c         |  95 ++++--------
 drivers/net/ppp_async.c              |   9 +-
 drivers/net/ppp_synctty.c            |   9 +-
 drivers/net/slip.c                   |  13 +-
 drivers/net/wan/x25_asy.c            | 279 +++++++++++++++++------------------
 drivers/serial/kgdboc.c              |   6 +-
 drivers/serial/serial_core.c         |  38 +++--
 drivers/usb/serial/digi_acceleport.c |   3 +-
 drivers/usb/serial/usb-serial.c      | 129 +++-------------
 drivers/usb/serial/whiteheat.c       |   4 +-
 fs/compat_ioctl.c                    |   2 +-
 fs/proc/proc_tty.c                   |   6 +-
 include/linux/tty.h                  |   8 +
 include/linux/tty_driver.h           | 102 +++++++------
 kernel/printk.c                      |   4 +-
 net/irda/ircomm/ircomm_tty.c         |   6 +-
 30 files changed, 537 insertions(+), 664 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c
index eb0c32a85fd7..23cafc80d2a4 100644
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -210,21 +210,23 @@ static void do_softint(struct work_struct *private_)
 	printk(KERN_ERR "simserial: do_softint called\n");
 }
 
-static void rs_put_char(struct tty_struct *tty, unsigned char ch)
+static int rs_put_char(struct tty_struct *tty, unsigned char ch)
 {
 	struct async_struct *info = (struct async_struct *)tty->driver_data;
 	unsigned long flags;
 
-	if (!tty || !info->xmit.buf) return;
+	if (!tty || !info->xmit.buf)
+		return 0;
 
 	local_irq_save(flags);
 	if (CIRC_SPACE(info->xmit.head, info->xmit.tail, SERIAL_XMIT_SIZE) == 0) {
 		local_irq_restore(flags);
-		return;
+		return 0;
 	}
 	info->xmit.buf[info->xmit.head] = ch;
 	info->xmit.head = (info->xmit.head + 1) & (SERIAL_XMIT_SIZE-1);
 	local_irq_restore(flags);
+	return 1;
 }
 
 static void transmit_chars(struct async_struct *info, int *intr_done)
@@ -621,7 +623,8 @@ static void rs_close(struct tty_struct *tty, struct file * filp)
 	 * the line discipline to only process XON/XOFF characters.
 	 */
 	shutdown(info);
-	if (tty->driver->flush_buffer) tty->driver->flush_buffer(tty);
+	if (tty->ops->flush_buffer)
+		tty->ops->flush_buffer(tty);
 	if (tty->ldisc.flush_buffer) tty->ldisc.flush_buffer(tty);
 	info->event = 0;
 	info->tty = NULL;
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 7e31d5f1bc8a..a6c2619ec782 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -143,7 +143,7 @@ restart:
 		int len;
 
 		set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
-		len = tty->driver->write(tty, skb->data, skb->len);
+		len = tty->ops->write(tty, skb->data, skb->len);
 		hdev->stat.byte_tx += len;
 
 		skb_pull(skb, len);
@@ -190,8 +190,7 @@ static int hci_uart_flush(struct hci_dev *hdev)
 
 	/* Flush any pending characters in the driver and discipline. */
 	tty_ldisc_flush(tty);
-	if (tty->driver && tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
+	tty_driver_flush_buffer(tty);
 
 	if (test_bit(HCI_UART_PROTO_SET, &hu->flags))
 		hu->proto->flush(hu);
@@ -285,9 +284,7 @@ static int hci_uart_tty_open(struct tty_struct *tty)
 
 	if (tty->ldisc.flush_buffer)
 		tty->ldisc.flush_buffer(tty);
-
-	if (tty->driver && tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
+	tty_driver_flush_buffer(tty);
 
 	return 0;
 }
@@ -374,8 +371,8 @@ static void hci_uart_tty_receive(struct tty_struct *tty, const u8 *data, char *f
 	spin_unlock(&hu->rx_lock);
 
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) &&
-					tty->driver->unthrottle)
-		tty->driver->unthrottle(tty);
+					tty->ops->unthrottle)
+		tty->ops->unthrottle(tty);
 }
 
 static int hci_uart_register_dev(struct hci_uart *hu)
diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c
index c4f4ca31f7c0..5ef69dcd2588 100644
--- a/drivers/char/ip2/ip2main.c
+++ b/drivers/char/ip2/ip2main.c
@@ -169,7 +169,7 @@ static int Fip_firmware_size;
 static int  ip2_open(PTTY, struct file *);
 static void ip2_close(PTTY, struct file *);
 static int  ip2_write(PTTY, const unsigned char *, int);
-static void ip2_putchar(PTTY, unsigned char);
+static int  ip2_putchar(PTTY, unsigned char);
 static void ip2_flush_chars(PTTY);
 static int  ip2_write_room(PTTY);
 static int  ip2_chars_in_buf(PTTY);
@@ -1616,10 +1616,9 @@ ip2_close( PTTY tty, struct file *pFile )
 
 	serviceOutgoingFifo ( pCh->pMyBord );
 
-	if ( tty->driver->flush_buffer ) 
-		tty->driver->flush_buffer(tty);
-	if ( tty->ldisc.flush_buffer )  
-		tty->ldisc.flush_buffer(tty);
+	if ( tty->driver->ops->flush_buffer )
+		tty->driver->ops->flush_buffer(tty);
+	tty_ldisc_flush(tty);
 	tty->closing = 0;
 	
 	pCh->pTTY = NULL;
@@ -1738,7 +1737,7 @@ ip2_write( PTTY tty, const unsigned char *pData, int count)
 /*                                                                            */
 /*                                                                            */
 /******************************************************************************/
-static void
+static int
 ip2_putchar( PTTY tty, unsigned char ch )
 {
 	i2ChanStrPtr  pCh = tty->driver_data;
@@ -1753,6 +1752,7 @@ ip2_putchar( PTTY tty, unsigned char ch )
 		ip2_flush_chars( tty );
 	} else
 		write_unlock_irqrestore(&pCh->Pbuf_spinlock, flags);
+	return 1;
 
 //	ip2trace (CHANN, ITRC_PUTC, ITRC_RETURN, 1, ch );
 }
diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index 57b115272aaa..9c6be8da220c 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -1140,28 +1140,29 @@ static int isicom_write(struct tty_struct *tty,	const unsigned char *buf,
 }
 
 /* put_char et all */
-static void isicom_put_char(struct tty_struct *tty, unsigned char ch)
+static int isicom_put_char(struct tty_struct *tty, unsigned char ch)
 {
 	struct isi_port *port = tty->driver_data;
 	struct isi_board *card = port->card;
 	unsigned long flags;
 
 	if (isicom_paranoia_check(port, tty->name, "isicom_put_char"))
-		return;
+		return 0;
 
 	if (!port->xmit_buf)
-		return;
+		return 0;
 
 	spin_lock_irqsave(&card->card_lock, flags);
-	if (port->xmit_cnt >= SERIAL_XMIT_SIZE - 1)
-		goto out;
+	if (port->xmit_cnt >= SERIAL_XMIT_SIZE - 1) {
+		spin_unlock_irqrestore(&card->card_lock, flags);
+		return 0;
+	}
 
 	port->xmit_buf[port->xmit_head++] = ch;
 	port->xmit_head &= (SERIAL_XMIT_SIZE - 1);
 	port->xmit_cnt++;
 	spin_unlock_irqrestore(&card->card_lock, flags);
-out:
-	return;
+	return 1;
 }
 
 /* flush_chars et all */
diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
index 60b934adea65..d1c50b3302e5 100644
--- a/drivers/char/keyboard.c
+++ b/drivers/char/keyboard.c
@@ -1230,7 +1230,7 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw)
 
 	if (rep &&
 	    (!vc_kbd_mode(kbd, VC_REPEAT) ||
-	     (tty && !L_ECHO(tty) && tty->driver->chars_in_buffer(tty)))) {
+	     (tty && !L_ECHO(tty) && tty_chars_in_buffer(tty)))) {
 		/*
 		 * Don't repeat a key if the input buffers are not empty and the
 		 * characters get aren't echoed locally. This makes key repeat
diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c
index a07c0af4819e..a35bfd7ee80e 100644
--- a/drivers/char/n_hdlc.c
+++ b/drivers/char/n_hdlc.c
@@ -342,12 +342,10 @@ static int n_hdlc_tty_open (struct tty_struct *tty)
 #endif
 	
 	/* Flush any pending characters in the driver and discipline. */
-	
 	if (tty->ldisc.flush_buffer)
-		tty->ldisc.flush_buffer (tty);
+		tty->ldisc.flush_buffer(tty);
 
-	if (tty->driver->flush_buffer)
-		tty->driver->flush_buffer (tty);
+	tty_driver_flush_buffer(tty);
 		
 	if (debuglevel >= DEBUG_LEVEL_INFO)	
 		printk("%s(%d)n_hdlc_tty_open() success\n",__FILE__,__LINE__);
@@ -399,7 +397,7 @@ static void n_hdlc_send_frames(struct n_hdlc *n_hdlc, struct tty_struct *tty)
 			
 		/* Send the next block of data to device */
 		tty->flags |= (1 << TTY_DO_WRITE_WAKEUP);
-		actual = tty->driver->write(tty, tbuf->buf, tbuf->count);
+		actual = tty->ops->write(tty, tbuf->buf, tbuf->count);
 
 		/* rollback was possible and has been done */
 		if (actual == -ERESTARTSYS) {
@@ -752,8 +750,7 @@ static int n_hdlc_tty_ioctl(struct tty_struct *tty, struct file *file,
 
 	case TIOCOUTQ:
 		/* get the pending tx byte count in the driver */
-		count = tty->driver->chars_in_buffer ?
-				tty->driver->chars_in_buffer(tty) : 0;
+		count = tty_chars_in_buffer(tty);
 		/* add size of next output frame in queue */
 		spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock,flags);
 		if (n_hdlc->tx_buf_list.head)
diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c
index 3f6486e9f1ec..902169062332 100644
--- a/drivers/char/n_r3964.c
+++ b/drivers/char/n_r3964.c
@@ -376,8 +376,9 @@ static void put_char(struct r3964_info *pInfo, unsigned char ch)
 	if (tty == NULL)
 		return;
 
-	if (tty->driver->put_char) {
-		tty->driver->put_char(tty, ch);
+	/* FIXME: put_char should not be called from an IRQ */
+	if (tty->ops->put_char) {
+		tty->ops->put_char(tty, ch);
 	}
 	pInfo->bcc ^= ch;
 }
@@ -386,12 +387,9 @@ static void flush(struct r3964_info *pInfo)
 {
 	struct tty_struct *tty = pInfo->tty;
 
-	if (tty == NULL)
+	if (tty == NULL || tty->ops->flush_chars == NULL)
 		return;
-
-	if (tty->driver->flush_chars) {
-		tty->driver->flush_chars(tty);
-	}
+	tty->ops->flush_chars(tty);
 }
 
 static void trigger_transmit(struct r3964_info *pInfo)
@@ -449,12 +447,11 @@ static void transmit_block(struct r3964_info *pInfo)
 	struct r3964_block_header *pBlock = pInfo->tx_first;
 	int room = 0;
 
-	if ((tty == NULL) || (pBlock == NULL)) {
+	if (tty == NULL || pBlock == NULL) {
 		return;
 	}
 
-	if (tty->driver->write_room)
-		room = tty->driver->write_room(tty);
+	room = tty_write_room(tty);
 
 	TRACE_PS("transmit_block %p, room %d, length %d",
 		 pBlock, room, pBlock->length);
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index e1518e17e09d..abc93a93dcdd 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -149,8 +149,8 @@ static void check_unthrottle(struct tty_struct *tty)
 {
 	if (tty->count &&
 	    test_and_clear_bit(TTY_THROTTLED, &tty->flags) &&
-	    tty->driver->unthrottle)
-		tty->driver->unthrottle(tty);
+	    tty->ops->unthrottle)
+		tty->ops->unthrottle(tty);
 }
 
 /**
@@ -273,7 +273,7 @@ static int opost(unsigned char c, struct tty_struct *tty)
 {
 	int	space, spaces;
 
-	space = tty->driver->write_room(tty);
+	space = tty_write_room(tty);
 	if (!space)
 		return -1;
 
@@ -286,7 +286,7 @@ static int opost(unsigned char c, struct tty_struct *tty)
 			if (O_ONLCR(tty)) {
 				if (space < 2)
 					return -1;
-				tty->driver->put_char(tty, '\r');
+				tty_put_char(tty, '\r');
 				tty->column = 0;
 			}
 			tty->canon_column = tty->column;
@@ -308,7 +308,7 @@ static int opost(unsigned char c, struct tty_struct *tty)
 				if (space < spaces)
 					return -1;
 				tty->column += spaces;
-				tty->driver->write(tty, "        ", spaces);
+				tty->ops->write(tty, "        ", spaces);
 				return 0;
 			}
 			tty->column += spaces;
@@ -325,7 +325,7 @@ static int opost(unsigned char c, struct tty_struct *tty)
 			break;
 		}
 	}
-	tty->driver->put_char(tty, c);
+	tty_put_char(tty, c);
 	unlock_kernel();
 	return 0;
 }
@@ -352,7 +352,7 @@ static ssize_t opost_block(struct tty_struct *tty,
 	int 	i;
 	const unsigned char *cp;
 
-	space = tty->driver->write_room(tty);
+	space = tty_write_room(tty);
 	if (!space)
 		return 0;
 	if (nr > space)
@@ -390,27 +390,14 @@ static ssize_t opost_block(struct tty_struct *tty,
 		}
 	}
 break_out:
-	if (tty->driver->flush_chars)
-		tty->driver->flush_chars(tty);
-	i = tty->driver->write(tty, buf, i);
+	if (tty->ops->flush_chars)
+		tty->ops->flush_chars(tty);
+	i = tty->ops->write(tty, buf, i);
 	unlock_kernel();
 	return i;
 }
 
 
-/**
- *	put_char	-	write character to driver
- *	@c: character (or part of unicode symbol)
- *	@tty: terminal device
- *
- *	Queue a byte to the driver layer for output
- */
-
-static inline void put_char(unsigned char c, struct tty_struct *tty)
-{
-	tty->driver->put_char(tty, c);
-}
-
 /**
  *	echo_char	-	echo characters
  *	@c: unicode byte to echo
@@ -423,8 +410,8 @@ static inline void put_char(unsigned char c, struct tty_struct *tty)
 static void echo_char(unsigned char c, struct tty_struct *tty)
 {
 	if (L_ECHOCTL(tty) && iscntrl(c) && c != '\t') {
-		put_char('^', tty);
-		put_char(c ^ 0100, tty);
+		tty_put_char(tty, '^');
+		tty_put_char(tty, c ^ 0100);
 		tty->column += 2;
 	} else
 		opost(c, tty);
@@ -433,7 +420,7 @@ static void echo_char(unsigned char c, struct tty_struct *tty)
 static inline void finish_erasing(struct tty_struct *tty)
 {
 	if (tty->erasing) {
-		put_char('/', tty);
+		tty_put_char(tty, '/');
 		tty->column++;
 		tty->erasing = 0;
 	}
@@ -517,7 +504,7 @@ static void eraser(unsigned char c, struct tty_struct *tty)
 		if (L_ECHO(tty)) {
 			if (L_ECHOPRT(tty)) {
 				if (!tty->erasing) {
-					put_char('\\', tty);
+					tty_put_char(tty, '\\');
 					tty->column++;
 					tty->erasing = 1;
 				}
@@ -525,7 +512,7 @@ static void eraser(unsigned char c, struct tty_struct *tty)
 				echo_char(c, tty);
 				while (--cnt > 0) {
 					head = (head+1) & (N_TTY_BUF_SIZE-1);
-					put_char(tty->read_buf[head], tty);
+					tty_put_char(tty, tty->read_buf[head]);
 				}
 			} else if (kill_type == ERASE && !L_ECHOE(tty)) {
 				echo_char(ERASE_CHAR(tty), tty);
@@ -553,22 +540,22 @@ static void eraser(unsigned char c, struct tty_struct *tty)
 				/* Now backup to that column. */
 				while (tty->column > col) {
 					/* Can't use opost here. */
-					put_char('\b', tty);
+					tty_put_char(tty, '\b');
 					if (tty->column > 0)
 						tty->column--;
 				}
 			} else {
 				if (iscntrl(c) && L_ECHOCTL(tty)) {
-					put_char('\b', tty);
-					put_char(' ', tty);
-					put_char('\b', tty);
+					tty_put_char(tty, '\b');
+					tty_put_char(tty, ' ');
+					tty_put_char(tty, '\b');
 					if (tty->column > 0)
 						tty->column--;
 				}
 				if (!iscntrl(c) || L_ECHOCTL(tty)) {
-					put_char('\b', tty);
-					put_char(' ', tty);
-					put_char('\b', tty);
+					tty_put_char(tty, '\b');
+					tty_put_char(tty, ' ');
+					tty_put_char(tty, '\b');
 					if (tty->column > 0)
 						tty->column--;
 				}
@@ -599,8 +586,7 @@ static inline void isig(int sig, struct tty_struct *tty, int flush)
 		kill_pgrp(tty->pgrp, sig, 1);
 	if (flush || !L_NOFLSH(tty)) {
 		n_tty_flush_buffer(tty);
-		if (tty->driver->flush_buffer)
-			tty->driver->flush_buffer(tty);
+		tty_driver_flush_buffer(tty);
 	}
 }
 
@@ -732,7 +718,7 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 		tty->lnext = 0;
 		if (L_ECHO(tty)) {
 			if (tty->read_cnt >= N_TTY_BUF_SIZE-1) {
-				put_char('\a', tty); /* beep if no space */
+				tty_put_char(tty, '\a'); /* beep if no space */
 				return;
 			}
 			/* Record the column of first canon char. */
@@ -776,8 +762,7 @@ send_signal:
 			 */
 			if (!L_NOFLSH(tty)) {
 				n_tty_flush_buffer(tty);
-				if (tty->driver->flush_buffer)
-					tty->driver->flush_buffer(tty);
+				tty_driver_flush_buffer(tty);
 			}
 			if (L_ECHO(tty))
 				echo_char(c, tty);
@@ -806,8 +791,8 @@ send_signal:
 			if (L_ECHO(tty)) {
 				finish_erasing(tty);
 				if (L_ECHOCTL(tty)) {
-					put_char('^', tty);
-					put_char('\b', tty);
+					tty_put_char(tty, '^');
+					tty_put_char(tty, '\b');
 				}
 			}
 			return;
@@ -828,7 +813,7 @@ send_signal:
 		if (c == '\n') {
 			if (L_ECHO(tty) || L_ECHONL(tty)) {
 				if (tty->read_cnt >= N_TTY_BUF_SIZE-1)
-					put_char('\a', tty);
+					tty_put_char(tty, '\a');
 				opost('\n', tty);
 			}
 			goto handle_newline;
@@ -846,7 +831,7 @@ send_signal:
 			 */
 			if (L_ECHO(tty)) {
 				if (tty->read_cnt >= N_TTY_BUF_SIZE-1)
-					put_char('\a', tty);
+					tty_put_char(tty, '\a');
 				/* Record the column of first canon char. */
 				if (tty->canon_head == tty->read_head)
 					tty->canon_column = tty->column;
@@ -876,7 +861,7 @@ handle_newline:
 	finish_erasing(tty);
 	if (L_ECHO(tty)) {
 		if (tty->read_cnt >= N_TTY_BUF_SIZE-1) {
-			put_char('\a', tty); /* beep if no space */
+			tty_put_char(tty, '\a'); /* beep if no space */
 			return;
 		}
 		if (c == '\n')
@@ -980,8 +965,8 @@ static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 				break;
 			}
 		}
-		if (tty->driver->flush_chars)
-			tty->driver->flush_chars(tty);
+		if (tty->ops->flush_chars)
+			tty->ops->flush_chars(tty);
 	}
 
 	n_tty_set_room(tty);
@@ -1000,8 +985,8 @@ static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 	if (tty->receive_room < TTY_THRESHOLD_THROTTLE) {
 		/* check TTY_THROTTLED first so it indicates our state */
 		if (!test_and_set_bit(TTY_THROTTLED, &tty->flags) &&
-		    tty->driver->throttle)
-			tty->driver->throttle(tty);
+		    tty->ops->throttle)
+			tty->ops->throttle(tty);
 	}
 }
 
@@ -1086,6 +1071,9 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
 			tty->real_raw = 0;
 	}
 	n_tty_set_room(tty);
+	/* The termios change make the tty ready for I/O */
+	wake_up_interruptible(&tty->write_wait);
+	wake_up_interruptible(&tty->read_wait);
 }
 
 /**
@@ -1513,11 +1501,11 @@ static ssize_t write_chan(struct tty_struct *tty, struct file *file,
 					break;
 				b++; nr--;
 			}
-			if (tty->driver->flush_chars)
-				tty->driver->flush_chars(tty);
+			if (tty->ops->flush_chars)
+				tty->ops->flush_chars(tty);
 		} else {
 			while (nr > 0) {
-				c = tty->driver->write(tty, b, nr);
+				c = tty->ops->write(tty, b, nr);
 				if (c < 0) {
 					retval = c;
 					goto break_out;
@@ -1554,11 +1542,6 @@ break_out:
  *
  *	This code must be sure never to sleep through a hangup.
  *	Called without the kernel lock held - fine
- *
- *	FIXME: if someone changes the VMIN or discipline settings for the
- *	terminal while another process is in poll() the poll does not
- *	recompute the new limits. Possibly set_termios should issue
- *	a read wakeup to fix this bug.
  */
 
 static unsigned int normal_poll(struct tty_struct *tty, struct file *file,
@@ -1582,9 +1565,9 @@ static unsigned int normal_poll(struct tty_struct *tty, struct file *file,
 		else
 			tty->minimum_to_wake = 1;
 	}
-	if (!tty_is_writelocked(tty) &&
-			tty->driver->chars_in_buffer(tty) < WAKEUP_CHARS &&
-			tty->driver->write_room(tty) > 0)
+	if (tty->ops->write && !tty_is_writelocked(tty) &&
+			tty_chars_in_buffer(tty) < WAKEUP_CHARS &&
+			tty_write_room(tty) > 0)
 		mask |= POLLOUT | POLLWRNORM;
 	return mask;
 }
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index b1692afd797e..f69fb8d7a680 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -1108,8 +1108,8 @@ restart:
 	   a reference to the old ldisc. If we ended up flipping back
 	   to the existing ldisc we have two references to it */
 
-	if (tty->ldisc.num != o_ldisc.num && tty->driver->set_ldisc)
-		tty->driver->set_ldisc(tty);
+	if (tty->ldisc.num != o_ldisc.num && tty->ops->set_ldisc)
+		tty->ops->set_ldisc(tty);
 
 	tty_ldisc_put(o_ldisc.num);
 
@@ -1181,9 +1181,8 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line)
 		if (*str == '\0')
 			str = NULL;
 
-		if (tty_line >= 0 && tty_line <= p->num && p->poll_init &&
-				!p->poll_init(p, tty_line, str)) {
-
+		if (tty_line >= 0 && tty_line <= p->num && p->ops &&
+		    p->ops->poll_init && !p->ops->poll_init(p, tty_line, str)) {
 			res = p;
 			*line = tty_line;
 			break;
@@ -1452,8 +1451,7 @@ static void do_tty_hangup(struct work_struct *work)
 		/* We may have no line discipline at this point */
 		if (ld->flush_buffer)
 			ld->flush_buffer(tty);
-		if (tty->driver->flush_buffer)
-			tty->driver->flush_buffer(tty);
+		tty_driver_flush_buffer(tty);
 		if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) &&
 		    ld->write_wakeup)
 			ld->write_wakeup(tty);
@@ -1516,11 +1514,11 @@ static void do_tty_hangup(struct work_struct *work)
 	 * So we just call close() the right number of times.
 	 */
 	if (cons_filp) {
-		if (tty->driver->close)
+		if (tty->ops->close)
 			for (n = 0; n < closecount; n++)
-				tty->driver->close(tty, cons_filp);
-	} else if (tty->driver->hangup)
-		(tty->driver->hangup)(tty);
+				tty->ops->close(tty, cons_filp);
+	} else if (tty->ops->hangup)
+		(tty->ops->hangup)(tty);
 	/*
 	 * We don't want to have driver/ldisc interactions beyond
 	 * the ones we did here. The driver layer expects no
@@ -1752,8 +1750,8 @@ void stop_tty(struct tty_struct *tty)
 		wake_up_interruptible(&tty->link->read_wait);
 	}
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
-	if (tty->driver->stop)
-		(tty->driver->stop)(tty);
+	if (tty->ops->stop)
+		(tty->ops->stop)(tty);
 }
 
 EXPORT_SYMBOL(stop_tty);
@@ -1786,8 +1784,8 @@ void start_tty(struct tty_struct *tty)
 		wake_up_interruptible(&tty->link->read_wait);
 	}
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
-	if (tty->driver->start)
-		(tty->driver->start)(tty);
+	if (tty->ops->start)
+		(tty->ops->start)(tty);
 	/* If we have a running line discipline it may need kicking */
 	tty_wakeup(tty);
 }
@@ -1972,10 +1970,13 @@ static ssize_t tty_write(struct file *file, const char __user *buf,
 	tty = (struct tty_struct *)file->private_data;
 	if (tty_paranoia_check(tty, inode, "tty_write"))
 		return -EIO;
-	if (!tty || !tty->driver->write ||
+	if (!tty || !tty->ops->write ||
 		(test_bit(TTY_IO_ERROR, &tty->flags)))
 			return -EIO;
-
+	/* Short term debug to catch buggy drivers */
+	if (tty->ops->write_room == NULL)
+		printk(KERN_ERR "tty driver %s lacks a write_room method.\n",
+			tty->driver->name);
 	ld = tty_ldisc_ref_wait(tty);
 	if (!ld->write)
 		ret = -EIO;
@@ -2122,6 +2123,7 @@ static int init_dev(struct tty_driver *driver, int idx,
 		goto fail_no_mem;
 	initialize_tty_struct(tty);
 	tty->driver = driver;
+	tty->ops = driver->ops;
 	tty->index = idx;
 	tty_line_name(driver, idx, tty->name);
 
@@ -2152,6 +2154,7 @@ static int init_dev(struct tty_driver *driver, int idx,
 			goto free_mem_out;
 		initialize_tty_struct(o_tty);
 		o_tty->driver = driver->other;
+		o_tty->ops = driver->ops;
 		o_tty->index = idx;
 		tty_line_name(driver->other, idx, o_tty->name);
 
@@ -2456,8 +2459,8 @@ static void release_dev(struct file *filp)
 		}
 	}
 #endif
-	if (tty->driver->close)
-		tty->driver->close(tty, filp);
+	if (tty->ops->close)
+		tty->ops->close(tty, filp);
 
 	/*
 	 * Sanity check: if tty->count is going to zero, there shouldn't be
@@ -2740,8 +2743,8 @@ got_driver:
 	printk(KERN_DEBUG "opening %s...", tty->name);
 #endif
 	if (!retval) {
-		if (tty->driver->open)
-			retval = tty->driver->open(tty, filp);
+		if (tty->ops->open)
+			retval = tty->ops->open(tty, filp);
 		else
 			retval = -ENODEV;
 	}
@@ -2840,7 +2843,7 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 		goto out1;
 
 	check_tty_count(tty, "tty_open");
-	retval = ptm_driver->open(tty, filp);
+	retval = ptm_driver->ops->open(tty, filp);
 	if (!retval)
 		return 0;
 out1:
@@ -3336,25 +3339,20 @@ static int tiocsetd(struct tty_struct *tty, int __user *p)
 
 static int send_break(struct tty_struct *tty, unsigned int duration)
 {
-	int retval = -EINTR;
-
-	lock_kernel();
 	if (tty_write_lock(tty, 0) < 0)
-		goto out;
-	tty->driver->break_ctl(tty, -1);
+		return -EINTR;
+	tty->ops->break_ctl(tty, -1);
 	if (!signal_pending(current))
 		msleep_interruptible(duration);
-	tty->driver->break_ctl(tty, 0);
+	tty->ops->break_ctl(tty, 0);
 	tty_write_unlock(tty);
 	if (!signal_pending(current))
-		retval = 0;
-out:
-	unlock_kernel();
-	return retval;
+		return -EINTR;
+	return 0;
 }
 
 /**
- *	tiocmget		-	get modem status
+ *	tty_tiocmget		-	get modem status
  *	@tty: tty device
  *	@file: user file pointer
  *	@p: pointer to result
@@ -3369,10 +3367,8 @@ static int tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p
 {
 	int retval = -EINVAL;
 
-	if (tty->driver->tiocmget) {
-		lock_kernel();
-		retval = tty->driver->tiocmget(tty, file);
-		unlock_kernel();
+	if (tty->ops->tiocmget) {
+		retval = tty->ops->tiocmget(tty, file);
 
 		if (retval >= 0)
 			retval = put_user(retval, p);
@@ -3381,7 +3377,7 @@ static int tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p
 }
 
 /**
- *	tiocmset		-	set modem status
+ *	tty_tiocmset		-	set modem status
  *	@tty: tty device
  *	@file: user file pointer
  *	@cmd: command - clear bits, set bits or set all
@@ -3398,7 +3394,7 @@ static int tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int
 {
 	int retval = -EINVAL;
 
-	if (tty->driver->tiocmset) {
+	if (tty->ops->tiocmset) {
 		unsigned int set, clear, val;
 
 		retval = get_user(val, p);
@@ -3422,9 +3418,7 @@ static int tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int
 		set &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP;
 		clear &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP;
 
-		lock_kernel();
-		retval = tty->driver->tiocmset(tty, file, set, clear);
-		unlock_kernel();
+		retval = tty->ops->tiocmset(tty, file, set, clear);
 	}
 	return retval;
 }
@@ -3455,23 +3449,25 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	retval = -EINVAL;
 
-	if (!tty->driver->break_ctl) {
+	if (!tty->ops->break_ctl) {
 		switch (cmd) {
 		case TIOCSBRK:
 		case TIOCCBRK:
-			if (tty->driver->ioctl)
-				retval = tty->driver->ioctl(tty, file, cmd, arg);
+			if (tty->ops->ioctl)
+				retval = tty->ops->ioctl(tty, file, cmd, arg);
+			if (retval != -EINVAL && retval != -ENOIOCTLCMD)
+				printk(KERN_WARNING "tty: driver %s needs updating to use break_ctl\n", tty->driver->name);
 			return retval;
 
 		/* These two ioctl's always return success; even if */
 		/* the driver doesn't support them. */
 		case TCSBRK:
 		case TCSBRKP:
-			if (!tty->driver->ioctl)
+			if (!tty->ops->ioctl)
 				return 0;
-			lock_kernel();
-			retval = tty->driver->ioctl(tty, file, cmd, arg);
-			unlock_kernel();
+			retval = tty->ops->ioctl(tty, file, cmd, arg);
+			if (retval != -EINVAL && retval != -ENOIOCTLCMD)
+				printk(KERN_WARNING "tty: driver %s needs updating to use break_ctl\n", tty->driver->name);
 			if (retval == -ENOIOCTLCMD)
 				retval = 0;
 			return retval;
@@ -3491,9 +3487,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (retval)
 			return retval;
 		if (cmd != TIOCCBRK) {
-			lock_kernel();
 			tty_wait_until_sent(tty, 0);
-			unlock_kernel();
 			if (signal_pending(current))
 				return -EINTR;
 		}
@@ -3531,7 +3525,6 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case TIOCGSID:
 		return tiocgsid(tty, real_tty, p);
 	case TIOCGETD:
-		/* FIXME: check this is ok */
 		return put_user(tty->ldisc.num, (int __user *)p);
 	case TIOCSETD:
 		return tiocsetd(tty, p);
@@ -3543,15 +3536,13 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	 * Break handling
 	 */
 	case TIOCSBRK:	/* Turn break on, unconditionally */
-		lock_kernel();
-		tty->driver->break_ctl(tty, -1);
-		unlock_kernel();
+		if (tty->ops->break_ctl)
+			tty->ops->break_ctl(tty, -1);
 		return 0;
 
 	case TIOCCBRK:	/* Turn break off, unconditionally */
-		lock_kernel();
-		tty->driver->break_ctl(tty, 0);
-		unlock_kernel();
+		if (tty->ops->break_ctl)
+			tty->ops->break_ctl(tty, 0);
 		return 0;
 	case TCSBRK:   /* SVID version: non-zero arg --> no break */
 		/* non-zero arg means wait for all output data
@@ -3580,8 +3571,8 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		}
 		break;
 	}
-	if (tty->driver->ioctl) {
-		retval = (tty->driver->ioctl)(tty, file, cmd, arg);
+	if (tty->ops->ioctl) {
+		retval = (tty->ops->ioctl)(tty, file, cmd, arg);
 		if (retval != -ENOIOCTLCMD)
 			return retval;
 	}
@@ -3608,8 +3599,8 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd,
 	if (tty_paranoia_check(tty, inode, "tty_ioctl"))
 		return -EINVAL;
 
-	if (tty->driver->compat_ioctl) {
-		retval = (tty->driver->compat_ioctl)(tty, file, cmd, arg);
+	if (tty->ops->compat_ioctl) {
+		retval = (tty->ops->compat_ioctl)(tty, file, cmd, arg);
 		if (retval != -ENOIOCTLCMD)
 			return retval;
 	}
@@ -3659,8 +3650,7 @@ void __do_SAK(struct tty_struct *tty)
 
 	tty_ldisc_flush(tty);
 
-	if (tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
+	tty_driver_flush_buffer(tty);
 
 	read_lock(&tasklist_lock);
 	/* Kill the entire session */
@@ -3871,15 +3861,27 @@ static void initialize_tty_struct(struct tty_struct *tty)
 	INIT_WORK(&tty->SAK_work, do_SAK_work);
 }
 
-/*
- * The default put_char routine if the driver did not define one.
+/**
+ *	tty_put_char	-	write one character to a tty
+ *	@tty: tty
+ *	@ch: character
+ *
+ *	Write one byte to the tty using the provided put_char method
+ *	if present. Returns the number of characters successfully output.
+ *
+ *	Note: the specific put_char operation in the driver layer may go
+ *	away soon. Don't call it directly, use this method
  */
 
-static void tty_default_put_char(struct tty_struct *tty, unsigned char ch)
+int tty_put_char(struct tty_struct *tty, unsigned char ch)
 {
-	tty->driver->write(tty, &ch, 1);
+	if (tty->ops->put_char)
+		return tty->ops->put_char(tty, ch);
+	return tty->ops->write(tty, &ch, 1);
 }
 
+EXPORT_SYMBOL_GPL(tty_put_char);
+
 static struct class *tty_class;
 
 /**
@@ -3962,37 +3964,8 @@ void put_tty_driver(struct tty_driver *driver)
 void tty_set_operations(struct tty_driver *driver,
 			const struct tty_operations *op)
 {
-	driver->open = op->open;
-	driver->close = op->close;
-	driver->write = op->write;
-	driver->put_char = op->put_char;
-	driver->flush_chars = op->flush_chars;
-	driver->write_room = op->write_room;
-	driver->chars_in_buffer = op->chars_in_buffer;
-	driver->ioctl = op->ioctl;
-	driver->compat_ioctl = op->compat_ioctl;
-	driver->set_termios = op->set_termios;
-	driver->throttle = op->throttle;
-	driver->unthrottle = op->unthrottle;
-	driver->stop = op->stop;
-	driver->start = op->start;
-	driver->hangup = op->hangup;
-	driver->break_ctl = op->break_ctl;
-	driver->flush_buffer = op->flush_buffer;
-	driver->set_ldisc = op->set_ldisc;
-	driver->wait_until_sent = op->wait_until_sent;
-	driver->send_xchar = op->send_xchar;
-	driver->read_proc = op->read_proc;
-	driver->write_proc = op->write_proc;
-	driver->tiocmget = op->tiocmget;
-	driver->tiocmset = op->tiocmset;
-#ifdef CONFIG_CONSOLE_POLL
-	driver->poll_init = op->poll_init;
-	driver->poll_get_char = op->poll_get_char;
-	driver->poll_put_char = op->poll_put_char;
-#endif
-}
-
+	driver->ops = op;
+};
 
 EXPORT_SYMBOL(alloc_tty_driver);
 EXPORT_SYMBOL(put_tty_driver);
@@ -4055,9 +4028,6 @@ int tty_register_driver(struct tty_driver *driver)
 		return error;
 	}
 
-	if (!driver->put_char)
-		driver->put_char = tty_default_put_char;
-
 	mutex_lock(&tty_mutex);
 	list_add(&driver->tty_drivers, &tty_drivers);
 	mutex_unlock(&tty_mutex);
diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index 8c4bf3e48d5b..c10d40c4c5ca 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -40,6 +40,34 @@
 #define TERMIOS_OLD	8
 
 
+int tty_chars_in_buffer(struct tty_struct *tty)
+{
+	if (tty->ops->chars_in_buffer)
+		return tty->ops->chars_in_buffer(tty);
+	else
+		return 0;
+}
+
+EXPORT_SYMBOL(tty_chars_in_buffer);
+
+int tty_write_room(struct tty_struct *tty)
+{
+	if (tty->ops->write_room)
+		return tty->ops->write_room(tty);
+	return 2048;
+}
+
+EXPORT_SYMBOL(tty_write_room);
+
+void tty_driver_flush_buffer(struct tty_struct *tty)
+{
+	if (tty->ops->flush_buffer)
+		tty->ops->flush_buffer(tty);
+}
+
+EXPORT_SYMBOL(tty_driver_flush_buffer);
+
+
 /**
  *	tty_wait_until_sent	-	wait for I/O to finish
  *	@tty: tty we are waiting for
@@ -58,17 +86,13 @@ void tty_wait_until_sent(struct tty_struct *tty, long timeout)
 
 	printk(KERN_DEBUG "%s wait until sent...\n", tty_name(tty, buf));
 #endif
-	if (!tty->driver->chars_in_buffer)
-		return;
 	if (!timeout)
 		timeout = MAX_SCHEDULE_TIMEOUT;
-	lock_kernel();
 	if (wait_event_interruptible_timeout(tty->write_wait,
-			!tty->driver->chars_in_buffer(tty), timeout) >= 0) {
-		if (tty->driver->wait_until_sent)
-			tty->driver->wait_until_sent(tty, timeout);
+			!tty_chars_in_buffer(tty), timeout) >= 0) {
+		if (tty->ops->wait_until_sent)
+			tty->ops->wait_until_sent(tty, timeout);
 	}
-	unlock_kernel();
 }
 EXPORT_SYMBOL(tty_wait_until_sent);
 
@@ -444,8 +468,8 @@ static void change_termios(struct tty_struct *tty, struct ktermios *new_termios)
 		}
 	}
 
-	if (tty->driver->set_termios)
-		(*tty->driver->set_termios)(tty, &old_termios);
+	if (tty->ops->set_termios)
+		(*tty->ops->set_termios)(tty, &old_termios);
 	else
 		tty_termios_copy_hw(tty->termios, &old_termios);
 
@@ -748,8 +772,8 @@ static int send_prio_char(struct tty_struct *tty, char ch)
 {
 	int	was_stopped = tty->stopped;
 
-	if (tty->driver->send_xchar) {
-		tty->driver->send_xchar(tty, ch);
+	if (tty->ops->send_xchar) {
+		tty->ops->send_xchar(tty, ch);
 		return 0;
 	}
 
@@ -758,7 +782,7 @@ static int send_prio_char(struct tty_struct *tty, char ch)
 
 	if (was_stopped)
 		start_tty(tty);
-	tty->driver->write(tty, &ch, 1);
+	tty->ops->write(tty, &ch, 1);
 	if (was_stopped)
 		stop_tty(tty);
 	tty_write_unlock(tty);
@@ -778,13 +802,14 @@ static int tty_change_softcar(struct tty_struct *tty, int arg)
 {
 	int ret = 0;
 	int bit = arg ? CLOCAL : 0;
-	struct ktermios old = *tty->termios;
+	struct ktermios old;
 
 	mutex_lock(&tty->termios_mutex);
+	old = *tty->termios;
 	tty->termios->c_cflag &= ~CLOCAL;
 	tty->termios->c_cflag |= bit;
-	if (tty->driver->set_termios)
-		tty->driver->set_termios(tty, &old);
+	if (tty->ops->set_termios)
+		tty->ops->set_termios(tty, &old);
 	if ((tty->termios->c_cflag & CLOCAL) != bit)
 		ret = -EINVAL;
 	mutex_unlock(&tty->termios_mutex);
@@ -926,8 +951,7 @@ int tty_perform_flush(struct tty_struct *tty, unsigned long arg)
 			ld->flush_buffer(tty);
 		/* fall through */
 	case TCOFLUSH:
-		if (tty->driver->flush_buffer)
-			tty->driver->flush_buffer(tty);
+		tty_driver_flush_buffer(tty);
 		break;
 	default:
 		tty_ldisc_deref(ld);
@@ -984,9 +1008,7 @@ int n_tty_ioctl(struct tty_struct *tty, struct file *file,
 	case TCFLSH:
 		return tty_perform_flush(tty, arg);
 	case TIOCOUTQ:
-		return put_user(tty->driver->chars_in_buffer ?
-				tty->driver->chars_in_buffer(tty) : 0,
-				(int __user *) arg);
+		return put_user(tty_chars_in_buffer(tty), (int __user *) arg);
 	case TIOCINQ:
 		retval = tty->read_cnt;
 		if (L_ICANON(tty))
diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index e1a3a79ab3f9..7ff71ba7b7c9 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -46,7 +46,7 @@ struct serport {
 static int serport_serio_write(struct serio *serio, unsigned char data)
 {
 	struct serport *serport = serio->port_data;
-	return -(serport->tty->driver->write(serport->tty, &data, 1) != 1);
+	return -(serport->tty->ops->write(serport->tty, &data, 1) != 1);
 }
 
 static int serport_serio_open(struct serio *serio)
diff --git a/drivers/isdn/gigaset/ser-gigaset.c b/drivers/isdn/gigaset/ser-gigaset.c
index fceeb1d57682..45d1ee93cd39 100644
--- a/drivers/isdn/gigaset/ser-gigaset.c
+++ b/drivers/isdn/gigaset/ser-gigaset.c
@@ -68,10 +68,10 @@ static int write_modem(struct cardstate *cs)
 	struct tty_struct *tty = cs->hw.ser->tty;
 	struct bc_state *bcs = &cs->bcs[0];	/* only one channel */
 	struct sk_buff *skb = bcs->tx_skb;
-	int sent;
+	int sent = -EOPNOTSUPP;
 
 	if (!tty || !tty->driver || !skb)
-		return -EFAULT;
+		return -EINVAL;
 
 	if (!skb->len) {
 		dev_kfree_skb_any(skb);
@@ -80,7 +80,8 @@ static int write_modem(struct cardstate *cs)
 	}
 
 	set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
-	sent = tty->driver->write(tty, skb->data, skb->len);
+	if (tty->ops->write)
+		sent = tty->ops->write(tty, skb->data, skb->len);
 	gig_dbg(DEBUG_OUTPUT, "write_modem: sent %d", sent);
 	if (sent < 0) {
 		/* error */
@@ -120,7 +121,7 @@ static int send_cb(struct cardstate *cs)
 
 	if (cb->len) {
 		set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
-		sent = tty->driver->write(tty, cb->buf + cb->offset, cb->len);
+		sent = tty->ops->write(tty, cb->buf + cb->offset, cb->len);
 		if (sent < 0) {
 			/* error */
 			gig_dbg(DEBUG_OUTPUT, "send_cb: write error %d", sent);
@@ -440,14 +441,14 @@ static int gigaset_set_modem_ctrl(struct cardstate *cs, unsigned old_state, unsi
 	struct tty_struct *tty = cs->hw.ser->tty;
 	unsigned int set, clear;
 
-	if (!tty || !tty->driver || !tty->driver->tiocmset)
-		return -EFAULT;
+	if (!tty || !tty->driver || !tty->ops->tiocmset)
+		return -EINVAL;
 	set = new_state & ~old_state;
 	clear = old_state & ~new_state;
 	if (!set && !clear)
 		return 0;
 	gig_dbg(DEBUG_IF, "tiocmset set %x clear %x", set, clear);
-	return tty->driver->tiocmset(tty, NULL, set, clear);
+	return tty->ops->tiocmset(tty, NULL, set, clear);
 }
 
 static int gigaset_baud_rate(struct cardstate *cs, unsigned cflag)
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 1da55dd2a5a0..82a36266dfc9 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -148,13 +148,13 @@ static void sp_xmit_on_air(unsigned long channel)
 
 	if (((sp->status1 & SIXP_DCD_MASK) == 0) && (random < sp->persistence)) {
 		sp->led_state = 0x70;
-		sp->tty->driver->write(sp->tty, &sp->led_state, 1);
+		sp->tty->ops->write(sp->tty, &sp->led_state, 1);
 		sp->tx_enable = 1;
-		actual = sp->tty->driver->write(sp->tty, sp->xbuff, sp->status2);
+		actual = sp->tty->ops->write(sp->tty, sp->xbuff, sp->status2);
 		sp->xleft -= actual;
 		sp->xhead += actual;
 		sp->led_state = 0x60;
-		sp->tty->driver->write(sp->tty, &sp->led_state, 1);
+		sp->tty->ops->write(sp->tty, &sp->led_state, 1);
 		sp->status2 = 0;
 	} else
 		mod_timer(&sp->tx_t, jiffies + ((when + 1) * HZ) / 100);
@@ -220,13 +220,13 @@ static void sp_encaps(struct sixpack *sp, unsigned char *icp, int len)
 	 */
 	if (sp->duplex == 1) {
 		sp->led_state = 0x70;
-		sp->tty->driver->write(sp->tty, &sp->led_state, 1);
+		sp->tty->ops->write(sp->tty, &sp->led_state, 1);
 		sp->tx_enable = 1;
-		actual = sp->tty->driver->write(sp->tty, sp->xbuff, count);
+		actual = sp->tty->ops->write(sp->tty, sp->xbuff, count);
 		sp->xleft = count - actual;
 		sp->xhead = sp->xbuff + actual;
 		sp->led_state = 0x60;
-		sp->tty->driver->write(sp->tty, &sp->led_state, 1);
+		sp->tty->ops->write(sp->tty, &sp->led_state, 1);
 	} else {
 		sp->xleft = count;
 		sp->xhead = sp->xbuff;
@@ -444,7 +444,7 @@ static void sixpack_write_wakeup(struct tty_struct *tty)
 	}
 
 	if (sp->tx_enable) {
-		actual = tty->driver->write(tty, sp->xhead, sp->xleft);
+		actual = tty->ops->write(tty, sp->xhead, sp->xleft);
 		sp->xleft -= actual;
 		sp->xhead += actual;
 	}
@@ -492,8 +492,8 @@ static void sixpack_receive_buf(struct tty_struct *tty,
 
 	sp_put(sp);
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags)
-	    && tty->driver->unthrottle)
-		tty->driver->unthrottle(tty);
+	    && tty->ops->unthrottle)
+		tty->ops->unthrottle(tty);
 }
 
 /*
@@ -554,8 +554,8 @@ static void resync_tnc(unsigned long channel)
 	/* resync the TNC */
 
 	sp->led_state = 0x60;
-	sp->tty->driver->write(sp->tty, &sp->led_state, 1);
-	sp->tty->driver->write(sp->tty, &resync_cmd, 1);
+	sp->tty->ops->write(sp->tty, &sp->led_state, 1);
+	sp->tty->ops->write(sp->tty, &resync_cmd, 1);
 
 
 	/* Start resync timer again -- the TNC might be still absent */
@@ -573,7 +573,7 @@ static inline int tnc_init(struct sixpack *sp)
 
 	tnc_set_sync_state(sp, TNC_UNSYNC_STARTUP);
 
-	sp->tty->driver->write(sp->tty, &inbyte, 1);
+	sp->tty->ops->write(sp->tty, &inbyte, 1);
 
 	del_timer(&sp->resync_t);
 	sp->resync_t.data = (unsigned long) sp;
@@ -601,6 +601,8 @@ static int sixpack_open(struct tty_struct *tty)
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
+	if (tty->ops->write == NULL)
+		return -EOPNOTSUPP;
 
 	dev = alloc_netdev(sizeof(struct sixpack), "sp%d", sp_setup);
 	if (!dev) {
@@ -914,9 +916,9 @@ static void decode_prio_command(struct sixpack *sp, unsigned char cmd)
 	} else { /* output watchdog char if idle */
 		if ((sp->status2 != 0) && (sp->duplex == 1)) {
 			sp->led_state = 0x70;
-			sp->tty->driver->write(sp->tty, &sp->led_state, 1);
+			sp->tty->ops->write(sp->tty, &sp->led_state, 1);
 			sp->tx_enable = 1;
-			actual = sp->tty->driver->write(sp->tty, sp->xbuff, sp->status2);
+			actual = sp->tty->ops->write(sp->tty, sp->xbuff, sp->status2);
 			sp->xleft -= actual;
 			sp->xhead += actual;
 			sp->led_state = 0x60;
@@ -926,7 +928,7 @@ static void decode_prio_command(struct sixpack *sp, unsigned char cmd)
 	}
 
 	/* needed to trigger the TNC watchdog */
-	sp->tty->driver->write(sp->tty, &sp->led_state, 1);
+	sp->tty->ops->write(sp->tty, &sp->led_state, 1);
 
         /* if the state byte has been received, the TNC is present,
            so the resync timer can be reset. */
@@ -956,12 +958,12 @@ static void decode_std_command(struct sixpack *sp, unsigned char cmd)
 			if ((sp->status & SIXP_RX_DCD_MASK) ==
 				SIXP_RX_DCD_MASK) {
 				sp->led_state = 0x68;
-				sp->tty->driver->write(sp->tty, &sp->led_state, 1);
+				sp->tty->ops->write(sp->tty, &sp->led_state, 1);
 			}
 		} else {
 			sp->led_state = 0x60;
 			/* fill trailing bytes with zeroes */
-			sp->tty->driver->write(sp->tty, &sp->led_state, 1);
+			sp->tty->ops->write(sp->tty, &sp->led_state, 1);
 			rest = sp->rx_count;
 			if (rest != 0)
 				 for (i = rest; i <= 3; i++)
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index 30c9b3b0d131..ebcc5adee7cc 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -516,7 +516,7 @@ static void ax_encaps(struct net_device *dev, unsigned char *icp, int len)
 	spin_unlock_bh(&ax->buflock);
 
 	set_bit(TTY_DO_WRITE_WAKEUP, &ax->tty->flags);
-	actual = ax->tty->driver->write(ax->tty, ax->xbuff, count);
+	actual = ax->tty->ops->write(ax->tty, ax->xbuff, count);
 	ax->stats.tx_packets++;
 	ax->stats.tx_bytes += actual;
 
@@ -546,7 +546,7 @@ static int ax_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 
 		printk(KERN_ERR "mkiss: %s: transmit timed out, %s?\n", dev->name,
-		       (ax->tty->driver->chars_in_buffer(ax->tty) || ax->xleft) ?
+		       (ax->tty->ops->chars_in_buffer(ax->tty) || ax->xleft) ?
 		       "bad line quality" : "driver error");
 
 		ax->xleft = 0;
@@ -736,6 +736,8 @@ static int mkiss_open(struct tty_struct *tty)
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
+	if (tty->ops->write == NULL)
+		return -EOPNOTSUPP;
 
 	dev = alloc_netdev(sizeof(struct mkiss), "ax%d", ax_setup);
 	if (!dev) {
@@ -754,8 +756,7 @@ static int mkiss_open(struct tty_struct *tty)
 	tty->disc_data = ax;
 	tty->receive_room = 65535;
 
-	if (tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
+	tty_driver_flush_buffer(tty);
 
 	/* Restore default settings */
 	dev->type = ARPHRD_AX25;
@@ -936,8 +937,8 @@ static void mkiss_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 
 	mkiss_put(ax);
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags)
-	    && tty->driver->unthrottle)
-		tty->driver->unthrottle(tty);
+	    && tty->ops->unthrottle)
+		tty->ops->unthrottle(tty);
 }
 
 /*
@@ -962,7 +963,7 @@ static void mkiss_write_wakeup(struct tty_struct *tty)
 		goto out;
 	}
 
-	actual = tty->driver->write(tty, ax->xhead, ax->xleft);
+	actual = tty->ops->write(tty, ax->xhead, ax->xleft);
 	ax->xleft -= actual;
 	ax->xhead += actual;
 
diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c
index fc753d7f674e..e6f40b7f9041 100644
--- a/drivers/net/irda/irtty-sir.c
+++ b/drivers/net/irda/irtty-sir.c
@@ -64,7 +64,7 @@ static int irtty_chars_in_buffer(struct sir_dev *dev)
 	IRDA_ASSERT(priv != NULL, return -1;);
 	IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return -1;);
 
-	return priv->tty->driver->chars_in_buffer(priv->tty);
+	return tty_chars_in_buffer(priv->tty);
 }
 
 /* Wait (sleep) until underlaying hardware finished transmission
@@ -93,10 +93,8 @@ static void irtty_wait_until_sent(struct sir_dev *dev)
 	IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return;);
 
 	tty = priv->tty;
-	if (tty->driver->wait_until_sent) {
-		lock_kernel();
-		tty->driver->wait_until_sent(tty, msecs_to_jiffies(100));
-		unlock_kernel();
+	if (tty->ops->wait_until_sent) {
+		tty->ops->wait_until_sent(tty, msecs_to_jiffies(100));
 	}
 	else {
 		msleep(USBSERIAL_TX_DONE_DELAY);
@@ -125,48 +123,14 @@ static int irtty_change_speed(struct sir_dev *dev, unsigned speed)
 
 	tty = priv->tty;
 
-	lock_kernel();
+	mutex_lock(&tty->termios_mutex);
 	old_termios = *(tty->termios);
 	cflag = tty->termios->c_cflag;
-
-	cflag &= ~CBAUD;
-
-	IRDA_DEBUG(2, "%s(), Setting speed to %d\n", __FUNCTION__, speed);
-
-	switch (speed) {
-	case 1200:
-		cflag |= B1200;
-		break;
-	case 2400:
-		cflag |= B2400;
-		break;
-	case 4800:
-		cflag |= B4800;
-		break;
-	case 19200:
-		cflag |= B19200;
-		break;
-	case 38400:
-		cflag |= B38400;
-		break;
-	case 57600:
-		cflag |= B57600;
-		break;
-	case 115200:
-		cflag |= B115200;
-		break;
-	case 9600:
-	default:
-		cflag |= B9600;
-		break;
-	}	
-
-	tty->termios->c_cflag = cflag;
-	if (tty->driver->set_termios)
-		tty->driver->set_termios(tty, &old_termios);
-	unlock_kernel();
-
+	tty_encode_baud_rate(tty, speed, speed);
+	if (tty->ops->set_termios)
+		tty->ops->set_termios(tty, &old_termios);
 	priv->io.speed = speed;
+	mutex_unlock(&tty->termios_mutex);
 
 	return 0;
 }
@@ -202,8 +166,8 @@ static int irtty_set_dtr_rts(struct sir_dev *dev, int dtr, int rts)
 	 * This function is not yet defined for all tty driver, so
 	 * let's be careful... Jean II
 	 */
-	IRDA_ASSERT(priv->tty->driver->tiocmset != NULL, return -1;);
-	priv->tty->driver->tiocmset(priv->tty, NULL, set, clear);
+	IRDA_ASSERT(priv->tty->ops->tiocmset != NULL, return -1;);
+	priv->tty->ops->tiocmset(priv->tty, NULL, set, clear);
 
 	return 0;
 }
@@ -225,17 +189,13 @@ static int irtty_do_write(struct sir_dev *dev, const unsigned char *ptr, size_t
 	IRDA_ASSERT(priv->magic == IRTTY_MAGIC, return -1;);
 
 	tty = priv->tty;
-	if (!tty->driver->write)
+	if (!tty->ops->write)
 		return 0;
 	tty->flags |= (1 << TTY_DO_WRITE_WAKEUP);
-	if (tty->driver->write_room) {
-		writelen = tty->driver->write_room(tty);
-		if (writelen > len)
-			writelen = len;
-	}
-	else
+	writelen = tty_write_room(tty);
+	if (writelen > len)
 		writelen = len;
-	return tty->driver->write(tty, ptr, writelen);
+	return tty->ops->write(tty, ptr, writelen);
 }
 
 /* ------------------------------------------------------- */
@@ -321,7 +281,7 @@ static inline void irtty_stop_receiver(struct tty_struct *tty, int stop)
 	struct ktermios old_termios;
 	int cflag;
 
-	lock_kernel();
+	mutex_lock(&tty->termios_mutex);
 	old_termios = *(tty->termios);
 	cflag = tty->termios->c_cflag;
 	
@@ -331,9 +291,9 @@ static inline void irtty_stop_receiver(struct tty_struct *tty, int stop)
 		cflag |= CREAD;
 
 	tty->termios->c_cflag = cflag;
-	if (tty->driver->set_termios)
-		tty->driver->set_termios(tty, &old_termios);
-	unlock_kernel();
+	if (tty->ops->set_termios)
+		tty->ops->set_termios(tty, &old_termios);
+	mutex_unlock(&tty->termios_mutex);
 }
 
 /*****************************************************************/
@@ -359,8 +319,8 @@ static int irtty_start_dev(struct sir_dev *dev)
 
 	tty = priv->tty;
 
-	if (tty->driver->start)
-		tty->driver->start(tty);
+	if (tty->ops->start)
+		tty->ops->start(tty);
 	/* Make sure we can receive more data */
 	irtty_stop_receiver(tty, FALSE);
 
@@ -388,8 +348,8 @@ static int irtty_stop_dev(struct sir_dev *dev)
 
 	/* Make sure we don't receive more data */
 	irtty_stop_receiver(tty, TRUE);
-	if (tty->driver->stop)
-		tty->driver->stop(tty);
+	if (tty->ops->stop)
+		tty->ops->stop(tty);
 
 	mutex_unlock(&irtty_mutex);
 
@@ -483,11 +443,10 @@ static int irtty_open(struct tty_struct *tty)
 
 	/* stop the underlying  driver */
 	irtty_stop_receiver(tty, TRUE);
-	if (tty->driver->stop)
-		tty->driver->stop(tty);
+	if (tty->ops->stop)
+		tty->ops->stop(tty);
 
-	if (tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
+	tty_driver_flush_buffer(tty);
 	
 	/* apply mtt override */
 	sir_tty_drv.qos_mtt_bits = qos_mtt_bits;
@@ -564,8 +523,8 @@ static void irtty_close(struct tty_struct *tty)
 	/* Stop tty */
 	irtty_stop_receiver(tty, TRUE);
 	tty->flags &= ~(1 << TTY_DO_WRITE_WAKEUP);
-	if (tty->driver->stop)
-		tty->driver->stop(tty);
+	if (tty->ops->stop)
+		tty->ops->stop(tty);
 
 	kfree(priv);
 
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
index f023d5b67e6e..1c4b7e37912c 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -158,6 +158,9 @@ ppp_asynctty_open(struct tty_struct *tty)
 	struct asyncppp *ap;
 	int err;
 
+	if (tty->ops->write == NULL)
+		return -EOPNOTSUPP;
+
 	err = -ENOMEM;
 	ap = kzalloc(sizeof(*ap), GFP_KERNEL);
 	if (!ap)
@@ -359,8 +362,8 @@ ppp_asynctty_receive(struct tty_struct *tty, const unsigned char *buf,
 		tasklet_schedule(&ap->tsk);
 	ap_put(ap);
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags)
-	    && tty->driver->unthrottle)
-		tty->driver->unthrottle(tty);
+	    && tty->ops->unthrottle)
+		tty->ops->unthrottle(tty);
 }
 
 static void
@@ -676,7 +679,7 @@ ppp_async_push(struct asyncppp *ap)
 		if (!tty_stuffed && ap->optr < ap->olim) {
 			avail = ap->olim - ap->optr;
 			set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
-			sent = tty->driver->write(tty, ap->optr, avail);
+			sent = tty->ops->write(tty, ap->optr, avail);
 			if (sent < 0)
 				goto flush;	/* error, e.g. loss of CD */
 			ap->optr += sent;
diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c
index 0d80fa546719..48ed5fdbfe18 100644
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -207,6 +207,9 @@ ppp_sync_open(struct tty_struct *tty)
 	struct syncppp *ap;
 	int err;
 
+	if (tty->ops->write == NULL)
+		return -EOPNOTSUPP;
+
 	ap = kzalloc(sizeof(*ap), GFP_KERNEL);
 	err = -ENOMEM;
 	if (!ap)
@@ -399,8 +402,8 @@ ppp_sync_receive(struct tty_struct *tty, const unsigned char *buf,
 		tasklet_schedule(&ap->tsk);
 	sp_put(ap);
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags)
-	    && tty->driver->unthrottle)
-		tty->driver->unthrottle(tty);
+	    && tty->ops->unthrottle)
+		tty->ops->unthrottle(tty);
 }
 
 static void
@@ -653,7 +656,7 @@ ppp_sync_push(struct syncppp *ap)
 			tty_stuffed = 0;
 		if (!tty_stuffed && ap->tpkt) {
 			set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
-			sent = tty->driver->write(tty, ap->tpkt->data, ap->tpkt->len);
+			sent = tty->ops->write(tty, ap->tpkt->data, ap->tpkt->len);
 			if (sent < 0)
 				goto flush;	/* error, e.g. loss of CD */
 			if (sent < ap->tpkt->len) {
diff --git a/drivers/net/slip.c b/drivers/net/slip.c
index 5a55ede352f4..84af68fdb6c2 100644
--- a/drivers/net/slip.c
+++ b/drivers/net/slip.c
@@ -396,14 +396,14 @@ static void sl_encaps(struct slip *sl, unsigned char *icp, int len)
 
 	/* Order of next two lines is *very* important.
 	 * When we are sending a little amount of data,
-	 * the transfer may be completed inside driver.write()
+	 * the transfer may be completed inside the ops->write()
 	 * routine, because it's running with interrupts enabled.
 	 * In this case we *never* got WRITE_WAKEUP event,
 	 * if we did not request it before write operation.
 	 *       14 Oct 1994  Dmitry Gorodchanin.
 	 */
 	sl->tty->flags |= (1 << TTY_DO_WRITE_WAKEUP);
-	actual = sl->tty->driver->write(sl->tty, sl->xbuff, count);
+	actual = sl->tty->ops->write(sl->tty, sl->xbuff, count);
 #ifdef SL_CHECK_TRANSMIT
 	sl->dev->trans_start = jiffies;
 #endif
@@ -437,7 +437,7 @@ static void slip_write_wakeup(struct tty_struct *tty)
 		return;
 	}
 
-	actual = tty->driver->write(tty, sl->xhead, sl->xleft);
+	actual = tty->ops->write(tty, sl->xhead, sl->xleft);
 	sl->xleft -= actual;
 	sl->xhead += actual;
 }
@@ -462,7 +462,7 @@ static void sl_tx_timeout(struct net_device *dev)
 		}
 		printk(KERN_WARNING "%s: transmit timed out, %s?\n",
 			dev->name,
-			(sl->tty->driver->chars_in_buffer(sl->tty) || sl->xleft) ?
+			(tty_chars_in_buffer(sl->tty) || sl->xleft) ?
 				"bad line quality" : "driver error");
 		sl->xleft = 0;
 		sl->tty->flags &= ~(1 << TTY_DO_WRITE_WAKEUP);
@@ -830,6 +830,9 @@ static int slip_open(struct tty_struct *tty)
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
+	if (tty->ops->write == NULL)
+		return -EOPNOTSUPP;
+
 	/* RTnetlink lock is misused here to serialize concurrent
 	   opens of slip channels. There are better ways, but it is
 	   the simplest one.
@@ -1432,7 +1435,7 @@ static void sl_outfill(unsigned long sls)
 			/* put END into tty queue. Is it right ??? */
 			if (!netif_queue_stopped(sl->dev)) {
 				/* if device busy no outfill */
-				sl->tty->driver->write(sl->tty, &s, 1);
+				sl->tty->ops->write(sl->tty, &s, 1);
 			}
 		} else
 			set_bit(SLF_OUTWAIT, &sl->flags);
diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c
index 0f8aca8a4d43..249e18053d5f 100644
--- a/drivers/net/wan/x25_asy.c
+++ b/drivers/net/wan/x25_asy.c
@@ -17,7 +17,7 @@
 #include <linux/module.h>
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/bitops.h>
 #include <linux/string.h>
 #include <linux/mm.h>
@@ -95,7 +95,7 @@ static struct x25_asy *x25_asy_alloc(void)
 			x25_asy_devs[i] = dev;
 			return sl;
 		} else {
-			printk("x25_asy_alloc() - register_netdev() failure.\n");
+			printk(KERN_WARNING "x25_asy_alloc() - register_netdev() failure.\n");
 			free_netdev(dev);
 		}
 	}
@@ -112,23 +112,22 @@ static void x25_asy_free(struct x25_asy *sl)
 	kfree(sl->xbuff);
 	sl->xbuff = NULL;
 
-	if (!test_and_clear_bit(SLF_INUSE, &sl->flags)) {
-		printk("%s: x25_asy_free for already free unit.\n", sl->dev->name);
-	}
+	if (!test_and_clear_bit(SLF_INUSE, &sl->flags))
+		printk(KERN_ERR "%s: x25_asy_free for already free unit.\n",
+			sl->dev->name);
 }
 
 static int x25_asy_change_mtu(struct net_device *dev, int newmtu)
 {
 	struct x25_asy *sl = dev->priv;
 	unsigned char *xbuff, *rbuff;
-	int len = 2* newmtu;
+	int len = 2 * newmtu;
 
 	xbuff = kmalloc(len + 4, GFP_ATOMIC);
 	rbuff = kmalloc(len + 4, GFP_ATOMIC);
 
-	if (xbuff == NULL || rbuff == NULL)  
-	{
-		printk("%s: unable to grow X.25 buffers, MTU change cancelled.\n",
+	if (xbuff == NULL || rbuff == NULL) {
+		printk(KERN_WARNING "%s: unable to grow X.25 buffers, MTU change cancelled.\n",
 		       dev->name);
 		kfree(xbuff);
 		kfree(rbuff);
@@ -193,25 +192,23 @@ static void x25_asy_bump(struct x25_asy *sl)
 	int err;
 
 	count = sl->rcount;
-	sl->stats.rx_bytes+=count;
-	
+	sl->stats.rx_bytes += count;
+
 	skb = dev_alloc_skb(count+1);
-	if (skb == NULL)  
-	{
-		printk("%s: memory squeeze, dropping packet.\n", sl->dev->name);
+	if (skb == NULL) {
+		printk(KERN_WARNING "%s: memory squeeze, dropping packet.\n",
+			sl->dev->name);
 		sl->stats.rx_dropped++;
 		return;
 	}
-	skb_push(skb,1);	/* LAPB internal control */
-	memcpy(skb_put(skb,count), sl->rbuff, count);
+	skb_push(skb, 1);	/* LAPB internal control */
+	memcpy(skb_put(skb, count), sl->rbuff, count);
 	skb->protocol = x25_type_trans(skb, sl->dev);
-	if((err=lapb_data_received(skb->dev, skb))!=LAPB_OK)
-	{
+	err = lapb_data_received(skb->dev, skb);
+	if (err != LAPB_OK) {
 		kfree_skb(skb);
-		printk(KERN_DEBUG "x25_asy: data received err - %d\n",err);
-	}
-	else
-	{
+		printk(KERN_DEBUG "x25_asy: data received err - %d\n", err);
+	} else {
 		netif_rx(skb);
 		sl->dev->last_rx = jiffies;
 		sl->stats.rx_packets++;
@@ -224,10 +221,11 @@ static void x25_asy_encaps(struct x25_asy *sl, unsigned char *icp, int len)
 	unsigned char *p;
 	int actual, count, mtu = sl->dev->mtu;
 
-	if (len > mtu) 
-	{		/* Sigh, shouldn't occur BUT ... */
+	if (len > mtu) {
+		/* Sigh, shouldn't occur BUT ... */
 		len = mtu;
-		printk ("%s: truncating oversized transmit packet!\n", sl->dev->name);
+		printk(KERN_DEBUG "%s: truncating oversized transmit packet!\n",
+					sl->dev->name);
 		sl->stats.tx_dropped++;
 		x25_asy_unlock(sl);
 		return;
@@ -245,7 +243,7 @@ static void x25_asy_encaps(struct x25_asy *sl, unsigned char *icp, int len)
 	 *       14 Oct 1994  Dmitry Gorodchanin.
 	 */
 	sl->tty->flags |= (1 << TTY_DO_WRITE_WAKEUP);
-	actual = sl->tty->driver->write(sl->tty, sl->xbuff, count);
+	actual = sl->tty->ops->write(sl->tty, sl->xbuff, count);
 	sl->xleft = count - actual;
 	sl->xhead = sl->xbuff + actual;
 	/* VSV */
@@ -265,8 +263,7 @@ static void x25_asy_write_wakeup(struct tty_struct *tty)
 	if (!sl || sl->magic != X25_ASY_MAGIC || !netif_running(sl->dev))
 		return;
 
-	if (sl->xleft <= 0)  
-	{
+	if (sl->xleft <= 0) {
 		/* Now serial buffer is almost free & we can start
 		 * transmission of another packet */
 		sl->stats.tx_packets++;
@@ -275,14 +272,14 @@ static void x25_asy_write_wakeup(struct tty_struct *tty)
 		return;
 	}
 
-	actual = tty->driver->write(tty, sl->xhead, sl->xleft);
+	actual = tty->ops->write(tty, sl->xhead, sl->xleft);
 	sl->xleft -= actual;
 	sl->xhead += actual;
 }
 
 static void x25_asy_timeout(struct net_device *dev)
 {
-	struct x25_asy *sl = (struct x25_asy*)(dev->priv);
+	struct x25_asy *sl = dev->priv;
 
 	spin_lock(&sl->lock);
 	if (netif_queue_stopped(dev)) {
@@ -290,7 +287,7 @@ static void x25_asy_timeout(struct net_device *dev)
 		 *      14 Oct 1994 Dmitry Gorodchanin.
 		 */
 		printk(KERN_WARNING "%s: transmit timed out, %s?\n", dev->name,
-		       (sl->tty->driver->chars_in_buffer(sl->tty) || sl->xleft) ?
+		       (tty_chars_in_buffer(sl->tty) || sl->xleft) ?
 		       "bad line quality" : "driver error");
 		sl->xleft = 0;
 		sl->tty->flags &= ~(1 << TTY_DO_WRITE_WAKEUP);
@@ -303,31 +300,34 @@ static void x25_asy_timeout(struct net_device *dev)
 
 static int x25_asy_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct x25_asy *sl = (struct x25_asy*)(dev->priv);
+	struct x25_asy *sl = dev->priv;
 	int err;
 
 	if (!netif_running(sl->dev)) {
-		printk("%s: xmit call when iface is down\n", dev->name);
+		printk(KERN_ERR "%s: xmit call when iface is down\n",
+			dev->name);
 		kfree_skb(skb);
 		return 0;
 	}
-	
-	switch(skb->data[0])
-	{
-		case 0x00:break;
-		case 0x01: /* Connection request .. do nothing */
-			if((err=lapb_connect_request(dev))!=LAPB_OK)
-				printk(KERN_ERR "x25_asy: lapb_connect_request error - %d\n", err);
-			kfree_skb(skb);
-			return 0;
-		case 0x02: /* Disconnect request .. do nothing - hang up ?? */
-			if((err=lapb_disconnect_request(dev))!=LAPB_OK)
-				printk(KERN_ERR "x25_asy: lapb_disconnect_request error - %d\n", err);
-		default:
-			kfree_skb(skb);
-			return  0;
+
+	switch (skb->data[0]) {
+	case 0x00:
+		break;
+	case 0x01: /* Connection request .. do nothing */
+		err = lapb_connect_request(dev);
+		if (err != LAPB_OK)
+			printk(KERN_ERR "x25_asy: lapb_connect_request error - %d\n", err);
+		kfree_skb(skb);
+		return 0;
+	case 0x02: /* Disconnect request .. do nothing - hang up ?? */
+		err = lapb_disconnect_request(dev);
+		if (err != LAPB_OK)
+			printk(KERN_ERR "x25_asy: lapb_disconnect_request error - %d\n", err);
+	default:
+		kfree_skb(skb);
+		return  0;
 	}
-	skb_pull(skb,1);	/* Remove control byte */
+	skb_pull(skb, 1);	/* Remove control byte */
 	/*
 	 * If we are busy already- too bad.  We ought to be able
 	 * to queue things at this point, to allow for a little
@@ -338,10 +338,10 @@ static int x25_asy_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * So, no queues !
 	 *        14 Oct 1994  Dmitry Gorodchanin.
 	 */
-	
-	if((err=lapb_data_request(dev,skb))!=LAPB_OK)
-	{
-		printk(KERN_ERR "lapbeth: lapb_data_request error - %d\n", err);
+
+	err = lapb_data_request(dev, skb);
+	if (err != LAPB_OK) {
+		printk(KERN_ERR "x25_asy: lapb_data_request error - %d\n", err);
 		kfree_skb(skb);
 		return 0;
 	}
@@ -357,7 +357,7 @@ static int x25_asy_xmit(struct sk_buff *skb, struct net_device *dev)
  *	Called when I frame data arrives. We did the work above - throw it
  *	at the net layer.
  */
-  
+
 static int x25_asy_data_indication(struct net_device *dev, struct sk_buff *skb)
 {
 	skb->dev->last_rx = jiffies;
@@ -369,24 +369,22 @@ static int x25_asy_data_indication(struct net_device *dev, struct sk_buff *skb)
  *	busy cases too well. Its tricky to see how to do this nicely -
  *	perhaps lapb should allow us to bounce this ?
  */
- 
+
 static void x25_asy_data_transmit(struct net_device *dev, struct sk_buff *skb)
 {
-	struct x25_asy *sl=dev->priv;
-	
+	struct x25_asy *sl = dev->priv;
+
 	spin_lock(&sl->lock);
-	if (netif_queue_stopped(sl->dev) || sl->tty == NULL)
-	{
+	if (netif_queue_stopped(sl->dev) || sl->tty == NULL) {
 		spin_unlock(&sl->lock);
 		printk(KERN_ERR "x25_asy: tbusy drop\n");
 		kfree_skb(skb);
 		return;
 	}
 	/* We were not busy, so we are now... :-) */
-	if (skb != NULL) 
-	{
+	if (skb != NULL) {
 		x25_asy_lock(sl);
-		sl->stats.tx_bytes+=skb->len;
+		sl->stats.tx_bytes += skb->len;
 		x25_asy_encaps(sl, skb->data, skb->len);
 		dev_kfree_skb(skb);
 	}
@@ -396,15 +394,16 @@ static void x25_asy_data_transmit(struct net_device *dev, struct sk_buff *skb)
 /*
  *	LAPB connection establish/down information.
  */
- 
+
 static void x25_asy_connected(struct net_device *dev, int reason)
 {
 	struct x25_asy *sl = dev->priv;
 	struct sk_buff *skb;
 	unsigned char *ptr;
 
-	if ((skb = dev_alloc_skb(1)) == NULL) {
-		printk(KERN_ERR "lapbeth: out of memory\n");
+	skb = dev_alloc_skb(1);
+	if (skb == NULL) {
+		printk(KERN_ERR "x25_asy: out of memory\n");
 		return;
 	}
 
@@ -422,7 +421,8 @@ static void x25_asy_disconnected(struct net_device *dev, int reason)
 	struct sk_buff *skb;
 	unsigned char *ptr;
 
-	if ((skb = dev_alloc_skb(1)) == NULL) {
+	skb = dev_alloc_skb(1);
+	if (skb == NULL) {
 		printk(KERN_ERR "x25_asy: out of memory\n");
 		return;
 	}
@@ -449,7 +449,7 @@ static struct lapb_register_struct x25_asy_callbacks = {
 /* Open the low-level part of the X.25 channel. Easy! */
 static int x25_asy_open(struct net_device *dev)
 {
-	struct x25_asy *sl = (struct x25_asy*)(dev->priv);
+	struct x25_asy *sl = dev->priv;
 	unsigned long len;
 	int err;
 
@@ -466,13 +466,11 @@ static int x25_asy_open(struct net_device *dev)
 	len = dev->mtu * 2;
 
 	sl->rbuff = kmalloc(len + 4, GFP_KERNEL);
-	if (sl->rbuff == NULL)   {
+	if (sl->rbuff == NULL)
 		goto norbuff;
-	}
 	sl->xbuff = kmalloc(len + 4, GFP_KERNEL);
-	if (sl->xbuff == NULL)   {
+	if (sl->xbuff == NULL)
 		goto noxbuff;
-	}
 
 	sl->buffsize = len;
 	sl->rcount   = 0;
@@ -480,11 +478,12 @@ static int x25_asy_open(struct net_device *dev)
 	sl->flags   &= (1 << SLF_INUSE);      /* Clear ESCAPE & ERROR flags */
 
 	netif_start_queue(dev);
-			
+
 	/*
 	 *	Now attach LAPB
 	 */
-	if((err=lapb_register(dev, &x25_asy_callbacks))==LAPB_OK)
+	err = lapb_register(dev, &x25_asy_callbacks);
+	if (err == LAPB_OK)
 		return 0;
 
 	/* Cleanup */
@@ -499,18 +498,20 @@ norbuff:
 /* Close the low-level part of the X.25 channel. Easy! */
 static int x25_asy_close(struct net_device *dev)
 {
-	struct x25_asy *sl = (struct x25_asy*)(dev->priv);
+	struct x25_asy *sl = dev->priv;
 	int err;
 
 	spin_lock(&sl->lock);
-	if (sl->tty) 
+	if (sl->tty)
 		sl->tty->flags &= ~(1 << TTY_DO_WRITE_WAKEUP);
 
 	netif_stop_queue(dev);
 	sl->rcount = 0;
 	sl->xleft  = 0;
-	if((err=lapb_unregister(dev))!=LAPB_OK)
-		printk(KERN_ERR "x25_asy_close: lapb_unregister error -%d\n",err);
+	err = lapb_unregister(dev);
+	if (err != LAPB_OK)
+		printk(KERN_ERR "x25_asy_close: lapb_unregister error -%d\n",
+			err);
 	spin_unlock(&sl->lock);
 	return 0;
 }
@@ -521,8 +522,9 @@ static int x25_asy_close(struct net_device *dev)
  * a block of X.25 data has been received, which can now be decapsulated
  * and sent on to some IP layer for further processing.
  */
- 
-static void x25_asy_receive_buf(struct tty_struct *tty, const unsigned char *cp, char *fp, int count)
+
+static void x25_asy_receive_buf(struct tty_struct *tty,
+				const unsigned char *cp, char *fp, int count)
 {
 	struct x25_asy *sl = (struct x25_asy *) tty->disc_data;
 
@@ -533,9 +535,8 @@ static void x25_asy_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 	/* Read the characters out of the buffer */
 	while (count--) {
 		if (fp && *fp++) {
-			if (!test_and_set_bit(SLF_ERROR, &sl->flags))  {
+			if (!test_and_set_bit(SLF_ERROR, &sl->flags))
 				sl->stats.rx_errors++;
-			}
 			cp++;
 			continue;
 		}
@@ -556,31 +557,31 @@ static int x25_asy_open_tty(struct tty_struct *tty)
 	struct x25_asy *sl = (struct x25_asy *) tty->disc_data;
 	int err;
 
+	if (tty->ops->write == NULL)
+		return -EOPNOTSUPP;
+
 	/* First make sure we're not already connected. */
-	if (sl && sl->magic == X25_ASY_MAGIC) {
+	if (sl && sl->magic == X25_ASY_MAGIC)
 		return -EEXIST;
-	}
 
 	/* OK.  Find a free X.25 channel to use. */
-	if ((sl = x25_asy_alloc()) == NULL) {
+	sl = x25_asy_alloc();
+	if (sl == NULL)
 		return -ENFILE;
-	}
 
 	sl->tty = tty;
 	tty->disc_data = sl;
 	tty->receive_room = 65536;
-	if (tty->driver->flush_buffer)  {
-		tty->driver->flush_buffer(tty);
-	}
+	tty_driver_flush_buffer(tty);
 	tty_ldisc_flush(tty);
 
 	/* Restore default settings */
 	sl->dev->type = ARPHRD_X25;
-	
+
 	/* Perform the low-level X.25 async init */
-	if ((err = x25_asy_open(sl->dev)))
+	err = x25_asy_open(sl->dev);
+	if (err)
 		return err;
-
 	/* Done.  We have linked the TTY line to a channel. */
 	return sl->dev->base_addr;
 }
@@ -601,9 +602,7 @@ static void x25_asy_close_tty(struct tty_struct *tty)
 		return;
 
 	if (sl->dev->flags & IFF_UP)
-	{
-		(void) dev_close(sl->dev);
-	}
+		dev_close(sl->dev);
 
 	tty->disc_data = NULL;
 	sl->tty = NULL;
@@ -613,8 +612,7 @@ static void x25_asy_close_tty(struct tty_struct *tty)
 
 static struct net_device_stats *x25_asy_get_stats(struct net_device *dev)
 {
-	struct x25_asy *sl = (struct x25_asy*)(dev->priv);
-
+	struct x25_asy *sl = dev->priv;
 	return &sl->stats;
 }
 
@@ -641,21 +639,19 @@ int x25_asy_esc(unsigned char *s, unsigned char *d, int len)
 	 * character sequence, according to the X.25 protocol.
 	 */
 
-	while (len-- > 0) 
-	{
-		switch(c = *s++) 
-		{
-			case X25_END:
-				*ptr++ = X25_ESC;
-				*ptr++ = X25_ESCAPE(X25_END);
-				break;
-			case X25_ESC:
-				*ptr++ = X25_ESC;
-				*ptr++ = X25_ESCAPE(X25_ESC);
-				break;
-			 default:
-				*ptr++ = c;
-				break;
+	while (len-- > 0) {
+		switch (c = *s++) {
+		case X25_END:
+			*ptr++ = X25_ESC;
+			*ptr++ = X25_ESCAPE(X25_END);
+			break;
+		case X25_ESC:
+			*ptr++ = X25_ESC;
+			*ptr++ = X25_ESCAPE(X25_ESC);
+			break;
+		default:
+			*ptr++ = c;
+			break;
 		}
 	}
 	*ptr++ = X25_END;
@@ -665,31 +661,25 @@ int x25_asy_esc(unsigned char *s, unsigned char *d, int len)
 static void x25_asy_unesc(struct x25_asy *sl, unsigned char s)
 {
 
-	switch(s) 
-	{
-		case X25_END:
-			if (!test_and_clear_bit(SLF_ERROR, &sl->flags) && (sl->rcount > 2))  
-			{
-				x25_asy_bump(sl);
-			}
-			clear_bit(SLF_ESCAPE, &sl->flags);
-			sl->rcount = 0;
-			return;
-
-		case X25_ESC:
-			set_bit(SLF_ESCAPE, &sl->flags);
-			return;
-			
-		case X25_ESCAPE(X25_ESC):
-		case X25_ESCAPE(X25_END):
-			if (test_and_clear_bit(SLF_ESCAPE, &sl->flags))
-				s = X25_UNESCAPE(s);
-			break;
-	}
-	if (!test_bit(SLF_ERROR, &sl->flags))  
-	{
-		if (sl->rcount < sl->buffsize)  
-		{
+	switch (s) {
+	case X25_END:
+		if (!test_and_clear_bit(SLF_ERROR, &sl->flags)
+			&& sl->rcount > 2)
+			x25_asy_bump(sl);
+		clear_bit(SLF_ESCAPE, &sl->flags);
+		sl->rcount = 0;
+		return;
+	case X25_ESC:
+		set_bit(SLF_ESCAPE, &sl->flags);
+		return;
+	case X25_ESCAPE(X25_ESC):
+	case X25_ESCAPE(X25_END):
+		if (test_and_clear_bit(SLF_ESCAPE, &sl->flags))
+			s = X25_UNESCAPE(s);
+		break;
+	}
+	if (!test_bit(SLF_ERROR, &sl->flags)) {
+		if (sl->rcount < sl->buffsize) {
 			sl->rbuff[sl->rcount++] = s;
 			return;
 		}
@@ -709,7 +699,7 @@ static int x25_asy_ioctl(struct tty_struct *tty, struct file *file,
 	if (!sl || sl->magic != X25_ASY_MAGIC)
 		return -EINVAL;
 
-	switch(cmd) {
+	switch (cmd) {
 	case SIOCGIFNAME:
 		if (copy_to_user((void __user *)arg, sl->dev->name,
 					strlen(sl->dev->name) + 1))
@@ -724,8 +714,8 @@ static int x25_asy_ioctl(struct tty_struct *tty, struct file *file,
 
 static int x25_asy_open_dev(struct net_device *dev)
 {
-	struct x25_asy *sl = (struct x25_asy*)(dev->priv);
-	if(sl->tty==NULL)
+	struct x25_asy *sl = dev->priv;
+	if (sl->tty == NULL)
 		return -ENODEV;
 	return 0;
 }
@@ -741,9 +731,9 @@ static void x25_asy_setup(struct net_device *dev)
 	set_bit(SLF_INUSE, &sl->flags);
 
 	/*
-	 *	Finish setting up the DEVICE info. 
+	 *	Finish setting up the DEVICE info.
 	 */
-	 
+
 	dev->mtu		= SL_MTU;
 	dev->hard_start_xmit	= x25_asy_xmit;
 	dev->tx_timeout		= x25_asy_timeout;
@@ -778,9 +768,10 @@ static int __init init_x25_asy(void)
 		x25_asy_maxdev = 4; /* Sanity */
 
 	printk(KERN_INFO "X.25 async: version 0.00 ALPHA "
-			"(dynamic channels, max=%d).\n", x25_asy_maxdev );
+			"(dynamic channels, max=%d).\n", x25_asy_maxdev);
 
-	x25_asy_devs = kcalloc(x25_asy_maxdev, sizeof(struct net_device*), GFP_KERNEL);
+	x25_asy_devs = kcalloc(x25_asy_maxdev, sizeof(struct net_device *),
+				GFP_KERNEL);
 	if (!x25_asy_devs) {
 		printk(KERN_WARNING "X25 async: Can't allocate x25_asy_ctrls[] "
 				"array! Uaargh! (-> No X.25 available)\n");
@@ -802,7 +793,7 @@ static void __exit exit_x25_asy(void)
 			struct x25_asy *sl = dev->priv;
 
 			spin_lock_bh(&sl->lock);
-			if (sl->tty) 
+			if (sl->tty)
 				tty_hangup(sl->tty);
 
 			spin_unlock_bh(&sl->lock);
diff --git a/drivers/serial/kgdboc.c b/drivers/serial/kgdboc.c
index 9cf03327386a..eadc1ab6bbce 100644
--- a/drivers/serial/kgdboc.c
+++ b/drivers/serial/kgdboc.c
@@ -96,12 +96,14 @@ static void cleanup_kgdboc(void)
 
 static int kgdboc_get_char(void)
 {
-	return kgdb_tty_driver->poll_get_char(kgdb_tty_driver, kgdb_tty_line);
+	return kgdb_tty_driver->ops->poll_get_char(kgdb_tty_driver,
+						kgdb_tty_line);
 }
 
 static void kgdboc_put_char(u8 chr)
 {
-	kgdb_tty_driver->poll_put_char(kgdb_tty_driver, kgdb_tty_line, chr);
+	kgdb_tty_driver->ops->poll_put_char(kgdb_tty_driver,
+					kgdb_tty_line, chr);
 }
 
 static int param_set_kgdboc_var(const char *kmessage, struct kernel_param *kp)
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 6c7a5cf76582..1e2b9d826f69 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -532,15 +532,25 @@ uart_write(struct tty_struct *tty, const unsigned char *buf, int count)
 static int uart_write_room(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
+	unsigned long flags;
+	int ret;
 
-	return uart_circ_chars_free(&state->info->xmit);
+	spin_lock_irqsave(&state->port->lock, flags);
+	ret = uart_circ_chars_free(&state->info->xmit);
+	spin_unlock_irqrestore(&state->port->lock, flags);
+	return ret;
 }
 
 static int uart_chars_in_buffer(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
+	unsigned long flags;
+	int ret;
 
-	return uart_circ_chars_pending(&state->info->xmit);
+	spin_lock_irqsave(&state->port->lock, flags);
+	ret = uart_circ_chars_pending(&state->info->xmit);
+	spin_unlock_irqrestore(&state->port->lock, flags);
+	return ret;
 }
 
 static void uart_flush_buffer(struct tty_struct *tty)
@@ -622,6 +632,11 @@ static int uart_get_info(struct uart_state *state,
 	struct serial_struct tmp;
 
 	memset(&tmp, 0, sizeof(tmp));
+
+	/* Ensure the state we copy is consistent and no hardware changes
+	   occur as we go */
+	mutex_lock(&state->mutex);
+
 	tmp.type	    = port->type;
 	tmp.line	    = port->line;
 	tmp.port	    = port->iobase;
@@ -641,6 +656,8 @@ static int uart_get_info(struct uart_state *state,
 	tmp.iomem_reg_shift = port->regshift;
 	tmp.iomem_base      = (void *)(unsigned long)port->mapbase;
 
+	mutex_unlock(&state->mutex);
+
 	if (copy_to_user(retinfo, &tmp, sizeof(*retinfo)))
 		return -EFAULT;
 	return 0;
@@ -918,14 +935,12 @@ static void uart_break_ctl(struct tty_struct *tty, int break_state)
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port = state->port;
 
-	lock_kernel();
 	mutex_lock(&state->mutex);
 
 	if (port->type != PORT_UNKNOWN)
 		port->ops->break_ctl(port, break_state);
 
 	mutex_unlock(&state->mutex);
-	unlock_kernel();
 }
 
 static int uart_do_autoconfig(struct uart_state *state)
@@ -1074,7 +1089,6 @@ uart_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd,
 	int ret = -ENOIOCTLCMD;
 
 
-	lock_kernel();
 	/*
 	 * These ioctls don't rely on the hardware to be present.
 	 */
@@ -1144,10 +1158,9 @@ uart_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd,
 		break;
 	}
 	}
- out_up:
+out_up:
 	mutex_unlock(&state->mutex);
- out:
-	unlock_kernel();
+out:
 	return ret;
 }
 
@@ -1173,7 +1186,6 @@ static void uart_set_termios(struct tty_struct *tty,
 		return;
 	}
 
-	lock_kernel();
 	uart_change_speed(state, old_termios);
 
 	/* Handle transition to B0 status */
@@ -1206,7 +1218,6 @@ static void uart_set_termios(struct tty_struct *tty,
 		}
 		spin_unlock_irqrestore(&state->port->lock, flags);
 	}
-	unlock_kernel();
 #if 0
 	/*
 	 * No need to wake up processes in open wait, since they
@@ -1322,11 +1333,11 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 	struct uart_port *port = state->port;
 	unsigned long char_time, expire;
 
-	BUG_ON(!kernel_locked());
-
 	if (port->type == PORT_UNKNOWN || port->fifosize == 0)
 		return;
 
+	lock_kernel();
+
 	/*
 	 * Set the check interval to be 1/5 of the estimated time to
 	 * send a single character, and make it at least 1.  The check
@@ -1372,6 +1383,7 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 			break;
 	}
 	set_current_state(TASK_RUNNING); /* might not be needed */
+	unlock_kernel();
 }
 
 /*
@@ -2085,7 +2097,9 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 		int ret;
 
 		uart_change_pm(state, 0);
+		spin_lock_irq(&port->lock);
 		ops->set_mctrl(port, 0);
+		spin_unlock_irq(&port->lock);
 		ret = ops->startup(port);
 		if (ret == 0) {
 			uart_change_speed(state, NULL);
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index d17d1645714f..04a56f300ea6 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -1421,8 +1421,7 @@ static void digi_close(struct usb_serial_port *port, struct file *filp)
 		tty_wait_until_sent(tty, DIGI_CLOSE_TIMEOUT);
 
 	/* flush driver and line discipline buffers */
-	if (tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
+	tty_driver_flush_buffer(tty);
 	tty_ldisc_flush(tty);
 
 	if (port->serial->dev) {
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index a9934a3f9845..0cb0d77dc429 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -296,16 +296,14 @@ static int serial_write (struct tty_struct * tty, const unsigned char *buf, int
 	struct usb_serial_port *port = tty->driver_data;
 	int retval = -ENODEV;
 
-	if (!port || port->serial->dev->state == USB_STATE_NOTATTACHED)
+	if (port->serial->dev->state == USB_STATE_NOTATTACHED)
 		goto exit;
 
 	dbg("%s - port %d, %d byte(s)", __func__, port->number, count);
 
-	if (!port->open_count) {
-		retval = -EINVAL;
-		dbg("%s - port not opened", __func__);
-		goto exit;
-	}
+	/* open_count is managed under the mutex lock for the tty so cannot
+           drop to zero until after the last close completes */
+	WARN_ON(!port->open_count);
 
 	/* pass on to the driver specific version of this function */
 	retval = port->serial->type->write(port, buf, count);
@@ -317,61 +315,28 @@ exit:
 static int serial_write_room (struct tty_struct *tty) 
 {
 	struct usb_serial_port *port = tty->driver_data;
-	int retval = -ENODEV;
-
-	if (!port)
-		goto exit;
-
 	dbg("%s - port %d", __func__, port->number);
-
-	if (!port->open_count) {
-		dbg("%s - port not open", __func__);
-		goto exit;
-	}
-
+	WARN_ON(!port->open_count);
 	/* pass on to the driver specific version of this function */
-	retval = port->serial->type->write_room(port);
-
-exit:
-	return retval;
+	return port->serial->type->write_room(port);
 }
 
 static int serial_chars_in_buffer (struct tty_struct *tty) 
 {
 	struct usb_serial_port *port = tty->driver_data;
-	int retval = -ENODEV;
-
-	if (!port)
-		goto exit;
-
 	dbg("%s = port %d", __func__, port->number);
 
-	if (!port->open_count) {
-		dbg("%s - port not open", __func__);
-		goto exit;
-	}
-
+	WARN_ON(!port->open_count);
 	/* pass on to the driver specific version of this function */
-	retval = port->serial->type->chars_in_buffer(port);
-
-exit:
-	return retval;
+	return port->serial->type->chars_in_buffer(port);
 }
 
 static void serial_throttle (struct tty_struct * tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
-
-	if (!port)
-		return;
-
 	dbg("%s - port %d", __func__, port->number);
 
-	if (!port->open_count) {
-		dbg ("%s - port not open", __func__);
-		return;
-	}
-
+	WARN_ON(!port->open_count);
 	/* pass on to the driver specific version of this function */
 	if (port->serial->type->throttle)
 		port->serial->type->throttle(port);
@@ -380,17 +345,9 @@ static void serial_throttle (struct tty_struct * tty)
 static void serial_unthrottle (struct tty_struct * tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
-
-	if (!port)
-		return;
-
 	dbg("%s - port %d", __func__, port->number);
 
-	if (!port->open_count) {
-		dbg("%s - port not open", __func__);
-		return;
-	}
-
+	WARN_ON(!port->open_count);
 	/* pass on to the driver specific version of this function */
 	if (port->serial->type->unthrottle)
 		port->serial->type->unthrottle(port);
@@ -401,42 +358,27 @@ static int serial_ioctl (struct tty_struct *tty, struct file * file, unsigned in
 	struct usb_serial_port *port = tty->driver_data;
 	int retval = -ENODEV;
 
-	lock_kernel();
-	if (!port)
-		goto exit;
-
 	dbg("%s - port %d, cmd 0x%.4x", __func__, port->number, cmd);
 
-	/* Caution - port->open_count is BKL protected */
-	if (!port->open_count) {
-		dbg ("%s - port not open", __func__);
-		goto exit;
-	}
+	WARN_ON(!port->open_count);
 
 	/* pass on to the driver specific version of this function if it is available */
-	if (port->serial->type->ioctl)
+	if (port->serial->type->ioctl) {
+		lock_kernel();
 		retval = port->serial->type->ioctl(port, file, cmd, arg);
+		unlock_kernel();
+	}
 	else
 		retval = -ENOIOCTLCMD;
-exit:
-	unlock_kernel();
 	return retval;
 }
 
 static void serial_set_termios (struct tty_struct *tty, struct ktermios * old)
 {
 	struct usb_serial_port *port = tty->driver_data;
-
-	if (!port)
-		return;
-
 	dbg("%s - port %d", __func__, port->number);
 
-	if (!port->open_count) {
-		dbg("%s - port not open", __func__);
-		return;
-	}
-
+	WARN_ON(!port->open_count);
 	/* pass on to the driver specific version of this function if it is available */
 	if (port->serial->type->set_termios)
 		port->serial->type->set_termios(port, old);
@@ -448,24 +390,15 @@ static void serial_break (struct tty_struct *tty, int break_state)
 {
 	struct usb_serial_port *port = tty->driver_data;
 
-	lock_kernel();
-	if (!port) {
-		unlock_kernel();
-		return;
-	}
-
 	dbg("%s - port %d", __func__, port->number);
 
-	if (!port->open_count) {
-		dbg("%s - port not open", __func__);
-		unlock_kernel();
-		return;
-	}
-
+	WARN_ON(!port->open_count);
 	/* pass on to the driver specific version of this function if it is available */
-	if (port->serial->type->break_ctl)
+	if (port->serial->type->break_ctl) {
+		lock_kernel();
 		port->serial->type->break_ctl(port, break_state);
-	unlock_kernel();
+		unlock_kernel();
+	}
 }
 
 static int serial_read_proc (char *page, char **start, off_t off, int count, int *eof, void *data)
@@ -519,19 +452,11 @@ static int serial_tiocmget (struct tty_struct *tty, struct file *file)
 {
 	struct usb_serial_port *port = tty->driver_data;
 
-	if (!port)
-		return -ENODEV;
-
 	dbg("%s - port %d", __func__, port->number);
 
-	if (!port->open_count) {
-		dbg("%s - port not open", __func__);
-		return -ENODEV;
-	}
-
+	WARN_ON(!port->open_count);
 	if (port->serial->type->tiocmget)
 		return port->serial->type->tiocmget(port, file);
-
 	return -EINVAL;
 }
 
@@ -540,19 +465,11 @@ static int serial_tiocmset (struct tty_struct *tty, struct file *file,
 {
 	struct usb_serial_port *port = tty->driver_data;
 
-	if (!port)
-		return -ENODEV;
-
 	dbg("%s - port %d", __func__, port->number);
 
-	if (!port->open_count) {
-		dbg("%s - port not open", __func__);
-		return -ENODEV;
-	}
-
+	WARN_ON(!port->open_count);
 	if (port->serial->type->tiocmset)
 		return port->serial->type->tiocmset(port, file, set, clear);
-
 	return -EINVAL;
 }
 
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index e96bf8663ffc..f07e8a4c1f3d 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -673,15 +673,13 @@ static void whiteheat_close(struct usb_serial_port *port, struct file * filp)
 	}
 */
 
-	if (port->tty->driver->flush_buffer)
-		port->tty->driver->flush_buffer(port->tty);
+	tty_driver_flush_buffer(port->tty);
 	tty_ldisc_flush(port->tty);
 
 	firm_report_tx_done(port);
 
 	firm_close(port);
 
-printk(KERN_ERR"Before processing rx_urbs_submitted.\n");
 	/* shutdown our bulk reads and writes */
 	mutex_lock(&info->deathwarrant);
 	spin_lock_irq(&info->lock);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9663e8776724..97dba0d92348 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1053,7 +1053,7 @@ static int vt_check(struct file *file)
 	if (tty_paranoia_check(tty, inode, "tty_ioctl"))
 		return -EINVAL;
 	                                                
-	if (tty->driver->ioctl != vt_ioctl)
+	if (tty->ops->ioctl != vt_ioctl)
 		return -EINVAL;
 
 	vc = (struct vc_data *)tty->driver_data;
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index ac26ccc25f42..21f490f5d65c 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -192,16 +192,14 @@ void proc_tty_register_driver(struct tty_driver *driver)
 {
 	struct proc_dir_entry *ent;
 		
-	if ((!driver->read_proc && !driver->write_proc) ||
-	    !driver->driver_name ||
+	if (!driver->ops->read_proc || !driver->driver_name ||
 	    driver->proc_entry)
 		return;
 
 	ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver);
 	if (!ent)
 		return;
-	ent->read_proc = driver->read_proc;
-	ent->write_proc = driver->write_proc;
+	ent->read_proc = driver->ops->read_proc;
 	ent->owner = driver->owner;
 	ent->data = driver;
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 2699298b00ef..c36a76da2ae2 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -177,9 +177,13 @@ struct signal_struct;
  * size each time the window is created or resized anyway.
  * 						- TYT, 9/14/92
  */
+
+struct tty_operations;
+
 struct tty_struct {
 	int	magic;
 	struct tty_driver *driver;
+	const struct tty_operations *ops;
 	int index;
 	struct tty_ldisc ldisc;
 	struct mutex termios_mutex;
@@ -295,6 +299,10 @@ extern void tty_unregister_device(struct tty_driver *driver, unsigned index);
 extern int tty_read_raw_data(struct tty_struct *tty, unsigned char *bufp,
 			     int buflen);
 extern void tty_write_message(struct tty_struct *tty, char *msg);
+extern int tty_put_char(struct tty_struct *tty, unsigned char c);
+extern int tty_chars_in_buffer(struct tty_struct *tty);
+extern int tty_write_room(struct tty_struct *tty);
+extern void tty_driver_flush_buffer(struct tty_struct *tty);
 
 extern int is_current_pgrp_orphaned(void);
 extern struct pid *tty_get_pgrp(struct tty_struct *tty);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 21f69aca4505..f80d73b690f6 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -12,11 +12,15 @@
  * 	This routine is called when a particular tty device is opened.
  * 	This routine is mandatory; if this routine is not filled in,
  * 	the attempted open will fail with ENODEV.
+ *
+ *	Required method.
  *     
  * void (*close)(struct tty_struct * tty, struct file * filp);
  *
  * 	This routine is called when a particular tty device is closed.
  *
+ *	Required method.
+ *
  * int (*write)(struct tty_struct * tty,
  * 		 const unsigned char *buf, int count);
  *
@@ -26,7 +30,9 @@
  *	number of characters actually accepted for writing.  This
  *	routine is mandatory.
  *
- * void (*put_char)(struct tty_struct *tty, unsigned char ch);
+ *	Optional: Required for writable devices.
+ *
+ * int (*put_char)(struct tty_struct *tty, unsigned char ch);
  *
  * 	This routine is called by the kernel to write a single
  * 	character to the tty device.  If the kernel uses this routine,
@@ -34,10 +40,18 @@
  * 	done stuffing characters into the driver.  If there is no room
  * 	in the queue, the character is ignored.
  *
+ *	Optional: Kernel will use the write method if not provided.
+ *
+ *	Note: Do not call this function directly, call tty_put_char
+ *
  * void (*flush_chars)(struct tty_struct *tty);
  *
  * 	This routine is called by the kernel after it has written a
  * 	series of characters to the tty device using put_char().  
+ *
+ *	Optional:
+ *
+ *	Note: Do not call this function directly, call tty_driver_flush_chars
  * 
  * int  (*write_room)(struct tty_struct *tty);
  *
@@ -45,6 +59,10 @@
  * 	will accept for queuing to be written.  This number is subject
  * 	to change as output buffers get emptied, or if the output flow
  *	control is acted.
+ *
+ *	Required if write method is provided else not needed.
+ *
+ *	Note: Do not call this function directly, call tty_write_room
  * 
  * int  (*ioctl)(struct tty_struct *tty, struct file * file,
  * 	    unsigned int cmd, unsigned long arg);
@@ -53,22 +71,29 @@
  *	device-specific ioctl's.  If the ioctl number passed in cmd
  * 	is not recognized by the driver, it should return ENOIOCTLCMD.
  *
+ *	Optional
+ *
  * long (*compat_ioctl)(struct tty_struct *tty, struct file * file,
  * 	                unsigned int cmd, unsigned long arg);
  *
  * 	implement ioctl processing for 32 bit process on 64 bit system
+ *
+ *	Optional
  * 
  * void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
  *
  * 	This routine allows the tty driver to be notified when
- * 	device's termios settings have changed.  Note that a
- * 	well-designed tty driver should be prepared to accept the case
- * 	where old == NULL, and try to do something rational.
+ * 	device's termios settings have changed.
+ *
+ *	Optional: Called under the termios lock
+ *
  *
  * void (*set_ldisc)(struct tty_struct *tty);
  *
  * 	This routine allows the tty driver to be notified when the
  * 	device's termios settings have changed.
+ *
+ *	Optional: Called under BKL (currently)
  * 
  * void (*throttle)(struct tty_struct * tty);
  *
@@ -86,17 +111,27 @@
  *
  * 	This routine notifies the tty driver that it should stop
  * 	outputting characters to the tty device.  
+ *
+ *	Optional:
+ *
+ *	Note: Call stop_tty not this method.
  * 
  * void (*start)(struct tty_struct *tty);
  *
  * 	This routine notifies the tty driver that it resume sending
  *	characters to the tty device.
+ *
+ *	Optional:
+ *
+ *	Note: Call start_tty not this method.
  * 
  * void (*hangup)(struct tty_struct *tty);
  *
  * 	This routine notifies the tty driver that it should hangup the
  * 	tty device.
  *
+ *	Required:
+ *
  * void (*break_ctl)(struct tty_stuct *tty, int state);
  *
  * 	This optional routine requests the tty driver to turn on or
@@ -106,18 +141,26 @@
  *
  * 	If this routine is implemented, the high-level tty driver will
  * 	handle the following ioctls: TCSBRK, TCSBRKP, TIOCSBRK,
- * 	TIOCCBRK.  Otherwise, these ioctls will be passed down to the
- * 	driver to handle.
+ * 	TIOCCBRK.
+ *
+ *	Optional: Required for TCSBRK/BRKP/etc handling.
  *
  * void (*wait_until_sent)(struct tty_struct *tty, int timeout);
  * 
  * 	This routine waits until the device has written out all of the
  * 	characters in its transmitter FIFO.
  *
+ *	Optional: If not provided the device is assumed to have no FIFO
+ *
+ *	Note: Usually correct to call tty_wait_until_sent
+ *
  * void (*send_xchar)(struct tty_struct *tty, char ch);
  *
  * 	This routine is used to send a high-priority XON/XOFF
  * 	character to the device.
+ *
+ *	Optional: If not provided then the write method is called under
+ *	the atomic write lock to keep it serialized with the ldisc.
  */
 
 #include <linux/fs.h>
@@ -132,7 +175,7 @@ struct tty_operations {
 	void (*close)(struct tty_struct * tty, struct file * filp);
 	int  (*write)(struct tty_struct * tty,
 		      const unsigned char *buf, int count);
-	void (*put_char)(struct tty_struct *tty, unsigned char ch);
+	int  (*put_char)(struct tty_struct *tty, unsigned char ch);
 	void (*flush_chars)(struct tty_struct *tty);
 	int  (*write_room)(struct tty_struct *tty);
 	int  (*chars_in_buffer)(struct tty_struct *tty);
@@ -153,8 +196,6 @@ struct tty_operations {
 	void (*send_xchar)(struct tty_struct *tty, char ch);
 	int (*read_proc)(char *page, char **start, off_t off,
 			  int count, int *eof, void *data);
-	int (*write_proc)(struct file *file, const char __user *buffer,
-			  unsigned long count, void *data);
 	int (*tiocmget)(struct tty_struct *tty, struct file *file);
 	int (*tiocmset)(struct tty_struct *tty, struct file *file,
 			unsigned int set, unsigned int clear);
@@ -190,48 +231,13 @@ struct tty_driver {
 	struct tty_struct **ttys;
 	struct ktermios **termios;
 	struct ktermios **termios_locked;
-	void *driver_state;	/* only used for the PTY driver */
-	
+	void *driver_state;
+
 	/*
-	 * Interface routines from the upper tty layer to the tty
-	 * driver.	Will be replaced with struct tty_operations.
+	 * Driver methods
 	 */
-	int  (*open)(struct tty_struct * tty, struct file * filp);
-	void (*close)(struct tty_struct * tty, struct file * filp);
-	int  (*write)(struct tty_struct * tty,
-		      const unsigned char *buf, int count);
-	void (*put_char)(struct tty_struct *tty, unsigned char ch);
-	void (*flush_chars)(struct tty_struct *tty);
-	int  (*write_room)(struct tty_struct *tty);
-	int  (*chars_in_buffer)(struct tty_struct *tty);
-	int  (*ioctl)(struct tty_struct *tty, struct file * file,
-		    unsigned int cmd, unsigned long arg);
-	long (*compat_ioctl)(struct tty_struct *tty, struct file * file,
-			     unsigned int cmd, unsigned long arg);
-	void (*set_termios)(struct tty_struct *tty, struct ktermios * old);
-	void (*throttle)(struct tty_struct * tty);
-	void (*unthrottle)(struct tty_struct * tty);
-	void (*stop)(struct tty_struct *tty);
-	void (*start)(struct tty_struct *tty);
-	void (*hangup)(struct tty_struct *tty);
-	void (*break_ctl)(struct tty_struct *tty, int state);
-	void (*flush_buffer)(struct tty_struct *tty);
-	void (*set_ldisc)(struct tty_struct *tty);
-	void (*wait_until_sent)(struct tty_struct *tty, int timeout);
-	void (*send_xchar)(struct tty_struct *tty, char ch);
-	int (*read_proc)(char *page, char **start, off_t off,
-			  int count, int *eof, void *data);
-	int (*write_proc)(struct file *file, const char __user *buffer,
-			  unsigned long count, void *data);
-	int (*tiocmget)(struct tty_struct *tty, struct file *file);
-	int (*tiocmset)(struct tty_struct *tty, struct file *file,
-			unsigned int set, unsigned int clear);
-#ifdef CONFIG_CONSOLE_POLL
-	int (*poll_init)(struct tty_driver *driver, int line, char *options);
-	int (*poll_get_char)(struct tty_driver *driver, int line);
-	void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
-#endif
 
+	const struct tty_operations *ops;
 	struct list_head tty_drivers;
 };
 
diff --git a/kernel/printk.c b/kernel/printk.c
index d3f9c0f788bf..0d232589a923 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1272,8 +1272,8 @@ late_initcall(disable_boot_consoles);
  */
 void tty_write_message(struct tty_struct *tty, char *msg)
 {
-	if (tty && tty->driver->write)
-		tty->driver->write(tty, msg, strlen(msg));
+	if (tty && tty->ops->write)
+		tty->ops->write(tty, msg, strlen(msg));
 	return;
 }
 
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index d2620410cb0a..76c3057d0179 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -555,10 +555,8 @@ static void ircomm_tty_close(struct tty_struct *tty, struct file *filp)
 
 	ircomm_tty_shutdown(self);
 
-	if (tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
-	if (tty->ldisc.flush_buffer)
-		tty->ldisc.flush_buffer(tty);
+	tty_driver_flush_buffer(tty);
+	tty_ldisc_flush(tty);
 
 	tty->closing = 0;
 	self->tty = NULL;
-- 
cgit v1.3-14-g43fede


From b7127aa4547d8cc8a5b569631e2b6ef613af1bb7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:54:22 -0700
Subject: free_pidmap: turn it into free_pidmap(struct upid *)

The callers of free_pidmap() pass 2 members of "struct upid", we can just
pass "struct upid *" instead.  Shaves off 10 bytes from pid.o.

Also, simplify the alloc_pid's "out_free:" error path a little bit.  This
way it looks more clear which subset of pid->numbers[] we are freeing.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc :Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 477691576b33..b322cdf401bf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -111,10 +111,11 @@ EXPORT_SYMBOL(is_container_init);
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-static void free_pidmap(struct pid_namespace *pid_ns, int pid)
+static void free_pidmap(struct upid *upid)
 {
-	struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
-	int offset = pid & BITS_PER_PAGE_MASK;
+	int nr = upid->nr;
+	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
+	int offset = nr & BITS_PER_PAGE_MASK;
 
 	clear_bit(offset, map->page);
 	atomic_inc(&map->nr_free);
@@ -232,7 +233,7 @@ void free_pid(struct pid *pid)
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
 	for (i = 0; i <= pid->level; i++)
-		free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
+		free_pidmap(pid->numbers + i);
 
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
@@ -278,8 +279,8 @@ out:
 	return pid;
 
 out_free:
-	for (i++; i <= ns->level; i++)
-		free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
+	while (++i <= ns->level)
+		free_pidmap(pid->numbers + i);
 
 	kmem_cache_free(ns->pid_cachep, pid);
 	pid = NULL;
-- 
cgit v1.3-14-g43fede


From cb41d6d068716b2b3666925da34d3d7e658bf4f3 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:54:23 -0700
Subject: Use find_task_by_vpid in taskstats

The pid to lookup a task by is passed inside taskstats code via genetlink
message.

Since netlink packets are now processed in the context of the sending task,
this is correct to lookup the task with find_task_by_vpid() here.

Besides, I fix the call to fill_pid() from taskstats_exit(), since the
tsk->pid is not required in fill_pid() in this case, and the pid field on
task_struct is going to be deprecated as well.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Jonathan Lim <jlim@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/taskstats.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 07e86a828073..4a23517169a6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -183,7 +183,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
 
 	if (!tsk) {
 		rcu_read_lock();
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (tsk)
 			get_task_struct(tsk);
 		rcu_read_unlock();
@@ -230,7 +230,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
 	 */
 	rcu_read_lock();
 	if (!first)
-		first = find_task_by_pid(tgid);
+		first = find_task_by_vpid(tgid);
 
 	if (!first || !lock_task_sighand(first, &flags))
 		goto out;
@@ -547,7 +547,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	if (!stats)
 		goto err;
 
-	rc = fill_pid(tsk->pid, tsk, stats);
+	rc = fill_pid(-1, tsk, stats);
 	if (rc < 0)
 		goto err;
 
-- 
cgit v1.3-14-g43fede


From 5cd204550b1a006f2b0c986b0e0f53220ebfd391 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:54:24 -0700
Subject: Deprecate find_task_by_pid()

There are some places that are known to operate on tasks'
global pids only:

* the rest_init() call (called on boot)
* the kgdb's getthread
* the create_kthread() (since the kthread is run in init ns)

So use the find_task_by_pid_ns(..., &init_pid_ns) there
and schedule the find_task_by_pid for removal.

[sukadev@us.ibm.com: Fix warning in kernel/pid.c]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/feature-removal-schedule.txt | 18 ++++++++++++++++++
 include/linux/sched.h                      |  5 ++++-
 init/main.c                                |  2 +-
 kernel/kthread.c                           |  2 +-
 kernel/pid.c                               |  6 ------
 5 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 599fe55bf297..3c35d452b1a9 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -138,6 +138,24 @@ Who:	Kay Sievers <kay.sievers@suse.de>
 
 ---------------------------
 
+What:	find_task_by_pid
+When:	2.6.26
+Why:	With pid namespaces, calling this funciton will return the
+	wrong task when called from inside a namespace.
+
+	The best way to save a task pid and find a task by this
+	pid later, is to find this task's struct pid pointer (or get
+	it directly from the task) and call pid_task() later.
+
+	If someone really needs to get a task by its pid_t, then
+	he most likely needs the find_task_by_vpid() to get the
+	task from the same namespace as the current task is in, but
+	this may be not so in general.
+
+Who:	Pavel Emelyanov <xemul@openvz.org>
+
+---------------------------
+
 What:	ACPI procfs interface
 When:	July 2008
 Why:	ACPI sysfs conversion should be finished by January 2008.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 86e60796db62..03c238088aee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1677,7 +1677,10 @@ extern struct pid_namespace init_pid_ns;
 extern struct task_struct *find_task_by_pid_type_ns(int type, int pid,
 		struct pid_namespace *ns);
 
-extern struct task_struct *find_task_by_pid(pid_t nr);
+static inline struct task_struct *__deprecated find_task_by_pid(pid_t nr)
+{
+	return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
+}
 extern struct task_struct *find_task_by_vpid(pid_t nr);
 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
 		struct pid_namespace *ns);
diff --git a/init/main.c b/init/main.c
index 1f4406477f83..dff253cfcd9f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -459,7 +459,7 @@ static void noinline __init_refok rest_init(void)
 	kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
 	numa_default_policy();
 	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
-	kthreadd_task = find_task_by_pid(pid);
+	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
 	unlock_kernel();
 
 	/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ac72eea48339..bd1b9ea024e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -98,7 +98,7 @@ static void create_kthread(struct kthread_create_info *create)
 		struct sched_param param = { .sched_priority = 0 };
 		wait_for_completion(&create->started);
 		read_lock(&tasklist_lock);
-		create->result = find_task_by_pid(pid);
+		create->result = find_task_by_pid_ns(pid, &init_pid_ns);
 		read_unlock(&tasklist_lock);
 		/*
 		 * root may have changed our (kthreadd's) priority or CPU mask.
diff --git a/kernel/pid.c b/kernel/pid.c
index b322cdf401bf..a9ae9f7fb229 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -381,12 +381,6 @@ struct task_struct *find_task_by_pid_type_ns(int type, int nr,
 
 EXPORT_SYMBOL(find_task_by_pid_type_ns);
 
-struct task_struct *find_task_by_pid(pid_t nr)
-{
-	return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
-}
-EXPORT_SYMBOL(find_task_by_pid);
-
 struct task_struct *find_task_by_vpid(pid_t vnr)
 {
 	return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
-- 
cgit v1.3-14-g43fede


From 65450cebc6a2efde80ed45514f727e6e4dc1eafd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:54:25 -0700
Subject: pids: de_thread: don't clear session/pgrp pids for the old leader

Based on Eric W. Biederman's idea.

Unless task == current, without tasklist_lock held task_session()/task_pgrp()
can return NULL if the caller races with de_thread() which switches the group
leader.

Change transfer_pid() to not clear old->pids[type].pid for the old leader.
This means that its .pid can point to "nowhere", but this is already true for
sub-threads, and the old leader is not group_leader() any longer.  IOW, with
or without this change we can't trust task's special pids unless it is the
group leader.

With this change the following code

	rcu_read_lock();
	task = find_task_by_xxx();
	do_something(task_pgrp(task), task_session(task));
	rcu_read_unlock();

can't race with exec and hit the NULL pid.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc:  "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index a9ae9f7fb229..e9a31d362b28 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -354,7 +354,6 @@ void transfer_pid(struct task_struct *old, struct task_struct *new,
 {
 	new->pids[type].pid = old->pids[type].pid;
 	hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
-	old->pids[type].pid = NULL;
 }
 
 struct task_struct *pid_task(struct pid *pid, enum pid_type type)
-- 
cgit v1.3-14-g43fede


From 24336eaeecea860b2a82530e07c80bc7e0558b73 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:54:26 -0700
Subject: pids: introduce change_pid() helper

Based on Eric W. Biederman's idea.

Without tasklist_lock held task_session()/task_pgrp() can return NULL if the
caller races with setprgp()/setsid() which does detach_pid() + attach_pid().
This can happen even if task == current.

Intoduce the new helper, change_pid(), which should be used instead.  This way
the caller always sees the special pid != NULL, either old or new.

Also change the prototype of attach_pid(), it always returns 0 and nobody
check the returned value.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc:  "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h |  6 ++++--
 kernel/pid.c        | 21 ++++++++++++++++-----
 2 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index c7980810eb09..8d199033c0ca 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -89,9 +89,11 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
  * attach_pid() and detach_pid() must be called with the tasklist_lock
  * write-held.
  */
-extern int attach_pid(struct task_struct *task, enum pid_type type,
-		      struct pid *pid);
+extern void attach_pid(struct task_struct *task, enum pid_type type,
+			struct pid *pid);
 extern void detach_pid(struct task_struct *task, enum pid_type);
+extern void change_pid(struct task_struct *task, enum pid_type,
+			struct pid *pid);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 			 enum pid_type);
 
diff --git a/kernel/pid.c b/kernel/pid.c
index e9a31d362b28..20d59fa2d493 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -317,7 +317,7 @@ EXPORT_SYMBOL_GPL(find_pid);
 /*
  * attach_pid() must be called with the tasklist_lock write-held.
  */
-int attach_pid(struct task_struct *task, enum pid_type type,
+void attach_pid(struct task_struct *task, enum pid_type type,
 		struct pid *pid)
 {
 	struct pid_link *link;
@@ -325,11 +325,10 @@ int attach_pid(struct task_struct *task, enum pid_type type,
 	link = &task->pids[type];
 	link->pid = pid;
 	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
-
-	return 0;
 }
 
-void detach_pid(struct task_struct *task, enum pid_type type)
+static void __change_pid(struct task_struct *task, enum pid_type type,
+			struct pid *new)
 {
 	struct pid_link *link;
 	struct pid *pid;
@@ -339,7 +338,7 @@ void detach_pid(struct task_struct *task, enum pid_type type)
 	pid = link->pid;
 
 	hlist_del_rcu(&link->node);
-	link->pid = NULL;
+	link->pid = new;
 
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 		if (!hlist_empty(&pid->tasks[tmp]))
@@ -348,6 +347,18 @@ void detach_pid(struct task_struct *task, enum pid_type type)
 	free_pid(pid);
 }
 
+void detach_pid(struct task_struct *task, enum pid_type type)
+{
+	__change_pid(task, type, NULL);
+}
+
+void change_pid(struct task_struct *task, enum pid_type type,
+		struct pid *pid)
+{
+	__change_pid(task, type, pid);
+	attach_pid(task, type, pid);
+}
+
 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
 void transfer_pid(struct task_struct *old, struct task_struct *new,
 			   enum pid_type type)
-- 
cgit v1.3-14-g43fede


From 83beaf3c6c75b36b7c9be7f555c8cf7797842cc5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:54:27 -0700
Subject: pids: sys_setpgid: use change_pid() helper

Use change_pid() instead of detach_pid() + attach_pid() in sys_setpgid().

This way task_pgrp() is not NULL in between.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc:  "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 47c30a20b554..5d0b44cd435c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -978,8 +978,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 		goto out;
 
 	if (task_pgrp(p) != pgrp) {
-		detach_pid(p, PIDTYPE_PGID);
-		attach_pid(p, PIDTYPE_PGID, pgrp);
+		change_pid(p, PIDTYPE_PGID, pgrp);
 		set_task_pgrp(p, pid_nr(pgrp));
 	}
 
-- 
cgit v1.3-14-g43fede


From 7d8da0962eaee30b4a380ded177349bfbdd6ac46 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:54:27 -0700
Subject: pids: __set_special_pids: use change_pid() helper

Use change_pid() instead of detach_pid() + attach_pid() in
__set_special_pids().

This way task_session() is not NULL in between.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc:  "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 0da2921b1e7f..d3ad54677f9c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -334,13 +334,11 @@ void __set_special_pids(struct pid *pid)
 	pid_t nr = pid_nr(pid);
 
 	if (task_session(curr) != pid) {
-		detach_pid(curr, PIDTYPE_SID);
-		attach_pid(curr, PIDTYPE_SID, pid);
+		change_pid(curr, PIDTYPE_SID, pid);
 		set_task_session(curr, nr);
 	}
 	if (task_pgrp(curr) != pid) {
-		detach_pid(curr, PIDTYPE_PGID);
-		attach_pid(curr, PIDTYPE_PGID, pid);
+		change_pid(curr, PIDTYPE_PGID, pid);
 		set_task_pgrp(curr, nr);
 	}
 }
-- 
cgit v1.3-14-g43fede


From 1dd768c0815334d2319d6377f0750ace075b6142 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:54:28 -0700
Subject: pids: sys_getsid: fix unsafe *pid usage, fix possible 0 instead of
 -ESRCH

1. sys_getsid() needs rcu_read_lock() to derive the session _nr, even if
   the task is current, otherwise we can race with another thread which
   does sys_setsid().

2. The task can exit between find_task_by_vpid() and task_session_vnr(),
   in that unlikely case sys_getsid() returns 0 instead of -ESRCH.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 5d0b44cd435c..ddd28e261f3a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1022,23 +1022,30 @@ asmlinkage long sys_getpgrp(void)
 
 asmlinkage long sys_getsid(pid_t pid)
 {
+	struct task_struct *p;
+	struct pid *sid;
+	int retval;
+
+	rcu_read_lock();
 	if (!pid)
-		return task_session_vnr(current);
+		sid = task_session(current);
 	else {
-		int retval;
-		struct task_struct *p;
-
-		rcu_read_lock();
-		p = find_task_by_vpid(pid);
 		retval = -ESRCH;
-		if (p) {
-			retval = security_task_getsid(p);
-			if (!retval)
-				retval = task_session_vnr(p);
-		}
-		rcu_read_unlock();
-		return retval;
+		p = find_task_by_vpid(pid);
+		if (!p)
+			goto out;
+		sid = task_session(p);
+		if (!sid)
+			goto out;
+
+		retval = security_task_getsid(p);
+		if (retval)
+			goto out;
 	}
+	retval = pid_vnr(sid);
+out:
+	rcu_read_unlock();
+	return retval;
 }
 
 asmlinkage long sys_setsid(void)
-- 
cgit v1.3-14-g43fede


From 12a3de0a965826096d8adc593bcf4392a7d5b459 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 30 Apr 2008 00:54:29 -0700
Subject: pids: sys_getpgid: fix unsafe *pid usage, s/tasklist/rcu/

1. sys_getpgid() needs rcu_read_lock() to derive the pgrp _nr, even if
   the task is current, otherwise we can race with another thread which
   does sys_setpgid().

2. Use rcu_read_lock() instead of tasklist_lock when pid != 0, make sure
   that we don't use the NULL pid if the task exits right after successful
   find_task_by_vpid().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index ddd28e261f3a..895d2d4c9493 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -991,31 +991,37 @@ out:
 
 asmlinkage long sys_getpgid(pid_t pid)
 {
+	struct task_struct *p;
+	struct pid *grp;
+	int retval;
+
+	rcu_read_lock();
 	if (!pid)
-		return task_pgrp_vnr(current);
+		grp = task_pgrp(current);
 	else {
-		int retval;
-		struct task_struct *p;
-
-		read_lock(&tasklist_lock);
-		p = find_task_by_vpid(pid);
 		retval = -ESRCH;
-		if (p) {
-			retval = security_task_getpgid(p);
-			if (!retval)
-				retval = task_pgrp_vnr(p);
-		}
-		read_unlock(&tasklist_lock);
-		return retval;
+		p = find_task_by_vpid(pid);
+		if (!p)
+			goto out;
+		grp = task_pgrp(p);
+		if (!grp)
+			goto out;
+
+		retval = security_task_getpgid(p);
+		if (retval)
+			goto out;
 	}
+	retval = pid_vnr(grp);
+out:
+	rcu_read_unlock();
+	return retval;
 }
 
 #ifdef __ARCH_WANT_SYS_GETPGRP
 
 asmlinkage long sys_getpgrp(void)
 {
-	/* SMP - assuming writes are word atomic this is fine */
-	return task_pgrp_vnr(current);
+	return sys_getpgid(0);
 }
 
 #endif
-- 
cgit v1.3-14-g43fede


From ab883af53ec1b87add43b32a28d8347f17d5155b Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 30 Apr 2008 00:54:30 -0700
Subject: make marker_debug static

With the needlessly global marker_debug being static gcc can optimize the
unused code away.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/marker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 139260e5460c..b5a9fe1d50d5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -29,7 +29,7 @@ extern struct marker __start___markers[];
 extern struct marker __stop___markers[];
 
 /* Set to 1 to enable marker debug output */
-const int marker_debug;
+static const int marker_debug;
 
 /*
  * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
-- 
cgit v1.3-14-g43fede


From caafa4324335aeb11bc233d5f87aca8cce30beba Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 30 Apr 2008 00:54:31 -0700
Subject: pidns: make pid->level and pid_ns->level unsigned

These values represent the nesting level of a namespace and pids living in it,
and it's always non-negative.

Turning this from int to unsigned int saves some space in pid.c (11 bytes on
x86 and 64 on ia64) by letting the compiler optimize the pid_nr_ns a bit.
E.g.  on ia64 this removes the sign extension calls, which compiler adds to
optimize access to pid->nubers[ns->level].

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h           | 2 +-
 include/linux/pid_namespace.h | 2 +-
 kernel/pid_namespace.c        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 8d199033c0ca..c21c7e8124a7 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -60,7 +60,7 @@ struct pid
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
 	struct rcu_head rcu;
-	int level;
+	unsigned int level;
 	struct upid numbers[1];
 };
 
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index fcd61fa2c833..caff5283d15c 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -20,7 +20,7 @@ struct pid_namespace {
 	int last_pid;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
-	int level;
+	unsigned int level;
 	struct pid_namespace *parent;
 #ifdef CONFIG_PROC_FS
 	struct vfsmount *proc_mnt;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 5ca37fa50beb..98702b4b8851 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -66,7 +66,7 @@ err_alloc:
 	return NULL;
 }
 
-static struct pid_namespace *create_pid_namespace(int level)
+static struct pid_namespace *create_pid_namespace(unsigned int level)
 {
 	struct pid_namespace *ns;
 	int i;
-- 
cgit v1.3-14-g43fede


From e4ad08fe64afca4ef79ecc4c624e6e871688da0d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 30 Apr 2008 00:54:37 -0700
Subject: mm: bdi: add separate writeback accounting capability

Add a new BDI capability flag: BDI_CAP_NO_ACCT_WB.  If this flag is
set, then don't update the per-bdi writeback stats from
test_set_page_writeback() and test_clear_page_writeback().

Misc cleanups:

 - convert bdi_cap_writeback_dirty() and friends to static inline functions
 - create a flag that includes all three dirty/writeback related flags,
   since almst all users will want to have them toghether

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/configfs/inode.c         |  2 +-
 fs/hugetlbfs/inode.c        |  2 +-
 fs/ocfs2/dlm/dlmfs.c        |  2 +-
 fs/ramfs/inode.c            |  2 +-
 fs/sysfs/inode.c            |  2 +-
 include/linux/backing-dev.h | 77 +++++++++++++++++++++++++++++++++------------
 kernel/cgroup.c             |  2 +-
 mm/page-writeback.c         |  4 +--
 mm/shmem.c                  |  2 +-
 mm/swap_state.c             |  2 +-
 10 files changed, 67 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4c1ebff778ee..b9a1d810346d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -47,7 +47,7 @@ static const struct address_space_operations configfs_aops = {
 
 static struct backing_dev_info configfs_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
 static const struct inode_operations configfs_inode_operations ={
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9783723e8ffe..aeabf80f81a5 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -45,7 +45,7 @@ static const struct inode_operations hugetlbfs_inode_operations;
 
 static struct backing_dev_info hugetlbfs_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
 int sysctl_hugetlb_shm_group;
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 61a000f8524c..e48aba698b77 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -327,7 +327,7 @@ clear_fields:
 
 static struct backing_dev_info dlmfs_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
 static struct inode *dlmfs_get_root_inode(struct super_block *sb)
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 8428d5b2711d..b13123424e49 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -44,7 +44,7 @@ static const struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK |
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK |
 			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
 			  BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
 };
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index d9262f74f94e..f8b82e73b3bf 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -30,7 +30,7 @@ static const struct address_space_operations sysfs_aops = {
 
 static struct backing_dev_info sysfs_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
 static const struct inode_operations sysfs_inode_operations ={
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c49a2d045e11..13ab79d99268 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -12,6 +12,7 @@
 #include <linux/log2.h>
 #include <linux/proportions.h>
 #include <linux/kernel.h>
+#include <linux/fs.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -151,22 +152,43 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
 /*
  * Flags in backing_dev_info::capability
- * - The first two flags control whether dirty pages will contribute to the
- *   VM's accounting and whether writepages() should be called for dirty pages
- *   (something that would not, for example, be appropriate for ramfs)
- * - These flags let !MMU mmap() govern direct device mapping vs immediate
- *   copying more easily for MAP_PRIVATE, especially for ROM filesystems
+ *
+ * The first three flags control whether dirty pages will contribute to the
+ * VM's accounting and whether writepages() should be called for dirty pages
+ * (something that would not, for example, be appropriate for ramfs)
+ *
+ * WARNING: these flags are closely related and should not normally be
+ * used separately.  The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these
+ * three flags into a single convenience macro.
+ *
+ * BDI_CAP_NO_ACCT_DIRTY:  Dirty pages shouldn't contribute to accounting
+ * BDI_CAP_NO_WRITEBACK:   Don't write pages back
+ * BDI_CAP_NO_ACCT_WB:     Don't automatically account writeback pages
+ *
+ * These flags let !MMU mmap() govern direct device mapping vs immediate
+ * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
+ *
+ * BDI_CAP_MAP_COPY:       Copy can be mapped (MAP_PRIVATE)
+ * BDI_CAP_MAP_DIRECT:     Can be mapped directly (MAP_SHARED)
+ * BDI_CAP_READ_MAP:       Can be mapped for reading
+ * BDI_CAP_WRITE_MAP:      Can be mapped for writing
+ * BDI_CAP_EXEC_MAP:       Can be mapped for execution
  */
-#define BDI_CAP_NO_ACCT_DIRTY	0x00000001	/* Dirty pages shouldn't contribute to accounting */
-#define BDI_CAP_NO_WRITEBACK	0x00000002	/* Don't write pages back */
-#define BDI_CAP_MAP_COPY	0x00000004	/* Copy can be mapped (MAP_PRIVATE) */
-#define BDI_CAP_MAP_DIRECT	0x00000008	/* Can be mapped directly (MAP_SHARED) */
-#define BDI_CAP_READ_MAP	0x00000010	/* Can be mapped for reading */
-#define BDI_CAP_WRITE_MAP	0x00000020	/* Can be mapped for writing */
-#define BDI_CAP_EXEC_MAP	0x00000040	/* Can be mapped for execution */
+#define BDI_CAP_NO_ACCT_DIRTY	0x00000001
+#define BDI_CAP_NO_WRITEBACK	0x00000002
+#define BDI_CAP_MAP_COPY	0x00000004
+#define BDI_CAP_MAP_DIRECT	0x00000008
+#define BDI_CAP_READ_MAP	0x00000010
+#define BDI_CAP_WRITE_MAP	0x00000020
+#define BDI_CAP_EXEC_MAP	0x00000040
+#define BDI_CAP_NO_ACCT_WB	0x00000080
+
 #define BDI_CAP_VMFLAGS \
 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
 
+#define BDI_CAP_NO_ACCT_AND_WRITEBACK \
+	(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
+
 #if defined(VM_MAYREAD) && \
 	(BDI_CAP_READ_MAP != VM_MAYREAD || \
 	 BDI_CAP_WRITE_MAP != VM_MAYWRITE || \
@@ -206,17 +228,32 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
 void set_bdi_congested(struct backing_dev_info *bdi, int rw);
 long congestion_wait(int rw, long timeout);
 
-#define bdi_cap_writeback_dirty(bdi) \
-	(!((bdi)->capabilities & BDI_CAP_NO_WRITEBACK))
 
-#define bdi_cap_account_dirty(bdi) \
-	(!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
+static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
+{
+	return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK);
+}
+
+static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi)
+{
+	return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY);
+}
 
-#define mapping_cap_writeback_dirty(mapping) \
-	bdi_cap_writeback_dirty((mapping)->backing_dev_info)
+static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
+{
+	/* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */
+	return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB |
+				      BDI_CAP_NO_WRITEBACK));
+}
 
-#define mapping_cap_account_dirty(mapping) \
-	bdi_cap_account_dirty((mapping)->backing_dev_info)
+static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
+{
+	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
+}
 
+static inline bool mapping_cap_account_dirty(struct address_space *mapping)
+{
+	return bdi_cap_account_dirty(mapping->backing_dev_info);
+}
 
 #endif		/* _LINUX_BACKING_DEV_H */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b9d467d83fc1..fbc6fc8949b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -575,7 +575,7 @@ static struct inode_operations cgroup_dir_inode_operations;
 static struct file_operations proc_cgroupstats_operations;
 
 static struct backing_dev_info cgroup_backing_dev_info = {
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2a9942f5387c..bbcb916190c9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1246,7 +1246,7 @@ int test_clear_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_writeback_dirty(bdi)) {
+			if (bdi_cap_account_writeback(bdi)) {
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
 				__bdi_writeout_inc(bdi);
 			}
@@ -1275,7 +1275,7 @@ int test_set_page_writeback(struct page *page)
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_writeback_dirty(bdi))
+			if (bdi_cap_account_writeback(bdi))
 				__inc_bdi_stat(bdi, BDI_WRITEBACK);
 		}
 		if (!PageDirty(page))
diff --git a/mm/shmem.c b/mm/shmem.c
index e6d9298aa22a..e2a6ae1a44e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -201,7 +201,7 @@ static struct vm_operations_struct shmem_vm_ops;
 
 static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 	.unplug_io_fn	= default_unplug_io_fn,
 };
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 50757ee3f9f3..d8aadaf2a0ba 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
 };
 
 static struct backing_dev_info swap_backing_dev_info = {
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
-- 
cgit v1.3-14-g43fede


From f7511d5f66f01fc451747b24e79f3ada7a3af9af Mon Sep 17 00:00:00 2001
From: Samuel Thibault <samuel.thibault@ens-lyon.org>
Date: Wed, 30 Apr 2008 00:54:51 -0700
Subject: Basic braille screen reader support

This adds a minimalistic braille screen reader support.  This is meant to
be used by blind people e.g.  on boot failures or when / cannot be mounted
etc and thus the userland screen readers can not work.

[akpm@linux-foundation.org: fix exports]
Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Cc: Jiri Kosina <jikos@jikos.cz>
Cc: Dmitry Torokhov <dtor@mail.ru>
Acked-by: Alan Cox <alan@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/braille-console.txt               |  34 ++
 Documentation/kernel-parameters.txt             |   5 +
 arch/powerpc/kernel/ppc_ksyms.c                 |   3 -
 arch/ppc/kernel/ppc_ksyms.c                     |   3 -
 drivers/Kconfig                                 |   2 +
 drivers/Makefile                                |   1 +
 drivers/accessibility/Kconfig                   |  23 ++
 drivers/accessibility/Makefile                  |   1 +
 drivers/accessibility/braille/Makefile          |   1 +
 drivers/accessibility/braille/braille_console.c | 397 ++++++++++++++++++++++++
 drivers/char/consolemap.c                       |   1 +
 drivers/char/keyboard.c                         |   2 +
 drivers/char/vt.c                               |   1 +
 include/linux/console.h                         |   4 +
 kernel/printk.c                                 |  90 ++++--
 15 files changed, 538 insertions(+), 30 deletions(-)
 create mode 100644 Documentation/braille-console.txt
 create mode 100644 drivers/accessibility/Kconfig
 create mode 100644 drivers/accessibility/Makefile
 create mode 100644 drivers/accessibility/braille/Makefile
 create mode 100644 drivers/accessibility/braille/braille_console.c

(limited to 'kernel')

diff --git a/Documentation/braille-console.txt b/Documentation/braille-console.txt
new file mode 100644
index 000000000000..000b0fbdc105
--- /dev/null
+++ b/Documentation/braille-console.txt
@@ -0,0 +1,34 @@
+                       Linux Braille Console
+
+To get early boot messages on a braille device (before userspace screen
+readers can start), you first need to compile the support for the usual serial
+console (see serial-console.txt), and for braille device (in Device Drivers -
+Accessibility).
+
+Then you need to specify a console=brl, option on the kernel command line, the
+format is:
+
+	console=brl,serial_options...
+
+where serial_options... are the same as described in serial-console.txt
+
+So for instance you can use console=brl,ttyS0 if the braille device is connected
+to the first serial port, and console=brl,ttyS0,115200 to override the baud rate
+to 115200, etc.
+
+By default, the braille device will just show the last kernel message (console
+mode).  To review previous messages, press the Insert key to switch to the VT
+review mode.  In review mode, the arrow keys permit to browse in the VT content,
+page up/down keys go at the top/bottom of the screen, and the home key goes back
+to the cursor, hence providing very basic screen reviewing facility.
+
+Sound feedback can be obtained by adding the braille_console.sound=1 kernel
+parameter.
+
+For simplicity, only one braille console can be enabled, other uses of
+console=brl,... will be discarded.  Also note that it does not interfere with
+the console selection mecanism described in serial-console.txt
+
+For now, only the VisioBraille device is supported.
+
+Samuel Thibault <samuel.thibault@ens-lyon.org>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 3ce193f86565..0ba0861b5d18 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -496,6 +496,11 @@ and is between 256 and 4096 characters. It is defined in the file
 			switching to the matching ttyS device later.  The
 			options are the same as for ttyS, above.
 
+                If the device connected to the port is not a TTY but a braille
+                device, prepend "brl," before the device type, for instance
+			console=brl,ttyS0
+		For now, only VisioBraille is supported.
+
 	earlycon=	[KNL] Output early console device and options.
 		uart[8250],io,<addr>[,options]
 		uart[8250],mmio,<addr>[,options]
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 09fcb50c45ae..cf6b5a7d8b3f 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -133,9 +133,6 @@ EXPORT_SYMBOL(adb_try_handler_change);
 EXPORT_SYMBOL(cuda_request);
 EXPORT_SYMBOL(cuda_poll);
 #endif /* CONFIG_ADB_CUDA */
-#ifdef CONFIG_VT
-EXPORT_SYMBOL(kd_mksound);
-#endif
 EXPORT_SYMBOL(to_tm);
 
 #ifdef CONFIG_PPC32
diff --git a/arch/ppc/kernel/ppc_ksyms.c b/arch/ppc/kernel/ppc_ksyms.c
index d9036ef0b658..16ac11ca7ba0 100644
--- a/arch/ppc/kernel/ppc_ksyms.c
+++ b/arch/ppc/kernel/ppc_ksyms.c
@@ -183,9 +183,6 @@ EXPORT_SYMBOL(cuda_poll);
 #if defined(CONFIG_BOOTX_TEXT)
 EXPORT_SYMBOL(btext_update_display);
 #endif
-#ifdef CONFIG_VT
-EXPORT_SYMBOL(kd_mksound);
-#endif
 EXPORT_SYMBOL(to_tm);
 
 EXPORT_SYMBOL(pm_power_off);
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 80f0ec91e2cf..59f33fa6af3e 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -84,6 +84,8 @@ source "drivers/memstick/Kconfig"
 
 source "drivers/leds/Kconfig"
 
+source "drivers/accessibility/Kconfig"
+
 source "drivers/infiniband/Kconfig"
 
 source "drivers/edac/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index e5e394a7e6c0..f65deda72d61 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_WATCHDOG)		+= watchdog/
 obj-$(CONFIG_PHONE)		+= telephony/
 obj-$(CONFIG_MD)		+= md/
 obj-$(CONFIG_BT)		+= bluetooth/
+obj-$(CONFIG_ACCESSIBILITY)	+= accessibility/
 obj-$(CONFIG_ISDN)		+= isdn/
 obj-$(CONFIG_EDAC)		+= edac/
 obj-$(CONFIG_MCA)		+= mca/
diff --git a/drivers/accessibility/Kconfig b/drivers/accessibility/Kconfig
new file mode 100644
index 000000000000..1264c4b98094
--- /dev/null
+++ b/drivers/accessibility/Kconfig
@@ -0,0 +1,23 @@
+menuconfig ACCESSIBILITY
+	bool "Accessibility support"
+	---help---
+	  Enable a submenu where accessibility items may be enabled.
+
+	  If unsure, say N.
+
+if ACCESSIBILITY
+config A11Y_BRAILLE_CONSOLE
+	bool "Console on braille device"
+	depends on VT
+	depends on SERIAL_CORE_CONSOLE
+	---help---
+	  Enables console output on a braille device connected to a 8250
+	  serial port. For now only the VisioBraille device is supported.
+
+	  To actually enable it, you need to pass option
+	  console=brl,ttyS0
+	  to the kernel. Options are the same as for serial console.
+
+	  If unsure, say N.
+
+endif # ACCESSIBILITY
diff --git a/drivers/accessibility/Makefile b/drivers/accessibility/Makefile
new file mode 100644
index 000000000000..72b01a46546f
--- /dev/null
+++ b/drivers/accessibility/Makefile
@@ -0,0 +1 @@
+obj-y				+= braille/
diff --git a/drivers/accessibility/braille/Makefile b/drivers/accessibility/braille/Makefile
new file mode 100644
index 000000000000..2e9f16c91347
--- /dev/null
+++ b/drivers/accessibility/braille/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)		+= braille_console.o
diff --git a/drivers/accessibility/braille/braille_console.c b/drivers/accessibility/braille/braille_console.c
new file mode 100644
index 000000000000..0a5f6b2114c5
--- /dev/null
+++ b/drivers/accessibility/braille/braille_console.c
@@ -0,0 +1,397 @@
+/*
+ * Minimalistic braille device kernel support.
+ *
+ * By default, shows console messages on the braille device.
+ * Pressing Insert switches to VC browsing.
+ *
+ *  Copyright (C) Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/autoconf.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/console.h>
+#include <linux/notifier.h>
+
+#include <linux/selection.h>
+#include <linux/vt_kern.h>
+#include <linux/consolemap.h>
+
+#include <linux/keyboard.h>
+#include <linux/kbd_kern.h>
+#include <linux/input.h>
+
+MODULE_AUTHOR("samuel.thibault@ens-lyon.org");
+MODULE_DESCRIPTION("braille device");
+MODULE_LICENSE("GPL");
+
+/*
+ * Braille device support part.
+ */
+
+/* Emit various sounds */
+static int sound;
+module_param(sound, bool, 0);
+MODULE_PARM_DESC(sound, "emit sounds");
+
+static void beep(unsigned int freq)
+{
+	if (sound)
+		kd_mksound(freq, HZ/10);
+}
+
+/* mini console */
+#define WIDTH 40
+#define BRAILLE_KEY KEY_INSERT
+static u16 console_buf[WIDTH];
+static int console_cursor;
+
+/* mini view of VC */
+static int vc_x, vc_y, lastvc_x, lastvc_y;
+
+/* show console ? (or show VC) */
+static int console_show = 1;
+/* pending newline ? */
+static int console_newline = 1;
+static int lastVC = -1;
+
+static struct console *braille_co;
+
+/* Very VisioBraille-specific */
+static void braille_write(u16 *buf)
+{
+	static u16 lastwrite[WIDTH];
+	unsigned char data[1 + 1 + 2*WIDTH + 2 + 1], csum = 0, *c;
+	u16 out;
+	int i;
+
+	if (!braille_co)
+		return;
+
+	if (!memcmp(lastwrite, buf, WIDTH * sizeof(*buf)))
+		return;
+	memcpy(lastwrite, buf, WIDTH * sizeof(*buf));
+
+#define SOH 1
+#define STX 2
+#define ETX 2
+#define EOT 4
+#define ENQ 5
+	data[0] = STX;
+	data[1] = '>';
+	csum ^= '>';
+	c = &data[2];
+	for (i = 0; i < WIDTH; i++) {
+		out = buf[i];
+		if (out >= 0x100)
+			out = '?';
+		else if (out == 0x00)
+			out = ' ';
+		csum ^= out;
+		if (out <= 0x05) {
+			*c++ = SOH;
+			out |= 0x40;
+		}
+		*c++ = out;
+	}
+
+	if (csum <= 0x05) {
+		*c++ = SOH;
+		csum |= 0x40;
+	}
+	*c++ = csum;
+	*c++ = ETX;
+
+	braille_co->write(braille_co, data, c - data);
+}
+
+/* Follow the VC cursor*/
+static void vc_follow_cursor(struct vc_data *vc)
+{
+	vc_x = vc->vc_x - (vc->vc_x % WIDTH);
+	vc_y = vc->vc_y;
+	lastvc_x = vc->vc_x;
+	lastvc_y = vc->vc_y;
+}
+
+/* Maybe the VC cursor moved, if so follow it */
+static void vc_maybe_cursor_moved(struct vc_data *vc)
+{
+	if (vc->vc_x != lastvc_x || vc->vc_y != lastvc_y)
+		vc_follow_cursor(vc);
+}
+
+/* Show portion of VC at vc_x, vc_y */
+static void vc_refresh(struct vc_data *vc)
+{
+	u16 buf[WIDTH];
+	int i;
+
+	for (i = 0; i < WIDTH; i++) {
+		u16 glyph = screen_glyph(vc,
+				2 * (vc_x + i) + vc_y * vc->vc_size_row);
+		buf[i] = inverse_translate(vc, glyph, 1);
+	}
+	braille_write(buf);
+}
+
+/*
+ * Link to keyboard
+ */
+
+static int keyboard_notifier_call(struct notifier_block *blk,
+				  unsigned long code, void *_param)
+{
+	struct keyboard_notifier_param *param = _param;
+	struct vc_data *vc = param->vc;
+	int ret = NOTIFY_OK;
+
+	if (!param->down)
+		return ret;
+
+	switch (code) {
+	case KBD_KEYCODE:
+		if (console_show) {
+			if (param->value == BRAILLE_KEY) {
+				console_show = 0;
+				beep(880);
+				vc_maybe_cursor_moved(vc);
+				vc_refresh(vc);
+				ret = NOTIFY_STOP;
+			}
+		} else {
+			ret = NOTIFY_STOP;
+			switch (param->value) {
+			case KEY_INSERT:
+				beep(440);
+				console_show = 1;
+				lastVC = -1;
+				braille_write(console_buf);
+				break;
+			case KEY_LEFT:
+				if (vc_x > 0) {
+					vc_x -= WIDTH;
+					if (vc_x < 0)
+						vc_x = 0;
+				} else if (vc_y >= 1) {
+					beep(880);
+					vc_y--;
+					vc_x = vc->vc_cols-WIDTH;
+				} else
+					beep(220);
+				break;
+			case KEY_RIGHT:
+				if (vc_x + WIDTH < vc->vc_cols) {
+					vc_x += WIDTH;
+				} else if (vc_y + 1 < vc->vc_rows) {
+					beep(880);
+					vc_y++;
+					vc_x = 0;
+				} else
+					beep(220);
+				break;
+			case KEY_DOWN:
+				if (vc_y + 1 < vc->vc_rows)
+					vc_y++;
+				else
+					beep(220);
+				break;
+			case KEY_UP:
+				if (vc_y >= 1)
+					vc_y--;
+				else
+					beep(220);
+				break;
+			case KEY_HOME:
+				vc_follow_cursor(vc);
+				break;
+			case KEY_PAGEUP:
+				vc_x = 0;
+				vc_y = 0;
+				break;
+			case KEY_PAGEDOWN:
+				vc_x = 0;
+				vc_y = vc->vc_rows-1;
+				break;
+			default:
+				ret = NOTIFY_OK;
+				break;
+			}
+			if (ret == NOTIFY_STOP)
+				vc_refresh(vc);
+		}
+		break;
+	case KBD_POST_KEYSYM:
+	{
+		unsigned char type = KTYP(param->value) - 0xf0;
+		if (type == KT_SPEC) {
+			unsigned char val = KVAL(param->value);
+			int on_off = -1;
+
+			switch (val) {
+			case KVAL(K_CAPS):
+				on_off = vc_kbd_led(kbd_table + fg_console,
+						VC_CAPSLOCK);
+				break;
+			case KVAL(K_NUM):
+				on_off = vc_kbd_led(kbd_table + fg_console,
+						VC_NUMLOCK);
+				break;
+			case KVAL(K_HOLD):
+				on_off = vc_kbd_led(kbd_table + fg_console,
+						VC_SCROLLOCK);
+				break;
+			}
+			if (on_off == 1)
+				beep(880);
+			else if (on_off == 0)
+				beep(440);
+		}
+	}
+	case KBD_UNBOUND_KEYCODE:
+	case KBD_UNICODE:
+	case KBD_KEYSYM:
+		/* Unused */
+		break;
+	}
+	return ret;
+}
+
+static struct notifier_block keyboard_notifier_block = {
+	.notifier_call = keyboard_notifier_call,
+};
+
+static int vt_notifier_call(struct notifier_block *blk,
+			    unsigned long code, void *_param)
+{
+	struct vt_notifier_param *param = _param;
+	struct vc_data *vc = param->vc;
+	switch (code) {
+	case VT_ALLOCATE:
+		break;
+	case VT_DEALLOCATE:
+		break;
+	case VT_WRITE:
+	{
+		unsigned char c = param->c;
+		if (vc->vc_num != fg_console)
+			break;
+		switch (c) {
+		case '\b':
+		case 127:
+			if (console_cursor > 0) {
+				console_cursor--;
+				console_buf[console_cursor] = ' ';
+			}
+			break;
+		case '\n':
+		case '\v':
+		case '\f':
+		case '\r':
+			console_newline = 1;
+			break;
+		case '\t':
+			c = ' ';
+			/* Fallthrough */
+		default:
+			if (c < 32)
+				/* Ignore other control sequences */
+				break;
+			if (console_newline) {
+				memset(console_buf, 0, sizeof(console_buf));
+				console_cursor = 0;
+				console_newline = 0;
+			}
+			if (console_cursor == WIDTH)
+				memmove(console_buf, &console_buf[1],
+					(WIDTH-1) * sizeof(*console_buf));
+			else
+				console_cursor++;
+			console_buf[console_cursor-1] = c;
+			break;
+		}
+		if (console_show)
+			braille_write(console_buf);
+		else {
+			vc_maybe_cursor_moved(vc);
+			vc_refresh(vc);
+		}
+		break;
+	}
+	case VT_UPDATE:
+		/* Maybe a VT switch, flush */
+		if (console_show) {
+			if (vc->vc_num != lastVC) {
+				lastVC = vc->vc_num;
+				memset(console_buf, 0, sizeof(console_buf));
+				console_cursor = 0;
+				braille_write(console_buf);
+			}
+		} else {
+			vc_maybe_cursor_moved(vc);
+			vc_refresh(vc);
+		}
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block vt_notifier_block = {
+	.notifier_call = vt_notifier_call,
+};
+
+/*
+ * Called from printk.c when console=brl is given
+ */
+
+int braille_register_console(struct console *console, int index,
+		char *console_options, char *braille_options)
+{
+	int ret;
+	if (!console_options)
+		/* Only support VisioBraille for now */
+		console_options = "57600o8";
+	if (braille_co)
+		return -ENODEV;
+	if (console->setup) {
+		ret = console->setup(console, console_options);
+		if (ret != 0)
+			return ret;
+	}
+	console->flags |= CON_ENABLED;
+	console->index = index;
+	braille_co = console;
+	return 0;
+}
+
+int braille_unregister_console(struct console *console)
+{
+	if (braille_co != console)
+		return -EINVAL;
+	braille_co = NULL;
+	return 0;
+}
+
+static int __init braille_init(void)
+{
+	register_keyboard_notifier(&keyboard_notifier_block);
+	register_vt_notifier(&vt_notifier_block);
+	return 0;
+}
+
+console_initcall(braille_init);
diff --git a/drivers/char/consolemap.c b/drivers/char/consolemap.c
index 6b104e45a322..4246b8e36cb3 100644
--- a/drivers/char/consolemap.c
+++ b/drivers/char/consolemap.c
@@ -277,6 +277,7 @@ u16 inverse_translate(struct vc_data *conp, int glyph, int use_unicode)
 			return p->inverse_translations[m][glyph];
 	}
 }
+EXPORT_SYMBOL_GPL(inverse_translate);
 
 static void update_user_maps(void)
 {
diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
index d1c50b3302e5..7f7e798c1384 100644
--- a/drivers/char/keyboard.c
+++ b/drivers/char/keyboard.c
@@ -110,6 +110,7 @@ const int max_vals[] = {
 const int NR_TYPES = ARRAY_SIZE(max_vals);
 
 struct kbd_struct kbd_table[MAX_NR_CONSOLES];
+EXPORT_SYMBOL_GPL(kbd_table);
 static struct kbd_struct *kbd = kbd_table;
 
 struct vt_spawn_console vt_spawn_con = {
@@ -260,6 +261,7 @@ void kd_mksound(unsigned int hz, unsigned int ticks)
 	} else
 		kd_nosound(0);
 }
+EXPORT_SYMBOL(kd_mksound);
 
 /*
  * Setting the keyboard rate.
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 71cf203d282d..e458b08139af 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -4004,6 +4004,7 @@ u16 screen_glyph(struct vc_data *vc, int offset)
 		c |= 0x100;
 	return c;
 }
+EXPORT_SYMBOL_GPL(screen_glyph);
 
 /* used by vcs - note the word offset */
 unsigned short *screen_pos(struct vc_data *vc, int w_offset, int viewed)
diff --git a/include/linux/console.h b/include/linux/console.h
index a5f88a6a259d..a4f27fbdf549 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -91,6 +91,7 @@ void give_up_console(const struct consw *sw);
 #define CON_ENABLED	(4)
 #define CON_BOOT	(8)
 #define CON_ANYTIME	(16) /* Safe to call when cpu is offline */
+#define CON_BRL		(32) /* Used for a braille device */
 
 struct console {
 	char	name[16];
@@ -121,6 +122,9 @@ extern struct tty_driver *console_device(int *);
 extern void console_stop(struct console *);
 extern void console_start(struct console *);
 extern int is_console_locked(void);
+extern int braille_register_console(struct console *, int index,
+		char *console_options, char *braille_options);
+extern int braille_unregister_console(struct console *);
 
 extern int console_suspend_enabled;
 
diff --git a/kernel/printk.c b/kernel/printk.c
index 0d232589a923..e61346faf6a5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -111,6 +111,9 @@ struct console_cmdline
 	char	name[8];			/* Name of the driver	    */
 	int	index;				/* Minor dev. to use	    */
 	char	*options;			/* Options for the driver   */
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+	char	*brl_options;			/* Options for braille driver */
+#endif
 };
 
 #define MAX_CMDLINECONSOLES 8
@@ -808,15 +811,60 @@ static void call_console_drivers(unsigned start, unsigned end)
 
 #endif
 
+static int __add_preferred_console(char *name, int idx, char *options,
+				   char *brl_options)
+{
+	struct console_cmdline *c;
+	int i;
+
+	/*
+	 *	See if this tty is not yet registered, and
+	 *	if we have a slot free.
+	 */
+	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+		if (strcmp(console_cmdline[i].name, name) == 0 &&
+			  console_cmdline[i].index == idx) {
+				if (!brl_options)
+					selected_console = i;
+				return 0;
+		}
+	if (i == MAX_CMDLINECONSOLES)
+		return -E2BIG;
+	if (!brl_options)
+		selected_console = i;
+	c = &console_cmdline[i];
+	strlcpy(c->name, name, sizeof(c->name));
+	c->options = options;
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+	c->brl_options = brl_options;
+#endif
+	c->index = idx;
+	return 0;
+}
 /*
  * Set up a list of consoles.  Called from init/main.c
  */
 static int __init console_setup(char *str)
 {
 	char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
-	char *s, *options;
+	char *s, *options, *brl_options = NULL;
 	int idx;
 
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+	if (!memcmp(str, "brl,", 4)) {
+		brl_options = "";
+		str += 4;
+	} else if (!memcmp(str, "brl=", 4)) {
+		brl_options = str + 4;
+		str = strchr(brl_options, ',');
+		if (!str) {
+			printk(KERN_ERR "need port name after brl=\n");
+			return 1;
+		}
+		*(str++) = 0;
+	}
+#endif
+
 	/*
 	 * Decode str into name, index, options.
 	 */
@@ -841,7 +889,7 @@ static int __init console_setup(char *str)
 	idx = simple_strtoul(s, NULL, 10);
 	*s = 0;
 
-	add_preferred_console(buf, idx, options);
+	__add_preferred_console(buf, idx, options, brl_options);
 	return 1;
 }
 __setup("console=", console_setup);
@@ -861,28 +909,7 @@ __setup("console=", console_setup);
  */
 int add_preferred_console(char *name, int idx, char *options)
 {
-	struct console_cmdline *c;
-	int i;
-
-	/*
-	 *	See if this tty is not yet registered, and
-	 *	if we have a slot free.
-	 */
-	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
-		if (strcmp(console_cmdline[i].name, name) == 0 &&
-			  console_cmdline[i].index == idx) {
-				selected_console = i;
-				return 0;
-		}
-	if (i == MAX_CMDLINECONSOLES)
-		return -E2BIG;
-	selected_console = i;
-	c = &console_cmdline[i];
-	memcpy(c->name, name, sizeof(c->name));
-	c->name[sizeof(c->name) - 1] = 0;
-	c->options = options;
-	c->index = idx;
-	return 0;
+	return __add_preferred_console(name, idx, options, NULL);
 }
 
 int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
@@ -1163,6 +1190,16 @@ void register_console(struct console *console)
 			continue;
 		if (console->index < 0)
 			console->index = console_cmdline[i].index;
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+		if (console_cmdline[i].brl_options) {
+			console->flags |= CON_BRL;
+			braille_register_console(console,
+					console_cmdline[i].index,
+					console_cmdline[i].options,
+					console_cmdline[i].brl_options);
+			return;
+		}
+#endif
 		if (console->setup &&
 		    console->setup(console, console_cmdline[i].options) != 0)
 			break;
@@ -1221,6 +1258,11 @@ int unregister_console(struct console *console)
         struct console *a, *b;
 	int res = 1;
 
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+	if (console->flags & CON_BRL)
+		return braille_unregister_console(console);
+#endif
+
 	acquire_console_sem();
 	if (console_drivers == console) {
 		console_drivers=console->next;
-- 
cgit v1.3-14-g43fede


From f735295b14ae073a8302d7b1da894bc597724557 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Wed, 30 Apr 2008 00:54:52 -0700
Subject: printk: don't read beyond string arguments' terminating zero

Fix update_console_cmdline() not to to read beyond the terminating zero of its
name argument.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e61346faf6a5..8fb01c32aa3b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -921,7 +921,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
 		if (strcmp(console_cmdline[i].name, name) == 0 &&
 			  console_cmdline[i].index == idx) {
 				c = &console_cmdline[i];
-				memcpy(c->name, name_new, sizeof(c->name));
+				strlcpy(c->name, name_new, sizeof(c->name));
 				c->name[sizeof(c->name) - 1] = 0;
 				c->options = options;
 				c->index = idx_new;
-- 
cgit v1.3-14-g43fede


From 354a1f4d99240f53980275416ca3e1ac2ee73d5d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 30 Apr 2008 00:54:54 -0700
Subject: alloc_uid: cleanup

Use kmem_cache_zalloc(), remove large amounts of initialisation code and
ifdeffery.

Note: this assumes that memset(*atomic_t, 0) correctly initialises the
atomic_t.  This is true for all present archtiectures and if it becomes false
for a future architecture then we'll need to make large changes all over the
place anyway.

Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/user.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index aefbbfa3159f..865ecf57a096 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -384,7 +384,7 @@ void free_uid(struct user_struct *up)
 		local_irq_restore(flags);
 }
 
-struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
+struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 {
 	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up, *new;
@@ -399,26 +399,12 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 	spin_unlock_irq(&uidhash_lock);
 
 	if (!up) {
-		new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
+		new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
 		if (!new)
 			goto out_unlock;
 
 		new->uid = uid;
 		atomic_set(&new->__count, 1);
-		atomic_set(&new->processes, 0);
-		atomic_set(&new->files, 0);
-		atomic_set(&new->sigpending, 0);
-#ifdef CONFIG_INOTIFY_USER
-		atomic_set(&new->inotify_watches, 0);
-		atomic_set(&new->inotify_devs, 0);
-#endif
-#ifdef CONFIG_POSIX_MQUEUE
-		new->mq_bytes = 0;
-#endif
-		new->locked_shm = 0;
-#ifdef CONFIG_KEYS
-		new->uid_keyring = new->session_keyring = NULL;
-#endif
 
 		if (sched_create_user(new) < 0)
 			goto out_free_user;
-- 
cgit v1.3-14-g43fede


From c6f3a97f86a5c97be0ca255976110bb9c3cfe669 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 30 Apr 2008 00:55:03 -0700
Subject: debugobjects: add timer specific object debugging code

Add calls to the generic object debugging infrastructure and provide fixup
functions which allow to keep the system alive when recoverable problems have
been detected by the object debugging core code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Greg KH <greg@kroah.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/ieee1284.c |   4 +-
 fs/aio.c                   |   5 +-
 include/linux/poison.h     |   7 +++
 include/linux/timer.h      |  23 ++++++-
 kernel/timer.c             | 153 ++++++++++++++++++++++++++++++++++++++++++---
 lib/Kconfig.debug          |   8 +++
 6 files changed, 187 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c
index 54a6ef72906e..0338b0912674 100644
--- a/drivers/parport/ieee1284.c
+++ b/drivers/parport/ieee1284.c
@@ -76,7 +76,7 @@ int parport_wait_event (struct parport *port, signed long timeout)
 		   semaphore. */
 		return 1;
 
-	init_timer (&timer);
+	init_timer_on_stack(&timer);
 	timer.expires = jiffies + timeout;
 	timer.function = timeout_waiting_on_port;
 	port_from_cookie[port->number % PARPORT_MAX] = port;
@@ -88,6 +88,8 @@ int parport_wait_event (struct parport *port, signed long timeout)
 		/* Timed out. */
 		ret = 1;
 
+	destroy_timer_on_stack(&timer);
+
 	return ret;
 }
 
diff --git a/fs/aio.c b/fs/aio.c
index 99c2352906a0..b5253e77eb2f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1078,9 +1078,7 @@ static void timeout_func(unsigned long data)
 
 static inline void init_timeout(struct aio_timeout *to)
 {
-	init_timer(&to->timer);
-	to->timer.data = (unsigned long)to;
-	to->timer.function = timeout_func;
+	setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
 	to->timed_out = 0;
 	to->p = current;
 }
@@ -1213,6 +1211,7 @@ retry:
 	if (timeout)
 		clear_timeout(&to);
 out:
+	destroy_timer_on_stack(&to.timer);
 	return i ? i : ret;
 }
 
diff --git a/include/linux/poison.h b/include/linux/poison.h
index a9c31be7052c..9f31683728fd 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -10,6 +10,13 @@
 #define LIST_POISON1  ((void *) 0x00100100)
 #define LIST_POISON2  ((void *) 0x00200200)
 
+/********** include/linux/timer.h **********/
+/*
+ * Magic number "tsta" to indicate a static timer initializer
+ * for the object debugging code.
+ */
+#define TIMER_ENTRY_STATIC	((void *) 0x74737461)
+
 /********** mm/slab.c **********/
 /*
  * Magic nums for obj red zoning.
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 979fefdeb862..d4ba79248a27 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -4,6 +4,7 @@
 #include <linux/list.h>
 #include <linux/ktime.h>
 #include <linux/stddef.h>
+#include <linux/debugobjects.h>
 
 struct tvec_base;
 
@@ -25,6 +26,7 @@ struct timer_list {
 extern struct tvec_base boot_tvec_bases;
 
 #define TIMER_INITIALIZER(_function, _expires, _data) {		\
+		.entry = { .prev = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
@@ -38,6 +40,17 @@ extern struct tvec_base boot_tvec_bases;
 void init_timer(struct timer_list *timer);
 void init_timer_deferrable(struct timer_list *timer);
 
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+extern void init_timer_on_stack(struct timer_list *timer);
+extern void destroy_timer_on_stack(struct timer_list *timer);
+#else
+static inline void destroy_timer_on_stack(struct timer_list *timer) { }
+static inline void init_timer_on_stack(struct timer_list *timer)
+{
+	init_timer(timer);
+}
+#endif
+
 static inline void setup_timer(struct timer_list * timer,
 				void (*function)(unsigned long),
 				unsigned long data)
@@ -47,6 +60,15 @@ static inline void setup_timer(struct timer_list * timer,
 	init_timer(timer);
 }
 
+static inline void setup_timer_on_stack(struct timer_list *timer,
+					void (*function)(unsigned long),
+					unsigned long data)
+{
+	timer->function = function;
+	timer->data = data;
+	init_timer_on_stack(timer);
+}
+
 /**
  * timer_pending - is a timer pending?
  * @timer: the timer in question
@@ -164,5 +186,4 @@ unsigned long __round_jiffies_relative(unsigned long j, int cpu);
 unsigned long round_jiffies(unsigned long j);
 unsigned long round_jiffies_relative(unsigned long j);
 
-
 #endif
diff --git a/kernel/timer.c b/kernel/timer.c
index f3d35d4ea42e..ceacc6626572 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -320,14 +320,130 @@ static void timer_stats_account_timer(struct timer_list *timer)
 static void timer_stats_account_timer(struct timer_list *timer) {}
 #endif
 
-/**
- * init_timer - initialize a timer.
- * @timer: the timer to be initialized
- *
- * init_timer() must be done to a timer prior calling *any* of the
- * other timer functions.
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+
+static struct debug_obj_descr timer_debug_descr;
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
  */
-void init_timer(struct timer_list *timer)
+static int timer_fixup_init(void *addr, enum debug_obj_state state)
+{
+	struct timer_list *timer = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_ACTIVE:
+		del_timer_sync(timer);
+		debug_object_init(timer, &timer_debug_descr);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+{
+	struct timer_list *timer = addr;
+
+	switch (state) {
+
+	case ODEBUG_STATE_NOTAVAILABLE:
+		/*
+		 * This is not really a fixup. The timer was
+		 * statically initialized. We just make sure that it
+		 * is tracked in the object tracker.
+		 */
+		if (timer->entry.next == NULL &&
+		    timer->entry.prev == TIMER_ENTRY_STATIC) {
+			debug_object_init(timer, &timer_debug_descr);
+			debug_object_activate(timer, &timer_debug_descr);
+			return 0;
+		} else {
+			WARN_ON_ONCE(1);
+		}
+		return 0;
+
+	case ODEBUG_STATE_ACTIVE:
+		WARN_ON(1);
+
+	default:
+		return 0;
+	}
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int timer_fixup_free(void *addr, enum debug_obj_state state)
+{
+	struct timer_list *timer = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_ACTIVE:
+		del_timer_sync(timer);
+		debug_object_free(timer, &timer_debug_descr);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static struct debug_obj_descr timer_debug_descr = {
+	.name		= "timer_list",
+	.fixup_init	= timer_fixup_init,
+	.fixup_activate	= timer_fixup_activate,
+	.fixup_free	= timer_fixup_free,
+};
+
+static inline void debug_timer_init(struct timer_list *timer)
+{
+	debug_object_init(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_activate(struct timer_list *timer)
+{
+	debug_object_activate(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_deactivate(struct timer_list *timer)
+{
+	debug_object_deactivate(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_free(struct timer_list *timer)
+{
+	debug_object_free(timer, &timer_debug_descr);
+}
+
+static void __init_timer(struct timer_list *timer);
+
+void init_timer_on_stack(struct timer_list *timer)
+{
+	debug_object_init_on_stack(timer, &timer_debug_descr);
+	__init_timer(timer);
+}
+EXPORT_SYMBOL_GPL(init_timer_on_stack);
+
+void destroy_timer_on_stack(struct timer_list *timer)
+{
+	debug_object_free(timer, &timer_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
+
+#else
+static inline void debug_timer_init(struct timer_list *timer) { }
+static inline void debug_timer_activate(struct timer_list *timer) { }
+static inline void debug_timer_deactivate(struct timer_list *timer) { }
+#endif
+
+static void __init_timer(struct timer_list *timer)
 {
 	timer->entry.next = NULL;
 	timer->base = __raw_get_cpu_var(tvec_bases);
@@ -337,6 +453,19 @@ void init_timer(struct timer_list *timer)
 	memset(timer->start_comm, 0, TASK_COMM_LEN);
 #endif
 }
+
+/**
+ * init_timer - initialize a timer.
+ * @timer: the timer to be initialized
+ *
+ * init_timer() must be done to a timer prior calling *any* of the
+ * other timer functions.
+ */
+void init_timer(struct timer_list *timer)
+{
+	debug_timer_init(timer);
+	__init_timer(timer);
+}
 EXPORT_SYMBOL(init_timer);
 
 void init_timer_deferrable(struct timer_list *timer)
@@ -351,6 +480,8 @@ static inline void detach_timer(struct timer_list *timer,
 {
 	struct list_head *entry = &timer->entry;
 
+	debug_timer_deactivate(timer);
+
 	__list_del(entry->prev, entry->next);
 	if (clear_pending)
 		entry->next = NULL;
@@ -405,6 +536,8 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 		ret = 1;
 	}
 
+	debug_timer_activate(timer);
+
 	new_base = __get_cpu_var(tvec_bases);
 
 	if (base != new_base) {
@@ -450,6 +583,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	BUG_ON(timer_pending(timer) || !timer->function);
 	spin_lock_irqsave(&base->lock, flags);
 	timer_set_base(timer, base);
+	debug_timer_activate(timer);
 	internal_add_timer(base, timer);
 	/*
 	 * Check whether the other CPU is idle and needs to be
@@ -1086,11 +1220,14 @@ signed long __sched schedule_timeout(signed long timeout)
 
 	expire = timeout + jiffies;
 
-	setup_timer(&timer, process_timeout, (unsigned long)current);
+	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
 	__mod_timer(&timer, expire);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 
+	/* Remove the timer from the object tracker */
+	destroy_timer_on_stack(&timer);
+
 	timeout = expire - jiffies;
 
  out:
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 3e132b0a59cc..d2099f41aa1e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -217,6 +217,14 @@ config DEBUG_OBJECTS_FREE
 	  properly. This can make kmalloc/kfree-intensive workloads
 	  much slower.
 
+config DEBUG_OBJECTS_TIMERS
+	bool "Debug timer objects"
+	depends on DEBUG_OBJECTS
+	help
+	  If you say Y here, additional code will be inserted into the
+	  timer routines to track the life time of timer objects and
+	  validate the timer operations.
+
 config DEBUG_SLAB
 	bool "Debug slab memory allocations"
 	depends on DEBUG_KERNEL && SLAB
-- 
cgit v1.3-14-g43fede


From 237fc6e7a35076f584b9d0794a5204fe4bd9b9e5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 30 Apr 2008 00:55:04 -0700
Subject: add hrtimer specific debugobjects code

hrtimers have now dynamic users in the network code.  Put them under
debugobjects surveillance as well.

Add calls to the generic object debugging infrastructure and provide fixup
functions which allow to keep the system alive when recoverable problems have
been detected by the object debugging core code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg KH <greg@kroah.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hrtimer.h |  15 ++++
 kernel/futex.c          |  17 ++++-
 kernel/hrtimer.c        | 177 ++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 186 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 56f3236da829..31a4d653389f 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -266,6 +266,21 @@ extern ktime_t ktime_get_real(void);
 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
 			 enum hrtimer_mode mode);
 
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
+				  enum hrtimer_mode mode);
+
+extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
+#else
+static inline void hrtimer_init_on_stack(struct hrtimer *timer,
+					 clockid_t which_clock,
+					 enum hrtimer_mode mode)
+{
+	hrtimer_init(timer, which_clock, mode);
+}
+static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
+#endif
+
 /* Basic timer operations: */
 extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
 			 const enum hrtimer_mode mode);
diff --git a/kernel/futex.c b/kernel/futex.c
index e43945e995f5..98092c9817f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1266,11 +1266,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 		if (!abs_time)
 			schedule();
 		else {
-			hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+			hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
+						HRTIMER_MODE_ABS);
 			hrtimer_init_sleeper(&t, current);
 			t.timer.expires = *abs_time;
 
-			hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
+			hrtimer_start(&t.timer, t.timer.expires,
+						HRTIMER_MODE_ABS);
 			if (!hrtimer_active(&t.timer))
 				t.task = NULL;
 
@@ -1286,6 +1288,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 
 			/* Flag if a timeout occured */
 			rem = (t.task == NULL);
+
+			destroy_hrtimer_on_stack(&t.timer);
 		}
 	}
 	__set_current_state(TASK_RUNNING);
@@ -1367,7 +1371,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
 	if (time) {
 		to = &timeout;
-		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+		hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
+				      HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
 		to->timer.expires = *time;
 	}
@@ -1581,6 +1586,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	unqueue_me_pi(&q);
 	futex_unlock_mm(fshared);
 
+	if (to)
+		destroy_hrtimer_on_stack(&to->timer);
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
  out_unlock_release_sem:
@@ -1588,6 +1595,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
  out_release_sem:
 	futex_unlock_mm(fshared);
+	if (to)
+		destroy_hrtimer_on_stack(&to->timer);
 	return ret;
 
  uaddr_faulted:
@@ -1615,6 +1624,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (!ret && (uval != -EFAULT))
 		goto retry;
 
+	if (to)
+		destroy_hrtimer_on_stack(&to->timer);
 	return ret;
 }
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index dea4c9124ac8..9af1d6a8095e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,7 @@
 #include <linux/tick.h>
 #include <linux/seq_file.h>
 #include <linux/err.h>
+#include <linux/debugobjects.h>
 
 #include <asm/uaccess.h>
 
@@ -342,6 +343,115 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
 	return res;
 }
 
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+
+static struct debug_obj_descr hrtimer_debug_descr;
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+{
+	struct hrtimer *timer = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_ACTIVE:
+		hrtimer_cancel(timer);
+		debug_object_init(timer, &hrtimer_debug_descr);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+{
+	switch (state) {
+
+	case ODEBUG_STATE_NOTAVAILABLE:
+		WARN_ON_ONCE(1);
+		return 0;
+
+	case ODEBUG_STATE_ACTIVE:
+		WARN_ON(1);
+
+	default:
+		return 0;
+	}
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+{
+	struct hrtimer *timer = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_ACTIVE:
+		hrtimer_cancel(timer);
+		debug_object_free(timer, &hrtimer_debug_descr);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static struct debug_obj_descr hrtimer_debug_descr = {
+	.name		= "hrtimer",
+	.fixup_init	= hrtimer_fixup_init,
+	.fixup_activate	= hrtimer_fixup_activate,
+	.fixup_free	= hrtimer_fixup_free,
+};
+
+static inline void debug_hrtimer_init(struct hrtimer *timer)
+{
+	debug_object_init(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_activate(struct hrtimer *timer)
+{
+	debug_object_activate(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
+{
+	debug_object_deactivate(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_free(struct hrtimer *timer)
+{
+	debug_object_free(timer, &hrtimer_debug_descr);
+}
+
+static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+			   enum hrtimer_mode mode);
+
+void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
+			   enum hrtimer_mode mode)
+{
+	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
+	__hrtimer_init(timer, clock_id, mode);
+}
+
+void destroy_hrtimer_on_stack(struct hrtimer *timer)
+{
+	debug_object_free(timer, &hrtimer_debug_descr);
+}
+
+#else
+static inline void debug_hrtimer_init(struct hrtimer *timer) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
+#endif
+
 /*
  * Check, whether the timer is on the callback pending list
  */
@@ -567,6 +677,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 		/* Timer is expired, act upon the callback mode */
 		switch(timer->cb_mode) {
 		case HRTIMER_CB_IRQSAFE_NO_RESTART:
+			debug_hrtimer_deactivate(timer);
 			/*
 			 * We can call the callback from here. No restart
 			 * happens, so no danger of recursion
@@ -581,6 +692,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 			 * the tick timer in the softirq ! The calling site
 			 * takes care of this.
 			 */
+			debug_hrtimer_deactivate(timer);
 			return 1;
 		case HRTIMER_CB_IRQSAFE:
 		case HRTIMER_CB_SOFTIRQ:
@@ -735,6 +847,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
 	struct hrtimer *entry;
 	int leftmost = 1;
 
+	debug_hrtimer_activate(timer);
+
 	/*
 	 * Find the right place in the rbtree:
 	 */
@@ -831,6 +945,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		 * reprogramming happens in the interrupt handler. This is a
 		 * rare case and less expensive than a smp call.
 		 */
+		debug_hrtimer_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
 		__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -878,6 +993,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 		tim = ktime_add_safe(tim, base->resolution);
 #endif
 	}
+
 	timer->expires = tim;
 
 	timer_stats_hrtimer_set_start_info(timer);
@@ -1011,14 +1127,8 @@ ktime_t hrtimer_get_next_event(void)
 }
 #endif
 
-/**
- * hrtimer_init - initialize a timer to the given clock
- * @timer:	the timer to be initialized
- * @clock_id:	the clock to be used
- * @mode:	timer mode abs/rel
- */
-void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
-		  enum hrtimer_mode mode)
+static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+			   enum hrtimer_mode mode)
 {
 	struct hrtimer_cpu_base *cpu_base;
 
@@ -1039,6 +1149,19 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 	memset(timer->start_comm, 0, TASK_COMM_LEN);
 #endif
 }
+
+/**
+ * hrtimer_init - initialize a timer to the given clock
+ * @timer:	the timer to be initialized
+ * @clock_id:	the clock to be used
+ * @mode:	timer mode abs/rel
+ */
+void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+		  enum hrtimer_mode mode)
+{
+	debug_hrtimer_init(timer);
+	__hrtimer_init(timer, clock_id, mode);
+}
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
 /**
@@ -1072,6 +1195,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
 		timer = list_entry(cpu_base->cb_pending.next,
 				   struct hrtimer, cb_entry);
 
+		debug_hrtimer_deactivate(timer);
 		timer_stats_account_hrtimer(timer);
 
 		fn = timer->function;
@@ -1120,6 +1244,7 @@ static void __run_hrtimer(struct hrtimer *timer)
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
+	debug_hrtimer_deactivate(timer);
 	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
 	timer_stats_account_hrtimer(timer);
 
@@ -1378,22 +1503,27 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 {
 	struct hrtimer_sleeper t;
 	struct timespec __user  *rmtp;
+	int ret = 0;
 
-	hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS);
+	hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
+				HRTIMER_MODE_ABS);
 	t.timer.expires.tv64 = restart->nanosleep.expires;
 
 	if (do_nanosleep(&t, HRTIMER_MODE_ABS))
-		return 0;
+		goto out;
 
 	rmtp = restart->nanosleep.rmtp;
 	if (rmtp) {
-		int ret = update_rmtp(&t.timer, rmtp);
+		ret = update_rmtp(&t.timer, rmtp);
 		if (ret <= 0)
-			return ret;
+			goto out;
 	}
 
 	/* The other values in restart are already filled in */
-	return -ERESTART_RESTARTBLOCK;
+	ret = -ERESTART_RESTARTBLOCK;
+out:
+	destroy_hrtimer_on_stack(&t.timer);
+	return ret;
 }
 
 long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
@@ -1401,20 +1531,23 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 {
 	struct restart_block *restart;
 	struct hrtimer_sleeper t;
+	int ret = 0;
 
-	hrtimer_init(&t.timer, clockid, mode);
+	hrtimer_init_on_stack(&t.timer, clockid, mode);
 	t.timer.expires = timespec_to_ktime(*rqtp);
 	if (do_nanosleep(&t, mode))
-		return 0;
+		goto out;
 
 	/* Absolute timers do not update the rmtp value and restart: */
-	if (mode == HRTIMER_MODE_ABS)
-		return -ERESTARTNOHAND;
+	if (mode == HRTIMER_MODE_ABS) {
+		ret = -ERESTARTNOHAND;
+		goto out;
+	}
 
 	if (rmtp) {
-		int ret = update_rmtp(&t.timer, rmtp);
+		ret = update_rmtp(&t.timer, rmtp);
 		if (ret <= 0)
-			return ret;
+			goto out;
 	}
 
 	restart = &current_thread_info()->restart_block;
@@ -1423,7 +1556,10 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	restart->nanosleep.rmtp = rmtp;
 	restart->nanosleep.expires = t.timer.expires.tv64;
 
-	return -ERESTART_RESTARTBLOCK;
+	ret = -ERESTART_RESTARTBLOCK;
+out:
+	destroy_hrtimer_on_stack(&t.timer);
+	return ret;
 }
 
 asmlinkage long
@@ -1468,6 +1604,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	while ((node = rb_first(&old_base->active))) {
 		timer = rb_entry(node, struct hrtimer, node);
 		BUG_ON(hrtimer_callback_running(timer));
+		debug_hrtimer_deactivate(timer);
 		__remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
 		timer->base = new_base;
 		/*
-- 
cgit v1.3-14-g43fede


From af1f16d08f38ab6f17b5760e6ec9d2b7d3a5ff1a Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 30 Apr 2008 00:55:08 -0700
Subject: kernel: replace remaining __FUNCTION__ occurrences

__FUNCTION__ is gcc-specific, use __func__

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c       | 4 ++--
 kernel/workqueue.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index a98f6ab16ecd..c77bc3a1c722 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -215,7 +215,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					  hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
-				__FUNCTION__, cpu);
+				__func__, cpu);
 		err = -EINVAL;
 		goto out_release;
 	}
@@ -295,7 +295,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	if (ret == NOTIFY_BAD) {
 		nr_calls--;
 		printk("%s: attempt to bring up CPU %u failed\n",
-				__FUNCTION__, cpu);
+				__func__, cpu);
 		ret = -EINVAL;
 		goto out_notify;
 	}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7db251a959c5..721093a22561 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -247,7 +247,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 	if (cwq->run_depth > 3) {
 		/* morton gets to eat his hat */
 		printk("%s: recursion depth exceeded: %d\n",
-			__FUNCTION__, cwq->run_depth);
+			__func__, cwq->run_depth);
 		dump_stack();
 	}
 	while (!list_empty(&cwq->worklist)) {
-- 
cgit v1.3-14-g43fede


From a58730c42174672fe0012a4edbe3e38f94ef2bad Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 13 Mar 2008 09:03:44 +0000
Subject: module: make module_sect_attrs private to kernel/module.c

No-one else is using these afaics.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 17 -----------------
 kernel/module.c        | 16 +++++++++++++++-
 2 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 819c4e889bf1..a29c2ebb7c38 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -229,23 +229,6 @@ enum module_state
 	MODULE_STATE_GOING,
 };
 
-/* Similar stuff for section attributes. */
-struct module_sect_attr
-{
-	struct module_attribute mattr;
-	char *name;
-	unsigned long address;
-};
-
-struct module_sect_attrs
-{
-	struct attribute_group grp;
-	int nsections;
-	struct module_sect_attr attrs[0];
-};
-
-struct module_param_attrs;
-
 struct module
 {
 	enum module_state state;
diff --git a/kernel/module.c b/kernel/module.c
index 8d6cccc6c3cf..b0d7c2a41bd9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -991,6 +991,20 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
  * J. Corbet <corbet@lwn.net>
  */
 #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
+struct module_sect_attr
+{
+	struct module_attribute mattr;
+	char *name;
+	unsigned long address;
+};
+
+struct module_sect_attrs
+{
+	struct attribute_group grp;
+	unsigned int nsections;
+	struct module_sect_attr attrs[0];
+};
+
 static ssize_t module_sect_show(struct module_attribute *mattr,
 				struct module *mod, char *buf)
 {
@@ -1001,7 +1015,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 
 static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
 {
-	int section;
+	unsigned int section;
 
 	for (section = 0; section < sect_attrs->nsections; section++)
 		kfree(sect_attrs->attrs[section].name);
-- 
cgit v1.3-14-g43fede


From ea01e798e2d27fd04142e0473ca36570fa9d9218 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 13 Mar 2008 09:02:17 +0000
Subject: module: reduce module image and resident size

Resulting reduction (x86-64, gcc 4.1.2) with my (special purpose, i.e.
much reduced) configurations:
- 16k kernel resident size
- 180k module resident size
- 10k module image size

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 2 +-
 kernel/module.c        | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index a29c2ebb7c38..3e03b1acbc94 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -190,7 +190,7 @@ void *__symbol_get_gpl(const char *symbol);
 	extern typeof(sym) sym;					\
 	__CRC_SYMBOL(sym, sec)					\
 	static const char __kstrtab_##sym[]			\
-	__attribute__((section("__ksymtab_strings")))		\
+	__attribute__((section("__ksymtab_strings"), aligned(1))) \
 	= MODULE_SYMBOL_PREFIX #sym;                    	\
 	static const struct kernel_symbol __ksymtab_##sym	\
 	__used							\
diff --git a/kernel/module.c b/kernel/module.c
index b0d7c2a41bd9..031bf26af8ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1828,8 +1828,9 @@ static struct module *load_module(void __user *umod,
 	unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
 #endif
 
-	/* Don't keep modinfo section */
+	/* Don't keep modinfo and version sections. */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+	sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 #ifdef CONFIG_KALLSYMS
 	/* Keep symbol and string tables for decoding later. */
 	sechdrs[symindex].sh_flags |= SHF_ALLOC;
-- 
cgit v1.3-14-g43fede


From ad9546c9917d44eddc7676b639296d624cee455e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 May 2008 21:14:59 -0500
Subject: module: neaten __find_symbol, rename to find_symbol

__find_symbol() has grown over time: there are now 5 different arrays
of symbols it traverses.  It also shouldn't print out a warning on
some calls (ie. verify_symbol which simply checks for name clashes,
and __symbol_put which checks for bugs).

1) Rename to find_symbol: no need for underscores.
2) Use bool and add "warn" parameter to suppress warnings.
3) Make table-driven rather than open coded.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 246 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 125 insertions(+), 121 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 031bf26af8ea..679e4c88ed9e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -164,131 +164,140 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
 	return NULL;
 }
 
-static void printk_unused_warning(const char *name)
+static bool always_ok(bool gplok, bool warn, const char *name)
 {
-	printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
-		"however this module is using it.\n", name);
-	printk(KERN_WARNING "This symbol will go away in the future.\n");
-	printk(KERN_WARNING "Please evalute if this is the right api to use, "
-		"and if it really is, submit a report the linux kernel "
-		"mailinglist together with submitting your code for "
-		"inclusion.\n");
+	return true;
 }
 
-/* Find a symbol, return value, crc and module which owns it */
-static unsigned long __find_symbol(const char *name,
-				   struct module **owner,
-				   const unsigned long **crc,
-				   int gplok)
+static bool printk_unused_warning(bool gplok, bool warn, const char *name)
 {
-	struct module *mod;
-	const struct kernel_symbol *ks;
-
-	/* Core kernel first. */
-	*owner = NULL;
-	ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
-	if (ks) {
-		*crc = symversion(__start___kcrctab, (ks - __start___ksymtab));
-		return ks->value;
+	if (warn) {
+		printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
+		       "however this module is using it.\n", name);
+		printk(KERN_WARNING
+		       "This symbol will go away in the future.\n");
+		printk(KERN_WARNING
+		       "Please evalute if this is the right api to use and if "
+		       "it really is, submit a report the linux kernel "
+		       "mailinglist together with submitting your code for "
+		       "inclusion.\n");
 	}
-	if (gplok) {
-		ks = lookup_symbol(name, __start___ksymtab_gpl,
-					 __stop___ksymtab_gpl);
-		if (ks) {
-			*crc = symversion(__start___kcrctab_gpl,
-					  (ks - __start___ksymtab_gpl));
-			return ks->value;
-		}
-	}
-	ks = lookup_symbol(name, __start___ksymtab_gpl_future,
-				 __stop___ksymtab_gpl_future);
-	if (ks) {
-		if (!gplok) {
-			printk(KERN_WARNING "Symbol %s is being used "
-			       "by a non-GPL module, which will not "
-			       "be allowed in the future\n", name);
-			printk(KERN_WARNING "Please see the file "
-			       "Documentation/feature-removal-schedule.txt "
-			       "in the kernel source tree for more "
-			       "details.\n");
-		}
-		*crc = symversion(__start___kcrctab_gpl_future,
-				  (ks - __start___ksymtab_gpl_future));
-		return ks->value;
+	return true;
+}
+
+static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
+{
+	if (!gplok)
+		return false;
+	return printk_unused_warning(gplok, warn, name);
+}
+
+static bool gpl_only(bool gplok, bool warn, const char *name)
+{
+	return gplok;
+}
+
+static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
+{
+	if (!gplok && warn) {
+		printk(KERN_WARNING "Symbol %s is being used "
+		       "by a non-GPL module, which will not "
+		       "be allowed in the future\n", name);
+		printk(KERN_WARNING "Please see the file "
+		       "Documentation/feature-removal-schedule.txt "
+		       "in the kernel source tree for more details.\n");
 	}
+	return true;
+}
 
-	ks = lookup_symbol(name, __start___ksymtab_unused,
-				 __stop___ksymtab_unused);
-	if (ks) {
-		printk_unused_warning(name);
-		*crc = symversion(__start___kcrctab_unused,
-				  (ks - __start___ksymtab_unused));
-		return ks->value;
+struct symsearch {
+	const struct kernel_symbol *start, *stop;
+	const unsigned long *crcs;
+	bool (*check)(bool gplok, bool warn, const char *name);
+};
+
+/* Look through this array of symbol tables for a symbol match which
+ * passes the check function. */
+static const struct kernel_symbol *search_symarrays(const struct symsearch *arr,
+						    unsigned int num,
+						    const char *name,
+						    bool gplok,
+						    bool warn,
+						    const unsigned long **crc)
+{
+	unsigned int i;
+	const struct kernel_symbol *ks;
+
+	for (i = 0; i < num; i++) {
+		ks = lookup_symbol(name, arr[i].start, arr[i].stop);
+		if (!ks || !arr[i].check(gplok, warn, name))
+			continue;
+
+		if (crc)
+			*crc = symversion(arr[i].crcs, ks - arr[i].start);
+		return ks;
 	}
+	return NULL;
+}
 
-	if (gplok)
-		ks = lookup_symbol(name, __start___ksymtab_unused_gpl,
-				 __stop___ksymtab_unused_gpl);
+/* Find a symbol, return value, (optional) crc and (optional) module
+ * which owns it */
+static unsigned long find_symbol(const char *name,
+				 struct module **owner,
+				 const unsigned long **crc,
+				 bool gplok,
+				 bool warn)
+{
+	struct module *mod;
+	const struct kernel_symbol *ks;
+	const struct symsearch arr[] = {
+		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+		  always_ok },
+		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
+		  __start___kcrctab_gpl, gpl_only },
+		{ __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
+		  __start___kcrctab_gpl_future, warn_if_not_gpl },
+		{ __start___ksymtab_unused, __stop___ksymtab_unused,
+		  __start___kcrctab_unused, printk_unused_warning },
+		{ __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
+		  __start___kcrctab_unused_gpl, gpl_only_unused_warning },
+	};
+
+	/* Core kernel first. */
+	ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc);
 	if (ks) {
-		printk_unused_warning(name);
-		*crc = symversion(__start___kcrctab_unused_gpl,
-				  (ks - __start___ksymtab_unused_gpl));
+		if (owner)
+			*owner = NULL;
 		return ks->value;
 	}
 
 	/* Now try modules. */
 	list_for_each_entry(mod, &modules, list) {
-		*owner = mod;
-		ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+		struct symsearch arr[] = {
+			{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
+			  always_ok },
+			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+			  mod->gpl_crcs, gpl_only },
+			{ mod->gpl_future_syms,
+			  mod->gpl_future_syms + mod->num_gpl_future_syms,
+			  mod->gpl_future_crcs, warn_if_not_gpl },
+			{ mod->unused_syms,
+			  mod->unused_syms + mod->num_unused_syms,
+			  mod->unused_crcs, printk_unused_warning },
+			{ mod->unused_gpl_syms,
+			  mod->unused_gpl_syms + mod->num_unused_gpl_syms,
+			  mod->unused_gpl_crcs, gpl_only_unused_warning },
+		};
+
+		ks = search_symarrays(arr, ARRAY_SIZE(arr),
+				      name, gplok, warn, crc);
 		if (ks) {
-			*crc = symversion(mod->crcs, (ks - mod->syms));
-			return ks->value;
-		}
-
-		if (gplok) {
-			ks = lookup_symbol(name, mod->gpl_syms,
-					   mod->gpl_syms + mod->num_gpl_syms);
-			if (ks) {
-				*crc = symversion(mod->gpl_crcs,
-						  (ks - mod->gpl_syms));
-				return ks->value;
-			}
-		}
-		ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
-		if (ks) {
-			printk_unused_warning(name);
-			*crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
-			return ks->value;
-		}
-
-		if (gplok) {
-			ks = lookup_symbol(name, mod->unused_gpl_syms,
-					   mod->unused_gpl_syms + mod->num_unused_gpl_syms);
-			if (ks) {
-				printk_unused_warning(name);
-				*crc = symversion(mod->unused_gpl_crcs,
-						  (ks - mod->unused_gpl_syms));
-				return ks->value;
-			}
-		}
-		ks = lookup_symbol(name, mod->gpl_future_syms,
-				   (mod->gpl_future_syms +
-				    mod->num_gpl_future_syms));
-		if (ks) {
-			if (!gplok) {
-				printk(KERN_WARNING "Symbol %s is being used "
-				       "by a non-GPL module, which will not "
-				       "be allowed in the future\n", name);
-				printk(KERN_WARNING "Please see the file "
-				       "Documentation/feature-removal-schedule.txt "
-				       "in the kernel source tree for more "
-				       "details.\n");
-			}
-			*crc = symversion(mod->gpl_future_crcs,
-					  (ks - mod->gpl_future_syms));
+			if (owner)
+				*owner = mod;
 			return ks->value;
 		}
 	}
+
 	DEBUGP("Failed to find symbol %s\n", name);
 	return -ENOENT;
 }
@@ -777,10 +786,9 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
 void __symbol_put(const char *symbol)
 {
 	struct module *owner;
-	const unsigned long *crc;
 
 	preempt_disable();
-	if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1)))
+	if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
 		BUG();
 	module_put(owner);
 	preempt_enable();
@@ -924,13 +932,10 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 					  struct module *mod)
 {
 	const unsigned long *crc;
-	struct module *owner;
 
-	if (IS_ERR_VALUE(__find_symbol("struct_module",
-						&owner, &crc, 1)))
+	if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
 		BUG();
-	return check_version(sechdrs, versindex, "struct_module", mod,
-			     crc);
+	return check_version(sechdrs, versindex, "struct_module", mod, crc);
 }
 
 /* First part is kernel version, which we ignore. */
@@ -974,8 +979,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
 	unsigned long ret;
 	const unsigned long *crc;
 
-	ret = __find_symbol(name, &owner, &crc,
-			!(mod->taints & TAINT_PROPRIETARY_MODULE));
+	ret = find_symbol(name, &owner, &crc,
+			  !(mod->taints & TAINT_PROPRIETARY_MODULE), true);
 	if (!IS_ERR_VALUE(ret)) {
 		/* use_module can fail due to OOM,
 		   or module initialization or unloading */
@@ -1376,10 +1381,9 @@ void *__symbol_get(const char *symbol)
 {
 	struct module *owner;
 	unsigned long value;
-	const unsigned long *crc;
 
 	preempt_disable();
-	value = __find_symbol(symbol, &owner, &crc, 1);
+	value = find_symbol(symbol, &owner, NULL, true, true);
 	if (IS_ERR_VALUE(value))
 		value = 0;
 	else if (strong_try_module_get(owner))
@@ -1402,16 +1406,16 @@ static int verify_export_symbols(struct module *mod)
 	const unsigned long *crc;
 
 	for (i = 0; i < mod->num_syms; i++)
-		if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name,
-							&owner, &crc, 1))) {
+		if (!IS_ERR_VALUE(find_symbol(mod->syms[i].name,
+					      &owner, &crc, true, false))) {
 			name = mod->syms[i].name;
 			ret = -ENOEXEC;
 			goto dup;
 		}
 
 	for (i = 0; i < mod->num_gpl_syms; i++)
-		if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name,
-							&owner, &crc, 1))) {
+		if (!IS_ERR_VALUE(find_symbol(mod->gpl_syms[i].name,
+					      &owner, &crc, true, false))) {
 			name = mod->gpl_syms[i].name;
 			ret = -ENOEXEC;
 			goto dup;
-- 
cgit v1.3-14-g43fede


From 4e2d92454b2d822fe1d474efabccc2a3806d5f86 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 May 2008 21:15:00 -0500
Subject: module: set unused_gpl_crcs instead of overwriting unused_crcs

Obvious typo, but I don't know of any modules with unused GPL exports,
and then it would take someone noticing that the version shouldn't
have matched in a dependent module.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 679e4c88ed9e..ee918938518a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1996,7 +1996,8 @@ static struct module *load_module(void __user *umod,
 		mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
 	mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
 	if (unusedgplcrcindex)
-		mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
+		mod->unused_gpl_crcs
+			= (void *)sechdrs[unusedgplcrcindex].sh_addr;
 
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !crcindex) ||
-- 
cgit v1.3-14-g43fede


From b211104d111c99dbb97c636b57bd9db711455684 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 May 2008 21:15:00 -0500
Subject: module: Enhance verify_export_symbols

Make verify_export_symbols check the modules unused, unused_gpl and
gpl_future syms.

Inspired by Jan Beulich's fix, but table-driven.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index ee918938518a..d2d093e74165 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1400,33 +1400,33 @@ EXPORT_SYMBOL_GPL(__symbol_get);
  */
 static int verify_export_symbols(struct module *mod)
 {
-	const char *name = NULL;
-	unsigned long i, ret = 0;
+	unsigned int i;
 	struct module *owner;
-	const unsigned long *crc;
-
-	for (i = 0; i < mod->num_syms; i++)
-		if (!IS_ERR_VALUE(find_symbol(mod->syms[i].name,
-					      &owner, &crc, true, false))) {
-			name = mod->syms[i].name;
-			ret = -ENOEXEC;
-			goto dup;
-		}
+	const struct kernel_symbol *s;
+	struct {
+		const struct kernel_symbol *sym;
+		unsigned int num;
+	} arr[] = {
+		{ mod->syms, mod->num_syms },
+		{ mod->gpl_syms, mod->num_gpl_syms },
+		{ mod->gpl_future_syms, mod->num_gpl_future_syms },
+		{ mod->unused_syms, mod->num_unused_syms },
+		{ mod->unused_gpl_syms, mod->num_unused_gpl_syms },
+	};
 
-	for (i = 0; i < mod->num_gpl_syms; i++)
-		if (!IS_ERR_VALUE(find_symbol(mod->gpl_syms[i].name,
-					      &owner, &crc, true, false))) {
-			name = mod->gpl_syms[i].name;
-			ret = -ENOEXEC;
-			goto dup;
+	for (i = 0; i < ARRAY_SIZE(arr); i++) {
+		for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
+			if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
+						      NULL, true, false))) {
+				printk(KERN_ERR
+				       "%s: exports duplicate symbol %s"
+				       " (owned by %s)\n",
+				       mod->name, s->name, module_name(owner));
+				return -ENOEXEC;
+			}
 		}
-
-dup:
-	if (ret)
-		printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n",
-			mod->name, name, module_name(owner));
-
-	return ret;
+	}
+	return 0;
 }
 
 /* Change all symbols so that st_value encodes the pointer directly. */
-- 
cgit v1.3-14-g43fede


From df4b565e1fbc777bb6e274378a41fa8ff7485680 Mon Sep 17 00:00:00 2001
From: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Date: Mon, 21 Apr 2008 14:34:31 +0200
Subject: module: add MODULE_STATE_GOING notifier call

Provide module unload callback. Required by the gcov profiling
infrastructure to keep track of profiling data structures.

Signed-off-by: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index d2d093e74165..8674a390a2e8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -745,12 +745,13 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 	if (!forced && module_refcount(mod) != 0)
 		wait_for_zero_refcount(mod);
 
+	mutex_unlock(&module_mutex);
 	/* Final destruction now noone is using it. */
-	if (mod->exit != NULL) {
-		mutex_unlock(&module_mutex);
+	if (mod->exit != NULL)
 		mod->exit();
-		mutex_lock(&module_mutex);
-	}
+	blocking_notifier_call_chain(&module_notify_list,
+				     MODULE_STATE_GOING, mod);
+	mutex_lock(&module_mutex);
 	/* Store the name of the last unloaded module for diagnostic purposes */
 	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
 	free_module(mod);
@@ -2191,6 +2192,8 @@ sys_init_module(void __user *umod,
 		mod->state = MODULE_STATE_GOING;
 		synchronize_sched();
 		module_put(mod);
+		blocking_notifier_call_chain(&module_notify_list,
+					     MODULE_STATE_GOING, mod);
 		mutex_lock(&module_mutex);
 		free_module(mod);
 		mutex_unlock(&module_mutex);
-- 
cgit v1.3-14-g43fede


From e5e417232e7c9ecc58a77902d2e8dd46792cd092 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 1 May 2008 04:34:23 -0700
Subject: Fix cpu hotplug problem in softirq code

currently cpu hotplug (unplug) seems broken on s390 and likely others. On cpu
unplug the system starts to behave very strange and hangs.

I bisected the problem to the following commit:

commit 48f20a9a9488c432fc86df1ff4b7f4fa895d1183
Author: Olof Johansson <olof@lixom.net>
Date: Tue Mar 4 15:23:25 2008 -0800
	tasklets: execute tasklets in the same order they were queued

Reverting this patch seems to fix the problem.  I looked into takeover_tasklet
and it seems that there is a way to corrupt the tail pointer of the current
cpu.  If the tasklet list of the frozen cpu is empty, the tail pointer of the
current cpu points to the address of the head pointer of the stopped cpu and
not to the next pointer of a tasklet_struct.

This patch avoids the list splice of the list is empty and cpu hotplug seems
to work as the tail pointer is not corrupted.  Olof, can you look into that
patch and ACK/NACK it so Andrew can push this to Linus, if appropriate?
Please note that some lines are longer than 80 chars, but line-wrapping looked
worse that this version.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Olof Johansson <olof@lixom.net>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/softirq.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3c44956ee7e2..36e061740047 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -589,16 +589,20 @@ static void takeover_tasklets(unsigned int cpu)
 	local_irq_disable();
 
 	/* Find end, append list for that CPU. */
-	*__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head;
-	__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
-	per_cpu(tasklet_vec, cpu).head = NULL;
-	per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
+	if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
+		*(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head;
+		__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
+		per_cpu(tasklet_vec, cpu).head = NULL;
+		per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
+	}
 	raise_softirq_irqoff(TASKLET_SOFTIRQ);
 
-	*__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
-	__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
-	per_cpu(tasklet_hi_vec, cpu).head = NULL;
-	per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
+	if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
+		*__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
+		__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
+		per_cpu(tasklet_hi_vec, cpu).head = NULL;
+		per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
+	}
 	raise_softirq_irqoff(HI_SOFTIRQ);
 
 	local_irq_enable();
-- 
cgit v1.3-14-g43fede


From 71abb3af62dfa52930755f3b6497eafbe1d6ec85 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:26 -0700
Subject: convert a few do_div users

This converts a few users of do_div to div_[su]64 and this demonstrates nicely
how it can reduce some expressions to one-liners.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time.c     | 29 +++++++++--------------------
 kernel/time/ntp.c | 25 ++++++-------------------
 2 files changed, 15 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index 86729042e4cd..343e2515375a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -36,6 +36,7 @@
 #include <linux/security.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/math64.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -587,9 +588,7 @@ clock_t jiffies_to_clock_t(long x)
 	return x / (HZ / USER_HZ);
 # endif
 #else
-	u64 tmp = (u64)x * TICK_NSEC;
-	do_div(tmp, (NSEC_PER_SEC / USER_HZ));
-	return (long)tmp;
+	return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
 #endif
 }
 EXPORT_SYMBOL(jiffies_to_clock_t);
@@ -601,16 +600,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
 		return ~0UL;
 	return x * (HZ / USER_HZ);
 #else
-	u64 jif;
-
 	/* Don't worry about loss of precision here .. */
 	if (x >= ~0UL / HZ * USER_HZ)
 		return ~0UL;
 
 	/* .. but do try to contain it here */
-	jif = x * (u64) HZ;
-	do_div(jif, USER_HZ);
-	return jif;
+	return div_u64((u64)x * HZ, USER_HZ);
 #endif
 }
 EXPORT_SYMBOL(clock_t_to_jiffies);
@@ -619,10 +614,9 @@ u64 jiffies_64_to_clock_t(u64 x)
 {
 #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
 # if HZ < USER_HZ
-	x *= USER_HZ;
-	do_div(x, HZ);
+	x = div_u64(x * USER_HZ, HZ);
 # elif HZ > USER_HZ
-	do_div(x, HZ / USER_HZ);
+	x = div_u64(x, HZ / USER_HZ);
 # else
 	/* Nothing to do */
 # endif
@@ -632,8 +626,7 @@ u64 jiffies_64_to_clock_t(u64 x)
 	 * but even this doesn't overflow in hundreds of years
 	 * in 64 bits, so..
 	 */
-	x *= TICK_NSEC;
-	do_div(x, (NSEC_PER_SEC / USER_HZ));
+	x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
 #endif
 	return x;
 }
@@ -642,21 +635,17 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t);
 u64 nsec_to_clock_t(u64 x)
 {
 #if (NSEC_PER_SEC % USER_HZ) == 0
-	do_div(x, (NSEC_PER_SEC / USER_HZ));
+	return div_u64(x, NSEC_PER_SEC / USER_HZ);
 #elif (USER_HZ % 512) == 0
-	x *= USER_HZ/512;
-	do_div(x, (NSEC_PER_SEC / 512));
+	return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
 #else
 	/*
          * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
          * overflow after 64.99 years.
          * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
          */
-	x *= 9;
-	do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
-				  USER_HZ));
+	return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
 #endif
-	return x;
 }
 
 #if (BITS_PER_LONG < 64)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5fd9b9469770..a4492f3d64db 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,7 +15,7 @@
 #include <linux/jiffies.h>
 #include <linux/hrtimer.h>
 #include <linux/capability.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
 #include <asm/timex.h>
 
 /*
@@ -53,10 +53,8 @@ static void ntp_update_frequency(void)
 
 	tick_length_base = second_length;
 
-	do_div(second_length, HZ);
-	tick_nsec = second_length >> TICK_LENGTH_SHIFT;
-
-	do_div(tick_length_base, NTP_INTERVAL_FREQ);
+	tick_nsec = div_u64(second_length, HZ) >> TICK_LENGTH_SHIFT;
+	tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
 }
 
 /**
@@ -237,7 +235,7 @@ static inline void notify_cmos_timer(void) { }
 int do_adjtimex(struct timex *txc)
 {
 	long mtemp, save_adjust, rem;
-	s64 freq_adj, temp64;
+	s64 freq_adj;
 	int result;
 
 	/* In order to modify anything, you gotta be super-user! */
@@ -342,19 +340,8 @@ int do_adjtimex(struct timex *txc)
 		    freq_adj = time_offset * mtemp;
 		    freq_adj = shift_right(freq_adj, time_constant * 2 +
 					   (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
-		    if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
-			u64 utemp64;
-			temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
-			if (time_offset < 0) {
-			    utemp64 = -temp64;
-			    do_div(utemp64, mtemp);
-			    freq_adj -= utemp64;
-			} else {
-			    utemp64 = temp64;
-			    do_div(utemp64, mtemp);
-			    freq_adj += utemp64;
-			}
-		    }
+		    if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC))
+			freq_adj += div_s64(time_offset << (SHIFT_NSEC - SHIFT_FLL), mtemp);
 		    freq_adj += time_freq;
 		    freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
 		    time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
-- 
cgit v1.3-14-g43fede


From 6f6d6a1a6a1336431a6cba60ace9e97c3a496a19 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:28 -0700
Subject: rename div64_64 to div64_u64

Rename div64_64 to div64_u64 to make it consistent with the other divide
functions, so it clearly includes the type of the divide.  Move its definition
to math64.h as currently no architecture overrides the generic implementation.
 They can still override it of course, but the duplicated declarations are
avoided.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Patrick McHardy <kaber@trash.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kvm/kvm-ia64.c     |  3 +--
 arch/x86/kvm/i8254.c         |  6 +++---
 arch/x86/kvm/lapic.c         |  6 +++---
 include/asm-arm/div64.h      |  2 --
 include/asm-generic/div64.h  |  7 -------
 include/asm-m68k/div64.h     |  1 -
 include/asm-mips/div64.h     |  6 ------
 include/asm-mn10300/div64.h  |  3 ---
 include/asm-um/div64.h       |  1 -
 include/asm-x86/div64.h      |  2 --
 include/linux/math64.h       | 12 ++++++++++++
 kernel/sched.c               |  6 +++---
 kernel/sched_debug.c         |  4 ++--
 lib/div64.c                  | 12 ++++++------
 net/ipv4/tcp_cubic.c         |  4 ++--
 net/netfilter/xt_connbytes.c |  5 ++---
 16 files changed, 34 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 6df073240135..318b81100623 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1,4 +1,3 @@
-
 /*
  * kvm_ia64.c: Basic KVM suppport On Itanium series processors
  *
@@ -431,7 +430,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 	if (itc_diff < 0)
 		itc_diff = -itc_diff;
 
-	expires = div64_64(itc_diff, cyc_per_usec);
+	expires = div64_u64(itc_diff, cyc_per_usec);
 	kt = ktime_set(0, 1000 * expires);
 	vcpu->arch.ht_active = 1;
 	hrtimer_start(p_ht, kt, HRTIMER_MODE_ABS);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 361e31611276..4c943eabacc3 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -35,7 +35,7 @@
 #include "i8254.h"
 
 #ifndef CONFIG_X86_64
-#define mod_64(x, y) ((x) - (y) * div64_64(x, y))
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
 #else
 #define mod_64(x, y) ((x) % (y))
 #endif
@@ -60,8 +60,8 @@ static u64 muldiv64(u64 a, u32 b, u32 c)
 	rl = (u64)u.l.low * (u64)b;
 	rh = (u64)u.l.high * (u64)b;
 	rh += (rl >> 32);
-	res.l.high = div64_64(rh, c);
-	res.l.low = div64_64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
+	res.l.high = div64_u64(rh, c);
+	res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
 	return res.ll;
 }
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 57ac4e4c556a..36809d79788b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -25,13 +25,13 @@
 #include <linux/hrtimer.h>
 #include <linux/io.h>
 #include <linux/module.h>
+#include <linux/math64.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
 #include <asm/atomic.h>
-#include <asm/div64.h>
 #include "irq.h"
 
 #define PRId64 "d"
@@ -526,8 +526,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
 	} else
 		passed = ktime_sub(now, apic->timer.last_update);
 
-	counter_passed = div64_64(ktime_to_ns(passed),
-				  (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
+	counter_passed = div64_u64(ktime_to_ns(passed),
+				   (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
 
 	if (counter_passed > tmcct) {
 		if (unlikely(!apic_lvtt_period(apic))) {
diff --git a/include/asm-arm/div64.h b/include/asm-arm/div64.h
index 0b5f881c3d85..5001390be958 100644
--- a/include/asm-arm/div64.h
+++ b/include/asm-arm/div64.h
@@ -224,6 +224,4 @@
 
 #endif
 
-extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
-
 #endif
diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index a4a49370793c..8f4e3193342e 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -30,11 +30,6 @@
 	__rem;							\
  })
 
-static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
-{
-	return dividend / divisor;
-}
-
 #elif BITS_PER_LONG == 32
 
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
@@ -54,8 +49,6 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
 	__rem;						\
  })
 
-extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
-
 #else /* BITS_PER_LONG == ?? */
 
 # error do_div() does not yet support the C64
diff --git a/include/asm-m68k/div64.h b/include/asm-m68k/div64.h
index 33caad1628d4..8243c931b5c0 100644
--- a/include/asm-m68k/div64.h
+++ b/include/asm-m68k/div64.h
@@ -25,5 +25,4 @@
 	__rem;							\
 })
 
-extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 #endif /* _M68K_DIV64_H */
diff --git a/include/asm-mips/div64.h b/include/asm-mips/div64.h
index 716371bd0980..d1d699105c11 100644
--- a/include/asm-mips/div64.h
+++ b/include/asm-mips/div64.h
@@ -82,7 +82,6 @@
 	(n) = __quot; \
 	__mod; })
 
-extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 #endif /* (_MIPS_SZLONG == 32) */
 
 #if (_MIPS_SZLONG == 64)
@@ -106,11 +105,6 @@ extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 	(n) = __quot; \
 	__mod; })
 
-static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
-{
-	return dividend / divisor;
-}
-
 #endif /* (_MIPS_SZLONG == 64) */
 
 #endif /* _ASM_DIV64_H */
diff --git a/include/asm-mn10300/div64.h b/include/asm-mn10300/div64.h
index bf9c515a998c..3a8329b3e869 100644
--- a/include/asm-mn10300/div64.h
+++ b/include/asm-mn10300/div64.h
@@ -97,7 +97,4 @@ signed __muldiv64s(signed val, signed mult, signed div)
 	return result;
 }
 
-extern __attribute__((const))
-uint64_t div64_64(uint64_t dividend, uint64_t divisor);
-
 #endif /* _ASM_DIV64 */
diff --git a/include/asm-um/div64.h b/include/asm-um/div64.h
index 7b73b2cd5b34..1e17f7409cab 100644
--- a/include/asm-um/div64.h
+++ b/include/asm-um/div64.h
@@ -3,5 +3,4 @@
 
 #include "asm/arch/div64.h"
 
-extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
 #endif
diff --git a/include/asm-x86/div64.h b/include/asm-x86/div64.h
index c7892cfe9ce6..32fdbddaae55 100644
--- a/include/asm-x86/div64.h
+++ b/include/asm-x86/div64.h
@@ -71,8 +71,6 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 }
 #define div_u64_rem	div_u64_rem
 
-extern uint64_t div64_64(uint64_t dividend, uint64_t divisor);
-
 #else
 # include <asm-generic/div64.h>
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 6d1716641008..c1a5f81501ff 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -27,6 +27,14 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
 	return dividend / divisor;
 }
 
+/**
+ * div64_u64 - unsigned 64bit divide with 64bit divisor
+ */
+static inline u64 div64_u64(u64 dividend, u64 divisor)
+{
+	return dividend / divisor;
+}
+
 #elif BITS_PER_LONG == 32
 
 #ifndef div_u64_rem
@@ -41,6 +49,10 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder);
 #endif
 
+#ifndef div64_u64
+extern u64 div64_u64(u64 dividend, u64 divisor);
+#endif
+
 #endif /* BITS_PER_LONG */
 
 /**
diff --git a/kernel/sched.c b/kernel/sched.c
index e2f7f5acc807..34bcc5bc120e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8025,7 +8025,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 
 	se->my_q = cfs_rq;
 	se->load.weight = tg->shares;
-	se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
+	se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight);
 	se->parent = parent;
 }
 #endif
@@ -8692,7 +8692,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 		dequeue_entity(cfs_rq, se, 0);
 
 	se->load.weight = shares;
-	se->load.inv_weight = div64_64((1ULL<<32), shares);
+	se->load.inv_weight = div64_u64((1ULL<<32), shares);
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
@@ -8787,7 +8787,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 	if (runtime == RUNTIME_INF)
 		return 1ULL << 16;
 
-	return div64_64(runtime << 16, period);
+	return div64_u64(runtime << 16, period);
 }
 
 #ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8a9498e7c831..6b4a12558e88 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -357,8 +357,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
 		avg_per_cpu = p->se.sum_exec_runtime;
 		if (p->se.nr_migrations) {
-			avg_per_cpu = div64_64(avg_per_cpu,
-					       p->se.nr_migrations);
+			avg_per_cpu = div64_u64(avg_per_cpu,
+						p->se.nr_migrations);
 		} else {
 			avg_per_cpu = -1LL;
 		}
diff --git a/lib/div64.c b/lib/div64.c
index 689bd76833fa..bb5bd0c0f030 100644
--- a/lib/div64.c
+++ b/lib/div64.c
@@ -78,9 +78,10 @@ EXPORT_SYMBOL(div_s64_rem);
 #endif
 
 /* 64bit divisor, dividend and result. dynamic precision */
-uint64_t div64_64(uint64_t dividend, uint64_t divisor)
+#ifndef div64_u64
+u64 div64_u64(u64 dividend, u64 divisor)
 {
-	uint32_t high, d;
+	u32 high, d;
 
 	high = divisor >> 32;
 	if (high) {
@@ -91,10 +92,9 @@ uint64_t div64_64(uint64_t dividend, uint64_t divisor)
 	} else
 		d = divisor;
 
-	do_div(dividend, d);
-
-	return dividend;
+	return div_u64(dividend, d);
 }
-EXPORT_SYMBOL(div64_64);
+EXPORT_SYMBOL(div64_u64);
+#endif
 
 #endif /* BITS_PER_LONG == 32 */
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index eb5b9854c8c7..4a1221e5e8ee 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -15,8 +15,8 @@
 
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/math64.h>
 #include <net/tcp.h>
-#include <asm/div64.h>
 
 #define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
 					 * max_cwnd = snd_cwnd * beta
@@ -128,7 +128,7 @@ static u32 cubic_root(u64 a)
 	 * x    = ( 2 * x  +  a / x  ) / 3
 	 *  k+1          k         k
 	 */
-	x = (2 * x + (u32)div64_64(a, (u64)x * (u64)(x - 1)));
+	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
 	x = ((x * 341) >> 10);
 	return x;
 }
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index b15e7e2fa143..d7e8983cd37f 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -4,12 +4,11 @@
 #include <linux/module.h>
 #include <linux/bitops.h>
 #include <linux/skbuff.h>
+#include <linux/math64.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_connbytes.h>
 #include <net/netfilter/nf_conntrack.h>
 
-#include <asm/div64.h>
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching");
@@ -82,7 +81,7 @@ connbytes_mt(const struct sk_buff *skb, const struct net_device *in,
 			break;
 		}
 		if (pkts != 0)
-			what = div64_64(bytes, pkts);
+			what = div64_u64(bytes, pkts);
 		break;
 	}
 
-- 
cgit v1.3-14-g43fede


From f8bd2258e2d520dff28c855658bd24bdafb5102d Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:31 -0700
Subject: remove div_long_long_rem

x86 is the only arch right now, which provides an optimized for
div_long_long_rem and it has the downside that one has to be very careful that
the divide doesn't overflow.

The API is a little akward, as the arguments for the unsigned divide are
signed.  The signed version also doesn't handle a negative divisor and
produces worse code on 64bit archs.

There is little incentive to keep this API alive, so this converts the few
users to the new API.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/kernel/binfmt_elfn32.c |  5 ++--
 arch/mips/kernel/binfmt_elfo32.c |  5 ++--
 drivers/char/mmtimer.c           | 24 +++++++++-----------
 include/asm-x86/div64.h          | 18 ---------------
 include/linux/calc64.h           | 49 ----------------------------------------
 include/linux/jiffies.h          |  2 +-
 kernel/posix-cpu-timers.c        | 11 ++++-----
 kernel/time.c                    | 25 ++++++++++++--------
 kernel/time/ntp.c                |  6 ++---
 mm/slub.c                        |  9 ++++----
 10 files changed, 44 insertions(+), 110 deletions(-)
 delete mode 100644 include/linux/calc64.h

(limited to 'kernel')

diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c
index 77db3473deab..9fdd8bcdd21e 100644
--- a/arch/mips/kernel/binfmt_elfn32.c
+++ b/arch/mips/kernel/binfmt_elfn32.c
@@ -54,6 +54,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 #include <linux/module.h>
 #include <linux/elfcore.h>
 #include <linux/compat.h>
+#include <linux/math64.h>
 
 #define elf_prstatus elf_prstatus32
 struct elf_prstatus32
@@ -102,8 +103,8 @@ jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value)
 	 * one divide.
 	 */
 	u64 nsec = (u64)jiffies * TICK_NSEC;
-	long rem;
-	value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &rem);
+	u32 rem;
+	value->tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
 	value->tv_usec = rem / NSEC_PER_USEC;
 }
 
diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c
index 08f4cd781ee3..e1333d7319e2 100644
--- a/arch/mips/kernel/binfmt_elfo32.c
+++ b/arch/mips/kernel/binfmt_elfo32.c
@@ -56,6 +56,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 #include <linux/module.h>
 #include <linux/elfcore.h>
 #include <linux/compat.h>
+#include <linux/math64.h>
 
 #define elf_prstatus elf_prstatus32
 struct elf_prstatus32
@@ -104,8 +105,8 @@ jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value)
 	 * one divide.
 	 */
 	u64 nsec = (u64)jiffies * TICK_NSEC;
-	long rem;
-	value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &rem);
+	u32 rem;
+	value->tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
 	value->tv_usec = rem / NSEC_PER_USEC;
 }
 
diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index d83db5d880e0..192961fd7173 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -30,6 +30,8 @@
 #include <linux/miscdevice.h>
 #include <linux/posix-timers.h>
 #include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/math64.h>
 
 #include <asm/uaccess.h>
 #include <asm/sn/addrs.h>
@@ -472,8 +474,8 @@ static int sgi_clock_get(clockid_t clockid, struct timespec *tp)
 
 	nsec = rtc_time() * sgi_clock_period
 			+ sgi_clock_offset.tv_nsec;
-	tp->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tp->tv_nsec)
-			+ sgi_clock_offset.tv_sec;
+	*tp = ns_to_timespec(nsec);
+	tp->tv_sec += sgi_clock_offset.tv_sec;
 	return 0;
 };
 
@@ -481,11 +483,11 @@ static int sgi_clock_set(clockid_t clockid, struct timespec *tp)
 {
 
 	u64 nsec;
-	u64 rem;
+	u32 rem;
 
 	nsec = rtc_time() * sgi_clock_period;
 
-	sgi_clock_offset.tv_sec = tp->tv_sec - div_long_long_rem(nsec, NSEC_PER_SEC, &rem);
+	sgi_clock_offset.tv_sec = tp->tv_sec - div_u64_rem(nsec, NSEC_PER_SEC, &rem);
 
 	if (rem <= tp->tv_nsec)
 		sgi_clock_offset.tv_nsec = tp->tv_sec - rem;
@@ -644,9 +646,6 @@ static int sgi_timer_del(struct k_itimer *timr)
 	return 0;
 }
 
-#define timespec_to_ns(x) ((x).tv_nsec + (x).tv_sec * NSEC_PER_SEC)
-#define ns_to_timespec(ts, nsec) (ts).tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &(ts).tv_nsec)
-
 /* Assumption: it_lock is already held with irq's disabled */
 static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 {
@@ -659,9 +658,8 @@ static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 		return;
 	}
 
-	ns_to_timespec(cur_setting->it_interval, timr->it.mmtimer.incr * sgi_clock_period);
-	ns_to_timespec(cur_setting->it_value, (timr->it.mmtimer.expires - rtc_time())* sgi_clock_period);
-	return;
+	cur_setting->it_interval = ns_to_timespec(timr->it.mmtimer.incr * sgi_clock_period);
+	cur_setting->it_value = ns_to_timespec((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period);
 }
 
 
@@ -679,8 +677,8 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
 		sgi_timer_get(timr, old_setting);
 
 	sgi_timer_del(timr);
-	when = timespec_to_ns(new_setting->it_value);
-	period = timespec_to_ns(new_setting->it_interval);
+	when = timespec_to_ns(&new_setting->it_value);
+	period = timespec_to_ns(&new_setting->it_interval);
 
 	if (when == 0)
 		/* Clear timer */
@@ -695,7 +693,7 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
 		unsigned long now;
 
 		getnstimeofday(&n);
-		now = timespec_to_ns(n);
+		now = timespec_to_ns(&n);
 		if (when > now)
 			when -= now;
 		else
diff --git a/include/asm-x86/div64.h b/include/asm-x86/div64.h
index 32fdbddaae55..9a2d644c08ef 100644
--- a/include/asm-x86/div64.h
+++ b/include/asm-x86/div64.h
@@ -33,24 +33,6 @@
 	__mod;							\
 })
 
-/*
- * (long)X = ((long long)divs) / (long)div
- * (long)rem = ((long long)divs) % (long)div
- *
- * Warning, this will do an exception if X overflows.
- */
-#define div_long_long_rem(a, b, c) div_ll_X_l_rem(a, b, c)
-
-static inline long div_ll_X_l_rem(long long divs, long div, long *rem)
-{
-	long dum2;
-	asm("divl %2":"=a"(dum2), "=d"(*rem)
-	    : "rm"(div), "A"(divs));
-
-	return dum2;
-
-}
-
 static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
 {
 	union {
diff --git a/include/linux/calc64.h b/include/linux/calc64.h
deleted file mode 100644
index ebf4b8f38d88..000000000000
--- a/include/linux/calc64.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _LINUX_CALC64_H
-#define _LINUX_CALC64_H
-
-#include <linux/types.h>
-#include <asm/div64.h>
-
-/*
- * This is a generic macro which is used when the architecture
- * specific div64.h does not provide a optimized one.
- *
- * The 64bit dividend is divided by the divisor (data type long), the
- * result is returned and the remainder stored in the variable
- * referenced by remainder (data type long *). In contrast to the
- * do_div macro the dividend is kept intact.
- */
-#ifndef div_long_long_rem
-#define div_long_long_rem(dividend, divisor, remainder)	\
-	do_div_llr((dividend), divisor, remainder)
-
-static inline unsigned long do_div_llr(const long long dividend,
-				       const long divisor, long *remainder)
-{
-	u64 result = dividend;
-
-	*(remainder) = do_div(result, divisor);
-	return (unsigned long) result;
-}
-#endif
-
-/*
- * Sign aware variation of the above. On some architectures a
- * negative dividend leads to an divide overflow exception, which
- * is avoided by the sign check.
- */
-static inline long div_long_long_rem_signed(const long long dividend,
-					    const long divisor, long *remainder)
-{
-	long res;
-
-	if (unlikely(dividend < 0)) {
-		res = -div_long_long_rem(-dividend, divisor, remainder);
-		*remainder = -(*remainder);
-	} else
-		res = div_long_long_rem(dividend, divisor, remainder);
-
-	return res;
-}
-
-#endif
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 33ef710dac24..abb6ac639e8e 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_JIFFIES_H
 #define _LINUX_JIFFIES_H
 
-#include <linux/calc64.h>
+#include <linux/math64.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/time.h>
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ae5c6c147c4b..f1525ad06cb3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -4,8 +4,9 @@
 
 #include <linux/sched.h>
 #include <linux/posix-timers.h>
-#include <asm/uaccess.h>
 #include <linux/errno.h>
+#include <linux/math64.h>
+#include <asm/uaccess.h>
 
 static int check_clock(const clockid_t which_clock)
 {
@@ -47,12 +48,10 @@ static void sample_to_timespec(const clockid_t which_clock,
 			       union cpu_time_count cpu,
 			       struct timespec *tp)
 {
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		tp->tv_sec = div_long_long_rem(cpu.sched,
-					       NSEC_PER_SEC, &tp->tv_nsec);
-	} else {
+	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
+		*tp = ns_to_timespec(cpu.sched);
+	else
 		cputime_to_timespec(cpu.cpu, tp);
-	}
 }
 
 static inline int cpu_time_before(const clockid_t which_clock,
diff --git a/kernel/time.c b/kernel/time.c
index 343e2515375a..cbe0d5a222ff 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -392,13 +392,17 @@ EXPORT_SYMBOL(set_normalized_timespec);
 struct timespec ns_to_timespec(const s64 nsec)
 {
 	struct timespec ts;
+	s32 rem;
 
 	if (!nsec)
 		return (struct timespec) {0, 0};
 
-	ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec);
-	if (unlikely(nsec < 0))
-		set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec);
+	ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+	if (unlikely(rem < 0)) {
+		ts.tv_sec--;
+		rem += NSEC_PER_SEC;
+	}
+	ts.tv_nsec = rem;
 
 	return ts;
 }
@@ -528,8 +532,10 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
 	 * Convert jiffies to nanoseconds and separate with
 	 * one divide.
 	 */
-	u64 nsec = (u64)jiffies * TICK_NSEC;
-	value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec);
+	u32 rem;
+	value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
+				    NSEC_PER_SEC, &rem);
+	value->tv_nsec = rem;
 }
 EXPORT_SYMBOL(jiffies_to_timespec);
 
@@ -567,12 +573,11 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
 	 * Convert jiffies to nanoseconds and separate with
 	 * one divide.
 	 */
-	u64 nsec = (u64)jiffies * TICK_NSEC;
-	long tv_usec;
+	u32 rem;
 
-	value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
-	tv_usec /= NSEC_PER_USEC;
-	value->tv_usec = tv_usec;
+	value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
+				    NSEC_PER_SEC, &rem);
+	value->tv_usec = rem / NSEC_PER_USEC;
 }
 EXPORT_SYMBOL(jiffies_to_timeval);
 
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index a4492f3d64db..dbd6f8905614 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -234,7 +234,7 @@ static inline void notify_cmos_timer(void) { }
  */
 int do_adjtimex(struct timex *txc)
 {
-	long mtemp, save_adjust, rem;
+	long mtemp, save_adjust;
 	s64 freq_adj;
 	int result;
 
@@ -345,9 +345,7 @@ int do_adjtimex(struct timex *txc)
 		    freq_adj += time_freq;
 		    freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
 		    time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
-		    time_offset = div_long_long_rem_signed(time_offset,
-							   NTP_INTERVAL_FREQ,
-							   &rem);
+		    time_offset = div_s64(time_offset, NTP_INTERVAL_FREQ);
 		    time_offset <<= SHIFT_UPDATE;
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
diff --git a/mm/slub.c b/mm/slub.c
index 70db2897c1ea..32b62623846a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -22,6 +22,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
 #include <linux/memory.h>
+#include <linux/math64.h>
 
 /*
  * Lock order:
@@ -3621,12 +3622,10 @@ static int list_locations(struct kmem_cache *s, char *buf,
 			len += sprintf(buf + len, "<not-available>");
 
 		if (l->sum_time != l->min_time) {
-			unsigned long remainder;
-
 			len += sprintf(buf + len, " age=%ld/%ld/%ld",
-			l->min_time,
-			div_long_long_rem(l->sum_time, l->count, &remainder),
-			l->max_time);
+				l->min_time,
+				(long)div_u64(l->sum_time, l->count),
+				l->max_time);
 		} else
 			len += sprintf(buf + len, " age=%ld",
 				l->min_time);
-- 
cgit v1.3-14-g43fede


From ee9851b218b8bafa22942b5404505ff3d2d34324 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:32 -0700
Subject: ntp: cleanup ntp.c

This is mostly a style cleanup of ntp.c and extracts part of do_adjtimex as
ntp_update_offset().  Otherwise the functionality is still the same as before.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time/ntp.c | 173 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 91 insertions(+), 82 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index dbd6f8905614..2586c30f0658 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -35,7 +35,7 @@ static u64 tick_length, tick_length_base;
 /* TIME_ERROR prevents overwriting the CMOS clock */
 static int time_state = TIME_OK;	/* clock synchronization status	*/
 int time_status = STA_UNSYNC;		/* clock status bits		*/
-static s64 time_offset;		/* time adjustment (ns)		*/
+static s64 time_offset;			/* time adjustment (ns)		*/
 static long time_constant = 2;		/* pll time constant		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
@@ -57,6 +57,44 @@ static void ntp_update_frequency(void)
 	tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
 }
 
+static void ntp_update_offset(long offset)
+{
+	long mtemp;
+	s64 freq_adj;
+
+	if (!(time_status & STA_PLL))
+		return;
+
+	time_offset = offset * NSEC_PER_USEC;
+
+	/*
+	 * Scale the phase adjustment and
+	 * clamp to the operating range.
+	 */
+	time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC);
+	time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC);
+
+	/*
+	 * Select how the frequency is to be controlled
+	 * and in which mode (PLL or FLL).
+	 */
+	if (time_status & STA_FREQHOLD || time_reftime == 0)
+		time_reftime = xtime.tv_sec;
+	mtemp = xtime.tv_sec - time_reftime;
+	time_reftime = xtime.tv_sec;
+
+	freq_adj = time_offset * mtemp;
+	freq_adj = shift_right(freq_adj, time_constant * 2 +
+			   (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
+	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC))
+		freq_adj += div_s64(time_offset << (SHIFT_NSEC - SHIFT_FLL), mtemp);
+	freq_adj += time_freq;
+	freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
+	time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
+	time_offset = div_s64(time_offset, NTP_INTERVAL_FREQ);
+	time_offset <<= SHIFT_UPDATE;
+}
+
 /**
  * ntp_clear - Clears the NTP state variables
  *
@@ -131,7 +169,7 @@ void second_overflow(void)
 		break;
 	case TIME_WAIT:
 		if (!(time_status & (STA_INS | STA_DEL)))
-		time_state = TIME_OK;
+			time_state = TIME_OK;
 	}
 
 	/*
@@ -234,8 +272,7 @@ static inline void notify_cmos_timer(void) { }
  */
 int do_adjtimex(struct timex *txc)
 {
-	long mtemp, save_adjust;
-	s64 freq_adj;
+	long save_adjust;
 	int result;
 
 	/* In order to modify anything, you gotta be super-user! */
@@ -272,94 +309,63 @@ int do_adjtimex(struct timex *txc)
 	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
 #endif
 	/* If there are input parameters, then process them */
-	if (txc->modes)
-	{
-	    if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
-		time_status =  (txc->status & ~STA_RONLY) |
-			      (time_status & STA_RONLY);
-
-	    if (txc->modes & ADJ_FREQUENCY) {	/* p. 22 */
-		if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
-		    result = -EINVAL;
-		    goto leave;
+	if (txc->modes) {
+		if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
+			time_status = (txc->status & ~STA_RONLY) |
+				      (time_status & STA_RONLY);
+
+		if (txc->modes & ADJ_FREQUENCY) {
+			if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
+				result = -EINVAL;
+				goto leave;
+			}
+			time_freq = ((s64)txc->freq * NSEC_PER_USEC)
+					>> (SHIFT_USEC - SHIFT_NSEC);
 		}
-		time_freq = ((s64)txc->freq * NSEC_PER_USEC)
-				>> (SHIFT_USEC - SHIFT_NSEC);
-	    }
-
-	    if (txc->modes & ADJ_MAXERROR) {
-		if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
-		    result = -EINVAL;
-		    goto leave;
+
+		if (txc->modes & ADJ_MAXERROR) {
+			if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
+				result = -EINVAL;
+				goto leave;
+			}
+			time_maxerror = txc->maxerror;
 		}
-		time_maxerror = txc->maxerror;
-	    }
 
-	    if (txc->modes & ADJ_ESTERROR) {
-		if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
-		    result = -EINVAL;
-		    goto leave;
+		if (txc->modes & ADJ_ESTERROR) {
+			if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
+				result = -EINVAL;
+				goto leave;
+			}
+			time_esterror = txc->esterror;
 		}
-		time_esterror = txc->esterror;
-	    }
 
-	    if (txc->modes & ADJ_TIMECONST) {	/* p. 24 */
-		if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
-		    result = -EINVAL;
-		    goto leave;
+		if (txc->modes & ADJ_TIMECONST) {
+			if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
+				result = -EINVAL;
+				goto leave;
+			}
+			time_constant = min(txc->constant + 4, (long)MAXTC);
 		}
-		time_constant = min(txc->constant + 4, (long)MAXTC);
-	    }
 
-	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
-		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
-		    /* adjtime() is independent from ntp_adjtime() */
-		    time_adjust = txc->offset;
+		if (txc->modes & ADJ_OFFSET) {
+			if (txc->modes == ADJ_OFFSET_SINGLESHOT)
+				/* adjtime() is independent from ntp_adjtime() */
+				time_adjust = txc->offset;
+			else
+				ntp_update_offset(txc->offset);
 		}
-		else if (time_status & STA_PLL) {
-		    time_offset = txc->offset * NSEC_PER_USEC;
-
-		    /*
-		     * Scale the phase adjustment and
-		     * clamp to the operating range.
-		     */
-		    time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC);
-		    time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC);
-
-		    /*
-		     * Select whether the frequency is to be controlled
-		     * and in which mode (PLL or FLL). Clamp to the operating
-		     * range. Ugly multiply/divide should be replaced someday.
-		     */
-
-		    if (time_status & STA_FREQHOLD || time_reftime == 0)
-		        time_reftime = xtime.tv_sec;
-		    mtemp = xtime.tv_sec - time_reftime;
-		    time_reftime = xtime.tv_sec;
-
-		    freq_adj = time_offset * mtemp;
-		    freq_adj = shift_right(freq_adj, time_constant * 2 +
-					   (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
-		    if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC))
-			freq_adj += div_s64(time_offset << (SHIFT_NSEC - SHIFT_FLL), mtemp);
-		    freq_adj += time_freq;
-		    freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
-		    time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
-		    time_offset = div_s64(time_offset, NTP_INTERVAL_FREQ);
-		    time_offset <<= SHIFT_UPDATE;
-		} /* STA_PLL */
-	    } /* txc->modes & ADJ_OFFSET */
-	    if (txc->modes & ADJ_TICK)
-		tick_usec = txc->tick;
-
-	    if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
-		    ntp_update_frequency();
-	} /* txc->modes */
-leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
+		if (txc->modes & ADJ_TICK)
+			tick_usec = txc->tick;
+
+		if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+			ntp_update_frequency();
+	}
+leave:
+	if (time_status & (STA_UNSYNC|STA_CLOCKERR))
 		result = TIME_ERROR;
 
 	if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
-			(txc->modes == ADJ_OFFSET_SS_READ))
+	    (txc->modes == ADJ_OFFSET_SS_READ))
 		txc->offset = save_adjust;
 	else
 		txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) *
@@ -384,9 +390,12 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	txc->errcnt	   = 0;
 	txc->stbcnt	   = 0;
 	write_sequnlock_irq(&xtime_lock);
+
 	do_gettimeofday(&txc->time);
+
 	notify_cmos_timer();
-	return(result);
+
+	return result;
 }
 
 static int __init ntp_tick_adj_setup(char *str)
-- 
cgit v1.3-14-g43fede


From eea83d896e318bda54be2d2770d2c5d6668d11db Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:33 -0700
Subject: ntp: NTP4 user space bits update

This adds a few more things from the ntp nanokernel related to user space.
It's now possible to select the resolution used of some values via STA_NANO
and the kernel reports in which mode it works (pll/fll).

If some values for adjtimex() are outside the acceptable range, they are now
simply normalized instead of letting the syscall fail.  I removed
MOD_CLKA/MOD_CLKB as the mapping didn't really makes any sense, the kernel
doesn't support setting the clock.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/timex.h | 12 +++++--
 kernel/time/ntp.c     | 91 ++++++++++++++++++++++++++-------------------------
 2 files changed, 56 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 8ea3e71ba7fa..3c49d173bf39 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -58,6 +58,8 @@
 
 #include <asm/param.h>
 
+#define NTP_API		4	/* NTP API version */
+
 /*
  * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen
  * for a slightly underdamped convergence characteristic. SHIFT_KH
@@ -135,6 +137,8 @@ struct timex {
 #define ADJ_ESTERROR		0x0008	/* estimated time error */
 #define ADJ_STATUS		0x0010	/* clock status */
 #define ADJ_TIMECONST		0x0020	/* pll time constant */
+#define ADJ_MICRO		0x1000	/* select microsecond resolution */
+#define ADJ_NANO		0x2000	/* select nanosecond resolution */
 #define ADJ_TICK		0x4000	/* tick value */
 #define ADJ_OFFSET_SINGLESHOT	0x8001	/* old-fashioned adjtime */
 #define ADJ_OFFSET_SS_READ	0xa001  /* read-only adjtime */
@@ -146,8 +150,6 @@ struct timex {
 #define MOD_ESTERROR	ADJ_ESTERROR
 #define MOD_STATUS	ADJ_STATUS
 #define MOD_TIMECONST	ADJ_TIMECONST
-#define MOD_CLKB	ADJ_TICK
-#define MOD_CLKA	ADJ_OFFSET_SINGLESHOT /* 0x8000 in original */
 
 
 /*
@@ -169,9 +171,13 @@ struct timex {
 #define STA_PPSERROR	0x0800	/* PPS signal calibration error (ro) */
 
 #define STA_CLOCKERR	0x1000	/* clock hardware fault (ro) */
+#define STA_NANO	0x2000	/* resolution (0 = us, 1 = ns) (ro) */
+#define STA_MODE	0x4000	/* mode (0 = PLL, 1 = FLL) (ro) */
+#define STA_CLK		0x8000	/* clock source (0 = A, 1 = B) (ro) */
 
+/* read-only bits */
 #define STA_RONLY (STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | \
-    STA_PPSERROR | STA_CLOCKERR) /* read-only bits */
+	STA_PPSERROR | STA_CLOCKERR | STA_NANO | STA_MODE | STA_CLK)
 
 /*
  * Clock states (time_state)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 2586c30f0658..3fc81066d7f1 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -65,7 +65,9 @@ static void ntp_update_offset(long offset)
 	if (!(time_status & STA_PLL))
 		return;
 
-	time_offset = offset * NSEC_PER_USEC;
+	time_offset = offset;
+	if (!(time_status & STA_NANO))
+		time_offset *= NSEC_PER_USEC;
 
 	/*
 	 * Scale the phase adjustment and
@@ -86,8 +88,11 @@ static void ntp_update_offset(long offset)
 	freq_adj = time_offset * mtemp;
 	freq_adj = shift_right(freq_adj, time_constant * 2 +
 			   (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
-	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC))
+	time_status &= ~STA_MODE;
+	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
 		freq_adj += div_s64(time_offset << (SHIFT_NSEC - SHIFT_FLL), mtemp);
+		time_status |= STA_MODE;
+	}
 	freq_adj += time_freq;
 	freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
 	time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
@@ -272,6 +277,7 @@ static inline void notify_cmos_timer(void) { }
  */
 int do_adjtimex(struct timex *txc)
 {
+	struct timespec ts;
 	long save_adjust;
 	int result;
 
@@ -282,17 +288,11 @@ int do_adjtimex(struct timex *txc)
 	/* Now we validate the data before disabling interrupts */
 
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
-	  /* singleshot must not be used with any other mode bits */
-		if (txc->modes != ADJ_OFFSET_SINGLESHOT &&
-					txc->modes != ADJ_OFFSET_SS_READ)
+		/* singleshot must not be used with any other mode bits */
+		if (txc->modes & ~ADJ_OFFSET_SS_READ)
 			return -EINVAL;
 	}
 
-	if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
-	  /* adjustment Offset limited to +- .512 seconds */
-		if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
-			return -EINVAL;
-
 	/* if the quartz is off by more than 10% something is VERY wrong ! */
 	if (txc->modes & ADJ_TICK)
 		if (txc->tick <  900000/USER_HZ ||
@@ -300,51 +300,46 @@ int do_adjtimex(struct timex *txc)
 			return -EINVAL;
 
 	write_seqlock_irq(&xtime_lock);
-	result = time_state;	/* mostly `TIME_OK' */
 
 	/* Save for later - semantics of adjtime is to return old value */
 	save_adjust = time_adjust;
 
-#if 0	/* STA_CLOCKERR is never set yet */
-	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
-#endif
 	/* If there are input parameters, then process them */
 	if (txc->modes) {
-		if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
-			time_status = (txc->status & ~STA_RONLY) |
-				      (time_status & STA_RONLY);
+		if (txc->modes & ADJ_STATUS) {
+			if ((time_status & STA_PLL) &&
+			    !(txc->status & STA_PLL)) {
+				time_state = TIME_OK;
+				time_status = STA_UNSYNC;
+			}
+			/* only set allowed bits */
+			time_status &= STA_RONLY;
+			time_status |= txc->status & ~STA_RONLY;
+		}
+
+		if (txc->modes & ADJ_NANO)
+			time_status |= STA_NANO;
+		if (txc->modes & ADJ_MICRO)
+			time_status &= ~STA_NANO;
 
 		if (txc->modes & ADJ_FREQUENCY) {
-			if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
-				result = -EINVAL;
-				goto leave;
-			}
-			time_freq = ((s64)txc->freq * NSEC_PER_USEC)
+			time_freq = min(txc->freq, MAXFREQ);
+			time_freq = min(time_freq, -MAXFREQ);
+			time_freq = ((s64)time_freq * NSEC_PER_USEC)
 					>> (SHIFT_USEC - SHIFT_NSEC);
 		}
 
-		if (txc->modes & ADJ_MAXERROR) {
-			if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
-				result = -EINVAL;
-				goto leave;
-			}
+		if (txc->modes & ADJ_MAXERROR)
 			time_maxerror = txc->maxerror;
-		}
-
-		if (txc->modes & ADJ_ESTERROR) {
-			if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
-				result = -EINVAL;
-				goto leave;
-			}
+		if (txc->modes & ADJ_ESTERROR)
 			time_esterror = txc->esterror;
-		}
 
 		if (txc->modes & ADJ_TIMECONST) {
-			if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
-				result = -EINVAL;
-				goto leave;
-			}
-			time_constant = min(txc->constant + 4, (long)MAXTC);
+			time_constant = txc->constant;
+			if (!(time_status & STA_NANO))
+				time_constant += 4;
+			time_constant = min(time_constant, (long)MAXTC);
+			time_constant = max(time_constant, 0l);
 		}
 
 		if (txc->modes & ADJ_OFFSET) {
@@ -360,16 +355,20 @@ int do_adjtimex(struct timex *txc)
 		if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
 			ntp_update_frequency();
 	}
-leave:
+
+	result = time_state;	/* mostly `TIME_OK' */
 	if (time_status & (STA_UNSYNC|STA_CLOCKERR))
 		result = TIME_ERROR;
 
 	if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
 	    (txc->modes == ADJ_OFFSET_SS_READ))
 		txc->offset = save_adjust;
-	else
+	else {
 		txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) *
-	    			NTP_INTERVAL_FREQ / 1000;
+	    			NTP_INTERVAL_FREQ;
+		if (!(time_status & STA_NANO))
+			txc->offset /= NSEC_PER_USEC;
+	}
 	txc->freq	   = (time_freq / NSEC_PER_USEC) <<
 				(SHIFT_USEC - SHIFT_NSEC);
 	txc->maxerror	   = time_maxerror;
@@ -391,7 +390,11 @@ leave:
 	txc->stbcnt	   = 0;
 	write_sequnlock_irq(&xtime_lock);
 
-	do_gettimeofday(&txc->time);
+	getnstimeofday(&ts);
+	txc->time.tv_sec = ts.tv_sec;
+	txc->time.tv_usec = ts.tv_nsec;
+	if (!(time_status & STA_NANO))
+		txc->time.tv_usec /= NSEC_PER_USEC;
 
 	notify_cmos_timer();
 
-- 
cgit v1.3-14-g43fede


From 074b3b87941c99bc0ce35385b5817924b1ed0c23 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:34 -0700
Subject: ntp: increase time_freq resolution

This changes time_freq to a 64bit value and makes it static (the only outside
user had no real need to modify it).  Intermediate values were already 64bit,
so the change isn't that big, but it saves a little in shifts by replacing
SHIFT_NSEC with TICK_LENGTH_SHIFT.  PPM_SCALE is then used to convert between
user space and kernel space representation.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/time.c |  2 --
 include/linux/timex.h      | 11 ++++++-----
 kernel/time/ntp.c          | 30 +++++++++++++++---------------
 3 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 3b26fbd6bec9..c146af995854 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -1007,8 +1007,6 @@ void __init time_init(void)
 	vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
 	vdso_data->tb_to_xs = tb_to_xs;
 
-	time_freq = 0;
-
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
 	/* Register the clocksource, if we're not running on iSeries */
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 3c49d173bf39..48c3376dce71 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -86,11 +86,14 @@
  */
 #define SHIFT_UPDATE (SHIFT_HZ + 1) /* time offset scale (shift) */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
-#define SHIFT_NSEC 12		/* kernel frequency offset scale */
+#define PPM_SCALE (NSEC_PER_USEC << (TICK_LENGTH_SHIFT - SHIFT_USEC))
+#define PPM_SCALE_INV_SHIFT 20
+#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + TICK_LENGTH_SHIFT)) / \
+		       PPM_SCALE + 1)
 
 #define MAXPHASE 512000L        /* max phase error (us) */
-#define MAXFREQ (512L << SHIFT_USEC)  /* max frequency error (ppm) */
-#define MAXFREQ_NSEC (512000L << SHIFT_NSEC) /* max frequency error (ppb) */
+#define MAXFREQ 500000		/* max frequency error (ns/s) */
+#define MAXFREQ_SCALED ((s64)MAXFREQ << TICK_LENGTH_SHIFT)
 #define MINSEC 256		/* min interval between updates (s) */
 #define MAXSEC 2048		/* max interval between updates (s) */
 #define	NTP_PHASE_LIMIT	(MAXPHASE << 5)	/* beyond max. dispersion */
@@ -209,8 +212,6 @@ extern int time_status;		/* clock synchronization status bits */
 extern long time_maxerror;	/* maximum error */
 extern long time_esterror;	/* estimated error */
 
-extern long time_freq;		/* frequency offset (scaled ppm) */
-
 extern long time_adjust;	/* The amount of adjtime left */
 
 extern void ntp_clear(void);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3fc81066d7f1..c6ae0c249891 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -39,7 +39,7 @@ static s64 time_offset;			/* time adjustment (ns)		*/
 static long time_constant = 2;		/* pll time constant		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
-long time_freq;				/* frequency offset (scaled ppm)*/
+static s64 time_freq;			/* frequency offset (scaled ns/s)*/
 static long time_reftime;		/* time at last adjustment (s)	*/
 long time_adjust;
 static long ntp_tick_adj;
@@ -49,7 +49,7 @@ static void ntp_update_frequency(void)
 	u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
 				<< TICK_LENGTH_SHIFT;
 	second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
-	second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
+	second_length += time_freq;
 
 	tick_length_base = second_length;
 
@@ -86,16 +86,16 @@ static void ntp_update_offset(long offset)
 	time_reftime = xtime.tv_sec;
 
 	freq_adj = time_offset * mtemp;
-	freq_adj = shift_right(freq_adj, time_constant * 2 +
-			   (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
+	freq_adj <<= TICK_LENGTH_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
 	time_status &= ~STA_MODE;
 	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
-		freq_adj += div_s64(time_offset << (SHIFT_NSEC - SHIFT_FLL), mtemp);
+		freq_adj += div_s64(time_offset << (TICK_LENGTH_SHIFT - SHIFT_FLL),
+				    mtemp);
 		time_status |= STA_MODE;
 	}
 	freq_adj += time_freq;
-	freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
-	time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
+	freq_adj = min(freq_adj, MAXFREQ_SCALED);
+	time_freq = max(freq_adj, -MAXFREQ_SCALED);
 	time_offset = div_s64(time_offset, NTP_INTERVAL_FREQ);
 	time_offset <<= SHIFT_UPDATE;
 }
@@ -131,7 +131,7 @@ void second_overflow(void)
 	long time_adj;
 
 	/* Bump the maxerror field */
-	time_maxerror += MAXFREQ >> SHIFT_USEC;
+	time_maxerror += MAXFREQ / NSEC_PER_USEC;
 	if (time_maxerror > NTP_PHASE_LIMIT) {
 		time_maxerror = NTP_PHASE_LIMIT;
 		time_status |= STA_UNSYNC;
@@ -323,10 +323,9 @@ int do_adjtimex(struct timex *txc)
 			time_status &= ~STA_NANO;
 
 		if (txc->modes & ADJ_FREQUENCY) {
-			time_freq = min(txc->freq, MAXFREQ);
-			time_freq = min(time_freq, -MAXFREQ);
-			time_freq = ((s64)time_freq * NSEC_PER_USEC)
-					>> (SHIFT_USEC - SHIFT_NSEC);
+			time_freq = (s64)txc->freq * PPM_SCALE;
+			time_freq = min(time_freq, MAXFREQ_SCALED);
+			time_freq = max(time_freq, -MAXFREQ_SCALED);
 		}
 
 		if (txc->modes & ADJ_MAXERROR)
@@ -369,14 +368,15 @@ int do_adjtimex(struct timex *txc)
 		if (!(time_status & STA_NANO))
 			txc->offset /= NSEC_PER_USEC;
 	}
-	txc->freq	   = (time_freq / NSEC_PER_USEC) <<
-				(SHIFT_USEC - SHIFT_NSEC);
+	txc->freq	   = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
+					 (s64)PPM_SCALE_INV,
+					 TICK_LENGTH_SHIFT);
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
 	txc->constant	   = time_constant;
 	txc->precision	   = 1;
-	txc->tolerance	   = MAXFREQ;
+	txc->tolerance	   = MAXFREQ_SCALED / PPM_SCALE;
 	txc->tick	   = tick_usec;
 
 	/* PPS is not implemented, so these are zero */
-- 
cgit v1.3-14-g43fede


From 9f14f669d18477fe3df071e2fa4da36c00acee8e Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:36 -0700
Subject: ntp: increase time_offset resolution

time_offset is already a 64bit value but its resolution barely used, so this
makes better use of it by replacing SHIFT_UPDATE with TICK_LENGTH_SHIFT.

Side note: the SHIFT_HZ in SHIFT_UPDATE was incorrect for CONFIG_NO_HZ and the
primary reason for changing time_offset to 64bit to avoid the overflow.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/timex.h |  9 ++-------
 kernel/time/ntp.c     | 23 +++++++++++------------
 2 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 48c3376dce71..9fbdd12a52f1 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -76,27 +76,22 @@
 #define MAXTC		10	/* maximum time constant (shift) */
 
 /*
- * The SHIFT_UPDATE define establishes the decimal point of the
- * time_offset variable which represents the current offset with
- * respect to standard time.
- *
  * SHIFT_USEC defines the scaling (shift) of the time_freq and
  * time_tolerance variables, which represent the current frequency
  * offset and maximum frequency tolerance.
  */
-#define SHIFT_UPDATE (SHIFT_HZ + 1) /* time offset scale (shift) */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
 #define PPM_SCALE (NSEC_PER_USEC << (TICK_LENGTH_SHIFT - SHIFT_USEC))
 #define PPM_SCALE_INV_SHIFT 20
 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + TICK_LENGTH_SHIFT)) / \
 		       PPM_SCALE + 1)
 
-#define MAXPHASE 512000L        /* max phase error (us) */
+#define MAXPHASE 500000000l	/* max phase error (ns) */
 #define MAXFREQ 500000		/* max frequency error (ns/s) */
 #define MAXFREQ_SCALED ((s64)MAXFREQ << TICK_LENGTH_SHIFT)
 #define MINSEC 256		/* min interval between updates (s) */
 #define MAXSEC 2048		/* max interval between updates (s) */
-#define	NTP_PHASE_LIMIT	(MAXPHASE << 5)	/* beyond max. dispersion */
+#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */
 
 /*
  * syscall interface - used (mainly by NTP daemon)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c6ae0c249891..44491de312a0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -65,16 +65,15 @@ static void ntp_update_offset(long offset)
 	if (!(time_status & STA_PLL))
 		return;
 
-	time_offset = offset;
 	if (!(time_status & STA_NANO))
-		time_offset *= NSEC_PER_USEC;
+		offset *= NSEC_PER_USEC;
 
 	/*
 	 * Scale the phase adjustment and
 	 * clamp to the operating range.
 	 */
-	time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC);
-	time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC);
+	offset = min(offset, MAXPHASE);
+	offset = max(offset, -MAXPHASE);
 
 	/*
 	 * Select how the frequency is to be controlled
@@ -85,19 +84,19 @@ static void ntp_update_offset(long offset)
 	mtemp = xtime.tv_sec - time_reftime;
 	time_reftime = xtime.tv_sec;
 
-	freq_adj = time_offset * mtemp;
+	freq_adj = (s64)offset * mtemp;
 	freq_adj <<= TICK_LENGTH_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
 	time_status &= ~STA_MODE;
 	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
-		freq_adj += div_s64(time_offset << (TICK_LENGTH_SHIFT - SHIFT_FLL),
+		freq_adj += div_s64((s64)offset << (TICK_LENGTH_SHIFT - SHIFT_FLL),
 				    mtemp);
 		time_status |= STA_MODE;
 	}
 	freq_adj += time_freq;
 	freq_adj = min(freq_adj, MAXFREQ_SCALED);
 	time_freq = max(freq_adj, -MAXFREQ_SCALED);
-	time_offset = div_s64(time_offset, NTP_INTERVAL_FREQ);
-	time_offset <<= SHIFT_UPDATE;
+
+	time_offset = div_s64((s64)offset << TICK_LENGTH_SHIFT, NTP_INTERVAL_FREQ);
 }
 
 /**
@@ -128,7 +127,7 @@ void ntp_clear(void)
  */
 void second_overflow(void)
 {
-	long time_adj;
+	s64 time_adj;
 
 	/* Bump the maxerror field */
 	time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -184,7 +183,7 @@ void second_overflow(void)
 	tick_length = tick_length_base;
 	time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
 	time_offset -= time_adj;
-	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
+	tick_length += time_adj;
 
 	if (unlikely(time_adjust)) {
 		if (time_adjust > MAX_TICKADJ) {
@@ -363,8 +362,8 @@ int do_adjtimex(struct timex *txc)
 	    (txc->modes == ADJ_OFFSET_SS_READ))
 		txc->offset = save_adjust;
 	else {
-		txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) *
-	    			NTP_INTERVAL_FREQ;
+		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+					  TICK_LENGTH_SHIFT);
 		if (!(time_status & STA_NANO))
 			txc->offset /= NSEC_PER_USEC;
 	}
-- 
cgit v1.3-14-g43fede


From 153b5d054ac2d98ea0d86504884326b6777f683d Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:37 -0700
Subject: ntp: support for TAI

This adds support for setting the TAI value (International Atomic Time).  The
value is reported back to userspace via timex (as we don't have a
ntp_gettime() syscall).

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h | 3 ++-
 include/linux/timex.h  | 5 ++++-
 kernel/compat.c        | 3 ++-
 kernel/time/ntp.c      | 7 +++++++
 4 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 8fa7857e153b..cf8d11cad5ae 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -65,10 +65,11 @@ struct compat_timex {
 	compat_long_t calcnt;
 	compat_long_t errcnt;
 	compat_long_t stbcnt;
+	compat_int_t tai;
 
 	compat_int_t :32; compat_int_t :32; compat_int_t :32; compat_int_t :32;
 	compat_int_t :32; compat_int_t :32; compat_int_t :32; compat_int_t :32;
-	compat_int_t :32; compat_int_t :32; compat_int_t :32; compat_int_t :32;
+	compat_int_t :32; compat_int_t :32; compat_int_t :32;
 };
 
 #define _COMPAT_NSIG_WORDS	(_COMPAT_NSIG / _COMPAT_NSIG_BPW)
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 9fbdd12a52f1..6ff92237759a 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -121,9 +121,11 @@ struct timex {
 	long errcnt;            /* calibration errors (ro) */
 	long stbcnt;            /* stability limit exceeded (ro) */
 
+	int tai;		/* TAI offset (ro) */
+
 	int  :32; int  :32; int  :32; int  :32;
 	int  :32; int  :32; int  :32; int  :32;
-	int  :32; int  :32; int  :32; int  :32;
+	int  :32; int  :32; int  :32;
 };
 
 /*
@@ -135,6 +137,7 @@ struct timex {
 #define ADJ_ESTERROR		0x0008	/* estimated time error */
 #define ADJ_STATUS		0x0010	/* clock status */
 #define ADJ_TIMECONST		0x0020	/* pll time constant */
+#define ADJ_TAI			0x0080	/* set TAI offset */
 #define ADJ_MICRO		0x1000	/* select microsecond resolution */
 #define ADJ_NANO		0x2000	/* select nanosecond resolution */
 #define ADJ_TICK		0x4000	/* tick value */
diff --git a/kernel/compat.c b/kernel/compat.c
index 4a856a3643bb..32c254a8ab9a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -955,7 +955,8 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 			__put_user(txc.jitcnt, &utp->jitcnt) ||
 			__put_user(txc.calcnt, &utp->calcnt) ||
 			__put_user(txc.errcnt, &utp->errcnt) ||
-			__put_user(txc.stbcnt, &utp->stbcnt))
+			__put_user(txc.stbcnt, &utp->stbcnt) ||
+			__put_user(txc.tai, &utp->tai))
 		ret = -EFAULT;
 
 	return ret;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 44491de312a0..10fe17df45a0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -35,6 +35,7 @@ static u64 tick_length, tick_length_base;
 /* TIME_ERROR prevents overwriting the CMOS clock */
 static int time_state = TIME_OK;	/* clock synchronization status	*/
 int time_status = STA_UNSYNC;		/* clock status bits		*/
+static long time_tai;			/* TAI offset (s)		*/
 static s64 time_offset;			/* time adjustment (ns)		*/
 static long time_constant = 2;		/* pll time constant		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
@@ -162,6 +163,7 @@ void second_overflow(void)
 	case TIME_DEL:
 		if ((xtime.tv_sec + 1) % 86400 == 0) {
 			xtime.tv_sec++;
+			time_tai--;
 			wall_to_monotonic.tv_sec--;
 			time_state = TIME_WAIT;
 			printk(KERN_NOTICE "Clock: deleting leap second "
@@ -169,6 +171,7 @@ void second_overflow(void)
 		}
 		break;
 	case TIME_OOP:
+		time_tai++;
 		time_state = TIME_WAIT;
 		break;
 	case TIME_WAIT:
@@ -340,6 +343,9 @@ int do_adjtimex(struct timex *txc)
 			time_constant = max(time_constant, 0l);
 		}
 
+		if (txc->modes & ADJ_TAI && txc->constant > 0)
+			time_tai = txc->constant;
+
 		if (txc->modes & ADJ_OFFSET) {
 			if (txc->modes == ADJ_OFFSET_SINGLESHOT)
 				/* adjtime() is independent from ntp_adjtime() */
@@ -377,6 +383,7 @@ int do_adjtimex(struct timex *txc)
 	txc->precision	   = 1;
 	txc->tolerance	   = MAXFREQ_SCALED / PPM_SCALE;
 	txc->tick	   = tick_usec;
+	txc->tai	   = time_tai;
 
 	/* PPS is not implemented, so these are zero */
 	txc->ppsfreq	   = 0;
-- 
cgit v1.3-14-g43fede


From 7fc5c78409479d826341b103bdf734cb4fb02436 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:38 -0700
Subject: ntp: rename TICK_LENGTH_SHIFT to NTP_SCALE_SHIFT

As TICK_LENGTH_SHIFT is used for more than just the tick length, the name
isn't quite approriate anymore, so this renames it to NTP_SCALE_SHIFT.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/time.c |  2 +-
 include/linux/timex.h      |  8 ++++----
 kernel/time/ntp.c          | 20 ++++++++++----------
 kernel/time/timekeeping.c  | 10 +++++-----
 4 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index c146af995854..73401e83739a 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -149,7 +149,7 @@ EXPORT_SYMBOL(tb_ticks_per_sec);	/* for cputime_t conversions */
 u64 tb_to_xs;
 unsigned tb_to_us;
 
-#define TICKLEN_SCALE	TICK_LENGTH_SHIFT
+#define TICKLEN_SCALE	NTP_SCALE_SHIFT
 u64 last_tick_len;	/* units are ns / 2^TICKLEN_SCALE */
 u64 ticklen_to_xs;	/* 0.64 fraction */
 
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 6ff92237759a..5994d39de778 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -81,14 +81,14 @@
  * offset and maximum frequency tolerance.
  */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
-#define PPM_SCALE (NSEC_PER_USEC << (TICK_LENGTH_SHIFT - SHIFT_USEC))
+#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
 #define PPM_SCALE_INV_SHIFT 20
-#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + TICK_LENGTH_SHIFT)) / \
+#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
 		       PPM_SCALE + 1)
 
 #define MAXPHASE 500000000l	/* max phase error (ns) */
 #define MAXFREQ 500000		/* max frequency error (ns/s) */
-#define MAXFREQ_SCALED ((s64)MAXFREQ << TICK_LENGTH_SHIFT)
+#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT)
 #define MINSEC 256		/* min interval between updates (s) */
 #define MAXSEC 2048		/* max interval between updates (s) */
 #define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */
@@ -230,7 +230,7 @@ static inline int ntp_synced(void)
 	__x < 0 ? -(-__x >> __s) : __x >> __s;	\
 })
 
-#define TICK_LENGTH_SHIFT	32
+#define NTP_SCALE_SHIFT		32
 
 #ifdef CONFIG_NO_HZ
 #define NTP_INTERVAL_FREQ  (2)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 10fe17df45a0..a8fd1ba1ef19 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -27,7 +27,7 @@ static u64 tick_length, tick_length_base;
 
 #define MAX_TICKADJ		500		/* microsecs */
 #define MAX_TICKADJ_SCALED	(((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
-				  TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ)
+				  NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
 
 /*
  * phase-lock loop variables
@@ -48,13 +48,13 @@ static long ntp_tick_adj;
 static void ntp_update_frequency(void)
 {
 	u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
-				<< TICK_LENGTH_SHIFT;
-	second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
+				<< NTP_SCALE_SHIFT;
+	second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
 	second_length += time_freq;
 
 	tick_length_base = second_length;
 
-	tick_nsec = div_u64(second_length, HZ) >> TICK_LENGTH_SHIFT;
+	tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
 	tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
 }
 
@@ -86,10 +86,10 @@ static void ntp_update_offset(long offset)
 	time_reftime = xtime.tv_sec;
 
 	freq_adj = (s64)offset * mtemp;
-	freq_adj <<= TICK_LENGTH_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
+	freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
 	time_status &= ~STA_MODE;
 	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
-		freq_adj += div_s64((s64)offset << (TICK_LENGTH_SHIFT - SHIFT_FLL),
+		freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
 				    mtemp);
 		time_status |= STA_MODE;
 	}
@@ -97,7 +97,7 @@ static void ntp_update_offset(long offset)
 	freq_adj = min(freq_adj, MAXFREQ_SCALED);
 	time_freq = max(freq_adj, -MAXFREQ_SCALED);
 
-	time_offset = div_s64((s64)offset << TICK_LENGTH_SHIFT, NTP_INTERVAL_FREQ);
+	time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
 }
 
 /**
@@ -197,7 +197,7 @@ void second_overflow(void)
 			tick_length -= MAX_TICKADJ_SCALED;
 		} else {
 			tick_length += (s64)(time_adjust * NSEC_PER_USEC /
-					NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT;
+					NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
 			time_adjust = 0;
 		}
 	}
@@ -369,13 +369,13 @@ int do_adjtimex(struct timex *txc)
 		txc->offset = save_adjust;
 	else {
 		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
-					  TICK_LENGTH_SHIFT);
+					  NTP_SCALE_SHIFT);
 		if (!(time_status & STA_NANO))
 			txc->offset /= NSEC_PER_USEC;
 	}
 	txc->freq	   = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
 					 (s64)PPM_SCALE_INV,
-					 TICK_LENGTH_SHIFT);
+					 NTP_SCALE_SHIFT);
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2d6087c7cf98..a26429bc772a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -371,7 +371,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
 	 * here.  This is tuned so that an error of about 1 msec is adjusted
 	 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
 	 */
-	error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
+	error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
 	error2 = abs(error2);
 	for (look_ahead = 0; error2 > 0; look_ahead++)
 		error2 >>= 2;
@@ -381,7 +381,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
 	 * remove the single look ahead already included in the error.
 	 */
 	tick_error = current_tick_length() >>
-		(TICK_LENGTH_SHIFT - clock->shift + 1);
+		(NTP_SCALE_SHIFT - clock->shift + 1);
 	tick_error -= clock->xtime_interval >> 1;
 	error = ((error - tick_error) >> look_ahead) + tick_error;
 
@@ -412,7 +412,7 @@ static void clocksource_adjust(s64 offset)
 	s64 error, interval = clock->cycle_interval;
 	int adj;
 
-	error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
+	error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
 	if (error > interval) {
 		error >>= 2;
 		if (likely(error <= interval))
@@ -434,7 +434,7 @@ static void clocksource_adjust(s64 offset)
 	clock->xtime_interval += interval;
 	clock->xtime_nsec -= offset;
 	clock->error -= (interval - offset) <<
-			(TICK_LENGTH_SHIFT - clock->shift);
+			(NTP_SCALE_SHIFT - clock->shift);
 }
 
 /**
@@ -474,7 +474,7 @@ void update_wall_time(void)
 
 		/* accumulate error between NTP and clock interval */
 		clock->error += current_tick_length();
-		clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
+		clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
 	}
 
 	/* correct the clock when NTP error is too big */
-- 
cgit v1.3-14-g43fede


From 8383c42399f394a89bd6c2f03632c53689bdde7a Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:39 -0700
Subject: ntp: remove current_tick_length()

current_tick_length used to do a little more, but now it just returns
tick_length, which we can also access directly at the few places, where it's
needed.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/timex.h     |  4 ++--
 kernel/time/ntp.c         | 16 ++--------------
 kernel/time/timekeeping.c |  5 ++---
 3 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 5994d39de778..da431f8c5c6e 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -239,8 +239,8 @@ static inline int ntp_synced(void)
 #endif
 #define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ)
 
-/* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */
-extern u64 current_tick_length(void);
+/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
+extern u64 tick_length;
 
 extern void second_overflow(void);
 extern void update_ntp_one_tick(void);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index a8fd1ba1ef19..df9718bac8d0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -23,7 +23,8 @@
  */
 unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
 unsigned long tick_nsec;			/* ACTHZ period (nsec) */
-static u64 tick_length, tick_length_base;
+u64 tick_length;
+static u64 tick_length_base;
 
 #define MAX_TICKADJ		500		/* microsecs */
 #define MAX_TICKADJ_SCALED	(((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
@@ -203,19 +204,6 @@ void second_overflow(void)
 	}
 }
 
-/*
- * Return how long ticks are at the moment, that is, how much time
- * update_wall_time_one_tick will add to xtime next time we call it
- * (assuming no calls to do_adjtimex in the meantime).
- * The return value is in fixed-point nanoseconds shifted by the
- * specified number of bits to the right of the binary point.
- * This function has no side-effects.
- */
-u64 current_tick_length(void)
-{
-	return tick_length;
-}
-
 #ifdef CONFIG_GENERIC_CMOS_UPDATE
 
 /* Disable the cmos update - used by virtualization and embedded */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a26429bc772a..7e74d8092067 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -380,8 +380,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
 	 * Now calculate the error in (1 << look_ahead) ticks, but first
 	 * remove the single look ahead already included in the error.
 	 */
-	tick_error = current_tick_length() >>
-		(NTP_SCALE_SHIFT - clock->shift + 1);
+	tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
 	tick_error -= clock->xtime_interval >> 1;
 	error = ((error - tick_error) >> look_ahead) + tick_error;
 
@@ -473,7 +472,7 @@ void update_wall_time(void)
 		}
 
 		/* accumulate error between NTP and clock interval */
-		clock->error += current_tick_length();
+		clock->error += tick_length;
 		clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
 	}
 
-- 
cgit v1.3-14-g43fede


From 7dffa3c673fbcf835cd7be80bb4aec8ad3f51168 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Thu, 1 May 2008 04:34:41 -0700
Subject: ntp: handle leap second via timer

Remove the leap second handling from second_overflow(), which doesn't have to
check for it every second anymore.  With CONFIG_NO_HZ this also makes sure the
leap second is handled close to the full second.  Additionally this makes it
possible to abort a leap second properly by resetting the STA_INS/STA_DEL
status bits.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/clocksource.h |   2 +
 include/linux/timex.h       |   1 +
 kernel/time/ntp.c           | 133 ++++++++++++++++++++++++++++++--------------
 kernel/time/timekeeping.c   |   4 +-
 4 files changed, 95 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 35094479ca55..55e434feec99 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -93,6 +93,8 @@ struct clocksource {
 #endif
 };
 
+extern struct clocksource *clock;	/* current clocksource */
+
 /*
  * Clock source flags bits::
  */
diff --git a/include/linux/timex.h b/include/linux/timex.h
index da431f8c5c6e..fc6035d29d56 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -212,6 +212,7 @@ extern long time_esterror;	/* estimated error */
 
 extern long time_adjust;	/* The amount of adjtime left */
 
+extern void ntp_init(void);
 extern void ntp_clear(void);
 
 /**
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index df9718bac8d0..5125ddd8196b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,7 @@
 #include <linux/hrtimer.h>
 #include <linux/capability.h>
 #include <linux/math64.h>
+#include <linux/clocksource.h>
 #include <asm/timex.h>
 
 /*
@@ -26,6 +27,8 @@ unsigned long tick_nsec;			/* ACTHZ period (nsec) */
 u64 tick_length;
 static u64 tick_length_base;
 
+static struct hrtimer leap_timer;
+
 #define MAX_TICKADJ		500		/* microsecs */
 #define MAX_TICKADJ_SCALED	(((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
 				  NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -120,64 +123,70 @@ void ntp_clear(void)
 }
 
 /*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second.
  */
-void second_overflow(void)
+static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 {
-	s64 time_adj;
+	enum hrtimer_restart res = HRTIMER_NORESTART;
 
-	/* Bump the maxerror field */
-	time_maxerror += MAXFREQ / NSEC_PER_USEC;
-	if (time_maxerror > NTP_PHASE_LIMIT) {
-		time_maxerror = NTP_PHASE_LIMIT;
-		time_status |= STA_UNSYNC;
-	}
+	write_seqlock_irq(&xtime_lock);
 
-	/*
-	 * Leap second processing. If in leap-insert state at the end of the
-	 * day, the system clock is set back one second; if in leap-delete
-	 * state, the system clock is set ahead one second. The microtime()
-	 * routine or external clock driver will insure that reported time is
-	 * always monotonic. The ugly divides should be replaced.
-	 */
 	switch (time_state) {
 	case TIME_OK:
-		if (time_status & STA_INS)
-			time_state = TIME_INS;
-		else if (time_status & STA_DEL)
-			time_state = TIME_DEL;
 		break;
 	case TIME_INS:
-		if (xtime.tv_sec % 86400 == 0) {
-			xtime.tv_sec--;
-			wall_to_monotonic.tv_sec++;
-			time_state = TIME_OOP;
-			printk(KERN_NOTICE "Clock: inserting leap second "
-					"23:59:60 UTC\n");
-		}
+		xtime.tv_sec--;
+		wall_to_monotonic.tv_sec++;
+		time_state = TIME_OOP;
+		printk(KERN_NOTICE "Clock: "
+		       "inserting leap second 23:59:60 UTC\n");
+		leap_timer.expires = ktime_add_ns(leap_timer.expires,
+						  NSEC_PER_SEC);
+		res = HRTIMER_RESTART;
 		break;
 	case TIME_DEL:
-		if ((xtime.tv_sec + 1) % 86400 == 0) {
-			xtime.tv_sec++;
-			time_tai--;
-			wall_to_monotonic.tv_sec--;
-			time_state = TIME_WAIT;
-			printk(KERN_NOTICE "Clock: deleting leap second "
-					"23:59:59 UTC\n");
-		}
+		xtime.tv_sec++;
+		time_tai--;
+		wall_to_monotonic.tv_sec--;
+		time_state = TIME_WAIT;
+		printk(KERN_NOTICE "Clock: "
+		       "deleting leap second 23:59:59 UTC\n");
 		break;
 	case TIME_OOP:
 		time_tai++;
 		time_state = TIME_WAIT;
-		break;
+		/* fall through */
 	case TIME_WAIT:
 		if (!(time_status & (STA_INS | STA_DEL)))
 			time_state = TIME_OK;
+		break;
+	}
+	update_vsyscall(&xtime, clock);
+
+	write_sequnlock_irq(&xtime_lock);
+
+	return res;
+}
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ */
+void second_overflow(void)
+{
+	s64 time_adj;
+
+	/* Bump the maxerror field */
+	time_maxerror += MAXFREQ / NSEC_PER_USEC;
+	if (time_maxerror > NTP_PHASE_LIMIT) {
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_status |= STA_UNSYNC;
 	}
 
 	/*
@@ -268,7 +277,7 @@ static inline void notify_cmos_timer(void) { }
 int do_adjtimex(struct timex *txc)
 {
 	struct timespec ts;
-	long save_adjust;
+	long save_adjust, sec;
 	int result;
 
 	/* In order to modify anything, you gotta be super-user! */
@@ -289,6 +298,10 @@ int do_adjtimex(struct timex *txc)
 		    txc->tick > 1100000/USER_HZ)
 			return -EINVAL;
 
+	if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
+		hrtimer_cancel(&leap_timer);
+	getnstimeofday(&ts);
+
 	write_seqlock_irq(&xtime_lock);
 
 	/* Save for later - semantics of adjtime is to return old value */
@@ -305,6 +318,34 @@ int do_adjtimex(struct timex *txc)
 			/* only set allowed bits */
 			time_status &= STA_RONLY;
 			time_status |= txc->status & ~STA_RONLY;
+
+			switch (time_state) {
+			case TIME_OK:
+			start_timer:
+				sec = ts.tv_sec;
+				if (time_status & STA_INS) {
+					time_state = TIME_INS;
+					sec += 86400 - sec % 86400;
+					hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
+				} else if (time_status & STA_DEL) {
+					time_state = TIME_DEL;
+					sec += 86400 - (sec + 1) % 86400;
+					hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
+				}
+				break;
+			case TIME_INS:
+			case TIME_DEL:
+				time_state = TIME_OK;
+				goto start_timer;
+				break;
+			case TIME_WAIT:
+				if (!(time_status & (STA_INS | STA_DEL)))
+					time_state = TIME_OK;
+				break;
+			case TIME_OOP:
+				hrtimer_restart(&leap_timer);
+				break;
+			}
 		}
 
 		if (txc->modes & ADJ_NANO)
@@ -384,7 +425,6 @@ int do_adjtimex(struct timex *txc)
 	txc->stbcnt	   = 0;
 	write_sequnlock_irq(&xtime_lock);
 
-	getnstimeofday(&ts);
 	txc->time.tv_sec = ts.tv_sec;
 	txc->time.tv_usec = ts.tv_nsec;
 	if (!(time_status & STA_NANO))
@@ -402,3 +442,10 @@ static int __init ntp_tick_adj_setup(char *str)
 }
 
 __setup("ntp_tick_adj=", ntp_tick_adj_setup);
+
+void __init ntp_init(void)
+{
+	ntp_clear();
+	hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	leap_timer.function = ntp_leap_second;
+}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e74d8092067..e91c29f961c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -53,7 +53,7 @@ void update_xtime_cache(u64 nsec)
 	timespec_add_ns(&xtime_cache, nsec);
 }
 
-static struct clocksource *clock; /* pointer to current clocksource */
+struct clocksource *clock;
 
 
 #ifdef CONFIG_GENERIC_TIME
@@ -246,7 +246,7 @@ void __init timekeeping_init(void)
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 
-	ntp_clear();
+	ntp_init();
 
 	clock = clocksource_get_next();
 	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-- 
cgit v1.3-14-g43fede


From be089d79c46f5efa77fbdf03c5e576e220bf143f Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Thu, 1 May 2008 04:34:49 -0700
Subject: kexec: make extended crashkernel= syntax less confusing

The extended crashkernel syntax is a little confusing in the way it handles
ranges.  eg:

 crashkernel=512M-2G:64M,2G-:128M

Means if the machine has between 512M and 2G of memory the crash region should
be 64M, and if the machine has 2G of memory the region should be 64M.  Only if
the machine has more than 2G memory will 128M be allocated.

Although that semantic is correct, it is somewhat baffling.  Instead I propose
that the end of the range means the first address past the end of the range,
ie: 512M up to but not including 2G.

[bwalle@suse.de: clarify inclusive/exclusive in crashkernel commandline in documentation]
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Acked-by: Bernhard Walle <bwalle@suse.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Simon Horman <horms@verge.net.au>
Signed-off-by: Bernhard Walle <bwalle@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kdump/kdump.txt | 5 ++++-
 kernel/kexec.c                | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index d0ac72cc19ff..b8e52c0355d3 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -245,6 +245,8 @@ The syntax is:
     crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
     range=start-[end]
 
+    'start' is inclusive and 'end' is exclusive.
+
 For example:
 
     crashkernel=512M-2G:64M,2G-:128M
@@ -253,10 +255,11 @@ This would mean:
 
     1) if the RAM is smaller than 512M, then don't reserve anything
        (this is the "rescue" case)
-    2) if the RAM size is between 512M and 2G, then reserve 64M
+    2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M
     3) if the RAM size is larger than 2G, then reserve 128M
 
 
+
 Boot into System Kernel
 =======================
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cb85c79989b4..1c5fcacbcf33 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1217,7 +1217,7 @@ static int __init parse_crashkernel_mem(char 			*cmdline,
 		}
 
 		/* match ? */
-		if (system_ram >= start && system_ram <= end) {
+		if (system_ram >= start && system_ram < end) {
 			*crash_size = size;
 			break;
 		}
-- 
cgit v1.3-14-g43fede


From 8a3e77cc212f3bc8eccc95e0d046405cf2a02764 Mon Sep 17 00:00:00 2001
From: Andrew Liu <shengping.liu@windriver.com>
Date: Thu, 1 May 2008 04:35:14 -0700
Subject: workqueue: remove redundant function invocation

timer_stats_timer_set_start_info is invoked twice, additionally, the
invocation of this function can be moved to where it is only called when a
delay is really required.

Signed-off-by: Andrew Liu <shengping.liu@windriver.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 721093a22561..29fc39f1029c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -195,7 +195,6 @@ static void delayed_work_timer_fn(unsigned long __data)
 int queue_delayed_work(struct workqueue_struct *wq,
 			struct delayed_work *dwork, unsigned long delay)
 {
-	timer_stats_timer_set_start_info(&dwork->timer);
 	if (delay == 0)
 		return queue_work(wq, &dwork->work);
 
@@ -219,11 +218,12 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct timer_list *timer = &dwork->timer;
 	struct work_struct *work = &dwork->work;
 
-	timer_stats_timer_set_start_info(&dwork->timer);
 	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
+		timer_stats_timer_set_start_info(&dwork->timer);
+
 		/* This stores cwq for the moment, for the timer_fn */
 		set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
 		timer->expires = jiffies + delay;
@@ -564,7 +564,6 @@ EXPORT_SYMBOL(schedule_work);
 int schedule_delayed_work(struct delayed_work *dwork,
 					unsigned long delay)
 {
-	timer_stats_timer_set_start_info(&dwork->timer);
 	return queue_delayed_work(keventd_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work);
@@ -581,7 +580,6 @@ EXPORT_SYMBOL(schedule_delayed_work);
 int schedule_delayed_work_on(int cpu,
 			struct delayed_work *dwork, unsigned long delay)
 {
-	timer_stats_timer_set_start_info(&dwork->timer);
 	return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work_on);
-- 
cgit v1.3-14-g43fede


From 9f3acc3140444a900ab280de942291959f0f615d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Apr 2008 07:44:08 -0400
Subject: [PATCH] split linux/file.h

Initial splitoff of the low-level stuff; taken to fdtable.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/mips/kernel/kspd.c                      |  1 +
 arch/powerpc/platforms/cell/spufs/coredump.c |  1 +
 drivers/char/tty_audit.c                     |  1 +
 drivers/char/tty_io.c                        |  1 +
 fs/compat.c                                  |  1 +
 fs/dnotify.c                                 |  2 +-
 fs/exec.c                                    |  1 +
 fs/fcntl.c                                   |  1 +
 fs/file.c                                    |  1 +
 fs/file_table.c                              |  1 +
 fs/locks.c                                   |  1 +
 fs/open.c                                    |  1 +
 fs/proc/array.c                              |  1 +
 fs/proc/base.c                               |  1 +
 fs/select.c                                  |  1 +
 include/linux/fdtable.h                      | 99 ++++++++++++++++++++++++++++
 include/linux/file.h                         | 86 +-----------------------
 include/linux/init_task.h                    |  2 +-
 kernel/exit.c                                |  1 +
 kernel/fork.c                                |  1 +
 kernel/kmod.c                                |  1 +
 security/selinux/hooks.c                     |  1 +
 22 files changed, 121 insertions(+), 86 deletions(-)
 create mode 100644 include/linux/fdtable.h

(limited to 'kernel')

diff --git a/arch/mips/kernel/kspd.c b/arch/mips/kernel/kspd.c
index 998c4efcce88..ceb62dce1c9c 100644
--- a/arch/mips/kernel/kspd.c
+++ b/arch/mips/kernel/kspd.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/unistd.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/syscalls.h>
 #include <linux/workqueue.h>
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index b962c3ab470c..af116aadba10 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -22,6 +22,7 @@
 
 #include <linux/elf.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/list.h>
 #include <linux/module.h>
diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c
index 6342b0534f4d..3582f43345a8 100644
--- a/drivers/char/tty_audit.c
+++ b/drivers/char/tty_audit.c
@@ -11,6 +11,7 @@
 
 #include <linux/audit.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/tty.h>
 
 struct tty_audit_buf {
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 1d298c2cf930..49c1a2267a55 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -78,6 +78,7 @@
 #include <linux/tty_flip.h>
 #include <linux/devpts_fs.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/console.h>
 #include <linux/timer.h>
 #include <linux/ctype.h>
diff --git a/fs/compat.c b/fs/compat.c
index 139dc93c092d..332a869d2c53 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -24,6 +24,7 @@
 #include <linux/fcntl.h>
 #include <linux/namei.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/vfs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
diff --git a/fs/dnotify.c b/fs/dnotify.c
index eaecc4cfe540..676073b8dda5 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -20,7 +20,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
-#include <linux/file.h>
+#include <linux/fdtable.h>
 
 int dir_notify_enable __read_mostly = 1;
 
diff --git a/fs/exec.c b/fs/exec.c
index 9f9f931ef949..aeaa9791d8be 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -24,6 +24,7 @@
 
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/mman.h>
 #include <linux/a.out.h>
 #include <linux/stat.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3f3ac630ccde..bfd776509a72 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/capability.h>
 #include <linux/dnotify.h>
 #include <linux/smp_lock.h>
diff --git a/fs/file.c b/fs/file.c
index 5110acb1c9ef..f6fbcb49faf7 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/bitops.h>
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
diff --git a/fs/file_table.c b/fs/file_table.c
index 7a0a9b872251..83084225b4c3 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -8,6 +8,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
diff --git a/fs/locks.c b/fs/locks.c
index 44d9a6a7ec50..663c069b59b3 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -116,6 +116,7 @@
 
 #include <linux/capability.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/module.h>
diff --git a/fs/open.c b/fs/open.c
index 7af1f05d5978..a1450086e92f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/quotaops.h>
 #include <linux/fsnotify.h>
 #include <linux/module.h>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c135cbdd9127..dca997a93bff 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -73,6 +73,7 @@
 #include <linux/signal.h>
 #include <linux/highmem.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/times.h>
 #include <linux/cpuset.h>
 #include <linux/rcupdate.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fcf02f2deeba..808cbdc193d3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -56,6 +56,7 @@
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/string.h>
 #include <linux/seq_file.h>
 #include <linux/namei.h>
diff --git a/fs/select.c b/fs/select.c
index 80d70387e790..8dda969614a9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -21,6 +21,7 @@
 #include <linux/poll.h>
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/rcupdate.h>
 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
new file mode 100644
index 000000000000..a118f3c0b240
--- /dev/null
+++ b/include/linux/fdtable.h
@@ -0,0 +1,99 @@
+/*
+ * descriptor table internals; you almost certainly want file.h instead.
+ */
+
+#ifndef __LINUX_FDTABLE_H
+#define __LINUX_FDTABLE_H
+
+#include <asm/atomic.h>
+#include <linux/posix_types.h>
+#include <linux/compiler.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+
+/*
+ * The default fd array needs to be at least BITS_PER_LONG,
+ * as this is the granularity returned by copy_fdset().
+ */
+#define NR_OPEN_DEFAULT BITS_PER_LONG
+
+/*
+ * The embedded_fd_set is a small fd_set,
+ * suitable for most tasks (which open <= BITS_PER_LONG files)
+ */
+struct embedded_fd_set {
+	unsigned long fds_bits[1];
+};
+
+struct fdtable {
+	unsigned int max_fds;
+	struct file ** fd;      /* current fd array */
+	fd_set *close_on_exec;
+	fd_set *open_fds;
+	struct rcu_head rcu;
+	struct fdtable *next;
+};
+
+/*
+ * Open file table structure
+ */
+struct files_struct {
+  /*
+   * read mostly part
+   */
+	atomic_t count;
+	struct fdtable *fdt;
+	struct fdtable fdtab;
+  /*
+   * written part on a separate cache line in SMP
+   */
+	spinlock_t file_lock ____cacheline_aligned_in_smp;
+	int next_fd;
+	struct embedded_fd_set close_on_exec_init;
+	struct embedded_fd_set open_fds_init;
+	struct file * fd_array[NR_OPEN_DEFAULT];
+};
+
+#define files_fdtable(files) (rcu_dereference((files)->fdt))
+
+extern struct kmem_cache *filp_cachep;
+
+struct file_operations;
+struct vfsmount;
+struct dentry;
+
+extern int expand_files(struct files_struct *, int nr);
+extern void free_fdtable_rcu(struct rcu_head *rcu);
+extern void __init files_defer_init(void);
+
+static inline void free_fdtable(struct fdtable *fdt)
+{
+	call_rcu(&fdt->rcu, free_fdtable_rcu);
+}
+
+static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
+{
+	struct file * file = NULL;
+	struct fdtable *fdt = files_fdtable(files);
+
+	if (fd < fdt->max_fds)
+		file = rcu_dereference(fdt->fd[fd]);
+	return file;
+}
+
+/*
+ * Check whether the specified fd has an open file.
+ */
+#define fcheck(fd)	fcheck_files(current->files, fd)
+
+struct task_struct;
+
+struct files_struct *get_files_struct(struct task_struct *);
+void put_files_struct(struct files_struct *fs);
+void reset_files_struct(struct files_struct *);
+int unshare_files(struct files_struct **);
+
+extern struct kmem_cache *files_cachep;
+
+#endif /* __LINUX_FDTABLE_H */
diff --git a/include/linux/file.h b/include/linux/file.h
index 69baf5a4f0a5..27c64bdc68c9 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -5,59 +5,11 @@
 #ifndef __LINUX_FILE_H
 #define __LINUX_FILE_H
 
-#include <asm/atomic.h>
-#include <linux/posix_types.h>
 #include <linux/compiler.h>
-#include <linux/spinlock.h>
-#include <linux/rcupdate.h>
 #include <linux/types.h>
+#include <linux/posix_types.h>
 
-/*
- * The default fd array needs to be at least BITS_PER_LONG,
- * as this is the granularity returned by copy_fdset().
- */
-#define NR_OPEN_DEFAULT BITS_PER_LONG
-
-/*
- * The embedded_fd_set is a small fd_set,
- * suitable for most tasks (which open <= BITS_PER_LONG files)
- */
-struct embedded_fd_set {
-	unsigned long fds_bits[1];
-};
-
-struct fdtable {
-	unsigned int max_fds;
-	struct file ** fd;      /* current fd array */
-	fd_set *close_on_exec;
-	fd_set *open_fds;
-	struct rcu_head rcu;
-	struct fdtable *next;
-};
-
-/*
- * Open file table structure
- */
-struct files_struct {
-  /*
-   * read mostly part
-   */
-	atomic_t count;
-	struct fdtable *fdt;
-	struct fdtable fdtab;
-  /*
-   * written part on a separate cache line in SMP
-   */
-	spinlock_t file_lock ____cacheline_aligned_in_smp;
-	int next_fd;
-	struct embedded_fd_set close_on_exec_init;
-	struct embedded_fd_set open_fds_init;
-	struct file * fd_array[NR_OPEN_DEFAULT];
-};
-
-#define files_fdtable(files) (rcu_dereference((files)->fdt))
-
-extern struct kmem_cache *filp_cachep;
+struct file;
 
 extern void __fput(struct file *);
 extern void fput(struct file *);
@@ -85,41 +37,7 @@ extern void put_filp(struct file *);
 extern int get_unused_fd(void);
 extern int get_unused_fd_flags(int flags);
 extern void put_unused_fd(unsigned int fd);
-struct kmem_cache;
-
-extern int expand_files(struct files_struct *, int nr);
-extern void free_fdtable_rcu(struct rcu_head *rcu);
-extern void __init files_defer_init(void);
-
-static inline void free_fdtable(struct fdtable *fdt)
-{
-	call_rcu(&fdt->rcu, free_fdtable_rcu);
-}
-
-static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
-{
-	struct file * file = NULL;
-	struct fdtable *fdt = files_fdtable(files);
-
-	if (fd < fdt->max_fds)
-		file = rcu_dereference(fdt->fd[fd]);
-	return file;
-}
-
-/*
- * Check whether the specified fd has an open file.
- */
-#define fcheck(fd)	fcheck_files(current->files, fd)
 
 extern void fd_install(unsigned int fd, struct file *file);
 
-struct task_struct;
-
-struct files_struct *get_files_struct(struct task_struct *);
-void put_files_struct(struct files_struct *fs);
-void reset_files_struct(struct files_struct *);
-int unshare_files(struct files_struct **);
-
-extern struct kmem_cache *files_cachep;
-
 #endif /* __LINUX_FILE_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bf6b8a61f8db..b24c2875aa05 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX__INIT_TASK_H
 #define _LINUX__INIT_TASK_H
 
-#include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/rcupdate.h>
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index d3ad54677f9c..1510f78a0ffa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -19,6 +19,7 @@
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/binfmts.h>
 #include <linux/nsproxy.h>
 #include <linux/pid_namespace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 2bb675af4de3..933e60ebccae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -22,6 +22,7 @@
 #include <linux/mempolicy.h>
 #include <linux/sem.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/key.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index e2764047ec03..8df97d3dfda8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -27,6 +27,7 @@
 #include <linux/mnt_namespace.h>
 #include <linux/completion.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/mount.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 1b50a6ebc55f..1c864c0efe2b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -39,6 +39,7 @@
 #include <linux/spinlock.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/ext2_fs.h>
-- 
cgit v1.3-14-g43fede


From bcf35afb528109a31264b45d4851fa6ae72dbe18 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 May 2008 18:43:12 +0200
Subject: make generic sys_ptrace unconditional

With s390 the last arch switched to the generic sys_ptrace yesterday so
we can now kill the ifdef around it to enforce every new port it using
it instead of introducing new weirdo versions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dcc199c43a12..6c19e94fd0a5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -534,7 +534,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
 #define arch_ptrace_attach(child)	do { } while (0)
 #endif
 
-#ifndef __ARCH_SYS_PTRACE
 asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 {
 	struct task_struct *child;
@@ -582,7 +581,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 	unlock_kernel();
 	return ret;
 }
-#endif /* __ARCH_SYS_PTRACE */
 
 int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
 {
-- 
cgit v1.3-14-g43fede


From 1adb0850a1254333d81e64121c80af100c6d6e06 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 28 Apr 2008 17:01:56 +0200
Subject: genirq: reenable a nobody cared disabled irq when a new driver
 arrives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Uwe Kleine-Koenig has some strange hardware where one of the shared
interrupts can be asserted during boot before the appropriate driver
loads. Requesting the shared irq line from another driver result in a
spurious interrupt storm which finally disables the interrupt line.

I have seen similar behaviour on resume before (the hardware does not
work anymore so I can not verify).

Change the spurious disable logic to increment the disable depth and
mark the interrupt with an extra flag which allows us to reenable the
interrupt when a new driver arrives which requests the same irq
line. In the worst case this will disable the irq again via the
spurious trap, but there is a decent chance that the new driver is the
one which can handle the already asserted interrupt and makes the box
usable again.

Eric Biederman said further: This case also happens on a regular basis
in kdump kernels where we deliberately don't shutdown the hardware
before starting the new kernel.  This patch should reduce the need for
using irqpoll in that situation by a small amount.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-and-Acked-by: Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
---
 include/linux/irq.h   |  1 +
 kernel/irq/manage.c   | 49 ++++++++++++++++++++++++++++++++-----------------
 kernel/irq/spurious.c |  4 ++--
 3 files changed, 35 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1883a85625dd..552e0ec269c9 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -61,6 +61,7 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_WAKEUP		0x00100000	/* IRQ triggers system wakeup */
 #define IRQ_MOVE_PENDING	0x00200000	/* need to re-target IRQ destination */
 #define IRQ_NO_BALANCING	0x00400000	/* IRQ is excluded from balancing */
+#define IRQ_SPURIOUS_DISABLED	0x00800000	/* IRQ was disabled by the spurious trap */
 
 #ifdef CONFIG_IRQ_PER_CPU
 # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46e4ad1723f0..46d6611a33bb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -150,6 +150,26 @@ void disable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(disable_irq);
 
+static void __enable_irq(struct irq_desc *desc, unsigned int irq)
+{
+	switch (desc->depth) {
+	case 0:
+		printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+		WARN_ON(1);
+		break;
+	case 1: {
+		unsigned int status = desc->status & ~IRQ_DISABLED;
+
+		/* Prevent probing on this irq: */
+		desc->status = status | IRQ_NOPROBE;
+		check_irq_resend(desc, irq);
+		/* fall-through */
+	}
+	default:
+		desc->depth--;
+	}
+}
+
 /**
  *	enable_irq - enable handling of an irq
  *	@irq: Interrupt to enable
@@ -169,22 +189,7 @@ void enable_irq(unsigned int irq)
 		return;
 
 	spin_lock_irqsave(&desc->lock, flags);
-	switch (desc->depth) {
-	case 0:
-		printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
-		WARN_ON(1);
-		break;
-	case 1: {
-		unsigned int status = desc->status & ~IRQ_DISABLED;
-
-		/* Prevent probing on this irq: */
-		desc->status = status | IRQ_NOPROBE;
-		check_irq_resend(desc, irq);
-		/* fall-through */
-	}
-	default:
-		desc->depth--;
-	}
+	__enable_irq(desc, irq);
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 EXPORT_SYMBOL(enable_irq);
@@ -365,7 +370,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 			compat_irq_chip_set_default_handler(desc);
 
 		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
-				  IRQ_INPROGRESS);
+				  IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
 
 		if (!(desc->status & IRQ_NOAUTOEN)) {
 			desc->depth = 0;
@@ -381,6 +386,16 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	/* Reset broken irq detection when installing new handler */
 	desc->irq_count = 0;
 	desc->irqs_unhandled = 0;
+
+	/*
+	 * Check whether we disabled the irq via the spurious handler
+	 * before. Reenable it and give it another chance.
+	 */
+	if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
+		desc->status &= ~IRQ_SPURIOUS_DISABLED;
+		__enable_irq(desc, irq);
+	}
+
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	new->irq = irq;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 088dabbf2d6a..c66d3f10e853 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -209,8 +209,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		 * Now kill the IRQ
 		 */
 		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
-		desc->status |= IRQ_DISABLED;
-		desc->depth = 1;
+		desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
+		desc->depth++;
 		desc->chip->disable(irq);
 	}
 	desc->irqs_unhandled = 0;
-- 
cgit v1.3-14-g43fede


From b9095fd8a7f41dc7ac0b0b7864f74766a3056f96 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 2 May 2008 16:18:42 -0700
Subject: Make constants in kernel/timeconst.h fixed 64 bits

Force constants in kernel/timeconst.h (except shift counts) to be 64 bits,
using U64_C() constructor macros, and eliminate constants that cannot
be represented at all in 64 bits.  This avoids warnings with some gcc
versions.

Drop generating 64-bit constants, since we have no real hope of
getting a full set (operation on 64-bit values requires a 128-bit
intermediate result, which gcc only supports on 64-bit platforms, and
only with libgcc support on some.)  Note that the use of these
constants does not depend on if we are on a 32- or 64-bit architecture.

This resolves Bugzilla 10153.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/time.c       |   8 ++--
 kernel/timeconst.pl | 120 +++++++++++++++++++++-------------------------------
 2 files changed, 52 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index cbe0d5a222ff..6a08660b4fac 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -246,7 +246,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
 	return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
 #else
 # if BITS_PER_LONG == 32
-	return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
+	return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
 # else
 	return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
 # endif
@@ -262,7 +262,7 @@ unsigned int inline jiffies_to_usecs(const unsigned long j)
 	return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
 #else
 # if BITS_PER_LONG == 32
-	return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
+	return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
 # else
 	return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
 # endif
@@ -476,7 +476,7 @@ unsigned long msecs_to_jiffies(const unsigned int m)
 	if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
 		return MAX_JIFFY_OFFSET;
 
-	return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
+	return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
 		>> MSEC_TO_HZ_SHR32;
 #endif
 }
@@ -491,7 +491,7 @@ unsigned long usecs_to_jiffies(const unsigned int u)
 #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
 	return u * (HZ / USEC_PER_SEC);
 #else
-	return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
+	return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
 		>> USEC_TO_HZ_SHR32;
 #endif
 }
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 41468035473c..eb51d76e058a 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 # -----------------------------------------------------------------------
 #
-#   Copyright 2007 rPath, Inc. - All Rights Reserved
+#   Copyright 2007-2008 rPath, Inc. - All Rights Reserved
 #
 #   This file is part of the Linux kernel, and is made available under
 #   the terms of the GNU General Public License version 2 or (at your
@@ -20,198 +20,138 @@
 %canned_values = (
 	24 => [
 		'0xa6aaaaab','0x2aaaaaa',26,
-		'0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58,
 		125,3,
 		'0xc49ba5e4','0x1fbe76c8b4',37,
-		'0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69,
 		3,125,
 		'0xa2c2aaab','0xaaaa',16,
-		'0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48,
 		125000,3,
 		'0xc9539b89','0x7fffbce4217d',47,
-		'0xc9539b8887229e91','0x7fffbce4217d2849cb25',79,
 		3,125000,
 	], 32 => [
 		'0xfa000000','0x6000000',27,
-		'0xfa00000000000000','0x600000000000000',59,
 		125,4,
 		'0x83126e98','0xfdf3b645a',36,
-		'0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68,
 		4,125,
 		'0xf4240000','0x0',17,
-		'0xf424000000000000','0x0',49,
 		31250,1,
 		'0x8637bd06','0x3fff79c842fa',46,
-		'0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78,
 		1,31250,
 	], 48 => [
 		'0xa6aaaaab','0x6aaaaaa',27,
-		'0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59,
 		125,6,
 		'0xc49ba5e4','0xfdf3b645a',36,
-		'0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68,
 		6,125,
 		'0xa2c2aaab','0x15555',17,
-		'0xa2c2aaaaaaaaaaab','0x1555555555555',49,
 		62500,3,
 		'0xc9539b89','0x3fffbce4217d',46,
-		'0xc9539b8887229e91','0x3fffbce4217d2849cb25',78,
 		3,62500,
 	], 64 => [
 		'0xfa000000','0xe000000',28,
-		'0xfa00000000000000','0xe00000000000000',60,
 		125,8,
 		'0x83126e98','0x7ef9db22d',35,
-		'0x83126e978d4fdf3c','0x7ef9db22d0e560418',67,
 		8,125,
 		'0xf4240000','0x0',18,
-		'0xf424000000000000','0x0',50,
 		15625,1,
 		'0x8637bd06','0x1fff79c842fa',45,
-		'0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77,
 		1,15625,
 	], 100 => [
 		'0xa0000000','0x0',28,
-		'0xa000000000000000','0x0',60,
 		10,1,
 		'0xcccccccd','0x733333333',35,
-		'0xcccccccccccccccd','0x73333333333333333',67,
 		1,10,
 		'0x9c400000','0x0',18,
-		'0x9c40000000000000','0x0',50,
 		10000,1,
 		'0xd1b71759','0x1fff2e48e8a7',45,
-		'0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77,
 		1,10000,
 	], 122 => [
 		'0x8325c53f','0xfbcda3a',28,
-		'0x8325c53ef368eb05','0xfbcda3ac10c9714',60,
 		500,61,
 		'0xf9db22d1','0x7fbe76c8b',35,
-		'0xf9db22d0e560418a','0x7fbe76c8b43958106',67,
 		61,500,
 		'0x8012e2a0','0x3ef36',18,
-		'0x8012e29f79b47583','0x3ef368eb04325',50,
 		500000,61,
 		'0xffda4053','0x1ffffbce4217',45,
-		'0xffda4052d666a983','0x1ffffbce4217d2849cb2',77,
 		61,500000,
 	], 128 => [
 		'0xfa000000','0x1e000000',29,
-		'0xfa00000000000000','0x1e00000000000000',61,
 		125,16,
 		'0x83126e98','0x3f7ced916',34,
-		'0x83126e978d4fdf3c','0x3f7ced916872b020c',66,
 		16,125,
 		'0xf4240000','0x40000',19,
-		'0xf424000000000000','0x4000000000000',51,
 		15625,2,
 		'0x8637bd06','0xfffbce4217d',44,
-		'0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76,
 		2,15625,
 	], 200 => [
 		'0xa0000000','0x0',29,
-		'0xa000000000000000','0x0',61,
 		5,1,
 		'0xcccccccd','0x333333333',34,
-		'0xcccccccccccccccd','0x33333333333333333',66,
 		1,5,
 		'0x9c400000','0x0',19,
-		'0x9c40000000000000','0x0',51,
 		5000,1,
 		'0xd1b71759','0xfff2e48e8a7',44,
-		'0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76,
 		1,5000,
 	], 250 => [
 		'0x80000000','0x0',29,
-		'0x8000000000000000','0x0',61,
 		4,1,
 		'0x80000000','0x180000000',33,
-		'0x8000000000000000','0x18000000000000000',65,
 		1,4,
 		'0xfa000000','0x0',20,
-		'0xfa00000000000000','0x0',52,
 		4000,1,
 		'0x83126e98','0x7ff7ced9168',43,
-		'0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75,
 		1,4000,
 	], 256 => [
 		'0xfa000000','0x3e000000',30,
-		'0xfa00000000000000','0x3e00000000000000',62,
 		125,32,
 		'0x83126e98','0x1fbe76c8b',33,
-		'0x83126e978d4fdf3c','0x1fbe76c8b43958106',65,
 		32,125,
 		'0xf4240000','0xc0000',20,
-		'0xf424000000000000','0xc000000000000',52,
 		15625,4,
 		'0x8637bd06','0x7ffde7210be',43,
-		'0x8637bd05af6c69b6','0x7ffde7210be9424e592',75,
 		4,15625,
 	], 300 => [
 		'0xd5555556','0x2aaaaaaa',30,
-		'0xd555555555555556','0x2aaaaaaaaaaaaaaa',62,
 		10,3,
 		'0x9999999a','0x1cccccccc',33,
-		'0x999999999999999a','0x1cccccccccccccccc',65,
 		3,10,
 		'0xd0555556','0xaaaaa',20,
-		'0xd055555555555556','0xaaaaaaaaaaaaa',52,
 		10000,3,
 		'0x9d495183','0x7ffcb923a29',43,
-		'0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75,
 		3,10000,
 	], 512 => [
 		'0xfa000000','0x7e000000',31,
-		'0xfa00000000000000','0x7e00000000000000',63,
 		125,64,
 		'0x83126e98','0xfdf3b645',32,
-		'0x83126e978d4fdf3c','0xfdf3b645a1cac083',64,
 		64,125,
 		'0xf4240000','0x1c0000',21,
-		'0xf424000000000000','0x1c000000000000',53,
 		15625,8,
 		'0x8637bd06','0x3ffef39085f',42,
-		'0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74,
 		8,15625,
 	], 1000 => [
 		'0x80000000','0x0',31,
-		'0x8000000000000000','0x0',63,
 		1,1,
 		'0x80000000','0x0',31,
-		'0x8000000000000000','0x0',63,
 		1,1,
 		'0xfa000000','0x0',22,
-		'0xfa00000000000000','0x0',54,
 		1000,1,
 		'0x83126e98','0x1ff7ced9168',41,
-		'0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73,
 		1,1000,
 	], 1024 => [
 		'0xfa000000','0xfe000000',32,
-		'0xfa00000000000000','0xfe00000000000000',64,
 		125,128,
 		'0x83126e98','0x7ef9db22',31,
-		'0x83126e978d4fdf3c','0x7ef9db22d0e56041',63,
 		128,125,
 		'0xf4240000','0x3c0000',22,
-		'0xf424000000000000','0x3c000000000000',54,
 		15625,16,
 		'0x8637bd06','0x1fff79c842f',41,
-		'0x8637bd05af6c69b6','0x1fff79c842fa5093964',73,
 		16,15625,
 	], 1200 => [
 		'0xd5555556','0xd5555555',32,
-		'0xd555555555555556','0xd555555555555555',64,
 		5,6,
 		'0x9999999a','0x66666666',31,
-		'0x999999999999999a','0x6666666666666666',63,
 		6,5,
 		'0xd0555556','0x2aaaaa',22,
-		'0xd055555555555556','0x2aaaaaaaaaaaaa',54,
 		2500,3,
 		'0x9d495183','0x1ffcb923a29',41,
-		'0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73,
 		3,2500,
 	]
 );
@@ -264,6 +204,15 @@ sub fmuls($$$) {
 	return 0;
 }
 
+# Generate a hex value if the result fits in 64 bits;
+# otherwise skip.
+sub bignum_hex($) {
+	my($x) = @_;
+	my $s = $x->as_hex();
+
+	return (length($s) > 18) ? undef : $s;
+}
+
 # Provides mul, adj, and shr factors for a specific
 # (bit, time, hz) combination
 sub muladj($$$) {
@@ -271,7 +220,7 @@ sub muladj($$$) {
 	my $s = fmuls($b, $t, $hz);
 	my $m = fmul($s, $t, $hz);
 	my $a = fadj($s, $t, $hz);
-	return ($m->as_hex(), $a->as_hex(), $s);
+	return (bignum_hex($m), bignum_hex($a), $s);
 }
 
 # Provides numerator, denominator values
@@ -288,12 +237,10 @@ sub conversions($$) {
 
 	# HZ_TO_xx
 	push(@val, muladj(32, $t, $hz));
-	push(@val, muladj(64, $t, $hz));
 	push(@val, numden($t, $hz));
 
 	# xx_TO_HZ
 	push(@val, muladj(32, $hz, $t));
-	push(@val, muladj(64, $hz, $t));
 	push(@val, numden($hz, $t));
 
 	return @val;
@@ -318,6 +265,19 @@ sub compute_values($) {
 	return @val;
 }
 
+sub outputval($$)
+{
+	my($name, $val) = @_;
+	my $csuf;
+
+	if (defined($val)) {
+	    if ($name !~ /SHR/) {
+		$val = "U64_C($val)";
+	    }
+	    printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
+	}
+}
+
 sub output($@)
 {
 	my($hz, @val) = @_;
@@ -331,6 +291,7 @@ sub output($@)
 	print "\n";
 
 	print "#include <linux/param.h>\n";
+	print "#include <linux/types.h>\n";
 
 	print "\n";
 	print "#if HZ != $hz\n";
@@ -340,15 +301,13 @@ sub output($@)
 
 	foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
 		      'HZ_TO_USEC','USEC_TO_HZ') {
-		foreach $bit (32, 64) {
+		foreach $bit (32) {
 			foreach $suf ('MUL', 'ADJ', 'SHR') {
-				printf "#define %-23s %s\n",
-					"${pfx}_$suf$bit", shift(@val);
+				outputval("${pfx}_$suf$bit", shift(@val));
 			}
 		}
 		foreach $suf ('NUM', 'DEN') {
-			printf "#define %-23s %s\n",
-				"${pfx}_$suf", shift(@val);
+			outputval("${pfx}_$suf", shift(@val));
 		}
 	}
 
@@ -356,6 +315,23 @@ sub output($@)
 	print "#endif /* KERNEL_TIMECONST_H */\n";
 }
 
+# Pretty-print Perl values
+sub perlvals(@) {
+	my $v;
+	my @l = ();
+
+	foreach $v (@_) {
+		if (!defined($v)) {
+			push(@l, 'undef');
+		} elsif ($v =~ /^0x/) {
+			push(@l, "\'".$v."\'");
+		} else {
+			push(@l, $v.'');
+		}
+	}
+	return join(',', @l);
+}
+
 ($hz) = @ARGV;
 
 # Use this to generate the %canned_values structure
@@ -373,15 +349,15 @@ if ($hz eq '--can') {
 		print "$pf$hz => [\n";
 		while (scalar(@values)) {
 			my $bit;
-			foreach $bit (32, 64) {
+			foreach $bit (32) {
 				my $m = shift(@values);
 				my $a = shift(@values);
 				my $s = shift(@values);
-				print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n";
+				print "\t\t", perlvals($m,$a,$s), ",\n";
 			}
 			my $n = shift(@values);
 			my $d = shift(@values);
-			print "\t\t",$n,',',$d,",\n";
+			print "\t\t", perlvals($n,$d), ",\n";
 		}
 		print "\t]";
 		$pf = ', ';
-- 
cgit v1.3-14-g43fede


From 4346f65426cbceb64794b468e4af6f5632d58c5e Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <oliver@hartkopp.net>
Date: Wed, 30 Apr 2008 23:04:37 +0200
Subject: hrtimer: remove duplicate helper function

The helper function hrtimer_callback_running() is used in
kernel/hrtimer.c as well as in the updated net/can/bcm.c which now
supports hrtimers. Moving the helper function to hrtimer.h removes the
duplicate definition in the C-files.

Signed-off-by: Oliver Hartkopp <oliver@hartkopp.net>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 9 +++++++++
 kernel/hrtimer.c        | 9 ---------
 net/can/bcm.c           | 6 ------
 3 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 31a4d653389f..6d93dce61cbb 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -316,6 +316,15 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
 		(HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING);
 }
 
+/*
+ * Helper function to check, whether the timer is running the callback
+ * function
+ */
+static inline int hrtimer_callback_running(struct hrtimer *timer)
+{
+	return timer->state & HRTIMER_STATE_CALLBACK;
+}
+
 /* Forward a hrtimer so it expires after now: */
 extern u64
 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9af1d6a8095e..421be5fe5cc7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -153,15 +153,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 		ktime_add(xtim, tomono);
 }
 
-/*
- * Helper function to check, whether the timer is running the callback
- * function
- */
-static inline int hrtimer_callback_running(struct hrtimer *timer)
-{
-	return timer->state & HRTIMER_STATE_CALLBACK;
-}
-
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 74fd2d33aff4..d9a3a9d13bed 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -412,12 +412,6 @@ static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
 	bcm_send_to_user(op, &head, data, 1);
 }
 
-/* TODO: move to linux/hrtimer.h */
-static inline int hrtimer_callback_running(struct hrtimer *timer)
-{
-        return timer->state & HRTIMER_STATE_CALLBACK;
-}
-
 /*
  * bcm_rx_update_and_send - process a detected relevant receive content change
  *                          1. update the last received data
-- 
cgit v1.3-14-g43fede


From 4359a023a8c3b247b348c310bf510b23f3c1ab64 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 2 May 2008 12:49:40 +0200
Subject: clocksource: Fix permissions for available_clocksource

File permissions for
/sys/devices/system/clocksource/clocksource0/available_clocksource
are 600 which allows write access. But this is in fact a read only
file. So change permissions to 400.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 73961f35fdc8..83221ed76e64 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -474,7 +474,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
 static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
 		   sysfs_override_clocksource);
 
-static SYSDEV_ATTR(available_clocksource, 0600,
+static SYSDEV_ATTR(available_clocksource, 0400,
 		   sysfs_show_available_clocksources, NULL);
 
 static struct sysdev_class clocksource_sysclass = {
-- 
cgit v1.3-14-g43fede


From 4f95f81a48623982879f4fa80c641933444afd18 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 3 May 2008 14:23:14 +0200
Subject: clocksource: allow read access to available/current_clocksource

There is no harm, when users can read the info and we ask often enough
during debugging for this kind of information.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 83221ed76e64..dadde5361f32 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -471,10 +471,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
 /*
  * Sysfs setup bits:
  */
-static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
+static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
 		   sysfs_override_clocksource);
 
-static SYSDEV_ATTR(available_clocksource, 0400,
+static SYSDEV_ATTR(available_clocksource, 0444,
 		   sysfs_show_available_clocksources, NULL);
 
 static struct sysdev_class clocksource_sysclass = {
-- 
cgit v1.3-14-g43fede


From 826e4506a0acb6487910a5ebafe839f708a00e1c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 4 May 2008 17:04:16 -0700
Subject: Make forced module loading optional

The kernel module loader used to be much too happy to allow loading of
modules for the wrong kernel version by default.  For example, if you
had MODVERSIONS enabled, but tried to load a module with no version
info, it would happily load it and taint the kernel - whether it was
likely to actually work or not!

Generally, such forced module loading should be considered a really
really bad idea, so make it conditional on a new config option
(MODULE_FORCE_LOAD), and make it default to off.

If somebody really wants to force module loads, that's their problem,
but we should not encourage it.  Especially as it happened to me by
mistake (ie regular unversioned Fedora modules getting loaded) causing
lots of strange behavior.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig    |  9 +++++++++
 kernel/module.c | 44 +++++++++++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index 6a44defac3ec..f0e62e5ce0dc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -825,6 +825,15 @@ menuconfig MODULES
 
 	  If unsure, say Y.
 
+config MODULE_FORCE_LOAD
+	bool "Forced module loading"
+	depends on MODULES
+	default n
+	help
+	  This option allows loading of modules even if that would set the
+          'F' (forced) taint, due to lack of version info.  Which is
+	  usually a really bad idea.
+
 config MODULE_UNLOAD
 	bool "Module unloading"
 	depends on MODULES
diff --git a/kernel/module.c b/kernel/module.c
index 8674a390a2e8..8e4528c9909f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -890,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = {
 
 static const char vermagic[] = VERMAGIC_STRING;
 
+static int try_to_force_load(struct module *mod, const char *symname)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+	if (!(tainted & TAINT_FORCED_MODULE))
+		printk("%s: no version for \"%s\" found: kernel tainted.\n",
+		       mod->name, symname);
+	add_taint_module(mod, TAINT_FORCED_MODULE);
+	return 0;
+#else
+	return -ENOEXEC;
+#endif
+}
+
 #ifdef CONFIG_MODVERSIONS
 static int check_version(Elf_Shdr *sechdrs,
 			 unsigned int versindex,
@@ -914,18 +927,18 @@ static int check_version(Elf_Shdr *sechdrs,
 
 		if (versions[i].crc == *crc)
 			return 1;
-		printk("%s: disagrees about version of symbol %s\n",
-		       mod->name, symname);
 		DEBUGP("Found checksum %lX vs module %lX\n",
 		       *crc, versions[i].crc);
-		return 0;
+		goto bad_version;
 	}
-	/* Not in module's version table.  OK, but that taints the kernel. */
-	if (!(tainted & TAINT_FORCED_MODULE))
-		printk("%s: no version for \"%s\" found: kernel tainted.\n",
-		       mod->name, symname);
-	add_taint_module(mod, TAINT_FORCED_MODULE);
-	return 1;
+
+	if (!try_to_force_load(mod, symname))
+		return 1;
+
+bad_version:
+	printk("%s: disagrees about version of symbol %s\n",
+	       mod->name, symname);
+	return 0;
 }
 
 static inline int check_modstruct_version(Elf_Shdr *sechdrs,
@@ -1853,9 +1866,9 @@ static struct module *load_module(void __user *umod,
 	modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
 	/* This is allowed: modprobe --force will invalidate it. */
 	if (!modmagic) {
-		add_taint_module(mod, TAINT_FORCED_MODULE);
-		printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
-		       mod->name);
+		err = try_to_force_load(mod, "magic");
+		if (err)
+			goto free_hdr;
 	} else if (!same_magic(modmagic, vermagic)) {
 		printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
 		       mod->name, modmagic, vermagic);
@@ -2006,9 +2019,10 @@ static struct module *load_module(void __user *umod,
 	    (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
 	    (mod->num_unused_syms && !unusedcrcindex) ||
 	    (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
-		printk(KERN_WARNING "%s: No versions for exported symbols."
-		       " Tainting kernel.\n", mod->name);
-		add_taint_module(mod, TAINT_FORCED_MODULE);
+		printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
+		err = try_to_force_load(mod, "nocrc");
+		if (err)
+			goto cleanup;
 	}
 #endif
 	markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
-- 
cgit v1.3-14-g43fede


From 688b744d8bc84dc5cc646e97509113dc5e8818ed Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Thu, 24 Apr 2008 16:57:23 -0500
Subject: kgdb: fix signedness mixmatches, add statics, add declaration to
 header

Noticed by sparse:
arch/x86/kernel/kgdb.c:556:15: warning: symbol 'kgdb_arch_pc' was not declared. Should it be static?
kernel/kgdb.c:149:8: warning: symbol 'kgdb_do_roundup' was not declared. Should it be static?
kernel/kgdb.c:193:22: warning: symbol 'kgdb_arch_pc' was not declared. Should it be static?
kernel/kgdb.c:712:5: warning: symbol 'remove_all_break' was not declared. Should it be static?

Related to kgdb_hex2long:
arch/x86/kernel/kgdb.c:371:28: warning: incorrect type in argument 2 (different signedness)
arch/x86/kernel/kgdb.c:371:28:    expected long *long_val
arch/x86/kernel/kgdb.c:371:28:    got unsigned long *<noident>
kernel/kgdb.c:469:27: warning: incorrect type in argument 2 (different signedness)
kernel/kgdb.c:469:27:    expected long *long_val
kernel/kgdb.c:469:27:    got unsigned long *<noident>
kernel/kgdb.c:470:27: warning: incorrect type in argument 2 (different signedness)
kernel/kgdb.c:470:27:    expected long *long_val
kernel/kgdb.c:470:27:    got unsigned long *<noident>
kernel/kgdb.c:894:27: warning: incorrect type in argument 2 (different signedness)
kernel/kgdb.c:894:27:    expected long *long_val
kernel/kgdb.c:894:27:    got unsigned long *<noident>
kernel/kgdb.c:895:27: warning: incorrect type in argument 2 (different signedness)
kernel/kgdb.c:895:27:    expected long *long_val
kernel/kgdb.c:895:27:    got unsigned long *<noident>
kernel/kgdb.c:1127:28: warning: incorrect type in argument 2 (different signedness)
kernel/kgdb.c:1127:28:    expected long *long_val
kernel/kgdb.c:1127:28:    got unsigned long *<noident>
kernel/kgdb.c:1132:25: warning: incorrect type in argument 2 (different signedness)
kernel/kgdb.c:1132:25:    expected long *long_val
kernel/kgdb.c:1132:25:    got unsigned long *<noident>

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 include/linux/kgdb.h | 4 +++-
 kernel/kgdb.c        | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 9757b1a6d9dc..6adcc297e354 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -261,10 +261,12 @@ struct kgdb_io {
 
 extern struct kgdb_arch		arch_kgdb_ops;
 
+extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs);
+
 extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops);
 extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops);
 
-extern int kgdb_hex2long(char **ptr, long *long_val);
+extern int kgdb_hex2long(char **ptr, unsigned long *long_val);
 extern int kgdb_mem2hex(char *mem, char *buf, int count);
 extern int kgdb_hex2mem(char *buf, char *mem, int count);
 
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 1bd0ec1c80b2..39e31a036f5b 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -61,7 +61,7 @@ struct kgdb_state {
 	int			err_code;
 	int			cpu;
 	int			pass_exception;
-	long			threadid;
+	unsigned long		threadid;
 	long			kgdb_usethreadid;
 	struct pt_regs		*linux_regs;
 };
@@ -146,7 +146,7 @@ atomic_t			kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
  * the other CPUs might interfere with your debugging context, so
  * use this with care:
  */
-int				kgdb_do_roundup = 1;
+static int kgdb_do_roundup = 1;
 
 static int __init opt_nokgdbroundup(char *str)
 {
@@ -438,7 +438,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
  * While we find nice hex chars, build a long_val.
  * Return number of chars processed.
  */
-int kgdb_hex2long(char **ptr, long *long_val)
+int kgdb_hex2long(char **ptr, unsigned long *long_val)
 {
 	int hex_val;
 	int num = 0;
@@ -709,7 +709,7 @@ int kgdb_isremovedbreak(unsigned long addr)
 	return 0;
 }
 
-int remove_all_break(void)
+static int remove_all_break(void)
 {
 	unsigned long addr;
 	int error;
-- 
cgit v1.3-14-g43fede


From 82af7aca56c67061420d618cc5a30f0fd4106b80 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Fri, 25 Jan 2008 10:40:46 +0100
Subject: Removal of FUTEX_FD

Since FUTEX_FD was scheduled for removal in June 2007 lets remove it.

Google Code search found no users for it and NGPT was abandoned in 2003
according to IBM.  futex.h is left untouched to make sure the id does
not get reassigned.  Since queue_me() has no users left it is commented
out to avoid a warning, i didnt remove it completely since it is part of
the internal api (matching unqueue_me())

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (removed rest)
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/futex.c | 176 ++-------------------------------------------------------
 1 file changed, 6 insertions(+), 170 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 98092c9817f4..449def8074fe 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -104,10 +104,6 @@ struct futex_q {
 	/* Key which the futex is hashed on: */
 	union futex_key key;
 
-	/* For fd, sigio sent using these: */
-	int fd;
-	struct file *filp;
-
 	/* Optional priority inheritance state: */
 	struct futex_pi_state *pi_state;
 	struct task_struct *task;
@@ -126,9 +122,6 @@ struct futex_hash_bucket {
 
 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 
-/* Futex-fs vfsmount entry: */
-static struct vfsmount *futex_mnt;
-
 /*
  * Take mm->mmap_sem, when futex is shared
  */
@@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 static void wake_futex(struct futex_q *q)
 {
 	plist_del(&q->list, &q->list.plist);
-	if (q->filp)
-		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
 	/*
 	 * The lock in wake_up_all() is a crucial memory barrier after the
 	 * plist_del() and also before assigning to q->lock_ptr.
@@ -988,14 +979,10 @@ out:
 }
 
 /* The key must be already stored in q->key. */
-static inline struct futex_hash_bucket *
-queue_lock(struct futex_q *q, int fd, struct file *filp)
+static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 {
 	struct futex_hash_bucket *hb;
 
-	q->fd = fd;
-	q->filp = filp;
-
 	init_waitqueue_head(&q->waiters);
 
 	get_futex_key_refs(&q->key);
@@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
 	return hb;
 }
 
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
 	int prio;
 
@@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
  * exactly once.  They are called with the hashed spinlock held.
  */
 
-/* The key must be already stored in q->key. */
-static void queue_me(struct futex_q *q, int fd, struct file *filp)
-{
-	struct futex_hash_bucket *hb;
-
-	hb = queue_lock(q, fd, filp);
-	__queue_me(q, hb);
-}
-
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
@@ -1194,7 +1172,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
-	hb = queue_lock(&q, -1, NULL);
+	hb = queue_lock(&q);
 
 	/*
 	 * Access the page AFTER the futex is queued.
@@ -1238,7 +1216,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 		goto out_unlock_release_sem;
 
 	/* Only actually queue if *uaddr contained val.  */
-	__queue_me(&q, hb);
+	queue_me(&q, hb);
 
 	/*
 	 * Now the futex is queued and we have checked the data, we
@@ -1386,7 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 		goto out_release_sem;
 
  retry_unlocked:
-	hb = queue_lock(&q, -1, NULL);
+	hb = queue_lock(&q);
 
  retry_locked:
 	ret = lock_taken = 0;
@@ -1499,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	/*
 	 * Only actually queue now that the atomic ops are done:
 	 */
-	__queue_me(&q, hb);
+	queue_me(&q, hb);
 
 	/*
 	 * Now the futex is queued and we have checked the data, we
@@ -1746,121 +1724,6 @@ pi_faulted:
 	return ret;
 }
 
-static int futex_close(struct inode *inode, struct file *filp)
-{
-	struct futex_q *q = filp->private_data;
-
-	unqueue_me(q);
-	kfree(q);
-
-	return 0;
-}
-
-/* This is one-shot: once it's gone off you need a new fd */
-static unsigned int futex_poll(struct file *filp,
-			       struct poll_table_struct *wait)
-{
-	struct futex_q *q = filp->private_data;
-	int ret = 0;
-
-	poll_wait(filp, &q->waiters, wait);
-
-	/*
-	 * plist_node_empty() is safe here without any lock.
-	 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
-	 */
-	if (plist_node_empty(&q->list))
-		ret = POLLIN | POLLRDNORM;
-
-	return ret;
-}
-
-static const struct file_operations futex_fops = {
-	.release	= futex_close,
-	.poll		= futex_poll,
-};
-
-/*
- * Signal allows caller to avoid the race which would occur if they
- * set the sigio stuff up afterwards.
- */
-static int futex_fd(u32 __user *uaddr, int signal)
-{
-	struct futex_q *q;
-	struct file *filp;
-	int ret, err;
-	struct rw_semaphore *fshared;
-	static unsigned long printk_interval;
-
-	if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
-		printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
-		       "will be removed from the kernel in June 2007\n",
-		       current->comm);
-	}
-
-	ret = -EINVAL;
-	if (!valid_signal(signal))
-		goto out;
-
-	ret = get_unused_fd();
-	if (ret < 0)
-		goto out;
-	filp = get_empty_filp();
-	if (!filp) {
-		put_unused_fd(ret);
-		ret = -ENFILE;
-		goto out;
-	}
-	filp->f_op = &futex_fops;
-	filp->f_path.mnt = mntget(futex_mnt);
-	filp->f_path.dentry = dget(futex_mnt->mnt_root);
-	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
-
-	if (signal) {
-		err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
-		if (err < 0) {
-			goto error;
-		}
-		filp->f_owner.signum = signal;
-	}
-
-	q = kmalloc(sizeof(*q), GFP_KERNEL);
-	if (!q) {
-		err = -ENOMEM;
-		goto error;
-	}
-	q->pi_state = NULL;
-
-	fshared = &current->mm->mmap_sem;
-	down_read(fshared);
-	err = get_futex_key(uaddr, fshared, &q->key);
-
-	if (unlikely(err != 0)) {
-		up_read(fshared);
-		kfree(q);
-		goto error;
-	}
-
-	/*
-	 * queue_me() must be called before releasing mmap_sem, because
-	 * key->shared.inode needs to be referenced while holding it.
-	 */
-	filp->private_data = q;
-
-	queue_me(q, ret, filp);
-	up_read(fshared);
-
-	/* Now we map fd to filp, so userspace can access it */
-	fd_install(ret, filp);
-out:
-	return ret;
-error:
-	put_unused_fd(ret);
-	put_filp(filp);
-	ret = err;
-	goto out;
-}
-
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
@@ -2092,10 +1955,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	case FUTEX_WAKE_BITSET:
 		ret = futex_wake(uaddr, fshared, val, val3);
 		break;
-	case FUTEX_FD:
-		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
-		ret = futex_fd(uaddr, val);
-		break;
 	case FUTEX_REQUEUE:
 		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
 		break;
@@ -2156,19 +2015,6 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
-static int futexfs_get_sb(struct file_system_type *fs_type,
-			  int flags, const char *dev_name, void *data,
-			  struct vfsmount *mnt)
-{
-	return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type futex_fs_type = {
-	.name		= "futexfs",
-	.get_sb		= futexfs_get_sb,
-	.kill_sb	= kill_anon_super,
-};
-
 static int __init futex_init(void)
 {
 	u32 curval;
@@ -2193,16 +2039,6 @@ static int __init futex_init(void)
 		spin_lock_init(&futex_queues[i].lock);
 	}
 
-	i = register_filesystem(&futex_fs_type);
-	if (i)
-		return i;
-
-	futex_mnt = kern_mount(&futex_fs_type);
-	if (IS_ERR(futex_mnt)) {
-		unregister_filesystem(&futex_fs_type);
-		return PTR_ERR(futex_mnt);
-	}
-
 	return 0;
 }
 __initcall(futex_init);
-- 
cgit v1.3-14-g43fede


From a992241de614dd2b7c97a9ba64e28c0e563f19bf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 5 May 2008 23:56:17 +0200
Subject: sched: fix normalized sleeper

Normalized sleeper uses calc_delta*() which requires that the rq load is
already updated, so move account_entity_enqueue() before place_entity()

Tested-by: Frans Pop <elendil@planet.nl>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 89fa32b4edf2..1295ddc5656b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	account_entity_enqueue(cfs_rq, se);
 
 	if (wakeup) {
 		place_entity(cfs_rq, se, 0);
@@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
-	account_entity_enqueue(cfs_rq, se);
 }
 
 static void update_avg(u64 *avg, u64 sample)
-- 
cgit v1.3-14-g43fede


From e05510d01ad1565e5e086a939261084d67ba2b10 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 5 May 2008 23:56:17 +0200
Subject: sched: optimize calc_delta_mine()

Joel noticed that the !lw->inv_weight contition isn't unlikely anymore so
remove the unlikely annotation. Also, remove the two div64_u64() inv_weight
calculations, which makes them rely on the calc_delta_mine() path as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Joel Schopp <jschopp@austin.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 34bcc5bc120e..00c1ba706a5a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1438,8 +1438,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 {
 	u64 tmp;
 
-	if (unlikely(!lw->inv_weight))
-		lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
+	if (!lw->inv_weight)
+		lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
 
 	tmp = (u64)delta_exec * weight;
 	/*
@@ -8025,7 +8025,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 
 	se->my_q = cfs_rq;
 	se->load.weight = tg->shares;
-	se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight);
+	se->load.inv_weight = 0;
 	se->parent = parent;
 }
 #endif
@@ -8692,7 +8692,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 		dequeue_entity(cfs_rq, se, 0);
 
 	se->load.weight = shares;
-	se->load.inv_weight = div64_u64((1ULL<<32), shares);
+	se->load.inv_weight = 0;
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-- 
cgit v1.3-14-g43fede


From 2abdad0a4cd8f9413f778cc998e0ee7d60b28417 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 25 Apr 2008 10:53:13 -0700
Subject: sched: make rt_sched_class, idle_sched_class static

The C files are included directly in sched.c, so they are
effectively static.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_idletask.c | 2 +-
 kernel/sched_rt.c       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 2bcafa375633..3a4f92dbbe66 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
-const struct sched_class idle_sched_class = {
+static const struct sched_class idle_sched_class = {
 	/* .next is NULL */
 	/* no enqueue/yield_task for idle tasks */
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c2730a5a4f05..dcd649588593 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1309,7 +1309,7 @@ static void set_curr_task_rt(struct rq *rq)
 	p->se.exec_start = rq->clock;
 }
 
-const struct sched_class rt_sched_class = {
+static const struct sched_class rt_sched_class = {
 	.next			= &fair_sched_class,
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
-- 
cgit v1.3-14-g43fede


From d478c2cfaa2476f8b6876f9eb4d8fddcfa986479 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sat, 26 Apr 2008 11:30:34 -0700
Subject: sched: add debug checks to idle functions

Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: "Justin Mattock" <justinmattock@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 00c1ba706a5a..ed3caf26990d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1124,6 +1124,7 @@ void sched_clock_idle_sleep_event(void)
 {
 	struct rq *rq = cpu_rq(smp_processor_id());
 
+	WARN_ON(!irqs_disabled());
 	spin_lock(&rq->lock);
 	__update_rq_clock(rq);
 	spin_unlock(&rq->lock);
@@ -1139,6 +1140,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	struct rq *rq = cpu_rq(smp_processor_id());
 	u64 now = sched_clock();
 
+	WARN_ON(!irqs_disabled());
 	rq->idle_clock += delta_ns;
 	/*
 	 * Override the previous timestamp and ignore all
-- 
cgit v1.3-14-g43fede


From 983ed7a66bcec9dc307d89dc7af47cdf209e56af Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Thu, 24 Apr 2008 18:17:55 -0700
Subject: sched: add statics, don't return void expressions

Noticed by sparse:
kernel/sched.c:760:20: warning: symbol 'sched_feat_names' was not declared. Should it be static?
kernel/sched.c:767:5: warning: symbol 'sched_feat_open' was not declared. Should it be static?
kernel/sched_fair.c:845:3: warning: returning void-valued expression
kernel/sched.c:4386:3: warning: returning void-valued expression

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 10 ++++++----
 kernel/sched_fair.c |  6 ++++--
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ed3caf26990d..d941ddc9ec1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -757,14 +757,14 @@ const_debug unsigned int sysctl_sched_features =
 #define SCHED_FEAT(name, enabled)	\
 	#name ,
 
-__read_mostly char *sched_feat_names[] = {
+static __read_mostly char *sched_feat_names[] = {
 #include "sched_features.h"
 	NULL
 };
 
 #undef SCHED_FEAT
 
-int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_open(struct inode *inode, struct file *filp)
 {
 	filp->private_data = inode->i_private;
 	return 0;
@@ -4341,8 +4341,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	struct rq *rq = this_rq();
 	cputime64_t tmp;
 
-	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
-		return account_guest_time(p, cputime);
+	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+		account_guest_time(p, cputime);
+		return;
+	}
 
 	p->stime = cputime_add(p->stime, cputime);
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1295ddc5656b..e8e5ad2614b0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * queued ticks are scheduled to match the slice, so don't bother
 	 * validating it and just reschedule.
 	 */
-	if (queued)
-		return resched_task(rq_of(cfs_rq)->curr);
+	if (queued) {
+		resched_task(rq_of(cfs_rq)->curr);
+		return;
+	}
 	/*
 	 * don't let the period tick interfere with the hrtick preemption
 	 */
-- 
cgit v1.3-14-g43fede


From 8ae121ac8666b0421aa20fd80d4597ec66fa54bc Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Wed, 23 Apr 2008 07:13:29 -0400
Subject: sched: fix RT task-wakeup logic

Dmitry Adamushko pointed out a logic error in task_wake_up_rt() where we
will always evaluate to "true".  You can find the thread here:

http://lkml.org/lkml/2008/4/22/296

In reality, we only want to try to push tasks away when a wake up request is
not going to preempt the current task.  So lets fix it.

Note: We introduce test_tsk_need_resched() instead of open-coding the flag
check so that the merge-conflict with -rt should help remind us that we
may need to support NEEDS_RESCHED_DELAYED in the future, too.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 7 ++++++-
 kernel/sched_rt.c     | 7 +++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 03c238088aee..698b5a4d25a7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1977,6 +1977,11 @@ static inline void clear_tsk_need_resched(struct task_struct *tsk)
 	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
 }
 
+static inline int test_tsk_need_resched(struct task_struct *tsk)
+{
+	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
+}
+
 static inline int signal_pending(struct task_struct *p)
 {
 	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
@@ -1991,7 +1996,7 @@ static inline int fatal_signal_pending(struct task_struct *p)
 
 static inline int need_resched(void)
 {
-	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+	return unlikely(test_tsk_need_resched(current));
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index dcd649588593..060e87b0cb1c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq)
 	}
 }
 
-
+/*
+ * If we are not running and we are not going to reschedule soon, we should
+ * try to push tasks away now
+ */
 static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
-	    (p->prio >= rq->rt.highest_prio) &&
+	    !test_tsk_need_resched(rq->curr) &&
 	    rq->rt.overloaded)
 		push_rt_tasks(rq);
 }
-- 
cgit v1.3-14-g43fede


From 104f64549c961a797ff5f7c59946a7caa335c5b0 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 28 Apr 2008 12:40:01 -0400
Subject: sched: fix SCHED_FAIR wake-idle logic error

We currently use an optimization to skip the overhead of wake-idle
processing if more than one task is assigned to a run-queue.  The
assumption is that the system must already be load-balanced or we
wouldnt be overloaded to begin with.

The problem is that we are looking at rq->nr_running, which may include
RT tasks in addition to CFS tasks.  Since the presence of RT tasks
really has no bearing on the balance status of CFS tasks, this throws
the calculation off.

This patch changes the logic to only consider the number of CFS tasks
when making the decision to optimze the wake-idle.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e8e5ad2614b0..1d5f35b4636e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1009,7 +1009,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 	 * sibling runqueue info. This will avoid the checks and cache miss
 	 * penalities associated with that.
 	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
 		return cpu;
 
 	for_each_domain(cpu, sd) {
-- 
cgit v1.3-14-g43fede


From b328ca182f01c2a04b85e0ee8a410720b104fbcc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 29 Apr 2008 10:02:46 +0200
Subject: sched: fix hrtick_start_fair and CPU-Hotplug

Gautham R Shenoy reported:

 > While running the usual CPU-Hotplug stress tests on linux-2.6.25,
 > I noticed the following in the console logs.
 >
 > This is a wee bit difficult to reproduce. In the past 10 runs I hit this
 > only once.
 >
 > ------------[ cut here ]------------
 >
 > WARNING: at kernel/sched.c:962 hrtick+0x2e/0x65()
 >
 > Just wondering if we are doing a good job at handling the cancellation
 > of any per-cpu scheduler timers during CPU-Hotplug.

This looks like its indeed not cancelled at all and migrates the it to
another cpu. Fix it via a proper hotplug notifier mechanism.

Reported-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d941ddc9ec1d..bee9cbe13c15 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1191,6 +1191,7 @@ static inline void resched_rq(struct rq *rq)
 enum {
 	HRTICK_SET,		/* re-programm hrtick_timer */
 	HRTICK_RESET,		/* not a new slice */
+	HRTICK_BLOCK,		/* stop hrtick operations */
 };
 
 /*
@@ -1202,6 +1203,8 @@ static inline int hrtick_enabled(struct rq *rq)
 {
 	if (!sched_feat(HRTICK))
 		return 0;
+	if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
+		return 0;
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
 
@@ -1284,7 +1287,63 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static inline void init_rq_hrtick(struct rq *rq)
+static void hotplug_hrtick_disable(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	rq->hrtick_flags = 0;
+	__set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	hrtick_clear(rq);
+}
+
+static void hotplug_hrtick_enable(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static int
+hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int cpu = (int)(long)hcpu;
+
+	switch (action) {
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		hotplug_hrtick_disable(cpu);
+		return NOTIFY_OK;
+
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		hotplug_hrtick_enable(cpu);
+		return NOTIFY_OK;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void init_hrtick(void)
+{
+	hotcpu_notifier(hotplug_hrtick, 0);
+}
+
+static void init_rq_hrtick(struct rq *rq)
 {
 	rq->hrtick_flags = 0;
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -1321,6 +1380,10 @@ static inline void init_rq_hrtick(struct rq *rq)
 void hrtick_resched(void)
 {
 }
+
+static inline void init_hrtick(void)
+{
+}
 #endif
 
 /*
@@ -7943,6 +8006,7 @@ void __init sched_init_smp(void)
 	put_online_cpus();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
+	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
-- 
cgit v1.3-14-g43fede


From 673a90a1e05c8127886f7659d1a457169378371f Mon Sep 17 00:00:00 2001
From: David Simner <djs203@srcf.ucam.org>
Date: Tue, 29 Apr 2008 10:08:59 +0100
Subject: sched: fix sched_info_switch not being called according to
 documentation

http://bugzilla.kernel.org/show_bug.cgi?id=10545

sched_stats.h says that __sched_info_switch is "called when prev !=
next" in the comment.  sched.c should therefore do that.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index bee9cbe13c15..3ac3d7af04a1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4662,9 +4662,9 @@ need_resched_nonpreemptible:
 	prev->sched_class->put_prev_task(rq, prev);
 	next = pick_next_task(rq, prev);
 
-	sched_info_switch(prev, next);
-
 	if (likely(prev != next)) {
+		sched_info_switch(prev, next);
+
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
-- 
cgit v1.3-14-g43fede


From d7dcdc11cfa6a8860a29b09f985467b89224699d Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 29 Apr 2008 12:23:09 +0200
Subject: sched: fix debugging

Revert debugging commit 7ba2e74ab5a0518bc953042952dd165724bc70c9.
print_cfs_rq_tasks() can induce live-lock if a task is dequeued
during list traversal.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 27 ---------------------------
 1 file changed, 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1d5f35b4636e..d99e01f6929a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1613,30 +1613,6 @@ static const struct sched_class fair_sched_class = {
 };
 
 #ifdef CONFIG_SCHED_DEBUG
-static void
-print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
-{
-	struct sched_entity *se;
-
-	if (!cfs_rq)
-		return;
-
-	list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
-		int i;
-
-		for (i = depth; i; i--)
-			seq_puts(m, "  ");
-
-		seq_printf(m, "%lu %s %lu\n",
-				se->load.weight,
-				entity_is_task(se) ? "T" : "G",
-				calc_delta_weight(SCHED_LOAD_SCALE, se)
-				);
-		if (!entity_is_task(se))
-			print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
-	}
-}
-
 static void print_cfs_stats(struct seq_file *m, int cpu)
 {
 	struct cfs_rq *cfs_rq;
@@ -1644,9 +1620,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
 	rcu_read_lock();
 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
-
-	seq_printf(m, "\nWeight tree:\n");
-	print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
 	rcu_read_unlock();
 }
 #endif
-- 
cgit v1.3-14-g43fede


From 690229a0912ca2fef8b542fe4d8b73acfcdc6e24 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 23 Apr 2008 09:31:35 +0200
Subject: sched: make clock sync tunable by architecture code

make time_sync_thresh tunable to architecture code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 2 ++
 kernel/sched.c        | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 698b5a4d25a7..54c9ca26b7d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 }
 #endif
 
+extern unsigned long long time_sync_thresh;
+
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
diff --git a/kernel/sched.c b/kernel/sched.c
index 3ac3d7af04a1..8f433fedfcb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -899,7 +899,7 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-static const unsigned long long time_sync_thresh = 100000;
+unsigned long long time_sync_thresh = 100000;
 
 static DEFINE_PER_CPU(unsigned long long, time_offset);
 static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
-- 
cgit v1.3-14-g43fede


From 712555ee4f873515612f89554ad1a3fda5fa887e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 28 Apr 2008 11:33:07 +0200
Subject: sched: fix missing locking in sched_domains code

Concurrent calls to detach_destroy_domains and arch_init_sched_domains
were prevented by the old scheduler subsystem cpu hotplug mutex. When
this got converted to get_online_cpus() the locking got broken.
Unlike before now several processes can concurrently enter the critical
sections that were protected by the old lock.

So use the already present doms_cur_mutex to protect these sections again.

Cc: Gautham R Shenoy <ego@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8f433fedfcb3..561b3b39bdb8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -242,6 +242,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 }
 #endif
 
+/*
+ * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+static DEFINE_MUTEX(sched_domains_mutex);
+
 #ifdef CONFIG_GROUP_SCHED
 
 #include <linux/cgroup.h>
@@ -308,9 +314,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  */
 static DEFINE_SPINLOCK(task_group_lock);
 
-/* doms_cur_mutex serializes access to doms_cur[] array */
-static DEFINE_MUTEX(doms_cur_mutex);
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
@@ -358,21 +361,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 #endif
 }
 
-static inline void lock_doms_cur(void)
-{
-	mutex_lock(&doms_cur_mutex);
-}
-
-static inline void unlock_doms_cur(void)
-{
-	mutex_unlock(&doms_cur_mutex);
-}
-
 #else
 
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline void lock_doms_cur(void) { }
-static inline void unlock_doms_cur(void) { }
 
 #endif	/* CONFIG_GROUP_SCHED */
 
@@ -7822,7 +7813,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 {
 	int i, j;
 
-	lock_doms_cur();
+	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
@@ -7871,7 +7862,7 @@ match2:
 
 	register_sched_domain_sysctl();
 
-	unlock_doms_cur();
+	mutex_unlock(&sched_domains_mutex);
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7880,8 +7871,10 @@ int arch_reinit_sched_domains(void)
 	int err;
 
 	get_online_cpus();
+	mutex_lock(&sched_domains_mutex);
 	detach_destroy_domains(&cpu_online_map);
 	err = arch_init_sched_domains(&cpu_online_map);
+	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
 	return err;
@@ -7999,10 +7992,12 @@ void __init sched_init_smp(void)
 	BUG_ON(sched_group_nodes_bycpu == NULL);
 #endif
 	get_online_cpus();
+	mutex_lock(&sched_domains_mutex);
 	arch_init_sched_domains(&cpu_online_map);
 	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
 	if (cpus_empty(non_isolated_cpus))
 		cpu_set(smp_processor_id(), non_isolated_cpus);
+	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
-- 
cgit v1.3-14-g43fede


From cb4ad1ffc7c0d8ea7dc8cd8ba303d83551716d46 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 28 Apr 2008 12:54:56 +0800
Subject: sched: fair-group: fix a Div0 error of the fair group scheduler

When I echoed 0 into the "cpu.shares" file, a Div0 error occured.

We found it is caused by the following calling.

   sched_group_set_shares(tg, shares)
       set_se_shares(tg->se[i], shares/nr_cpu_ids)
           __set_se_shares(se, shares)
               div64_64((1ULL<<32), shares)

When the echoed value was less than the number of processores, the result of the
sentence "shares/nr_cpu_ids" was 0, and then the system called div64() to divide
the result, the Div0 error occured.

It is unnecessary that the shares value is divided by nr_cpu_ids, I think.
Because in the function  __update_group_shares_cpu() and init_tg_cfs_entry(),
the shares value isn't divided by nr_cpu_ids when setting shares of the sched
entity.

This patch fixes this bug. And echoing ULONG_MAX value into cpu.shares also
causes Div0 error, so we set a macro MAX_SHARES to limit the max value of
shares.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 561b3b39bdb8..f98f75f3c708 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -321,7 +321,13 @@ static DEFINE_SPINLOCK(task_group_lock);
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
+/*
+ * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
+ * (The default weight is 1024 - so there's no practical
+ *  limitation from this.)
+ */
 #define MIN_SHARES	2
+#define MAX_SHARES	(ULONG_MAX - 1)
 
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
@@ -1804,6 +1810,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
 
 	__set_se_shares(tg->se[tcpu], shares);
 }
@@ -8785,13 +8793,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	if (!tg->se[0])
 		return -EINVAL;
 
-	/*
-	 * A weight of 0 or 1 can cause arithmetics problems.
-	 * (The default weight is 1024 - so there's no practical
-	 *  limitation from this.)
-	 */
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
 
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
@@ -8816,7 +8821,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		 * force a rebalance
 		 */
 		cfs_rq_set_shares(tg->cfs_rq[i], 0);
-		set_se_shares(tg->se[i], shares/nr_cpu_ids);
+		set_se_shares(tg->se[i], shares);
 	}
 
 	/*
-- 
cgit v1.3-14-g43fede


From dfbf4a1bc319f0f9a31e39b2da1fa5c55e85af89 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 23 Apr 2008 09:24:06 +0200
Subject: sched: fix cpu clock

David Miller pointed it out that nothing in cpu_clock() sets
prev_cpu_time. This caused __sync_cpu_clock() to be called
all the time - against the intention of this code.

The result was that in practice we hit a global spinlock every
time cpu_clock() is called - which - even though cpu_clock()
is used for tracing and debugging, is suboptimal.

While at it, also:

- move the irq disabling to the outest layer,
  this should make cpu_clock() warp-free when called with irqs
  enabled.

- use long long instead of cycles_t - for platforms where cycles_t
  is 32-bit.

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f98f75f3c708..9457106b18af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -910,11 +910,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
 static DEFINE_SPINLOCK(time_sync_lock);
 static unsigned long long prev_global_time;
 
-static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
+static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&time_sync_lock, flags);
+	/*
+	 * We want this inlined, to not get tracer function calls
+	 * in this critical section:
+	 */
+	spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
+	__raw_spin_lock(&time_sync_lock.raw_lock);
 
 	if (time < prev_global_time) {
 		per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -923,7 +926,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
 		prev_global_time = time;
 	}
 
-	spin_unlock_irqrestore(&time_sync_lock, flags);
+	__raw_spin_unlock(&time_sync_lock.raw_lock);
+	spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
 
 	return time;
 }
@@ -931,7 +935,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
 static unsigned long long __cpu_clock(int cpu)
 {
 	unsigned long long now;
-	unsigned long flags;
 	struct rq *rq;
 
 	/*
@@ -941,11 +944,9 @@ static unsigned long long __cpu_clock(int cpu)
 	if (unlikely(!scheduler_running))
 		return 0;
 
-	local_irq_save(flags);
 	rq = cpu_rq(cpu);
 	update_rq_clock(rq);
 	now = rq->clock;
-	local_irq_restore(flags);
 
 	return now;
 }
@@ -957,13 +958,18 @@ static unsigned long long __cpu_clock(int cpu)
 unsigned long long cpu_clock(int cpu)
 {
 	unsigned long long prev_cpu_time, time, delta_time;
+	unsigned long flags;
 
+	local_irq_save(flags);
 	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
 	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
 	delta_time = time-prev_cpu_time;
 
-	if (unlikely(delta_time > time_sync_thresh))
+	if (unlikely(delta_time > time_sync_thresh)) {
 		time = __sync_cpu_clock(time, cpu);
+		per_cpu(prev_cpu_time, cpu) = time;
+	}
+	local_irq_restore(flags);
 
 	return time;
 }
-- 
cgit v1.3-14-g43fede


From 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 3 May 2008 18:29:28 +0200
Subject: sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK

this replaces the rq->clock stuff (and possibly cpu_clock()).

 - architectures that have an 'imperfect' hardware clock can set
   CONFIG_HAVE_UNSTABLE_SCHED_CLOCK

 - the 'jiffie' window might be superfulous when we update tick_gtod
   before the __update_sched_clock() call in sched_clock_tick()

 - cpu_clock() might be implemented as:

     sched_clock_cpu(smp_processor_id())

   if the accuracy proves good enough - how far can TSC drift in a
   single jiffie when considering the filtering and idle hooks?

[ mingo@elte.hu: various fixes and cleanups ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  29 +++++++
 init/main.c           |   1 +
 kernel/Makefile       |   2 +-
 kernel/sched.c        | 165 +++--------------------------------
 kernel/sched_clock.c  | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_debug.c  |   7 --
 kernel/sched_fair.c   |   2 +-
 7 files changed, 281 insertions(+), 161 deletions(-)
 create mode 100644 kernel/sched_clock.c

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 54c9ca26b7d8..0c35b0343a76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1553,6 +1553,35 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 
 extern unsigned long long sched_clock(void);
 
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline void sched_clock_init(void)
+{
+}
+
+static inline u64 sched_clock_cpu(int cpu)
+{
+	return sched_clock();
+}
+
+static inline void sched_clock_tick(void)
+{
+}
+
+static inline void sched_clock_idle_sleep_event(void)
+{
+}
+
+static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+}
+#else
+extern void sched_clock_init(void);
+extern u64 sched_clock_cpu(int cpu);
+extern void sched_clock_tick(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+#endif
+
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
diff --git a/init/main.c b/init/main.c
index a87d4ca5c36c..ddada7acf363 100644
--- a/init/main.c
+++ b/init/main.c
@@ -602,6 +602,7 @@ asmlinkage void __init start_kernel(void)
 	softirq_init();
 	timekeeping_init();
 	time_init();
+	sched_clock_init();
 	profile_init();
 	if (!irqs_disabled())
 		printk("start_kernel(): bug: interrupts were enabled early\n");
diff --git a/kernel/Makefile b/kernel/Makefile
index 188c43223f52..1c9938addb9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/sched.c b/kernel/sched.c
index 9457106b18af..58fb8af15776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,16 +74,6 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
-/*
- * Scheduler clock - returns current time in nanosec units.
- * This is default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
-{
-	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
-}
-
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -557,13 +547,7 @@ struct rq {
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	u64 clock, prev_clock_raw;
-	s64 clock_max_delta;
-
-	unsigned int clock_warps, clock_overflows, clock_underflows;
-	u64 idle_clock;
-	unsigned int clock_deep_idle_events;
-	u64 tick_timestamp;
+	u64 clock;
 
 	atomic_t nr_iowait;
 
@@ -628,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-#ifdef CONFIG_NO_HZ
-static inline bool nohz_on(int cpu)
-{
-	return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
-}
-
-static inline u64 max_skipped_ticks(struct rq *rq)
-{
-	return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
-}
-
-static inline void update_last_tick_seen(struct rq *rq)
-{
-	rq->last_tick_seen = jiffies;
-}
-#else
-static inline u64 max_skipped_ticks(struct rq *rq)
-{
-	return 1;
-}
-
-static inline void update_last_tick_seen(struct rq *rq)
-{
-}
-#endif
-
-/*
- * Update the per-runqueue clock, as finegrained as the platform can give
- * us, but without assuming monotonicity, etc.:
- */
-static void __update_rq_clock(struct rq *rq)
-{
-	u64 prev_raw = rq->prev_clock_raw;
-	u64 now = sched_clock();
-	s64 delta = now - prev_raw;
-	u64 clock = rq->clock;
-
-#ifdef CONFIG_SCHED_DEBUG
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-#endif
-	/*
-	 * Protect against sched_clock() occasionally going backwards:
-	 */
-	if (unlikely(delta < 0)) {
-		clock++;
-		rq->clock_warps++;
-	} else {
-		/*
-		 * Catch too large forward jumps too:
-		 */
-		u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
-		u64 max_time = rq->tick_timestamp + max_jump;
-
-		if (unlikely(clock + delta > max_time)) {
-			if (clock < max_time)
-				clock = max_time;
-			else
-				clock++;
-			rq->clock_overflows++;
-		} else {
-			if (unlikely(delta > rq->clock_max_delta))
-				rq->clock_max_delta = delta;
-			clock += delta;
-		}
-	}
-
-	rq->prev_clock_raw = now;
-	rq->clock = clock;
-}
-
-static void update_rq_clock(struct rq *rq)
-{
-	if (likely(smp_processor_id() == cpu_of(rq)))
-		__update_rq_clock(rq);
-}
-
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  * See detach_destroy_domains: synchronize_sched for details.
@@ -719,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+static inline void update_rq_clock(struct rq *rq)
+{
+	rq->clock = sched_clock_cpu(cpu_of(rq));
+}
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -935,7 +848,6 @@ static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
 static unsigned long long __cpu_clock(int cpu)
 {
 	unsigned long long now;
-	struct rq *rq;
 
 	/*
 	 * Only call sched_clock() if the scheduler has already been
@@ -944,9 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
 	if (unlikely(!scheduler_running))
 		return 0;
 
-	rq = cpu_rq(cpu);
-	update_rq_clock(rq);
-	now = rq->clock;
+	now = sched_clock_cpu(cpu);
 
 	return now;
 }
@@ -1120,45 +1030,6 @@ static struct rq *this_rq_lock(void)
 	return rq;
 }
 
-/*
- * We are going deep-idle (irqs are disabled):
- */
-void sched_clock_idle_sleep_event(void)
-{
-	struct rq *rq = cpu_rq(smp_processor_id());
-
-	WARN_ON(!irqs_disabled());
-	spin_lock(&rq->lock);
-	__update_rq_clock(rq);
-	spin_unlock(&rq->lock);
-	rq->clock_deep_idle_events++;
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
-
-/*
- * We just idled delta nanoseconds (called with irqs disabled):
- */
-void sched_clock_idle_wakeup_event(u64 delta_ns)
-{
-	struct rq *rq = cpu_rq(smp_processor_id());
-	u64 now = sched_clock();
-
-	WARN_ON(!irqs_disabled());
-	rq->idle_clock += delta_ns;
-	/*
-	 * Override the previous timestamp and ignore all
-	 * sched_clock() deltas that occured while we idled,
-	 * and use the PM-provided delta_ns to advance the
-	 * rq clock:
-	 */
-	spin_lock(&rq->lock);
-	rq->prev_clock_raw = now;
-	rq->clock += delta_ns;
-	spin_unlock(&rq->lock);
-	touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-
 static void __resched_task(struct task_struct *p, int tif_bit);
 
 static inline void resched_task(struct task_struct *p)
@@ -1283,7 +1154,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 
 	spin_lock(&rq->lock);
-	__update_rq_clock(rq);
+	update_rq_clock(rq);
 	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
 	spin_unlock(&rq->lock);
 
@@ -4476,19 +4347,11 @@ void scheduler_tick(void)
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
 	struct task_struct *curr = rq->curr;
-	u64 next_tick = rq->tick_timestamp + TICK_NSEC;
+
+	sched_clock_tick();
 
 	spin_lock(&rq->lock);
-	__update_rq_clock(rq);
-	/*
-	 * Let rq->clock advance by at least TICK_NSEC:
-	 */
-	if (unlikely(rq->clock < next_tick)) {
-		rq->clock = next_tick;
-		rq->clock_underflows++;
-	}
-	rq->tick_timestamp = rq->clock;
-	update_last_tick_seen(rq);
+	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
@@ -4642,7 +4505,7 @@ need_resched_nonpreemptible:
 	 * Do the rq-clock update outside the rq lock:
 	 */
 	local_irq_disable();
-	__update_rq_clock(rq);
+	update_rq_clock(rq);
 	spin_lock(&rq->lock);
 	clear_tsk_need_resched(prev);
 
@@ -8226,8 +8089,6 @@ void __init sched_init(void)
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
-		rq->clock = 1;
-		update_last_tick_seen(rq);
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8371,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	int on_rq;
+
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	if (on_rq)
@@ -8402,7 +8264,6 @@ void normalize_rt_tasks(void)
 		p->se.sleep_start		= 0;
 		p->se.block_start		= 0;
 #endif
-		task_rq(p)->clock		= 0;
 
 		if (!rt_task(p)) {
 			/*
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 000000000000..9c597e37f7de
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
+/*
+ * sched_clock for unstable cpu clocks
+ *
+ *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Based on code by:
+ *   Ingo Molnar <mingo@redhat.com>
+ *   Guillaume Chazarain <guichaz@gmail.com>
+ *
+ * Create a semi stable clock from a mixture of other events, including:
+ *  - gtod
+ *  - jiffies
+ *  - sched_clock()
+ *  - explicit idle events
+ *
+ * We use gtod as base and the unstable clock deltas. The deltas are filtered,
+ * making it monotonic and keeping it within an expected window.  This window
+ * is set up using jiffies.
+ *
+ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
+ * that is otherwise invisible (TSC gets stopped).
+ *
+ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
+ * consistent between cpus (never more than 1 jiffies difference).
+ */
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+
+struct sched_clock_data {
+	/*
+	 * Raw spinlock - this is a special case: this might be called
+	 * from within instrumentation code so we dont want to do any
+	 * instrumentation ourselves.
+	 */
+	raw_spinlock_t		lock;
+
+	unsigned long		prev_jiffies;
+	u64			prev_raw;
+	u64			tick_raw;
+	u64			tick_gtod;
+	u64			clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+	return &__get_cpu_var(sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+	return &per_cpu(sched_clock_data, cpu);
+}
+
+void sched_clock_init(void)
+{
+	u64 ktime_now = ktime_to_ns(ktime_get());
+	u64 now = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct sched_clock_data *scd = cpu_sdc(cpu);
+
+		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		scd->prev_jiffies = jiffies;
+		scd->prev_raw = now;
+		scd->tick_raw = now;
+		scd->tick_gtod = ktime_now;
+		scd->clock = ktime_now;
+	}
+}
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ *  - filter out backward motion
+ *  - use jiffies to generate a min,max window to clip the raw values
+ */
+static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
+{
+	unsigned long now_jiffies = jiffies;
+	long delta_jiffies = now_jiffies - scd->prev_jiffies;
+	u64 clock = scd->clock;
+	u64 min_clock, max_clock;
+	s64 delta = now - scd->prev_raw;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
+
+	if (unlikely(delta < 0)) {
+		clock++;
+		goto out;
+	}
+
+	max_clock = min_clock + TICK_NSEC;
+
+	if (unlikely(clock + delta > max_clock)) {
+		if (clock < max_clock)
+			clock = max_clock;
+		else
+			clock++;
+	} else {
+		clock += delta;
+	}
+
+ out:
+	if (unlikely(clock < min_clock))
+		clock = min_clock;
+
+	scd->prev_raw = now;
+	scd->prev_jiffies = now_jiffies;
+	scd->clock = clock;
+}
+
+static void lock_double_clock(struct sched_clock_data *data1,
+				struct sched_clock_data *data2)
+{
+	if (data1 < data2) {
+		__raw_spin_lock(&data1->lock);
+		__raw_spin_lock(&data2->lock);
+	} else {
+		__raw_spin_lock(&data2->lock);
+		__raw_spin_lock(&data1->lock);
+	}
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+	struct sched_clock_data *scd = cpu_sdc(cpu);
+	u64 now, clock;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	now = sched_clock();
+
+	if (cpu != raw_smp_processor_id()) {
+		/*
+		 * in order to update a remote cpu's clock based on our
+		 * unstable raw time rebase it against:
+		 *   tick_raw		(offset between raw counters)
+		 *   tick_gotd          (tick offset between cpus)
+		 */
+		struct sched_clock_data *my_scd = this_scd();
+
+		lock_double_clock(scd, my_scd);
+
+		now -= my_scd->tick_raw;
+		now += scd->tick_raw;
+
+		now -= my_scd->tick_gtod;
+		now += scd->tick_gtod;
+
+		__raw_spin_unlock(&my_scd->lock);
+	} else {
+		__raw_spin_lock(&scd->lock);
+	}
+
+	__update_sched_clock(scd, now);
+	clock = scd->clock;
+
+	__raw_spin_unlock(&scd->lock);
+
+	return clock;
+}
+
+void sched_clock_tick(void)
+{
+	struct sched_clock_data *scd = this_scd();
+	u64 now, now_gtod;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	now = sched_clock();
+	now_gtod = ktime_to_ns(ktime_get());
+
+	__raw_spin_lock(&scd->lock);
+	__update_sched_clock(scd, now);
+	/*
+	 * update tick_gtod after __update_sched_clock() because that will
+	 * already observe 1 new jiffy; adding a new tick_gtod to that would
+	 * increase the clock 2 jiffies.
+	 */
+	scd->tick_raw = now;
+	scd->tick_gtod = now_gtod;
+	__raw_spin_unlock(&scd->lock);
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+	sched_clock_cpu(smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	struct sched_clock_data *scd = this_scd();
+	u64 now = sched_clock();
+
+	/*
+	 * Override the previous timestamp and ignore all
+	 * sched_clock() deltas that occured while we idled,
+	 * and use the PM-provided delta_ns to advance the
+	 * rq clock:
+	 */
+	__raw_spin_lock(&scd->lock);
+	scd->prev_raw = now;
+	scd->clock += delta_ns;
+	__raw_spin_unlock(&scd->lock);
+
+	touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+#endif
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6b4a12558e88..5f06118fbc31 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
 	PN(next_balance);
 	P(curr->pid);
 	PN(clock);
-	PN(idle_clock);
-	PN(prev_clock_raw);
-	P(clock_warps);
-	P(clock_overflows);
-	P(clock_underflows);
-	P(clock_deep_idle_events);
-	PN(clock_max_delta);
 	P(cpu_load[0]);
 	P(cpu_load[1]);
 	P(cpu_load[2]);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d99e01f6929a..c863663d204d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -959,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
 		return;
 
 	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-		__update_rq_clock(rq);
+		update_rq_clock(rq);
 		/*
 		 * Update run-time statistics of the 'current'.
 		 */
-- 
cgit v1.3-14-g43fede