From c2f0c7c356dc9ae15419f00c725a2fcc58eeff58 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Fri, 6 May 2005 12:38:39 +0100
Subject: The attached patch addresses the problem with getting the audit
 daemon shutdown credential information. It creates a new message type
 AUDIT_SIGNAL_INFO, which is used by the audit daemon to query who issued the
 shutdown.

It requires the placement of a hook function that gathers the information. The
hook is after the DAC & MAC checks and before the function returns. Racing
threads could overwrite the uid & pid - but they would have to be root and
have policy that allows signalling the audit daemon. That should be a
manageable risk.

The userspace component will be released later in audit 0.7.2. When the
daemon receives the TERM signal, it queries the kernel for shutdown
information. When the reply arrives, it writes the message and exits. The
message looks like this:

type=DAEMON msg=auditd(1114551182.000) auditd normal halt, sending pid=2650
uid=525, auditd pid=1685

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c   | 14 +++++++++++++-
 kernel/auditsc.c | 19 +++++++++++++++++++
 kernel/signal.c  |  7 ++++++-
 3 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 9c4f1af0c794..6f344b44d3d3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -68,7 +68,7 @@ static int	audit_failure = AUDIT_FAIL_PRINTK;
 
 /* If audit records are to be written to the netlink socket, audit_pid
  * contains the (non-zero) pid. */
-static int	audit_pid;
+int		audit_pid;
 
 /* If audit_limit is non-zero, limit the rate of sending audit records
  * to that number per second.  This prevents DoS attacks, but results in
@@ -79,6 +79,10 @@ static int	audit_rate_limit;
 static int	audit_backlog_limit = 64;
 static atomic_t	audit_backlog	    = ATOMIC_INIT(0);
 
+/* The identity of the user shutting down the audit system. */
+uid_t		audit_sig_uid = -1;
+pid_t		audit_sig_pid = -1;
+
 /* Records can be lost in several ways:
    0) [suppressed in audit_alloc]
    1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
@@ -321,6 +325,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 	case AUDIT_SET:
 	case AUDIT_ADD:
 	case AUDIT_DEL:
+	case AUDIT_SIGNAL_INFO:
 		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
 			err = -EPERM;
 		break;
@@ -344,6 +349,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
+	struct audit_sig_info   sig_data;
 
 	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
 	if (err)
@@ -419,6 +425,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		err = -EOPNOTSUPP;
 #endif
 		break;
+	case AUDIT_SIGNAL_INFO:
+		sig_data.uid = audit_sig_uid;
+		sig_data.pid = audit_sig_pid;
+		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 
+				0, 0, &sig_data, sizeof(sig_data));
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 37b3ac94bc47..f1bf66510cd3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1056,3 +1056,22 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 	context->aux = (void *)ax;
 	return 0;
 }
+
+void audit_signal_info(int sig, struct task_struct *t)
+{
+	extern pid_t audit_sig_pid;
+	extern uid_t audit_sig_uid;
+	extern int audit_pid;
+
+	if (unlikely(audit_pid && t->pid == audit_pid)) {
+		if (sig == SIGTERM || sig == SIGHUP) {
+			struct audit_context *ctx = current->audit_context;
+			audit_sig_pid = current->pid;
+			if (ctx)
+				audit_sig_uid = ctx->loginuid;
+			else
+				audit_sig_uid = current->uid;
+		}
+	}
+}
+
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f3debc77c5b..293e189d8bc3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -24,6 +24,7 @@
 #include <linux/ptrace.h>
 #include <linux/posix-timers.h>
 #include <linux/signal.h>
+#include <linux/audit.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -658,7 +659,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
 	    && !capable(CAP_KILL))
 		return error;
-	return security_task_kill(t, info, sig);
+
+	error = security_task_kill(t, info, sig);
+	if (!error)
+		audit_signal_info(sig, t); /* Let audit system see the signal */
+	return error;
 }
 
 /* forward decl */
-- 
cgit v1.3-14-g43fede


From 16e1904e694d459ec2ca9b33c22b818eaaa4c63f Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@osdl.org>
Date: Fri, 6 May 2005 15:53:34 +0100
Subject: AUDIT: Add helper functions to allocate and free audit_buffers.

Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 61 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 38 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 6f344b44d3d3..e5bdba3e3ae1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -620,6 +620,42 @@ static int __init audit_enable(char *str)
 
 __setup("audit=", audit_enable);
 
+static void audit_buffer_free(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	atomic_dec(&audit_backlog);
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (++audit_freelist_count > AUDIT_MAXFREE)
+		kfree(ab);
+	else
+		list_add(&ab->list, &audit_freelist);
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+static struct audit_buffer * audit_buffer_alloc(int gfp_mask)
+{
+	unsigned long flags;
+	struct audit_buffer *ab = NULL;
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (!list_empty(&audit_freelist)) {
+		ab = list_entry(audit_freelist.next,
+				struct audit_buffer, list);
+		list_del(&ab->list);
+		--audit_freelist_count;
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+	if (!ab) {
+		ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+		if (!ab)
+			goto out;
+	}
+	atomic_inc(&audit_backlog);
+out:
+	return ab;
+}
 
 /* Obtain an audit buffer.  This routine does locking to obtain the
  * audit buffer, but then no locking is required for calls to
@@ -630,7 +666,6 @@ __setup("audit=", audit_enable);
 struct audit_buffer *audit_log_start(struct audit_context *ctx)
 {
 	struct audit_buffer	*ab	= NULL;
-	unsigned long		flags;
 	struct timespec		t;
 	unsigned int		serial;
 
@@ -649,23 +684,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
 		return NULL;
 	}
 
-	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (!list_empty(&audit_freelist)) {
-		ab = list_entry(audit_freelist.next,
-				struct audit_buffer, list);
-		list_del(&ab->list);
-		--audit_freelist_count;
-	}
-	spin_unlock_irqrestore(&audit_freelist_lock, flags);
-
-	if (!ab)
-		ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+	ab = audit_buffer_alloc(GFP_ATOMIC);
 	if (!ab) {
 		audit_log_lost("out of memory in audit_log_start");
 		return NULL;
 	}
 
-	atomic_inc(&audit_backlog);
 	skb_queue_head_init(&ab->sklist);
 
 	ab->ctx   = ctx;
@@ -824,8 +848,6 @@ static void audit_log_end_irq(struct audit_buffer *ab)
  * be called in an irq context. */
 static void audit_log_end_fast(struct audit_buffer *ab)
 {
-	unsigned long flags;
-
 	BUG_ON(in_irq());
 	if (!ab)
 		return;
@@ -836,14 +858,7 @@ static void audit_log_end_fast(struct audit_buffer *ab)
 		if (audit_log_drain(ab))
 			return;
 	}
-
-	atomic_dec(&audit_backlog);
-	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (++audit_freelist_count > AUDIT_MAXFREE)
-		kfree(ab);
-	else
-		list_add(&ab->list, &audit_freelist);
-	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+	audit_buffer_free(ab);
 }
 
 /* Send or queue the message in the audit buffer, depending on the
-- 
cgit v1.3-14-g43fede


From 8fc6115c2a04099a6e846dc0b2d85cba43821b54 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@osdl.org>
Date: Fri, 6 May 2005 15:54:17 +0100
Subject: AUDIT: expand audit tmp buffer as needed

Introduce audit_expand and make the audit_buffer use a dynamic buffer
which can be resized.  When the audit buffer is moved to an skb it will
not be fragmented across skbs, so we can eliminate the sklist in the
audit_buffer.  During audit_log_move, we simply copy the full buffer
into a single skb, and then audit_log_drain sends it on.

Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 139 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 79 insertions(+), 60 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index e5bdba3e3ae1..c6e31d209c41 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -136,14 +136,11 @@ static DECLARE_MUTEX(audit_netlink_sem);
  * use simultaneously. */
 struct audit_buffer {
 	struct list_head     list;
-	struct sk_buff_head  sklist;	/* formatted skbs ready to send */
+	struct sk_buff       *skb;	/* formatted skb ready to send */
 	struct audit_context *ctx;	/* NULL or associated context */
 	int		     len;	/* used area of tmp */
-	char		     tmp[AUDIT_BUFSIZ];
-
-				/* Pointer to header and contents */
-	struct nlmsghdr      *nlh;
-	int		     total;
+	int		     size;	/* size of tmp */
+	char		     *tmp;	
 	int		     type;
 	int		     pid;
 };
@@ -488,55 +485,47 @@ static void audit_receive(struct sock *sk, int length)
 static void audit_log_move(struct audit_buffer *ab)
 {
 	struct sk_buff	*skb;
+	struct nlmsghdr *nlh;
 	char		*start;
-	int		extra = ab->nlh ? 0 : NLMSG_SPACE(0);
+	int		len = NLMSG_SPACE(0) + ab->len + 1;
 
 	/* possible resubmission */
-	if (ab->len == 0)
+	if (ab->skb)
 		return;
 
-	skb = skb_peek_tail(&ab->sklist);
-	if (!skb || skb_tailroom(skb) <= ab->len + extra) {
-		skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
-		if (!skb) {
-			ab->len = 0; /* Lose information in ab->tmp */
-			audit_log_lost("out of memory in audit_log_move");
-			return;
-		}
-		__skb_queue_tail(&ab->sklist, skb);
-		if (!ab->nlh)
-			ab->nlh = (struct nlmsghdr *)skb_put(skb,
-							     NLMSG_SPACE(0));
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (!skb) {
+		/* Lose information in ab->tmp */
+		audit_log_lost("out of memory in audit_log_move");
+		return;
 	}
+	ab->skb = skb;
+	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_SPACE(0));
+	nlh->nlmsg_type = ab->type;
+	nlh->nlmsg_len = ab->len;
+	nlh->nlmsg_flags = 0;
+	nlh->nlmsg_pid = ab->pid;
+	nlh->nlmsg_seq = 0;
 	start = skb_put(skb, ab->len);
 	memcpy(start, ab->tmp, ab->len);
-	ab->len = 0;
 }
 
 /* Iterate over the skbuff in the audit_buffer, sending their contents
  * to user space. */
 static inline int audit_log_drain(struct audit_buffer *ab)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = ab->skb;
 
-	while ((skb = skb_dequeue(&ab->sklist))) {
+	if (skb) {
 		int retval = 0;
 
 		if (audit_pid) {
-			if (ab->nlh) {
-				ab->nlh->nlmsg_len   = ab->total;
-				ab->nlh->nlmsg_type  = ab->type;
-				ab->nlh->nlmsg_flags = 0;
-				ab->nlh->nlmsg_seq   = 0;
-				ab->nlh->nlmsg_pid   = ab->pid;
-			}
 			skb_get(skb); /* because netlink_* frees */
 			retval = netlink_unicast(audit_sock, skb, audit_pid,
 						 MSG_DONTWAIT);
 		}
 		if (retval == -EAGAIN &&
 		    (atomic_read(&audit_backlog)) < audit_backlog_limit) {
-			skb_queue_head(&ab->sklist, skb);
 			audit_log_end_irq(ab);
 			return 1;
 		}
@@ -550,13 +539,12 @@ static inline int audit_log_drain(struct audit_buffer *ab)
 				audit_log_lost("netlink socket too busy");
 		}
 		if (!audit_pid) { /* No daemon */
-			int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
+			int offset = NLMSG_SPACE(0);
 			int len    = skb->len - offset;
 			skb->data[offset + len] = '\0';
 			printk(KERN_ERR "%s\n", skb->data + offset);
 		}
 		kfree_skb(skb);
-		ab->nlh = NULL;
 	}
 	return 0;
 }
@@ -624,6 +612,10 @@ static void audit_buffer_free(struct audit_buffer *ab)
 {
 	unsigned long flags;
 
+	if (!ab)
+		return;
+
+	kfree(ab->tmp);
 	atomic_dec(&audit_backlog);
 	spin_lock_irqsave(&audit_freelist_lock, flags);
 	if (++audit_freelist_count > AUDIT_MAXFREE)
@@ -633,7 +625,8 @@ static void audit_buffer_free(struct audit_buffer *ab)
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 }
 
-static struct audit_buffer * audit_buffer_alloc(int gfp_mask)
+static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+						int gfp_mask)
 {
 	unsigned long flags;
 	struct audit_buffer *ab = NULL;
@@ -650,11 +643,24 @@ static struct audit_buffer * audit_buffer_alloc(int gfp_mask)
 	if (!ab) {
 		ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
 		if (!ab)
-			goto out;
+			goto err;
 	}
 	atomic_inc(&audit_backlog);
-out:
+
+	ab->tmp = kmalloc(AUDIT_BUFSIZ, GFP_ATOMIC);
+	if (!ab->tmp)
+		goto err;
+
+	ab->skb   = NULL;
+	ab->ctx   = ctx;
+	ab->len   = 0;
+	ab->size  = AUDIT_BUFSIZ;
+	ab->type  = AUDIT_KERNEL;
+	ab->pid   = 0;
 	return ab;
+err:
+	audit_buffer_free(ab);
+	return NULL;
 }
 
 /* Obtain an audit buffer.  This routine does locking to obtain the
@@ -684,21 +690,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
 		return NULL;
 	}
 
-	ab = audit_buffer_alloc(GFP_ATOMIC);
+	ab = audit_buffer_alloc(ctx, GFP_ATOMIC);
 	if (!ab) {
 		audit_log_lost("out of memory in audit_log_start");
 		return NULL;
 	}
 
-	skb_queue_head_init(&ab->sklist);
-
-	ab->ctx   = ctx;
-	ab->len   = 0;
-	ab->nlh   = NULL;
-	ab->total = 0;
-	ab->type  = AUDIT_KERNEL;
-	ab->pid   = 0;
-
 #ifdef CONFIG_AUDITSYSCALL
 	if (ab->ctx)
 		audit_get_stamp(ab->ctx, &t, &serial);
@@ -713,6 +710,27 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
 	return ab;
 }
 
+/**
+ * audit_expand - expand tmp buffer in the audit buffer
+ * @ab: audit_buffer
+ *
+ * Returns 0 (no space) on failed expansion, or available space if
+ * successful.
+ */
+static inline int audit_expand(struct audit_buffer *ab)
+{
+	char *tmp;
+	int len = ab->size + AUDIT_BUFSIZ;
+
+	tmp = kmalloc(len, GFP_ATOMIC);
+	if (!tmp)
+		return 0;
+	memcpy(tmp, ab->tmp, ab->len);
+	kfree(ab->tmp);
+	ab->tmp = tmp;
+	ab->size = len;
+	return ab->size - ab->len;
+}
 
 /* Format an audit message into the audit buffer.  If there isn't enough
  * room in the audit buffer, more room will be allocated and vsnprint
@@ -726,22 +744,25 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 	if (!ab)
 		return;
 
-	avail = sizeof(ab->tmp) - ab->len;
+	avail = ab->size - ab->len;
 	if (avail <= 0) {
-		audit_log_move(ab);
-		avail = sizeof(ab->tmp) - ab->len;
+		avail = audit_expand(ab);
+		if (!avail)
+			goto out;
 	}
-	len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+	len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
 	if (len >= avail) {
 		/* The printk buffer is 1024 bytes long, so if we get
 		 * here and AUDIT_BUFSIZ is at least 1024, then we can
 		 * log everything that printk could have logged. */
-		audit_log_move(ab);
-		avail = sizeof(ab->tmp) - ab->len;
-		len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+		avail = audit_expand(ab);
+		if (!avail)
+			goto out;
+		len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
 	}
 	ab->len   += (len < avail) ? len : avail;
-	ab->total += (len < avail) ? len : avail;
+out:
+	return;
 }
 
 /* Format a message into the audit buffer.  All the work is done in
@@ -789,21 +810,19 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	char *p;
 	int  len, avail;
 
-	if (prefix) audit_log_format(ab, " %s", prefix);
+	if (prefix)
+		audit_log_format(ab, " %s", prefix);
 
-	if (ab->len > 128)
-		audit_log_move(ab);
-	avail = sizeof(ab->tmp) - ab->len;
+	avail = ab->size - ab->len;
 	p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
 	if (IS_ERR(p)) {
 		/* FIXME: can we save some information here? */
 		audit_log_format(ab, "<toolong>");
 	} else {
 				/* path isn't at start of buffer */
-		len	   = (ab->tmp + sizeof(ab->tmp) - 1) - p;
+		len = (ab->tmp + ab->size - 1) - p;
 		memmove(ab->tmp + ab->len, p, len);
 		ab->len   += len;
-		ab->total += len;
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From 5ac52f33b6f05fcb91a97124155183b779a4efdf Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@osdl.org>
Date: Fri, 6 May 2005 15:54:53 +0100
Subject: AUDIT: buffer audit msgs directly to skb

Drop the use of a tmp buffer in the audit_buffer, and just buffer
directly to the skb.  All header data that was temporarily stored in
the audit_buffer can now be stored directly in the netlink header in
the skb.  Resize skb as needed.  This eliminates the extra copy (and
the audit_log_move function which was responsible for copying).

Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 122 ++++++++++++++++++++++-----------------------------------
 1 file changed, 46 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index c6e31d209c41..993e445418a7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -138,16 +138,18 @@ struct audit_buffer {
 	struct list_head     list;
 	struct sk_buff       *skb;	/* formatted skb ready to send */
 	struct audit_context *ctx;	/* NULL or associated context */
-	int		     len;	/* used area of tmp */
-	int		     size;	/* size of tmp */
-	char		     *tmp;	
-	int		     type;
-	int		     pid;
 };
 
 void audit_set_type(struct audit_buffer *ab, int type)
 {
-	ab->type = type;
+	struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+	nlh->nlmsg_type = type;
+}
+
+static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+{
+	struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+	nlh->nlmsg_pid = pid;
 }
 
 struct audit_entry {
@@ -405,8 +407,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				 (int)(nlh->nlmsg_len
 				       - ((char *)data - (char *)nlh)),
 				 loginuid, (char *)data);
-		ab->type = AUDIT_USER;
-		ab->pid  = pid;
+		audit_set_type(ab, AUDIT_USER);
+		audit_set_pid(ab, pid);
 		audit_log_end(ab);
 		break;
 	case AUDIT_ADD:
@@ -476,42 +478,7 @@ static void audit_receive(struct sock *sk, int length)
 	up(&audit_netlink_sem);
 }
 
-/* Move data from tmp buffer into an skb.  This is an extra copy, and
- * that is unfortunate.  However, the copy will only occur when a record
- * is being written to user space, which is already a high-overhead
- * operation.  (Elimination of the copy is possible, for example, by
- * writing directly into a pre-allocated skb, at the cost of wasting
- * memory. */
-static void audit_log_move(struct audit_buffer *ab)
-{
-	struct sk_buff	*skb;
-	struct nlmsghdr *nlh;
-	char		*start;
-	int		len = NLMSG_SPACE(0) + ab->len + 1;
-
-	/* possible resubmission */
-	if (ab->skb)
-		return;
-
-	skb = alloc_skb(len, GFP_ATOMIC);
-	if (!skb) {
-		/* Lose information in ab->tmp */
-		audit_log_lost("out of memory in audit_log_move");
-		return;
-	}
-	ab->skb = skb;
-	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_SPACE(0));
-	nlh->nlmsg_type = ab->type;
-	nlh->nlmsg_len = ab->len;
-	nlh->nlmsg_flags = 0;
-	nlh->nlmsg_pid = ab->pid;
-	nlh->nlmsg_seq = 0;
-	start = skb_put(skb, ab->len);
-	memcpy(start, ab->tmp, ab->len);
-}
-
-/* Iterate over the skbuff in the audit_buffer, sending their contents
- * to user space. */
+/* Grab skbuff from the audit_buffer and send to user space. */
 static inline int audit_log_drain(struct audit_buffer *ab)
 {
 	struct sk_buff *skb = ab->skb;
@@ -520,6 +487,8 @@ static inline int audit_log_drain(struct audit_buffer *ab)
 		int retval = 0;
 
 		if (audit_pid) {
+			struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
+			nlh->nlmsg_len = skb->len;
 			skb_get(skb); /* because netlink_* frees */
 			retval = netlink_unicast(audit_sock, skb, audit_pid,
 						 MSG_DONTWAIT);
@@ -544,7 +513,6 @@ static inline int audit_log_drain(struct audit_buffer *ab)
 			skb->data[offset + len] = '\0';
 			printk(KERN_ERR "%s\n", skb->data + offset);
 		}
-		kfree_skb(skb);
 	}
 	return 0;
 }
@@ -615,7 +583,8 @@ static void audit_buffer_free(struct audit_buffer *ab)
 	if (!ab)
 		return;
 
-	kfree(ab->tmp);
+	if (ab->skb)
+		kfree_skb(ab->skb);
 	atomic_dec(&audit_backlog);
 	spin_lock_irqsave(&audit_freelist_lock, flags);
 	if (++audit_freelist_count > AUDIT_MAXFREE)
@@ -630,6 +599,7 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 {
 	unsigned long flags;
 	struct audit_buffer *ab = NULL;
+	struct nlmsghdr *nlh;
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
 	if (!list_empty(&audit_freelist)) {
@@ -647,16 +617,16 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 	}
 	atomic_inc(&audit_backlog);
 
-	ab->tmp = kmalloc(AUDIT_BUFSIZ, GFP_ATOMIC);
-	if (!ab->tmp)
+	ab->skb = alloc_skb(AUDIT_BUFSIZ, GFP_ATOMIC);
+	if (!ab->skb)
 		goto err;
 
-	ab->skb   = NULL;
 	ab->ctx   = ctx;
-	ab->len   = 0;
-	ab->size  = AUDIT_BUFSIZ;
-	ab->type  = AUDIT_KERNEL;
-	ab->pid   = 0;
+	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
+	nlh->nlmsg_type = AUDIT_KERNEL;
+	nlh->nlmsg_flags = 0;
+	nlh->nlmsg_pid = 0;
+	nlh->nlmsg_seq = 0;
 	return ab;
 err:
 	audit_buffer_free(ab);
@@ -711,7 +681,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
 }
 
 /**
- * audit_expand - expand tmp buffer in the audit buffer
+ * audit_expand - expand skb in the audit buffer
  * @ab: audit_buffer
  *
  * Returns 0 (no space) on failed expansion, or available space if
@@ -719,17 +689,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
  */
 static inline int audit_expand(struct audit_buffer *ab)
 {
-	char *tmp;
-	int len = ab->size + AUDIT_BUFSIZ;
-
-	tmp = kmalloc(len, GFP_ATOMIC);
-	if (!tmp)
+	struct sk_buff *skb = ab->skb;
+	int ret = pskb_expand_head(skb, skb_headroom(skb), AUDIT_BUFSIZ,
+				   GFP_ATOMIC);
+	if (ret < 0) {
+		audit_log_lost("out of memory in audit_expand");
 		return 0;
-	memcpy(tmp, ab->tmp, ab->len);
-	kfree(ab->tmp);
-	ab->tmp = tmp;
-	ab->size = len;
-	return ab->size - ab->len;
+	}
+	return skb_tailroom(skb);
 }
 
 /* Format an audit message into the audit buffer.  If there isn't enough
@@ -740,17 +707,20 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 			      va_list args)
 {
 	int len, avail;
+	struct sk_buff *skb;
 
 	if (!ab)
 		return;
 
-	avail = ab->size - ab->len;
-	if (avail <= 0) {
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	if (avail == 0) {
 		avail = audit_expand(ab);
 		if (!avail)
 			goto out;
 	}
-	len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+	len = vsnprintf(skb->tail, avail, fmt, args);
 	if (len >= avail) {
 		/* The printk buffer is 1024 bytes long, so if we get
 		 * here and AUDIT_BUFSIZ is at least 1024, then we can
@@ -758,9 +728,9 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		avail = audit_expand(ab);
 		if (!avail)
 			goto out;
-		len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+		len = vsnprintf(skb->tail, avail, fmt, args);
 	}
-	ab->len   += (len < avail) ? len : avail;
+	skb_put(skb, (len < avail) ? len : avail);
 out:
 	return;
 }
@@ -808,21 +778,22 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 		      struct dentry *dentry, struct vfsmount *vfsmnt)
 {
 	char *p;
+	struct sk_buff *skb = ab->skb;
 	int  len, avail;
 
 	if (prefix)
 		audit_log_format(ab, " %s", prefix);
 
-	avail = ab->size - ab->len;
-	p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
+	avail = skb_tailroom(skb);
+	p = d_path(dentry, vfsmnt, skb->tail, avail);
 	if (IS_ERR(p)) {
 		/* FIXME: can we save some information here? */
 		audit_log_format(ab, "<toolong>");
 	} else {
-				/* path isn't at start of buffer */
-		len = (ab->tmp + ab->size - 1) - p;
-		memmove(ab->tmp + ab->len, p, len);
-		ab->len   += len;
+		/* path isn't at start of buffer */
+		len = ((char *)skb->tail + avail - 1) - p;
+		memmove(skb->tail, p, len);
+		skb_put(skb, len);
 	}
 }
 
@@ -873,7 +844,6 @@ static void audit_log_end_fast(struct audit_buffer *ab)
 	if (!audit_rate_check()) {
 		audit_log_lost("rate limit exceeded");
 	} else {
-		audit_log_move(ab);
 		if (audit_log_drain(ab))
 			return;
 	}
-- 
cgit v1.3-14-g43fede


From 4332bdd332a2dca93dc3b1d017b2dd27d5c8cef3 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Fri, 6 May 2005 15:59:57 +0100
Subject: AUDIT: Honour gfp_mask in audit_buffer_alloc()

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 993e445418a7..b86007da8a3f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -611,13 +611,13 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 
 	if (!ab) {
-		ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+		ab = kmalloc(sizeof(*ab), gfp_mask);
 		if (!ab)
 			goto err;
 	}
 	atomic_inc(&audit_backlog);
 
-	ab->skb = alloc_skb(AUDIT_BUFSIZ, GFP_ATOMIC);
+	ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
 	if (!ab->skb)
 		goto err;
 
-- 
cgit v1.3-14-g43fede


From 8c5aa40c94ef8bb7f7da95ecd5942e2d20fc3c9d Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Tue, 10 May 2005 18:53:07 +0100
Subject: AUDIT: Fix reported length of audit messages.

We were setting nlmsg_len to skb->len, but we should be subtracting
the size of the header.

From: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index b86007da8a3f..2ddd1a2b66d0 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -488,7 +488,7 @@ static inline int audit_log_drain(struct audit_buffer *ab)
 
 		if (audit_pid) {
 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
-			nlh->nlmsg_len = skb->len;
+			nlh->nlmsg_len = skb->len - sizeof(*nlh);
 			skb_get(skb); /* because netlink_* frees */
 			retval = netlink_unicast(audit_sock, skb, audit_pid,
 						 MSG_DONTWAIT);
-- 
cgit v1.3-14-g43fede


From e3b926b4c1499ba7b1b9513aa6113944d572aba5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Tue, 10 May 2005 18:56:08 +0100
Subject: AUDIT: pass size argument to audit_expand().

Let audit_expand() know how much it's expected to grow the buffer, in
the case that we have that information to hand.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 2ddd1a2b66d0..1dd456c90ae5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -687,10 +687,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
  * Returns 0 (no space) on failed expansion, or available space if
  * successful.
  */
-static inline int audit_expand(struct audit_buffer *ab)
+static inline int audit_expand(struct audit_buffer *ab, int extra)
 {
 	struct sk_buff *skb = ab->skb;
-	int ret = pskb_expand_head(skb, skb_headroom(skb), AUDIT_BUFSIZ,
+	int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
 				   GFP_ATOMIC);
 	if (ret < 0) {
 		audit_log_lost("out of memory in audit_expand");
@@ -716,7 +716,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 	skb = ab->skb;
 	avail = skb_tailroom(skb);
 	if (avail == 0) {
-		avail = audit_expand(ab);
+		avail = audit_expand(ab, AUDIT_BUFSIZ);
 		if (!avail)
 			goto out;
 	}
@@ -725,7 +725,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		/* The printk buffer is 1024 bytes long, so if we get
 		 * here and AUDIT_BUFSIZ is at least 1024, then we can
 		 * log everything that printk could have logged. */
-		avail = audit_expand(ab);
+		avail = audit_expand(ab, 1+len-avail);
 		if (!avail)
 			goto out;
 		len = vsnprintf(skb->tail, avail, fmt, args);
-- 
cgit v1.3-14-g43fede


From eecb0a7338ef6504aa49def4dde6429853025801 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Tue, 10 May 2005 18:58:51 +0100
Subject: AUDIT: Fix abuse of va_args.

We're not allowed to use args twice; we need to use va_copy.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 1dd456c90ae5..ddb69a458203 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -708,6 +708,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 {
 	int len, avail;
 	struct sk_buff *skb;
+	va_list args2;
 
 	if (!ab)
 		return;
@@ -720,6 +721,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		if (!avail)
 			goto out;
 	}
+	va_copy(args2, args);
 	len = vsnprintf(skb->tail, avail, fmt, args);
 	if (len >= avail) {
 		/* The printk buffer is 1024 bytes long, so if we get
@@ -728,7 +730,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		avail = audit_expand(ab, 1+len-avail);
 		if (!avail)
 			goto out;
-		len = vsnprintf(skb->tail, avail, fmt, args);
+		len = vsnprintf(skb->tail, avail, fmt, args2);
 	}
 	skb_put(skb, (len < avail) ? len : avail);
 out:
-- 
cgit v1.3-14-g43fede


From 5a241d77039a2632e81070619d5733258728f8bd Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@osdl.org>
Date: Wed, 11 May 2005 10:43:07 +0100
Subject: AUDIT: Properly account for alignment difference in nlmsg_len.

Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index ddb69a458203..a5f03cb2c0f5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -488,7 +488,7 @@ static inline int audit_log_drain(struct audit_buffer *ab)
 
 		if (audit_pid) {
 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
-			nlh->nlmsg_len = skb->len - sizeof(*nlh);
+			nlh->nlmsg_len = skb->len - NLMSG_SPACE(0);
 			skb_get(skb); /* because netlink_* frees */
 			retval = netlink_unicast(audit_sock, skb, audit_pid,
 						 MSG_DONTWAIT);
-- 
cgit v1.3-14-g43fede


From 804a6a49d874841a98ebea3247ad2e672812ad6a Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@osdl.org>
Date: Wed, 11 May 2005 10:52:45 +0100
Subject: Audit requires CONFIG_NET

Audit now actually requires netlink.  So make it depend on CONFIG_NET,
and remove the inline #ifdef CONFIG_NET conditionals.

Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 init/Kconfig     |  1 +
 kernel/audit.c   | 30 ------------------------------
 kernel/auditsc.c |  2 --
 3 files changed, 1 insertion(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index 705497653465..448939d183dd 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -164,6 +164,7 @@ config SYSCTL
 
 config AUDIT
 	bool "Auditing support"
+	depends on NET
 	default y if SECURITY_SELINUX
 	help
 	  Enable auditing infrastructure that can be used with another
diff --git a/kernel/audit.c b/kernel/audit.c
index a5f03cb2c0f5..dc4aba21f30a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -283,7 +283,6 @@ static int audit_set_failure(int state, uid_t loginuid)
 	return old;
 }
 
-#ifdef CONFIG_NET
 void audit_send_reply(int pid, int seq, int type, int done, int multi,
 		      void *payload, int size)
 {
@@ -531,35 +530,6 @@ static int __init audit_init(void)
 	audit_log(NULL, "initialized");
 	return 0;
 }
-
-#else
-/* Without CONFIG_NET, we have no skbuffs.  For now, print what we have
- * in the buffer. */
-static void audit_log_move(struct audit_buffer *ab)
-{
-	printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
-	ab->len = 0;
-}
-
-static inline int audit_log_drain(struct audit_buffer *ab)
-{
-	return 0;
-}
-
-/* Initialize audit support at boot time. */
-int __init audit_init(void)
-{
-	printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
-	audit_sock = NULL;
-	audit_pid  = 0;
-
-	audit_initialized = 1;
-	audit_enabled = audit_default;
-	audit_log(NULL, "initialized");
-	return 0;
-}
-#endif
-
 __initcall(audit_init);
 
 /* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f1bf66510cd3..680bb928343b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -226,7 +226,6 @@ static inline int audit_del_rule(struct audit_rule *rule,
 	return -EFAULT;		/* No matching rule */
 }
 
-#ifdef CONFIG_NET
 /* Copy rule from user-space to kernel-space.  Called during
  * AUDIT_ADD. */
 static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
@@ -305,7 +304,6 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 	return err;
 }
-#endif
 
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
-- 
cgit v1.3-14-g43fede


From 197c69c6afd2deb7eec44040ff533d90d26c6161 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@osdl.org>
Date: Wed, 11 May 2005 10:54:05 +0100
Subject: Move ifdef CONFIG_AUDITSYSCALL to header

Remove code conditionally dependent on CONFIG_AUDITSYSCALL from audit.c.
Move these dependencies to audit.h with the rest.

Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h |  4 +++-
 kernel/audit.c        | 12 ++----------
 kernel/auditsc.c      |  7 +++----
 3 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index baa80760824c..58c5589b531f 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -192,7 +192,7 @@ extern void audit_inode(const char *name, const struct inode *inode);
 				/* Private API (for audit.c only) */
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
 				 void *data, uid_t loginuid);
-extern void audit_get_stamp(struct audit_context *ctx,
+extern int audit_get_stamp(struct audit_context *ctx,
 			    struct timespec *t, unsigned int *serial);
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
@@ -206,6 +206,8 @@ extern void audit_signal_info(int sig, struct task_struct *t);
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
 #define audit_inode(n,i) do { ; } while (0)
+#define audit_receive_filter(t,p,u,s,d,l) ({ -EOPNOTSUPP; })
+#define audit_get_stamp(c,t,s) ({ 0; })
 #define audit_get_loginuid(c) ({ -1; })
 #define audit_ipc_perms(q,u,g,m) ({ 0; })
 #define audit_signal_info(s,t) do { ; } while (0)
diff --git a/kernel/audit.c b/kernel/audit.c
index dc4aba21f30a..c18b769e23a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -416,12 +416,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EINVAL;
 		/* fallthrough */
 	case AUDIT_LIST:
-#ifdef CONFIG_AUDITSYSCALL
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, loginuid);
-#else
-		err = -EOPNOTSUPP;
-#endif
 		break;
 	case AUDIT_SIGNAL_INFO:
 		sig_data.uid = audit_sig_uid;
@@ -636,15 +632,11 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
 		return NULL;
 	}
 
-#ifdef CONFIG_AUDITSYSCALL
-	if (ab->ctx)
-		audit_get_stamp(ab->ctx, &t, &serial);
-	else
-#endif
-	{
+	if (!audit_get_stamp(ab->ctx, &t, &serial)) {
 		t = CURRENT_TIME;
 		serial = 0;
 	}
+
 	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
 			 t.tv_sec, t.tv_nsec/1000000, serial);
 	return ab;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 680bb928343b..94338abf76f5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -992,7 +992,7 @@ void audit_inode(const char *name, const struct inode *inode)
 	context->names[idx].rdev = inode->i_rdev;
 }
 
-void audit_get_stamp(struct audit_context *ctx,
+int audit_get_stamp(struct audit_context *ctx,
 		     struct timespec *t, unsigned int *serial)
 {
 	if (ctx) {
@@ -1000,10 +1000,9 @@ void audit_get_stamp(struct audit_context *ctx,
 		t->tv_nsec = ctx->ctime.tv_nsec;
 		*serial    = ctx->serial;
 		ctx->auditable = 1;
-	} else {
-		*t      = CURRENT_TIME;
-		*serial = 0;
+		return 1;
 	}
+	return 0;
 }
 
 extern int audit_set_type(struct audit_buffer *ab, int type);
-- 
cgit v1.3-14-g43fede


From c1b773d87eadc3972d697444127e89a7291769a2 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@osdl.org>
Date: Wed, 11 May 2005 10:55:10 +0100
Subject: Add audit_log_type

Add audit_log_type to allow callers to specify type and pid when logging.
Convert audit_log to a wrapper around audit_log_type.  All audit_log
callers could have been converted directly, but the common case is the
default of type AUDIT_KERNEL and pid 0.  Update audit_log_start to take
type and pid values when creating a new audit_buffer.  Convert sequences
that called audit_log_start, audit_log_format, audit_set_type, and
audit_log_end to simply call audit_log_type directly.  This obsoletes
audit_set_type and audit_set_pid, so remove them.

Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h  | 16 ++++++++++------
 kernel/audit.c         | 48 +++++++++++++++---------------------------------
 kernel/auditsc.c       | 23 +++++++----------------
 security/selinux/avc.c |  2 +-
 4 files changed, 33 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 58c5589b531f..405332ebf3c6 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -216,11 +216,14 @@ extern void audit_signal_info(int sig, struct task_struct *t);
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
 				/* Public API */
-extern void		    audit_log(struct audit_context *ctx,
-				      const char *fmt, ...)
-			    __attribute__((format(printf,2,3)));
+#define audit_log(ctx, fmt, args...) \
+	audit_log_type(ctx, AUDIT_KERNEL, 0, fmt, ##args)
+extern void		    audit_log_type(struct audit_context *ctx, int type,
+				      int pid, const char *fmt, ...)
+			    __attribute__((format(printf,4,5)));
 
-extern struct audit_buffer *audit_log_start(struct audit_context *ctx);
+extern struct audit_buffer *audit_log_start(struct audit_context *ctx, int type,
+					    int pid);
 extern void		    audit_log_format(struct audit_buffer *ab,
 					     const char *fmt, ...)
 			    __attribute__((format(printf,2,3)));
@@ -240,8 +243,9 @@ extern void		    audit_send_reply(int pid, int seq, int type,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
 #else
-#define audit_log(t,f,...) do { ; } while (0)
-#define audit_log_start(t) ({ NULL; })
+#define audit_log(c,f,...) do { ; } while (0)
+#define audit_log_type(c,t,p,f,...) do { ; } while (0)
+#define audit_log_start(c,t,p) ({ NULL; })
 #define audit_log_vformat(b,f,a) do { ; } while (0)
 #define audit_log_format(b,f,...) do { ; } while (0)
 #define audit_log_end(b) do { ; } while (0)
diff --git a/kernel/audit.c b/kernel/audit.c
index c18b769e23a2..060b554f481e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -140,18 +140,6 @@ struct audit_buffer {
 	struct audit_context *ctx;	/* NULL or associated context */
 };
 
-void audit_set_type(struct audit_buffer *ab, int type)
-{
-	struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
-	nlh->nlmsg_type = type;
-}
-
-static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
-{
-	struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
-	nlh->nlmsg_pid = pid;
-}
-
 struct audit_entry {
 	struct list_head  list;
 	struct audit_rule rule;
@@ -344,7 +332,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	void			*data;
 	struct audit_status	*status_get, status_set;
 	int			err;
-	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
 	struct audit_sig_info   sig_data;
@@ -396,19 +383,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 							loginuid);
 		break;
 	case AUDIT_USER:
-		ab = audit_log_start(NULL);
-		if (!ab)
-			break;	/* audit_panic has been called */
-		audit_log_format(ab,
+		audit_log_type(NULL, AUDIT_USER, pid,
 				 "user pid=%d uid=%d length=%d loginuid=%u"
 				 " msg='%.1024s'",
 				 pid, uid,
 				 (int)(nlh->nlmsg_len
 				       - ((char *)data - (char *)nlh)),
 				 loginuid, (char *)data);
-		audit_set_type(ab, AUDIT_USER);
-		audit_set_pid(ab, pid);
-		audit_log_end(ab);
 		break;
 	case AUDIT_ADD:
 	case AUDIT_DEL:
@@ -560,12 +541,10 @@ static void audit_buffer_free(struct audit_buffer *ab)
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 }
 
-static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
-						int gfp_mask)
+static struct audit_buffer * audit_buffer_alloc(int gfp_mask)
 {
 	unsigned long flags;
 	struct audit_buffer *ab = NULL;
-	struct nlmsghdr *nlh;
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
 	if (!list_empty(&audit_freelist)) {
@@ -587,12 +566,6 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 	if (!ab->skb)
 		goto err;
 
-	ab->ctx   = ctx;
-	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
-	nlh->nlmsg_type = AUDIT_KERNEL;
-	nlh->nlmsg_flags = 0;
-	nlh->nlmsg_pid = 0;
-	nlh->nlmsg_seq = 0;
 	return ab;
 err:
 	audit_buffer_free(ab);
@@ -605,11 +578,12 @@ err:
  * syscall, then the syscall is marked as auditable and an audit record
  * will be written at syscall exit.  If there is no associated task, tsk
  * should be NULL. */
-struct audit_buffer *audit_log_start(struct audit_context *ctx)
+struct audit_buffer *audit_log_start(struct audit_context *ctx, int type, int pid)
 {
 	struct audit_buffer	*ab	= NULL;
 	struct timespec		t;
 	unsigned int		serial;
+	struct nlmsghdr *nlh;
 
 	if (!audit_initialized)
 		return NULL;
@@ -626,12 +600,19 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
 		return NULL;
 	}
 
-	ab = audit_buffer_alloc(ctx, GFP_ATOMIC);
+	ab = audit_buffer_alloc(GFP_ATOMIC);
 	if (!ab) {
 		audit_log_lost("out of memory in audit_log_start");
 		return NULL;
 	}
 
+	ab->ctx   = ctx;
+	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
+	nlh->nlmsg_type = type;
+	nlh->nlmsg_flags = 0;
+	nlh->nlmsg_pid = pid;
+	nlh->nlmsg_seq = 0;
+
 	if (!audit_get_stamp(ab->ctx, &t, &serial)) {
 		t = CURRENT_TIME;
 		serial = 0;
@@ -828,12 +809,13 @@ void audit_log_end(struct audit_buffer *ab)
 /* Log an audit record.  This is a convenience function that calls
  * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
  * called in any context. */
-void audit_log(struct audit_context *ctx, const char *fmt, ...)
+void audit_log_type(struct audit_context *ctx, int type, int pid,
+		    const char *fmt, ...)
 {
 	struct audit_buffer *ab;
 	va_list args;
 
-	ab = audit_log_start(ctx);
+	ab = audit_log_start(ctx, type, pid);
 	if (ab) {
 		va_start(args, fmt);
 		audit_log_vformat(ab, fmt, args);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 94338abf76f5..d089263253a7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -648,7 +648,7 @@ static void audit_log_exit(struct audit_context *context)
 	int i;
 	struct audit_buffer *ab;
 
-	ab = audit_log_start(context);
+	ab = audit_log_start(context, AUDIT_KERNEL, 0);
 	if (!ab)
 		return;		/* audit_panic has been called */
 	audit_log_format(ab, "syscall=%d", context->major);
@@ -680,7 +680,7 @@ static void audit_log_exit(struct audit_context *context)
 	while (context->aux) {
 		struct audit_aux_data *aux;
 
-		ab = audit_log_start(context);
+		ab = audit_log_start(context, AUDIT_KERNEL, 0);
 		if (!ab)
 			continue; /* audit_panic has been called */
 
@@ -701,7 +701,7 @@ static void audit_log_exit(struct audit_context *context)
 	}
 
 	for (i = 0; i < context->name_count; i++) {
-		ab = audit_log_start(context);
+		ab = audit_log_start(context, AUDIT_KERNEL, 0);
 		if (!ab)
 			continue; /* audit_panic has been called */
 		audit_log_format(ab, "item=%d", i);
@@ -1005,22 +1005,13 @@ int audit_get_stamp(struct audit_context *ctx,
 	return 0;
 }
 
-extern int audit_set_type(struct audit_buffer *ab, int type);
-
 int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 {
 	if (task->audit_context) {
-		struct audit_buffer *ab;
-
-		ab = audit_log_start(NULL);
-		if (ab) {
-			audit_log_format(ab, "login pid=%d uid=%u "
-				"old loginuid=%u new loginuid=%u",
-				task->pid, task->uid, 
-				task->audit_context->loginuid, loginuid);
-			audit_set_type(ab, AUDIT_LOGIN);
-			audit_log_end(ab);
-		}
+		audit_log_type(NULL, AUDIT_LOGIN, 0,
+			  "login pid=%d uid=%u old loginuid=%u new loginuid=%u",
+			  task->pid, task->uid, task->audit_context->loginuid,
+			  loginuid);
 		task->audit_context->loginuid = loginuid;
 	}
 	return 0;
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 85a6f66a873f..9e71a1bbe011 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -549,7 +549,7 @@ void avc_audit(u32 ssid, u32 tsid,
 			return;
 	}
 
-	ab = audit_log_start(current->audit_context);
+	ab = audit_log_start(current->audit_context, AUDIT_KERNEL, 0);
 	if (!ab)
 		return;		/* audit_panic has been called */
 	audit_log_format(ab, "avc:  %s ", denied ? "denied" : "granted");
-- 
cgit v1.3-14-g43fede


From 9ea74f0655412d0fbd12bf9adb6c14c8fe707a42 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Fri, 13 May 2005 16:35:19 +0100
Subject: AUDIT: Round up audit skb expansion to AUDIT_BUFSIZ.

Otherwise, we will be repeatedly reallocating, even if we're only
adding a few bytes at a time. Pointed out by Steve Grubb.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 060b554f481e..187164572bd0 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -670,7 +670,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		/* The printk buffer is 1024 bytes long, so if we get
 		 * here and AUDIT_BUFSIZ is at least 1024, then we can
 		 * log everything that printk could have logged. */
-		avail = audit_expand(ab, 1+len-avail);
+		avail = audit_expand(ab, max_t(AUDIT_BUFSIZ, 1+len-avail));
 		if (!avail)
 			goto out;
 		len = vsnprintf(skb->tail, avail, fmt, args2);
-- 
cgit v1.3-14-g43fede


From c04049939f88b29e235d2da217bce6e8ead44f32 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Fri, 13 May 2005 18:17:42 +0100
Subject: AUDIT: Add message types to audit records

This patch adds more message types to the audit subsystem so that audit
analysis is quicker, more intuitive, and more useful.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
---
I forgot one type in the big patch. I need to add one for user space
originating SE Linux avc messages. This is used by dbus and nscd.

-Steve
---
Updated to 2.6.12-rc4-mm1.
-dwmw2

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h          | 66 ++++++++++++++++++++++++++---------
 kernel/audit.c                 | 78 +++++++++++++++++++++++++++++-------------
 kernel/auditsc.c               | 42 ++++++++++++++---------
 security/selinux/avc.c         |  4 +--
 security/selinux/hooks.c       |  2 +-
 security/selinux/nlmsgtab.c    |  8 +++++
 security/selinux/ss/services.c |  4 +--
 7 files changed, 143 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 405332ebf3c6..1a15ba38c660 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -27,15 +27,53 @@
 #include <linux/sched.h>
 #include <linux/elf.h>
 
-/* Request and reply types */
+/* The netlink messages for the audit system is divided into blocks:
+ * 1000 - 1099 are for commanding the audit system
+ * 1100 - 1199 user space trusted application messages
+ * 1200 - 1299 messages internal to the audit daemon
+ * 1300 - 1399 audit event messages
+ * 1400 - 1499 SE Linux use
+ * 1500 - 1999 future use
+ * 2000 is for otherwise unclassified kernel audit messages
+ *
+ * Messages from 1000-1199 are bi-directional. 1200-1299 are exclusively user
+ * space. Anything over that is kernel --> user space communication.
+ */
 #define AUDIT_GET		1000	/* Get status */
 #define AUDIT_SET		1001	/* Set status (enable/disable/auditd) */
-#define AUDIT_LIST		1002	/* List filtering rules */
-#define AUDIT_ADD		1003	/* Add filtering rule */
-#define AUDIT_DEL		1004	/* Delete filtering rule */
-#define AUDIT_USER		1005	/* Send a message from user-space */
+#define AUDIT_LIST		1002	/* List syscall filtering rules */
+#define AUDIT_ADD		1003	/* Add syscall filtering rule */
+#define AUDIT_DEL		1004	/* Delete syscall filtering rule */
+#define AUDIT_USER		1005	/* Message from userspace -- deprecated */
 #define AUDIT_LOGIN		1006	/* Define the login id and information */
-#define AUDIT_SIGNAL_INFO	1010	/* Get information about sender of signal*/
+#define AUDIT_WATCH_INS		1007	/* Insert file/dir watch entry */
+#define AUDIT_WATCH_REM		1008	/* Remove file/dir watch entry */
+#define AUDIT_WATCH_LIST	1009	/* List all file/dir watches */
+#define AUDIT_SIGNAL_INFO	1010	/* Get info about sender of signal to auditd */
+
+#define AUDIT_USER_AUTH		1100	/* User space authentication */
+#define AUDIT_USER_ACCT		1101	/* User space acct change */
+#define AUDIT_USER_MGMT		1102	/* User space acct management */
+#define AUDIT_CRED_ACQ		1103	/* User space credential acquired */
+#define AUDIT_CRED_DISP		1104	/* User space credential disposed */
+#define AUDIT_USER_START	1105	/* User space session start */ 
+#define AUDIT_USER_END		1106	/* User space session end */
+#define AUDIT_USER_AVC		1107	/* User space avc message */
+ 
+#define AUDIT_DAEMON_START      1200    /* Daemon startup record */
+#define AUDIT_DAEMON_END        1201    /* Daemon normal stop record */
+#define AUDIT_DAEMON_ABORT      1202    /* Daemon error stop record */
+#define AUDIT_DAEMON_CONFIG     1203    /* Daemon config change */
+
+#define AUDIT_SYSCALL		1300	/* Syscall event */
+#define AUDIT_FS_WATCH		1301	/* Filesystem watch event */
+#define AUDIT_PATH		1302	/* Filname path information */
+#define AUDIT_IPC		1303	/* IPC record */
+#define AUDIT_SOCKET		1304	/* Socket record */
+#define AUDIT_CONFIG_CHANGE	1305	/* Audit system configuration change */
+
+#define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
+#define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
 
 #define AUDIT_KERNEL		2000	/* Asynchronous audit record. NOT A REQUEST. */
 
@@ -216,14 +254,11 @@ extern void audit_signal_info(int sig, struct task_struct *t);
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
 				/* Public API */
-#define audit_log(ctx, fmt, args...) \
-	audit_log_type(ctx, AUDIT_KERNEL, 0, fmt, ##args)
-extern void		    audit_log_type(struct audit_context *ctx, int type,
-				      int pid, const char *fmt, ...)
-			    __attribute__((format(printf,4,5)));
+extern void		    audit_log(struct audit_context *ctx, int type,
+				      const char *fmt, ...)
+			    __attribute__((format(printf,3,4)));
 
-extern struct audit_buffer *audit_log_start(struct audit_context *ctx, int type,
-					    int pid);
+extern struct audit_buffer *audit_log_start(struct audit_context *ctx,int type);
 extern void		    audit_log_format(struct audit_buffer *ab,
 					     const char *fmt, ...)
 			    __attribute__((format(printf,2,3)));
@@ -243,9 +278,8 @@ extern void		    audit_send_reply(int pid, int seq, int type,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
 #else
-#define audit_log(c,f,...) do { ; } while (0)
-#define audit_log_type(c,t,p,f,...) do { ; } while (0)
-#define audit_log_start(c,t,p) ({ NULL; })
+#define audit_log(c,t,f,...) do { ; } while (0)
+#define audit_log_start(c,t) ({ NULL; })
 #define audit_log_vformat(b,f,a) do { ; } while (0)
 #define audit_log_format(b,f,...) do { ; } while (0)
 #define audit_log_end(b) do { ; } while (0)
diff --git a/kernel/audit.c b/kernel/audit.c
index 187164572bd0..4e940c05ede8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -140,6 +140,12 @@ struct audit_buffer {
 	struct audit_context *ctx;	/* NULL or associated context */
 };
 
+static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+{
+	struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+	nlh->nlmsg_pid = pid;
+}
+
 struct audit_entry {
 	struct list_head  list;
 	struct audit_rule rule;
@@ -233,7 +239,8 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
 {
 	int old		 = audit_rate_limit;
 	audit_rate_limit = limit;
-	audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u",
+	audit_log(NULL, AUDIT_CONFIG_CHANGE, 
+			"audit_rate_limit=%d old=%d by auid %u",
 			audit_rate_limit, old, loginuid);
 	return old;
 }
@@ -242,7 +249,8 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
 {
 	int old		 = audit_backlog_limit;
 	audit_backlog_limit = limit;
-	audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u",
+	audit_log(NULL, AUDIT_CONFIG_CHANGE,
+			"audit_backlog_limit=%d old=%d by auid %u",
 			audit_backlog_limit, old, loginuid);
 	return old;
 }
@@ -253,8 +261,9 @@ static int audit_set_enabled(int state, uid_t loginuid)
 	if (state != 0 && state != 1)
 		return -EINVAL;
 	audit_enabled = state;
-	audit_log(NULL, "audit_enabled=%d old=%d by auid %u",
-		  audit_enabled, old, loginuid);
+	audit_log(NULL, AUDIT_CONFIG_CHANGE,
+			"audit_enabled=%d old=%d by auid %u",
+			audit_enabled, old, loginuid);
 	return old;
 }
 
@@ -266,8 +275,9 @@ static int audit_set_failure(int state, uid_t loginuid)
 	    && state != AUDIT_FAIL_PANIC)
 		return -EINVAL;
 	audit_failure = state;
-	audit_log(NULL, "audit_failure=%d old=%d by auid %u",
-		  audit_failure, old, loginuid);
+	audit_log(NULL, AUDIT_CONFIG_CHANGE,
+			"audit_failure=%d old=%d by auid %u",
+			audit_failure, old, loginuid);
 	return old;
 }
 
@@ -316,6 +326,14 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 			err = -EPERM;
 		break;
 	case AUDIT_USER:
+	case AUDIT_USER_AUTH:
+	case AUDIT_USER_ACCT:
+	case AUDIT_USER_MGMT:
+	case AUDIT_CRED_ACQ:
+	case AUDIT_CRED_DISP:
+	case AUDIT_USER_START:
+	case AUDIT_USER_END:
+	case AUDIT_USER_AVC:
 		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
 			err = -EPERM;
 		break;
@@ -332,6 +350,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	void			*data;
 	struct audit_status	*status_get, status_set;
 	int			err;
+	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
 	struct audit_sig_info   sig_data;
@@ -373,7 +392,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old   = audit_pid;
 			audit_pid = status_get->pid;
-			audit_log(NULL, "audit_pid=%d old=%d by auid %u",
+			audit_log(NULL, AUDIT_CONFIG_CHANGE,
+				"audit_pid=%d old=%d by auid %u",
 				  audit_pid, old, loginuid);
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
@@ -383,13 +403,26 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 							loginuid);
 		break;
 	case AUDIT_USER:
-		audit_log_type(NULL, AUDIT_USER, pid,
+	case AUDIT_USER_AUTH:
+	case AUDIT_USER_ACCT:
+	case AUDIT_USER_MGMT:
+	case AUDIT_CRED_ACQ:
+	case AUDIT_CRED_DISP:
+	case AUDIT_USER_START:
+	case AUDIT_USER_END:
+	case AUDIT_USER_AVC:
+		ab = audit_log_start(NULL, msg_type);
+		if (!ab)
+			break;	/* audit_panic has been called */
+		audit_log_format(ab,
 				 "user pid=%d uid=%d length=%d loginuid=%u"
 				 " msg='%.1024s'",
 				 pid, uid,
 				 (int)(nlh->nlmsg_len
 				       - ((char *)data - (char *)nlh)),
 				 loginuid, (char *)data);
+		audit_set_pid(ab, pid);
+		audit_log_end(ab);
 		break;
 	case AUDIT_ADD:
 	case AUDIT_DEL:
@@ -504,7 +537,7 @@ static int __init audit_init(void)
 
 	audit_initialized = 1;
 	audit_enabled = audit_default;
-	audit_log(NULL, "initialized");
+	audit_log(NULL, AUDIT_KERNEL, "initialized");
 	return 0;
 }
 __initcall(audit_init);
@@ -541,10 +574,12 @@ static void audit_buffer_free(struct audit_buffer *ab)
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 }
 
-static struct audit_buffer * audit_buffer_alloc(int gfp_mask)
+static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+						int gfp_mask, int type)
 {
 	unsigned long flags;
 	struct audit_buffer *ab = NULL;
+	struct nlmsghdr *nlh;
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
 	if (!list_empty(&audit_freelist)) {
@@ -566,6 +601,12 @@ static struct audit_buffer * audit_buffer_alloc(int gfp_mask)
 	if (!ab->skb)
 		goto err;
 
+	ab->ctx   = ctx;
+	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
+	nlh->nlmsg_type = type;
+	nlh->nlmsg_flags = 0;
+	nlh->nlmsg_pid = 0;
+	nlh->nlmsg_seq = 0;
 	return ab;
 err:
 	audit_buffer_free(ab);
@@ -578,12 +619,11 @@ err:
  * syscall, then the syscall is marked as auditable and an audit record
  * will be written at syscall exit.  If there is no associated task, tsk
  * should be NULL. */
-struct audit_buffer *audit_log_start(struct audit_context *ctx, int type, int pid)
+struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
 {
 	struct audit_buffer	*ab	= NULL;
 	struct timespec		t;
 	unsigned int		serial;
-	struct nlmsghdr *nlh;
 
 	if (!audit_initialized)
 		return NULL;
@@ -600,19 +640,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type, int pi
 		return NULL;
 	}
 
-	ab = audit_buffer_alloc(GFP_ATOMIC);
+	ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
 	if (!ab) {
 		audit_log_lost("out of memory in audit_log_start");
 		return NULL;
 	}
 
-	ab->ctx   = ctx;
-	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
-	nlh->nlmsg_type = type;
-	nlh->nlmsg_flags = 0;
-	nlh->nlmsg_pid = pid;
-	nlh->nlmsg_seq = 0;
-
 	if (!audit_get_stamp(ab->ctx, &t, &serial)) {
 		t = CURRENT_TIME;
 		serial = 0;
@@ -809,13 +842,12 @@ void audit_log_end(struct audit_buffer *ab)
 /* Log an audit record.  This is a convenience function that calls
  * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
  * called in any context. */
-void audit_log_type(struct audit_context *ctx, int type, int pid,
-		    const char *fmt, ...)
+void audit_log(struct audit_context *ctx, int type, const char *fmt, ...)
 {
 	struct audit_buffer *ab;
 	va_list args;
 
-	ab = audit_log_start(ctx, type, pid);
+	ab = audit_log_start(ctx, type);
 	if (ab) {
 		va_start(args, fmt);
 		audit_log_vformat(ab, fmt, args);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d089263253a7..1b7c91f9d5ff 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -286,7 +286,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 			err = audit_add_rule(entry, &audit_entlist);
 		if (!err && (flags & AUDIT_AT_EXIT))
 			err = audit_add_rule(entry, &audit_extlist);
-		audit_log(NULL, "auid %u added an audit rule\n", loginuid);
+		audit_log(NULL, AUDIT_CONFIG_CHANGE, 
+				"auid %u added an audit rule\n", loginuid);
 		break;
 	case AUDIT_DEL:
 		flags =((struct audit_rule *)data)->flags;
@@ -296,7 +297,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 			err = audit_del_rule(data, &audit_entlist);
 		if (!err && (flags & AUDIT_AT_EXIT))
 			err = audit_del_rule(data, &audit_extlist);
-		audit_log(NULL, "auid %u removed an audit rule\n", loginuid);
+		audit_log(NULL, AUDIT_CONFIG_CHANGE,
+				"auid %u removed an audit rule\n", loginuid);
 		break;
 	default:
 		return -EINVAL;
@@ -648,7 +650,7 @@ static void audit_log_exit(struct audit_context *context)
 	int i;
 	struct audit_buffer *ab;
 
-	ab = audit_log_start(context, AUDIT_KERNEL, 0);
+	ab = audit_log_start(context, AUDIT_SYSCALL);
 	if (!ab)
 		return;		/* audit_panic has been called */
 	audit_log_format(ab, "syscall=%d", context->major);
@@ -680,28 +682,28 @@ static void audit_log_exit(struct audit_context *context)
 	while (context->aux) {
 		struct audit_aux_data *aux;
 
-		ab = audit_log_start(context, AUDIT_KERNEL, 0);
+		aux = context->aux;
+
+		ab = audit_log_start(context, aux->type);
 		if (!ab)
 			continue; /* audit_panic has been called */
 
-		aux = context->aux;
-		context->aux = aux->next;
-
-		audit_log_format(ab, "auxitem=%d", aux->type);
 		switch (aux->type) {
-		case AUDIT_AUX_IPCPERM: {
+		case AUDIT_IPC: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab, 
-					 " qbytes=%lx uid=%d gid=%d mode=%x",
+					 " qbytes=%lx iuid=%d igid=%d mode=%x",
 					 axi->qbytes, axi->uid, axi->gid, axi->mode);
 			}
 		}
 		audit_log_end(ab);
+
+		context->aux = aux->next;
 		kfree(aux);
 	}
 
 	for (i = 0; i < context->name_count; i++) {
-		ab = audit_log_start(context, AUDIT_KERNEL, 0);
+		ab = audit_log_start(context, AUDIT_PATH);
 		if (!ab)
 			continue; /* audit_panic has been called */
 		audit_log_format(ab, "item=%d", i);
@@ -711,7 +713,7 @@ static void audit_log_exit(struct audit_context *context)
 		}
 		if (context->names[i].ino != (unsigned long)-1)
 			audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
-					     " uid=%d gid=%d rdev=%02x:%02x",
+					     " ouid=%d ogid=%d rdev=%02x:%02x",
 					 context->names[i].ino,
 					 MAJOR(context->names[i].dev),
 					 MINOR(context->names[i].dev),
@@ -1008,10 +1010,16 @@ int audit_get_stamp(struct audit_context *ctx,
 int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 {
 	if (task->audit_context) {
-		audit_log_type(NULL, AUDIT_LOGIN, 0,
-			  "login pid=%d uid=%u old loginuid=%u new loginuid=%u",
-			  task->pid, task->uid, task->audit_context->loginuid,
-			  loginuid);
+		struct audit_buffer *ab;
+
+		ab = audit_log_start(NULL, AUDIT_LOGIN);
+		if (ab) {
+			audit_log_format(ab, "login pid=%d uid=%u "
+				"old loginuid=%u new loginuid=%u",
+				task->pid, task->uid, 
+				task->audit_context->loginuid, loginuid);
+			audit_log_end(ab);
+		}
 		task->audit_context->loginuid = loginuid;
 	}
 	return 0;
@@ -1039,7 +1047,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 	ax->gid = gid;
 	ax->mode = mode;
 
-	ax->d.type = AUDIT_AUX_IPCPERM;
+	ax->d.type = AUDIT_IPC;
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
 	return 0;
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 9e71a1bbe011..042f91e9f9d2 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -242,7 +242,7 @@ void __init avc_init(void)
 	avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node),
 					     0, SLAB_PANIC, NULL, NULL);
 
-	audit_log(current->audit_context, "AVC INITIALIZED\n");
+	audit_log(current->audit_context, AUDIT_KERNEL, "AVC INITIALIZED\n");
 }
 
 int avc_get_hash_stats(char *page)
@@ -549,7 +549,7 @@ void avc_audit(u32 ssid, u32 tsid,
 			return;
 	}
 
-	ab = audit_log_start(current->audit_context, AUDIT_KERNEL, 0);
+	ab = audit_log_start(current->audit_context, AUDIT_AVC);
 	if (!ab)
 		return;		/* audit_panic has been called */
 	audit_log_format(ab, "avc:  %s ", denied ? "denied" : "granted");
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index aae1e794fe48..db845cbd5841 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3419,7 +3419,7 @@ static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb)
 	err = selinux_nlmsg_lookup(isec->sclass, nlh->nlmsg_type, &perm);
 	if (err) {
 		if (err == -EINVAL) {
-			audit_log(current->audit_context,
+			audit_log(current->audit_context, AUDIT_SELINUX_ERR,
 				  "SELinux:  unrecognized netlink message"
 				  " type=%hu for sclass=%hu\n",
 				  nlh->nlmsg_type, isec->sclass);
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index deac14367d43..67e77acc4795 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -98,6 +98,14 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
 	{ AUDIT_DEL,		NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_USER,		NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 	{ AUDIT_SIGNAL_INFO,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
+	{ AUDIT_USER_AUTH,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
+	{ AUDIT_USER_ACCT,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
+	{ AUDIT_USER_MGMT,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
+	{ AUDIT_CRED_ACQ,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
+	{ AUDIT_CRED_DISP,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
+	{ AUDIT_USER_START,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
+	{ AUDIT_USER_END,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
+	{ AUDIT_USER_AVC,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 };
 
 
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 5a820cf88c9c..07fdf6ee6148 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -365,7 +365,7 @@ static int security_validtrans_handle_fail(struct context *ocontext,
 		goto out;
 	if (context_struct_to_string(tcontext, &t, &tlen) < 0)
 		goto out;
-	audit_log(current->audit_context,
+	audit_log(current->audit_context, AUDIT_SELINUX_ERR,
 	          "security_validate_transition:  denied for"
 	          " oldcontext=%s newcontext=%s taskcontext=%s tclass=%s",
 	          o, n, t, policydb.p_class_val_to_name[tclass-1]);
@@ -742,7 +742,7 @@ static int compute_sid_handle_invalid_context(
 		goto out;
 	if (context_struct_to_string(newcontext, &n, &nlen) < 0)
 		goto out;
-	audit_log(current->audit_context,
+	audit_log(current->audit_context, AUDIT_SELINUX_ERR,
 		  "security_compute_sid:  invalid context %s"
 		  " for scontext=%s"
 		  " tcontext=%s"
-- 
cgit v1.3-14-g43fede


From 23f32d18aa589e228c5a9e12e0d0c67c9b5bcdce Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Fri, 13 May 2005 18:35:15 +0100
Subject: AUDIT: Fix some spelling errors

I'm going through the kernel code and have a patch that corrects
several spelling errors in comments.

From: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h | 2 +-
 kernel/audit.c        | 4 ++--
 kernel/auditsc.c      | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1a15ba38c660..51e5879af7fc 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -181,7 +181,7 @@ struct audit_message {
 
 struct audit_status {
 	__u32		mask;		/* Bit mask for valid entries */
-	__u32		enabled;	/* 1 = enabled, 0 = disbaled */
+	__u32		enabled;	/* 1 = enabled, 0 = disabled */
 	__u32		failure;	/* Failure-to-log action */
 	__u32		pid;		/* pid of auditd process */
 	__u32		rate_limit;	/* messages rate limit (per second) */
diff --git a/kernel/audit.c b/kernel/audit.c
index 4e940c05ede8..74779d3769fa 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -116,7 +116,7 @@ static LIST_HEAD(audit_entlist);
 static LIST_HEAD(audit_extlist);
 
 /* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneiously in
+ * that list additions and deletions never happen simultaneously in
  * auditsc.c */
 static DECLARE_MUTEX(audit_netlink_sem);
 
@@ -775,7 +775,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	}
 }
 
-/* Remove queued messages from the audit_txlist and send them to userspace. */
+/* Remove queued messages from the audit_txlist and send them to user space. */
 static void audit_tasklet_handler(unsigned long arg)
 {
 	LIST_HEAD(list);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b7c91f9d5ff..773d28a3f701 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -444,7 +444,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
 
 /* At syscall entry and exit time, this filter is called if the
  * audit_state is not low enough that auditing cannot take place, but is
- * also not high enough that we already know we have to write and audit
+ * also not high enough that we already know we have to write an audit
  * record (i.e., the state is AUDIT_SETUP_CONTEXT or  AUDIT_BUILD_CONTEXT).
  */
 static enum audit_state audit_filter_syscall(struct task_struct *tsk,
@@ -750,7 +750,7 @@ void audit_free(struct task_struct *tsk)
 /* Compute a serial number for the audit record.  Audit records are
  * written to user-space as soon as they are generated, so a complete
  * audit record may be written in several pieces.  The timestamp of the
- * record and this serial number are used by the user-space daemon to
+ * record and this serial number are used by the user-space tools to
  * determine which pieces belong to the same audit record.  The
  * (timestamp,serial) tuple is unique for each syscall and is live from
  * syscall entry to syscall exit.
-- 
cgit v1.3-14-g43fede


From 5e014b10ef8477c32a939a48fa02aedcad35a226 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Fri, 13 May 2005 18:50:33 +0100
Subject: AUDIT: fix max_t thinko.

Der... if you use max_t it helps if you give it a type.

Note to self: Always just apply the tested patches, don't try to port
them by hand. You're not clever enough.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 74779d3769fa..a0e33b6897d7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -703,7 +703,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		/* The printk buffer is 1024 bytes long, so if we get
 		 * here and AUDIT_BUFSIZ is at least 1024, then we can
 		 * log everything that printk could have logged. */
-		avail = audit_expand(ab, max_t(AUDIT_BUFSIZ, 1+len-avail));
+		avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
 		if (!avail)
 			goto out;
 		len = vsnprintf(skb->tail, avail, fmt, args2);
-- 
cgit v1.3-14-g43fede


From 3ec3b2fba526ead2fa3f3d7c91924f39a0733749 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Tue, 17 May 2005 12:08:48 +0100
Subject: AUDIT: Capture sys_socketcall arguments and sockaddrs

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h |  7 ++++-
 kernel/auditsc.c      | 73 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/socket.c          |  9 +++++--
 3 files changed, 84 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 51e5879af7fc..2f5dc60f8bbd 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -69,8 +69,9 @@
 #define AUDIT_FS_WATCH		1301	/* Filesystem watch event */
 #define AUDIT_PATH		1302	/* Filname path information */
 #define AUDIT_IPC		1303	/* IPC record */
-#define AUDIT_SOCKET		1304	/* Socket record */
+#define AUDIT_SOCKETCALL	1304	/* sys_socketcall arguments */
 #define AUDIT_CONFIG_CHANGE	1305	/* Audit system configuration change */
+#define AUDIT_SOCKADDR		1306	/* sockaddr copied as syscall arg */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
@@ -235,6 +236,8 @@ extern int audit_get_stamp(struct audit_context *ctx,
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
 extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
+extern int audit_socketcall(int nargs, unsigned long *args);
+extern int audit_sockaddr(int len, void *addr);
 extern void audit_signal_info(int sig, struct task_struct *t);
 #else
 #define audit_alloc(t) ({ 0; })
@@ -248,6 +251,8 @@ extern void audit_signal_info(int sig, struct task_struct *t);
 #define audit_get_stamp(c,t,s) ({ 0; })
 #define audit_get_loginuid(c) ({ -1; })
 #define audit_ipc_perms(q,u,g,m) ({ 0; })
+#define audit_socketcall(n,a) ({ 0; })
+#define audit_sockaddr(len, addr) ({ 0; })
 #define audit_signal_info(s,t) do { ; } while (0)
 #endif
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 773d28a3f701..818778d5b6ad 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -34,7 +34,7 @@
 #include <asm/types.h>
 #include <linux/mm.h>
 #include <linux/module.h>
-
+#include <linux/socket.h>
 #include <linux/audit.h>
 #include <linux/personality.h>
 #include <linux/time.h>
@@ -112,6 +112,18 @@ struct audit_aux_data_ipcctl {
 	mode_t			mode;
 };
 
+struct audit_aux_data_socketcall {
+	struct audit_aux_data	d;
+	int			nargs;
+	unsigned long		args[0];
+};
+
+struct audit_aux_data_sockaddr {
+	struct audit_aux_data	d;
+	int			len;
+	char			a[0];
+};
+
 
 /* The per-task audit context. */
 struct audit_context {
@@ -694,7 +706,22 @@ static void audit_log_exit(struct audit_context *context)
 			audit_log_format(ab, 
 					 " qbytes=%lx iuid=%d igid=%d mode=%x",
 					 axi->qbytes, axi->uid, axi->gid, axi->mode);
-			}
+			break; }
+
+		case AUDIT_SOCKETCALL: {
+			int i;
+			struct audit_aux_data_socketcall *axs = (void *)aux;
+			audit_log_format(ab, "nargs=%d", axs->nargs);
+			for (i=0; i<axs->nargs; i++)
+				audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
+			break; }
+
+		case AUDIT_SOCKADDR: {
+			struct audit_aux_data_sockaddr *axs = (void *)aux;
+
+			audit_log_format(ab, "saddr=");
+			audit_log_hex(ab, axs->a, axs->len);
+			break; }
 		}
 		audit_log_end(ab);
 
@@ -1053,6 +1080,48 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 	return 0;
 }
 
+int audit_socketcall(int nargs, unsigned long *args)
+{
+	struct audit_aux_data_socketcall *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->nargs = nargs;
+	memcpy(ax->args, args, nargs * sizeof(unsigned long));
+
+	ax->d.type = AUDIT_SOCKETCALL;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
+int audit_sockaddr(int len, void *a)
+{
+	struct audit_aux_data_sockaddr *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->len = len;
+	memcpy(ax->a, a, len);
+
+	ax->d.type = AUDIT_SOCKADDR;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
 void audit_signal_info(int sig, struct task_struct *t)
 {
 	extern pid_t audit_sig_pid;
diff --git a/net/socket.c b/net/socket.c
index cec0cb38b9ce..6b7c3b51a7c1 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -81,6 +81,7 @@
 #include <linux/syscalls.h>
 #include <linux/compat.h>
 #include <linux/kmod.h>
+#include <linux/audit.h>
 
 #ifdef CONFIG_NET_RADIO
 #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
@@ -226,7 +227,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
 		return 0;
 	if(copy_from_user(kaddr,uaddr,ulen))
 		return -EFAULT;
-	return 0;
+	return audit_sockaddr(ulen, kaddr);
 }
 
 /**
@@ -1906,7 +1907,11 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 	/* copy_from_user should be SMP safe. */
 	if (copy_from_user(a, args, nargs[call]))
 		return -EFAULT;
-		
+
+	err = audit_socketcall(nargs[call]/sizeof(unsigned long), args);
+	if (err)
+		return err;
+
 	a0=a[0];
 	a1=a[1];
 	
-- 
cgit v1.3-14-g43fede


From 4f167fb491725ca0be9df0d76b4b2dd862cdfe0b Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin.zhang@intel.com>
Date: Mon, 16 May 2005 21:53:43 -0700
Subject: [PATCH] spurious interrupt fix

On my IA64 machine, after kernel 2.6.12-rc3 boots, an edge-triggered
interrupt (IRQ 46) keeps being triggered over and over again.  There is no
IRQ 46 interrupt action handler.  This has a large impact on performance.

Kernel 2.6.10 and earlier versions do not have this problem.  Basically,
kernel 2.6.10 will mask the spurious edge interrupt if the interrupt is
triggered for the second time and its status includes
IRQ_DISABLED|IRQ_PENDING.

Originally, the IA64 kernel had its own specific _irq_desc definition in
arch/ia64/kernel/irq.c.  That definition initialized
_irq_desc[irq].status to IRQ_DISABLED.  Since kernel 2.6.11, it has been
moved to architecture-independent code, i.e. kernel/irq/handle.c, but
kernel/irq/handle.c initializes _irq_desc[irq].status to 0 instead of
IRQ_DISABLED.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/handle.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 2fb0e46e11f3..06b5a6323998 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,6 +30,7 @@
  */
 irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
+		.status = IRQ_DISABLED,
 		.handler = &no_irq_type,
 		.lock = SPIN_LOCK_UNLOCKED
 	}
-- 
cgit v1.3-14-g43fede


From 3c0547ba8b3bbd8b26ae35e33ac17ff51f67f78c Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Mon, 16 May 2005 21:53:47 -0700
Subject: [PATCH] add_preferred_console() build fix

Move add_preferred_console out of CONFIG_PRINTK so serial console does the
right thing.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 72 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 290a07ce2c8a..01b58d7d17ff 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -160,42 +160,6 @@ static int __init console_setup(char *str)
 
 __setup("console=", console_setup);
 
-/**
- * add_preferred_console - add a device to the list of preferred consoles.
- *
- * The last preferred console added will be used for kernel messages
- * and stdin/out/err for init.  Normally this is used by console_setup
- * above to handle user-supplied console arguments; however it can also
- * be used by arch-specific code either to override the user or more
- * commonly to provide a default console (ie from PROM variables) when
- * the user has not supplied one.
- */
-int __init add_preferred_console(char *name, int idx, char *options)
-{
-	struct console_cmdline *c;
-	int i;
-
-	/*
-	 *	See if this tty is not yet registered, and
-	 *	if we have a slot free.
-	 */
-	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
-		if (strcmp(console_cmdline[i].name, name) == 0 &&
-			  console_cmdline[i].index == idx) {
-				selected_console = i;
-				return 0;
-		}
-	if (i == MAX_CMDLINECONSOLES)
-		return -E2BIG;
-	selected_console = i;
-	c = &console_cmdline[i];
-	memcpy(c->name, name, sizeof(c->name));
-	c->name[sizeof(c->name) - 1] = 0;
-	c->options = options;
-	c->index = idx;
-	return 0;
-}
-
 static int __init log_buf_len_setup(char *str)
 {
 	unsigned long size = memparse(str, &str);
@@ -670,6 +634,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {}
 
 #endif
 
+/**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init.  Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int __init add_preferred_console(char *name, int idx, char *options)
+{
+	struct console_cmdline *c;
+	int i;
+
+	/*
+	 *	See if this tty is not yet registered, and
+	 *	if we have a slot free.
+	 */
+	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+		if (strcmp(console_cmdline[i].name, name) == 0 &&
+			  console_cmdline[i].index == idx) {
+				selected_console = i;
+				return 0;
+		}
+	if (i == MAX_CMDLINECONSOLES)
+		return -E2BIG;
+	selected_console = i;
+	c = &console_cmdline[i];
+	memcpy(c->name, name, sizeof(c->name));
+	c->name[sizeof(c->name) - 1] = 0;
+	c->options = options;
+	c->index = idx;
+	return 0;
+}
+
 /**
  * acquire_console_sem - lock the console system for exclusive use.
  *
-- 
cgit v1.3-14-g43fede


From dfaa9c94b13071c9b5f8578d0ae99acc76c60139 Mon Sep 17 00:00:00 2001
From: William Lee Irwin III <wli@holomorphy.com>
Date: Mon, 16 May 2005 21:53:58 -0700
Subject: [PATCH] profile.c: `schedule' parsing fix

profile=schedule parsing is not quite what it should be.  First, str[7] is
'e', not ',', but then even if it did fall through, prof_on =
SCHED_PROFILING would be clobbered inside if (get_option(...)).  So a small
amount of rearrangement is done in this patch to correct it.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/profile.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index 0221a50ca867..ad8cbb75ffa2 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex);
 
 static int __init profile_setup(char * str)
 {
+	static char __initdata schedstr[] = "schedule";
 	int par;
 
-	if (!strncmp(str, "schedule", 8)) {
+	if (!strncmp(str, schedstr, strlen(schedstr))) {
 		prof_on = SCHED_PROFILING;
-		printk(KERN_INFO "kernel schedule profiling enabled\n");
-		if (str[7] == ',')
-			str += 8;
-	}
-	if (get_option(&str,&par)) {
+		if (str[strlen(schedstr)] == ',')
+			str += strlen(schedstr) + 1;
+		if (get_option(&str, &par))
+			prof_shift = par;
+		printk(KERN_INFO
+			"kernel schedule profiling enabled (shift: %ld)\n",
+			prof_shift);
+	} else if (get_option(&str, &par)) {
 		prof_shift = par;
 		prof_on = CPU_PROFILING;
 		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
-- 
cgit v1.3-14-g43fede


From 82428b62aa6294ea640c7e920a9224ecaf46db65 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Mon, 9 May 2005 08:07:00 -0700
Subject: [PATCH] Driver Core: pm diagnostics update, check for errors

This patch includes various tweaks in the messaging that appears during
system pm state transitions:

  * Warn about certain illegal calls in the device tree, like resuming
    child before parent or suspending parent before child.  This could
    happen easily enough through sysfs, or in some cases when drivers
    use device_pm_set_parent().

  * Be more consistent about dev_dbg() tracing ... do it for resume() and
    shutdown() too, and never if the driver doesn't have that method.

  * Say which type of system sleep state is being entered.

Except for the warnings, these only affect debug messaging.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/power/resume.c   | 11 ++++++++++-
 drivers/base/power/shutdown.c | 13 +++++++------
 drivers/base/power/suspend.c  | 17 +++++++++++++++--
 kernel/power/main.c           |  6 +++---
 4 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/power/resume.c b/drivers/base/power/resume.c
index f8f5055754d6..26468971ef5a 100644
--- a/drivers/base/power/resume.c
+++ b/drivers/base/power/resume.c
@@ -22,8 +22,17 @@ extern int sysdev_resume(void);
 
 int resume_device(struct device * dev)
 {
-	if (dev->bus && dev->bus->resume)
+	if (dev->power.pm_parent
+			&& dev->power.pm_parent->power.power_state) {
+		dev_err(dev, "PM: resume from %d, parent %s still %d\n",
+			dev->power.power_state,
+			dev->power.pm_parent->bus_id,
+			dev->power.pm_parent->power.power_state);
+	}
+	if (dev->bus && dev->bus->resume) {
+		dev_dbg(dev,"resuming\n");
 		return dev->bus->resume(dev);
+	}
 	return 0;
 }
 
diff --git a/drivers/base/power/shutdown.c b/drivers/base/power/shutdown.c
index d1e023fbe169..97979901c149 100644
--- a/drivers/base/power/shutdown.c
+++ b/drivers/base/power/shutdown.c
@@ -25,8 +25,10 @@ int device_detach_shutdown(struct device * dev)
 		return 0;
 
 	if (dev->detach_state == DEVICE_PM_OFF) {
-		if (dev->driver && dev->driver->shutdown)
+		if (dev->driver && dev->driver->shutdown) {
+			dev_dbg(dev, "shutdown\n");
 			dev->driver->shutdown(dev);
+		}
 		return 0;
 	}
 	return dpm_runtime_suspend(dev, dev->detach_state);
@@ -52,13 +54,12 @@ void device_shutdown(void)
 	struct device * dev;
 
 	down_write(&devices_subsys.rwsem);
-	list_for_each_entry_reverse(dev, &devices_subsys.kset.list, kobj.entry) {
-		pr_debug("shutting down %s: ", dev->bus_id);
+	list_for_each_entry_reverse(dev, &devices_subsys.kset.list,
+				kobj.entry) {
 		if (dev->driver && dev->driver->shutdown) {
-			pr_debug("Ok\n");
+			dev_dbg(dev, "shutdown\n");
 			dev->driver->shutdown(dev);
-		} else
-			pr_debug("Ignored.\n");
+		}
 	}
 	up_write(&devices_subsys.rwsem);
 
diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c
index a0b5cf689e63..0ec44ef840be 100644
--- a/drivers/base/power/suspend.c
+++ b/drivers/base/power/suspend.c
@@ -39,12 +39,25 @@ int suspend_device(struct device * dev, pm_message_t state)
 {
 	int error = 0;
 
-	dev_dbg(dev, "suspending\n");
+	if (dev->power.power_state) {
+		dev_dbg(dev, "PM: suspend %d-->%d\n",
+			dev->power.power_state, state);
+	}
+	if (dev->power.pm_parent
+			&& dev->power.pm_parent->power.power_state) {
+		dev_err(dev,
+			"PM: suspend %d->%d, parent %s already %d\n",
+			dev->power.power_state, state,
+			dev->power.pm_parent->bus_id,
+			dev->power.pm_parent->power.power_state);
+	}
 
 	dev->power.prev_state = dev->power.power_state;
 
-	if (dev->bus && dev->bus->suspend && !dev->power.power_state)
+	if (dev->bus && dev->bus->suspend && !dev->power.power_state) {
+		dev_dbg(dev, "suspending\n");
 		error = dev->bus->suspend(dev, state);
+	}
 
 	return error;
 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7960ddf04a57..4cdebc972ff2 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state)
 		goto Unlock;
 	}
 
-	pr_debug("PM: Preparing system for suspend\n");
+	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
 	if ((error = suspend_prepare(state)))
 		goto Unlock;
 
-	pr_debug("PM: Entering state.\n");
+	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
 	error = suspend_enter(state);
 
-	pr_debug("PM: Finishing up.\n");
+	pr_debug("PM: Finishing wakeup.\n");
 	suspend_finish(state);
  Unlock:
 	up(&pm_sem);
-- 
cgit v1.3-14-g43fede


From 209aba03243ee42a22f8df8d08aa9963f62aec64 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Wed, 18 May 2005 10:21:07 +0100
Subject: AUDIT: Treat all user messages identically.

It's silly to have to add explicit entries for new userspace messages
as we invent them. Just treat all messages in the user range the same.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h       | 17 ++---------------
 kernel/audit.c              | 20 ++------------------
 security/selinux/nlmsgtab.c | 17 +++++++----------
 3 files changed, 11 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 2f5dc60f8bbd..17ea5d522d81 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -51,14 +51,8 @@
 #define AUDIT_WATCH_LIST	1009	/* List all file/dir watches */
 #define AUDIT_SIGNAL_INFO	1010	/* Get info about sender of signal to auditd */
 
-#define AUDIT_USER_AUTH		1100	/* User space authentication */
-#define AUDIT_USER_ACCT		1101	/* User space acct change */
-#define AUDIT_USER_MGMT		1102	/* User space acct management */
-#define AUDIT_CRED_ACQ		1103	/* User space credential acquired */
-#define AUDIT_CRED_DISP		1104	/* User space credential disposed */
-#define AUDIT_USER_START	1105	/* User space session start */ 
-#define AUDIT_USER_END		1106	/* User space session end */
-#define AUDIT_USER_AVC		1107	/* User space avc message */
+#define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages uninteresting to kernel */
+#define AUDIT_LAST_USER_MSG	1199
  
 #define AUDIT_DAEMON_START      1200    /* Daemon startup record */
 #define AUDIT_DAEMON_END        1201    /* Daemon normal stop record */
@@ -173,13 +167,6 @@
 #define AUDIT_ARCH_V850		(EM_V850|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 
-#ifndef __KERNEL__
-struct audit_message {
-	struct nlmsghdr nlh;
-	char		data[1200];
-};
-#endif
-
 struct audit_status {
 	__u32		mask;		/* Bit mask for valid entries */
 	__u32		enabled;	/* 1 = enabled, 0 = disabled */
diff --git a/kernel/audit.c b/kernel/audit.c
index a0e33b6897d7..e6d88635032c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -325,15 +325,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
 			err = -EPERM;
 		break;
-	case AUDIT_USER:
-	case AUDIT_USER_AUTH:
-	case AUDIT_USER_ACCT:
-	case AUDIT_USER_MGMT:
-	case AUDIT_CRED_ACQ:
-	case AUDIT_CRED_DISP:
-	case AUDIT_USER_START:
-	case AUDIT_USER_END:
-	case AUDIT_USER_AVC:
+	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
 		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
 			err = -EPERM;
 		break;
@@ -402,15 +394,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_set_backlog_limit(status_get->backlog_limit,
 							loginuid);
 		break;
-	case AUDIT_USER:
-	case AUDIT_USER_AUTH:
-	case AUDIT_USER_ACCT:
-	case AUDIT_USER_MGMT:
-	case AUDIT_CRED_ACQ:
-	case AUDIT_CRED_DISP:
-	case AUDIT_USER_START:
-	case AUDIT_USER_END:
-	case AUDIT_USER_AVC:
+	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
 		ab = audit_log_start(NULL, msg_type);
 		if (!ab)
 			break;	/* audit_panic has been called */
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 67e77acc4795..f0fb6d76f7c5 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -98,14 +98,6 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
 	{ AUDIT_DEL,		NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_USER,		NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 	{ AUDIT_SIGNAL_INFO,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
-	{ AUDIT_USER_AUTH,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
-	{ AUDIT_USER_ACCT,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
-	{ AUDIT_USER_MGMT,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
-	{ AUDIT_CRED_ACQ,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
-	{ AUDIT_CRED_DISP,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
-	{ AUDIT_USER_START,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
-	{ AUDIT_USER_END,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
-	{ AUDIT_USER_AVC,	NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 };
 
 
@@ -150,8 +142,13 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
 		break;
 
 	case SECCLASS_NETLINK_AUDIT_SOCKET:
-		err = nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms,
-				 sizeof(nlmsg_audit_perms));
+		if (nlmsg_type >= AUDIT_FIRST_USER_MSG &&
+		    nlmsg_type <= AUDIT_LAST_USER_MSG) {
+			*perm = NETLINK_AUDIT_SOCKET__NLMSG_RELAY;
+		} else {
+			err = nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms,
+					 sizeof(nlmsg_audit_perms));
+		}
 		break;
 
 	/* No messaging from userspace, or class unknown/unhandled */
-- 
cgit v1.3-14-g43fede


From 168b7173959f80d20720dd1f7ec909a88ef2689d Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Thu, 19 May 2005 10:24:22 +0100
Subject: AUDIT: Clean up logging of untrusted strings

* If vsnprintf returns -1, it will mess up the sk buffer space accounting.
This is fixed by not calling skb_put with bogus len values.

* audit_log_hex was a loop that called audit_log_format with %02x for each
character. This is very inefficient since conversion from unsigned character
to ASCII representation is essentially masking, shifting, and byte lookups.
Also, the length of the converted string is well known - it's twice the
original. Fixed by rewriting the function.

* audit_log_untrustedstring had no comments. This makes it hard for
someone to understand what the string format will be.

* audit_log_d_path was never fixed to use untrustedstring. This could mess
up user space parsers. This was fixed to make a temp buffer, call d_path,
and log temp buffer using untrustedstring.

From: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 71 +++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 48 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index e6d88635032c..dae3570b3a3b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -692,7 +692,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 			goto out;
 		len = vsnprintf(skb->tail, avail, fmt, args2);
 	}
-	skb_put(skb, (len < avail) ? len : avail);
+	if (len > 0)
+		skb_put(skb, len);
 out:
 	return;
 }
@@ -710,20 +711,47 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
 	va_end(args);
 }
 
-void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len)
+/* This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb. */
+void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, 
+		size_t len)
 {
-	int i;
+	int i, avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+	static const unsigned char *hex = "0123456789ABCDEF";
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = len<<1;
+	if (new_len >= avail) {
+		/* Round the buffer request up to the next multiple */
+		new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
 
-	for (i=0; i<len; i++)
-		audit_log_format(ab, "%02x", buf[i]);
+	ptr = skb->tail;
+	for (i=0; i<len; i++) {
+		*ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
+		*ptr++ = hex[buf[i] & 0x0F];	  /* Lower nibble */
+	}
+	*ptr = 0;
+	skb_put(skb, len << 1); /* new string is twice the old string */
 }
 
+/* This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark, 
+ * or a space. Unescaped strings will start and end with a double quote mark.
+ * Strings that are escaped are printed in hex (2 digits per char). */
 void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 {
 	const unsigned char *p = string;
 
 	while (*p) {
-		if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) {
+		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
 			audit_log_hex(ab, string, strlen(string));
 			return;
 		}
@@ -732,31 +760,28 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 	audit_log_format(ab, "\"%s\"", string);
 }
 
-
-/* This is a helper-function to print the d_path without using a static
- * buffer or allocating another buffer in addition to the one in
- * audit_buffer. */
+/* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 		      struct dentry *dentry, struct vfsmount *vfsmnt)
 {
-	char *p;
-	struct sk_buff *skb = ab->skb;
-	int  len, avail;
+	char *p, *path;
 
 	if (prefix)
 		audit_log_format(ab, " %s", prefix);
 
-	avail = skb_tailroom(skb);
-	p = d_path(dentry, vfsmnt, skb->tail, avail);
-	if (IS_ERR(p)) {
-		/* FIXME: can we save some information here? */
-		audit_log_format(ab, "<toolong>");
-	} else {
-		/* path isn't at start of buffer */
-		len = ((char *)skb->tail + avail - 1) - p;
-		memmove(skb->tail, p, len);
-		skb_put(skb, len);
+	/* We will allow 11 spaces for ' (deleted)' to be appended */
+	path = kmalloc(PATH_MAX+11, GFP_KERNEL);
+	if (!path) {
+		audit_log_format(ab, "<no memory>");
+		return;
 	}
+	p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
+		/* FIXME: can we save some information here? */
+		audit_log_format(ab, "<too long>");
+	} else 
+		audit_log_untrustedstring(ab, p);
+	kfree(path);
 }
 
 /* Remove queued messages from the audit_txlist and send them to user space. */
-- 
cgit v1.3-14-g43fede


From b7d1125817c9a46cc46f57db89d9c195e7af22f8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Thu, 19 May 2005 10:56:58 +0100
Subject: AUDIT: Send netlink messages from a separate kernel thread

netlink_unicast() will attempt to reallocate and will free messages if
the socket's rcvbuf limit is reached unless we give it an infinite
timeout. So do that, from a kernel thread which is dedicated to spewing
stuff up the netlink socket.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 191 +++++++++++++++++++++------------------------------------
 1 file changed, 70 insertions(+), 121 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index dae3570b3a3b..bbc6f542c8f7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,8 @@
 #include <asm/types.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
 
 #include <linux/audit.h>
 
@@ -77,7 +79,6 @@ static int	audit_rate_limit;
 
 /* Number of outstanding audit_buffers allowed. */
 static int	audit_backlog_limit = 64;
-static atomic_t	audit_backlog	    = ATOMIC_INIT(0);
 
 /* The identity of the user shutting down the audit system. */
 uid_t		audit_sig_uid = -1;
@@ -95,19 +96,17 @@ static atomic_t    audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
-/* There are two lists of audit buffers.  The txlist contains audit
- * buffers that cannot be sent immediately to the netlink device because
- * we are in an irq context (these are sent later in a tasklet).
- *
- * The second list is a list of pre-allocated audit buffers (if more
+/* The audit_freelist is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_txlist_lock);
 static DEFINE_SPINLOCK(audit_freelist_lock);
 static int	   audit_freelist_count = 0;
-static LIST_HEAD(audit_txlist);
 static LIST_HEAD(audit_freelist);
 
+static struct sk_buff_head audit_skb_queue;
+static struct task_struct *kauditd_task;
+static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+
 /* There are three lists of rules -- one to search at task creation
  * time, one to search at syscall entry time, and another to search at
  * syscall exit time. */
@@ -151,9 +150,6 @@ struct audit_entry {
 	struct audit_rule rule;
 };
 
-static void audit_log_end_irq(struct audit_buffer *ab);
-static void audit_log_end_fast(struct audit_buffer *ab);
-
 static void audit_panic(const char *message)
 {
 	switch (audit_failure)
@@ -224,10 +220,8 @@ void audit_log_lost(const char *message)
 
 	if (print) {
 		printk(KERN_WARNING
-		       "audit: audit_lost=%d audit_backlog=%d"
-		       " audit_rate_limit=%d audit_backlog_limit=%d\n",
+		       "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n",
 		       atomic_read(&audit_lost),
-		       atomic_read(&audit_backlog),
 		       audit_rate_limit,
 		       audit_backlog_limit);
 		audit_panic(message);
@@ -281,6 +275,38 @@ static int audit_set_failure(int state, uid_t loginuid)
 	return old;
 }
 
+int kauditd_thread(void *dummy)
+{
+	struct sk_buff *skb;
+
+	while (1) {
+		skb = skb_dequeue(&audit_skb_queue);
+		if (skb) {
+			if (audit_pid) {
+				int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
+				if (err < 0) {
+					BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+					printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+					audit_pid = 0;
+				}
+			} else {
+				printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0));
+				kfree_skb(skb);
+			}
+		} else {
+			DECLARE_WAITQUEUE(wait, current);
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&kauditd_wait, &wait);
+
+			if (!skb_queue_len(&audit_skb_queue))
+				schedule();
+
+			__set_current_state(TASK_RUNNING);
+			remove_wait_queue(&kauditd_wait, &wait);
+		}
+	}
+}
+
 void audit_send_reply(int pid, int seq, int type, int done, int multi,
 		      void *payload, int size)
 {
@@ -293,13 +319,16 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
 
 	skb = alloc_skb(len, GFP_KERNEL);
 	if (!skb)
-		goto nlmsg_failure;
+		return;
 
-	nlh		 = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+	nlh		 = NLMSG_PUT(skb, pid, seq, t, size);
 	nlh->nlmsg_flags = flags;
 	data		 = NLMSG_DATA(nlh);
 	memcpy(data, payload, size);
-	netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+
+	/* Ignore failure. It'll only happen if the sender goes away,
+	   because our timeout is set to infinite. */
+	netlink_unicast(audit_sock, skb, pid, 0);
 	return;
 
 nlmsg_failure:			/* Used by NLMSG_PUT */
@@ -351,6 +380,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (err)
 		return err;
 
+	/* As soon as there's any sign of userspace auditd, start kauditd to talk to it */
+	if (!kauditd_task)
+		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+	if (IS_ERR(kauditd_task)) {
+		err = PTR_ERR(kauditd_task);
+		kauditd_task = NULL;
+		return err;
+	}
+
 	pid  = NETLINK_CREDS(skb)->pid;
 	uid  = NETLINK_CREDS(skb)->uid;
 	loginuid = NETLINK_CB(skb).loginuid;
@@ -365,7 +403,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		status_set.rate_limit	 = audit_rate_limit;
 		status_set.backlog_limit = audit_backlog_limit;
 		status_set.lost		 = atomic_read(&audit_lost);
-		status_set.backlog	 = atomic_read(&audit_backlog);
+		status_set.backlog	 = skb_queue_len(&audit_skb_queue);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
 				 &status_set, sizeof(status_set));
 		break;
@@ -471,44 +509,6 @@ static void audit_receive(struct sock *sk, int length)
 	up(&audit_netlink_sem);
 }
 
-/* Grab skbuff from the audit_buffer and send to user space. */
-static inline int audit_log_drain(struct audit_buffer *ab)
-{
-	struct sk_buff *skb = ab->skb;
-
-	if (skb) {
-		int retval = 0;
-
-		if (audit_pid) {
-			struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
-			nlh->nlmsg_len = skb->len - NLMSG_SPACE(0);
-			skb_get(skb); /* because netlink_* frees */
-			retval = netlink_unicast(audit_sock, skb, audit_pid,
-						 MSG_DONTWAIT);
-		}
-		if (retval == -EAGAIN &&
-		    (atomic_read(&audit_backlog)) < audit_backlog_limit) {
-			audit_log_end_irq(ab);
-			return 1;
-		}
-		if (retval < 0) {
-			if (retval == -ECONNREFUSED) {
-				printk(KERN_ERR
-				       "audit: *NO* daemon at audit_pid=%d\n",
-				       audit_pid);
-				audit_pid = 0;
-			} else
-				audit_log_lost("netlink socket too busy");
-		}
-		if (!audit_pid) { /* No daemon */
-			int offset = NLMSG_SPACE(0);
-			int len    = skb->len - offset;
-			skb->data[offset + len] = '\0';
-			printk(KERN_ERR "%s\n", skb->data + offset);
-		}
-	}
-	return 0;
-}
 
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
@@ -519,6 +519,8 @@ static int __init audit_init(void)
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 
+	audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+	skb_queue_head_init(&audit_skb_queue);
 	audit_initialized = 1;
 	audit_enabled = audit_default;
 	audit_log(NULL, AUDIT_KERNEL, "initialized");
@@ -549,7 +551,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
 
 	if (ab->skb)
 		kfree_skb(ab->skb);
-	atomic_dec(&audit_backlog);
+
 	spin_lock_irqsave(&audit_freelist_lock, flags);
 	if (++audit_freelist_count > AUDIT_MAXFREE)
 		kfree(ab);
@@ -579,13 +581,12 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 		if (!ab)
 			goto err;
 	}
-	atomic_inc(&audit_backlog);
 
 	ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
 	if (!ab->skb)
 		goto err;
 
-	ab->ctx   = ctx;
+	ab->ctx = ctx;
 	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
 	nlh->nlmsg_type = type;
 	nlh->nlmsg_flags = 0;
@@ -612,18 +613,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
 	if (!audit_initialized)
 		return NULL;
 
-	if (audit_backlog_limit
-	    && atomic_read(&audit_backlog) > audit_backlog_limit) {
-		if (audit_rate_check())
-			printk(KERN_WARNING
-			       "audit: audit_backlog=%d > "
-			       "audit_backlog_limit=%d\n",
-			       atomic_read(&audit_backlog),
-			       audit_backlog_limit);
-		audit_log_lost("backlog limit exceeded");
-		return NULL;
-	}
-
 	ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
 	if (!ab) {
 		audit_log_lost("out of memory in audit_log_start");
@@ -784,70 +773,30 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	kfree(path);
 }
 
-/* Remove queued messages from the audit_txlist and send them to user space. */
-static void audit_tasklet_handler(unsigned long arg)
-{
-	LIST_HEAD(list);
-	struct audit_buffer *ab;
-	unsigned long	    flags;
-
-	spin_lock_irqsave(&audit_txlist_lock, flags);
-	list_splice_init(&audit_txlist, &list);
-	spin_unlock_irqrestore(&audit_txlist_lock, flags);
-
-	while (!list_empty(&list)) {
-		ab = list_entry(list.next, struct audit_buffer, list);
-		list_del(&ab->list);
-		audit_log_end_fast(ab);
-	}
-}
-
-static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
-
 /* The netlink_* functions cannot be called inside an irq context, so
  * the audit buffer is places on a queue and a tasklet is scheduled to
  * remove them from the queue outside the irq context.  May be called in
  * any context. */
-static void audit_log_end_irq(struct audit_buffer *ab)
-{
-	unsigned long flags;
-
-	if (!ab)
-		return;
-	spin_lock_irqsave(&audit_txlist_lock, flags);
-	list_add_tail(&ab->list, &audit_txlist);
-	spin_unlock_irqrestore(&audit_txlist_lock, flags);
-
-	tasklet_schedule(&audit_tasklet);
-}
-
-/* Send the message in the audit buffer directly to user space.  May not
- * be called in an irq context. */
-static void audit_log_end_fast(struct audit_buffer *ab)
+void audit_log_end(struct audit_buffer *ab)
 {
-	BUG_ON(in_irq());
 	if (!ab)
 		return;
 	if (!audit_rate_check()) {
 		audit_log_lost("rate limit exceeded");
 	} else {
-		if (audit_log_drain(ab))
-			return;
+		if (audit_pid) {
+			struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+			nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+			skb_queue_tail(&audit_skb_queue, ab->skb);
+			ab->skb = NULL;
+			wake_up_interruptible(&kauditd_wait);
+		} else {
+			printk("%s\n", ab->skb->data + NLMSG_SPACE(0));
+		}
 	}
 	audit_buffer_free(ab);
 }
 
-/* Send or queue the message in the audit buffer, depending on the
- * current context.  (A convenience function that may be called in any
- * context.) */
-void audit_log_end(struct audit_buffer *ab)
-{
-	if (in_irq())
-		audit_log_end_irq(ab);
-	else
-		audit_log_end_fast(ab);
-}
-
 /* Log an audit record.  This is a convenience function that calls
  * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
  * called in any context. */
-- 
cgit v1.3-14-g43fede


From 7ca0026495dbb644b4e32ede76be44072cb2bc7a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Thu, 19 May 2005 11:23:13 +0100
Subject: AUDIT: Quis Custodiet Ipsos Custodes?

Nobody does. Really, it gets very silly if auditd is recording its
own actions.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/auditsc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 818778d5b6ad..78d7a13fc86f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -169,6 +169,8 @@ struct audit_entry {
 	struct audit_rule rule;
 };
 
+extern int audit_pid;
+
 /* Check to see if two rules are identical.  It is called from
  * audit_del_rule during AUDIT_DEL. */
 static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
@@ -768,7 +770,7 @@ void audit_free(struct task_struct *tsk)
 
 	/* Check for system calls that do not go through the exit
 	 * function (e.g., exit_group), then free context block. */
-	if (context->in_syscall && context->auditable)
+	if (context->in_syscall && context->auditable && context->pid != audit_pid)
 		audit_log_exit(context);
 
 	audit_free_context(context);
@@ -903,7 +905,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 	if (likely(!context))
 		return;
 
-	if (context->in_syscall && context->auditable)
+	if (context->in_syscall && context->auditable && context->pid != audit_pid)
 		audit_log_exit(context);
 
 	context->in_syscall = 0;
@@ -1126,7 +1128,6 @@ void audit_signal_info(int sig, struct task_struct *t)
 {
 	extern pid_t audit_sig_pid;
 	extern uid_t audit_sig_uid;
-	extern int audit_pid;
 
 	if (unlikely(audit_pid && t->pid == audit_pid)) {
 		if (sig == SIGTERM || sig == SIGHUP) {
-- 
cgit v1.3-14-g43fede


From fb19b4c6aa024837a0071f07baa07dbf49d07151 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Thu, 19 May 2005 14:55:56 +0100
Subject: AUDIT: Honour audit_backlog_limit again.

The limit on the number of outstanding audit messages was inadvertently
removed with the switch to queuing skbs directly for sending by a kernel
thread. Put it back again.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index bbc6f542c8f7..41581413529c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -613,6 +613,18 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
 	if (!audit_initialized)
 		return NULL;
 
+	if (audit_backlog_limit
+	    && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+		if (audit_rate_check())
+			printk(KERN_WARNING
+			       "audit: audit_backlog=%d > "
+			       "audit_backlog_limit=%d\n",
+			       skb_queue_len(&audit_skb_queue),
+			       audit_backlog_limit);
+		audit_log_lost("backlog limit exceeded");
+		return NULL;
+	}
+
 	ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
 	if (!ab) {
 		audit_log_lost("out of memory in audit_log_start");
-- 
cgit v1.3-14-g43fede


From b39c4fab259b216148e705344a892c96efe1946d Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 20 May 2005 13:59:15 -0700
Subject: [PATCH] cpusets+hotplug+preempt broken

This patch removes the entwining of cpusets and hotplug code in the "No
more Mr.  Nice Guy" case of sched.c move_task_off_dead_cpu().

Since the hotplug code is holding a spinlock at this point, we cannot take
the cpuset semaphore, cpuset_sem, as would seem to be required either to
update the task's cpuset, or to scan up the nested cpuset chain, looking for
the nearest cpuset ancestor that still has some CPUs that are online.  So
we just punt and blast the task's cpus_allowed with all bits allowed.

This reverts these lines of code to what they were before the cpuset patch,
and it updates the cpuset Doc file to match.

The one known alternative to this that seems to work came from Dinakar
Guniguntala, and required the hotplug code to take the cpuset_sem semaphore
much earlier in its processing.  So far as we know, the increased locking
entanglement between cpusets and hot plug of this alternative approach is
not worth doing in this case.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Nathan Lynch <ntl@pobox.com>
Acked-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt | 3 +--
 kernel/sched.c            | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 1ad26d2c20ae..2f8f24eaefd9 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -252,8 +252,7 @@ in a tasks processor placement.
 There is an exception to the above.  If hotplug funtionality is used
 to remove all the CPUs that are currently assigned to a cpuset,
 then the kernel will automatically update the cpus_allowed of all
-tasks attached to CPUs in that cpuset with the online CPUs of the
-nearest parent cpuset that still has some CPUs online.  When memory
+tasks attached to CPUs in that cpuset to allow all CPUs.  When memory
 hotplug functionality for removing Memory Nodes is available, a
 similar exception is expected to apply there as well.  In general,
 the kernel prefers to violate cpuset placement, over starving a task
diff --git a/kernel/sched.c b/kernel/sched.c
index 0dc3158667a2..66b2ed784822 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4243,7 +4243,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
 
 	/* No more Mr. Nice Guy. */
 	if (dest_cpu == NR_CPUS) {
-		tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+		cpus_setall(tsk->cpus_allowed);
 		dest_cpu = any_online_cpu(tsk->cpus_allowed);
 
 		/*
-- 
cgit v1.3-14-g43fede


From 011161051bbc25f7f8b7df059dbd934c534443f0 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Sat, 21 May 2005 00:15:52 +0100
Subject: AUDIT: Avoid sleeping function in SELinux AVC audit.

This patch changes the SELinux AVC to defer logging of paths to the audit
framework upon syscall exit, by saving a reference to the (dentry,vfsmount)
pair in an auxiliary audit item on the current audit context for processing
by audit_log_exit.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h  |  3 +++
 kernel/auditsc.c       | 40 ++++++++++++++++++++++++++++++++++++++++
 security/selinux/avc.c | 17 ++++++++---------
 3 files changed, 51 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 17ea5d522d81..4b7caf0c6e10 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -69,6 +69,7 @@
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
+#define AUDIT_AVC_PATH		1402	/* dentry, vfsmount pair from avc */
 
 #define AUDIT_KERNEL		2000	/* Asynchronous audit record. NOT A REQUEST. */
 
@@ -225,6 +226,7 @@ extern uid_t audit_get_loginuid(struct audit_context *ctx);
 extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
+extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern void audit_signal_info(int sig, struct task_struct *t);
 #else
 #define audit_alloc(t) ({ 0; })
@@ -240,6 +242,7 @@ extern void audit_signal_info(int sig, struct task_struct *t);
 #define audit_ipc_perms(q,u,g,m) ({ 0; })
 #define audit_socketcall(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
+#define audit_avc_path(dentry, mnt) ({ 0; })
 #define audit_signal_info(s,t) do { ; } while (0)
 #endif
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 78d7a13fc86f..8dc5b2767145 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -34,6 +34,7 @@
 #include <asm/types.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/socket.h>
 #include <linux/audit.h>
 #include <linux/personality.h>
@@ -124,6 +125,11 @@ struct audit_aux_data_sockaddr {
 	char			a[0];
 };
 
+struct audit_aux_data_path {
+	struct audit_aux_data	d;
+	struct dentry		*dentry;
+	struct vfsmount		*mnt;
+};
 
 /* The per-task audit context. */
 struct audit_context {
@@ -553,6 +559,11 @@ static inline void audit_free_aux(struct audit_context *context)
 	struct audit_aux_data *aux;
 
 	while ((aux = context->aux)) {
+		if (aux->type == AUDIT_AVC_PATH) {
+			struct audit_aux_data_path *axi = (void *)aux;
+			dput(axi->dentry);
+			mntput(axi->mnt);
+		}
 		context->aux = aux->next;
 		kfree(aux);
 	}
@@ -724,6 +735,14 @@ static void audit_log_exit(struct audit_context *context)
 			audit_log_format(ab, "saddr=");
 			audit_log_hex(ab, axs->a, axs->len);
 			break; }
+
+		case AUDIT_AVC_PATH: {
+			struct audit_aux_data_path *axi = (void *)aux;
+			audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
+			dput(axi->dentry);
+			mntput(axi->mnt);
+			break; }
+
 		}
 		audit_log_end(ab);
 
@@ -1124,6 +1143,27 @@ int audit_sockaddr(int len, void *a)
 	return 0;
 }
 
+int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
+{
+	struct audit_aux_data_path *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->dentry = dget(dentry);
+	ax->mnt = mntget(mnt);
+
+	ax->d.type = AUDIT_AVC_PATH;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
 void audit_signal_info(int sig, struct task_struct *t)
 {
 	extern pid_t audit_sig_pid;
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 62b963aca275..0fbc3e98c5ea 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -573,13 +573,10 @@ void avc_audit(u32 ssid, u32 tsid,
 		case AVC_AUDIT_DATA_FS:
 			if (a->u.fs.dentry) {
 				struct dentry *dentry = a->u.fs.dentry;
-				if (a->u.fs.mnt) {
-					audit_log_d_path(ab, "path=", dentry,
-							a->u.fs.mnt);
-				} else {
-					audit_log_format(ab, " name=%s",
-							 dentry->d_name.name);
-				}
+				if (a->u.fs.mnt)
+					audit_avc_path(dentry, a->u.fs.mnt);
+				audit_log_format(ab, " name=%s",
+						 dentry->d_name.name);
 				inode = dentry->d_inode;
 			} else if (a->u.fs.inode) {
 				struct dentry *dentry;
@@ -630,8 +627,10 @@ void avc_audit(u32 ssid, u32 tsid,
 				case AF_UNIX:
 					u = unix_sk(sk);
 					if (u->dentry) {
-						audit_log_d_path(ab, "path=",
-							u->dentry, u->mnt);
+						audit_avc_path(u->dentry, u->mnt);
+						audit_log_format(ab, " name=%s",
+								 u->dentry->d_name.name);
+
 						break;
 					}
 					if (!u->addr)
-- 
cgit v1.3-14-g43fede


From 05474106a41f44d16d649bc8c7687fc24ce4370a Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Sat, 21 May 2005 00:18:37 +0100
Subject: AUDIT: Fix AVC_USER message passing.

The original AVC_USER message wasn't consolidated with the new range of
user messages. The attached patch fixes the kernel so the old messages
work again.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 41581413529c..5e72895f4826 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -354,6 +354,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
 			err = -EPERM;
 		break;
+	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
 		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
 			err = -EPERM;
@@ -432,6 +433,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_set_backlog_limit(status_get->backlog_limit,
 							loginuid);
 		break;
+	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
 		ab = audit_log_start(NULL, msg_type);
 		if (!ab)
-- 
cgit v1.3-14-g43fede


From 326e9c8ba6a149f47e020719b23b24a14ba740d6 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Sat, 21 May 2005 00:22:31 +0100
Subject: AUDIT: Fix inconsistent use of loginuid vs. auid, signed vs. unsigned

The attached patch changes all occurrences of loginuid to auid. It also
changes the format specifier to %u for every value of an unsigned type.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c   |  7 ++-----
 kernel/auditsc.c | 12 ++++++------
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 5e72895f4826..f0a003acf621 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -439,12 +439,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (!ab)
 			break;	/* audit_panic has been called */
 		audit_log_format(ab,
-				 "user pid=%d uid=%d length=%d loginuid=%u"
+				 "user pid=%d uid=%u auid=%u"
 				 " msg='%.1024s'",
-				 pid, uid,
-				 (int)(nlh->nlmsg_len
-				       - ((char *)data - (char *)nlh)),
-				 loginuid, (char *)data);
+				 pid, uid, loginuid, (char *)data);
 		audit_set_pid(ab, pid);
 		audit_log_end(ab);
 		break;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8dc5b2767145..4193811d4fe1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -688,9 +688,9 @@ static void audit_log_exit(struct audit_context *context)
 				 context->return_code);
 	audit_log_format(ab,
 		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
-		  " pid=%d loginuid=%d uid=%d gid=%d"
-		  " euid=%d suid=%d fsuid=%d"
-		  " egid=%d sgid=%d fsgid=%d",
+		  " pid=%d auid=%u uid=%u gid=%u"
+		  " euid=%u suid=%u fsuid=%u"
+		  " egid=%u sgid=%u fsgid=%u",
 		  context->argv[0],
 		  context->argv[1],
 		  context->argv[2],
@@ -717,7 +717,7 @@ static void audit_log_exit(struct audit_context *context)
 		case AUDIT_IPC: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab, 
-					 " qbytes=%lx iuid=%d igid=%d mode=%x",
+					 " qbytes=%lx iuid=%u igid=%u mode=%x",
 					 axi->qbytes, axi->uid, axi->gid, axi->mode);
 			break; }
 
@@ -761,7 +761,7 @@ static void audit_log_exit(struct audit_context *context)
 		}
 		if (context->names[i].ino != (unsigned long)-1)
 			audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
-					     " ouid=%d ogid=%d rdev=%02x:%02x",
+					     " ouid=%u ogid=%u rdev=%02x:%02x",
 					 context->names[i].ino,
 					 MAJOR(context->names[i].dev),
 					 MINOR(context->names[i].dev),
@@ -1063,7 +1063,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 		ab = audit_log_start(NULL, AUDIT_LOGIN);
 		if (ab) {
 			audit_log_format(ab, "login pid=%d uid=%u "
-				"old loginuid=%u new loginuid=%u",
+				"old auid=%u new auid=%u",
 				task->pid, task->uid, 
 				task->audit_context->loginuid, loginuid);
 			audit_log_end(ab);
-- 
cgit v1.3-14-g43fede


From 10f02d1c59e55f529140dda3a92f0099d748451c Mon Sep 17 00:00:00 2001
From: Samuel Thibault <samuel.thibault@labri.fr>
Date: Sat, 21 May 2005 17:50:15 +0200
Subject: [PATCH] spin_unlock_bh() and preempt_check_resched()

In _spin_unlock_bh(lock):
	do { \
		_raw_spin_unlock(lock); \
		preempt_enable(); \
		local_bh_enable(); \
		__release(lock); \
	} while (0)

there is no reason for using preempt_enable() instead of a simple
preempt_enable_no_resched().

Since we know bottom halves are disabled, preempt_schedule() will always
return at once (preempt_count!=0), and hence preempt_check_resched() is
useless here...

This fixes it by using "preempt_enable_no_resched()" instead of the
"preempt_enable()", and thus avoids the useless preempt_check_resched()
just before re-enabling bottom halves.

Signed-off-by: Samuel Thibault <samuel.thibault@ens-lyon.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/spinlock.h | 8 ++++----
 kernel/spinlock.c        | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index e895f3eaf53a..d6ba068719b6 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -248,7 +248,7 @@ typedef struct {
 
 #define _spin_trylock_bh(lock)	({preempt_disable(); local_bh_disable(); \
 				_raw_spin_trylock(lock) ? \
-				1 : ({preempt_enable(); local_bh_enable(); 0;});})
+				1 : ({preempt_enable_no_resched(); local_bh_enable(); 0;});})
 
 #define _spin_lock(lock)	\
 do { \
@@ -383,7 +383,7 @@ do { \
 #define _spin_unlock_bh(lock) \
 do { \
 	_raw_spin_unlock(lock); \
-	preempt_enable(); \
+	preempt_enable_no_resched(); \
 	local_bh_enable(); \
 	__release(lock); \
 } while (0)
@@ -391,7 +391,7 @@ do { \
 #define _write_unlock_bh(lock) \
 do { \
 	_raw_write_unlock(lock); \
-	preempt_enable(); \
+	preempt_enable_no_resched(); \
 	local_bh_enable(); \
 	__release(lock); \
 } while (0)
@@ -423,8 +423,8 @@ do { \
 #define _read_unlock_bh(lock)	\
 do { \
 	_raw_read_unlock(lock);	\
+	preempt_enable_no_resched();	\
 	local_bh_enable();	\
-	preempt_enable();	\
 	__release(lock); \
 } while (0)
 
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index e15ed17863f1..0c3f9d8bbe17 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq);
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
 	_raw_spin_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_spin_unlock_bh);
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq);
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
 	_raw_read_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_read_unlock_bh);
@@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq);
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
 	_raw_write_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_write_unlock_bh);
@@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 	if (_raw_spin_trylock(lock))
 		return 1;
 
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From bfb4496e7239c9132d732a65cdcf3d6a7431ad1a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Sat, 21 May 2005 21:08:09 +0100
Subject: AUDIT: Assign serial number to non-syscall messages

Move audit_serial() into audit.c and use it to generate serial numbers
on messages even when there is no audit context from syscall auditing.
This allows us to disambiguate audit records when more than one is
generated in the same millisecond.

Based on a patch by Steve Grubb after he observed the problem.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h |  7 ++++---
 kernel/audit.c        | 46 ++++++++++++++++++++++++++++++++++++++++++----
 kernel/auditsc.c      | 46 ++++++----------------------------------------
 3 files changed, 52 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 4b7caf0c6e10..3278ddf41ce6 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -219,8 +219,9 @@ extern void audit_inode(const char *name, const struct inode *inode);
 				/* Private API (for audit.c only) */
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
 				 void *data, uid_t loginuid);
-extern int audit_get_stamp(struct audit_context *ctx,
-			    struct timespec *t, unsigned int *serial);
+extern unsigned int audit_serial(void);
+extern void auditsc_get_stamp(struct audit_context *ctx,
+			      struct timespec *t, unsigned int *serial);
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
 extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
@@ -237,7 +238,7 @@ extern void audit_signal_info(int sig, struct task_struct *t);
 #define audit_putname(n) do { ; } while (0)
 #define audit_inode(n,i) do { ; } while (0)
 #define audit_receive_filter(t,p,u,s,d,l) ({ -EOPNOTSUPP; })
-#define audit_get_stamp(c,t,s) ({ 0; })
+#define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
 #define audit_ipc_perms(q,u,g,m) ({ 0; })
 #define audit_socketcall(n,a) ({ 0; })
diff --git a/kernel/audit.c b/kernel/audit.c
index f0a003acf621..35306f4369e7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -597,6 +597,47 @@ err:
 	return NULL;
 }
 
+/* Compute a serial number for the audit record.  Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces.  The timestamp of the
+ * record and this serial number are used by the user-space tools to
+ * determine which pieces belong to the same audit record.  The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit.  However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+unsigned int audit_serial(void)
+{
+	static atomic_t serial = ATOMIC_INIT(0xffffff);
+	unsigned int a, b;
+
+	do {
+		a = atomic_read(&serial);
+		if (atomic_dec_and_test(&serial))
+			atomic_set(&serial, 0xffffff);
+		b = atomic_read(&serial);
+	} while (b != a - 1);
+
+	return 0xffffff - b;
+}
+
+static inline void audit_get_stamp(struct audit_context *ctx, 
+				   struct timespec *t, unsigned int *serial)
+{
+	if (ctx)
+		auditsc_get_stamp(ctx, t, serial);
+	else {
+		*t = CURRENT_TIME;
+		*serial = audit_serial();
+	}
+}
+
 /* Obtain an audit buffer.  This routine does locking to obtain the
  * audit buffer, but then no locking is required for calls to
  * audit_log_*format.  If the tsk is a task that is currently in a
@@ -630,10 +671,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
 		return NULL;
 	}
 
-	if (!audit_get_stamp(ab->ctx, &t, &serial)) {
-		t = CURRENT_TIME;
-		serial = 0;
-	}
+	audit_get_stamp(ab->ctx, &t, &serial);
 
 	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
 			 t.tv_sec, t.tv_nsec/1000000, serial);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4193811d4fe1..74c2ae804ca8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -795,36 +795,6 @@ void audit_free(struct task_struct *tsk)
 	audit_free_context(context);
 }
 
-/* Compute a serial number for the audit record.  Audit records are
- * written to user-space as soon as they are generated, so a complete
- * audit record may be written in several pieces.  The timestamp of the
- * record and this serial number are used by the user-space tools to
- * determine which pieces belong to the same audit record.  The
- * (timestamp,serial) tuple is unique for each syscall and is live from
- * syscall entry to syscall exit.
- *
- * Atomic values are only guaranteed to be 24-bit, so we count down.
- *
- * NOTE: Another possibility is to store the formatted records off the
- * audit context (for those records that have a context), and emit them
- * all at syscall exit.  However, this could delay the reporting of
- * significant errors until syscall exit (or never, if the system
- * halts). */
-static inline unsigned int audit_serial(void)
-{
-	static atomic_t serial = ATOMIC_INIT(0xffffff);
-	unsigned int a, b;
-
-	do {
-		a = atomic_read(&serial);
-		if (atomic_dec_and_test(&serial))
-			atomic_set(&serial, 0xffffff);
-		b = atomic_read(&serial);
-	} while (b != a - 1);
-
-	return 0xffffff - b;
-}
-
 /* Fill in audit context at syscall entry.  This only happens if the
  * audit context was created when the task was created and the state or
  * filters demand the audit context be built.  If the state from the
@@ -1042,17 +1012,13 @@ void audit_inode(const char *name, const struct inode *inode)
 	context->names[idx].rdev = inode->i_rdev;
 }
 
-int audit_get_stamp(struct audit_context *ctx,
-		     struct timespec *t, unsigned int *serial)
+void auditsc_get_stamp(struct audit_context *ctx,
+		       struct timespec *t, unsigned int *serial)
 {
-	if (ctx) {
-		t->tv_sec  = ctx->ctime.tv_sec;
-		t->tv_nsec = ctx->ctime.tv_nsec;
-		*serial    = ctx->serial;
-		ctx->auditable = 1;
-		return 1;
-	}
-	return 0;
+	t->tv_sec  = ctx->ctime.tv_sec;
+	t->tv_nsec = ctx->ctime.tv_nsec;
+	*serial    = ctx->serial;
+	ctx->auditable = 1;
 }
 
 int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
-- 
cgit v1.3-14-g43fede


From bccf6ae083318ea08094d6ab185fdf7c49906b3a Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Mon, 23 May 2005 21:35:28 +0100
Subject: AUDIT: Unify auid reporting, put arch before syscall number

These changes make processing of audit logs easier. Based on a patch
from Steve Grubb <sgrubb@redhat.com>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c   | 10 +++++-----
 kernel/auditsc.c |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 35306f4369e7..ef35166fdc29 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -234,7 +234,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
 	int old		 = audit_rate_limit;
 	audit_rate_limit = limit;
 	audit_log(NULL, AUDIT_CONFIG_CHANGE, 
-			"audit_rate_limit=%d old=%d by auid %u",
+			"audit_rate_limit=%d old=%d by auid=%u",
 			audit_rate_limit, old, loginuid);
 	return old;
 }
@@ -244,7 +244,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
 	int old		 = audit_backlog_limit;
 	audit_backlog_limit = limit;
 	audit_log(NULL, AUDIT_CONFIG_CHANGE,
-			"audit_backlog_limit=%d old=%d by auid %u",
+			"audit_backlog_limit=%d old=%d by auid=%u",
 			audit_backlog_limit, old, loginuid);
 	return old;
 }
@@ -256,7 +256,7 @@ static int audit_set_enabled(int state, uid_t loginuid)
 		return -EINVAL;
 	audit_enabled = state;
 	audit_log(NULL, AUDIT_CONFIG_CHANGE,
-			"audit_enabled=%d old=%d by auid %u",
+			"audit_enabled=%d old=%d by auid=%u",
 			audit_enabled, old, loginuid);
 	return old;
 }
@@ -270,7 +270,7 @@ static int audit_set_failure(int state, uid_t loginuid)
 		return -EINVAL;
 	audit_failure = state;
 	audit_log(NULL, AUDIT_CONFIG_CHANGE,
-			"audit_failure=%d old=%d by auid %u",
+			"audit_failure=%d old=%d by auid=%u",
 			audit_failure, old, loginuid);
 	return old;
 }
@@ -424,7 +424,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			int old   = audit_pid;
 			audit_pid = status_get->pid;
 			audit_log(NULL, AUDIT_CONFIG_CHANGE,
-				"audit_pid=%d old=%d by auid %u",
+				"audit_pid=%d old=%d by auid=%u",
 				  audit_pid, old, loginuid);
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 74c2ae804ca8..5fc4f52d218f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -307,7 +307,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 		if (!err && (flags & AUDIT_AT_EXIT))
 			err = audit_add_rule(entry, &audit_extlist);
 		audit_log(NULL, AUDIT_CONFIG_CHANGE, 
-				"auid %u added an audit rule\n", loginuid);
+				"auid=%u added an audit rule\n", loginuid);
 		break;
 	case AUDIT_DEL:
 		flags =((struct audit_rule *)data)->flags;
@@ -318,7 +318,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 		if (!err && (flags & AUDIT_AT_EXIT))
 			err = audit_del_rule(data, &audit_extlist);
 		audit_log(NULL, AUDIT_CONFIG_CHANGE,
-				"auid %u removed an audit rule\n", loginuid);
+				"auid=%u removed an audit rule\n", loginuid);
 		break;
 	default:
 		return -EINVAL;
@@ -678,10 +678,10 @@ static void audit_log_exit(struct audit_context *context)
 	ab = audit_log_start(context, AUDIT_SYSCALL);
 	if (!ab)
 		return;		/* audit_panic has been called */
-	audit_log_format(ab, "syscall=%d", context->major);
+	audit_log_format(ab, "arch=%x syscall=%d",
+			 context->arch, context->major);
 	if (context->personality != PER_LINUX)
 		audit_log_format(ab, " per=%lx", context->personality);
-	audit_log_format(ab, " arch=%x", context->arch);
 	if (context->return_valid)
 		audit_log_format(ab, " success=%s exit=%ld", 
 				 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
-- 
cgit v1.3-14-g43fede


From 99e45eeac867d51ff3395dcf3d7aedf5ac2812c8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Mon, 23 May 2005 21:57:41 +0100
Subject: AUDIT: Escape comm when logging task info

It comes from the user; it needs to be escaped.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/auditsc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5fc4f52d218f..b45677eba78f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -650,7 +650,8 @@ static void audit_log_task_info(struct audit_buffer *ab)
 	struct vm_area_struct *vma;
 
 	get_task_comm(name, current);
-	audit_log_format(ab, " comm=%s", name);
+	audit_log_format(ab, " comm=");
+	audit_log_untrustedstring(ab, name);
 
 	if (!mm)
 		return;
-- 
cgit v1.3-14-g43fede


From c33880aaddbbab1ccf36f4457ed1090621f2e39a Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Tue, 24 May 2005 19:29:47 -0700
Subject: [PATCH] sigkill priority fix

If SIGKILL does not have priority, we cannot instantly kill a task before it
does some unexpected work.  This can be critical, but we were unable to
reproduce it easily until Heiko Carstens <Heiko.Carstens@de.ibm.com>
reported the problem on LKML.

Signed-Off-By: Kirill Korotaev <dev@sw.ru>
Signed-Off-By: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8f3debc77c5b..b3c24c732c5a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -522,7 +522,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 {
 	int sig = 0;
 
-	sig = next_signal(pending, mask);
+	/* SIGKILL must have priority, otherwise it is quite easy
+	 * to create an unkillable process, sending sig < SIGKILL
+	 * to self */
+	if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+		if (!sigismember(mask, SIGKILL))
+			sig = SIGKILL;
+	}
+
+	if (likely(!sig))
+		sig = next_signal(pending, mask);
 	if (sig) {
 		if (current->notifier) {
 			if (sigismember(current->notifier_mask, sig)) {
-- 
cgit v1.3-14-g43fede


From 7551ced334ce6eb2a7a765309871e619f645add1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Thu, 26 May 2005 12:04:57 +0100
Subject: AUDIT: Defer freeing aux items until audit_free_context()

While they were all just simple blobs it made sense to just free them
as we walked through and logged them. Now that there are pointers to
other objects which need refcounting, we might as well revert to
_only_ logging them in audit_log_exit(), and put the code to free them
properly in only one place -- in audit_free_aux().

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
----------------------------------------------------------
---
 kernel/auditsc.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b45677eba78f..7556c479d5af 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -675,6 +675,7 @@ static void audit_log_exit(struct audit_context *context)
 {
 	int i;
 	struct audit_buffer *ab;
+	struct audit_aux_data *aux;
 
 	ab = audit_log_start(context, AUDIT_SYSCALL);
 	if (!ab)
@@ -705,10 +706,8 @@ static void audit_log_exit(struct audit_context *context)
 		  context->egid, context->sgid, context->fsgid);
 	audit_log_task_info(ab);
 	audit_log_end(ab);
-	while (context->aux) {
-		struct audit_aux_data *aux;
 
-		aux = context->aux;
+	for (aux = context->aux; aux; aux = aux->next) {
 
 		ab = audit_log_start(context, aux->type);
 		if (!ab)
@@ -740,15 +739,10 @@ static void audit_log_exit(struct audit_context *context)
 		case AUDIT_AVC_PATH: {
 			struct audit_aux_data_path *axi = (void *)aux;
 			audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
-			dput(axi->dentry);
-			mntput(axi->mnt);
 			break; }
 
 		}
 		audit_log_end(ab);
-
-		context->aux = aux->next;
-		kfree(aux);
 	}
 
 	for (i = 0; i < context->name_count; i++) {
-- 
cgit v1.3-14-g43fede


From 8f37d47c9bf74cb48692691086b482e315d07f40 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Fri, 27 May 2005 12:17:28 +0100
Subject: AUDIT: Record working directory when syscall arguments are pathnames

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h |  3 ++-
 kernel/auditsc.c      | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 3278ddf41ce6..bf2ad3ba72eb 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -61,11 +61,12 @@
 
 #define AUDIT_SYSCALL		1300	/* Syscall event */
 #define AUDIT_FS_WATCH		1301	/* Filesystem watch event */
-#define AUDIT_PATH		1302	/* Filname path information */
+#define AUDIT_PATH		1302	/* Filename path information */
 #define AUDIT_IPC		1303	/* IPC record */
 #define AUDIT_SOCKETCALL	1304	/* sys_socketcall arguments */
 #define AUDIT_CONFIG_CHANGE	1305	/* Audit system configuration change */
 #define AUDIT_SOCKADDR		1306	/* sockaddr copied as syscall arg */
+#define AUDIT_CWD		1307	/* Current working directory */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7556c479d5af..e75f84e1a1a0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -145,6 +145,8 @@ struct audit_context {
 	int		    auditable;  /* 1 if record should be written */
 	int		    name_count;
 	struct audit_names  names[AUDIT_NAMES];
+	struct dentry *	    pwd;
+	struct vfsmount *   pwdmnt;
 	struct audit_context *previous; /* For nested syscalls */
 	struct audit_aux_data *aux;
 
@@ -552,6 +554,12 @@ static inline void audit_free_names(struct audit_context *context)
 		if (context->names[i].name)
 			__putname(context->names[i].name);
 	context->name_count = 0;
+	if (context->pwd)
+		dput(context->pwd);
+	if (context->pwdmnt)
+		mntput(context->pwdmnt);
+	context->pwd = NULL;
+	context->pwdmnt = NULL;
 }
 
 static inline void audit_free_aux(struct audit_context *context)
@@ -745,10 +753,18 @@ static void audit_log_exit(struct audit_context *context)
 		audit_log_end(ab);
 	}
 
+	if (context->pwd && context->pwdmnt) {
+		ab = audit_log_start(context, AUDIT_CWD);
+		if (ab) {
+			audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
+			audit_log_end(ab);
+		}
+	}
 	for (i = 0; i < context->name_count; i++) {
 		ab = audit_log_start(context, AUDIT_PATH);
 		if (!ab)
 			continue; /* audit_panic has been called */
+
 		audit_log_format(ab, "item=%d", i);
 		if (context->names[i].name) {
 			audit_log_format(ab, " name=");
@@ -929,6 +945,13 @@ void audit_getname(const char *name)
 	context->names[context->name_count].name = name;
 	context->names[context->name_count].ino  = (unsigned long)-1;
 	++context->name_count;
+	if (!context->pwd) {
+		read_lock(&current->fs->lock);
+		context->pwd = dget(current->fs->pwd);
+		context->pwdmnt = mntget(current->fs->pwdmnt);
+		read_unlock(&current->fs->lock);
+	}
+		
 }
 
 /* Intercept a putname request.  Called from
-- 
cgit v1.3-14-g43fede


From 2efe86b809d97debaaf9fcc13b041aedf15bd3d2 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 27 May 2005 02:02:43 -0700
Subject: [PATCH] cpuset exit NULL dereference fix

There is a race in the kernel cpuset code, between the code
to handle notify_on_release, and the code to remove a cpuset.
The notify_on_release code can end up trying to access a
cpuset that has been removed.  In the most common case, this
causes a NULL pointer dereference from the routine cpuset_path.
However all manner of bad things are possible, in theory at least.

The existing code decrements the cpuset use count, and if the
count goes to zero, processes the notify_on_release request,
if appropriate.  However, once the count goes to zero, unless we
are holding the global cpuset_sem semaphore, there is nothing to
stop another task from immediately removing the cpuset entirely,
and recycling its memory.

The obvious fix would be to always hold the cpuset_sem
semaphore while decrementing the use count and dealing with
notify_on_release.  However we don't want to force a global
semaphore into the mainline task exit path, as that might create
a scaling problem.

The actual fix is almost as easy - since this is only an issue
for cpusets using notify_on_release, which the top level big
cpusets don't normally need to use, only take the cpuset_sem
for cpusets using notify_on_release.

This code has been run for hours without a hiccup, while running
a cpuset create/destroy stress test that could crash the existing
kernel in seconds.  This patch applies to the current -linus
git kernel.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Simon Derr <simon.derr@bull.net>
Acked-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 961d74044deb..00e8f2575512 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL;
  * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
  * (usually) grab cpuset_sem.  These are the two most performance
  * critical pieces of code here.  The exception occurs on exit(),
- * if the last task using a cpuset exits, and the cpuset was marked
- * notify_on_release.  In that case, the cpuset_sem is taken, the
- * path to the released cpuset calculated, and a usermode call made
+ * when a task in a notify_on_release cpuset exits.  Then cpuset_sem
+ * is taken, and if the cpuset count is zero, a usermode call made
  * to /sbin/cpuset_release_agent with the name of the cpuset (path
  * relative to the root of cpuset file system) as the argument.
  *
@@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk)
  *
  * Description: Detach cpuset from @tsk and release it.
  *
+ * Note that cpusets marked notify_on_release force every task
+ * in them to take the global cpuset_sem semaphore when exiting.
+ * This could impact scaling on very large systems.  Be reluctant
+ * to use notify_on_release cpusets where very high task exit
+ * scaling is required on large systems.
+ *
+ * Don't even think about derefencing 'cs' after the cpuset use
+ * count goes to zero, except inside a critical section guarded
+ * by the cpuset_sem semaphore.  If you don't hold cpuset_sem,
+ * then a zero cpuset use count is a license to any other task to
+ * nuke the cpuset immediately.
+ *
  **/
 
 void cpuset_exit(struct task_struct *tsk)
@@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk)
 	tsk->cpuset = NULL;
 	task_unlock(tsk);
 
-	if (atomic_dec_and_test(&cs->count)) {
+	if (notify_on_release(cs)) {
 		down(&cpuset_sem);
-		check_for_release(cs);
+		if (atomic_dec_and_test(&cs->count))
+			check_for_release(cs);
 		up(&cpuset_sem);
+	} else {
+		atomic_dec(&cs->count);
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From b60c1f6ffd88850079ae419aa933ab0eddbd5535 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Fri, 27 May 2005 12:53:00 -0700
Subject: [PATCH] drop note_interrupt() for per-CPU for proper scaling

The "unhandled interrupts" catcher, note_interrupt(), increments a global
desc->irq_count and grossly damages scaling of very large systems, e.g.,
>192p ia64 Altix, because of this highly contended cacheline, especially
for timer interrupts.  384p is severely crippled, and 512p is unusable.

All calls to note_interrupt() can be disabled by booting with "noirqdebug",
but this disables the useful interrupt checking for all interrupts.

I propose eliminating note_interrupt() for all per-CPU interrupts.  This
was the behavior of linux-2.6.10 and earlier, but in 2.6.11 a code
restructuring added a call to note_interrupt() for per-CPU interrupts.
Besides, note_interrupt() is a bit racy for concurrent CPU calls anyway, as
the desc->irq_count++ increment isn't atomic (which, if done, would make
scaling even worse).

Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/handle.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 06b5a6323998..436c7d93c00a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -119,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 		 */
 		desc->handler->ack(irq);
 		action_ret = handle_IRQ_event(irq, regs, desc->action);
-		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret);
 		desc->handler->end(irq);
 		return 1;
 	}
-- 
cgit v1.3-14-g43fede


From ae92ef8a442421356950a0a8dfdc35e8e783000e Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Tue, 31 May 2005 14:39:29 -0700
Subject: [PATCH] flush icache in correct context

flush_icache_range() is used in two different situations - in binfmt_elf.c &
co for user space mappings and module.c for kernel modules.  On m68k
flush_icache_range() doesn't know which data to flush, as it has separate
address spaces and the pointer argument can be valid in either address
space.

First I considered splitting flush_icache_range(), but this patch is
simpler.  Setting the correct context gives flush_icache_range() enough
information to flush the correct data.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5734ab09d3f9..83b3d376708c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod,
 		const char __user *uargs)
 {
 	struct module *mod;
+	mm_segment_t old_fs = get_fs();
 	int ret = 0;
 
 	/* Must have permission */
@@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod,
 		return PTR_ERR(mod);
 	}
 
+	/* flush the icache in correct context */
+	set_fs(KERNEL_DS);
+
 	/* Flush the instruction cache, since we've played with text */
 	if (mod->module_init)
 		flush_icache_range((unsigned long)mod->module_init,
@@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod,
 	flush_icache_range((unsigned long)mod->module_core,
 			   (unsigned long)mod->module_core + mod->core_size);
 
+	set_fs(old_fs);
+
 	/* Now sew it into the lists.  They won't access us, since
            strong_try_module_get() will fail. */
 	stop_machine_run(__link_module, mod, NR_CPUS);
-- 
cgit v1.3-14-g43fede


From 6df3cecbb95345981718b38d357c50bc3425420a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 13 Jun 2005 15:52:32 -0700
Subject: [PATCH] cond_resched_lock() fix

On one path, cond_resched_lock() fails to return true if it dropped the lock.
We think this might be causing the crashes in JBD's log_do_checkpoint().

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 66b2ed784822..f12a0c8a7d98 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3755,19 +3755,22 @@ EXPORT_SYMBOL(cond_resched);
  */
 int cond_resched_lock(spinlock_t * lock)
 {
+	int ret = 0;
+
 	if (need_lockbreak(lock)) {
 		spin_unlock(lock);
 		cpu_relax();
+		ret = 1;
 		spin_lock(lock);
 	}
 	if (need_resched()) {
 		_raw_spin_unlock(lock);
 		preempt_enable_no_resched();
 		__cond_resched();
+		ret = 1;
 		spin_lock(lock);
-		return 1;
 	}
-	return 0;
+	return ret;
 }
 
 EXPORT_SYMBOL(cond_resched_lock);
-- 
cgit v1.3-14-g43fede


From caf2857ac6e0ba2651e722f05d5f7d3ec8ef2615 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Jun 2005 11:36:36 +0200
Subject: [PATCH] timer exit cleanup

Do all timer zapping in exit_itimers.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c         | 4 +---
 kernel/posix-timers.c | 1 +
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index edaa50b5bbfa..2ef2ad540201 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -811,10 +811,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	acct_update_integrals(tsk);
 	update_mem_hiwater(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
-	if (group_dead) {
- 		del_timer_sync(&tsk->signal->real_timer);
+	if (group_dead)
 		acct_process(code);
-	}
 	exit_mm(tsk);
 
 	exit_sem(tsk);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fd316c272260..cabb63fc9e16 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1197,6 +1197,7 @@ void exit_itimers(struct signal_struct *sig)
 		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
 		itimer_delete(tmr);
 	}
+	del_timer_sync(&sig->real_timer);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 70f2817a43c89b784dc2ec3d06ba5bf3064f8235 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Fri, 29 Apr 2005 01:27:34 -0500
Subject: [PATCH] sysfs: (rest) if show/store is missing return -EIO

sysfs: fix the rest of the kernel so if an attribute doesn't
       implement show or store method read/write will return
       -EIO instead of 0 or -EINVAL or -EPERM.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/acpi/scan.c             | 4 ++--
 drivers/cpufreq/cpufreq.c       | 4 ++--
 drivers/firmware/edd.c          | 2 +-
 drivers/firmware/efivars.c      | 4 ++--
 drivers/infiniband/core/sysfs.c | 2 +-
 kernel/params.c                 | 4 ++--
 security/seclvl.c               | 4 ++--
 7 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 119c94093a13..e85885593280 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -65,14 +65,14 @@ static ssize_t acpi_device_attr_show(struct kobject *kobj,
 {
 	struct acpi_device *device = to_acpi_device(kobj);
 	struct acpi_device_attribute *attribute = to_handle_attr(attr);
-	return attribute->show ? attribute->show(device, buf) : 0;
+	return attribute->show ? attribute->show(device, buf) : -EIO;
 }
 static ssize_t acpi_device_attr_store(struct kobject *kobj,
 		struct attribute *attr, const char *buf, size_t len)
 {
 	struct acpi_device *device = to_acpi_device(kobj);
 	struct acpi_device_attribute *attribute = to_handle_attr(attr);
-	return attribute->store ? attribute->store(device, buf, len) : len;
+	return attribute->store ? attribute->store(device, buf, len) : -EIO;
 }
 
 static struct sysfs_ops acpi_device_sysfs_ops = {
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 03b5fb2ddcf4..bf62dfe4976a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -521,7 +521,7 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr ,char * buf)
 	policy = cpufreq_cpu_get(policy->cpu);
 	if (!policy)
 		return -EINVAL;
-	ret = fattr->show ? fattr->show(policy,buf) : 0;
+	ret = fattr->show ? fattr->show(policy,buf) : -EIO;
 	cpufreq_cpu_put(policy);
 	return ret;
 }
@@ -535,7 +535,7 @@ static ssize_t store(struct kobject * kobj, struct attribute * attr,
 	policy = cpufreq_cpu_get(policy->cpu);
 	if (!policy)
 		return -EINVAL;
-	ret = fattr->store ? fattr->store(policy,buf,count) : 0;
+	ret = fattr->store ? fattr->store(policy,buf,count) : -EIO;
 	cpufreq_cpu_put(policy);
 	return ret;
 }
diff --git a/drivers/firmware/edd.c b/drivers/firmware/edd.c
index 33b669e6f977..6996476669f1 100644
--- a/drivers/firmware/edd.c
+++ b/drivers/firmware/edd.c
@@ -115,7 +115,7 @@ edd_attr_show(struct kobject * kobj, struct attribute *attr, char *buf)
 {
 	struct edd_device *dev = to_edd_device(kobj);
 	struct edd_attribute *edd_attr = to_edd_attr(attr);
-	ssize_t ret = 0;
+	ssize_t ret = -EIO;
 
 	if (edd_attr->show)
 		ret = edd_attr->show(dev, buf);
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 0287ff65963b..a3451cb94004 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -352,7 +352,7 @@ static ssize_t efivar_attr_show(struct kobject *kobj, struct attribute *attr,
 {
 	struct efivar_entry *var = to_efivar_entry(kobj);
 	struct efivar_attribute *efivar_attr = to_efivar_attr(attr);
-	ssize_t ret = 0;
+	ssize_t ret = -EIO;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -368,7 +368,7 @@ static ssize_t efivar_attr_store(struct kobject *kobj, struct attribute *attr,
 {
 	struct efivar_entry *var = to_efivar_entry(kobj);
 	struct efivar_attribute *efivar_attr = to_efivar_attr(attr);
-	ssize_t ret = 0;
+	ssize_t ret = -EIO;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 5febd6d8b885..90d51b179abe 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -71,7 +71,7 @@ static ssize_t port_attr_show(struct kobject *kobj,
 	struct ib_port *p = container_of(kobj, struct ib_port, kobj);
 
 	if (!port_attr->show)
-		return 0;
+		return -EIO;
 
 	return port_attr->show(p, port_attr, buf);
 }
diff --git a/kernel/params.c b/kernel/params.c
index 5513844bec13..d586c35ef8fc 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -629,7 +629,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
 	mk = to_module_kobject(kobj);
 
 	if (!attribute->show)
-		return -EPERM;
+		return -EIO;
 
 	if (!try_module_get(mk->mod))
 		return -ENODEV;
@@ -653,7 +653,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
 	mk = to_module_kobject(kobj);
 
 	if (!attribute->store)
-		return -EPERM;
+		return -EIO;
 
 	if (!try_module_get(mk->mod))
 		return -ENODEV;
diff --git a/security/seclvl.c b/security/seclvl.c
index 8a0ab0d7949e..c8e87b22c9bd 100644
--- a/security/seclvl.c
+++ b/security/seclvl.c
@@ -155,7 +155,7 @@ seclvl_attr_store(struct kobject *kobj,
 	struct seclvl_obj *obj = container_of(kobj, struct seclvl_obj, kobj);
 	struct seclvl_attribute *attribute =
 	    container_of(attr, struct seclvl_attribute, attr);
-	return (attribute->store ? attribute->store(obj, buf, len) : 0);
+	return attribute->store ? attribute->store(obj, buf, len) : -EIO;
 }
 
 static ssize_t
@@ -164,7 +164,7 @@ seclvl_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
 	struct seclvl_obj *obj = container_of(kobj, struct seclvl_obj, kobj);
 	struct seclvl_attribute *attribute =
 	    container_of(attr, struct seclvl_attribute, attr);
-	return (attribute->show ? attribute->show(obj, buf) : 0);
+	return attribute->show ? attribute->show(obj, buf) : -EIO;
 }
 
 /**
-- 
cgit v1.3-14-g43fede


From 39c715b71740c4a78ba4769fb54826929bac03cb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 21 Jun 2005 17:14:34 -0700
Subject: [PATCH] smp_processor_id() cleanup

This patch implements a number of smp_processor_id() cleanup ideas that
Arjan van de Ven and I came up with.

The previous __smp_processor_id/_smp_processor_id/smp_processor_id API
spaghetti was hard to follow both on the implementational and on the
usage side.

Some of the complexity arose from picking wrong names, some of the
complexity comes from the fact that not all architectures defined
__smp_processor_id.

In the new code, there are two externally visible symbols:

 - smp_processor_id(): debug variant.

 - raw_smp_processor_id(): nondebug variant. Replaces all existing
   uses of _smp_processor_id() and __smp_processor_id(). Defined
   by every SMP architecture in include/asm-*/smp.h.

There is one new internal symbol, dependent on DEBUG_PREEMPT:

 - debug_smp_processor_id(): internal debug variant, mapped to
                             smp_processor_id().

Also, I moved debug_smp_processor_id() from lib/kernel_lock.c into a new
lib/smp_processor_id.c file.  All related comments got updated and/or
clarified.

I have build/boot tested the following 8 .config combinations on x86:

 {SMP,UP} x {PREEMPT,!PREEMPT} x {DEBUG_PREEMPT,!DEBUG_PREEMPT}

I have also build/boot tested x86_64 on UP/PREEMPT/DEBUG_PREEMPT.  (Other
architectures are untested, but should work just fine.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c          |  2 +-
 arch/i386/lib/delay.c             |  2 +-
 arch/ppc/lib/locks.c              |  4 +--
 arch/ppc64/kernel/idle.c          |  2 +-
 arch/sh/lib/delay.c               |  2 +-
 arch/sparc64/lib/delay.c          |  2 +-
 arch/x86_64/lib/delay.c           |  2 +-
 drivers/acpi/processor_idle.c     |  2 +-
 drivers/input/gameport/gameport.c |  2 +-
 drivers/oprofile/buffer_sync.c    |  4 +--
 fs/xfs/linux-2.6/xfs_linux.h      |  6 ++---
 include/asm-alpha/smp.h           |  2 +-
 include/asm-arm/smp.h             |  2 +-
 include/asm-i386/smp.h            |  2 +-
 include/asm-ia64/smp.h            |  2 +-
 include/asm-m32r/smp.h            |  2 +-
 include/asm-mips/smp.h            |  2 +-
 include/asm-parisc/smp.h          |  2 +-
 include/asm-ppc/smp.h             |  2 +-
 include/asm-ppc64/smp.h           |  2 +-
 include/asm-s390/smp.h            |  2 +-
 include/asm-sh/smp.h              |  2 +-
 include/asm-sparc/smp.h           |  2 +-
 include/asm-sparc64/smp.h         |  2 +-
 include/asm-um/smp.h              |  3 ++-
 include/asm-x86_64/smp.h          |  2 +-
 include/linux/mmzone.h            |  2 +-
 include/linux/smp.h               | 40 ++++++++++++----------------
 include/net/route.h               |  2 +-
 include/net/snmp.h                | 14 +++++-----
 kernel/module.c                   |  2 +-
 kernel/power/smp.c                |  4 +--
 kernel/sched.c                    |  4 +--
 kernel/stop_machine.c             |  4 +--
 lib/Makefile                      |  1 +
 lib/kernel_lock.c                 | 55 ---------------------------------------
 lib/smp_processor_id.c            | 55 +++++++++++++++++++++++++++++++++++++++
 37 files changed, 119 insertions(+), 125 deletions(-)
 create mode 100644 lib/smp_processor_id.c

(limited to 'kernel')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 00c63419c06f..83c579e82a81 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -306,7 +306,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 	};
 	static int die_counter;
 
-	if (die.lock_owner != _smp_processor_id()) {
+	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
 		spin_lock_irq(&die.lock);
 		die.lock_owner = smp_processor_id();
diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c
index 080639f262b1..eb0cdfe9280f 100644
--- a/arch/i386/lib/delay.c
+++ b/arch/i386/lib/delay.c
@@ -34,7 +34,7 @@ inline void __const_udelay(unsigned long xloops)
 	xloops *= 4;
 	__asm__("mull %0"
 		:"=d" (xloops), "=&a" (d0)
-		:"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4)));
+		:"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
         __delay(++xloops);
 }
 
diff --git a/arch/ppc/lib/locks.c b/arch/ppc/lib/locks.c
index 694163d696d8..c450dc4b766e 100644
--- a/arch/ppc/lib/locks.c
+++ b/arch/ppc/lib/locks.c
@@ -130,7 +130,7 @@ void _raw_read_lock(rwlock_t *rw)
 		while (!read_can_lock(rw)) {
 			if (--stuck == 0) {
 				printk("_read_lock(%p) CPU#%d lock %d\n",
-				       rw, _smp_processor_id(), rw->lock);
+				       rw, raw_smp_processor_id(), rw->lock);
 				stuck = INIT_STUCK;
 			}
 		}
@@ -158,7 +158,7 @@ void _raw_write_lock(rwlock_t *rw)
 		while (!write_can_lock(rw)) {
 			if (--stuck == 0) {
 				printk("write_lock(%p) CPU#%d lock %d)\n",
-				       rw, _smp_processor_id(), rw->lock);
+				       rw, raw_smp_processor_id(), rw->lock);
 				stuck = INIT_STUCK;
 			}
 		}
diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c
index f24ce2b87200..ff8a7db142d3 100644
--- a/arch/ppc64/kernel/idle.c
+++ b/arch/ppc64/kernel/idle.c
@@ -292,7 +292,7 @@ static int native_idle(void)
 		if (need_resched())
 			schedule();
 
-		if (cpu_is_offline(_smp_processor_id()) &&
+		if (cpu_is_offline(raw_smp_processor_id()) &&
 		    system_state == SYSTEM_RUNNING)
 			cpu_die();
 	}
diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c
index 50b36037d86b..351714694d6d 100644
--- a/arch/sh/lib/delay.c
+++ b/arch/sh/lib/delay.c
@@ -24,7 +24,7 @@ inline void __const_udelay(unsigned long xloops)
 	__asm__("dmulu.l	%0, %2\n\t"
 		"sts	mach, %0"
 		: "=r" (xloops)
-		: "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy)
+		: "0" (xloops), "r" (cpu_data[raw_smp_processor_id()].loops_per_jiffy)
 		: "macl", "mach");
 	__delay(xloops * HZ);
 }
diff --git a/arch/sparc64/lib/delay.c b/arch/sparc64/lib/delay.c
index f6b4c784d53e..e8808727617a 100644
--- a/arch/sparc64/lib/delay.c
+++ b/arch/sparc64/lib/delay.c
@@ -31,7 +31,7 @@ void __const_udelay(unsigned long n)
 {
 	n *= 4;
 
-	n *= (cpu_data(_smp_processor_id()).udelay_val * (HZ/4));
+	n *= (cpu_data(raw_smp_processor_id()).udelay_val * (HZ/4));
 	n >>= 32;
 
 	__delay(n + 1);
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index 6e2d66472eb1..aed61a668a1b 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -34,7 +34,7 @@ void __delay(unsigned long loops)
 
 inline void __const_udelay(unsigned long xloops)
 {
-	__delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
+	__delay(((xloops * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
 }
 
 void __udelay(unsigned long usecs)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index ff64d333e95f..c9d671cf7857 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -171,7 +171,7 @@ static void acpi_processor_idle (void)
 	int			sleep_ticks = 0;
 	u32			t1, t2 = 0;
 
-	pr = processors[_smp_processor_id()];
+	pr = processors[raw_smp_processor_id()];
 	if (!pr)
 		return;
 
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index 9b8ff396e6f8..e152d0fa0cdd 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -134,7 +134,7 @@ static int gameport_measure_speed(struct gameport *gameport)
 	}
 
 	gameport_close(gameport);
-	return (cpu_data[_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx);
+	return (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx);
 
 #else
 
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 55720dc6ec43..745a14183634 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -62,7 +62,7 @@ static int task_exit_notify(struct notifier_block * self, unsigned long val, voi
 	/* To avoid latency problems, we only process the current CPU,
 	 * hoping that most samples for the task are on this CPU
 	 */
-	sync_buffer(_smp_processor_id());
+	sync_buffer(raw_smp_processor_id());
   	return 0;
 }
 
@@ -86,7 +86,7 @@ static int munmap_notify(struct notifier_block * self, unsigned long val, void *
 		/* To avoid latency problems, we only process the current CPU,
 		 * hoping that most samples for the task are on this CPU
 		 */
-		sync_buffer(_smp_processor_id());
+		sync_buffer(raw_smp_processor_id());
 		return 0;
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 71bb41019a12..7d7c8788ea75 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -145,10 +145,10 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh)
 #define xfs_inherit_nosymlinks	xfs_params.inherit_nosym.val
 #define xfs_rotorstep		xfs_params.rotorstep.val
 
-#ifndef __smp_processor_id
-#define __smp_processor_id()	smp_processor_id()
+#ifndef raw_smp_processor_id
+#define raw_smp_processor_id()	smp_processor_id()
 #endif
-#define current_cpu()		__smp_processor_id()
+#define current_cpu()		raw_smp_processor_id()
 #define current_pid()		(current->pid)
 #define current_fsuid(cred)	(current->fsuid)
 #define current_fsgid(cred)	(current->fsgid)
diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h
index cbc173ae45aa..9950706abdf8 100644
--- a/include/asm-alpha/smp.h
+++ b/include/asm-alpha/smp.h
@@ -43,7 +43,7 @@ extern struct cpuinfo_alpha cpu_data[NR_CPUS];
 #define PROC_CHANGE_PENALTY     20
 
 #define hard_smp_processor_id()	__hard_smp_processor_id()
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 extern cpumask_t cpu_present_mask;
 extern cpumask_t cpu_online_map;
diff --git a/include/asm-arm/smp.h b/include/asm-arm/smp.h
index bd44f894690f..6c6c60adbbaa 100644
--- a/include/asm-arm/smp.h
+++ b/include/asm-arm/smp.h
@@ -21,7 +21,7 @@
 # error "<asm-arm/smp.h> included in non-SMP build"
 #endif
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern cpumask_t cpu_present_mask;
 #define cpu_possible_map cpu_present_mask
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index e03a206dfa36..55ef31f66bbe 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -51,7 +51,7 @@ extern u8 x86_cpu_to_apicid[];
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define __smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern cpumask_t cpu_callout_map;
 extern cpumask_t cpu_callin_map;
diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h
index 3ba1a061e4ae..a3914352c995 100644
--- a/include/asm-ia64/smp.h
+++ b/include/asm-ia64/smp.h
@@ -46,7 +46,7 @@ ia64_get_lid (void)
 #define SMP_IRQ_REDIRECTION	(1 << 0)
 #define SMP_IPI_REDIRECTION	(1 << 1)
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern struct smp_boot_data {
 	int cpu_count;
diff --git a/include/asm-m32r/smp.h b/include/asm-m32r/smp.h
index 8cd4d0da4be1..b9a20cdad65f 100644
--- a/include/asm-m32r/smp.h
+++ b/include/asm-m32r/smp.h
@@ -66,7 +66,7 @@ extern volatile int cpu_2_physid[NR_CPUS];
 #define physid_to_cpu(physid)	physid_2_cpu[physid]
 #define cpu_to_physid(cpu_id)	cpu_2_physid[cpu_id]
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 extern cpumask_t cpu_callout_map;
 #define cpu_possible_map cpu_callout_map
diff --git a/include/asm-mips/smp.h b/include/asm-mips/smp.h
index 8ba370ecfd4c..5618f1e12f40 100644
--- a/include/asm-mips/smp.h
+++ b/include/asm-mips/smp.h
@@ -21,7 +21,7 @@
 #include <linux/cpumask.h>
 #include <asm/atomic.h>
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 /* Map from cpu id to sequential logical cpu number.  This will only
    not be idempotent when cpus failed to come on-line.  */
diff --git a/include/asm-parisc/smp.h b/include/asm-parisc/smp.h
index fde77ac35463..9413f67a540b 100644
--- a/include/asm-parisc/smp.h
+++ b/include/asm-parisc/smp.h
@@ -51,7 +51,7 @@ extern void smp_send_reschedule(int cpu);
 
 extern unsigned long cpu_present_mask;
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 #endif /* CONFIG_SMP */
 
diff --git a/include/asm-ppc/smp.h b/include/asm-ppc/smp.h
index ebfb614f55f6..17530c232c76 100644
--- a/include/asm-ppc/smp.h
+++ b/include/asm-ppc/smp.h
@@ -44,7 +44,7 @@ extern void smp_message_recv(int, struct pt_regs *);
 #define NO_PROC_ID		0xFF            /* No processor magic marker */
 #define PROC_CHANGE_PENALTY	20
 
-#define smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 extern int __cpu_up(unsigned int cpu);
 
diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h
index c8646fa999c2..8115ecb8feee 100644
--- a/include/asm-ppc64/smp.h
+++ b/include/asm-ppc64/smp.h
@@ -45,7 +45,7 @@ void generic_cpu_die(unsigned int cpu);
 void generic_mach_cpu_die(void);
 #endif
 
-#define __smp_processor_id() (get_paca()->paca_index)
+#define raw_smp_processor_id()	(get_paca()->paca_index)
 #define hard_smp_processor_id() (get_paca()->hw_cpu_id)
 
 extern cpumask_t cpu_sibling_map[NR_CPUS];
diff --git a/include/asm-s390/smp.h b/include/asm-s390/smp.h
index 9473786387a3..dd50e57a928f 100644
--- a/include/asm-s390/smp.h
+++ b/include/asm-s390/smp.h
@@ -47,7 +47,7 @@ extern int smp_call_function_on(void (*func) (void *info), void *info,
  
 #define PROC_CHANGE_PENALTY	20		/* Schedule penalty */
 
-#define smp_processor_id() (S390_lowcore.cpu_data.cpu_nr)
+#define raw_smp_processor_id()	(S390_lowcore.cpu_data.cpu_nr)
 
 extern int smp_get_cpu(cpumask_t cpu_map);
 extern void smp_put_cpu(int cpu);
diff --git a/include/asm-sh/smp.h b/include/asm-sh/smp.h
index 38b54469d7d1..f19a8b3b69a6 100644
--- a/include/asm-sh/smp.h
+++ b/include/asm-sh/smp.h
@@ -25,7 +25,7 @@ extern cpumask_t cpu_possible_map;
 
 #define cpu_online(cpu)		cpu_isset(cpu, cpu_online_map)
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 /* I've no idea what the real meaning of this is */
 #define PROC_CHANGE_PENALTY	20
diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h
index f986c0d0922a..4f96d8333a12 100644
--- a/include/asm-sparc/smp.h
+++ b/include/asm-sparc/smp.h
@@ -148,7 +148,7 @@ extern __inline__ int hard_smp_processor_id(void)
 }
 #endif
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()		(current_thread_info()->cpu)
 
 #define prof_multiplier(__cpu)		cpu_data(__cpu).multiplier
 #define prof_counter(__cpu)		cpu_data(__cpu).counter
diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h
index 5e3e06d908fe..110a2de89123 100644
--- a/include/asm-sparc64/smp.h
+++ b/include/asm-sparc64/smp.h
@@ -64,7 +64,7 @@ static __inline__ int hard_smp_processor_id(void)
 	}
 }
 
-#define smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 #endif /* !(__ASSEMBLY__) */
 
diff --git a/include/asm-um/smp.h b/include/asm-um/smp.h
index 4412d5d9c26b..d879eba2b52c 100644
--- a/include/asm-um/smp.h
+++ b/include/asm-um/smp.h
@@ -8,7 +8,8 @@
 #include "asm/current.h"
 #include "linux/cpumask.h"
 
-#define smp_processor_id() (current_thread->cpu)
+#define raw_smp_processor_id() (current_thread->cpu)
+
 #define cpu_logical_map(n) (n)
 #define cpu_number_map(n) (n)
 #define PROC_CHANGE_PENALTY	15 /* Pick a number, any number */
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 96844fecbde8..a7425aa5a3b7 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -68,7 +68,7 @@ static inline int num_booting_cpus(void)
 	return cpus_weight(cpu_callout_map);
 }
 
-#define __smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() read_pda(cpunumber)
 
 extern __inline int hard_smp_processor_id(void)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e530c6c092f1..beacd931b606 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -381,7 +381,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
-#define numa_node_id()		(cpu_to_node(_smp_processor_id()))
+#define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
 
 #ifndef CONFIG_DISCONTIGMEM
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index dcf1db3b35d3..9dfa3ee769ae 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -92,10 +92,7 @@ void smp_prepare_boot_cpu(void);
 /*
  *	These macros fold the SMP functionality into a single CPU system
  */
-
-#if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT)
-# define smp_processor_id()			0
-#endif
+#define raw_smp_processor_id()			0
 #define hard_smp_processor_id()			0
 #define smp_call_function(func,info,retry,wait)	({ 0; })
 #define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
@@ -106,30 +103,25 @@ static inline void smp_send_reschedule(int cpu) { }
 #endif /* !SMP */
 
 /*
- * DEBUG_PREEMPT support: check whether smp_processor_id() is being
- * used in a preemption-safe way.
+ * smp_processor_id(): get the current CPU ID.
  *
- * An architecture has to enable this debugging code explicitly.
- * It can do so by renaming the smp_processor_id() macro to
- * __smp_processor_id().  This should only be done after some minimal
- * testing, because usually there are a number of false positives
- * that an architecture will trigger.
+ * if DEBUG_PREEMPT is enabled then we check whether it is
+ * used in a preemption-safe way. (smp_processor_id() is safe
+ * if it's used in a preemption-off critical section, or in
+ * a thread that is bound to the current CPU.)
  *
- * To fix a false positive (i.e. smp_processor_id() use that the
- * debugging code reports but which use for some reason is legal),
- * change the smp_processor_id() reference to _smp_processor_id(),
- * which is the nondebug variant.  NOTE: don't use this to hack around
- * real bugs.
+ * NOTE: raw_smp_processor_id() is for internal use only
+ * (smp_processor_id() is the preferred variant), but in rare
+ * instances it might also be used to turn off false positives
+ * (i.e. smp_processor_id() use that the debugging code reports but
+ * which use for some reason is legal). Don't use this to hack around
+ * the warning message, as your code might not work under PREEMPT.
  */
-#ifdef __smp_processor_id
-# if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
-   extern unsigned int smp_processor_id(void);
-# else
-#  define smp_processor_id() __smp_processor_id()
-# endif
-# define _smp_processor_id() __smp_processor_id()
+#ifdef CONFIG_DEBUG_PREEMPT
+  extern unsigned int debug_smp_processor_id(void);
+# define smp_processor_id() debug_smp_processor_id()
 #else
-# define _smp_processor_id() smp_processor_id()
+# define smp_processor_id() raw_smp_processor_id()
 #endif
 
 #define get_cpu()		({ preempt_disable(); smp_processor_id(); })
diff --git a/include/net/route.h b/include/net/route.h
index d34ca8fc6756..c3cd069a9aca 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -107,7 +107,7 @@ struct rt_cache_stat
 
 extern struct rt_cache_stat *rt_cache_stat;
 #define RT_CACHE_STAT_INC(field)					  \
-		(per_cpu_ptr(rt_cache_stat, _smp_processor_id())->field++)
+		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
 
 extern struct ip_rt_acct *ip_rt_acct;
 
diff --git a/include/net/snmp.h b/include/net/snmp.h
index a15ab256276e..a36bed8ea210 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -128,18 +128,18 @@ struct linux_mib {
 #define SNMP_STAT_USRPTR(name)	(name[1])
 
 #define SNMP_INC_STATS_BH(mib, field) 	\
-	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field]++)
 #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset)	\
-	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field + (offset)]++)
+	(per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field + (offset)]++)
 #define SNMP_INC_STATS_USER(mib, field) \
-	(per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field]++)
 #define SNMP_INC_STATS(mib, field) 	\
-	(per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]++)
 #define SNMP_DEC_STATS(mib, field) 	\
-	(per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]--)
+	(per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]--)
 #define SNMP_ADD_STATS_BH(mib, field, addend) 	\
-	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field] += addend)
+	(per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field] += addend)
 #define SNMP_ADD_STATS_USER(mib, field, addend) 	\
-	(per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field] += addend)
+	(per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field] += addend)
 
 #endif
diff --git a/kernel/module.c b/kernel/module.c
index 83b3d376708c..a566745dde62 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -379,7 +379,7 @@ static void module_unload_init(struct module *mod)
 	for (i = 0; i < NR_CPUS; i++)
 		local_set(&mod->ref[i].count, 0);
 	/* Hold reference count during initialization. */
-	local_set(&mod->ref[_smp_processor_id()].count, 1);
+	local_set(&mod->ref[raw_smp_processor_id()].count, 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
 }
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index cba3584b80fe..457c2302ed42 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -48,11 +48,11 @@ void disable_nonboot_cpus(void)
 {
 	oldmask = current->cpus_allowed;
 	set_cpus_allowed(current, cpumask_of_cpu(0));
-	printk("Freezing CPUs (at %d)", _smp_processor_id());
+	printk("Freezing CPUs (at %d)", raw_smp_processor_id());
 	current->state = TASK_INTERRUPTIBLE;
 	schedule_timeout(HZ);
 	printk("...");
-	BUG_ON(_smp_processor_id() != 0);
+	BUG_ON(raw_smp_processor_id() != 0);
 
 	/* FIXME: for this to work, all the CPUs must be running
 	 * "idle" thread (or we deadlock). Is that guaranteed? */
diff --git a/kernel/sched.c b/kernel/sched.c
index f12a0c8a7d98..deca041fc364 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3814,7 +3814,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 
 	atomic_inc(&rq->nr_iowait);
 	schedule();
@@ -3825,7 +3825,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 	long ret;
 
 	atomic_inc(&rq->nr_iowait);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6116b25aa7cf..84a9d18aa8da 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -100,7 +100,7 @@ static int stop_machine(void)
 	stopmachine_state = STOPMACHINE_WAIT;
 
 	for_each_online_cpu(i) {
-		if (i == _smp_processor_id())
+		if (i == raw_smp_processor_id())
 			continue;
 		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
 		if (ret < 0)
@@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
 
 	/* If they don't care which CPU fn runs on, bind to any online one. */
 	if (cpu == NR_CPUS)
-		cpu = _smp_processor_id();
+		cpu = raw_smp_processor_id();
 
 	p = kthread_create(do_stop, &smdata, "kstopmachine");
 	if (!IS_ERR(p)) {
diff --git a/lib/Makefile b/lib/Makefile
index 9eccea9429a7..5f10cb898407 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -20,6 +20,7 @@ lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
+obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 
 ifneq ($(CONFIG_HAVE_DEC_LOCK),y) 
   lib-y += dec_and_lock.o
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 99b0ae3d51dd..bd2bc5d887b8 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -9,61 +9,6 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 
-#if defined(CONFIG_PREEMPT) && defined(__smp_processor_id) && \
-		defined(CONFIG_DEBUG_PREEMPT)
-
-/*
- * Debugging check.
- */
-unsigned int smp_processor_id(void)
-{
-	unsigned long preempt_count = preempt_count();
-	int this_cpu = __smp_processor_id();
-	cpumask_t this_mask;
-
-	if (likely(preempt_count))
-		goto out;
-
-	if (irqs_disabled())
-		goto out;
-
-	/*
-	 * Kernel threads bound to a single CPU can safely use
-	 * smp_processor_id():
-	 */
-	this_mask = cpumask_of_cpu(this_cpu);
-
-	if (cpus_equal(current->cpus_allowed, this_mask))
-		goto out;
-
-	/*
-	 * It is valid to assume CPU-locality during early bootup:
-	 */
-	if (system_state != SYSTEM_RUNNING)
-		goto out;
-
-	/*
-	 * Avoid recursion:
-	 */
-	preempt_disable();
-
-	if (!printk_ratelimit())
-		goto out_enable;
-
-	printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
-	print_symbol("caller is %s\n", (long)__builtin_return_address(0));
-	dump_stack();
-
-out_enable:
-	preempt_enable_no_resched();
-out:
-	return this_cpu;
-}
-
-EXPORT_SYMBOL(smp_processor_id);
-
-#endif /* PREEMPT && __smp_processor_id && DEBUG_PREEMPT */
-
 #ifdef CONFIG_PREEMPT_BKL
 /*
  * The 'big kernel semaphore'
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
new file mode 100644
index 000000000000..42c08ef828c5
--- /dev/null
+++ b/lib/smp_processor_id.c
@@ -0,0 +1,55 @@
+/*
+ * lib/smp_processor_id.c
+ *
+ * DEBUG_PREEMPT variant of smp_processor_id().
+ */
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+
+unsigned int debug_smp_processor_id(void)
+{
+	unsigned long preempt_count = preempt_count();
+	int this_cpu = raw_smp_processor_id();
+	cpumask_t this_mask;
+
+	if (likely(preempt_count))
+		goto out;
+
+	if (irqs_disabled())
+		goto out;
+
+	/*
+	 * Kernel threads bound to a single CPU can safely use
+	 * smp_processor_id():
+	 */
+	this_mask = cpumask_of_cpu(this_cpu);
+
+	if (cpus_equal(current->cpus_allowed, this_mask))
+		goto out;
+
+	/*
+	 * It is valid to assume CPU-locality during early bootup:
+	 */
+	if (system_state != SYSTEM_RUNNING)
+		goto out;
+
+	/*
+	 * Avoid recursion:
+	 */
+	preempt_disable();
+
+	if (!printk_ratelimit())
+		goto out_enable;
+
+	printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
+	print_symbol("caller is %s\n", (long)__builtin_return_address(0));
+	dump_stack();
+
+out_enable:
+	preempt_enable_no_resched();
+out:
+	return this_cpu;
+}
+
+EXPORT_SYMBOL(debug_smp_processor_id);
+
-- 
cgit v1.3-14-g43fede


From 753ee728964e5afb80c17659cc6c3a6fd0a42fe0 Mon Sep 17 00:00:00 2001
From: Martin Hicks <mort@sgi.com>
Date: Tue, 21 Jun 2005 17:14:41 -0700
Subject: [PATCH] VM: early zone reclaim

This is the core of the (much simplified) early reclaim.  The goal of this
patch is to reclaim some easily-freed pages from a zone before falling back
onto another zone.

One of the major uses of this is NUMA machines.  With the default allocator
behavior the allocator would look for memory in another zone, which might be
off-node, before trying to reclaim from the current zone.

This adds a zone tuneable to enable early zone reclaim.  It is selected on a
per-zone basis and is turned on/off via syscall.

Adding some extra throttling on the reclaim was also required (patch
4/4).  Without it, the machine would grind to a crawl when doing a "make -j"
kernel build.  Even with this patch the System Time is higher on
average, but it seems tolerable.  Here are some numbers for kernbench
runs on a 2-node, 4cpu, 8Gig RAM Altix in the "make -j" run:

			wall  user   sys   %cpu  ctx sw.  sleeps
			----  ----   ---   ----   ------  ------
No patch		1009  1384   847   258   298170   504402
w/patch, no reclaim     880   1376   667   288   254064   396745
w/patch & reclaim       1079  1385   926   252   291625   548873

These numbers are the average of 2 runs of 3 "make -j" runs done right
after system boot.  Run-to-run variability for "make -j" is huge, so
these numbers aren't terribly useful except to see that with reclaim
the benchmark still finishes in a reasonable amount of time.

I also looked at the NUMA hit/miss stats for the "make -j" runs and the
reclaim doesn't make any difference when the machine is thrashing away.

Doing a "make -j8" on a single node that is filled with page cache pages
takes 700 seconds with reclaim turned on and 735 seconds without reclaim
(due to remote memory accesses).

The simple zone_reclaim syscall program is at
http://www.bork.org/~mort/sgi/zone_reclaim.c

Signed-off-by: Martin Hicks <mort@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/syscall_table.S |  2 +-
 arch/ia64/kernel/entry.S         |  2 +-
 include/asm-i386/unistd.h        |  2 +-
 include/asm-ia64/unistd.h        |  1 +
 include/linux/mmzone.h           |  6 ++++
 include/linux/swap.h             |  1 +
 kernel/sys_ni.c                  |  1 +
 mm/page_alloc.c                  | 33 +++++++++++++++++----
 mm/vmscan.c                      | 64 ++++++++++++++++++++++++++++++++++++++++
 9 files changed, 104 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 6cd1ed311f02..d408afaf6495 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -251,7 +251,7 @@ ENTRY(sys_call_table)
 	.long sys_io_submit
 	.long sys_io_cancel
 	.long sys_fadvise64	/* 250 */
-	.long sys_ni_syscall
+	.long sys_set_zone_reclaim
 	.long sys_exit_group
 	.long sys_lookup_dcookie
 	.long sys_epoll_create
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index d99316c9be28..b1d5d3d5276c 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1579,7 +1579,7 @@ sys_call_table:
 	data8 sys_keyctl
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall			// 1275
-	data8 sys_ni_syscall
+	data8 sys_set_zone_reclaim
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 61bcc1b1e3f4..176413fb9ae3 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -256,7 +256,7 @@
 #define __NR_io_submit		248
 #define __NR_io_cancel		249
 #define __NR_fadvise64		250
-
+#define __NR_set_zone_reclaim	251
 #define __NR_exit_group		252
 #define __NR_lookup_dcookie	253
 #define __NR_epoll_create	254
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 33e26c557c5c..f7f43ec2483a 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -263,6 +263,7 @@
 #define __NR_add_key			1271
 #define __NR_request_key		1272
 #define __NR_keyctl			1273
+#define __NR_set_zone_reclaim		1276
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index beacd931b606..dfc2452ccb10 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -144,6 +144,12 @@ struct zone {
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 
+	/*
+	 * Does the allocator try to reclaim pages from the zone as soon
+	 * as it fails a watermark_ok() in __alloc_pages?
+	 */
+	int			reclaim_pages;
+
 	/*
 	 * prev_priority holds the scanning priority for this zone.  It is
 	 * defined as the scanning priority at which we achieved our reclaim
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3bbc41be9bd0..0d21e682d99d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -173,6 +173,7 @@ extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
+extern int zone_reclaim(struct zone *, unsigned int, unsigned int);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0dda70ed1f98..6f15bea7d1a8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -77,6 +77,7 @@ cond_syscall(sys_request_key);
 cond_syscall(sys_keyctl);
 cond_syscall(compat_sys_keyctl);
 cond_syscall(compat_sys_socketcall);
+cond_syscall(sys_set_zone_reclaim);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40169f0b7e9e..3c0f69ded6b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -724,6 +724,14 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+	if (!z->reclaim_pages)
+		return 0;
+	return 1;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -760,17 +768,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 
 	classzone_idx = zone_idx(zones[0]);
 
- restart:
+restart:
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
-
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0))
-			continue;
+		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
 		if (!cpuset_zone_allowed(z))
 			continue;
 
+		/*
+		 * If the zone is to attempt early page reclaim then this loop
+		 * will try to reclaim pages and check the watermark a second
+		 * time before giving up and falling back to the next zone.
+		 */
+zone_reclaim_retry:
+		if (!zone_watermark_ok(z, order, z->pages_low,
+				       classzone_idx, 0, 0)) {
+			if (!do_reclaim)
+				continue;
+			else {
+				zone_reclaim(z, gfp_mask, order);
+				/* Only try reclaim once */
+				do_reclaim = 0;
+				goto zone_reclaim_retry;
+			}
+		}
+
 		page = buffered_rmqueue(z, order, gfp_mask);
 		if (page)
 			goto got_pg;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6379ddbffd9b..7da846960d8a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1323,3 +1323,67 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+{
+	struct scan_control sc;
+	int nr_pages = 1 << order;
+	int total_reclaimed = 0;
+
+	/* The reclaim may sleep, so don't do it if sleep isn't allowed */
+	if (!(gfp_mask & __GFP_WAIT))
+		return 0;
+	if (zone->all_unreclaimable)
+		return 0;
+
+	sc.gfp_mask = gfp_mask;
+	sc.may_writepage = 0;
+	sc.may_swap = 0;
+	sc.nr_mapped = read_page_state(nr_mapped);
+	sc.nr_scanned = 0;
+	sc.nr_reclaimed = 0;
+	/* scan at the highest priority */
+	sc.priority = 0;
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	shrink_zone(zone, &sc);
+	total_reclaimed = sc.nr_reclaimed;
+
+	return total_reclaimed;
+}
+
+asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
+				     unsigned int state)
+{
+	struct zone *z;
+	int i;
+
+	if (node >= MAX_NUMNODES || !node_online(node))
+		return -EINVAL;
+
+	/* This will break if we ever add more zones */
+	if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
+		return -EINVAL;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (!(zone & 1<<i))
+			continue;
+
+		z = &NODE_DATA(node)->node_zones[i];
+
+		if (state)
+			z->reclaim_pages = 1;
+		else
+			z->reclaim_pages = 0;
+	}
+
+	return 0;
+}
-- 
cgit v1.3-14-g43fede


From 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Mon Sep 17 00:00:00 2001
From: Wolfgang Wander <wwc@rentec.com>
Date: Tue, 21 Jun 2005 17:14:49 -0700
Subject: [PATCH] Avoiding mmap fragmentation

Ingo recently introduced a great speedup for allocating new mmaps using the
free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and
causes huge performance increases in thread creation.

The downside of this patch is that it does lead to fragmentation in the
mmap-ed areas (visible via /proc/self/maps), such that some applications
that work fine under 2.4 kernels quickly run out of memory on any 2.6
kernel.

The problem is twofold:

  1) the free_area_cache is used to continue a search for memory where
     the last search ended.  Before the change new areas were always
     searched from the base address on.

     So now new small areas are cluttering holes of all sizes
     throughout the whole mmap-able region whereas before small holes
     tended to close holes near the base leaving holes far from the base
     large and available for larger requests.

  2) the free_area_cache also is set to the location of the last
     munmap-ed area so in scenarios where we allocate e.g.  five regions of
     1K each, then free regions 4 2 3 in this order the next request for 1K
     will be placed in the position of the old region 3, whereas before we
     appended it to the still active region 1, placing it at the location
     of the old region 2.  Before we had 1 free region of 2K, now we only
     get two free regions of 1K -> fragmentation.

The patch addresses these issues by introducing yet another cache descriptor
cached_hole_size that contains the largest known hole size below the
current free_area_cache.  If a new request comes in the size is compared
against the cached_hole_size and if the request can be filled with a hole
below free_area_cache the search is started from the base instead.

The results look promising: Whereas 2.6.12-rc4 fragments quickly and my
(earlier posted) leakme.c test program terminates after 50000+ iterations
with 96 distinct and fragmented maps in /proc/self/maps it performs nicely
(as expected) with thread creation, Ingo's test_str02 with 20000 threads
requires 0.7s system time.

Taking out Ingo's patch (un-patch available per request) by basically
deleting all mentions of free_area_cache from the kernel and starting the
search for new memory always at the respective bases we observe: leakme
terminates successfully with 11 distinctive hardly fragmented areas in
/proc/self/maps but thread creation is grindingly slow: 30+s(!) system
time for Ingo's test_str02 with 20000 threads.

Now - drumroll ;-) the appended patch works fine with leakme: it ends with
only 7 distinct areas in /proc/self/maps and also thread creation seems
sufficiently fast with 0.71s for 20000 threads.

Signed-off-by: Wolfgang Wander <wwc@rentec.com>
Credit-to: "Richard Purdie" <rpurdie@rpsys.net>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu> (partly)
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mm/mmap.c              | 10 +++++++-
 arch/i386/mm/hugetlbpage.c      | 34 ++++++++++++++++++++++----
 arch/ppc64/mm/hugetlbpage.c     | 34 ++++++++++++++++++++++----
 arch/sh/kernel/sys_sh.c         |  8 +++++++
 arch/sparc64/kernel/sys_sparc.c |  8 +++++++
 arch/x86_64/ia32/ia32_aout.c    |  1 +
 arch/x86_64/kernel/sys_x86_64.c |  9 +++++++
 fs/binfmt_aout.c                |  1 +
 fs/binfmt_elf.c                 |  1 +
 fs/hugetlbfs/inode.c            |  3 +++
 include/linux/sched.h           | 11 +++++----
 kernel/fork.c                   |  2 ++
 mm/mmap.c                       | 53 +++++++++++++++++++++++++++++++----------
 mm/nommu.c                      |  2 +-
 14 files changed, 147 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 32c4b0e35b37..3de7f84b53c2 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -73,7 +73,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	if (do_align)
@@ -90,6 +95,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -101,6 +107,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		if (do_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 5aa06001a4bd..3b099f32b948 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -140,7 +140,12 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 	struct vm_area_struct *vma;
 	unsigned long start_addr;
 
-	start_addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+	        start_addr = mm->free_area_cache;
+	} else {
+	        start_addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
@@ -154,6 +159,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -162,6 +168,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
 	}
 }
@@ -173,12 +181,17 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev_vma;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* don't allow allocations above current base */
 	if (mm->free_area_cache > base)
 		mm->free_area_cache = base;
 
+	if (len <= largest_hole) {
+	        largest_hole = 0;
+		mm->free_area_cache  = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -199,13 +212,21 @@ try_again:
 		 * vma->vm_start, use it:
 		 */
 		if (addr + len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		            (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
-		else
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else {
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+		}
+
+		/* remember the largest hole we saw so far */
+		if (addr + largest_hole < vma->vm_start)
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = (vma->vm_start - len) & HPAGE_MASK;
@@ -218,6 +239,7 @@ fail:
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -228,6 +250,7 @@ fail:
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
 			len, pgoff, flags);
 
@@ -235,6 +258,7 @@ fail:
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index b4ab766f5980..fdcfe97c75c1 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -292,7 +292,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    && !is_hugepage_only_range(mm, addr,len))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	vma = find_vma(mm, addr);
@@ -316,6 +321,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		vma = vma->vm_next;
 	}
@@ -323,6 +330,7 @@ full_search:
 	/* Make sure we didn't miss any holes */
 	if (start_addr != TASK_UNMAPPED_BASE) {
 		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
 		goto full_search;
 	}
 	return -ENOMEM;
@@ -344,6 +352,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	struct vm_area_struct *vma, *prev_vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* requested length too big for entire address space */
@@ -364,6 +373,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			return addr;
 	}
 
+	if (len <= largest_hole) {
+	        largest_hole = 0;
+		mm->free_area_cache = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -392,13 +405,21 @@ hugepage_recheck:
 		 * vma->vm_start, use it:
 		 */
 		if (addr+len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		          (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
-		else
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else {
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+		}
+
+		/* remember the largest hole we saw so far */
+		if (addr + largest_hole < vma->vm_start)
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
@@ -411,6 +432,7 @@ fail:
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -421,11 +443,13 @@ fail:
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c
index df5ac294c379..917b2f32f260 100644
--- a/arch/sh/kernel/sys_sh.c
+++ b/arch/sh/kernel/sys_sh.c
@@ -79,6 +79,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
+	if (len <= mm->cached_hole_size) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	if (flags & MAP_PRIVATE)
 		addr = PAGE_ALIGN(mm->free_area_cache);
 	else
@@ -95,6 +99,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -106,6 +111,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 		if (!(flags & MAP_PRIVATE))
 			addr = COLOUR_ALIGN(addr);
diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c
index 0077f02f4b37..5f8c822a2b4a 100644
--- a/arch/sparc64/kernel/sys_sparc.c
+++ b/arch/sparc64/kernel/sys_sparc.c
@@ -84,6 +84,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 			return addr;
 	}
 
+	if (len <= mm->cached_hole_size) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	start_addr = addr = mm->free_area_cache;
 
 	task_size -= len;
@@ -103,6 +107,7 @@ full_search:
 		if (task_size < addr) {
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -114,6 +119,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 		if (do_color_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 1965efc974dc..c12edf5d97f0 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -312,6 +312,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+	current->mm->cached_hole_size = 0;
 
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
index d9798dd433fc..cc7821c68851 100644
--- a/arch/x86_64/kernel/sys_x86_64.c
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -105,6 +105,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
+	if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
+	    && len <= mm->cached_hole_size) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = begin;
+	}
 	addr = mm->free_area_cache;
 	if (addr < begin) 
 		addr = begin; 
@@ -120,6 +125,7 @@ full_search:
 			 */
 			if (start_addr != begin) {
 				start_addr = addr = begin;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -131,6 +137,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 	}
 }
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 009b8920c1ff..dd9baabaf016 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -316,6 +316,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8f6b6b76179..7976a238f0a3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -775,6 +775,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	   change some of these later */
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2af3338f891b..3a9b6d179cbd 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -122,6 +122,9 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 	start_addr = mm->free_area_cache;
 
+	if (len <= mm->cached_hole_size)
+		start_addr = TASK_UNMAPPED_BASE;
+
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4dbb109022f3..b58afd97a180 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -201,8 +201,8 @@ extern unsigned long
 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
 			  unsigned long flags);
-extern void arch_unmap_area(struct vm_area_struct *area);
-extern void arch_unmap_area_topdown(struct vm_area_struct *area);
+extern void arch_unmap_area(struct mm_struct *, unsigned long);
+extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
@@ -218,9 +218,10 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
-	void (*unmap_area) (struct vm_area_struct *area);
-	unsigned long mmap_base;		/* base of mmap area */
-	unsigned long free_area_cache;		/* first hole */
+	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
+        unsigned long mmap_base;		/* base of mmap area */
+        unsigned long cached_hole_size;         /* if non-zero, the largest hole below free_area_cache */
+	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
diff --git a/kernel/fork.c b/kernel/fork.c
index f42a17f88699..876b31cd822d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
+	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
 	set_mm_counter(mm, rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
@@ -322,6 +323,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	mm->ioctx_list = NULL;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index de54acd9942f..9da23c1ef9dc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,9 @@ full_search:
 			 * some holes.
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
-				start_addr = addr = TASK_UNMAPPED_BASE;
+				addr = TASK_UNMAPPED_BASE;
+			        start_addr = addr;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -1198,19 +1205,22 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 	}
 }
 #endif	
 
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 	/*
 	 * Is this a new hole at the lowest possible address?
 	 */
-	if (area->vm_start >= TASK_UNMAPPED_BASE &&
-			area->vm_start < area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_start;
+	if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+		mm->free_area_cache = addr;
+		mm->cached_hole_size = ~0UL;
+	}
 }
 
 /*
@@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			return addr;
 	}
 
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+ 	        mm->cached_hole_size = 0;
+ 		mm->free_area_cache = mm->mmap_base;
+ 	}
+
 	/* either no address requested or can't fit in requested address hole */
 	addr = mm->free_area_cache;
 
@@ -1264,6 +1280,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			/* remember the address as a hint for next time */
 			return (mm->free_area_cache = addr);
 
+ 		/* remember the largest hole we saw so far */
+ 		if (addr + mm->cached_hole_size < vma->vm_start)
+ 		        mm->cached_hole_size = vma->vm_start - addr;
+
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
 	} while (len < vma->vm_start);
@@ -1274,28 +1294,30 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
+  	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
 #endif
 
-void arch_unmap_area_topdown(struct vm_area_struct *area)
+void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 {
 	/*
 	 * Is this a new hole at the highest possible address?
 	 */
-	if (area->vm_end > area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_end;
+	if (addr > mm->free_area_cache)
+		mm->free_area_cache = addr;
 
 	/* dont allow allocations above current base */
-	if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base)
-		area->vm_mm->free_area_cache = area->vm_mm->mmap_base;
+	if (mm->free_area_cache > mm->mmap_base)
+		mm->free_area_cache = mm->mmap_base;
 }
 
 unsigned long
@@ -1595,7 +1617,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 	if (area->vm_flags & VM_LOCKED)
 		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
 	vm_stat_unaccount(area);
-	area->vm_mm->unmap_area(area);
 	remove_vm_struct(area);
 }
 
@@ -1649,6 +1670,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct vm_area_struct **insertion_point;
 	struct vm_area_struct *tail_vma = NULL;
+	unsigned long addr;
 
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
@@ -1659,6 +1681,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} while (vma && vma->vm_start < end);
 	*insertion_point = vma;
 	tail_vma->vm_next = NULL;
+	if (mm->unmap_area == arch_unmap_area)
+		addr = prev ? prev->vm_end : mm->mmap_base;
+	else
+		addr = vma ?  vma->vm_start : mm->mmap_base;
+	mm->unmap_area(mm, addr);
 	mm->mmap_cache = NULL;		/* Kill the cache. */
 }
 
diff --git a/mm/nommu.c b/mm/nommu.c
index c53e9c8f6b4a..ce74452c02d9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
 	return -ENOMEM;
 }
 
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 }
 
-- 
cgit v1.3-14-g43fede


From 45918e1a8bfcabc1cb4570b8df276655020eac45 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 21 Jun 2005 17:15:08 -0700
Subject: [PATCH] dup_mmap: update comment on new vma

Remove part of comment on linking new vma in dup_mmap: since anon_vma rmap
came in, try_to_unmap_one knows the vma without needing find_vma.  But add
a comment to note that here vma is inserted without mmap_sem.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 876b31cd822d..a28d11e10877 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -250,8 +250,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 
 		/*
 		 * Link in the new vma and copy the page table entries:
-		 * link in first so that swapoff can see swap entries,
-		 * and try_to_unmap_one's find_vma find the new vma.
+		 * link in first so that swapoff can see swap entries.
+		 * Note that, exceptionally, here the vma is inserted
+		 * without holding mm->mmap_sem.
 		 */
 		spin_lock(&mm->page_table_lock);
 		*pprev = tmp;
-- 
cgit v1.3-14-g43fede


From dbce706e2550253c5ab6043f4f5dfde0cd02470f Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Tue, 21 Jun 2005 17:16:19 -0700
Subject: [PATCH] uml: add and use generic hw_controller_type->release

With Chris Wedgwood <cw@f00f.org>

Currently UML must explicitly call the UML-specific
free_irq_by_irq_and_dev() for each free_irq() call it makes.

This is needed because ->shutdown and/or ->disable are only called when the
last "action" for that irq is removed.

Instead, for UML shared IRQs (UML IRQs are very often, if not always,
shared), for each dev_id some setup is done, which must be cleared on the
release of that fd.  For instance, for each open console a new instance
(i.e.  new dev_id) of the same IRQ is requested().

More precisely, an fd is stored in an array (pollfds), which is later read
by a host thread and passed to poll().  Each event registered by poll()
triggers an interrupt.  So, for each free_irq() we must remove the
corresponding host fd from the table, which we do via this ->release()
method.

In this patch we add an appropriate hook for this, and remove all uses of
it by pointing the hook to the said procedure; this is safe to do since the
said procedure.

Also some cosmetic improvements are included.

This is heavily based on some work by Chris Wedgwood, which however didn't
get the patch merged for something I'd call a "misunderstanding" (the need
for this patch wasn't cleanly explained, thus adding the generic hook was
felt as undesirable).

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
CC: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/line.c       |  2 --
 arch/um/drivers/net_kern.c   |  1 -
 arch/um/drivers/port_kern.c  |  1 -
 arch/um/drivers/xterm_kern.c |  1 -
 arch/um/kernel/irq.c         | 11 +++++++----
 arch/um/kernel/irq_user.c    |  2 --
 include/linux/irq.h          |  1 +
 kernel/irq/manage.c          |  4 ++++
 8 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 025d3be8aca4..562f864254ba 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -406,14 +406,12 @@ void line_disable(struct tty_struct *tty, int current_irq)
 	if(line->driver->read_irq == current_irq)
 		free_irq_later(line->driver->read_irq, tty);
 	else {
-		free_irq_by_irq_and_dev(line->driver->read_irq, tty);
 		free_irq(line->driver->read_irq, tty);
 	}
 
 	if(line->driver->write_irq == current_irq)
 		free_irq_later(line->driver->write_irq, tty);
 	else {
-		free_irq_by_irq_and_dev(line->driver->write_irq, tty);
 		free_irq(line->driver->write_irq, tty);
 	}
 
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 4eeaf88c1e97..5388a7428691 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -146,7 +146,6 @@ static int uml_net_close(struct net_device *dev)
 	netif_stop_queue(dev);
 	spin_lock(&lp->lock);
 
-	free_irq_by_irq_and_dev(dev->irq, dev);
 	free_irq(dev->irq, dev);
 	if(lp->close != NULL)
 		(*lp->close)(lp->fd, &lp->user);
diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
index b5ee07472f79..c41efd207fcc 100644
--- a/arch/um/drivers/port_kern.c
+++ b/arch/um/drivers/port_kern.c
@@ -257,7 +257,6 @@ int port_wait(void *data)
 		 * connection.  Then we loop here throwing out failed 
 		 * connections until a good one is found.
 		 */
-		free_irq_by_irq_and_dev(TELNETD_IRQ, conn);
 		free_irq(TELNETD_IRQ, conn);
 
 		if(conn->fd >= 0) break;
diff --git a/arch/um/drivers/xterm_kern.c b/arch/um/drivers/xterm_kern.c
index a4fdf3584ad2..d269a80f4b0c 100644
--- a/arch/um/drivers/xterm_kern.c
+++ b/arch/um/drivers/xterm_kern.c
@@ -69,7 +69,6 @@ int xterm_fd(int socket, int *pid_out)
 	 * isn't set) this will hang... */
 	wait_for_completion(&data->ready);
 
-	free_irq_by_irq_and_dev(XTERM_IRQ, data);
 	free_irq(XTERM_IRQ, data);
 
 	ret = data->new_fd;
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index d44fb5282547..9f18061ef4c9 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -124,14 +124,16 @@ void irq_unlock(unsigned long flags)
 	spin_unlock_irqrestore(&irq_spinlock, flags);
 }
 
-/*  presently hw_interrupt_type must define (startup || enable) &&
- *  disable && end */
+/* hw_interrupt_type must define (startup || enable) &&
+ * (shutdown || disable) && end */
 static void dummy(unsigned int irq)
 {
 }
 
-static struct hw_interrupt_type SIGIO_irq_type = {
+/* This is used for everything else than the timer. */
+static struct hw_interrupt_type normal_irq_type = {
 	.typename = "SIGIO",
+	.release = free_irq_by_irq_and_dev,
 	.disable = dummy,
 	.enable = dummy,
 	.ack = dummy,
@@ -140,6 +142,7 @@ static struct hw_interrupt_type SIGIO_irq_type = {
 
 static struct hw_interrupt_type SIGVTALRM_irq_type = {
 	.typename = "SIGVTALRM",
+	.release = free_irq_by_irq_and_dev,
 	.shutdown = dummy, /* never called */
 	.disable = dummy,
 	.enable = dummy,
@@ -160,7 +163,7 @@ void __init init_IRQ(void)
 		irq_desc[i].status = IRQ_DISABLED;
 		irq_desc[i].action = NULL;
 		irq_desc[i].depth = 1;
-		irq_desc[i].handler = &SIGIO_irq_type;
+		irq_desc[i].handler = &normal_irq_type;
 		enable_irq(i);
 	}
 }
diff --git a/arch/um/kernel/irq_user.c b/arch/um/kernel/irq_user.c
index b3074cbaa479..c3ccaf24f3e0 100644
--- a/arch/um/kernel/irq_user.c
+++ b/arch/um/kernel/irq_user.c
@@ -85,8 +85,6 @@ void sigio_handler(int sig, union uml_pt_regs *regs)
 				next = irq_fd->next;
 				if(irq_fd->freed){
 					free_irq(irq_fd->irq, irq_fd->id);
-					free_irq_by_irq_and_dev(irq_fd->irq,
-								irq_fd->id);
 				}
 			}
 		}
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c3ff4d101667..b68ad80e2464 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -47,6 +47,7 @@ struct hw_interrupt_type {
 	void (*ack)(unsigned int irq);
 	void (*end)(unsigned int irq);
 	void (*set_affinity)(unsigned int irq, cpumask_t dest);
+	void (*release)(unsigned int irq, void *dev_id);
 };
 
 typedef struct hw_interrupt_type  hw_irq_controller;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5202e4c4a5b6..5fde8177eedf 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -255,6 +255,10 @@ void free_irq(unsigned int irq, void *dev_id)
 
 			/* Found it - now remove it from the list of entries */
 			*pp = action->next;
+
+			if (desc->handler->release)
+				desc->handler->release(irq, dev_id);
+
 			if (!desc->action) {
 				desc->status |= IRQ_DISABLED;
 				if (desc->handler->shutdown)
-- 
cgit v1.3-14-g43fede


From b77d6adc922b8bbf8b16b67f567958c42962cf88 Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Tue, 21 Jun 2005 17:16:24 -0700
Subject: [PATCH] uml: make hw_controller_type->release exist only for archs
 needing it

With Chris Wedgwood <cw@f00f.org>

As suggested by Chris, we can make the "just added" method ->release
conditional on UML only (better: on archs requesting it, i.e. only UML
currently), so that other archs don't get this unneeded crud, and if UML
no longer needs it we can kill this.

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
CC: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/Kconfig     | 5 +++++
 include/linux/irq.h | 3 +++
 kernel/irq/manage.c | 4 ++++
 3 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index c5292181a664..b8e952c88fd1 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -35,6 +35,11 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
+# Used in kernel/irq/manage.c and include/linux/irq.h
+config IRQ_RELEASE_METHOD
+	bool
+	default y
+
 menu "UML-specific options"
 
 config MODE_TT
diff --git a/include/linux/irq.h b/include/linux/irq.h
index b68ad80e2464..7fc1022be9ee 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -47,7 +47,10 @@ struct hw_interrupt_type {
 	void (*ack)(unsigned int irq);
 	void (*end)(unsigned int irq);
 	void (*set_affinity)(unsigned int irq, cpumask_t dest);
+	/* Currently used only by UML, might disappear one day.*/
+#ifdef CONFIG_IRQ_RELEASE_METHOD
 	void (*release)(unsigned int irq, void *dev_id);
+#endif
 };
 
 typedef struct hw_interrupt_type  hw_irq_controller;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5fde8177eedf..ac6700985705 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -6,6 +6,7 @@
  * This file contains driver APIs to the irq subsystem.
  */
 
+#include <linux/config.h>
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/random.h>
@@ -256,8 +257,11 @@ void free_irq(unsigned int irq, void *dev_id)
 			/* Found it - now remove it from the list of entries */
 			*pp = action->next;
 
+			/* Currently used only by UML, might disappear one day.*/
+#ifdef CONFIG_IRQ_RELEASE_METHOD
 			if (desc->handler->release)
 				desc->handler->release(irq, dev_id);
+#endif
 
 			if (!desc->action) {
 				desc->status |= IRQ_DISABLED;
-- 
cgit v1.3-14-g43fede


From 59121003721a8fad11ee72e646fd9d3076b5679c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 00:08:25 -0700
Subject: [PATCH] i386: Selectable Frequency of the Timer Interrupt

Make the timer frequency selectable. The timer interrupt may cause bus
and memory contention in large NUMA systems since the interrupt occurs
on each processor HZ times per second.

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig          |  2 ++
 arch/x86_64/Kconfig        |  2 ++
 include/asm-i386/param.h   |  4 +++-
 include/asm-x86_64/param.h |  6 ++++--
 kernel/Kconfig.hz          | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 57 insertions(+), 3 deletions(-)
 create mode 100644 kernel/Kconfig.hz

(limited to 'kernel')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index bfdcedef06e1..d4ae5f9ceae6 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -961,6 +961,8 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+source kernel/Kconfig.hz
+
 endmenu
 
 
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 61ed16652347..db259757dc8a 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -402,6 +402,8 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+source kernel/Kconfig.hz
+
 endmenu
 
 #
diff --git a/include/asm-i386/param.h b/include/asm-i386/param.h
index b6440526e42a..fa02e67ea86b 100644
--- a/include/asm-i386/param.h
+++ b/include/asm-i386/param.h
@@ -1,8 +1,10 @@
+#include <linux/config.h>
+
 #ifndef _ASMi386_PARAM_H
 #define _ASMi386_PARAM_H
 
 #ifdef __KERNEL__
-# define HZ		1000		/* Internal kernel timer frequency */
+# define HZ		CONFIG_HZ	/* Internal kernel timer frequency */
 # define USER_HZ	100		/* .. some user interfaces are in "ticks" */
 # define CLOCKS_PER_SEC		(USER_HZ)	/* like times() */
 #endif
diff --git a/include/asm-x86_64/param.h b/include/asm-x86_64/param.h
index b707f0568c9e..40b11937180d 100644
--- a/include/asm-x86_64/param.h
+++ b/include/asm-x86_64/param.h
@@ -1,9 +1,11 @@
+#include <linux/config.h>
+
 #ifndef _ASMx86_64_PARAM_H
 #define _ASMx86_64_PARAM_H
 
 #ifdef __KERNEL__
-# define HZ            1000            /* Internal kernel timer frequency */
-# define USER_HZ       100          /* .. some user interfaces are in "ticks */
+# define HZ            CONFIG_HZ	/* Internal kernel timer frequency */
+# define USER_HZ       100		/* .. some user interfaces are in "ticks */
 #define CLOCKS_PER_SEC        (USER_HZ)       /* like times() */
 #endif
 
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 000000000000..248e1c396f8b
--- /dev/null
+++ b/kernel/Kconfig.hz
@@ -0,0 +1,46 @@
+#
+# Timer Interrupt Frequency Configuration
+#
+
+choice
+	prompt "Timer frequency"
+	default HZ_250
+	help
+	 Allows the configuration of the timer frequency. It is customary
+	 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
+	 beneficial for servers and NUMA systems that do not need to have
+	 a fast response for user interaction and that may experience bus
+	 contention and cacheline bounces as a result of timer interrupts.
+	 Note that the timer interrupt occurs on each processor in an SMP
+	 environment leading to NR_CPUS * HZ number of timer interrupts
+	 per second.
+
+
+	config HZ_100
+		bool "100 HZ"
+	help
+	  100 HZ is a typical choice for servers, SMP and NUMA systems
+	  with lots of processors that may show reduced performance if
+	  too many timer interrupts are occurring.
+
+	config HZ_250
+		bool "250 HZ"
+	help
+	 250 HZ is a good compromise choice allowing server performance
+	 while also showing good interactive responsiveness even
+	 on SMP and NUMA systems.
+
+	config HZ_1000
+		bool "1000 HZ"
+	help
+	 1000 HZ is the preferred choice for desktop systems and other
+	 systems requiring fast interactive responses to events.
+
+endchoice
+
+config HZ
+	int
+	default 100 if HZ_100
+	default 250 if HZ_250
+	default 1000 if HZ_1000
+
-- 
cgit v1.3-14-g43fede


From 55c888d6d09a0df236adfaf8ccf06ff5d0646775 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Jun 2005 00:08:56 -0700
Subject: [PATCH] timers fixes/improvements

This patch tries to solve the following problems:

1. del_timer_sync() is racy. The timer can be fired again after
   del_timer_sync() has checked all cpus and before it rechecks
   timer_pending().

2. It has scalability problems. All cpus are scanned to determine
   if the timer is running on that cpu.

   With this patch del_timer_sync is O(1) and no slower than plain
   del_timer(pending_timer), unless it has to actually wait for
   completion of the currently running timer.

   The only restriction is that the recurring timer should not use
   add_timer_on().

3. A timer is not serialized wrt itself.

   If CPU_0 does mod_timer(jiffies+1) while the timer is currently
   running on CPU 1, it is quite possible that local interrupt on
   CPU_0 will start that timer before it finished on CPU_1.

4. The timers locking is suboptimal. __mod_timer() takes 3 locks
   at once and still requires wmb() in del_timer/run_timers.

   The new implementation takes 2 locks sequentially and does not
   need memory barriers.

Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.

This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.

The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.

So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).

When the timer's base is locked, and the timer is removed from the ->entry
list (which means that __run_timers/migrate_timers can't see this timer), it
is possible to set timer->base = NULL and drop the lock: the timer remains
locked.

This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.

__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.

__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.

So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.

We don't need timer_list->lock anymore, this patch kills it.

We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.

One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global

        struct timer_base_s {
                spinlock_t lock;
                struct timer_list *running_timer;
        } __init_timer_base;

which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.

It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timer.h |  30 ++---
 kernel/timer.c        | 328 ++++++++++++++++++++++++--------------------------
 2 files changed, 166 insertions(+), 192 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 90db1cc62ddd..2e78fedfc069 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -6,45 +6,33 @@
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 
-struct tvec_t_base_s;
+struct timer_base_s;
 
 struct timer_list {
 	struct list_head entry;
 	unsigned long expires;
 
-	spinlock_t lock;
 	unsigned long magic;
 
 	void (*function)(unsigned long);
 	unsigned long data;
 
-	struct tvec_t_base_s *base;
+	struct timer_base_s *base;
 };
 
 #define TIMER_MAGIC	0x4b87ad6e
 
+extern struct timer_base_s __init_timer_base;
+
 #define TIMER_INITIALIZER(_function, _expires, _data) {		\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = NULL,					\
+		.base = &__init_timer_base,			\
 		.magic = TIMER_MAGIC,				\
-		.lock = SPIN_LOCK_UNLOCKED,			\
 	}
 
-/***
- * init_timer - initialize a timer.
- * @timer: the timer to be initialized
- *
- * init_timer() must be done to a timer prior calling *any* of the
- * other timer functions.
- */
-static inline void init_timer(struct timer_list * timer)
-{
-	timer->base = NULL;
-	timer->magic = TIMER_MAGIC;
-	spin_lock_init(&timer->lock);
-}
+void fastcall init_timer(struct timer_list * timer);
 
 /***
  * timer_pending - is a timer pending?
@@ -58,7 +46,7 @@ static inline void init_timer(struct timer_list * timer)
  */
 static inline int timer_pending(const struct timer_list * timer)
 {
-	return timer->base != NULL;
+	return timer->entry.next != NULL;
 }
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
@@ -89,12 +77,12 @@ static inline void add_timer(struct timer_list * timer)
 
 #ifdef CONFIG_SMP
   extern int del_timer_sync(struct timer_list *timer);
-  extern int del_singleshot_timer_sync(struct timer_list *timer);
 #else
 # define del_timer_sync(t) del_timer(t)
-# define del_singleshot_timer_sync(t) del_timer(t)
 #endif
 
+#define del_singleshot_timer_sync(t) del_timer_sync(t)
+
 extern void init_timers(void);
 extern void run_local_timers(void);
 extern void it_real_fn(unsigned long);
diff --git a/kernel/timer.c b/kernel/timer.c
index 207aa4f0aa10..8aadc62efd65 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec);
 #define TVN_MASK (TVN_SIZE - 1)
 #define TVR_MASK (TVR_SIZE - 1)
 
+struct timer_base_s {
+	spinlock_t lock;
+	struct timer_list *running_timer;
+};
+
 typedef struct tvec_s {
 	struct list_head vec[TVN_SIZE];
 } tvec_t;
@@ -66,9 +71,8 @@ typedef struct tvec_root_s {
 } tvec_root_t;
 
 struct tvec_t_base_s {
-	spinlock_t lock;
+	struct timer_base_s t_base;
 	unsigned long timer_jiffies;
-	struct timer_list *running_timer;
 	tvec_root_t tv1;
 	tvec_t tv2;
 	tvec_t tv3;
@@ -77,18 +81,16 @@ struct tvec_t_base_s {
 } ____cacheline_aligned_in_smp;
 
 typedef struct tvec_t_base_s tvec_base_t;
+static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
 {
 #ifdef CONFIG_SMP
-	base->running_timer = timer;
+	base->t_base.running_timer = timer;
 #endif
 }
 
-/* Fake initialization */
-static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
-
 static void check_timer_failed(struct timer_list *timer)
 {
 	static int whine_count;
@@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer)
 	/*
 	 * Now fix it up
 	 */
-	spin_lock_init(&timer->lock);
 	timer->magic = TIMER_MAGIC;
 }
 
@@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 	list_add_tail(&timer->entry, vec);
 }
 
+typedef struct timer_base_s timer_base_t;
+/*
+ * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
+ * at compile time, and we need timer->base to lock the timer.
+ */
+timer_base_t __init_timer_base
+	____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
+EXPORT_SYMBOL(__init_timer_base);
+
+/***
+ * init_timer - initialize a timer.
+ * @timer: the timer to be initialized
+ *
+ * init_timer() must be done to a timer prior calling *any* of the
+ * other timer functions.
+ */
+void fastcall init_timer(struct timer_list *timer)
+{
+	timer->entry.next = NULL;
+	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
+	timer->magic = TIMER_MAGIC;
+}
+EXPORT_SYMBOL(init_timer);
+
+static inline void detach_timer(struct timer_list *timer,
+					int clear_pending)
+{
+	struct list_head *entry = &timer->entry;
+
+	__list_del(entry->prev, entry->next);
+	if (clear_pending)
+		entry->next = NULL;
+	entry->prev = LIST_POISON2;
+}
+
+/*
+ * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on ->tvX lists.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static timer_base_t *lock_timer_base(struct timer_list *timer,
+					unsigned long *flags)
+{
+	timer_base_t *base;
+
+	for (;;) {
+		base = timer->base;
+		if (likely(base != NULL)) {
+			spin_lock_irqsave(&base->lock, *flags);
+			if (likely(base == timer->base))
+				return base;
+			/* The timer has migrated to another CPU */
+			spin_unlock_irqrestore(&base->lock, *flags);
+		}
+		cpu_relax();
+	}
+}
+
 int __mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	tvec_base_t *old_base, *new_base;
+	timer_base_t *base;
+	tvec_base_t *new_base;
 	unsigned long flags;
 	int ret = 0;
 
 	BUG_ON(!timer->function);
-
 	check_timer(timer);
 
-	spin_lock_irqsave(&timer->lock, flags);
+	base = lock_timer_base(timer, &flags);
+
+	if (timer_pending(timer)) {
+		detach_timer(timer, 0);
+		ret = 1;
+	}
+
 	new_base = &__get_cpu_var(tvec_bases);
-repeat:
-	old_base = timer->base;
 
-	/*
-	 * Prevent deadlocks via ordering by old_base < new_base.
-	 */
-	if (old_base && (new_base != old_base)) {
-		if (old_base < new_base) {
-			spin_lock(&new_base->lock);
-			spin_lock(&old_base->lock);
-		} else {
-			spin_lock(&old_base->lock);
-			spin_lock(&new_base->lock);
-		}
+	if (base != &new_base->t_base) {
 		/*
-		 * The timer base might have been cancelled while we were
-		 * trying to take the lock(s):
+		 * We are trying to schedule the timer on the local CPU.
+		 * However we can't change timer's base while it is running,
+		 * otherwise del_timer_sync() can't detect that the timer's
+		 * handler yet has not finished. This also guarantees that
+		 * the timer is serialized wrt itself.
 		 */
-		if (timer->base != old_base) {
-			spin_unlock(&new_base->lock);
-			spin_unlock(&old_base->lock);
-			goto repeat;
-		}
-	} else {
-		spin_lock(&new_base->lock);
-		if (timer->base != old_base) {
-			spin_unlock(&new_base->lock);
-			goto repeat;
+		if (unlikely(base->running_timer == timer)) {
+			/* The timer remains on a former base */
+			new_base = container_of(base, tvec_base_t, t_base);
+		} else {
+			/* See the comment in lock_timer_base() */
+			timer->base = NULL;
+			spin_unlock(&base->lock);
+			spin_lock(&new_base->t_base.lock);
+			timer->base = &new_base->t_base;
 		}
 	}
 
-	/*
-	 * Delete the previous timeout (if there was any), and install
-	 * the new one:
-	 */
-	if (old_base) {
-		list_del(&timer->entry);
-		ret = 1;
-	}
 	timer->expires = expires;
 	internal_add_timer(new_base, timer);
-	timer->base = new_base;
-
-	if (old_base && (new_base != old_base))
-		spin_unlock(&old_base->lock);
-	spin_unlock(&new_base->lock);
-	spin_unlock_irqrestore(&timer->lock, flags);
+	spin_unlock_irqrestore(&new_base->t_base.lock, flags);
 
 	return ret;
 }
@@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
 {
 	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
   	unsigned long flags;
-  
+
   	BUG_ON(timer_pending(timer) || !timer->function);
 
 	check_timer(timer);
 
-	spin_lock_irqsave(&base->lock, flags);
+	spin_lock_irqsave(&base->t_base.lock, flags);
+	timer->base = &base->t_base;
 	internal_add_timer(base, timer);
-	timer->base = base;
-	spin_unlock_irqrestore(&base->lock, flags);
+	spin_unlock_irqrestore(&base->t_base.lock, flags);
 }
 
 
@@ -295,27 +344,22 @@ EXPORT_SYMBOL(mod_timer);
  */
 int del_timer(struct timer_list *timer)
 {
+	timer_base_t *base;
 	unsigned long flags;
-	tvec_base_t *base;
+	int ret = 0;
 
 	check_timer(timer);
 
-repeat:
- 	base = timer->base;
-	if (!base)
-		return 0;
-	spin_lock_irqsave(&base->lock, flags);
-	if (base != timer->base) {
+	if (timer_pending(timer)) {
+		base = lock_timer_base(timer, &flags);
+		if (timer_pending(timer)) {
+			detach_timer(timer, 1);
+			ret = 1;
+		}
 		spin_unlock_irqrestore(&base->lock, flags);
-		goto repeat;
 	}
-	list_del(&timer->entry);
-	/* Need to make sure that anybody who sees a NULL base also sees the list ops */
-	smp_wmb();
-	timer->base = NULL;
-	spin_unlock_irqrestore(&base->lock, flags);
 
-	return 1;
+	return ret;
 }
 
 EXPORT_SYMBOL(del_timer);
@@ -332,72 +376,39 @@ EXPORT_SYMBOL(del_timer);
  * Synchronization rules: callers must prevent restarting of the timer,
  * otherwise this function is meaningless. It must not be called from
  * interrupt contexts. The caller must not hold locks which would prevent
- * completion of the timer's handler.  Upon exit the timer is not queued and
- * the handler is not running on any CPU.
+ * completion of the timer's handler. The timer's handler must not call
+ * add_timer_on(). Upon exit the timer is not queued and the handler is
+ * not running on any CPU.
  *
  * The function returns whether it has deactivated a pending timer or not.
- *
- * del_timer_sync() is slow and complicated because it copes with timer
- * handlers which re-arm the timer (periodic timers).  If the timer handler
- * is known to not do this (a single shot timer) then use
- * del_singleshot_timer_sync() instead.
  */
 int del_timer_sync(struct timer_list *timer)
 {
-	tvec_base_t *base;
-	int i, ret = 0;
+	timer_base_t *base;
+	unsigned long flags;
+	int ret = -1;
 
 	check_timer(timer);
 
-del_again:
-	ret += del_timer(timer);
+	do {
+		base = lock_timer_base(timer, &flags);
 
-	for_each_online_cpu(i) {
-		base = &per_cpu(tvec_bases, i);
-		if (base->running_timer == timer) {
-			while (base->running_timer == timer) {
-				cpu_relax();
-				preempt_check_resched();
-			}
-			break;
+		if (base->running_timer == timer)
+			goto unlock;
+
+		ret = 0;
+		if (timer_pending(timer)) {
+			detach_timer(timer, 1);
+			ret = 1;
 		}
-	}
-	smp_rmb();
-	if (timer_pending(timer))
-		goto del_again;
+unlock:
+		spin_unlock_irqrestore(&base->lock, flags);
+	} while (ret < 0);
 
 	return ret;
 }
-EXPORT_SYMBOL(del_timer_sync);
 
-/***
- * del_singleshot_timer_sync - deactivate a non-recursive timer
- * @timer: the timer to be deactivated
- *
- * This function is an optimization of del_timer_sync for the case where the
- * caller can guarantee the timer does not reschedule itself in its timer
- * function.
- *
- * Synchronization rules: callers must prevent restarting of the timer,
- * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which wold prevent
- * completion of the timer's handler.  Upon exit the timer is not queued and
- * the handler is not running on any CPU.
- *
- * The function returns whether it has deactivated a pending timer or not.
- */
-int del_singleshot_timer_sync(struct timer_list *timer)
-{
-	int ret = del_timer(timer);
-
-	if (!ret) {
-		ret = del_timer_sync(timer);
-		BUG_ON(ret);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL(del_singleshot_timer_sync);
+EXPORT_SYMBOL(del_timer_sync);
 #endif
 
 static int cascade(tvec_base_t *base, tvec_t *tv, int index)
@@ -415,7 +426,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
 		struct timer_list *tmp;
 
 		tmp = list_entry(curr, struct timer_list, entry);
-		BUG_ON(tmp->base != base);
+		BUG_ON(tmp->base != &base->t_base);
 		curr = curr->next;
 		internal_add_timer(base, tmp);
 	}
@@ -437,7 +448,7 @@ static inline void __run_timers(tvec_base_t *base)
 {
 	struct timer_list *timer;
 
-	spin_lock_irq(&base->lock);
+	spin_lock_irq(&base->t_base.lock);
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
 		struct list_head work_list = LIST_HEAD_INIT(work_list);
 		struct list_head *head = &work_list;
@@ -453,8 +464,7 @@ static inline void __run_timers(tvec_base_t *base)
 			cascade(base, &base->tv5, INDEX(3));
 		++base->timer_jiffies; 
 		list_splice_init(base->tv1.vec + index, &work_list);
-repeat:
-		if (!list_empty(head)) {
+		while (!list_empty(head)) {
 			void (*fn)(unsigned long);
 			unsigned long data;
 
@@ -462,11 +472,9 @@ repeat:
  			fn = timer->function;
  			data = timer->data;
 
-			list_del(&timer->entry);
 			set_running_timer(base, timer);
-			smp_wmb();
-			timer->base = NULL;
-			spin_unlock_irq(&base->lock);
+			detach_timer(timer, 1);
+			spin_unlock_irq(&base->t_base.lock);
 			{
 				u32 preempt_count = preempt_count();
 				fn(data);
@@ -475,12 +483,11 @@ repeat:
 					BUG();
 				}
 			}
-			spin_lock_irq(&base->lock);
-			goto repeat;
+			spin_lock_irq(&base->t_base.lock);
 		}
 	}
 	set_running_timer(base, NULL);
-	spin_unlock_irq(&base->lock);
+	spin_unlock_irq(&base->t_base.lock);
 }
 
 #ifdef CONFIG_NO_IDLE_HZ
@@ -499,7 +506,7 @@ unsigned long next_timer_interrupt(void)
 	int i, j;
 
 	base = &__get_cpu_var(tvec_bases);
-	spin_lock(&base->lock);
+	spin_lock(&base->t_base.lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
 	list = 0;
 
@@ -547,7 +554,7 @@ found:
 				expires = nte->expires;
 		}
 	}
-	spin_unlock(&base->lock);
+	spin_unlock(&base->t_base.lock);
 	return expires;
 }
 #endif
@@ -1286,9 +1293,9 @@ static void __devinit init_timers_cpu(int cpu)
 {
 	int j;
 	tvec_base_t *base;
-       
+
 	base = &per_cpu(tvec_bases, cpu);
-	spin_lock_init(&base->lock);
+	spin_lock_init(&base->t_base.lock);
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
 		INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1302,22 +1309,16 @@ static void __devinit init_timers_cpu(int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
+static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
 {
 	struct timer_list *timer;
 
 	while (!list_empty(head)) {
 		timer = list_entry(head->next, struct timer_list, entry);
-		/* We're locking backwards from __mod_timer order here,
-		   beware deadlock. */
-		if (!spin_trylock(&timer->lock))
-			return 0;
-		list_del(&timer->entry);
+		detach_timer(timer, 0);
+		timer->base = &new_base->t_base;
 		internal_add_timer(new_base, timer);
-		timer->base = new_base;
-		spin_unlock(&timer->lock);
 	}
-	return 1;
 }
 
 static void __devinit migrate_timers(int cpu)
@@ -1331,39 +1332,24 @@ static void __devinit migrate_timers(int cpu)
 	new_base = &get_cpu_var(tvec_bases);
 
 	local_irq_disable();
-again:
-	/* Prevent deadlocks via ordering by old_base < new_base. */
-	if (old_base < new_base) {
-		spin_lock(&new_base->lock);
-		spin_lock(&old_base->lock);
-	} else {
-		spin_lock(&old_base->lock);
-		spin_lock(&new_base->lock);
-	}
+	spin_lock(&new_base->t_base.lock);
+	spin_lock(&old_base->t_base.lock);
 
-	if (old_base->running_timer)
+	if (old_base->t_base.running_timer)
 		BUG();
 	for (i = 0; i < TVR_SIZE; i++)
-		if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
-			goto unlock_again;
-	for (i = 0; i < TVN_SIZE; i++)
-		if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
-		    || !migrate_timer_list(new_base, old_base->tv3.vec + i)
-		    || !migrate_timer_list(new_base, old_base->tv4.vec + i)
-		    || !migrate_timer_list(new_base, old_base->tv5.vec + i))
-			goto unlock_again;
-	spin_unlock(&old_base->lock);
-	spin_unlock(&new_base->lock);
+		migrate_timer_list(new_base, old_base->tv1.vec + i);
+	for (i = 0; i < TVN_SIZE; i++) {
+		migrate_timer_list(new_base, old_base->tv2.vec + i);
+		migrate_timer_list(new_base, old_base->tv3.vec + i);
+		migrate_timer_list(new_base, old_base->tv4.vec + i);
+		migrate_timer_list(new_base, old_base->tv5.vec + i);
+	}
+
+	spin_unlock(&old_base->t_base.lock);
+	spin_unlock(&new_base->t_base.lock);
 	local_irq_enable();
 	put_cpu_var(tvec_bases);
-	return;
-
-unlock_again:
-	/* Avoid deadlock with __mod_timer, by backing off. */
-	spin_unlock(&old_base->lock);
-	spin_unlock(&new_base->lock);
-	cpu_relax();
-	goto again;
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-- 
cgit v1.3-14-g43fede


From fd450b7318b75343fd76b3d95416853e34e72c95 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Jun 2005 00:08:59 -0700
Subject: [PATCH] timers: introduce try_to_del_timer_sync()

This patch splits del_timer_sync() into 2 functions.  The new one,
try_to_del_timer_sync(), returns -1 when it hits an executing timer.

It can be used in interrupt context, or when the caller holds locks which
can prevent completion of the timer's handler.

NOTE.  Currently it can't be used in interrupt context in UP case, because
->running_timer is used only with CONFIG_SMP.

Should the need arise, it is possible to kill #ifdef CONFIG_SMP in
set_running_timer(), it is cheap.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timer.h |  4 +++-
 kernel/timer.c        | 53 ++++++++++++++++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 2e78fedfc069..221f81ac2002 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -76,9 +76,11 @@ static inline void add_timer(struct timer_list * timer)
 }
 
 #ifdef CONFIG_SMP
+  extern int try_to_del_timer_sync(struct timer_list *timer);
   extern int del_timer_sync(struct timer_list *timer);
 #else
-# define del_timer_sync(t) del_timer(t)
+# define try_to_del_timer_sync(t)	del_timer(t)
+# define del_timer_sync(t)		del_timer(t)
 #endif
 
 #define del_singleshot_timer_sync(t) del_timer_sync(t)
diff --git a/kernel/timer.c b/kernel/timer.c
index 8aadc62efd65..1f986c16d89f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -365,6 +365,34 @@ int del_timer(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer);
 
 #ifdef CONFIG_SMP
+/*
+ * This function tries to deactivate a timer. Upon successful (ret >= 0)
+ * exit the timer is not queued and the handler is not running on any CPU.
+ *
+ * It must not be called from interrupt contexts.
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+	timer_base_t *base;
+	unsigned long flags;
+	int ret = -1;
+
+	base = lock_timer_base(timer, &flags);
+
+	if (base->running_timer == timer)
+		goto out;
+
+	ret = 0;
+	if (timer_pending(timer)) {
+		detach_timer(timer, 1);
+		ret = 1;
+	}
+out:
+	spin_unlock_irqrestore(&base->lock, flags);
+
+	return ret;
+}
+
 /***
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -384,28 +412,13 @@ EXPORT_SYMBOL(del_timer);
  */
 int del_timer_sync(struct timer_list *timer)
 {
-	timer_base_t *base;
-	unsigned long flags;
-	int ret = -1;
-
 	check_timer(timer);
 
-	do {
-		base = lock_timer_base(timer, &flags);
-
-		if (base->running_timer == timer)
-			goto unlock;
-
-		ret = 0;
-		if (timer_pending(timer)) {
-			detach_timer(timer, 1);
-			ret = 1;
-		}
-unlock:
-		spin_unlock_irqrestore(&base->lock, flags);
-	} while (ret < 0);
-
-	return ret;
+	for (;;) {
+		int ret = try_to_del_timer_sync(timer);
+		if (ret >= 0)
+			return ret;
+	}
 }
 
 EXPORT_SYMBOL(del_timer_sync);
-- 
cgit v1.3-14-g43fede


From f972be33ce6a08b5f096ba013c7459a3a82f5f39 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Jun 2005 00:09:00 -0700
Subject: [PATCH] posix-timers: use try_to_del_timer_sync()

sys_timer_settime/sys_timer_delete need to delete k_itimer->real.timer
synchronously while holding ->it_lock, which is also locked in
posix_timer_fn.

This patch removes timer_active/set_timer_inactive, which play with
timer_list's internals, in favour of using try_to_del_timer_sync(), which
was introduced in the previous patch.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-timers.c | 34 +++++++---------------------------
 1 file changed, 7 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index cabb63fc9e16..5b7b4736d82b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -88,23 +88,6 @@ static kmem_cache_t *posix_timers_cache;
 static struct idr posix_timers_id;
 static DEFINE_SPINLOCK(idr_lock);
 
-/*
- * Just because the timer is not in the timer list does NOT mean it is
- * inactive.  It could be in the "fire" routine getting a new expire time.
- */
-#define TIMER_INACTIVE 1
-
-#ifdef CONFIG_SMP
-# define timer_active(tmr) \
-		((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
-# define set_timer_inactive(tmr) \
-		do { \
-			(tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
-		} while (0)
-#else
-# define timer_active(tmr) BARFY	// error to use outside of SMP
-# define set_timer_inactive(tmr) do { } while (0)
-#endif
 /*
  * we assume that the new SIGEV_THREAD_ID shares no bits with the other
  * SIGEV values.  Here we put out an error if this assumption fails.
@@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer)
 	init_timer(&new_timer->it.real.timer);
 	new_timer->it.real.timer.data = (unsigned long) new_timer;
 	new_timer->it.real.timer.function = posix_timer_fn;
-	set_timer_inactive(new_timer);
 	return 0;
 }
 
@@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data)
 	int do_notify = 1;
 
 	spin_lock_irqsave(&timr->it_lock, flags);
- 	set_timer_inactive(timr);
 	if (!list_empty(&timr->it.real.abs_timer_entry)) {
 		spin_lock(&abs_list.lock);
 		do {
@@ -983,8 +964,8 @@ common_timer_set(struct k_itimer *timr, int flags,
 	 * careful here.  If smp we could be in the "fire" routine which will
 	 * be spinning as we hold the lock.  But this is ONLY an SMP issue.
 	 */
+	if (try_to_del_timer_sync(&timr->it.real.timer) < 0) {
 #ifdef CONFIG_SMP
-	if (timer_active(timr) && !del_timer(&timr->it.real.timer))
 		/*
 		 * It can only be active if on an other cpu.  Since
 		 * we have cleared the interval stuff above, it should
@@ -994,11 +975,9 @@ common_timer_set(struct k_itimer *timr, int flags,
 		 * a "retry" exit status.
 		 */
 		return TIMER_RETRY;
-
-	set_timer_inactive(timr);
-#else
-	del_timer(&timr->it.real.timer);
 #endif
+	}
+
 	remove_from_abslist(timr);
 
 	timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 
@@ -1083,8 +1062,9 @@ retry:
 static inline int common_timer_del(struct k_itimer *timer)
 {
 	timer->it.real.incr = 0;
+
+	if (try_to_del_timer_sync(&timer->it.real.timer) < 0) {
 #ifdef CONFIG_SMP
-	if (timer_active(timer) && !del_timer(&timer->it.real.timer))
 		/*
 		 * It can only be active if on an other cpu.  Since
 		 * we have cleared the interval stuff above, it should
@@ -1094,9 +1074,9 @@ static inline int common_timer_del(struct k_itimer *timer)
 		 * a "retry" exit status.
 		 */
 		return TIMER_RETRY;
-#else
-	del_timer(&timer->it.real.timer);
 #endif
+	}
+
 	remove_from_abslist(timer);
 
 	return 0;
-- 
cgit v1.3-14-g43fede


From ab4af03a4054bd78bcabfb2214c9597201beae35 Mon Sep 17 00:00:00 2001
From: Greg Edwards <edwardsg@sgi.com>
Date: Thu, 23 Jun 2005 00:09:05 -0700
Subject: [PATCH] CON_CONSDEV bit not set correctly on last console

According to include/linux/console.h, CON_CONSDEV flag should be set on
the last console specified on the boot command line:

     86 #define CON_PRINTBUFFER (1)
     87 #define CON_CONSDEV     (2) /* Last on the command line */
     88 #define CON_ENABLED     (4)
     89 #define CON_BOOT        (8)

This does not currently happen if there is more than one console specified
on the boot commandline.  Instead, it gets set on the first console on the
command line.  This can cause problems for things like kdb that look for
the CON_CONSDEV flag to see if the console is valid.

Additionally, it doesn't look like CON_CONSDEV is reassigned to the next
preferred console at unregister time if the console being unregistered
currently has that bit set.

Example (from sn2 ia64):

elilo vmlinuz root=<dev> console=ttyS0 console=ttySG0

in this case, the flags on ttySG console struct will be 0x4 (should be
0x6).

Attached patch against bk fixes both issues for the cases I looked at.  It
uses selected_console (which gets incremented for each console specified on
the command line) as the indicator of which console to set CON_CONSDEV on.
When adding the console to the list, if the previous one had CON_CONSDEV
set, it masks it out.  Tested on ia64 and x86.

The problem with the current behavior is it breaks overriding the default from
the boot line.  In the ia64 case, there may be a global append line defining
console=a in elilo.conf.  Then you want to boot your kernel, and want to
override the default by passing console=b on the boot line.  elilo constructs
the kernel cmdline by starting with the value of the global append line, then
tacks on whatever else you specify, which puts console=b last.

Signed-off-by: Greg Edwards <edwardsg@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 01b58d7d17ff..3a442bfb8bee 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -876,8 +876,10 @@ void register_console(struct console * console)
 			break;
 		console->flags |= CON_ENABLED;
 		console->index = console_cmdline[i].index;
-		if (i == preferred_console)
+		if (i == selected_console) {
 			console->flags |= CON_CONSDEV;
+			preferred_console = selected_console;
+		}
 		break;
 	}
 
@@ -897,6 +899,8 @@ void register_console(struct console * console)
 	if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
 		console->next = console_drivers;
 		console_drivers = console;
+		if (console->next)
+			console->next->flags &= ~CON_CONSDEV;
 	} else {
 		console->next = console_drivers->next;
 		console_drivers->next = console;
@@ -937,10 +941,14 @@ int unregister_console(struct console * console)
 	/* If last console is removed, we re-enable picking the first
 	 * one that gets registered. Without that, pmac early boot console
 	 * would prevent fbcon from taking over.
+	 *
+	 * If this isn't the last console and it has CON_CONSDEV set, we
+	 * need to set it on the next preferred console.
 	 */
 	if (console_drivers == NULL)
 		preferred_console = selected_console;
-		
+	else if (console->flags & CON_CONSDEV)
+		console_drivers->flags |= CON_CONSDEV;
 
 	release_console_sem();
 	return res;
-- 
cgit v1.3-14-g43fede


From be5b4fbd017d12e0d09ea0528a5839ce2ed2c8c8 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl-lkml@dif.dk>
Date: Thu, 23 Jun 2005 00:09:09 -0700
Subject: [PATCH] preempt_count is int - remove cast and don't assign to
 unsigned type

In kernel/sched.c the return value from preempt_count() is cast to an int.
That made sense when preempt_count was defined as different types on
different archs, but the cast is no longer needed and should go away.  The
patch removes the cast.

In kernel/timer.c the return value from preempt_count() is assigned to a
variable of type u32 and then that unsigned value is later compared to
preempt_count().  Since preempt_count() returns an int, an int is what
should be used to store its return value.  Storing the result in an
unsigned 32bit integer made a tiny bit of sense back when preempt_count was
different types on different archs, but no more - let's not play signed vs
unsigned comparison games when we don't have to.  The patch modifies the
code to use an int to hold the value.  While I was around that bit of code
I also made two changes to a nearby (related) printk() - I modified it to
specify the loglevel explicitly and also broke the line into a few pieces
to avoid it being longer than 80 chars and clarified the text a bit.

Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 kernel/timer.c | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index deca041fc364..6ee4515d5a20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2576,7 +2576,7 @@ void fastcall add_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-	BUG_ON(((int)preempt_count() < 0));
+	BUG_ON((preempt_count() < 0));
 	preempt_count() += val;
 	/*
 	 * Spinlock count overflowing soon?
diff --git a/kernel/timer.c b/kernel/timer.c
index 1f986c16d89f..51ff917c9590 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -489,10 +489,14 @@ static inline void __run_timers(tvec_base_t *base)
 			detach_timer(timer, 1);
 			spin_unlock_irq(&base->t_base.lock);
 			{
-				u32 preempt_count = preempt_count();
+				int preempt_count = preempt_count();
 				fn(data);
 				if (preempt_count != preempt_count()) {
-					printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
+					printk(KERN_WARNING "huh, entered %p "
+					       "with preempt_count %08x, exited"
+					       " with %08x?\n",
+					       fn, preempt_count,
+					       preempt_count());
 					BUG();
 				}
 			}
-- 
cgit v1.3-14-g43fede


From 5f45f1a78fbac3cc859ec10c5366e97d20d40fa2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Jun 2005 00:09:12 -0700
Subject: [PATCH] remove duplicate get_dentry functions in various places

Various filesystem drivers have grown a get_dentry() function that's a
duplicate of lookup_one_len, except that it doesn't take a maximum length
argument and doesn't check for \0 or / in the passed in filename.

Switch all these places to use lookup_one_len.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Greg KH <greg@kroah.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/usb/core/inode.c | 13 +------------
 fs/debugfs/inode.c       | 12 +-----------
 fs/sysfs/dir.c           |  5 +++--
 fs/sysfs/file.c          |  5 +++--
 fs/sysfs/group.c         |  4 +++-
 fs/sysfs/inode.c         | 10 ----------
 fs/sysfs/sysfs.h         |  1 -
 kernel/cpuset.c          |  8 +-------
 8 files changed, 12 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index f9f9561c6bad..c3e3a95d3804 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -453,17 +453,6 @@ static int usbfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct dentry * get_dentry(struct dentry *parent, const char *name)
-{               
-	struct qstr qstr;
-
-	qstr.name = name;
-	qstr.len = strlen(name);
-	qstr.hash = full_name_hash(name,qstr.len);
-	return lookup_hash(&qstr,parent);
-}               
-
-
 /*
  * fs_create_by_name - create a file, given a name
  * @name:	name of file
@@ -496,7 +485,7 @@ static int fs_create_by_name (const char *name, mode_t mode,
 
 	*dentry = NULL;
 	down(&parent->d_inode->i_sem);
-	*dentry = get_dentry (parent, name);
+	*dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry)) {
 		if ((mode & S_IFMT) == S_IFDIR)
 			error = usbfs_mkdir (parent->d_inode, *dentry, mode);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b529786699e7..a86ac4aeaedb 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -110,16 +110,6 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
 
-static struct dentry * get_dentry(struct dentry *parent, const char *name)
-{               
-	struct qstr qstr;
-
-	qstr.name = name;
-	qstr.len = strlen(name);
-	qstr.hash = full_name_hash(name,qstr.len);
-	return lookup_hash(&qstr,parent);
-}               
-
 static struct super_block *debug_get_sb(struct file_system_type *fs_type,
 				        int flags, const char *dev_name,
 					void *data)
@@ -157,7 +147,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
 
 	*dentry = NULL;
 	down(&parent->d_inode->i_sem);
-	*dentry = get_dentry (parent, name);
+	*dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry)) {
 		if ((mode & S_IFMT) == S_IFDIR)
 			error = debugfs_mkdir(parent->d_inode, *dentry, mode);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 37d7a6875d86..59734ba1ee60 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -8,6 +8,7 @@
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
+#include <linux/namei.h>
 #include "sysfs.h"
 
 DECLARE_RWSEM(sysfs_rename_sem);
@@ -99,7 +100,7 @@ static int create_dir(struct kobject * k, struct dentry * p,
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 
 	down(&p->d_inode->i_sem);
-	*d = sysfs_get_dentry(p,n);
+	*d = lookup_one_len(n, p, strlen(n));
 	if (!IS_ERR(*d)) {
 		error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, SYSFS_DIR);
 		if (!error) {
@@ -315,7 +316,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 
 	down(&parent->d_inode->i_sem);
 
-	new_dentry = sysfs_get_dentry(parent, new_name);
+	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
 	if (!IS_ERR(new_dentry)) {
   		if (!new_dentry->d_inode) {
 			error = kobject_set_name(kobj, "%s", new_name);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 849aac115460..e9cfa39f4099 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/dnotify.h>
 #include <linux/kobject.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 
@@ -400,7 +401,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 	int res = -ENOENT;
 
 	down(&dir->d_inode->i_sem);
-	victim = sysfs_get_dentry(dir, attr->name);
+	victim = lookup_one_len(attr->name, dir, strlen(attr->name));
 	if (!IS_ERR(victim)) {
 		/* make sure dentry is really there */
 		if (victim->d_inode && 
@@ -443,7 +444,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 	int res = -ENOENT;
 
 	down(&dir->d_inode->i_sem);
-	victim = sysfs_get_dentry(dir, attr->name);
+	victim = lookup_one_len(attr->name, dir, strlen(attr->name));
 	if (!IS_ERR(victim)) {
 		if (victim->d_inode &&
 		    (victim->d_parent->d_inode == dir->d_inode)) {
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index f11ac5ea7021..122145b0895c 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -11,6 +11,7 @@
 #include <linux/kobject.h>
 #include <linux/module.h>
 #include <linux/dcache.h>
+#include <linux/namei.h>
 #include <linux/err.h>
 #include "sysfs.h"
 
@@ -68,7 +69,8 @@ void sysfs_remove_group(struct kobject * kobj,
 	struct dentry * dir;
 
 	if (grp->name)
-		dir = sysfs_get_dentry(kobj->dentry,grp->name);
+		dir = lookup_one_len(grp->name, kobj->dentry,
+				strlen(grp->name));
 	else
 		dir = dget(kobj->dentry);
 
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 565cac1d4200..8de13bafaa76 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -166,16 +166,6 @@ int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
 	return error;
 }
 
-struct dentry * sysfs_get_dentry(struct dentry * parent, const char * name)
-{
-	struct qstr qstr;
-
-	qstr.name = name;
-	qstr.len = strlen(name);
-	qstr.hash = full_name_hash(name,qstr.len);
-	return lookup_hash(&qstr,parent);
-}
-
 /*
  * Get the name for corresponding element represented by the given sysfs_dirent
  */
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 29da6f5f07c8..3f8953e0e5d0 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -7,7 +7,6 @@ extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
 
 extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
 				umode_t, int);
-extern struct dentry * sysfs_get_dentry(struct dentry *, const char *);
 
 extern int sysfs_add_file(struct dentry *, const struct attribute *, int);
 extern void sysfs_hash_and_remove(struct dentry * dir, const char * name);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00e8f2575512..79dd929f4084 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = {
 
 static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
 {
-	struct qstr qstr;
-	struct dentry *d;
-
-	qstr.name = name;
-	qstr.len = strlen(name);
-	qstr.hash = full_name_hash(name, qstr.len);
-	d = lookup_hash(&qstr, parent);
+	struct dentry *d = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(d))
 		d->d_op = &cpuset_dops;
 	return d;
-- 
cgit v1.3-14-g43fede


From df164db5fd16888ddbe2a63a47b2f6dda9a428b5 Mon Sep 17 00:00:00 2001
From: Alexander Nyberg <alexn@dsv.su.se>
Date: Thu, 23 Jun 2005 00:09:13 -0700
Subject: [PATCH] avoid recursive oopses

Prevent recursive faults in do_exit() by leaving the task alone and waiting
for reboot.  This may allow a more graceful shutdown and possibly save the
original oops.

Signed-off-by: Alexander Nyberg <alexn@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2ef2ad540201..c2bdf6fb61a5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -793,6 +793,17 @@ fastcall NORET_TYPE void do_exit(long code)
 		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
 	}
 
+	/*
+	 * We're taking recursive faults here in do_exit. Safest is to just
+	 * leave this task alone and wait for reboot.
+	 */
+	if (unlikely(tsk->flags & PF_EXITING)) {
+		printk(KERN_ALERT
+			"Fixing recursive fault but reboot is needed!\n");
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule();
+	}
+
 	tsk->flags |= PF_EXITING;
 
 	/*
-- 
cgit v1.3-14-g43fede


From b94cce926b2b902b79380ccba370d6f9f2980de0 Mon Sep 17 00:00:00 2001
From: Hien Nguyen <hien@us.ibm.com>
Date: Thu, 23 Jun 2005 00:09:19 -0700
Subject: [PATCH] kprobes: function-return probes

This patch adds function-return probes to kprobes for the i386
architecture.  This enables you to establish a handler to be run when a
function returns.

1. API

Two new functions are added to kprobes:

	int register_kretprobe(struct kretprobe *rp);
	void unregister_kretprobe(struct kretprobe *rp);

2. Registration and unregistration

2.1 Register

  To register a function-return probe, the user populates the following
  fields in a kretprobe object and calls register_kretprobe() with the
  kretprobe address as an argument:

  kp.addr - the function's address

  handler - this function is run after the ret instruction executes, but
  before control returns to the return address in the caller.

  maxactive - The maximum number of instances of the probed function that
  can be active concurrently.  For example, if the function is non-
  recursive and is called with a spinlock or mutex held, maxactive = 1
  should be enough.  If the function is non-recursive and can never
  relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
  be enough.  maxactive is used to determine how many kretprobe_instance
  objects to allocate for this particular probed function.  If maxactive <=
  0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
  NR_CPUS) else maxactive=NR_CPUS)

  For example:

    struct kretprobe rp;
    rp.kp.addr = /* entrypoint address */
    rp.handler = /*return probe handler */
    rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
    register_kretprobe(&rp);

  The following field may also be of interest:

  nmissed - Initialized to zero when the function-return probe is
  registered, and incremented every time the probed function is entered but
  there is no kretprobe_instance object available for establishing the
  function-return probe (i.e., because maxactive was set too low).

2.2 Unregister

  To unregister a function-return probe, the user calls
  unregister_kretprobe() with the same kretprobe object as registered
  previously.  If a probed function is running when the return probe is
  unregistered, the function will return as expected, but the handler won't
  be run.

3. Limitations

3.1 This patch supports only the i386 architecture, but patches for
    x86_64 and ppc64 are anticipated soon.

3.2 Return probes operate by replacing the return address in the stack
    (or in a known register, such as the lr register for ppc).  This may
    cause __builtin_return_address(0), when invoked from the return-probed
    function, to return the address of the return-probes trampoline.

3.3 This implementation uses the "Multiprobes at an address" feature in
    2.6.12-rc3-mm3.

3.4 Due to a limitation in multi-probes, you cannot currently establish
    a return probe and a jprobe on the same function.  A patch to remove
    this limitation is being tested.

This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.

Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c | 102 +++++++++++++++++++++-
 arch/i386/kernel/process.c |  15 ++++
 include/asm-i386/kprobes.h |   3 +
 include/linux/kprobes.h    |  90 ++++++++++++++++++-
 kernel/kprobes.c           | 213 +++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 415 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 59ff9b455069..048f754bbe23 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -23,6 +23,9 @@
  *		Rusty Russell).
  * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
  *		interface to access function arguments.
+ * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> added function-return probes.
  */
 
 #include <linux/config.h>
@@ -91,6 +94,53 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 		regs->eip = (unsigned long)&p->ainsn.insn;
 }
 
+struct task_struct  *arch_get_kprobe_task(void *ptr)
+{
+	return ((struct thread_info *) (((unsigned long) ptr) &
+					(~(THREAD_SIZE -1))))->task;
+}
+
+void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
+{
+	unsigned long *sara = (unsigned long *)&regs->esp;
+	struct kretprobe_instance *ri;
+	static void *orig_ret_addr;
+
+	/*
+	 * Save the return address when the return probe hits
+	 * the first time, and use it to populate the (krprobe
+	 * instance)->ret_addr for subsequent return probes at
+	 * the same addrress since stack address would have
+	 * the kretprobe_trampoline by then.
+	 */
+	if (((void*) *sara) != kretprobe_trampoline)
+		orig_ret_addr = (void*) *sara;
+
+	if ((ri = get_free_rp_inst(rp)) != NULL) {
+		ri->rp = rp;
+		ri->stack_addr = sara;
+		ri->ret_addr = orig_ret_addr;
+		add_rp_inst(ri);
+		/* Replace the return addr with trampoline addr */
+		*sara = (unsigned long) &kretprobe_trampoline;
+	} else {
+		rp->nmissed++;
+	}
+}
+
+void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock)
+{
+	unsigned long flags = 0;
+	struct kretprobe_instance *ri;
+	spin_lock_irqsave(kp_lock, flags);
+	while ((ri = get_rp_inst_tsk(tk)) != NULL) {
+		*((unsigned long *)(ri->stack_addr)) =
+					(unsigned long) ri->ret_addr;
+		recycle_rp_inst(ri);
+	}
+	spin_unlock_irqrestore(kp_lock, flags);
+}
+
 /*
  * Interrupts are disabled on entry as trap3 is an interrupt gate and they
  * remain disabled thorough out this function.
@@ -183,6 +233,55 @@ no_kprobe:
 	return ret;
 }
 
+/*
+ * For function-return probes, init_kprobes() establishes a probepoint
+ * here. When a retprobed function returns, this probe is hit and
+ * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ */
+ void kretprobe_trampoline_holder(void)
+ {
+ 	asm volatile (  ".global kretprobe_trampoline\n"
+ 			"kretprobe_trampoline: \n"
+ 			"nop\n");
+ }
+
+/*
+ * Called when we hit the probe point at kretprobe_trampoline
+ */
+int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct task_struct *tsk;
+	struct kretprobe_instance *ri;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	unsigned long *sara = ((unsigned long *) &regs->esp) - 1;
+
+	tsk = arch_get_kprobe_task(sara);
+	head = kretprobe_inst_table_head(tsk);
+
+	hlist_for_each_entry(ri, node, head, hlist) {
+		if (ri->stack_addr == sara && ri->rp) {
+			if (ri->rp->handler)
+				ri->rp->handler(ri, regs);
+		}
+	}
+	return 0;
+}
+
+void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
+						unsigned long flags)
+{
+	struct kretprobe_instance *ri;
+	/* RA already popped */
+	unsigned long *sara = ((unsigned long *)&regs->esp) - 1;
+
+	while ((ri = get_rp_inst(sara))) {
+		regs->eip = (unsigned long)ri->ret_addr;
+		recycle_rp_inst(ri);
+	}
+	regs->eflags &= ~TF_MASK;
+}
+
 /*
  * Called after single-stepping.  p->addr is the address of the
  * instruction whose first byte has been replaced by the "int 3"
@@ -266,7 +365,8 @@ static inline int post_kprobe_handler(struct pt_regs *regs)
 	if (current_kprobe->post_handler)
 		current_kprobe->post_handler(current_kprobe, regs, 0);
 
-	resume_execution(current_kprobe, regs);
+	if (current_kprobe->post_handler != trampoline_post_handler)
+		resume_execution(current_kprobe, regs);
 	regs->eflags |= kprobe_saved_eflags;
 
 	unlock_kprobes();
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index be3efba7caf7..aea2ce1145df 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -37,6 +37,7 @@
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
 #include <linux/random.h>
+#include <linux/kprobes.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -339,6 +340,13 @@ void exit_thread(void)
 	struct task_struct *tsk = current;
 	struct thread_struct *t = &tsk->thread;
 
+	/*
+	 * Remove function-return probe instances associated with this task
+	 * and put them back on the free list. Do not insert an exit probe for
+	 * this function, it will be disabled by kprobe_flush_task if you do.
+	 */
+	kprobe_flush_task(tsk);
+
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(NULL != t->io_bitmap_ptr)) {
 		int cpu = get_cpu();
@@ -362,6 +370,13 @@ void flush_thread(void)
 {
 	struct task_struct *tsk = current;
 
+	/*
+	 * Remove function-return probe instances associated with this task
+	 * and put them back on the free list. Do not insert an exit probe for
+	 * this function, it will be disabled by kprobe_flush_task if you do.
+	 */
+	kprobe_flush_task(tsk);
+
 	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));	
 	/*
diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h
index 4092f68d123a..8b6d3a90cd78 100644
--- a/include/asm-i386/kprobes.h
+++ b/include/asm-i386/kprobes.h
@@ -39,6 +39,9 @@ typedef u8 kprobe_opcode_t;
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
+#define ARCH_SUPPORTS_KRETPROBES
+
+void kretprobe_trampoline(void);
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 99ddba5a4e00..fba39f87efec 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -25,21 +25,31 @@
  *		Rusty Russell).
  * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
  *		interface to access function arguments.
+ * 2005-May	Hien Nguyen <hien@us.ibm.com> and Jim Keniston
+ *		<jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> added function-return probes.
  */
 #include <linux/config.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
+#include <linux/spinlock.h>
+
 #include <asm/kprobes.h>
 
 struct kprobe;
 struct pt_regs;
+struct kretprobe;
+struct kretprobe_instance;
 typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *);
 typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *);
 typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *,
 				       unsigned long flags);
 typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *,
 				       int trapnr);
+typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
+				    struct pt_regs *);
+
 struct kprobe {
 	struct hlist_node hlist;
 
@@ -85,6 +95,62 @@ struct jprobe {
 	kprobe_opcode_t *entry;	/* probe handling code to jump to */
 };
 
+#ifdef ARCH_SUPPORTS_KRETPROBES
+extern int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs);
+extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
+							unsigned long flags);
+extern struct task_struct *arch_get_kprobe_task(void *ptr);
+extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs);
+extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock);
+#else /* ARCH_SUPPORTS_KRETPROBES */
+static inline void kretprobe_trampoline(void)
+{
+}
+static inline int trampoline_probe_handler(struct kprobe *p,
+						struct pt_regs *regs)
+{
+	return 0;
+}
+static inline void trampoline_post_handler(struct kprobe *p,
+				struct pt_regs *regs, unsigned long flags)
+{
+}
+static inline void arch_prepare_kretprobe(struct kretprobe *rp,
+					struct pt_regs *regs)
+{
+}
+static inline void arch_kprobe_flush_task(struct task_struct *tk)
+{
+}
+#define arch_get_kprobe_task(ptr) ((struct task_struct *)NULL)
+#endif /* ARCH_SUPPORTS_KRETPROBES */
+/*
+ * Function-return probe -
+ * Note:
+ * User needs to provide a handler function, and initialize maxactive.
+ * maxactive - The maximum number of instances of the probed function that
+ * can be active concurrently.
+ * nmissed - tracks the number of times the probed function's return was
+ * ignored, due to maxactive being too low.
+ *
+ */
+struct kretprobe {
+	struct kprobe kp;
+	kretprobe_handler_t handler;
+	int maxactive;
+	int nmissed;
+	struct hlist_head free_instances;
+	struct hlist_head used_instances;
+};
+
+struct kretprobe_instance {
+	struct hlist_node uflist; /* either on free list or used list */
+	struct hlist_node hlist;
+	struct kretprobe *rp;
+	void *ret_addr;
+	void *stack_addr;
+};
+
 #ifdef CONFIG_KPROBES
 /* Locks kprobe: irq must be disabled */
 void lock_kprobes(void);
@@ -104,6 +170,7 @@ extern void show_registers(struct pt_regs *regs);
 
 /* Get the kprobe at this addr (if any).  Must have called lock_kprobes */
 struct kprobe *get_kprobe(void *addr);
+struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
 
 int register_kprobe(struct kprobe *p);
 void unregister_kprobe(struct kprobe *p);
@@ -113,7 +180,16 @@ int register_jprobe(struct jprobe *p);
 void unregister_jprobe(struct jprobe *p);
 void jprobe_return(void);
 
-#else
+int register_kretprobe(struct kretprobe *rp);
+void unregister_kretprobe(struct kretprobe *rp);
+
+struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp);
+struct kretprobe_instance *get_rp_inst(void *sara);
+struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk);
+void add_rp_inst(struct kretprobe_instance *ri);
+void kprobe_flush_task(struct task_struct *tk);
+void recycle_rp_inst(struct kretprobe_instance *ri);
+#else /* CONFIG_KPROBES */
 static inline int kprobe_running(void)
 {
 	return 0;
@@ -135,5 +211,15 @@ static inline void unregister_jprobe(struct jprobe *p)
 static inline void jprobe_return(void)
 {
 }
-#endif
+static inline int register_kretprobe(struct kretprobe *rp)
+{
+	return -ENOSYS;
+}
+static inline void unregister_kretprobe(struct kretprobe *rp)
+{
+}
+static inline void kprobe_flush_task(struct task_struct *tk)
+{
+}
+#endif				/* CONFIG_KPROBES */
 #endif				/* _LINUX_KPROBES_H */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 037142b72a49..692fbf75ab49 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -27,6 +27,9 @@
  *		interface to access function arguments.
  * 2004-Sep	Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
  *		exceptions notifier to be first on the priority list.
+ * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> added function-return probes.
  */
 #include <linux/kprobes.h>
 #include <linux/spinlock.h>
@@ -41,6 +44,7 @@
 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
 
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
+static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
 unsigned int kprobe_cpu = NR_CPUS;
 static DEFINE_SPINLOCK(kprobe_lock);
@@ -78,7 +82,7 @@ struct kprobe *get_kprobe(void *addr)
  * Aggregate handlers for multiple kprobes support - these handlers
  * take care of invoking the individual kprobe handlers on p->list
  */
-int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp;
 
@@ -92,8 +96,8 @@ int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
-		unsigned long flags)
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+			      unsigned long flags)
 {
 	struct kprobe *kp;
 
@@ -107,7 +111,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 	return;
 }
 
-int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+			      int trapnr)
 {
 	/*
 	 * if we faulted "during" the execution of a user specified
@@ -120,6 +125,135 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
 	return 0;
 }
 
+struct kprobe trampoline_p = {
+		.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+		.pre_handler = trampoline_probe_handler,
+		.post_handler = trampoline_post_handler
+};
+
+struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
+{
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+	hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
+		return ri;
+	return NULL;
+}
+
+static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
+{
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+	hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
+		return ri;
+	return NULL;
+}
+
+struct kretprobe_instance *get_rp_inst(void *sara)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct task_struct *tsk;
+	struct kretprobe_instance *ri;
+
+	tsk = arch_get_kprobe_task(sara);
+	head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+	hlist_for_each_entry(ri, node, head, hlist) {
+		if (ri->stack_addr == sara)
+			return ri;
+	}
+	return NULL;
+}
+
+void add_rp_inst(struct kretprobe_instance *ri)
+{
+	struct task_struct *tsk;
+	/*
+	 * Remove rp inst off the free list -
+	 * Add it back when probed function returns
+	 */
+	hlist_del(&ri->uflist);
+	tsk = arch_get_kprobe_task(ri->stack_addr);
+	/* Add rp inst onto table */
+	INIT_HLIST_NODE(&ri->hlist);
+	hlist_add_head(&ri->hlist,
+			&kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]);
+
+	/* Also add this rp inst to the used list. */
+	INIT_HLIST_NODE(&ri->uflist);
+	hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+}
+
+void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+	/* remove rp inst off the rprobe_inst_table */
+	hlist_del(&ri->hlist);
+	if (ri->rp) {
+		/* remove rp inst off the used list */
+		hlist_del(&ri->uflist);
+		/* put rp inst back onto the free list */
+		INIT_HLIST_NODE(&ri->uflist);
+		hlist_add_head(&ri->uflist, &ri->rp->free_instances);
+	} else
+		/* Unregistering */
+		kfree(ri);
+}
+
+struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
+{
+	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+}
+
+struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk)
+{
+	struct task_struct *tsk;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+
+	head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)];
+
+	hlist_for_each_entry(ri, node, head, hlist) {
+		tsk = arch_get_kprobe_task(ri->stack_addr);
+		if (tsk == tk)
+			return ri;
+	}
+	return NULL;
+}
+
+/*
+ * This function is called from do_exit or do_execv when task tk's stack is
+ * about to be recycled. Recycle any function-return probe instances
+ * associated with this task. These represent probed functions that have
+ * been called but may never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+	arch_kprobe_flush_task(tk, &kprobe_lock);
+}
+
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+
+	/*TODO: consider to only swap the RA after the last pre_handler fired */
+	arch_prepare_kretprobe(rp, regs);
+	return 0;
+}
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+	struct kretprobe_instance *ri;
+	while ((ri = get_free_rp_inst(rp)) != NULL) {
+		hlist_del(&ri->uflist);
+		kfree(ri);
+	}
+}
+
 /*
  * Fill in the required fields of the "manager kprobe". Replace the
  * earlier kprobe in the hlist with the manager kprobe
@@ -257,16 +391,82 @@ void unregister_jprobe(struct jprobe *jp)
 	unregister_kprobe(&jp->kp);
 }
 
+#ifdef ARCH_SUPPORTS_KRETPROBES
+
+int register_kretprobe(struct kretprobe *rp)
+{
+	int ret = 0;
+	struct kretprobe_instance *inst;
+	int i;
+
+	rp->kp.pre_handler = pre_handler_kretprobe;
+
+	/* Pre-allocate memory for max kretprobe instances */
+	if (rp->maxactive <= 0) {
+#ifdef CONFIG_PREEMPT
+		rp->maxactive = max(10, 2 * NR_CPUS);
+#else
+		rp->maxactive = NR_CPUS;
+#endif
+	}
+	INIT_HLIST_HEAD(&rp->used_instances);
+	INIT_HLIST_HEAD(&rp->free_instances);
+	for (i = 0; i < rp->maxactive; i++) {
+		inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
+		if (inst == NULL) {
+			free_rp_inst(rp);
+			return -ENOMEM;
+		}
+		INIT_HLIST_NODE(&inst->uflist);
+		hlist_add_head(&inst->uflist, &rp->free_instances);
+	}
+
+	rp->nmissed = 0;
+	/* Establish function entry probe point */
+	if ((ret = register_kprobe(&rp->kp)) != 0)
+		free_rp_inst(rp);
+	return ret;
+}
+
+#else /* ARCH_SUPPORTS_KRETPROBES */
+
+int register_kretprobe(struct kretprobe *rp)
+{
+	return -ENOSYS;
+}
+
+#endif /* ARCH_SUPPORTS_KRETPROBES */
+
+void unregister_kretprobe(struct kretprobe *rp)
+{
+	unsigned long flags;
+	struct kretprobe_instance *ri;
+
+	unregister_kprobe(&rp->kp);
+	/* No race here */
+	spin_lock_irqsave(&kprobe_lock, flags);
+	free_rp_inst(rp);
+	while ((ri = get_used_rp_inst(rp)) != NULL) {
+		ri->rp = NULL;
+		hlist_del(&ri->uflist);
+	}
+	spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
 static int __init init_kprobes(void)
 {
 	int i, err = 0;
 
 	/* FIXME allocate the probe table, currently defined statically */
 	/* initialize all list heads */
-	for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		INIT_HLIST_HEAD(&kprobe_table[i]);
+		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+	}
 
 	err = register_die_notifier(&kprobe_exceptions_nb);
+	/* Register the trampoline probe for return probe */
+	register_kprobe(&trampoline_p);
 	return err;
 }
 
@@ -277,3 +477,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe);
 EXPORT_SYMBOL_GPL(register_jprobe);
 EXPORT_SYMBOL_GPL(unregister_jprobe);
 EXPORT_SYMBOL_GPL(jprobe_return);
+EXPORT_SYMBOL_GPL(register_kretprobe);
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+
-- 
cgit v1.3-14-g43fede


From 7e1048b11c5afe79aac46a42e3ccec86b8365c6d Mon Sep 17 00:00:00 2001
From: Rusty Lynch <rusty.lynch@intel.com>
Date: Thu, 23 Jun 2005 00:09:25 -0700
Subject: [PATCH] Move kprobe [dis]arming into arch specific code

The architecture independent code of the current kprobes implementation is
arming and disarming kprobes at registration time.  The problem is that the
code is assuming that arming and disarming is just done by a simple write
of some magic value to an address.  This is problematic for ia64 where our
instructions look more like structures, and we can not insert break points
by just doing something like:

*p->addr = BREAKPOINT_INSTRUCTION;

The following patch to 2.6.12-rc4-mm2 adds two new architecture dependent
functions:

     * void arch_arm_kprobe(struct kprobe *p)
     * void arch_disarm_kprobe(struct kprobe *p)

and then adds the new functions for each of the architectures that already
implement kprobes (sparc64/ppc64/i386/x86_64).

I thought arch_[dis]arm_kprobe was the most descriptive of what was really
happening, but each of the architectures already had a disarm_kprobe()
function that was really a "disarm and do some other clean-up items as
needed when you stumble across a recursive kprobe." So...  I took the
liberty of changing the code that was calling disarm_kprobe() to call
arch_disarm_kprobe(), and then do the cleanup in the block of code dealing
with the recursive kprobe case.

So far this patch has been tested on i386, x86_64, and ppc64, but still
needs to be tested on sparc64.

Signed-off-by: Rusty Lynch <rusty.lynch@intel.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    | 19 +++++++++++++++----
 arch/ppc64/kernel/kprobes.c   | 19 +++++++++++++++----
 arch/sparc64/kernel/kprobes.c | 31 ++++++++++++++++++-------------
 arch/x86_64/kernel/kprobes.c  | 26 ++++++++++++++++++--------
 include/linux/kprobes.h       |  2 ++
 kernel/kprobes.c              | 12 ++++--------
 6 files changed, 72 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 048f754bbe23..2314d8d306fd 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -33,6 +33,7 @@
 #include <linux/ptrace.h>
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
+#include <asm/cacheflush.h>
 #include <asm/kdebug.h>
 #include <asm/desc.h>
 
@@ -71,16 +72,25 @@ int arch_prepare_kprobe(struct kprobe *p)
 void arch_copy_kprobe(struct kprobe *p)
 {
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	p->opcode = *p->addr;
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
-	regs->eip = (unsigned long)p->addr;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -177,7 +187,8 @@ static int kprobe_handler(struct pt_regs *regs)
 				unlock_kprobes();
 				goto no_kprobe;
 			}
-			disarm_kprobe(p, regs);
+			arch_disarm_kprobe(p);
+			regs->eip = (unsigned long)p->addr;
 			ret = 1;
 		} else {
 			p = current_kprobe;
diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index e950a2058a19..8c0920a6d03e 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -32,6 +32,7 @@
 #include <linux/ptrace.h>
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
+#include <asm/cacheflush.h>
 #include <asm/kdebug.h>
 #include <asm/sstep.h>
 
@@ -61,16 +62,25 @@ int arch_prepare_kprobe(struct kprobe *p)
 void arch_copy_kprobe(struct kprobe *p)
 {
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	p->opcode = *p->addr;
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
-	regs->nip = (unsigned long)p->addr;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -101,7 +111,8 @@ static inline int kprobe_handler(struct pt_regs *regs)
 				unlock_kprobes();
 				goto no_kprobe;
 			}
-			disarm_kprobe(p, regs);
+			arch_disarm_kprobe(p);
+			regs->nip = (unsigned long)p->addr;
 			ret = 1;
 		} else {
 			p = current_kprobe;
diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c
index 7066d7ba667a..d67195ba3fa2 100644
--- a/arch/sparc64/kernel/kprobes.c
+++ b/arch/sparc64/kernel/kprobes.c
@@ -6,7 +6,6 @@
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
-
 #include <asm/kdebug.h>
 #include <asm/signal.h>
 
@@ -47,6 +46,19 @@ void arch_copy_kprobe(struct kprobe *p)
 {
 	p->ainsn.insn[0] = *p->addr;
 	p->ainsn.insn[1] = BREAKPOINT_INSTRUCTION_2;
+	p->opcode = *p->addr;
+}
+
+void arch_arm_kprobe(struct kprobe *p)
+{
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flushi(p->addr);
+}
+
+void arch_disarm_kprobe(struct kprobe *p)
+{
+	*p->addr = p->opcode;
+	flushi(p->addr);
 }
 
 void arch_remove_kprobe(struct kprobe *p)
@@ -78,17 +90,6 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 	}
 }
 
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
-{
-	*p->addr = p->opcode;
-	flushi(p->addr);
-
-	regs->tpc = (unsigned long) p->addr;
-	regs->tnpc = current_kprobe_orig_tnpc;
-	regs->tstate = ((regs->tstate & ~TSTATE_PIL) |
-			current_kprobe_orig_tstate_pil);
-}
-
 static int kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *p;
@@ -109,7 +110,11 @@ static int kprobe_handler(struct pt_regs *regs)
 				unlock_kprobes();
 				goto no_kprobe;
 			}
-			disarm_kprobe(p, regs);
+			arch_disarm_kprobe(p);
+			regs->tpc = (unsigned long) p->addr;
+			regs->tnpc = current_kprobe_orig_tnpc;
+			regs->tstate = ((regs->tstate & ~TSTATE_PIL) |
+					current_kprobe_orig_tstate_pil);
 			ret = 1;
 		} else {
 			p = current_kprobe;
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 203672ca7401..324bf57925a9 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -39,7 +39,7 @@
 #include <linux/slab.h>
 #include <linux/preempt.h>
 #include <linux/moduleloader.h>
-
+#include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 #include <asm/kdebug.h>
 
@@ -216,19 +216,28 @@ void arch_copy_kprobe(struct kprobe *p)
 		BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
 		*ripdisp = disp;
 	}
+	p->opcode = *p->addr;
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
-	up(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
-	down(&kprobe_mutex);
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
-	regs->rip = (unsigned long)p->addr;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+	up(&kprobe_mutex);
+	free_insn_slot(p->ainsn.insn);
+	down(&kprobe_mutex);
 }
 
 static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -311,7 +320,8 @@ int kprobe_handler(struct pt_regs *regs)
 				unlock_kprobes();
 				goto no_kprobe;
 			}
-			disarm_kprobe(p, regs);
+			arch_disarm_kprobe(p);
+			regs->rip = (unsigned long)p->addr;
 			ret = 1;
 		} else {
 			p = current_kprobe;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index fba39f87efec..0f90466fb8b0 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -165,6 +165,8 @@ static inline int kprobe_running(void)
 
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_copy_kprobe(struct kprobe *p);
+extern void arch_arm_kprobe(struct kprobe *p);
+extern void arch_disarm_kprobe(struct kprobe *p);
 extern void arch_remove_kprobe(struct kprobe *p);
 extern void show_registers(struct pt_regs *regs);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 692fbf75ab49..e8e0ae8a6e14 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -261,7 +261,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 {
 	ap->addr = p->addr;
-	ap->opcode = p->opcode;
+	memcpy(&ap->opcode, &p->opcode, sizeof(kprobe_opcode_t));
 	memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
 
 	ap->pre_handler = aggr_pre_handler;
@@ -304,10 +304,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
 /* kprobe removal house-keeping routines */
 static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
 {
-	*p->addr = p->opcode;
+	arch_disarm_kprobe(p);
 	hlist_del(&p->hlist);
-	flush_icache_range((unsigned long) p->addr,
-		   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 	arch_remove_kprobe(p);
 }
@@ -344,10 +342,8 @@ int register_kprobe(struct kprobe *p)
 	hlist_add_head(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	p->opcode = *p->addr;
-	*p->addr = BREAKPOINT_INSTRUCTION;
-	flush_icache_range((unsigned long) p->addr,
-			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+  	arch_arm_kprobe(p);
+
 out:
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 rm_kprobe:
-- 
cgit v1.3-14-g43fede


From 0aa55e4d7db822059fe8132fe9f2b7773c48216c Mon Sep 17 00:00:00 2001
From: Hien Nguyen <hien@us.ibm.com>
Date: Thu, 23 Jun 2005 00:09:26 -0700
Subject: [PATCH] kprobes: moves lock-unlock to non-arch kprobe_flush_task

This patch moves the lock/unlock of the arch specific kprobe_flush_task()
to the non-arch specific kprobe_flush_task().

Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Acked-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c | 5 +----
 include/linux/kprobes.h    | 3 +--
 kernel/kprobes.c           | 5 ++++-
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 2314d8d306fd..b8e2bae0ab4f 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -138,17 +138,14 @@ void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
 	}
 }
 
-void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock)
+void arch_kprobe_flush_task(struct task_struct *tk)
 {
-	unsigned long flags = 0;
 	struct kretprobe_instance *ri;
-	spin_lock_irqsave(kp_lock, flags);
 	while ((ri = get_rp_inst_tsk(tk)) != NULL) {
 		*((unsigned long *)(ri->stack_addr)) =
 					(unsigned long) ri->ret_addr;
 		recycle_rp_inst(ri);
 	}
-	spin_unlock_irqrestore(kp_lock, flags);
 }
 
 /*
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0f90466fb8b0..461391decc46 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -33,7 +33,6 @@
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
-#include <linux/spinlock.h>
 
 #include <asm/kprobes.h>
 
@@ -101,7 +100,7 @@ extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
 							unsigned long flags);
 extern struct task_struct *arch_get_kprobe_task(void *ptr);
 extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs);
-extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock);
+extern void arch_kprobe_flush_task(struct task_struct *tk);
 #else /* ARCH_SUPPORTS_KRETPROBES */
 static inline void kretprobe_trampoline(void)
 {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e8e0ae8a6e14..dd42e717dd35 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -229,7 +229,10 @@ struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk)
  */
 void kprobe_flush_task(struct task_struct *tk)
 {
-	arch_kprobe_flush_task(tk, &kprobe_lock);
+	unsigned long flags = 0;
+	spin_lock_irqsave(&kprobe_lock, flags);
+	arch_kprobe_flush_task(tk);
+	spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From ea32c65cc2d2294c04e9f81d0578a6f51febfdbf Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Thu, 23 Jun 2005 00:09:36 -0700
Subject: [PATCH] kprobes: Temporary disarming of reentrant probe

In situations where a kprobes handler calls a routine which has a probe on it,
then kprobes_handler() disarms the new probe forever.  This patch removes the
above limitation by temporarily disarming the new probe.  When another
probe hits while handling the old probe, the kprobes_handler() saves previous
kprobes state and handles the new probe without calling the new kprobes
registered handlers.  kprobe_post_handler() restores back the previous kprobes
state and the normal execution continues.

However, on the x86_64 architecture, reentrancy is provided only through
pre_handler().  If a routine having a probe is referenced through
post_handler(), then the probes on that routine are disarmed forever, since
the exception stack gets changed after the processor single steps the
instruction of the new probe.

This patch includes generic changes to support temporary disarming on
reentrancy of probes.

Signed-of-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h | 9 +++++++++
 kernel/kprobes.c        | 1 +
 2 files changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 461391decc46..5e1a7b0d7b3f 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -36,6 +36,12 @@
 
 #include <asm/kprobes.h>
 
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE	0x00000001
+#define KPROBE_HIT_SS		0x00000002
+#define KPROBE_REENTER		0x00000004
+#define KPROBE_HIT_SSDONE	0x00000008
+
 struct kprobe;
 struct pt_regs;
 struct kretprobe;
@@ -55,6 +61,9 @@ struct kprobe {
 	/* list of kprobes for multi-handler support */
 	struct list_head list;
 
+	/*count the number of times this probe was temporarily disarmed */
+	unsigned long nmissed;
+
 	/* location of the probe point */
 	kprobe_opcode_t *addr;
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index dd42e717dd35..456ecedff2d4 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -335,6 +335,7 @@ int register_kprobe(struct kprobe *p)
 	}
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
+	p->nmissed = 0;
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
 		goto out;
-- 
cgit v1.3-14-g43fede


From 8b0914ea7475615c7c8965c1ac8fe4069270f25c Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Thu, 23 Jun 2005 00:09:41 -0700
Subject: [PATCH] jprobes: allow a jprobe to coexist with multiple kprobes

Presently either multiple kprobes or only one jprobe could be inserted.
This patch removes the above limitation and allows one jprobe and multiple
kprobes to coexist at the same address.  However multiple jprobes cannot
coexist with multiple kprobes.  Currently I am working on the prototype to
allow multiple jprobes to coexist with multiple kprobes.

Signed-off-by: Ananth N Mavinakayanhalli <amavin@redhat.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 456ecedff2d4..334f37472c56 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -89,9 +89,10 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	list_for_each_entry(kp, &p->list, list) {
 		if (kp->pre_handler) {
 			curr_kprobe = kp;
-			kp->pre_handler(kp, regs);
-			curr_kprobe = NULL;
+			if (kp->pre_handler(kp, regs))
+				return 1;
 		}
+		curr_kprobe = NULL;
 	}
 	return 0;
 }
@@ -125,6 +126,19 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
 	return 0;
 }
 
+static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe *kp = curr_kprobe;
+	if (curr_kprobe && kp->break_handler) {
+		if (kp->break_handler(kp, regs)) {
+			curr_kprobe = NULL;
+			return 1;
+		}
+	}
+	curr_kprobe = NULL;
+	return 0;
+}
+
 struct kprobe trampoline_p = {
 		.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
 		.pre_handler = trampoline_probe_handler,
@@ -257,19 +271,46 @@ static inline void free_rp_inst(struct kretprobe *rp)
 	}
 }
 
+/*
+ * Keep all fields in the kprobe consistent
+ */
+static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+	memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
+	memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+}
+
+/*
+* Add the new probe to old_p->list. Fail if this is the
+* second jprobe at the address - two jprobes can't coexist
+*/
+static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+        struct kprobe *kp;
+
+	if (p->break_handler) {
+		list_for_each_entry(kp, &old_p->list, list) {
+			if (kp->break_handler)
+				return -EEXIST;
+		}
+		list_add_tail(&p->list, &old_p->list);
+	} else
+		list_add(&p->list, &old_p->list);
+	return 0;
+}
+
 /*
  * Fill in the required fields of the "manager kprobe". Replace the
  * earlier kprobe in the hlist with the manager kprobe
  */
 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 {
+	copy_kprobe(p, ap);
 	ap->addr = p->addr;
-	memcpy(&ap->opcode, &p->opcode, sizeof(kprobe_opcode_t));
-	memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
-
 	ap->pre_handler = aggr_pre_handler;
 	ap->post_handler = aggr_post_handler;
 	ap->fault_handler = aggr_fault_handler;
+	ap->break_handler = aggr_break_handler;
 
 	INIT_LIST_HEAD(&ap->list);
 	list_add(&p->list, &ap->list);
@@ -290,16 +331,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
 	int ret = 0;
 	struct kprobe *ap;
 
-	if (old_p->break_handler || p->break_handler) {
-		ret = -EEXIST;	/* kprobe and jprobe can't (yet) coexist */
-	} else if (old_p->pre_handler == aggr_pre_handler) {
-		list_add(&p->list, &old_p->list);
+	if (old_p->pre_handler == aggr_pre_handler) {
+		copy_kprobe(old_p, p);
+		ret = add_new_kprobe(old_p, p);
 	} else {
 		ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC);
 		if (!ap)
 			return -ENOMEM;
 		add_aggr_kprobe(ap, old_p);
-		list_add(&p->list, &ap->list);
+		copy_kprobe(ap, p);
+		ret = add_new_kprobe(ap, p);
 	}
 	return ret;
 }
-- 
cgit v1.3-14-g43fede


From d6e711448137ca3301512cec41a2c2ce852b3d0a Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 23 Jun 2005 00:09:43 -0700
Subject: [PATCH] setuid core dump

Add a new `suid_dumpable' sysctl:

This value can be used to query and set the core dump mode for setuid
or otherwise protected/tainted binaries. The modes are

0 - (default) - traditional behaviour.  Any process which has changed
    privilege levels or is execute only will not be dumped

1 - (debug) - all processes dump core when possible.  The core dump is
    owned by the current user and no security is applied.  This is intended
    for system debugging situations only.  Ptrace is unchecked.

2 - (suidsafe) - any binary which normally would not be dumped is dumped
    readable by root only.  This allows the end user to remove such a dump but
    not access it directly.  For security reasons core dumps in this mode will
    not overwrite one another or other files.  This mode is appropriate when
    administrators are attempting to debug problems in a normal environment.

(akpm:

> > +EXPORT_SYMBOL(suid_dumpable);
>
> EXPORT_SYMBOL_GPL?

No problem to me.

> >  	if (current->euid == current->uid && current->egid == current->gid)
> >  		current->mm->dumpable = 1;
>
> Should this be SUID_DUMP_USER?

Actually the feedback I had from last time was that the SUID_ defines
should go because its clearer to follow the numbers. They can go
everywhere (and there are lots of places where dumpable is tested/used
as a bool in untouched code)

> Maybe this should be renamed to `dump_policy' or something.  Doing that
> would help us catch any code which isn't using the #defines, too.

Fair comment. The patch was designed to be easy to maintain for Red Hat
rather than for merging. Changing that field would create a gigantic
diff because it is used all over the place.

)

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/kernel.txt | 20 ++++++++++++++++++++
 fs/exec.c                       | 23 +++++++++++++++++++++--
 fs/proc/base.c                  |  6 ++++--
 include/linux/binfmts.h         |  5 +++++
 include/linux/sched.h           |  2 +-
 include/linux/sysctl.h          |  1 +
 kernel/sys.c                    | 22 +++++++++++-----------
 kernel/sysctl.c                 |  9 +++++++++
 security/commoncap.c            |  2 +-
 security/dummy.c                |  2 +-
 10 files changed, 74 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 35159176997b..9f11d36a8c10 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -49,6 +49,7 @@ show up in /proc/sys/kernel:
 - shmmax                      [ sysv ipc ]
 - shmmni
 - stop-a                      [ SPARC only ]
+- suid_dumpable
 - sysrq                       ==> Documentation/sysrq.txt
 - tainted
 - threads-max
@@ -300,6 +301,25 @@ kernel.  This value defaults to SHMMAX.
 
 ==============================================================
 
+suid_dumpable:
+
+This value can be used to query and set the core dump mode for setuid
+or otherwise protected/tainted binaries. The modes are
+
+0 - (default) - traditional behaviour. Any process which has changed
+	privilege levels or is execute only will not be dumped
+1 - (debug) - all processes dump core when possible. The core dump is
+	owned by the current user and no security is applied. This is
+	intended for system debugging situations only. Ptrace is unchecked.
+2 - (suidsafe) - any binary which normally would not be dumped is dumped
+	readable by root only. This allows the end user to remove
+	such a dump but not access it directly. For security reasons
+	core dumps in this mode will not overwrite one another or
+	other files. This mode is appropriate when administrators are
+	attempting to debug problems in a normal environment.
+
+==============================================================
+
 tainted: 
 
 Non-zero if the kernel has been tainted.  Numeric values, which
diff --git a/fs/exec.c b/fs/exec.c
index 3a4b35a14c0d..48871917d363 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -58,6 +58,9 @@
 
 int core_uses_pid;
 char core_pattern[65] = "core";
+int suid_dumpable = 0;
+
+EXPORT_SYMBOL(suid_dumpable);
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
 static struct linux_binfmt *formats;
@@ -864,6 +867,9 @@ int flush_old_exec(struct linux_binprm * bprm)
 
 	if (current->euid == current->uid && current->egid == current->gid)
 		current->mm->dumpable = 1;
+	else
+		current->mm->dumpable = suid_dumpable;
+
 	name = bprm->filename;
 
 	/* Copies the binary name from after last slash */
@@ -884,7 +890,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	    permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) ||
 	    (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
 		suid_keys(current);
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 	}
 
 	/* An exec changes our domain. We are no longer part of the thread
@@ -1432,6 +1438,8 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	struct inode * inode;
 	struct file * file;
 	int retval = 0;
+	int fsuid = current->fsuid;
+	int flag = 0;
 
 	binfmt = current->binfmt;
 	if (!binfmt || !binfmt->core_dump)
@@ -1441,6 +1449,16 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 		up_write(&mm->mmap_sem);
 		goto fail;
 	}
+
+	/*
+	 *	We cannot trust fsuid as being the "true" uid of the
+	 *	process nor do we know its entire history. We only know it
+	 *	was tainted so we dump it as root in mode 2.
+	 */
+	if (mm->dumpable == 2) {	/* Setuid core dump mode */
+		flag = O_EXCL;		/* Stop rewrite attacks */
+		current->fsuid = 0;	/* Dump root private */
+	}
 	mm->dumpable = 0;
 	init_completion(&mm->core_done);
 	spin_lock_irq(&current->sighand->siglock);
@@ -1466,7 +1484,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
  	lock_kernel();
 	format_corename(corename, core_pattern, signr);
 	unlock_kernel();
-	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600);
+	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600);
 	if (IS_ERR(file))
 		goto fail_unlock;
 	inode = file->f_dentry->d_inode;
@@ -1491,6 +1509,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 close_fail:
 	filp_close(file, NULL);
 fail_unlock:
+	current->fsuid = fsuid;
 	complete_all(&mm->core_done);
 fail:
 	return retval;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e31903aadd96..ace151fa4878 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -314,7 +314,7 @@ static int may_ptrace_attach(struct task_struct *task)
 	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
 		goto out;
 	rmb();
-	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+	if (task->mm->dumpable != 1 && !capable(CAP_SYS_PTRACE))
 		goto out;
 	if (security_ptrace(current, task))
 		goto out;
@@ -1113,7 +1113,9 @@ static int task_dumpable(struct task_struct *task)
 	if (mm)
 		dumpable = mm->dumpable;
 	task_unlock(task);
-	return dumpable;
+	if(dumpable == 1)
+		return 1;
+	return 0;
 }
 
 
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 7e736e201c46..c1e82c514443 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -69,6 +69,11 @@ extern void remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *,struct pt_regs *);
 extern int flush_old_exec(struct linux_binprm * bprm);
 
+extern int suid_dumpable;
+#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
+#define SUID_DUMP_USER		1	/* Dump as user of process */
+#define SUID_DUMP_ROOT		2	/* Dump as root */
+
 /* Stack area protections */
 #define EXSTACK_DEFAULT   0	/* Whatever the arch defaults to */
 #define EXSTACK_DISABLE_X 1	/* Disable executable stacks */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b58afd97a180..901742f92389 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -246,7 +246,7 @@ struct mm_struct {
 
 	unsigned long saved_auxv[42]; /* for /proc/PID/auxv */
 
-	unsigned dumpable:1;
+	unsigned dumpable:2;
 	cpumask_t cpu_vm_mask;
 
 	/* Architecture-specific MM context */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a17745c80a91..614e939c78a4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -136,6 +136,7 @@ enum
 	KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */
 	KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */
 	KERN_RANDOMIZE=68, /* int: randomize virtual address space */
+	KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
 };
 
 
diff --git a/kernel/sys.c b/kernel/sys.c
index f006632c2ba7..0a2c8cda9638 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -525,7 +525,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 	}
 	if (new_egid != old_egid)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	if (rgid != (gid_t) -1 ||
@@ -556,7 +556,7 @@ asmlinkage long sys_setgid(gid_t gid)
 	{
 		if(old_egid != gid)
 		{
-			current->mm->dumpable=0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->gid = current->egid = current->sgid = current->fsgid = gid;
@@ -565,7 +565,7 @@ asmlinkage long sys_setgid(gid_t gid)
 	{
 		if(old_egid != gid)
 		{
-			current->mm->dumpable=0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->egid = current->fsgid = gid;
@@ -596,7 +596,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
 
 	if(dumpclear)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->uid = new_ruid;
@@ -653,7 +653,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 
 	if (new_euid != old_euid)
 	{
-		current->mm->dumpable=0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->fsuid = current->euid = new_euid;
@@ -703,7 +703,7 @@ asmlinkage long sys_setuid(uid_t uid)
 
 	if (old_euid != uid)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->fsuid = current->euid = uid;
@@ -748,7 +748,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 	if (euid != (uid_t) -1) {
 		if (euid != current->euid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->euid = euid;
@@ -798,7 +798,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 	if (egid != (gid_t) -1) {
 		if (egid != current->egid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->egid = egid;
@@ -845,7 +845,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
 	{
 		if (uid != old_fsuid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->fsuid = uid;
@@ -875,7 +875,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
 	{
 		if (gid != old_fsgid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->fsgid = gid;
@@ -1652,7 +1652,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 				error = 1;
 			break;
 		case PR_SET_DUMPABLE:
-			if (arg2 != 0 && arg2 != 1) {
+			if (arg2 > 1) {	/* mode 2 (suidsafe) is sysctl-only */
 				error = -EINVAL;
 				break;
 			}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 701d12c63068..24a4d12d5aa9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio;
 extern int max_threads;
 extern int sysrq_enabled;
 extern int core_uses_pid;
+extern int suid_dumpable;
 extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
@@ -950,6 +951,14 @@ static ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= KERN_SETUID_DUMPABLE,
+		.procname	= "suid_dumpable",
+		.data		= &suid_dumpable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/security/commoncap.c b/security/commoncap.c
index 849b8c338ee8..04c12f58d656 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -149,7 +149,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 
 	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
 	    !cap_issubset (new_permitted, current->cap_permitted)) {
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 
 		if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
 			if (!capable(CAP_SETUID)) {
diff --git a/security/dummy.c b/security/dummy.c
index b32eff146547..6ff887586479 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -130,7 +130,7 @@ static void dummy_bprm_free_security (struct linux_binprm *bprm)
 static void dummy_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 {
 	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) {
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 
 		if ((unsafe & ~LSM_UNSAFE_PTRACE_CAP) && !capable(CAP_SETUID)) {
 			bprm->e_uid = current->uid;
-- 
cgit v1.3-14-g43fede


From 4fea2838aa00b9e59efde974dcdb455608192811 Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Thu, 23 Jun 2005 00:09:51 -0700
Subject: [PATCH] Software suspend and recalc sigpending bug fix

This patch fixes recalc_sigpending() to work correctly with tasks which are
being frozen.

The problem is that freeze_processes() sets the PF_FREEZE and TIF_SIGPENDING
flags on tasks, but recalc_sigpending() called from e.g.
sys_rt_sigtimedwait or any other kernel place will clear TIF_SIGPENDING due
to no pending signals being queued, and the tasks won't be frozen until they
receive a real signal or freeze_processes() fails due to a timeout.

Signed-Off-By: Kirill Korotaev <dev@sw.ru>
Signed-Off-By: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index c89821b69ae3..d1258729a5f9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -213,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 fastcall void recalc_sigpending_tsk(struct task_struct *t)
 {
 	if (t->signal->group_stop_count > 0 ||
+	    (t->flags & PF_FREEZE) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked))
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
-- 
cgit v1.3-14-g43fede


From 71a2224d7d1cefc23a1ac80bba421cc069cc3257 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 00:10:05 -0700
Subject: [PATCH] Optimize sys_times for a single thread process

Avoid taking the tasklist_lock in sys_times if the process is single
threaded.  In a NUMA system taking the tasklist_lock may cause a bouncing
cacheline if multiple independent processes continually call sys_times to
measure their performance.

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c |  5 ++++
 kernel/sys.c  | 86 +++++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 65 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index c2bdf6fb61a5..3ebcd60a19c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,11 @@ repeat:
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
 	__exit_sighand(p);
+	/*
+	 * Note that the fastpath in sys_times depends on __exit_signal having
+	 * updated the counters before a task is removed from the tasklist of
+	 * the process by __unhash_process.
+	 */
 	__unhash_process(p);
 
 	/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 0a2c8cda9638..5a9d6b075016 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -894,35 +894,69 @@ asmlinkage long sys_times(struct tms __user * tbuf)
 	 */
 	if (tbuf) {
 		struct tms tmp;
-		struct task_struct *tsk = current;
-		struct task_struct *t;
 		cputime_t utime, stime, cutime, cstime;
 
-		read_lock(&tasklist_lock);
-		utime = tsk->signal->utime;
-		stime = tsk->signal->stime;
-		t = tsk;
-		do {
-			utime = cputime_add(utime, t->utime);
-			stime = cputime_add(stime, t->stime);
-			t = next_thread(t);
-		} while (t != tsk);
-
-		/*
-		 * While we have tasklist_lock read-locked, no dying thread
-		 * can be updating current->signal->[us]time.  Instead,
-		 * we got their counts included in the live thread loop.
-		 * However, another thread can come in right now and
-		 * do a wait call that updates current->signal->c[us]time.
-		 * To make sure we always see that pair updated atomically,
-		 * we take the siglock around fetching them.
-		 */
-		spin_lock_irq(&tsk->sighand->siglock);
-		cutime = tsk->signal->cutime;
-		cstime = tsk->signal->cstime;
-		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
+#ifdef CONFIG_SMP
+		if (thread_group_empty(current)) {
+			/*
+			 * Single thread case without the use of any locks.
+			 *
+			 * We may race with release_task if two threads are
+			 * executing. However, release_task first adds up the
+			 * counters (__exit_signal) before removing the task
+			 * from the process tasklist (__unhash_process).
+			 * __exit_signal also acquires and releases the
+			 * siglock which results in the proper memory ordering
+			 * so that the list modifications are always visible
+			 * after the counters have been updated.
+			 *
+			 * If the counters have been updated by the second thread
+			 * but the thread has not yet been removed from the list
+			 * then the other branch will be executing which will
+			 * block on tasklist_lock until the exit handling of the
+			 * other task is finished.
+			 *
+			 * This also implies that the sighand->siglock cannot
+			 * be held by another processor. So we can also
+			 * skip acquiring that lock.
+			 */
+			utime = cputime_add(current->signal->utime, current->utime);
+			stime = cputime_add(current->signal->stime, current->stime);
+			cutime = current->signal->cutime;
+			cstime = current->signal->cstime;
+		} else
+#endif
+		{
+
+			/* Process with multiple threads */
+			struct task_struct *tsk = current;
+			struct task_struct *t;
 
+			read_lock(&tasklist_lock);
+			utime = tsk->signal->utime;
+			stime = tsk->signal->stime;
+			t = tsk;
+			do {
+				utime = cputime_add(utime, t->utime);
+				stime = cputime_add(stime, t->stime);
+				t = next_thread(t);
+			} while (t != tsk);
+
+			/*
+			 * While we have tasklist_lock read-locked, no dying thread
+			 * can be updating current->signal->[us]time.  Instead,
+			 * we got their counts included in the live thread loop.
+			 * However, another thread can come in right now and
+			 * do a wait call that updates current->signal->c[us]time.
+			 * To make sure we always see that pair updated atomically,
+			 * we take the siglock around fetching them.
+			 */
+			spin_lock_irq(&tsk->sighand->siglock);
+			cutime = tsk->signal->cutime;
+			cstime = tsk->signal->cstime;
+			spin_unlock_irq(&tsk->sighand->siglock);
+			read_unlock(&tasklist_lock);
+		}
 		tmp.tms_utime = cputime_to_clock_t(utime);
 		tmp.tms_stime = cputime_to_clock_t(stime);
 		tmp.tms_cutime = cputime_to_clock_t(cutime);
-- 
cgit v1.3-14-g43fede


From c43dc2fd885b5658cfd7cedb7bcca20910c517a4 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Thu, 23 Jun 2005 00:10:27 -0700
Subject: [PATCH] aio: make wait_queue ->task ->private

In the upcoming aio_down patch, it is useful to store a private data
pointer in the kiocb's wait_queue.  Since we provide our own wake up
function and do not require the task_struct pointer, it makes sense to
convert the task pointer into a generic private pointer.

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/wait.h | 16 ++++++++--------
 kernel/sched.c       |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index c9486c3efb4a..d38c9fecdc36 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -33,7 +33,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key
 struct __wait_queue {
 	unsigned int flags;
 #define WQ_FLAG_EXCLUSIVE	0x01
-	struct task_struct * task;
+	void *private;
 	wait_queue_func_t func;
 	struct list_head task_list;
 };
@@ -60,7 +60,7 @@ typedef struct __wait_queue_head wait_queue_head_t;
  */
 
 #define __WAITQUEUE_INITIALIZER(name, tsk) {				\
-	.task		= tsk,						\
+	.private	= tsk,						\
 	.func		= default_wake_function,			\
 	.task_list	= { NULL, NULL } }
 
@@ -86,7 +86,7 @@ static inline void init_waitqueue_head(wait_queue_head_t *q)
 static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
 {
 	q->flags = 0;
-	q->task = p;
+	q->private = p;
 	q->func = default_wake_function;
 }
 
@@ -94,7 +94,7 @@ static inline void init_waitqueue_func_entry(wait_queue_t *q,
 					wait_queue_func_t func)
 {
 	q->flags = 0;
-	q->task = NULL;
+	q->private = NULL;
 	q->func = func;
 }
 
@@ -110,7 +110,7 @@ static inline int waitqueue_active(wait_queue_head_t *q)
  * aio specifies a wait queue entry with an async notification
  * callback routine, not associated with any task.
  */
-#define is_sync_wait(wait)	(!(wait) || ((wait)->task))
+#define is_sync_wait(wait)	(!(wait) || ((wait)->private))
 
 extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
@@ -384,7 +384,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
 #define DEFINE_WAIT(name)						\
 	wait_queue_t name = {						\
-		.task		= current,				\
+		.private	= current,				\
 		.func		= autoremove_wake_function,		\
 		.task_list	= LIST_HEAD_INIT((name).task_list),	\
 	}
@@ -393,7 +393,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 	struct wait_bit_queue name = {					\
 		.key = __WAIT_BIT_KEY_INITIALIZER(word, bit),		\
 		.wait	= {						\
-			.task		= current,			\
+			.private	= current,			\
 			.func		= wake_bit_function,		\
 			.task_list	=				\
 				LIST_HEAD_INIT((name).wait.task_list),	\
@@ -402,7 +402,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
 #define init_wait(wait)							\
 	do {								\
-		(wait)->task = current;					\
+		(wait)->private = current;				\
 		(wait)->func = autoremove_wake_function;		\
 		INIT_LIST_HEAD(&(wait)->task_list);			\
 	} while (0)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6ee4515d5a20..76080d142e3d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2869,7 +2869,7 @@ need_resched:
 
 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
 {
-	task_t *p = curr->task;
+	task_t *p = curr->private;
 	return try_to_wake_up(p, mode, sync);
 }
 
-- 
cgit v1.3-14-g43fede


From 7888e7ff4ee579442128d7d12a9c9dbf2cf7de6a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 23 Jun 2005 22:00:51 -0700
Subject: [PATCH] Keys: Pass session keyring to call_usermodehelper()

The attached patch makes it possible to pass a session keyring through to the
process spawned by call_usermodehelper().  This allows patch 3/3 to pass an
authorisation key through to /sbin/request-key, thus permitting better access
controls when doing just-in-time key creation.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/key.h         | 10 +++++++++-
 include/linux/kmod.h        | 13 ++++++++++++-
 kernel/kmod.c               | 17 +++++++++++++----
 security/keys/request_key.c |  2 +-
 4 files changed, 35 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/key.h b/include/linux/key.h
index 2c24ffaca86f..2bfbf88d2740 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -273,14 +273,22 @@ extern void key_fsuid_changed(struct task_struct *tsk);
 extern void key_fsgid_changed(struct task_struct *tsk);
 extern void key_init(void);
 
+#define __install_session_keyring(tsk, keyring)			\
+({								\
+	struct key *old_session = tsk->signal->session_keyring;	\
+	tsk->signal->session_keyring = keyring;			\
+	old_session;						\
+})
+
 #else /* CONFIG_KEYS */
 
 #define key_validate(k)			0
 #define key_serial(k)			0
-#define key_get(k) 			NULL
+#define key_get(k) 			({ NULL; })
 #define key_put(k)			do { } while(0)
 #define alloc_uid_keyring(u)		0
 #define switch_uid_keyring(u)		do { } while(0)
+#define __install_session_keyring(t, k)	({ NULL; })
 #define copy_keys(f,t)			0
 #define copy_thread_group_keys(t)	0
 #define exit_keys(t)			do { } while(0)
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 95d0e4b0814d..e4a231549407 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -19,6 +19,7 @@
  *      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include <linux/stddef.h>
 #include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/compiler.h>
@@ -34,7 +35,17 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; }
 #endif
 
 #define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x)))
-extern int call_usermodehelper(char *path, char *argv[], char *envp[], int wait);
+
+struct key;
+extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[],
+				    struct key *session_keyring, int wait);
+
+static inline int
+call_usermodehelper(char *path, char **argv, char **envp, int wait)
+{
+	return call_usermodehelper_keys(path, argv, envp, NULL, wait);
+}
+
 extern void usermodehelper_init(void);
 
 #endif /* __LINUX_KMOD_H__ */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index eed53d4f5230..44166e3bb8af 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,6 +120,7 @@ struct subprocess_info {
 	char *path;
 	char **argv;
 	char **envp;
+	struct key *ring;
 	int wait;
 	int retval;
 };
@@ -130,16 +131,21 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
+	struct key *old_session;
 	int retval;
 
-	/* Unblock all signals. */
+	/* Unblock all signals and set the session keyring. */
+	key_get(sub_info->ring);
 	flush_signals(current);
 	spin_lock_irq(&current->sighand->siglock);
+	old_session = __install_session_keyring(current, sub_info->ring);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
+	key_put(old_session);
+
 	/* We can run anywhere, unlike our parent keventd(). */
 	set_cpus_allowed(current, CPU_MASK_ALL);
 
@@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data)
 }
 
 /**
- * call_usermodehelper - start a usermode application
+ * call_usermodehelper_keys - start a usermode application
  * @path: pathname for the application
  * @argv: null-terminated argument list
  * @envp: null-terminated environment list
+ * @session_keyring: session keyring for process (NULL for an empty keyring)
  * @wait: wait for the application to finish and return status.
  *
  * Runs a user-space application.  The application is started
@@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data)
  * Must be called from process context.  Returns a negative error code
  * if program was not execed successfully, or 0.
  */
-int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+int call_usermodehelper_keys(char *path, char **argv, char **envp,
+			     struct key *session_keyring, int wait)
 {
 	DECLARE_COMPLETION(done);
 	struct subprocess_info sub_info = {
@@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 		.path		= path,
 		.argv		= argv,
 		.envp		= envp,
+		.ring		= session_keyring,
 		.wait		= wait,
 		.retval		= 0,
 	};
@@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
-EXPORT_SYMBOL(call_usermodehelper);
+EXPORT_SYMBOL(call_usermodehelper_keys);
 
 void __init usermodehelper_init(void)
 {
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 1f6c0940297f..1919540f047d 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -88,7 +88,7 @@ static int call_request_key(struct key *key,
 	argv[i] = NULL;
 
 	/* do it */
-	return call_usermodehelper(argv[0], argv, envp, 1);
+	return call_usermodehelper_keys(argv[0], argv, envp, NULL, 1);
 
 } /* end call_request_key() */
 
-- 
cgit v1.3-14-g43fede


From 3e30148c3d524a9c1c63ca28261bc24c457eb07a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 23 Jun 2005 22:00:56 -0700
Subject: [PATCH] Keys: Make request-key create an authorisation key

The attached patch makes the following changes:

 (1) There's a new special key type called ".request_key_auth".

     This is an authorisation key for when one process requests a key and
     another process is started to construct it. This type of key cannot be
     created by the user; nor can it be requested by kernel services.

     Authorisation keys hold two references:

     (a) Each refers to a key being constructed. When the key being
     	 constructed is instantiated the authorisation key is revoked,
     	 rendering it of no further use.

     (b) The "authorising process". This is either:

     	 (i) the process that called request_key(), or:

     	 (ii) if the process that called request_key() itself had an
     	      authorisation key in its session keyring, then the authorising
     	      process referred to by that authorisation key will also be
     	      referred to by the new authorisation key.

	 This means that the process that initiated a chain of key requests
	 will authorise the lot of them, and will, by default, wind up with
	 the keys obtained from them in its keyrings.

 (2) request_key() creates an authorisation key which is then passed to
     /sbin/request-key as part of a new session keyring.

 (3) When request_key() is searching for a key to hand back to the caller, if
     it comes across an authorisation key in the session keyring of the
     calling process, it will also search the keyrings of the process
     specified therein and it will use the specified process's credentials
     (fsuid, fsgid, groups) to do that rather than the calling process's
     credentials.

     This allows a process started by /sbin/request-key to find keys belonging
     to the authorising process.

 (4) A key can be read, even if the process executing KEYCTL_READ doesn't have
     direct read or search permission if that key is contained within the
     keyrings of a process specified by an authorisation key found within the
     calling process's session keyring, and is searchable using the
     credentials of the authorising process.

     This allows a process started by /sbin/request-key to read keys belonging
     to the authorising process.

 (5) The magic KEY_SPEC_*_KEYRING key IDs when passed to KEYCTL_INSTANTIATE or
     KEYCTL_NEGATE will specify a keyring of the authorising process, rather
     than the process doing the instantiation.

 (6) One of the process keyrings can be nominated as the default to which
     request_key() should attach new keys if not otherwise specified. This is
     done with KEYCTL_SET_REQKEY_KEYRING and one of the KEY_REQKEY_DEFL_*
     constants. The current setting can also be read using this call.

 (7) request_key() is partially interruptible. If it is waiting for another
     process to finish constructing a key, it can be interrupted. This permits
     a request-key cycle to be broken without recourse to rebooting.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-Off-By: Benoit Boissinot <benoit.boissinot@ens-lyon.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt           |  34 ++++++++
 include/linux/key-ui.h           |  41 ++++++++-
 include/linux/key.h              |   9 +-
 include/linux/keyctl.h           |  11 +++
 include/linux/sched.h            |   8 +-
 kernel/sys.c                     |   2 +-
 security/keys/Makefile           |   5 +-
 security/keys/compat.c           |   7 +-
 security/keys/internal.h         |  45 +++++++++-
 security/keys/key.c              |  24 ++++--
 security/keys/keyctl.c           | 176 ++++++++++++++++++++++++-------------
 security/keys/keyring.c          |  67 ++++++++++++--
 security/keys/process_keys.c     | 179 +++++++++++++++++++++++---------------
 security/keys/request_key.c      | 182 ++++++++++++++++++++++++++++++++-------
 security/keys/request_key_auth.c | 180 ++++++++++++++++++++++++++++++++++++++
 15 files changed, 779 insertions(+), 191 deletions(-)
 create mode 100644 security/keys/request_key_auth.c

(limited to 'kernel')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 3df40c1fe15a..0321ded4b9ae 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -591,6 +591,37 @@ The keyctl syscall functions are:
      this case too.
 
 
+ (*) Set the default request-key destination keyring.
+
+	long keyctl(KEYCTL_SET_REQKEY_KEYRING, int reqkey_defl);
+
+     This sets the default keyring to which implicitly requested keys will be
+     attached for this thread. reqkey_defl should be one of these constants:
+
+	CONSTANT				VALUE	NEW DEFAULT KEYRING
+	======================================	======	=======================
+	KEY_REQKEY_DEFL_NO_CHANGE		-1	No change
+	KEY_REQKEY_DEFL_DEFAULT			0	Default[1]
+	KEY_REQKEY_DEFL_THREAD_KEYRING		1	Thread keyring
+	KEY_REQKEY_DEFL_PROCESS_KEYRING		2	Process keyring
+	KEY_REQKEY_DEFL_SESSION_KEYRING		3	Session keyring
+	KEY_REQKEY_DEFL_USER_KEYRING		4	User keyring
+	KEY_REQKEY_DEFL_USER_SESSION_KEYRING	5	User session keyring
+	KEY_REQKEY_DEFL_GROUP_KEYRING		6	Group keyring
+
+     The old default will be returned if successful and error EINVAL will be
+     returned if reqkey_defl is not one of the above values.
+
+     The default keyring can be overridden by the keyring indicated to the
+     request_key() system call.
+
+     Note that this setting is inherited across fork/exec.
+
+     [1] The default default is: the thread keyring if there is one, otherwise
+     the process keyring if there is one, otherwise the session keyring if
+     there is one, otherwise the user default session keyring.
+
+
 ===============
 KERNEL SERVICES
 ===============
@@ -626,6 +657,9 @@ payload contents" for more information.
     Should the function fail error ENOKEY, EKEYEXPIRED or EKEYREVOKED will be
     returned.
 
+    If successful, the key will have been attached to the default keyring for
+    implicitly obtained request-key keys, as set by KEYCTL_SET_REQKEY_KEYRING.
+
 
 (*) When it is no longer required, the key should be released using:
 
diff --git a/include/linux/key-ui.h b/include/linux/key-ui.h
index 159ca8d54e9a..cc326174a808 100644
--- a/include/linux/key-ui.h
+++ b/include/linux/key-ui.h
@@ -1,4 +1,4 @@
-/* key-ui.h: key userspace interface stuff for use by keyfs
+/* key-ui.h: key userspace interface stuff
  *
  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -84,8 +84,45 @@ static inline int key_any_permission(const struct key *key, key_perm_t perm)
 	return kperm != 0;
 }
 
+static inline int key_task_groups_search(struct task_struct *tsk, gid_t gid)
+{
+	int ret;	/* whatever groups_search() reports for gid */
+
+	task_lock(tsk);	/* tsk->group_info is only read with tsk locked */
+	ret = groups_search(tsk->group_info, gid);
+	task_unlock(tsk);
+	return ret;
+}
+
+static inline int key_task_permission(const struct key *key,
+				      struct task_struct *context,
+				      key_perm_t perm)
+{
+	key_perm_t kperm;	/* the permission bits that apply to context */
+
+	if (key->uid == context->fsuid) {
+		kperm = key->perm >> 16;	/* key's uid is context's fsuid */
+	}
+	else if (key->gid != -1 &&
+		 key->perm & KEY_GRP_ALL && (
+			 key->gid == context->fsgid ||
+			 key_task_groups_search(context, key->gid)
+			 )
+		 ) {
+		kperm = key->perm >> 8;	/* key's gid is among context's groups */
+	}
+	else {
+		kperm = key->perm;	/* no uid or usable gid match */
+	}
+
+	kperm = kperm & perm & KEY_ALL;
+	/* non-zero only if every permission asked for in perm was granted */
+	return kperm == perm;
+
+}
 
-extern struct key *lookup_user_key(key_serial_t id, int create, int part,
+extern struct key *lookup_user_key(struct task_struct *context,
+				   key_serial_t id, int create, int partial,
 				   key_perm_t perm);
 
 extern long join_session_keyring(const char *name);
diff --git a/include/linux/key.h b/include/linux/key.h
index 2bfbf88d2740..970bbd916cf4 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -199,10 +199,12 @@ extern int key_payload_reserve(struct key *key, size_t datalen);
 extern int key_instantiate_and_link(struct key *key,
 				    const void *data,
 				    size_t datalen,
-				    struct key *keyring);
+				    struct key *keyring,
+				    struct key *instkey);
 extern int key_negate_and_link(struct key *key,
 			       unsigned timeout,
-			       struct key *keyring);
+			       struct key *keyring,
+			       struct key *instkey);
 extern void key_revoke(struct key *key);
 extern void key_put(struct key *key);
 
@@ -245,9 +247,6 @@ extern struct key *keyring_search(struct key *keyring,
 				  struct key_type *type,
 				  const char *description);
 
-extern struct key *search_process_keyrings(struct key_type *type,
-					   const char *description);
-
 extern int keyring_add_key(struct key *keyring,
 			   struct key *key);
 
diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h
index 381dedc370a3..8d7c59a29e09 100644
--- a/include/linux/keyctl.h
+++ b/include/linux/keyctl.h
@@ -20,6 +20,16 @@
 #define KEY_SPEC_USER_SESSION_KEYRING	-5	/* - key ID for UID-session keyring */
 #define KEY_SPEC_GROUP_KEYRING		-6	/* - key ID for GID-specific keyring */
 
+/* request-key default keyrings (argument to KEYCTL_SET_REQKEY_KEYRING) */
+#define KEY_REQKEY_DEFL_NO_CHANGE		-1	/* - leave the default as it is */
+#define KEY_REQKEY_DEFL_DEFAULT			0	/* - the default default */
+#define KEY_REQKEY_DEFL_THREAD_KEYRING		1	/* - thread keyring */
+#define KEY_REQKEY_DEFL_PROCESS_KEYRING		2	/* - process keyring */
+#define KEY_REQKEY_DEFL_SESSION_KEYRING		3	/* - session keyring */
+#define KEY_REQKEY_DEFL_USER_KEYRING		4	/* - user keyring */
+#define KEY_REQKEY_DEFL_USER_SESSION_KEYRING	5	/* - user session keyring */
+#define KEY_REQKEY_DEFL_GROUP_KEYRING		6	/* - group keyring */
+
 /* keyctl commands */
 #define KEYCTL_GET_KEYRING_ID		0	/* ask for a keyring's ID */
 #define KEYCTL_JOIN_SESSION_KEYRING	1	/* join or start named session keyring */
@@ -35,5 +45,6 @@
 #define KEYCTL_READ			11	/* read a key or keyring's contents */
 #define KEYCTL_INSTANTIATE		12	/* instantiate a partially constructed key */
 #define KEYCTL_NEGATE			13	/* negate a partially constructed key */
+#define KEYCTL_SET_REQKEY_KEYRING	14	/* set default request-key keyring */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 901742f92389..2c69682b0444 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -561,9 +561,10 @@ struct group_info {
 		groups_free(group_info); \
 } while (0)
 
-struct group_info *groups_alloc(int gidsetsize);
-void groups_free(struct group_info *group_info);
-int set_current_groups(struct group_info *group_info);
+extern struct group_info *groups_alloc(int gidsetsize);
+extern void groups_free(struct group_info *group_info);
+extern int set_current_groups(struct group_info *group_info);
+extern int groups_search(struct group_info *group_info, gid_t grp);
 /* access the groups "array" with this macro */
 #define GROUP_AT(gi, i) \
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
@@ -660,6 +661,7 @@ struct task_struct {
 	struct user_struct *user;
 #ifdef CONFIG_KEYS
 	struct key *thread_keyring;	/* keyring private to this thread */
+	unsigned char jit_keyring;	/* default keyring to attach requested keys to */
 #endif
 	int oomkilladj; /* OOM kill score adjustment (bit shift). */
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
diff --git a/kernel/sys.c b/kernel/sys.c
index 5a9d6b075016..da24bc1292db 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1259,7 +1259,7 @@ static void groups_sort(struct group_info *group_info)
 }
 
 /* a simple bsearch */
-static int groups_search(struct group_info *group_info, gid_t grp)
+int groups_search(struct group_info *group_info, gid_t grp)
 {
 	int left, right;
 
diff --git a/security/keys/Makefile b/security/keys/Makefile
index ddb495d65062..c392d750b208 100644
--- a/security/keys/Makefile
+++ b/security/keys/Makefile
@@ -7,8 +7,9 @@ obj-y := \
 	keyring.o \
 	keyctl.o \
 	process_keys.o \
-	user_defined.o \
-	request_key.o
+	request_key.o \
+	request_key_auth.o \
+	user_defined.o
 
 obj-$(CONFIG_KEYS_COMPAT) += compat.o
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/security/keys/compat.c b/security/keys/compat.c
index aff8b22dcb5c..3303673c636e 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -1,6 +1,6 @@
 /* compat.c: 32-bit compatibility syscall for 64-bit systems
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -24,7 +24,7 @@
  * - if you can, you should call sys_keyctl directly
  */
 asmlinkage long compat_sys_keyctl(u32 option,
-			      u32 arg2, u32 arg3, u32 arg4, u32 arg5)
+				  u32 arg2, u32 arg3, u32 arg4, u32 arg5)
 {
 	switch (option) {
 	case KEYCTL_GET_KEYRING_ID:
@@ -71,6 +71,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
 	case KEYCTL_NEGATE:
 		return keyctl_negate_key(arg2, arg3, arg4);
 
+	case KEYCTL_SET_REQKEY_KEYRING:
+		return keyctl_set_reqkey_keyring(arg2);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 67b2b93a7489..46c8602661c9 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -1,6 +1,6 @@
 /* internal.h: authentication token and access key management internal defs
  *
- * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2003-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -15,6 +15,16 @@
 #include <linux/key.h>
 #include <linux/key-ui.h>
 
+#if 0 /* make this non-zero to have the macros below emit printk() output */
+#define kenter(FMT, a...)	printk("==> %s("FMT")\n",__FUNCTION__ , ## a)
+#define kleave(FMT, a...)	printk("<== %s()"FMT"\n",__FUNCTION__ , ## a)
+#define kdebug(FMT, a...)	printk(FMT"\n" , ## a)
+#else /* otherwise the macros expand to empty statements */
+#define kenter(FMT, a...)	do {} while(0)
+#define kleave(FMT, a...)	do {} while(0)
+#define kdebug(FMT, a...)	do {} while(0)
+#endif /* debugging switch */
+
 extern struct key_type key_type_dead;
 extern struct key_type key_type_user;
 
@@ -66,20 +76,46 @@ extern struct key *__keyring_search_one(struct key *keyring,
 					const char *description,
 					key_perm_t perm);
 
+extern struct key *keyring_search_instkey(struct key *keyring,
+					  key_serial_t target_id);
+
 typedef int (*key_match_func_t)(const struct key *, const void *);
 
 extern struct key *keyring_search_aux(struct key *keyring,
+				      struct task_struct *tsk,
 				      struct key_type *type,
 				      const void *description,
 				      key_match_func_t match);
 
-extern struct key *search_process_keyrings_aux(struct key_type *type,
-					       const void *description,
-					       key_match_func_t match);
+extern struct key *search_process_keyrings(struct key_type *type,
+					   const void *description,
+					   key_match_func_t match,
+					   struct task_struct *tsk);
 
 extern struct key *find_keyring_by_name(const char *name, key_serial_t bound);
 
 extern int install_thread_keyring(struct task_struct *tsk);
+extern int install_process_keyring(struct task_struct *tsk);
+
+extern struct key *request_key_and_link(struct key_type *type,
+					const char *description,
+					const char *callout_info,
+					struct key *dest_keyring);
+
+/*
+ * request_key authorisation - payload of a key_type_request_key_auth key
+ */
+struct request_key_auth {
+	struct key		*target_key;	/* key to instantiate or negate */
+	struct task_struct	*context;	/* task whose keyrings are used */
+	pid_t			pid;		/* presumably context's pid - TODO confirm */
+};
+
+extern struct key_type key_type_request_key_auth;
+extern struct key *request_key_auth_new(struct key *target,
+					struct key **_rkakey);
+
+extern struct key *key_get_instantiation_authkey(key_serial_t target_id);
 
 /*
  * keyctl functions
@@ -100,6 +136,7 @@ extern long keyctl_setperm_key(key_serial_t, key_perm_t);
 extern long keyctl_instantiate_key(key_serial_t, const void __user *,
 				   size_t, key_serial_t);
 extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
+extern long keyctl_set_reqkey_keyring(int);
 
 
 /*
diff --git a/security/keys/key.c b/security/keys/key.c
index 1fdfccb3fe43..3304d37bb379 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -1,6 +1,6 @@
 /* key.c: basic authentication token and access key management
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -391,7 +391,8 @@ EXPORT_SYMBOL(key_payload_reserve);
 static int __key_instantiate_and_link(struct key *key,
 				      const void *data,
 				      size_t datalen,
-				      struct key *keyring)
+				      struct key *keyring,
+				      struct key *instkey)
 {
 	int ret, awaken;
 
@@ -419,6 +420,10 @@ static int __key_instantiate_and_link(struct key *key,
 			/* and link it into the destination keyring */
 			if (keyring)
 				ret = __key_link(keyring, key);
+
+			/* disable the authorisation key */
+			if (instkey)
+				key_revoke(instkey);
 		}
 	}
 
@@ -439,19 +444,21 @@ static int __key_instantiate_and_link(struct key *key,
 int key_instantiate_and_link(struct key *key,
 			     const void *data,
 			     size_t datalen,
-			     struct key *keyring)
+			     struct key *keyring,
+			     struct key *instkey)
 {
 	int ret;
 
 	if (keyring)
 		down_write(&keyring->sem);
 
-	ret = __key_instantiate_and_link(key, data, datalen, keyring);
+	ret = __key_instantiate_and_link(key, data, datalen, keyring, instkey);
 
 	if (keyring)
 		up_write(&keyring->sem);
 
 	return ret;
+
 } /* end key_instantiate_and_link() */
 
 EXPORT_SYMBOL(key_instantiate_and_link);
@@ -462,7 +469,8 @@ EXPORT_SYMBOL(key_instantiate_and_link);
  */
 int key_negate_and_link(struct key *key,
 			unsigned timeout,
-			struct key *keyring)
+			struct key *keyring,
+			struct key *instkey)
 {
 	struct timespec now;
 	int ret, awaken;
@@ -495,6 +503,10 @@ int key_negate_and_link(struct key *key,
 		/* and link it into the destination keyring */
 		if (keyring)
 			ret = __key_link(keyring, key);
+
+		/* disable the authorisation key */
+		if (instkey)
+			key_revoke(instkey);
 	}
 
 	up_write(&key_construction_sem);
@@ -781,7 +793,7 @@ struct key *key_create_or_update(struct key *keyring,
 	}
 
 	/* instantiate it and link it into the target keyring */
-	ret = __key_instantiate_and_link(key, payload, plen, keyring);
+	ret = __key_instantiate_and_link(key, payload, plen, keyring, NULL);
 	if (ret < 0) {
 		key_put(key);
 		key = ERR_PTR(ret);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index cedb7326de29..fea262860ea0 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1,6 +1,6 @@
 /* keyctl.c: userspace keyctl operations
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -49,6 +49,13 @@ asmlinkage long sys_add_key(const char __user *_type,
 		goto error;
 	type[31] = '\0';
 
+	if (!type[0])
+		goto error;
+
+	ret = -EPERM;
+	if (type[0] == '.')
+		goto error;
+
 	ret = -EFAULT;
 	dlen = strnlen_user(_description, PAGE_SIZE - 1);
 	if (dlen <= 0)
@@ -82,7 +89,7 @@ asmlinkage long sys_add_key(const char __user *_type,
 	}
 
 	/* find the target keyring (which must be writable) */
-	keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+	keyring = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error3;
@@ -181,7 +188,7 @@ asmlinkage long sys_request_key(const char __user *_type,
 	/* get the destination keyring if specified */
 	dest = NULL;
 	if (destringid) {
-		dest = lookup_user_key(destringid, 1, 0, KEY_WRITE);
+		dest = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(dest)) {
 			ret = PTR_ERR(dest);
 			goto error3;
@@ -196,23 +203,15 @@ asmlinkage long sys_request_key(const char __user *_type,
 	}
 
 	/* do the search */
-	key = request_key(ktype, description, callout_info);
+	key = request_key_and_link(ktype, description, callout_info, dest);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error5;
 	}
 
-	/* link the resulting key to the destination keyring */
-	if (dest) {
-		ret = key_link(dest, key);
-		if (ret < 0)
-			goto error6;
-	}
-
 	ret = key->serial;
 
- error6:
-	key_put(key);
+ 	key_put(key);
  error5:
 	key_type_put(ktype);
  error4:
@@ -237,7 +236,7 @@ long keyctl_get_keyring_ID(key_serial_t id, int create)
 	struct key *key;
 	long ret;
 
-	key = lookup_user_key(id, create, 0, KEY_SEARCH);
+	key = lookup_user_key(NULL, id, create, 0, KEY_SEARCH);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
@@ -324,7 +323,7 @@ long keyctl_update_key(key_serial_t id,
 	}
 
 	/* find the target key (which must be writable) */
-	key = lookup_user_key(id, 0, 0, KEY_WRITE);
+	key = lookup_user_key(NULL, id, 0, 0, KEY_WRITE);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error2;
@@ -352,7 +351,7 @@ long keyctl_revoke_key(key_serial_t id)
 	struct key *key;
 	long ret;
 
-	key = lookup_user_key(id, 0, 0, KEY_WRITE);
+	key = lookup_user_key(NULL, id, 0, 0, KEY_WRITE);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
@@ -378,7 +377,7 @@ long keyctl_keyring_clear(key_serial_t ringid)
 	struct key *keyring;
 	long ret;
 
-	keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+	keyring = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error;
@@ -404,13 +403,13 @@ long keyctl_keyring_link(key_serial_t id, key_serial_t ringid)
 	struct key *keyring, *key;
 	long ret;
 
-	keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+	keyring = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error;
 	}
 
-	key = lookup_user_key(id, 1, 0, KEY_LINK);
+	key = lookup_user_key(NULL, id, 1, 0, KEY_LINK);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error2;
@@ -438,13 +437,13 @@ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid)
 	struct key *keyring, *key;
 	long ret;
 
-	keyring = lookup_user_key(ringid, 0, 0, KEY_WRITE);
+	keyring = lookup_user_key(NULL, ringid, 0, 0, KEY_WRITE);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error;
 	}
 
-	key = lookup_user_key(id, 0, 0, 0);
+	key = lookup_user_key(NULL, id, 0, 0, 0);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error2;
@@ -475,16 +474,29 @@ long keyctl_describe_key(key_serial_t keyid,
 			 char __user *buffer,
 			 size_t buflen)
 {
-	struct key *key;
+	struct key *key, *instkey;
 	char *tmpbuf;
 	long ret;
 
-	key = lookup_user_key(keyid, 0, 1, KEY_VIEW);
+	key = lookup_user_key(NULL, keyid, 0, 1, KEY_VIEW);
 	if (IS_ERR(key)) {
+		/* viewing a key under construction is permitted if we have the
+		 * authorisation token handy */
+		if (PTR_ERR(key) == -EACCES) {
+			instkey = key_get_instantiation_authkey(keyid);
+			if (!IS_ERR(instkey)) {
+				key_put(instkey);
+				key = lookup_user_key(NULL, keyid, 0, 1, 0);
+				if (!IS_ERR(key))
+					goto okay;
+			}
+		}
+
 		ret = PTR_ERR(key);
 		goto error;
 	}
 
+okay:
 	/* calculate how much description we're going to return */
 	ret = -ENOMEM;
 	tmpbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
@@ -568,7 +580,7 @@ long keyctl_keyring_search(key_serial_t ringid,
 		goto error2;
 
 	/* get the keyring at which to begin the search */
-	keyring = lookup_user_key(ringid, 0, 0, KEY_SEARCH);
+	keyring = lookup_user_key(NULL, ringid, 0, 0, KEY_SEARCH);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error2;
@@ -577,7 +589,7 @@ long keyctl_keyring_search(key_serial_t ringid,
 	/* get the destination keyring if specified */
 	dest = NULL;
 	if (destringid) {
-		dest = lookup_user_key(destringid, 1, 0, KEY_WRITE);
+		dest = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(dest)) {
 			ret = PTR_ERR(dest);
 			goto error3;
@@ -656,24 +668,23 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen)
 	long ret;
 
 	/* find the key first */
-	key = lookup_user_key(keyid, 0, 0, 0);
+	key = lookup_user_key(NULL, keyid, 0, 0, 0);
 	if (!IS_ERR(key)) {
 		/* see if we can read it directly */
 		if (key_permission(key, KEY_READ))
 			goto can_read_key;
 
-		/* can't; see if it's searchable from this process's
-		 * keyrings */
-		ret = -ENOKEY;
-		if (key_permission(key, KEY_SEARCH)) {
-			/* okay - we do have search permission on the key
-			 * itself, but do we have the key? */
-			skey = search_process_keyrings_aux(key->type, key,
-							   keyctl_read_key_same);
-			if (!IS_ERR(skey))
-				goto can_read_key2;
-		}
-
+		/* we can't; see if it's searchable from this process's
+		 * keyrings
+		 * - we automatically take account of the fact that it may be
+		 *   dangling off an instantiation key
+		 */
+		skey = search_process_keyrings(key->type, key,
+					       keyctl_read_key_same, current);
+		if (!IS_ERR(skey))
+			goto can_read_key2;
+
+		ret = PTR_ERR(skey);
 		goto error2;
 	}
 
@@ -719,7 +730,7 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid)
 	if (uid == (uid_t) -1 && gid == (gid_t) -1)
 		goto error;
 
-	key = lookup_user_key(id, 1, 1, 0);
+	key = lookup_user_key(NULL, id, 1, 1, 0);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
@@ -776,7 +787,7 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm)
 	if (perm & ~(KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL))
 		goto error;
 
-	key = lookup_user_key(id, 1, 1, 0);
+	key = lookup_user_key(NULL, id, 1, 1, 0);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
@@ -809,7 +820,8 @@ long keyctl_instantiate_key(key_serial_t id,
 			    size_t plen,
 			    key_serial_t ringid)
 {
-	struct key *key, *keyring;
+	struct request_key_auth *rka;
+	struct key *instkey, *keyring;
 	void *payload;
 	long ret;
 
@@ -831,18 +843,21 @@ long keyctl_instantiate_key(key_serial_t id,
 			goto error2;
 	}
 
-	/* find the target key (which must be writable) */
-	key = lookup_user_key(id, 0, 1, KEY_WRITE);
-	if (IS_ERR(key)) {
-		ret = PTR_ERR(key);
+	/* find the instantiation authorisation key */
+	instkey = key_get_instantiation_authkey(id);
+	if (IS_ERR(instkey)) {
+		ret = PTR_ERR(instkey);
 		goto error2;
 	}
 
-	/* find the destination keyring if present (which must also be
-	 * writable) */
+	rka = instkey->payload.data;
+
+	/* find the destination keyring amongst those belonging to the
+	 * requesting task */
 	keyring = NULL;
 	if (ringid) {
-		keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+		keyring = lookup_user_key(rka->context, ringid, 1, 0,
+					  KEY_WRITE);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error3;
@@ -850,11 +865,12 @@ long keyctl_instantiate_key(key_serial_t id,
 	}
 
 	/* instantiate the key and link it into a keyring */
-	ret = key_instantiate_and_link(key, payload, plen, keyring);
+	ret = key_instantiate_and_link(rka->target_key, payload, plen,
+				       keyring, instkey);
 
 	key_put(keyring);
  error3:
-	key_put(key);
+	key_put(instkey);
  error2:
 	kfree(payload);
  error:
@@ -869,21 +885,24 @@ long keyctl_instantiate_key(key_serial_t id,
  */
 long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 {
-	struct key *key, *keyring;
+	struct request_key_auth *rka;
+	struct key *instkey, *keyring;
 	long ret;
 
-	/* find the target key (which must be writable) */
-	key = lookup_user_key(id, 0, 1, KEY_WRITE);
-	if (IS_ERR(key)) {
-		ret = PTR_ERR(key);
+	/* find the instantiation authorisation key */
+	instkey = key_get_instantiation_authkey(id);
+	if (IS_ERR(instkey)) {
+		ret = PTR_ERR(instkey);
 		goto error;
 	}
 
+	rka = instkey->payload.data;
+
 	/* find the destination keyring if present (which must also be
 	 * writable) */
 	keyring = NULL;
 	if (ringid) {
-		keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+		keyring = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
@@ -891,16 +910,54 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 	}
 
 	/* instantiate the key and link it into a keyring */
-	ret = key_negate_and_link(key, timeout, keyring);
+	ret = key_negate_and_link(rka->target_key, timeout, keyring, instkey);
 
 	key_put(keyring);
  error2:
-	key_put(key);
+	key_put(instkey);
  error:
 	return ret;
 
 } /* end keyctl_negate_key() */
 
+/*****************************************************************************/
+/*
+ * set the default keyring (KEY_REQKEY_DEFL_*) in which request_key() will
+ * cache keys; return the setting that was in force before the call or -ve
+ */
+long keyctl_set_reqkey_keyring(int reqkey_defl)
+{
+	int ret, old_setting = current->jit_keyring; /* sample before changing */
+
+	switch (reqkey_defl) {
+	case KEY_REQKEY_DEFL_THREAD_KEYRING:
+		ret = install_thread_keyring(current);
+		if (ret < 0)
+			return ret;
+		goto set;
+
+	case KEY_REQKEY_DEFL_PROCESS_KEYRING:
+		ret = install_process_keyring(current);
+		if (ret < 0)
+			return ret;
+		/* fall through - the keyring is installed, record the choice */
+	case KEY_REQKEY_DEFL_DEFAULT:
+	case KEY_REQKEY_DEFL_SESSION_KEYRING:
+	case KEY_REQKEY_DEFL_USER_KEYRING:
+	case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
+	set:
+		current->jit_keyring = reqkey_defl;
+		/* fall through - report what the setting used to be */
+	case KEY_REQKEY_DEFL_NO_CHANGE:
+		return old_setting;
+
+	case KEY_REQKEY_DEFL_GROUP_KEYRING:	/* not accepted here */
+	default:
+		return -EINVAL;
+	}
+
+} /* end keyctl_set_reqkey_keyring() */
+
 /*****************************************************************************/
 /*
  * the key control system call
@@ -971,6 +1028,9 @@ asmlinkage long sys_keyctl(int option, unsigned long arg2, unsigned long arg3,
 					 (unsigned) arg3,
 					 (key_serial_t) arg4);
 
+	case KEYCTL_SET_REQKEY_KEYRING:
+		return keyctl_set_reqkey_keyring(arg2);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index c9a5de197487..90a551e4da66 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -1,6 +1,6 @@
 /* keyring.c: keyring handling
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -308,7 +308,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
 			    uid, gid, KEY_USR_ALL, not_in_quota);
 
 	if (!IS_ERR(keyring)) {
-		ret = key_instantiate_and_link(keyring, NULL, 0, dest);
+		ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
 		if (ret < 0) {
 			key_put(keyring);
 			keyring = ERR_PTR(ret);
@@ -326,11 +326,12 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
  * - we only find keys on which we have search permission
  * - we use the supplied match function to see if the description (or other
  *   feature of interest) matches
- * - we readlock the keyrings as we search down the tree
+ * - we rely on RCU to prevent the keyring lists from disappearing on us
  * - we return -EAGAIN if we didn't find any matching key
  * - we return -ENOKEY if we only found negative matching keys
  */
 struct key *keyring_search_aux(struct key *keyring,
+			       struct task_struct *context,
 			       struct key_type *type,
 			       const void *description,
 			       key_match_func_t match)
@@ -352,7 +353,7 @@ struct key *keyring_search_aux(struct key *keyring,
 
 	/* top keyring must have search permission to begin the search */
 	key = ERR_PTR(-EACCES);
-	if (!key_permission(keyring, KEY_SEARCH))
+	if (!key_task_permission(keyring, context, KEY_SEARCH))
 		goto error;
 
 	key = ERR_PTR(-ENOTDIR);
@@ -392,7 +393,7 @@ struct key *keyring_search_aux(struct key *keyring,
 			continue;
 
 		/* key must have search permissions */
-		if (!key_permission(key, KEY_SEARCH))
+		if (!key_task_permission(key, context, KEY_SEARCH))
 			continue;
 
 		/* we set a different error code if we find a negative key */
@@ -418,7 +419,7 @@ struct key *keyring_search_aux(struct key *keyring,
 		if (sp >= KEYRING_SEARCH_MAX_DEPTH)
 			continue;
 
-		if (!key_permission(key, KEY_SEARCH))
+		if (!key_task_permission(key, context, KEY_SEARCH))
 			continue;
 
 		/* stack the current position */
@@ -468,7 +469,11 @@ struct key *keyring_search(struct key *keyring,
 			   struct key_type *type,
 			   const char *description)
 {
-	return keyring_search_aux(keyring, type, description, type->match);
+	if (!type->match)
+		return ERR_PTR(-ENOKEY);
+
+	return keyring_search_aux(keyring, current,
+				  type, description, type->match);
 
 } /* end keyring_search() */
 
@@ -496,7 +501,8 @@ struct key *__keyring_search_one(struct key *keyring,
 			key = klist->keys[loop];
 
 			if (key->type == ktype &&
-			    key->type->match(key, description) &&
+			    (!key->type->match ||
+			     key->type->match(key, description)) &&
 			    key_permission(key, perm) &&
 			    !test_bit(KEY_FLAG_REVOKED, &key->flags)
 			    )
@@ -515,6 +521,51 @@ struct key *__keyring_search_one(struct key *keyring,
 
 } /* end __keyring_search_one() */
 
+/*****************************************************************************/
+/*
+ * search a keyring's own key list for the auth key matching a target key
+ * - the RCU read lock must be held by the caller
+ * - a target_id of zero specifies any valid token
+ */
+struct key *keyring_search_instkey(struct key *keyring,
+				   key_serial_t target_id)
+{
+	struct request_key_auth *rka;
+	struct keyring_list *klist;
+	struct key *instkey;
+	int loop;
+
+	klist = rcu_dereference(keyring->payload.subscriptions);
+	if (klist) {
+		for (loop = 0; loop < klist->nkeys; loop++) {
+			instkey = klist->keys[loop];
+			/* only request_key authorisation keys are of interest */
+			if (instkey->type != &key_type_request_key_auth)
+				continue;
+
+			rka = instkey->payload.data;
+			if (target_id && rka->target_key->serial != target_id)
+				continue;
+
+			/* the auth key is revoked during instantiation */
+			if (!test_bit(KEY_FLAG_REVOKED, &instkey->flags))
+				goto found;
+			/* first match is revoked: give up without looking further */
+			instkey = ERR_PTR(-EKEYREVOKED);
+			goto error;
+		}
+	}
+	/* nothing in the list matched */
+	instkey = ERR_PTR(-EACCES);
+	goto error;
+
+found:
+	atomic_inc(&instkey->usage);	/* reference handed to the caller */
+error:
+	return instkey;
+
+} /* end keyring_search_instkey() */
+
 /*****************************************************************************/
 /*
  * find a keyring with the specified name
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 972e30172687..34db087bbcc7 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -165,7 +165,7 @@ int install_thread_keyring(struct task_struct *tsk)
 /*
  * make sure a process keyring is installed
  */
-static int install_process_keyring(struct task_struct *tsk)
+int install_process_keyring(struct task_struct *tsk)
 {
 	unsigned long flags;
 	struct key *keyring;
@@ -376,12 +376,13 @@ void key_fsgid_changed(struct task_struct *tsk)
  * - we return -EAGAIN if we didn't find any matching key
  * - we return -ENOKEY if we found only negative matching keys
  */
-struct key *search_process_keyrings_aux(struct key_type *type,
-					const void *description,
-					key_match_func_t match)
+struct key *search_process_keyrings(struct key_type *type,
+				    const void *description,
+				    key_match_func_t match,
+				    struct task_struct *context)
 {
-	struct task_struct *tsk = current;
-	struct key *key, *ret, *err;
+	struct request_key_auth *rka;
+	struct key *key, *ret, *err, *instkey;
 
 	/* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
 	 * searchable, but we failed to find a key or we found a negative key;
@@ -395,9 +396,9 @@ struct key *search_process_keyrings_aux(struct key_type *type,
 	err = ERR_PTR(-EAGAIN);
 
 	/* search the thread keyring first */
-	if (tsk->thread_keyring) {
-		key = keyring_search_aux(tsk->thread_keyring, type,
-					 description, match);
+	if (context->thread_keyring) {
+		key = keyring_search_aux(context->thread_keyring,
+					 context, type, description, match);
 		if (!IS_ERR(key))
 			goto found;
 
@@ -415,9 +416,9 @@ struct key *search_process_keyrings_aux(struct key_type *type,
 	}
 
 	/* search the process keyring second */
-	if (tsk->signal->process_keyring) {
-		key = keyring_search_aux(tsk->signal->process_keyring,
-					 type, description, match);
+	if (context->signal->process_keyring) {
+		key = keyring_search_aux(context->signal->process_keyring,
+					 context, type, description, match);
 		if (!IS_ERR(key))
 			goto found;
 
@@ -434,53 +435,93 @@ struct key *search_process_keyrings_aux(struct key_type *type,
 		}
 	}
 
-	/* search the session keyring last */
-	if (tsk->signal->session_keyring) {
+	/* search the session keyring */
+	if (context->signal->session_keyring) {
 		rcu_read_lock();
 		key = keyring_search_aux(
-			rcu_dereference(tsk->signal->session_keyring),
-			type, description, match);
+			rcu_dereference(context->signal->session_keyring),
+			context, type, description, match);
 		rcu_read_unlock();
+
+		if (!IS_ERR(key))
+			goto found;
+
+		switch (PTR_ERR(key)) {
+		case -EAGAIN: /* no key */
+			if (ret)
+				break;
+		case -ENOKEY: /* negative key */
+			ret = key;
+			break;
+		default:
+			err = key;
+			break;
+		}
+
+		/* if this process has a session keyring and that has an
+		 * instantiation authorisation key in the bottom level, then we
+		 * also search the keyrings of the process mentioned there */
+		if (context != current)
+			goto no_key;
+
+		rcu_read_lock();
+		instkey = __keyring_search_one(
+			rcu_dereference(context->signal->session_keyring),
+			&key_type_request_key_auth, NULL, 0);
+		rcu_read_unlock();
+
+		if (IS_ERR(instkey))
+			goto no_key;
+
+		rka = instkey->payload.data;
+
+		key = search_process_keyrings(type, description, match,
+					      rka->context);
+		key_put(instkey);
+
+		if (!IS_ERR(key))
+			goto found;
+
+		switch (PTR_ERR(key)) {
+		case -EAGAIN: /* no key */
+			if (ret)
+				break;
+		case -ENOKEY: /* negative key */
+			ret = key;
+			break;
+		default:
+			err = key;
+			break;
+		}
 	}
+	/* or search the user-session keyring */
 	else {
-		key = keyring_search_aux(tsk->user->session_keyring,
-					 type, description, match);
-	}
-
-	if (!IS_ERR(key))
-		goto found;
+		key = keyring_search_aux(context->user->session_keyring,
+					 context, type, description, match);
+		if (!IS_ERR(key))
+			goto found;
 
-	switch (PTR_ERR(key)) {
-	case -EAGAIN: /* no key */
-		if (ret)
+		switch (PTR_ERR(key)) {
+		case -EAGAIN: /* no key */
+			if (ret)
+				break;
+		case -ENOKEY: /* negative key */
+			ret = key;
 			break;
-	case -ENOKEY: /* negative key */
-		ret = key;
-		break;
-	default:
-		err = key;
-		break;
+		default:
+			err = key;
+			break;
+		}
 	}
 
+
+no_key:
 	/* no key - decide on the error we're going to go for */
 	key = ret ? ret : err;
 
- found:
+found:
 	return key;
 
-} /* end search_process_keyrings_aux() */
-
-/*****************************************************************************/
-/*
- * search the process keyrings for the first matching key
- * - we return -EAGAIN if we didn't find any matching key
- * - we return -ENOKEY if we found only negative matching keys
- */
-struct key *search_process_keyrings(struct key_type *type,
-				    const char *description)
-{
-	return search_process_keyrings_aux(type, description, type->match);
-
 } /* end search_process_keyrings() */
 
 /*****************************************************************************/
@@ -489,72 +530,73 @@ struct key *search_process_keyrings(struct key_type *type,
  * - don't create special keyrings unless so requested
  * - partially constructed keys aren't found unless requested
  */
-struct key *lookup_user_key(key_serial_t id, int create, int partial,
-			    key_perm_t perm)
+struct key *lookup_user_key(struct task_struct *context, key_serial_t id,
+			    int create, int partial, key_perm_t perm)
 {
-	struct task_struct *tsk = current;
-	unsigned long flags;
 	struct key *key;
 	int ret;
 
+	if (!context)
+		context = current;
+
 	key = ERR_PTR(-ENOKEY);
 
 	switch (id) {
 	case KEY_SPEC_THREAD_KEYRING:
-		if (!tsk->thread_keyring) {
+		if (!context->thread_keyring) {
 			if (!create)
 				goto error;
 
-			ret = install_thread_keyring(tsk);
+			ret = install_thread_keyring(context);
 			if (ret < 0) {
 				key = ERR_PTR(ret);
 				goto error;
 			}
 		}
 
-		key = tsk->thread_keyring;
+		key = context->thread_keyring;
 		atomic_inc(&key->usage);
 		break;
 
 	case KEY_SPEC_PROCESS_KEYRING:
-		if (!tsk->signal->process_keyring) {
+		if (!context->signal->process_keyring) {
 			if (!create)
 				goto error;
 
-			ret = install_process_keyring(tsk);
+			ret = install_process_keyring(context);
 			if (ret < 0) {
 				key = ERR_PTR(ret);
 				goto error;
 			}
 		}
 
-		key = tsk->signal->process_keyring;
+		key = context->signal->process_keyring;
 		atomic_inc(&key->usage);
 		break;
 
 	case KEY_SPEC_SESSION_KEYRING:
-		if (!tsk->signal->session_keyring) {
+		if (!context->signal->session_keyring) {
 			/* always install a session keyring upon access if one
 			 * doesn't exist yet */
 			ret = install_session_keyring(
-			       tsk, tsk->user->session_keyring);
+			       context, context->user->session_keyring);
 			if (ret < 0)
 				goto error;
 		}
 
-		spin_lock_irqsave(&tsk->sighand->siglock, flags);
-		key = tsk->signal->session_keyring;
+		rcu_read_lock();
+		key = rcu_dereference(context->signal->session_keyring);
 		atomic_inc(&key->usage);
-		spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+		rcu_read_unlock();
 		break;
 
 	case KEY_SPEC_USER_KEYRING:
-		key = tsk->user->uid_keyring;
+		key = context->user->uid_keyring;
 		atomic_inc(&key->usage);
 		break;
 
 	case KEY_SPEC_USER_SESSION_KEYRING:
-		key = tsk->user->session_keyring;
+		key = context->user->session_keyring;
 		atomic_inc(&key->usage);
 		break;
 
@@ -574,7 +616,7 @@ struct key *lookup_user_key(key_serial_t id, int create, int partial,
 		break;
 	}
 
-	/* check the status and permissions */
+	/* check the status */
 	if (perm) {
 		ret = key_validate(key);
 		if (ret < 0)
@@ -585,8 +627,10 @@ struct key *lookup_user_key(key_serial_t id, int create, int partial,
 	if (!partial && !test_bit(KEY_FLAG_INSTANTIATED, &key->flags))
 		goto invalid_key;
 
+	/* check the permissions */
 	ret = -EACCES;
-	if (!key_permission(key, perm))
+
+	if (!key_task_permission(key, context, perm))
 		goto invalid_key;
 
  error:
@@ -609,7 +653,6 @@ struct key *lookup_user_key(key_serial_t id, int create, int partial,
 long join_session_keyring(const char *name)
 {
 	struct task_struct *tsk = current;
-	unsigned long flags;
 	struct key *keyring;
 	long ret;
 
@@ -619,9 +662,9 @@ long join_session_keyring(const char *name)
 		if (ret < 0)
 			goto error;
 
-		spin_lock_irqsave(&tsk->sighand->siglock, flags);
-		ret = tsk->signal->session_keyring->serial;
-		spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+		rcu_read_lock();
+		ret = rcu_dereference(tsk->signal->session_keyring)->serial;
+		rcu_read_unlock();
 		goto error;
 	}
 
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 54aa7b70e63b..dfcd983af1fd 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -1,6 +1,6 @@
 /* request_key.c: request a key from userspace
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/kmod.h>
 #include <linux/err.h>
+#include <linux/keyctl.h>
 #include "internal.h"
 
 struct key_construction {
@@ -27,18 +28,26 @@ DECLARE_WAIT_QUEUE_HEAD(request_key_conswq);
 /*
  * request userspace finish the construction of a key
  * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring> <info>"
- * - if callout_info is an empty string, it'll be rendered as a "-" instead
  */
 static int call_request_key(struct key *key,
 			    const char *op,
 			    const char *callout_info)
 {
 	struct task_struct *tsk = current;
-	unsigned long flags;
 	key_serial_t prkey, sskey;
+	struct key *session_keyring, *rkakey;
 	char *argv[10], *envp[3], uid_str[12], gid_str[12];
 	char key_str[12], keyring_str[3][12];
-	int i;
+	int ret, i;
+
+	kenter("{%d},%s,%s", key->serial, op, callout_info);
+
+	/* generate a new session keyring with an auth key in it */
+	session_keyring = request_key_auth_new(key, &rkakey);
+	if (IS_ERR(session_keyring)) {
+		ret = PTR_ERR(session_keyring);
+		goto error;
+	}
 
 	/* record the UID and GID */
 	sprintf(uid_str, "%d", current->fsuid);
@@ -55,17 +64,17 @@ static int call_request_key(struct key *key,
 	if (tsk->signal->process_keyring)
 		prkey = tsk->signal->process_keyring->serial;
 
-	sskey = 0;
-	spin_lock_irqsave(&tsk->sighand->siglock, flags);
-	if (tsk->signal->session_keyring)
-		sskey = tsk->signal->session_keyring->serial;
-	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
-
+	sprintf(keyring_str[1], "%d", prkey);
 
-	if (!sskey)
+	if (tsk->signal->session_keyring) {
+		rcu_read_lock();
+		sskey = rcu_dereference(tsk->signal->session_keyring)->serial;
+		rcu_read_unlock();
+	}
+	else {
 		sskey = tsk->user->session_keyring->serial;
+	}
 
-	sprintf(keyring_str[1], "%d", prkey);
 	sprintf(keyring_str[2], "%d", sskey);
 
 	/* set up a minimal environment */
@@ -84,11 +93,20 @@ static int call_request_key(struct key *key,
 	argv[i++] = keyring_str[0];
 	argv[i++] = keyring_str[1];
 	argv[i++] = keyring_str[2];
-	argv[i++] = callout_info[0] ? (char *) callout_info : "-";
+	argv[i++] = (char *) callout_info;
 	argv[i] = NULL;
 
 	/* do it */
-	return call_usermodehelper_keys(argv[0], argv, envp, NULL, 1);
+	ret = call_usermodehelper_keys(argv[0], argv, envp, session_keyring, 1);
+
+	/* dispose of the special keys */
+	key_revoke(rkakey);
+	key_put(rkakey);
+	key_put(session_keyring);
+
+ error:
+	kleave(" = %d", ret);
+	return ret;
 
 } /* end call_request_key() */
 
@@ -107,6 +125,8 @@ static struct key *__request_key_construction(struct key_type *type,
 	struct key *key;
 	int ret, negated;
 
+	kenter("%s,%s,%s", type->name, description, callout_info);
+
 	/* create a key and add it to the queue */
 	key = key_alloc(type, description,
 			current->fsuid, current->fsgid, KEY_USR_ALL, 0);
@@ -143,6 +163,7 @@ static struct key *__request_key_construction(struct key_type *type,
 	}
 
  out:
+	kleave(" = %p", key);
 	return key;
 
  request_failed:
@@ -216,6 +237,9 @@ static struct key *request_key_construction(struct key_type *type,
 
 	DECLARE_WAITQUEUE(myself, current);
 
+	kenter("%s,%s,{%d},%s",
+	       type->name, description, user->uid, callout_info);
+
 	/* see if there's such a key under construction already */
 	down_write(&key_construction_sem);
 
@@ -232,6 +256,7 @@ static struct key *request_key_construction(struct key_type *type,
 	/* see about getting userspace to construct the key */
 	key = __request_key_construction(type, description, callout_info);
  error:
+	kleave(" = %p", key);
 	return key;
 
 	/* someone else has the same key under construction
@@ -245,9 +270,11 @@ static struct key *request_key_construction(struct key_type *type,
 	add_wait_queue(&request_key_conswq, &myself);
 
 	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 		if (!test_bit(KEY_FLAG_USER_CONSTRUCT, &ckey->flags))
 			break;
+		if (signal_pending(current))
+			break;
 		schedule();
 	}
 
@@ -265,23 +292,85 @@ static struct key *request_key_construction(struct key_type *type,
 
 } /* end request_key_construction() */
 
+/*****************************************************************************/
+/*
+ * link a freshly minted key to an appropriate destination keyring
+ */
+static void request_key_link(struct key *key, struct key *dest_keyring)
+{
+	struct task_struct *tsk = current;
+	struct key *drop = NULL;
+
+	kenter("{%d},%p", key->serial, dest_keyring);
+
+	/* find the appropriate keyring */
+	if (!dest_keyring) {
+		switch (tsk->jit_keyring) {
+		case KEY_REQKEY_DEFL_DEFAULT:
+		case KEY_REQKEY_DEFL_THREAD_KEYRING:
+			dest_keyring = tsk->thread_keyring;
+			if (dest_keyring)
+				break;
+
+		case KEY_REQKEY_DEFL_PROCESS_KEYRING:
+			dest_keyring = tsk->signal->process_keyring;
+			if (dest_keyring)
+				break;
+
+		case KEY_REQKEY_DEFL_SESSION_KEYRING:
+			rcu_read_lock();
+			dest_keyring = key_get(
+				rcu_dereference(tsk->signal->session_keyring));
+			rcu_read_unlock();
+			drop = dest_keyring;
+
+			if (dest_keyring)
+				break;
+
+		case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
+			dest_keyring = current->user->session_keyring;
+			break;
+
+		case KEY_REQKEY_DEFL_USER_KEYRING:
+			dest_keyring = current->user->uid_keyring;
+			break;
+
+		case KEY_REQKEY_DEFL_GROUP_KEYRING:
+		default:
+			BUG();
+		}
+	}
+
+	/* and attach the key to it */
+	key_link(dest_keyring, key);
+
+	key_put(drop);
+
+	kleave("");
+
+} /* end request_key_link() */
+
 /*****************************************************************************/
 /*
  * request a key
  * - search the process's keyrings
  * - check the list of keys being created or updated
- * - call out to userspace for a key if requested (supplementary info can be
- *   passed)
+ * - call out to userspace for a key if supplementary info was provided
+ * - cache the key in an appropriate keyring
  */
-struct key *request_key(struct key_type *type,
-			const char *description,
-			const char *callout_info)
+struct key *request_key_and_link(struct key_type *type,
+				 const char *description,
+				 const char *callout_info,
+				 struct key *dest_keyring)
 {
 	struct key_user *user;
 	struct key *key;
 
+	kenter("%s,%s,%s,%p",
+	       type->name, description, callout_info, dest_keyring);
+
 	/* search all the process keyrings for a key */
-	key = search_process_keyrings_aux(type, description, type->match);
+	key = search_process_keyrings(type, description, type->match, current);
 
 	if (PTR_ERR(key) == -EAGAIN) {
 		/* the search failed, but the keyrings were searchable, so we
@@ -292,12 +381,13 @@ struct key *request_key(struct key_type *type,
 
 		/* - get hold of the user's construction queue */
 		user = key_user_lookup(current->fsuid);
-		if (!user) {
-			key = ERR_PTR(-ENOMEM);
-			goto error;
-		}
+		if (!user)
+			goto nomem;
+
+		do {
+			if (signal_pending(current))
+				goto interrupted;
 
-		for (;;) {
 			/* ask userspace (returns NULL if it waited on a key
 			 * being constructed) */
 			key = request_key_construction(type, description,
@@ -307,18 +397,46 @@ struct key *request_key(struct key_type *type,
 
 			/* someone else made the key we want, so we need to
 			 * search again as it might now be available to us */
-			key = search_process_keyrings_aux(type, description,
-							  type->match);
-			if (PTR_ERR(key) != -EAGAIN)
-				break;
-		}
+			key = search_process_keyrings(type, description,
+						      type->match, current);
+
+		} while (PTR_ERR(key) == -EAGAIN);
 
 		key_user_put(user);
+
+		/* link the key into the appropriate keyring if we got a real one */
+		if (!IS_ERR(key))
+			request_key_link(key, dest_keyring);
 	}
 
- error:
+error:
+	kleave(" = %p", key);
 	return key;
 
+nomem:
+	key = ERR_PTR(-ENOMEM);
+	goto error;
+
+interrupted:
+	key_user_put(user);
+	key = ERR_PTR(-EINTR);
+	goto error;
+
+} /* end request_key_and_link() */
+
+/*****************************************************************************/
+/*
+ * request a key
+ * - search the process's keyrings
+ * - check the list of keys being created or updated
+ * - call out to userspace for a key if supplementary info was provided
+ */
+struct key *request_key(struct key_type *type,
+			const char *description,
+			const char *callout_info)
+{
+	return request_key_and_link(type, description, callout_info, NULL);
+
 } /* end request_key() */
 
 EXPORT_SYMBOL(request_key);
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
new file mode 100644
index 000000000000..f22264632229
--- /dev/null
+++ b/security/keys/request_key_auth.c
@@ -0,0 +1,180 @@
+/* request_key_auth.c: request key authorisation controlling key def
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+static int request_key_auth_instantiate(struct key *, const void *, size_t);
+static void request_key_auth_describe(const struct key *, struct seq_file *);
+static void request_key_auth_destroy(struct key *);
+
+/*
+ * the request-key authorisation key type definition
+ */
+struct key_type key_type_request_key_auth = {
+	.name		= ".request_key_auth",
+	.def_datalen	= sizeof(struct request_key_auth),
+	.instantiate	= request_key_auth_instantiate,
+	.describe	= request_key_auth_describe,
+	.destroy	= request_key_auth_destroy,
+};
+
+/*****************************************************************************/
+/*
+ * instantiate a request-key authorisation record
+ */
+static int request_key_auth_instantiate(struct key *key,
+					const void *data,
+					size_t datalen)
+{
+	struct request_key_auth *rka, *irka;
+	struct key *instkey;
+	int ret;
+
+	ret = -ENOMEM;
+	rka = kmalloc(sizeof(*rka), GFP_KERNEL);
+	if (rka) {
+		/* see if the calling process is already servicing the key
+		 * request of another process */
+		instkey = key_get_instantiation_authkey(0);
+		if (!IS_ERR(instkey)) {
+			/* it is - use that instantiation context here too */
+			irka = instkey->payload.data;
+			rka->context = irka->context;
+			rka->pid = irka->pid;
+			key_put(instkey);
+		}
+		else {
+			/* it isn't - use this process as the context */
+			rka->context = current;
+			rka->pid = current->pid;
+		}
+
+		rka->target_key = key_get((struct key *) data);
+		key->payload.data = rka;
+		ret = 0;
+	}
+
+	return ret;
+
+} /* end request_key_auth_instantiate() */
+
+/*****************************************************************************/
+/*
+ * describe an authorisation key: emits "key:<description> pid:<pid>"
+ */
+static void request_key_auth_describe(const struct key *key,
+				      struct seq_file *m)
+{
+	struct request_key_auth *rka = key->payload.data;
+
+	seq_puts(m, "key:");
+	seq_puts(m, key->description);
+	seq_printf(m, " pid:%d", rka->pid);
+
+} /* end request_key_auth_describe() */
+
+/*****************************************************************************/
+/*
+ * destroy an authorisation token key: drop the target key and free the record
+ */
+static void request_key_auth_destroy(struct key *key)
+{
+	struct request_key_auth *rka = key->payload.data;
+
+	kenter("{%d}", key->serial);
+
+	key_put(rka->target_key);
+	kfree(rka);	/* allocated by request_key_auth_instantiate() */
+} /* end request_key_auth_destroy() */
+
+/*****************************************************************************/
+/*
+ * create a session keyring to be used for the invocation of /sbin/request-key
+ * and stick an authorisation token in it
+ */
+struct key *request_key_auth_new(struct key *target, struct key **_rkakey)
+{
+	struct key *keyring, *rkakey = NULL;
+	char desc[20];
+	int ret;
+
+	kenter("%d,", target->serial);
+
+	/* allocate a new session keyring */
+	sprintf(desc, "_req.%u", target->serial);
+
+	keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL);
+	if (IS_ERR(keyring)) {
+		kleave("= %ld", PTR_ERR(keyring));
+		return keyring;
+	}
+
+	/* allocate the auth key */
+	sprintf(desc, "%x", target->serial);
+
+	rkakey = key_alloc(&key_type_request_key_auth, desc,
+			   current->fsuid, current->fsgid,
+			   KEY_USR_VIEW, 1);
+	if (IS_ERR(rkakey)) {
+		key_put(keyring);
+		kleave("= %ld", PTR_ERR(rkakey));
+		return rkakey;
+	}
+
+	/* construct and attach to the keyring */
+	ret = key_instantiate_and_link(rkakey, target, 0, keyring, NULL);
+	if (ret < 0) {
+		key_revoke(rkakey);
+		key_put(rkakey);
+		key_put(keyring);
+		kleave("= %d", ret);
+		return ERR_PTR(ret);
+	}
+
+	*_rkakey = rkakey;
+	kleave(" = {%d} ({%d})", keyring->serial, rkakey->serial);
+	return keyring;
+
+} /* end request_key_auth_new() */
+
+/*****************************************************************************/
+/*
+ * get the authorisation key for instantiation of a specific key if attached to
+ * the current process's keyrings
+ * - this key is inserted into a keyring and that is set as /sbin/request-key's
+ *   session keyring
+ * - a target_id of zero specifies any valid token
+ */
+struct key *key_get_instantiation_authkey(key_serial_t target_id)
+{
+	struct task_struct *tsk = current;
+	struct key *instkey;
+
+	/* we must have our own personal session keyring */
+	if (!tsk->signal->session_keyring)
+		return ERR_PTR(-EACCES);
+
+	/* and it must contain a suitable request authorisation key
+	 * - lock RCU against session keyring changing
+	 */
+	rcu_read_lock();
+
+	instkey = keyring_search_instkey(
+		rcu_dereference(tsk->signal->session_keyring), target_id);
+
+	rcu_read_unlock();
+	return instkey;
+
+} /* end key_get_instantiation_authkey() */
-- 
cgit v1.3-14-g43fede


From c988d2b2845495373f666a381d354a7f80981d62 Mon Sep 17 00:00:00 2001
From: Matt Domsch <Matt_Domsch@dell.com>
Date: Thu, 23 Jun 2005 22:05:15 -0700
Subject: [PATCH] modules: add version and srcversion to sysfs

This patch adds version and srcversion files to
/sys/module/${modulename} containing the version and srcversion fields
of the module's modinfo section (if present).

/sys/module/e1000
|-- srcversion
`-- version

This patch differs slightly from the version posted in January, as it
now uses the new kstrdup() call in -mm.

Why put this in sysfs?

a) Tools like DKMS, which deal with changing out individual kernel
   modules without replacing the whole kernel, can behave smarter if they
   can tell the version of a given module.  The autoinstaller feature, for
   example, determines if your system has a "good" version of a driver
   (i.e.  if the one provided by DKMS has a newer version than that
   provided by the kernel package installed), and automatically compiles
   and installs a newer version if DKMS has it but your kernel doesn't yet
   have that version.

b) Because sysadmins manually, or with tools like DKMS, can switch out
   modules on the file system, you can't count on 'modinfo foo.ko', which
   looks at /lib/modules/${kernelver}/...  actually matching what is loaded
   into the kernel already.  Hence asking sysfs for this.

c) as the unbind-driver-from-device work takes shape, it will be
   possible to rebind a driver that's built-in (no .ko to modinfo for the
   version) to a newly loaded module.  sysfs will have the
   currently-built-in version info, for comparison.

d) tech support scripts can then easily grab the version info for what's
   running presently - a question I get often.

There has been renewed interest in this patch on linux-scsi by driver
authors.

As the idea originated from GregKH, I leave his Signed-off-by: intact,
though the implementation is nearly completely new.  Compiled and run on
x86 and x86_64.

From: Matthew Dobson <colpatch@us.ibm.com>

      build fix

From: Thierry Vignaud <tvignaud@mandriva.com>

      build fix

From: Matthew Dobson <colpatch@us.ibm.com>

      warning fix

Signed-off-by: Greg Kroah-Hartman <greg@kroah.com>
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/module.h |  5 +++
 kernel/module.c        | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 0e432a0f4aee..f05372b7fe77 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -51,6 +51,9 @@ struct module_attribute {
         ssize_t (*show)(struct module_attribute *, struct module *, char *);
         ssize_t (*store)(struct module_attribute *, struct module *,
 			 const char *, size_t count);
+	void (*setup)(struct module *, const char *);
+	int (*test)(struct module *);
+	void (*free)(struct module *);
 };
 
 struct module_kobject
@@ -239,6 +242,8 @@ struct module
 	/* Sysfs stuff. */
 	struct module_kobject mkobj;
 	struct module_param_attrs *param_attrs;
+	const char *version;
+	const char *srcversion;
 
 	/* Exported symbols */
 	const struct kernel_symbol *syms;
diff --git a/kernel/module.c b/kernel/module.c
index a566745dde62..0494c89a0d26 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -35,6 +35,7 @@
 #include <linux/notifier.h>
 #include <linux/stop_machine.h>
 #include <linux/device.h>
+#include <linux/string.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
@@ -370,6 +371,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_MODULE_UNLOAD
+#define MODINFO_ATTR(field)	\
+static void setup_modinfo_##field(struct module *mod, const char *s)  \
+{                                                                     \
+	mod->field = kstrdup(s, GFP_KERNEL);                          \
+}                                                                     \
+static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
+	                struct module *mod, char *buffer)             \
+{                                                                     \
+	return sprintf(buffer, "%s\n", mod->field);                   \
+}                                                                     \
+static int modinfo_##field##_exists(struct module *mod)               \
+{                                                                     \
+	return mod->field != NULL;                                    \
+}                                                                     \
+static void free_modinfo_##field(struct module *mod)                  \
+{                                                                     \
+        kfree(mod->field);                                            \
+        mod->field = NULL;                                            \
+}                                                                     \
+static struct module_attribute modinfo_##field = {                    \
+	.attr = { .name = __stringify(field), .mode = 0444,           \
+		  .owner = THIS_MODULE },                             \
+	.show = show_modinfo_##field,                                 \
+	.setup = setup_modinfo_##field,                               \
+	.test = modinfo_##field##_exists,                             \
+	.free = free_modinfo_##field,                                 \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static struct module_attribute *modinfo_attrs[] = {
+	&modinfo_version,
+	&modinfo_srcversion,
+	NULL,
+};
+
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
@@ -1031,6 +1069,32 @@ static void module_remove_refcnt_attr(struct module *mod)
 }
 #endif
 
+#ifdef CONFIG_MODULE_UNLOAD
+static int module_add_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	int error = 0;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
+		if (!attr->test ||
+		    (attr->test && attr->test(mod)))
+			error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
+	}
+	return error;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+		attr->free(mod);
+	}
+}
+#endif
 
 static int mod_sysfs_setup(struct module *mod,
 			   struct kernel_param *kparam,
@@ -1056,6 +1120,12 @@ static int mod_sysfs_setup(struct module *mod,
 	if (err)
 		goto out_unreg;
 
+#ifdef CONFIG_MODULE_UNLOAD
+	err = module_add_modinfo_attrs(mod);
+	if (err)
+		goto out_unreg;
+#endif
+
 	return 0;
 
 out_unreg:
@@ -1066,6 +1136,9 @@ out:
 
 static void mod_kobject_remove(struct module *mod)
 {
+#ifdef CONFIG_MODULE_UNLOAD
+	module_remove_modinfo_attrs(mod);
+#endif
 	module_remove_refcnt_attr(mod);
 	module_param_sysfs_remove(mod);
 
@@ -1311,6 +1384,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
 	return NULL;
 }
 
+#ifdef CONFIG_MODULE_UNLOAD
+static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
+			  unsigned int infoindex)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (attr->setup)
+			attr->setup(mod,
+				    get_modinfo(sechdrs,
+						infoindex,
+						attr->attr.name));
+	}
+}
+#endif
+
 #ifdef CONFIG_KALLSYMS
 int is_exported(const char *name, const struct module *mod)
 {
@@ -1615,6 +1705,11 @@ static struct module *load_module(void __user *umod,
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
+#ifdef CONFIG_MODULE_UNLOAD
+	/* Set up MODINFO_ATTR fields */
+	setup_modinfo(mod, sechdrs, infoindex);
+#endif
+
 	/* Fix up syms, so that st_value is a pointer to location. */
 	err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
 			       mod);
-- 
cgit v1.3-14-g43fede


From 52c1da39534fb382c061de58b65f678ad74b59f5 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Jun 2005 22:05:33 -0700
Subject: [PATCH] make various things static

Another rollup of patches which give various symbols static scope

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/media/common/saa7146_fops.c |  2 +-
 drivers/media/video/tvaudio.c       | 22 +++++++++++-----------
 drivers/scsi/hosts.c                |  2 +-
 drivers/scsi/scsi.c                 |  6 ++++--
 drivers/scsi/scsi_debug.c           |  2 +-
 drivers/scsi/scsi_lib.c             |  2 +-
 drivers/scsi/scsi_priv.h            |  4 ----
 drivers/scsi/scsi_sysfs.c           |  4 ++--
 fs/namespace.c                      |  2 +-
 fs/reiserfs/stree.c                 |  2 +-
 include/linux/irq.h                 |  1 -
 include/linux/namespace.h           |  1 -
 include/net/sctp/sm.h               |  6 ------
 kernel/irq/spurious.c               |  2 +-
 kernel/module.c                     |  2 +-
 kernel/power/swsusp.c               |  2 +-
 net/sctp/sm_statefuns.c             | 16 ++++++++++++++--
 17 files changed, 40 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/drivers/media/common/saa7146_fops.c b/drivers/media/common/saa7146_fops.c
index cb826c9adfe7..c04fd11526e0 100644
--- a/drivers/media/common/saa7146_fops.c
+++ b/drivers/media/common/saa7146_fops.c
@@ -403,7 +403,7 @@ static struct file_operations video_fops =
 	.llseek		= no_llseek,
 };
 
-void vv_callback(struct saa7146_dev *dev, unsigned long status)
+static void vv_callback(struct saa7146_dev *dev, unsigned long status)
 {
 	u32 isr = status;
 	
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index 5430b25b910d..9a493bea76d8 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -1236,17 +1236,17 @@ static int ta8874z_checkit(struct CHIPSTATE *chip)
 /* audio chip descriptions - struct CHIPDESC                              */
 
 /* insmod options to enable/disable individual audio chips */
-int tda8425  = 1;
-int tda9840  = 1;
-int tda9850  = 1;
-int tda9855  = 1;
-int tda9873  = 1;
-int tda9874a = 1;
-int tea6300  = 0;  // address clash with msp34xx
-int tea6320  = 0;  // address clash with msp34xx
-int tea6420  = 1;
-int pic16c54 = 1;
-int ta8874z  = 0;  // address clash with tda9840
+static int tda8425  = 1;
+static int tda9840  = 1;
+static int tda9850  = 1;
+static int tda9855  = 1;
+static int tda9873  = 1;
+static int tda9874a = 1;
+static int tea6300  = 0;  // address clash with msp34xx
+static int tea6320  = 0;  // address clash with msp34xx
+static int tea6420  = 1;
+static int pic16c54 = 1;
+static int ta8874z  = 0;  // address clash with tda9840
 
 module_param(tda8425, int, 0444);
 module_param(tda9840, int, 0444);
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index ba347576d99b..d7a38b6713f9 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -56,7 +56,7 @@ static struct class shost_class = {
  * @shost:	pointer to struct Scsi_Host
  * recovery:	recovery requested to run.
  **/
-void scsi_host_cancel(struct Scsi_Host *shost, int recovery)
+static void scsi_host_cancel(struct Scsi_Host *shost, int recovery)
 {
 	struct scsi_device *sdev;
 
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 5578ae9a9e45..1cb5f7d4f278 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -68,6 +68,8 @@
 #include "scsi_priv.h"
 #include "scsi_logging.h"
 
+static void scsi_done(struct scsi_cmnd *cmd);
+static int scsi_retry_command(struct scsi_cmnd *cmd);
 
 /*
  * Definitions and constants.
@@ -741,7 +743,7 @@ static DEFINE_PER_CPU(struct list_head, scsi_done_q);
  *
  * This function is interrupt context safe.
  */
-void scsi_done(struct scsi_cmnd *cmd)
+static void scsi_done(struct scsi_cmnd *cmd)
 {
 	/*
 	 * We don't have to worry about this one timing out any more.
@@ -836,7 +838,7 @@ static void scsi_softirq(struct softirq_action *h)
  *              level drivers should not become re-entrant as a result of
  *              this.
  */
-int scsi_retry_command(struct scsi_cmnd *cmd)
+static int scsi_retry_command(struct scsi_cmnd *cmd)
 {
 	/*
 	 * Restore the SCSI command state.
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index e0208886b45e..322b5a41a36f 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -1783,7 +1783,7 @@ static void __exit scsi_debug_exit(void)
 device_initcall(scsi_debug_init);
 module_exit(scsi_debug_exit);
 
-void pseudo_0_release(struct device * dev)
+static void pseudo_0_release(struct device * dev)
 {
 	if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
 		printk(KERN_INFO "scsi_debug: pseudo_0_release() called\n");
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9f996499fa9d..621dee8b8cb2 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -44,7 +44,7 @@ struct scsi_host_sg_pool {
 #endif
 
 #define SP(x) { x, "sgpool-" #x } 
-struct scsi_host_sg_pool scsi_sg_pools[] = { 
+static struct scsi_host_sg_pool scsi_sg_pools[] = {
 	SP(8),
 	SP(16),
 	SP(32),
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index c01580df4476..96d4f745975c 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -61,8 +61,6 @@ extern void scsi_exit_hosts(void);
 extern int scsi_dispatch_cmd(struct scsi_cmnd *cmd);
 extern int scsi_setup_command_freelist(struct Scsi_Host *shost);
 extern void scsi_destroy_command_freelist(struct Scsi_Host *shost);
-extern void scsi_done(struct scsi_cmnd *cmd);
-extern int scsi_retry_command(struct scsi_cmnd *cmd);
 extern int scsi_insert_special_req(struct scsi_request *sreq, int);
 extern void scsi_init_cmd_from_req(struct scsi_cmnd *cmd,
 		struct scsi_request *sreq);
@@ -136,7 +134,6 @@ extern void scsi_exit_sysctl(void);
 #endif /* CONFIG_SYSCTL */
 
 /* scsi_sysfs.c */
-extern void scsi_device_dev_release(struct device *);
 extern int scsi_sysfs_add_sdev(struct scsi_device *);
 extern int scsi_sysfs_add_host(struct Scsi_Host *);
 extern int scsi_sysfs_register(void);
@@ -145,7 +142,6 @@ extern void scsi_sysfs_device_initialize(struct scsi_device *);
 extern int scsi_sysfs_target_initialize(struct scsi_device *);
 extern struct scsi_transport_template blank_transport_template;
 
-extern struct class sdev_class;
 extern struct bus_type scsi_bus_type;
 
 /* 
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 93b41100a6d8..beed7fbe1cbe 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -150,7 +150,7 @@ static void scsi_device_cls_release(struct class_device *class_dev)
 	put_device(&sdev->sdev_gendev);
 }
 
-void scsi_device_dev_release(struct device *dev)
+static void scsi_device_dev_release(struct device *dev)
 {
 	struct scsi_device *sdev;
 	struct device *parent;
@@ -185,7 +185,7 @@ void scsi_device_dev_release(struct device *dev)
 		put_device(parent);
 }
 
-struct class sdev_class = {
+static struct class sdev_class = {
 	.name		= "scsi_device",
 	.release	= scsi_device_cls_release,
 };
diff --git a/fs/namespace.c b/fs/namespace.c
index 3b93e5d750eb..208c079e9fdb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -337,7 +337,7 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-void umount_tree(struct vfsmount *mnt)
+static void umount_tree(struct vfsmount *mnt)
 {
 	struct vfsmount *p;
 	LIST_HEAD(kill);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index c47f8fd31a2d..63158491e152 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -223,7 +223,7 @@ extern struct tree_balance * cur_tb;
 const struct reiserfs_key  MIN_KEY = {0, 0, {{0, 0},}};
 
 /* Maximal possible key. It is never in the tree. */
-const struct reiserfs_key  MAX_KEY = {
+static const struct reiserfs_key  MAX_KEY = {
 	__constant_cpu_to_le32(0xffffffff),
 	__constant_cpu_to_le32(0xffffffff),
 	{{__constant_cpu_to_le32(0xffffffff),
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7fc1022be9ee..12277799c007 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -88,7 +88,6 @@ extern fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 				       struct irqaction *action);
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
 extern void note_interrupt(unsigned int irq, irq_desc_t *desc, int action_ret);
-extern void report_bad_irq(unsigned int irq, irq_desc_t *desc, int action_ret);
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
 extern void init_irq_proc(void);
diff --git a/include/linux/namespace.h b/include/linux/namespace.h
index 9eca1558d72f..697991b69f9b 100644
--- a/include/linux/namespace.h
+++ b/include/linux/namespace.h
@@ -12,7 +12,6 @@ struct namespace {
 	struct rw_semaphore	sem;
 };
 
-extern void umount_tree(struct vfsmount *);
 extern int copy_namespace(int, struct task_struct *);
 extern void __put_namespace(struct namespace *namespace);
 
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index a53e08a45e32..88d9fe5975d5 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -131,7 +131,6 @@ sctp_state_fn_t sctp_sf_do_ecne;
 sctp_state_fn_t sctp_sf_ootb;
 sctp_state_fn_t sctp_sf_pdiscard;
 sctp_state_fn_t sctp_sf_violation;
-sctp_state_fn_t sctp_sf_violation_chunklen;
 sctp_state_fn_t sctp_sf_discard_chunk;
 sctp_state_fn_t sctp_sf_do_5_2_1_siminit;
 sctp_state_fn_t sctp_sf_do_5_2_2_dupinit;
@@ -259,11 +258,6 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
-sctp_disposition_t  sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
-					   __u16 error,
-					   const struct sctp_association *asoc,
-					   struct sctp_transport *transport);
-
 /* Prototypes for statetable processing. */
 
 int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f6297c306905..ba039e827d58 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -45,7 +45,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 	}
 }
 
-void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 {
 	static int count = 100;
 
diff --git a/kernel/module.c b/kernel/module.c
index 0494c89a0d26..068e271ab3a5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -730,7 +730,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp)
 	return 0;
 }
 
-int set_obsolete(const char *val, struct kernel_param *kp)
+static int set_obsolete(const char *val, struct kernel_param *kp)
 {
 	unsigned int min, max;
 	unsigned int size, maxsize;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 90b3b68dee3f..53f9f8720ee4 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -81,7 +81,7 @@ static int nr_copy_pages_check;
 extern char resume_file[];
 
 /* Local variables that should not be affected by save */
-unsigned int nr_copy_pages __nosavedata = 0;
+static unsigned int nr_copy_pages __nosavedata = 0;
 
 /* Suspend pagedir is allocated before final copy, therefore it
    must be freed after resume 
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 058189684c7c..86073df418f5 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
 					     sctp_cmd_seq_t *commands);
 static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
 
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+					   __u16 error,
+					   const struct sctp_association *asoc,
+					   struct sctp_transport *transport);
+
+static sctp_disposition_t sctp_sf_violation_chunklen(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands);
 
 /* Small helper function that checks if the chunk length
  * is of the appropriate length.  The 'required_length' argument
@@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
  *
  * This is common code called by several sctp_sf_*_abort() functions above.
  */
-sctp_disposition_t  sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
 					   __u16 error,
 					   const struct sctp_association *asoc,
 					   struct sctp_transport *transport)
@@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
  *
  * Generate an  ABORT chunk and terminate the association.
  */
-sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep,
+static sctp_disposition_t sctp_sf_violation_chunklen(
+				     const struct sctp_endpoint *ep,
 				     const struct sctp_association *asoc,
 				     const sctp_subtype_t type,
 				     void *arg,
-- 
cgit v1.3-14-g43fede


From f370513640492641b4046bfd9a6e4714f6ae530d Mon Sep 17 00:00:00 2001
From: Zwane Mwaikambo <zwane@linuxpower.ca>
Date: Sat, 25 Jun 2005 14:54:50 -0700
Subject: [PATCH] i386 CPU hotplug

(The i386 CPU hotplug patch provides infrastructure for some work which Pavel
is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua
<shaohua.li@intel.com> is doing)

The following provides i386 architecture support for safely unregistering and
registering processors during runtime, updated for the current -mm tree.  In
order to avoid dumping cpu hotplug code into kernel/irq/*, I dropped the
cpu_online check in do_IRQ() by modifying fixup_irqs().  The difference is
that on cpu offline, fixup_irqs() is called before we clear the cpu from
cpu_online_map, and it includes a long delay in order to ensure that we never
have any queued external interrupts on the APICs.  There are additional
changes to ia64, ppc64 and s390 to account for this change.

1) Add CONFIG_HOTPLUG_CPU
2) Disable local APIC timer on dead cpus.
3) Disable preempt around irq balancing to prevent CPUs going down.
4) Print irq stats for all possible cpus.
5) Debugging check for interrupts on offline cpus.
6) Hacky fixup_irqs() to redirect irqs when cpus go off/online.
7) play_dead() for offline cpus to spin inside.
8) Handle offline cpus set in flush_tlb_others().
9) Grab lock earlier in smp_call_function() to prevent CPUs going down.
10) Implement __cpu_disable() and __cpu_die().
11) Enable local interrupts in cpu_enable() after fixup_irqs().
12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus.
13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline.

Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig               |  9 ++++
 arch/i386/kernel/apic.c         |  3 +-
 arch/i386/kernel/io_apic.c      |  2 +
 arch/i386/kernel/irq.c          | 67 ++++++++++++++++++++++------
 arch/i386/kernel/process.c      | 39 +++++++++++++++-
 arch/i386/kernel/smp.c          | 24 ++++++----
 arch/i386/kernel/smpboot.c      | 98 ++++++++++++++++++++++++++++++++++++++---
 arch/i386/kernel/traps.c        |  8 ++++
 arch/ia64/kernel/smpboot.c      |  1 +
 arch/ppc64/kernel/pSeries_smp.c |  5 ++-
 arch/s390/kernel/smp.c          |  4 +-
 include/asm-i386/cpu.h          |  2 +
 include/asm-i386/irq.h          |  4 ++
 include/asm-i386/smp.h          |  3 ++
 kernel/cpu.c                    | 14 +++---
 15 files changed, 243 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index d4ae5f9ceae6..b4cd11e58451 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1250,6 +1250,15 @@ config SCx200
 	  This support is also available as a module.  If compiled as a
 	  module, it will be called scx200.
 
+config HOTPLUG_CPU
+	bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+	depends on SMP && HOTPLUG && EXPERIMENTAL
+	---help---
+	  Say Y here to experiment with turning CPUs off and on.  CPUs
+	  can be controlled through /sys/devices/system/cpu.
+
+	  Say N.
+
 source "drivers/pcmcia/Kconfig"
 
 source "drivers/pci/hotplug/Kconfig"
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 8d993fa71754..a28a088f3e75 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -26,6 +26,7 @@
 #include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
+#include <linux/cpu.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -1048,7 +1049,7 @@ void __init setup_secondary_APIC_clock(void)
 	setup_APIC_timer(calibration_result);
 }
 
-void __init disable_APIC_timer(void)
+void __devinit disable_APIC_timer(void)
 {
 	if (using_apic_timer) {
 		unsigned long v;
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 08540bc4ba3e..3c2b3bdfc807 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -576,9 +576,11 @@ static int balanced_irq(void *unused)
 		try_to_freeze(PF_FREEZE);
 		if (time_after(jiffies,
 				prev_balance_time+balanced_irq_interval)) {
+			preempt_disable();
 			do_irq_balance();
 			prev_balance_time = jiffies;
 			time_remaining = balanced_irq_interval;
+			preempt_enable();
 		}
 	}
 	return 0;
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 73945a3c53c4..af115004aec5 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -15,6 +15,9 @@
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
 
 DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp;
 EXPORT_PER_CPU_SYMBOL(irq_stat);
@@ -210,9 +213,8 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i == 0) {
 		seq_printf(p, "           ");
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 		seq_putc(p, '\n');
 	}
 
@@ -225,9 +227,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+		for_each_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
 		seq_printf(p, " %14s", irq_desc[i].handler->typename);
 		seq_printf(p, "  %s", action->name);
@@ -240,16 +241,14 @@ skip:
 		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
 	} else if (i == NR_IRQS) {
 		seq_printf(p, "NMI: ");
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", nmi_count(j));
+		for_each_cpu(j)
+			seq_printf(p, "%10u ", nmi_count(j));
 		seq_putc(p, '\n');
 #ifdef CONFIG_X86_LOCAL_APIC
 		seq_printf(p, "LOC: ");
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ",
-					per_cpu(irq_stat,j).apic_timer_irqs);
+		for_each_cpu(j)
+			seq_printf(p, "%10u ",
+				per_cpu(irq_stat,j).apic_timer_irqs);
 		seq_putc(p, '\n');
 #endif
 		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
@@ -259,3 +258,45 @@ skip:
 	}
 	return 0;
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <mach_apic.h>
+
+void fixup_irqs(cpumask_t map)
+{
+	unsigned int irq;
+	static int warned;
+
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		cpumask_t mask;
+		if (irq == 2)
+			continue;
+
+		cpus_and(mask, irq_affinity[irq], map);
+		if (any_online_cpu(mask) == NR_CPUS) {
+			printk("Breaking affinity for irq %i\n", irq);
+			mask = map;
+		}
+		if (irq_desc[irq].handler->set_affinity)
+			irq_desc[irq].handler->set_affinity(irq, mask);
+		else if (irq_desc[irq].action && !(warned++))
+			printk("Cannot set affinity for irq %i\n", irq);
+	}
+
+#if 0
+	barrier();
+	/* Ingo Molnar says: "after the IO-APIC masks have been redirected
+	   [note the nop - the interrupt-enable boundary on x86 is two
+	   instructions from sti] - to flush out pending hardirqs and
+	   IPIs. After this point nothing is supposed to reach this CPU." */
+	__asm__ __volatile__("sti; nop; cli");
+	barrier();
+#else
+	/* That doesn't seem sufficient.  Give it 1ms. */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+#endif
+}
+#endif
+
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index aea2ce1145df..c1b11e8df60b 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -13,6 +13,7 @@
 
 #include <stdarg.h>
 
+#include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
@@ -55,6 +56,9 @@
 #include <linux/irq.h>
 #include <linux/err.h>
 
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
 static int hlt_counter;
@@ -143,14 +147,44 @@ static void poll_idle (void)
 	}
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+	/* Ack it */
+	__get_cpu_var(cpu_state) = CPU_DEAD;
+
+	/* We shouldn't have to disable interrupts while dead, but
+	 * some interrupts just don't seem to go away, and this makes
+	 * it "work" for testing purposes. */
+	/* Death loop */
+	while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
+		cpu_relax();
+
+	local_irq_disable();
+	__flush_tlb_all();
+	cpu_set(smp_processor_id(), cpu_online_map);
+	enable_APIC_timer();
+	local_irq_enable();
+}
+#else
+static inline void play_dead(void)
+{
+	BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 /*
  * The idle thread. There's no useful work to be
  * done, so just try to conserve power and have a
  * low exit latency (ie sit in a loop waiting for
  * somebody to say that they'd like to reschedule)
  */
-void cpu_idle (void)
+void cpu_idle(void)
 {
+	int cpu = raw_smp_processor_id();
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched()) {
@@ -165,6 +199,9 @@ void cpu_idle (void)
 			if (!idle)
 				idle = default_idle;
 
+			if (cpu_is_offline(cpu))
+				play_dead();
+
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
 			idle();
 		}
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 68be7d0c7238..35f521612b20 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -19,6 +19,7 @@
 #include <linux/mc146818rtc.h>
 #include <linux/cache.h>
 #include <linux/interrupt.h>
+#include <linux/cpu.h>
 #include <linux/module.h>
 
 #include <asm/mtrr.h>
@@ -164,7 +165,7 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
 	unsigned long flags;
 
 	local_irq_save(flags);
-		
+	WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
 	/*
 	 * Wait for idle.
 	 */
@@ -346,21 +347,21 @@ out:
 static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 						unsigned long va)
 {
-	cpumask_t tmp;
 	/*
 	 * A couple of (to be removed) sanity checks:
 	 *
-	 * - we do not send IPIs to not-yet booted CPUs.
 	 * - current CPU must not be in mask
 	 * - mask must exist :)
 	 */
 	BUG_ON(cpus_empty(cpumask));
-
-	cpus_and(tmp, cpumask, cpu_online_map);
-	BUG_ON(!cpus_equal(cpumask, tmp));
 	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
 	BUG_ON(!mm);
 
+	/* If a CPU which we ran on has gone down, OK. */
+	cpus_and(cpumask, cpumask, cpu_online_map);
+	if (cpus_empty(cpumask))
+		return;
+
 	/*
 	 * i'm not happy about this global shared spinlock in the
 	 * MM hot path, but we'll see how contended it is.
@@ -476,6 +477,7 @@ void flush_tlb_all(void)
  */
 void smp_send_reschedule(int cpu)
 {
+	WARN_ON(cpu_is_offline(cpu));
 	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
 }
 
@@ -516,10 +518,15 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
  */
 {
 	struct call_data_struct data;
-	int cpus = num_online_cpus()-1;
+	int cpus;
 
-	if (!cpus)
+	/* Holding any lock stops cpus from going down. */
+	spin_lock(&call_lock);
+	cpus = num_online_cpus() - 1;
+	if (!cpus) {
+		spin_unlock(&call_lock);
 		return 0;
+	}
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
@@ -531,7 +538,6 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 	if (wait)
 		atomic_set(&data.finished, 0);
 
-	spin_lock(&call_lock);
 	call_data = &data;
 	mb();
 	
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index c20d96d5c15c..ad74a46e9ef0 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -44,6 +44,9 @@
 #include <linux/smp_lock.h>
 #include <linux/irq.h>
 #include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
 
 #include <linux/delay.h>
 #include <linux/mc146818rtc.h>
@@ -96,6 +99,9 @@ static int trampoline_exec;
 
 static void map_cpu_to_logical_apicid(void);
 
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
 /*
  * Currently trivial. Write the real->protected mode
  * bootstrap into the page concerned. The caller
@@ -1119,6 +1125,9 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
    who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	smp_commenced_mask = cpumask_of_cpu(0);
+	cpu_callin_map = cpumask_of_cpu(0);
+	mb();
 	smp_boot_cpus(max_cpus);
 }
 
@@ -1128,20 +1137,99 @@ void __devinit smp_prepare_boot_cpu(void)
 	cpu_set(smp_processor_id(), cpu_callout_map);
 }
 
-int __devinit __cpu_up(unsigned int cpu)
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* must be called with the cpucontrol mutex held */
+static int __devinit cpu_enable(unsigned int cpu)
 {
-	/* This only works at boot for x86.  See "rewrite" above. */
-	if (cpu_isset(cpu, smp_commenced_mask)) {
-		local_irq_enable();
-		return -ENOSYS;
+	/* get the target out of its holding state */
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	wmb();
+
+	/* wait for the processor to ack it. timeout? */
+	while (!cpu_online(cpu))
+		cpu_relax();
+
+	fixup_irqs(cpu_online_map);
+	/* counter the disable in fixup_irqs() */
+	local_irq_enable();
+	return 0;
+}
+
+int __cpu_disable(void)
+{
+	cpumask_t map = cpu_online_map;
+	int cpu = smp_processor_id();
+
+	/*
+	 * Perhaps use cpufreq to drop frequency, but that could go
+	 * into generic code.
+ 	 *
+	 * We won't take down the boot processor on i386 due to some
+	 * interrupts only being able to be serviced by the BSP.
+	 * Especially so if we're not using an IOAPIC	-zwane
+	 */
+	if (cpu == 0)
+		return -EBUSY;
+
+	/* We enable the timer again on the exit path of the death loop */
+	disable_APIC_timer();
+	/* Allow any queued timer interrupts to get serviced */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+
+	cpu_clear(cpu, map);
+	fixup_irqs(map);
+	/* It's now safe to remove this processor from the online map */
+	cpu_clear(cpu, cpu_online_map);
+	return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+	/* We don't do anything here: idle task is faking death itself. */
+	unsigned int i;
+
+	for (i = 0; i < 10; i++) {
+		/* They ack this in play_dead by setting CPU_DEAD */
+		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+			return;
+		current->state = TASK_UNINTERRUPTIBLE;
+		schedule_timeout(HZ/10);
 	}
+ 	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int __cpu_disable(void)
+{
+	return -ENOSYS;
+}
 
+void __cpu_die(unsigned int cpu)
+{
+	/* We said "no" in __cpu_disable */
+	BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __devinit __cpu_up(unsigned int cpu)
+{
 	/* In case one didn't come up */
 	if (!cpu_isset(cpu, cpu_callin_map)) {
+		printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
 		local_irq_enable();
 		return -EIO;
 	}
 
+#ifdef CONFIG_HOTPLUG_CPU
+	/* Already up, and in cpu_quiescent now? */
+	if (cpu_isset(cpu, smp_commenced_mask)) {
+		cpu_enable(cpu);
+		return 0;
+	}
+#endif
+
 	local_irq_enable();
 	/* Unleash the CPU! */
 	cpu_set(cpu, smp_commenced_mask);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index e4d4e2162c7a..207ea8ba7169 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -625,6 +625,14 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
 	nmi_enter();
 
 	cpu = smp_processor_id();
+
+#ifdef CONFIG_HOTPLUG_CPU
+	if (!cpu_online(cpu)) {
+		nmi_exit();
+		return;
+	}
+#endif
+
 	++nmi_count(cpu);
 
 	if (!nmi_callback(regs, cpu))
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 3865f088ffa2..a888ddc10f7d 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -688,6 +688,7 @@ int __cpu_disable(void)
 		return -EBUSY;
 
 	remove_siblinginfo(cpu);
+	cpu_clear(cpu, cpu_online_map);
 	fixup_irqs();
 	local_flush_tlb_all();
 	cpu_clear(cpu, cpu_callin_map);
diff --git a/arch/ppc64/kernel/pSeries_smp.c b/arch/ppc64/kernel/pSeries_smp.c
index 30154140f7e2..62c55a123560 100644
--- a/arch/ppc64/kernel/pSeries_smp.c
+++ b/arch/ppc64/kernel/pSeries_smp.c
@@ -93,10 +93,13 @@ static int query_cpu_stopped(unsigned int pcpu)
 
 int pSeries_cpu_disable(void)
 {
+	int cpu = smp_processor_id();
+
+	cpu_clear(cpu, cpu_online_map);
 	systemcfg->processorCount--;
 
 	/*fix boot_cpuid here*/
-	if (smp_processor_id() == boot_cpuid)
+	if (cpu == boot_cpuid)
 		boot_cpuid = any_online_cpu(cpu_online_map);
 
 	/* FIXME: abstract this to not be platform specific later on */
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index fdfcf0488b49..93c71fef99dc 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -679,12 +679,14 @@ __cpu_disable(void)
 {
 	unsigned long flags;
 	ec_creg_mask_parms cr_parms;
+	int cpu = smp_processor_id();
 
 	spin_lock_irqsave(&smp_reserve_lock, flags);
-	if (smp_cpu_reserved[smp_processor_id()] != 0) {
+	if (smp_cpu_reserved[cpu] != 0) {
 		spin_unlock_irqrestore(&smp_reserve_lock, flags);
 		return -EBUSY;
 	}
+	cpu_clear(cpu, cpu_online_map);
 
 #ifdef CONFIG_PFAULT
 	/* Disable pfault pseudo page faults on this cpu. */
diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h
index 002740b21951..e7252c216ca8 100644
--- a/include/asm-i386/cpu.h
+++ b/include/asm-i386/cpu.h
@@ -5,6 +5,7 @@
 #include <linux/cpu.h>
 #include <linux/topology.h>
 #include <linux/nodemask.h>
+#include <linux/percpu.h>
 
 #include <asm/node.h>
 
@@ -16,4 +17,5 @@ extern int arch_register_cpu(int num);
 extern void arch_unregister_cpu(int);
 #endif
 
+DECLARE_PER_CPU(int, cpu_state);
 #endif /* _ASM_I386_CPU_H_ */
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 05b9e61b0a72..e2d8bf23ad70 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -38,4 +38,8 @@ extern void release_vm86_irqs(struct task_struct *);
 extern int irqbalance_disable(char *str);
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+extern void fixup_irqs(cpumask_t map);
+#endif
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 55ef31f66bbe..507f2fd39a6a 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -83,6 +83,9 @@ static __inline int logical_smp_processor_id(void)
 }
 
 #endif
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
 #endif /* !__ASSEMBLY__ */
 
 #define NO_PROC_ID		0xFF		/* No processor magic marker */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 628f4ccda127..53d8263ae12e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused)
 {
 	int err;
 
-	/* Take offline: makes arch_cpu_down somewhat easier. */
-	cpu_clear(smp_processor_id(), cpu_online_map);
-
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
-		cpu_set(smp_processor_id(), cpu_online_map);
-	else
-		/* Force idle task to run as soon as we yield: it should
-		   immediately notice cpu is offline and die quickly. */
-		sched_idle_next();
+		return err;
 
-	return err;
+	/* Force idle task to run as soon as we yield: it should
+	   immediately notice cpu is offline and die quickly. */
+	sched_idle_next();
+	return 0;
 }
 
 int cpu_down(unsigned int cpu)
-- 
cgit v1.3-14-g43fede


From 5a72e04df5470df0ec646029d31e5528167ab1a7 Mon Sep 17 00:00:00 2001
From: Li Shaohua <shaohua.li@intel.com>
Date: Sat, 25 Jun 2005 14:55:06 -0700
Subject: [PATCH] suspend/resume SMP support

Use CPU hotplug to support suspend/resume on SMP.  Both S3 and S4 use the
disable_nonboot_cpus()/enable_nonboot_cpus() API.  The S4 part is based on
Pavel's original S4 SMP patch.

Signed-off-by: Li Shaohua <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/mcheck/k7.c      |  2 +-
 arch/i386/kernel/cpu/mcheck/mce.c     |  2 +-
 arch/i386/kernel/cpu/mcheck/p4.c      |  4 +-
 arch/i386/kernel/cpu/mcheck/p6.c      |  2 +-
 arch/i386/kernel/cpu/mcheck/winchip.c |  2 +-
 drivers/acpi/Kconfig                  |  2 +-
 include/linux/suspend.h               |  2 +-
 kernel/power/Kconfig                  |  6 ++-
 kernel/power/Makefile                 |  6 +--
 kernel/power/disk.c                   | 35 +++++++-------
 kernel/power/main.c                   | 16 ++++---
 kernel/power/smp.c                    | 89 +++++++++++++----------------------
 kernel/power/swsusp.c                 |  2 +
 13 files changed, 80 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c
index 8df52e86c4d2..c4abe7657397 100644
--- a/arch/i386/kernel/cpu/mcheck/k7.c
+++ b/arch/i386/kernel/cpu/mcheck/k7.c
@@ -69,7 +69,7 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
 
 
 /* AMD K7 machine check is Intel like */
-void __init amd_mcheck_init(struct cpuinfo_x86 *c)
+void __devinit amd_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index 7218a7341fbc..2cf25d2ba0f1 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -16,7 +16,7 @@
 
 #include "mce.h"
 
-int mce_disabled __initdata = 0;
+int mce_disabled __devinitdata = 0;
 int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 8b16ceb929b4..0abccb6fdf9e 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -78,7 +78,7 @@ fastcall void smp_thermal_interrupt(struct pt_regs *regs)
 }
 
 /* P4/Xeon Thermal regulation detect and init */
-static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+static void __devinit intel_init_thermal(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	unsigned int cpu = smp_processor_id();
@@ -232,7 +232,7 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
 }
 
 
-void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c)
+void __devinit intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
diff --git a/arch/i386/kernel/cpu/mcheck/p6.c b/arch/i386/kernel/cpu/mcheck/p6.c
index 46640f8c2494..f01b73f947e1 100644
--- a/arch/i386/kernel/cpu/mcheck/p6.c
+++ b/arch/i386/kernel/cpu/mcheck/p6.c
@@ -80,7 +80,7 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
 }
 
 /* Set up machine check reporting for processors with Intel style MCE */
-void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c)
+void __devinit intel_p6_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
diff --git a/arch/i386/kernel/cpu/mcheck/winchip.c b/arch/i386/kernel/cpu/mcheck/winchip.c
index 753fa7acb984..7bae68fa168f 100644
--- a/arch/i386/kernel/cpu/mcheck/winchip.c
+++ b/arch/i386/kernel/cpu/mcheck/winchip.c
@@ -23,7 +23,7 @@ static fastcall void winchip_machine_check(struct pt_regs * regs, long error_cod
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
-void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
+void __devinit winchip_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 lo, hi;
 	machine_check_vector = winchip_machine_check;
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 670fdb5142d1..86c52520ed34 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -55,7 +55,7 @@ if ACPI_INTERPRETER
 
 config ACPI_SLEEP
 	bool "Sleep States (EXPERIMENTAL)"
-	depends on X86
+	depends on X86 && (!SMP || SUSPEND_SMP)
 	depends on EXPERIMENTAL && PM
 	default y
 	---help---
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 2bf0d5fabcdb..f2e96fdfaae0 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -58,7 +58,7 @@ static inline int software_suspend(void)
 }
 #endif
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_SUSPEND_SMP
 extern void disable_nonboot_cpus(void);
 extern void enable_nonboot_cpus(void);
 #else
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 696387ffe49c..fdb377636505 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -28,7 +28,7 @@ config PM_DEBUG
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && PM && SWAP
+	depends on EXPERIMENTAL && PM && SWAP && (SUSPEND_SMP || !SMP)
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
 	  suspended image to. It will simply pick the first available swap 
 	  device.
 
+config SUSPEND_SMP
+	bool
+	depends on HOTPLUG_CPU && X86 && PM
+	default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index fbdc634135a7..2f438d0eaa13 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
 EXTRA_CFLAGS	+=	-DDEBUG
 endif
 
-swsusp-smp-$(CONFIG_SMP)	+= smp.o
-
 obj-y				:= main.o process.o console.o pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o $(swsusp-smp-y) disk.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o
+
+obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02b6764034dc..fb8de63c2919 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,8 +117,8 @@ static void finish(void)
 {
 	device_resume();
 	platform_finish();
-	enable_nonboot_cpus();
 	thaw_processes();
+	enable_nonboot_cpus();
 	pm_restore_console();
 }
 
@@ -131,28 +131,35 @@ static int prepare_processes(void)
 
 	sys_sync();
 
+	disable_nonboot_cpus();
+
 	if (freeze_processes()) {
 		error = -EBUSY;
-		return error;
+		goto thaw;
 	}
 
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
 		if (pm_ops && pm_ops->prepare) {
 			if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
-				return error;
+				goto thaw;
 		}
 	}
 
 	/* Free memory before shutting down devices. */
 	free_some_memory();
-
 	return 0;
+thaw:
+	thaw_processes();
+	enable_nonboot_cpus();
+	pm_restore_console();
+	return error;
 }
 
 static void unprepare_processes(void)
 {
-	enable_nonboot_cpus();
+	platform_finish();
 	thaw_processes();
+	enable_nonboot_cpus();
 	pm_restore_console();
 }
 
@@ -160,15 +167,9 @@ static int prepare_devices(void)
 {
 	int error;
 
-	disable_nonboot_cpus();
-	if ((error = device_suspend(PMSG_FREEZE))) {
+	if ((error = device_suspend(PMSG_FREEZE)))
 		printk("Some devices failed to suspend\n");
-		platform_finish();
-		enable_nonboot_cpus();
-		return error;
-	}
-
-	return 0;
+	return error;
 }
 
 /**
@@ -185,9 +186,9 @@ int pm_suspend_disk(void)
 	int error;
 
 	error = prepare_processes();
-	if (!error) {
-		error = prepare_devices();
-	}
+	if (error)
+		return error;
+	error = prepare_devices();
 
 	if (error) {
 		unprepare_processes();
@@ -250,7 +251,7 @@ static int software_resume(void)
 
 	if ((error = prepare_processes())) {
 		swsusp_close();
-		goto Cleanup;
+		goto Done;
 	}
 
 	pr_debug("PM: Reading swsusp image.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4cdebc972ff2..c94cb9e95090 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state)
 
 	pm_prepare_console();
 
+	disable_nonboot_cpus();
+
+	if (num_online_cpus() != 1) {
+		error = -EPERM;
+		goto Enable_cpu;
+	}
+
 	if (freeze_processes()) {
 		error = -EAGAIN;
 		goto Thaw;
@@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state)
 		pm_ops->finish(state);
  Thaw:
 	thaw_processes();
+ Enable_cpu:
+	enable_nonboot_cpus();
 	pm_restore_console();
 	return error;
 }
@@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state)
 	if (pm_ops && pm_ops->finish)
 		pm_ops->finish(state);
 	thaw_processes();
+	enable_nonboot_cpus();
 	pm_restore_console();
 }
 
@@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state)
 		goto Unlock;
 	}
 
-	/* Suspend is hard to get right on SMP. */
-	if (num_online_cpus() != 1) {
-		error = -EPERM;
-		goto Unlock;
-	}
-
 	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
 	if ((error = suspend_prepare(state)))
 		goto Unlock;
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 457c2302ed42..bbe23079c62c 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -13,73 +13,52 @@
 #include <linux/interrupt.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 #include <asm/atomic.h>
 #include <asm/tlbflush.h>
 
-static atomic_t cpu_counter, freeze;
-
-
-static void smp_pause(void * data)
-{
-	struct saved_context ctxt;
-	__save_processor_state(&ctxt);
-	printk("Sleeping in:\n");
-	dump_stack();
-	atomic_inc(&cpu_counter);
-	while (atomic_read(&freeze)) {
-		/* FIXME: restore takes place at random piece inside this.
-		   This should probably be written in assembly, and
-		   preserve general-purpose registers, too
-
-		   What about stack? We may need to move to new stack here.
-
-		   This should better be ran with interrupts disabled.
-		 */
-		cpu_relax();
-		barrier();
-	}
-	atomic_dec(&cpu_counter);
-	__restore_processor_state(&ctxt);
-}
-
-static cpumask_t oldmask;
+/* This is protected by pm_sem semaphore */
+static cpumask_t frozen_cpus;
 
 void disable_nonboot_cpus(void)
 {
-	oldmask = current->cpus_allowed;
-	set_cpus_allowed(current, cpumask_of_cpu(0));
-	printk("Freezing CPUs (at %d)", raw_smp_processor_id());
-	current->state = TASK_INTERRUPTIBLE;
-	schedule_timeout(HZ);
-	printk("...");
-	BUG_ON(raw_smp_processor_id() != 0);
-
-	/* FIXME: for this to work, all the CPUs must be running
-	 * "idle" thread (or we deadlock). Is that guaranteed? */
+	int cpu, error;
 
-	atomic_set(&cpu_counter, 0);
-	atomic_set(&freeze, 1);
-	smp_call_function(smp_pause, NULL, 0, 0);
-	while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
-		cpu_relax();
-		barrier();
+	error = 0;
+	cpus_clear(frozen_cpus);
+	printk("Freezing cpus ...\n");
+	for_each_online_cpu(cpu) {
+		if (cpu == 0)
+			continue;
+		error = cpu_down(cpu);
+		if (!error) {
+			cpu_set(cpu, frozen_cpus);
+			printk("CPU%d is down\n", cpu);
+			continue;
+		}
+		printk("Error taking cpu %d down: %d\n", cpu, error);
 	}
-	printk("ok\n");
+	BUG_ON(smp_processor_id() != 0);
+	if (error)
+		panic("cpus not sleeping");
 }
 
 void enable_nonboot_cpus(void)
 {
-	printk("Restarting CPUs");
-	atomic_set(&freeze, 0);
-	while (atomic_read(&cpu_counter)) {
-		cpu_relax();
-		barrier();
-	}
-	printk("...");
-	set_cpus_allowed(current, oldmask);
-	schedule();
-	printk("ok\n");
+	int cpu, error;
 
+	printk("Thawing cpus ...\n");
+	for_each_cpu_mask(cpu, frozen_cpus) {
+		error = smp_prepare_cpu(cpu);
+		if (!error)
+			error = cpu_up(cpu);
+		if (!error) {
+			printk("CPU%d is up\n", cpu);
+			continue;
+		}
+		printk("Error taking cpu %d up: %d\n", cpu, error);
+		panic("Not enough cpus");
+	}
+	cpus_clear(frozen_cpus);
 }
 
-
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 53f9f8720ee4..339b5c3735bd 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1193,8 +1193,10 @@ static const char * sanity_check(void)
 		return "version";
 	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
 		return "machine";
+#if 0
 	if(swsusp_info.cpus != num_online_cpus())
 		return "number of cpus";
+#endif
 	return NULL;
 }
 
-- 
cgit v1.3-14-g43fede


From 620b03276488c3cf103caf1e326bd21f00d3df84 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 25 Jun 2005 14:55:11 -0700
Subject: [PATCH] properly stop devices before poweroff

Without this patch, Linux provokes emergency disk shutdowns and
similar nastiness. It was in SuSE kernels for some time, IIRC.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pm.h | 33 +++++++++++++++++++++------------
 kernel/sys.c       |  3 +++
 2 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index ed2b76e75199..14479325e3f3 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -103,7 +103,8 @@ extern int pm_active;
 /*
  * Register a device with power management
  */
-struct pm_dev __deprecated *pm_register(pm_dev_t type, unsigned long id, pm_callback callback);
+struct pm_dev __deprecated *
+pm_register(pm_dev_t type, unsigned long id, pm_callback callback);
 
 /*
  * Unregister a device with power management
@@ -190,17 +191,18 @@ typedef u32 __bitwise pm_message_t;
 /*
  * There are 4 important states driver can be in:
  * ON     -- driver is working
- * FREEZE -- stop operations and apply whatever policy is applicable to a suspended driver
- *           of that class, freeze queues for block like IDE does, drop packets for
- *           ethernet, etc... stop DMA engine too etc... so a consistent image can be
- *           saved; but do not power any hardware down.
- * SUSPEND - like FREEZE, but hardware is doing as much powersaving as possible. Roughly
- *           pci D3.
+ * FREEZE -- stop operations and apply whatever policy is applicable to a
+ *           suspended driver of that class, freeze queues for block like IDE
+ *           does, drop packets for ethernet, etc... stop DMA engine too etc...
+ *           so a consistent image can be saved; but do not power any hardware
+ *           down.
+ * SUSPEND - like FREEZE, but hardware is doing as much powersaving as
+ *           possible. Roughly pci D3.
  *
- * Unfortunately, current drivers only recognize numeric values 0 (ON) and 3 (SUSPEND).
- * We'll need to fix the drivers. So yes, putting 3 to all diferent defines is intentional,
- * and will go away as soon as drivers are fixed. Also note that typedef is neccessary,
- * we'll probably want to switch to
+ * Unfortunately, current drivers only recognize numeric values 0 (ON) and 3
+ * (SUSPEND).  We'll need to fix the drivers. So yes, putting 3 to all different
+ * defines is intentional, and will go away as soon as drivers are fixed.  Also
+ * note that typedef is neccessary, we'll probably want to switch to
  *   typedef struct pm_message_t { int event; int flags; } pm_message_t
  * or something similar soon.
  */
@@ -222,11 +224,18 @@ struct dev_pm_info {
 
 extern void device_pm_set_parent(struct device * dev, struct device * parent);
 
-extern int device_suspend(pm_message_t state);
 extern int device_power_down(pm_message_t state);
 extern void device_power_up(void);
 extern void device_resume(void);
 
+#ifdef CONFIG_PM
+extern int device_suspend(pm_message_t state);
+#else
+static inline int device_suspend(pm_message_t state)
+{
+	return 0;
+}
+#endif
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/sys.c b/kernel/sys.c
index da24bc1292db..dac10161ca23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -405,6 +405,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	case LINUX_REBOOT_CMD_HALT:
 		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
 		system_state = SYSTEM_HALT;
+		device_suspend(PMSG_SUSPEND);
 		device_shutdown();
 		printk(KERN_EMERG "System halted.\n");
 		machine_halt();
@@ -415,6 +416,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	case LINUX_REBOOT_CMD_POWER_OFF:
 		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
 		system_state = SYSTEM_POWER_OFF;
+		device_suspend(PMSG_SUSPEND);
 		device_shutdown();
 		printk(KERN_EMERG "Power down.\n");
 		machine_power_off();
@@ -431,6 +433,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
 		system_state = SYSTEM_RESTART;
+		device_suspend(PMSG_FREEZE);
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
 		machine_restart(buffer);
-- 
cgit v1.3-14-g43fede


From 8f9bdf15c059c5d84db9c395705bf79b30762420 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 25 Jun 2005 14:55:12 -0700
Subject: [PATCH] swsusp: kill unnecessary does_collide_order

The following patch removes the unnecessary function does_collide_order().

This function is no longer necessary, as currently there are only 0-order
allocations in swsusp, and the use of it is confusing.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 339b5c3735bd..7747a8c43e84 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -929,21 +929,6 @@ int swsusp_resume(void)
 	return error;
 }
 
-/* More restore stuff */
-
-/*
- * Returns true if given address/order collides with any orig_address 
- */
-static int does_collide_order(unsigned long addr, int order)
-{
-	int i;
-	
-	for (i=0; i < (1<<order); i++)
-		if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
-			return 1;
-	return 0;
-}
-
 /**
  *	On resume, for storing the PBE list and the image,
  *	we can only use memory pages that do not conflict with the pages
@@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
 	unsigned long m;
 
 	m = get_zeroed_page(gfp_mask);
-	while (does_collide_order(m, 0)) {
+	while (!PageNosaveFree(virt_to_page(m))) {
 		eat_page((void *)m);
 		m = get_zeroed_page(gfp_mask);
 		if (!m)
@@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 	/* Relocate colliding pages */
 
 	for_each_pb_page (pbpage, pblist) {
-		if (does_collide_order((unsigned long)pbpage, 0)) {
+		if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
 			m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
 			if (!m) {
 				error = -ENOMEM;
-- 
cgit v1.3-14-g43fede


From 2e4d5822dc71f01bf515b8f6f4e41ae12ee785b8 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 25 Jun 2005 14:55:12 -0700
Subject: [PATCH] swsusp: cleanup whitespace

The following patch cleans up whitespace in swsusp.c (a bit):

- removes any trailing whitespace

- adds spaces after if, for, for_each_pbe, for_each_zone etc., wherever
  necessary.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 66 +++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 7747a8c43e84..9a3ca659a436 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -10,12 +10,12 @@
  * This file is released under the GPLv2.
  *
  * I'd like to thank the following people for their work:
- * 
+ *
  * Pavel Machek <pavel@ucw.cz>:
  * Modifications, defectiveness pointing, being with me at the very beginning,
  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
  *
- * Steve Doddi <dirk@loth.demon.co.uk>: 
+ * Steve Doddi <dirk@loth.demon.co.uk>:
  * Support the possibility of hardware state restoring.
  *
  * Raph <grey.havens@earthling.net>:
@@ -84,11 +84,11 @@ extern char resume_file[];
 static unsigned int nr_copy_pages __nosavedata = 0;
 
 /* Suspend pagedir is allocated before final copy, therefore it
-   must be freed after resume 
+   must be freed after resume
 
    Warning: this is evil. There are actually two pagedirs at time of
    resume. One is "pagedir_save", which is empty frame allocated at
-   time of suspend, that must be freed. Second is "pagedir_nosave", 
+   time of suspend, that must be freed. Second is "pagedir_nosave",
    allocated at time of resume, that travels through memory not to
    collide with anything.
 
@@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev)
 {
 	int error;
 
-	rw_swap_page_sync(READ, 
+	rw_swap_page_sync(READ,
 			  swp_entry(root_swap, 0),
 			  virt_to_page((unsigned long)&swsusp_header));
 	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
@@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
 		swsusp_header.swsusp_info = prev;
-		error = rw_swap_page_sync(WRITE, 
+		error = rw_swap_page_sync(WRITE,
 					  swp_entry(root_swap, 0),
 					  virt_to_page((unsigned long)
 						       &swsusp_header));
@@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
 	int i, len;
-	
+
 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
-	
+
 	swap_list_lock();
-	for(i=0; i<MAX_SWAPFILES; i++) {
+	for (i=0; i<MAX_SWAPFILES; i++) {
 		if (swap_info[i].flags == 0) {
 			swapfile_used[i]=SWAPFILE_UNUSED;
 		} else {
-			if(!len) {
+			if (!len) {
 	    			printk(KERN_WARNING "resume= option should be used to set suspend device" );
-				if(root_swap == 0xFFFF) {
+				if (root_swap == 0xFFFF) {
 					swapfile_used[i] = SWAPFILE_SUSPEND;
 					root_swap = i;
 				} else
-					swapfile_used[i] = SWAPFILE_IGNORED;				  
+					swapfile_used[i] = SWAPFILE_IGNORED;
 			} else {
 	  			/* we ignore all swap devices that are not the resume_file */
 				if (is_resume_device(&swap_info[i])) {
@@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */
  * This is called after saving image so modification
  * will be lost after resume... and that's what we want.
  * we make the device unusable. A new call to
- * lock_swapdevices can unlock the devices. 
+ * lock_swapdevices can unlock the devices.
  */
 static void lock_swapdevices(void)
 {
 	int i;
 
 	swap_list_lock();
-	for(i = 0; i< MAX_SWAPFILES; i++)
-		if(swapfile_used[i] == SWAPFILE_IGNORED) {
+	for (i = 0; i< MAX_SWAPFILES; i++)
+		if (swapfile_used[i] == SWAPFILE_IGNORED) {
 			swap_info[i].flags ^= 0xFF;
 		}
 	swap_list_unlock();
@@ -229,7 +229,7 @@ static void lock_swapdevices(void)
  *	@loc:	Place to store the entry we used.
  *
  *	Allocate a new swap entry and 'sync' it. Note we discard -EIO
- *	errors. That is an artifact left over from swsusp. It did not 
+ *	errors. That is an artifact left over from swsusp. It did not
  *	check the return of rw_swap_page_sync() at all, since most pages
  *	written back to swap would return -EIO.
  *	This is a partial improvement, since we will at least return other
@@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
 	int error = 0;
 
 	entry = get_swap_page();
-	if (swp_offset(entry) && 
+	if (swp_offset(entry) &&
 	    swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
 		error = rw_swap_page_sync(WRITE, entry,
 					  virt_to_page(addr));
@@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
 /**
  *	data_free - Free the swap entries used by the saved image.
  *
- *	Walk the list of used swap entries and free each one. 
+ *	Walk the list of used swap entries and free each one.
  *	This is only used for cleanup when suspend fails.
  */
 static void data_free(void)
@@ -290,7 +290,7 @@ static int data_write(void)
 		mod = 1;
 
 	printk( "Writing data to swap (%d pages)...     ", nr_copy_pages );
-	for_each_pbe(p, pagedir_nosave) {
+	for_each_pbe (p, pagedir_nosave) {
 		if (!(i%mod))
 			printk( "\b\b\b\b%3d%%", i / mod );
 		if ((error = write_page(p->address, &(p->swap_address))))
@@ -335,7 +335,7 @@ static int close_swap(void)
 
 	dump_info();
 	error = write_page((unsigned long)&swsusp_info, &entry);
-	if (!error) { 
+	if (!error) {
 		printk( "S" );
 		error = mark_swapfiles(entry);
 		printk( "|\n" );
@@ -370,7 +370,7 @@ static int write_pagedir(void)
 	struct pbe * pbe;
 
 	printk( "Writing pagedir...");
-	for_each_pb_page(pbe, pagedir_nosave) {
+	for_each_pb_page (pbe, pagedir_nosave) {
 		if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
 			return error;
 	}
@@ -472,7 +472,7 @@ static int save_highmem(void)
 	int res = 0;
 
 	pr_debug("swsusp: Saving Highmem\n");
-	for_each_zone(zone) {
+	for_each_zone (zone) {
 		if (is_highmem(zone))
 			res = save_highmem_zone(zone);
 		if (res)
@@ -547,7 +547,7 @@ static void count_data_pages(void)
 
 	nr_copy_pages = 0;
 
-	for_each_zone(zone) {
+	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
@@ -562,9 +562,9 @@ static void copy_data_pages(void)
 	struct zone *zone;
 	unsigned long zone_pfn;
 	struct pbe * pbe = pagedir_nosave;
-	
+
 	pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
-	for_each_zone(zone) {
+	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
@@ -702,7 +702,7 @@ static void free_image_pages(void)
 {
 	struct pbe * p;
 
-	for_each_pbe(p, pagedir_save) {
+	for_each_pbe (p, pagedir_save) {
 		if (p->address) {
 			ClearPageNosave(virt_to_page(p->address));
 			free_page(p->address);
@@ -719,7 +719,7 @@ static int alloc_image_pages(void)
 {
 	struct pbe * p;
 
-	for_each_pbe(p, pagedir_save) {
+	for_each_pbe (p, pagedir_save) {
 		p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
 		if (!p->address)
 			return -ENOMEM;
@@ -740,7 +740,7 @@ void swsusp_free(void)
 /**
  *	enough_free_mem - Make sure we enough free memory to snapshot.
  *
- *	Returns TRUE or FALSE after checking the number of available 
+ *	Returns TRUE or FALSE after checking the number of available
  *	free pages.
  */
 
@@ -758,11 +758,11 @@ static int enough_free_mem(void)
 /**
  *	enough_swap - Make sure we have enough swap to save the image.
  *
- *	Returns TRUE or FALSE after checking the total amount of swap 
+ *	Returns TRUE or FALSE after checking the total amount of swap
  *	space avaiable.
  *
  *	FIXME: si_swapinfo(&i) returns all swap devices information.
- *	We should only consider resume_device. 
+ *	We should only consider resume_device.
  */
 
 static int enough_swap(void)
@@ -827,8 +827,8 @@ static int suspend_prepare_image(void)
 	error = swsusp_alloc();
 	if (error)
 		return error;
-	
-	/* During allocating of suspend pagedir, new cold pages may appear. 
+
+	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
 	drain_local_pages();
@@ -1030,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 
 	/* Set page flags */
 
-	for_each_zone(zone) {
+	for_each_zone (zone) {
         	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
                 	SetPageNosaveFree(pfn_to_page(zone_pfn +
 					zone->zone_start_pfn));
-- 
cgit v1.3-14-g43fede


From c61978b30322c83a94d7e4857fa5b9996b7d7931 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 25 Jun 2005 14:55:14 -0700
Subject: [PATCH] swsusp: fix nr_copy_pages

The following patch moves the recalculation of nr_copy_pages so that the right
number is used in the calculation of the size of memory and swap needed.

It prevents swsusp from attempting to suspend if there is not enough memory
and/or swap (which is unlikely anyway).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 9a3ca659a436..c285fc5a2320 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -781,18 +781,18 @@ static int swsusp_alloc(void)
 {
 	int error;
 
+	pagedir_nosave = NULL;
+	nr_copy_pages = calc_nr(nr_copy_pages);
+
 	pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
 		 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
 
-	pagedir_nosave = NULL;
 	if (!enough_free_mem())
 		return -ENOMEM;
 
 	if (!enough_swap())
 		return -ENOSPC;
 
-	nr_copy_pages = calc_nr(nr_copy_pages);
-
 	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return -ENOMEM;
-- 
cgit v1.3-14-g43fede


From ac25575203c11145066ea5cb583354cb5f0a8ade Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Sat, 25 Jun 2005 14:55:15 -0700
Subject: [PATCH] CPU hotplug printk fix

In the cpu hotplug case, per-cpu data possibly isn't initialized even when
the system state is 'running'.  As the comments say in the original code, some
console drivers assume per-cpu resources have been allocated.  radeon fb is
one such driver, which uses kmalloc.  After a CPU is down, the per-cpu data
of slab is freed, so the system crashed when printing some info.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 3a442bfb8bee..5092397fac29 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 			log_level_unknown = 1;
 	}
 
-	if (!cpu_online(smp_processor_id()) &&
-	    system_state != SYSTEM_RUNNING) {
+	if (!cpu_online(smp_processor_id())) {
 		/*
 		 * Some console drivers may assume that per-cpu resources have
 		 * been allocated.  So don't allow them to be called by this
-- 
cgit v1.3-14-g43fede


From 19c324397a55edf122822f829779b46b9cb385dd Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 25 Jun 2005 14:55:17 -0700
Subject: [PATCH] swsusp: only allow it when it makes sense

Show swsuspend only on .config where it can compile.  I got this on PPC32 &&
SMP:

kernel/power/smp.c:24: error: storage size of `ctxt' isn't known

Also mark swsusp as no longer experimental.

Signed-off-by: Olaf Hering <olh@suse.de>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index fdb377636505..2c7121d9bff1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,8 +27,8 @@ config PM_DEBUG
 	like suspend support.
 
 config SOFTWARE_SUSPEND
-	bool "Software Suspend (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && PM && SWAP && (SUSPEND_SMP || !SMP)
+	bool "Software Suspend"
+	depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need APM.
-- 
cgit v1.3-14-g43fede


From e0f364f4069f76a3613a797c388832822d179076 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:06 -0700
Subject: [PATCH] sched: cleanup wake_idle

New sched-domains code means we don't get spans with offline CPUs in
them.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 76080d142e3d..86be13ee5006 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -927,14 +927,14 @@ static int wake_idle(int cpu, task_t *p)
 
 	for_each_domain(cpu, sd) {
 		if (sd->flags & SD_WAKE_IDLE) {
-			cpus_and(tmp, sd->span, cpu_online_map);
-			cpus_and(tmp, tmp, p->cpus_allowed);
+			cpus_and(tmp, sd->span, p->cpus_allowed);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i))
 					return i;
 			}
 		}
-		else break;
+		else
+			break;
 	}
 	return cpu;
 }
-- 
cgit v1.3-14-g43fede


From 8102679447da7fcbcb5226ee0207c3a034bc6d5f Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:07 -0700
Subject: [PATCH] sched: improve load balancing pinned tasks

John Hawkes explained the problem best:

	A large number of processes that are pinned to a single CPU results
	in every other CPU's load_balance() seeing this overloaded CPU as
	"busiest", yet move_tasks() never finds a task to pull-migrate.  This
	condition occurs during module unload, but can also occur as a
	denial-of-service using sys_sched_setaffinity().  Several hundred
	CPUs performing this fruitless load_balance() will livelock on the
	busiest CPU's runqueue lock.  A smaller number of CPUs will livelock
	if the pinned task count gets high.

Expanding slightly on John's patch, this one attempts to work out whether the
balancing failure has been due to too many tasks pinned on the runqueue.  This
allows it to be basically invisible to the regular balancing paths (ie.  when
there are no pinned tasks).  We can use this extra knowledge to shut down the
balancing faster, and ensure the migration threads don't start running which
is another problem observed in the wild.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 62 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 39 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 86be13ee5006..2794c79b9197 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1632,7 +1632,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-		     struct sched_domain *sd, enum idle_type idle)
+	     struct sched_domain *sd, enum idle_type idle, int *all_pinned)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -1640,10 +1640,12 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (task_running(rq, p))
-		return 0;
 	if (!cpu_isset(this_cpu, p->cpus_allowed))
 		return 0;
+	*all_pinned = 0;
+
+	if (task_running(rq, p))
+		return 0;
 
 	/*
 	 * Aggressive migration if:
@@ -1656,7 +1658,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 		return 1;
 
 	if (task_hot(p, rq->timestamp_last_tick, sd))
-			return 0;
+		return 0;
 	return 1;
 }
 
@@ -1669,16 +1671,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
 		      unsigned long max_nr_move, struct sched_domain *sd,
-		      enum idle_type idle)
+		      enum idle_type idle, int *all_pinned)
 {
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
-	int idx, pulled = 0;
+	int idx, pulled = 0, pinned = 0;
 	task_t *tmp;
 
-	if (max_nr_move <= 0 || busiest->nr_running <= 1)
+	if (max_nr_move == 0)
 		goto out;
 
+	pinned = 1;
+
 	/*
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
@@ -1717,7 +1721,7 @@ skip_queue:
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1746,6 +1750,9 @@ out:
 	 * inside pull_task().
 	 */
 	schedstat_add(sd, lb_gained[idle], pulled);
+
+	if (all_pinned)
+		*all_pinned = pinned;
 	return pulled;
 }
 
@@ -1917,7 +1924,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	struct sched_group *group;
 	runqueue_t *busiest;
 	unsigned long imbalance;
-	int nr_moved;
+	int nr_moved, all_pinned;
+	int active_balance = 0;
 
 	spin_lock(&this_rq->lock);
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -1956,9 +1964,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		 */
 		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
-						imbalance, sd, idle);
+						imbalance, sd, idle,
+						&all_pinned);
 		spin_unlock(&busiest->lock);
+
+		/* All tasks on this runqueue were pinned by CPU affinity */
+		if (unlikely(all_pinned))
+			goto out_balanced;
 	}
+
 	spin_unlock(&this_rq->lock);
 
 	if (!nr_moved) {
@@ -1966,16 +1980,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		sd->nr_balance_failed++;
 
 		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-			int wake = 0;
 
 			spin_lock(&busiest->lock);
 			if (!busiest->active_balance) {
 				busiest->active_balance = 1;
 				busiest->push_cpu = this_cpu;
-				wake = 1;
+				active_balance = 1;
 			}
 			spin_unlock(&busiest->lock);
-			if (wake)
+			if (active_balance)
 				wake_up_process(busiest->migration_thread);
 
 			/*
@@ -1984,18 +1997,21 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 			 */
 			sd->nr_balance_failed = sd->cache_nice_tries;
 		}
-
-		/*
-		 * We were unbalanced, but unsuccessful in move_tasks(),
-		 * so bump the balance_interval to lessen the lock contention.
-		 */
-		if (sd->balance_interval < sd->max_interval)
-			sd->balance_interval++;
-	} else {
+	} else
 		sd->nr_balance_failed = 0;
 
+	if (likely(!active_balance)) {
 		/* We were unbalanced, so reset the balancing interval */
 		sd->balance_interval = sd->min_interval;
+	} else {
+		/*
+		 * If we've begun active balancing, start to back off. This
+		 * case may not be covered by the all_pinned logic if there
+		 * is only 1 task on the busy runqueue (because we don't call
+		 * move_tasks).
+		 */
+		if (sd->balance_interval < sd->max_interval)
+			sd->balance_interval *= 2;
 	}
 
 	return nr_moved;
@@ -2047,7 +2063,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
 	schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
 	nr_moved = move_tasks(this_rq, this_cpu, busiest,
-					imbalance, sd, NEWLY_IDLE);
+					imbalance, sd, NEWLY_IDLE, NULL);
 	if (!nr_moved)
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
 
@@ -2126,7 +2142,7 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 				/* move a task from busiest_rq to target_rq */
 				double_lock_balance(busiest_rq, target_rq);
 				if (move_tasks(target_rq, cpu, busiest_rq,
-						1, sd, SCHED_IDLE)) {
+						1, sd, SCHED_IDLE, NULL)) {
 					schedstat_inc(sd, alb_pushed);
 				} else {
 					schedstat_inc(sd, alb_failed);
-- 
cgit v1.3-14-g43fede


From 16cfb1c04c3cbe3759f339d3333e7e1e7d59712a Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:08 -0700
Subject: [PATCH] sched: reduce active load balancing

Fix up active load balancing a bit so it doesn't get called when it shouldn't.
Reset the nr_balance_failed counter at more points where we have found
conditions to be balanced.  This reduces too aggressive active balancing seen
on some workloads.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2794c79b9197..03d737791c1a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2021,6 +2021,7 @@ out_balanced:
 
 	schedstat_inc(sd, lb_balanced[idle]);
 
+	sd->nr_balance_failed = 0;
 	/* tune up the balancing interval */
 	if (sd->balance_interval < sd->max_interval)
 		sd->balance_interval *= 2;
@@ -2046,16 +2047,14 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
 	if (!group) {
-		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
-		goto out;
+		goto out_balanced;
 	}
 
 	busiest = find_busiest_queue(group);
 	if (!busiest || busiest == this_rq) {
-		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
-		goto out;
+		goto out_balanced;
 	}
 
 	/* Attempt to move tasks */
@@ -2066,11 +2065,16 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 					imbalance, sd, NEWLY_IDLE, NULL);
 	if (!nr_moved)
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+	else
+		sd->nr_balance_failed = 0;
 
 	spin_unlock(&busiest->lock);
-
-out:
 	return nr_moved;
+
+out_balanced:
+	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+	sd->nr_balance_failed = 0;
+	return 0;
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 3950745131e23472fb5ace2ee4a2093e7590ec69 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:09 -0700
Subject: [PATCH] sched: fix SMT scheduling problems

SMT balancing has a couple of problems.  Firstly, active_load_balance is too
complex - basically it should be a dumb helper for when the periodic balancer
has determined there is an imbalance, but gets stuck because the task is
running.

So rip out all its "smarts", and just make it move one task to the target CPU.

Second, the busy CPU's sched-domain tree was being used for active balancing.
This means that it may not see that nr_balance_failed has reached a critical
level.  So use the target CPU's sched-domain tree for this.  We can do this
because we hold its runqueue lock.

Lastly, reset nr_balance_failed to a point where we allow cache hot migration.
This will help ensure active load balancing is successful.

Thanks to Suresh Siddha for pointing out these issues.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 76 ++++++++++++++++++++++++----------------------------------
 1 file changed, 31 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 03d737791c1a..41e69b5ee652 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1995,7 +1995,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 			 * We've kicked active balancing, reset the failure
 			 * counter.
 			 */
-			sd->nr_balance_failed = sd->cache_nice_tries;
+			sd->nr_balance_failed = sd->cache_nice_tries+1;
 		}
 	} else
 		sd->nr_balance_failed = 0;
@@ -2106,56 +2106,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 {
 	struct sched_domain *sd;
-	struct sched_group *cpu_group;
 	runqueue_t *target_rq;
-	cpumask_t visited_cpus;
-	int cpu;
+	int target_cpu = busiest_rq->push_cpu;
+
+	if (busiest_rq->nr_running <= 1)
+		/* no task to move */
+		return;
+
+	target_rq = cpu_rq(target_cpu);
 
 	/*
-	 * Search for suitable CPUs to push tasks to in successively higher
-	 * domains with SD_LOAD_BALANCE set.
+	 * This condition is "impossible", if it occurs
+	 * we need to fix it.  Originally reported by
+	 * Bjorn Helgaas on a 128-cpu setup.
 	 */
-	visited_cpus = CPU_MASK_NONE;
-	for_each_domain(busiest_cpu, sd) {
-		if (!(sd->flags & SD_LOAD_BALANCE))
-			/* no more domains to search */
-			break;
+	BUG_ON(busiest_rq == target_rq);
 
-		schedstat_inc(sd, alb_cnt);
+	/* move a task from busiest_rq to target_rq */
+	double_lock_balance(busiest_rq, target_rq);
 
-		cpu_group = sd->groups;
-		do {
-			for_each_cpu_mask(cpu, cpu_group->cpumask) {
-				if (busiest_rq->nr_running <= 1)
-					/* no more tasks left to move */
-					return;
-				if (cpu_isset(cpu, visited_cpus))
-					continue;
-				cpu_set(cpu, visited_cpus);
-				if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
-					continue;
-
-				target_rq = cpu_rq(cpu);
-				/*
-				 * This condition is "impossible", if it occurs
-				 * we need to fix it.  Originally reported by
-				 * Bjorn Helgaas on a 128-cpu setup.
-				 */
-				BUG_ON(busiest_rq == target_rq);
-
-				/* move a task from busiest_rq to target_rq */
-				double_lock_balance(busiest_rq, target_rq);
-				if (move_tasks(target_rq, cpu, busiest_rq,
-						1, sd, SCHED_IDLE, NULL)) {
-					schedstat_inc(sd, alb_pushed);
-				} else {
-					schedstat_inc(sd, alb_failed);
-				}
-				spin_unlock(&target_rq->lock);
-			}
-			cpu_group = cpu_group->next;
-		} while (cpu_group != sd->groups);
-	}
+	/* Search for an sd spanning us and the target CPU. */
+	for_each_domain(target_cpu, sd)
+		if ((sd->flags & SD_LOAD_BALANCE) &&
+			cpu_isset(busiest_cpu, sd->span))
+				break;
+
+	if (unlikely(sd == NULL))
+		goto out;
+
+	schedstat_inc(sd, alb_cnt);
+
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+		schedstat_inc(sd, alb_pushed);
+	else
+		schedstat_inc(sd, alb_failed);
+out:
+	spin_unlock(&target_rq->lock);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From db935dbd43c4290d710304662cc908f733afea06 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:11 -0700
Subject: [PATCH] sched: add debugging

These conditions should now be impossible, and we need to fix them if they
happen.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 41e69b5ee652..8b035a8b3c30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1942,15 +1942,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		goto out_balanced;
 	}
 
-	/*
-	 * This should be "impossible", but since load
-	 * balancing is inherently racy and statistical,
-	 * it could happen in theory.
-	 */
-	if (unlikely(busiest == this_rq)) {
-		WARN_ON(1);
-		goto out_balanced;
-	}
+	BUG_ON(busiest == this_rq);
 
 	schedstat_add(sd, lb_imbalance[idle], imbalance);
 
@@ -2052,11 +2044,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	}
 
 	busiest = find_busiest_queue(group);
-	if (!busiest || busiest == this_rq) {
+	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
 	}
 
+	BUG_ON(busiest == this_rq);
+
 	/* Attempt to move tasks */
 	double_lock_balance(this_rq, busiest);
 
-- 
cgit v1.3-14-g43fede


From 99b61ccf0bf0e9a85823d39a5db6a1519caeb13d Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:12 -0700
Subject: [PATCH] sched: less aggressive idle balancing

Remove the special casing for idle CPU balancing.  Things like this are
hurting, for example, on SMT, where a single sibling being idle doesn't really
warrant a really aggressive pull over the NUMA domain.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8b035a8b3c30..f665de34ed82 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1877,15 +1877,9 @@ nextgroup:
 
 	/* Get rid of the scaling factor, rounding down as we divide */
 	*imbalance = *imbalance / SCHED_LOAD_SCALE;
-
 	return busiest;
 
 out_balanced:
-	if (busiest && (idle == NEWLY_IDLE ||
-			(idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
-		*imbalance = 1;
-		return busiest;
-	}
 
 	*imbalance = 0;
 	return NULL;
-- 
cgit v1.3-14-g43fede


From 7897986bad8f6cd50d6149345aca7f6480f49464 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:13 -0700
Subject: [PATCH] sched: balance timers

Do CPU load averaging over a number of different intervals.  Allow each
interval to be chosen by sending a parameter to source_load and target_load.
0 is instantaneous, idx > 0 returns a decaying average with the most recent
sample weighted at 1/2^(idx-1).  To a maximum of 3 (could be easily increased).

So generally a higher number will result in more conservative balancing.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/topology.h   |   4 ++
 include/asm-x86_64/topology.h |   6 +-
 include/linux/sched.h         |   4 ++
 include/linux/topology.h      |   8 +++
 kernel/sched.c                | 138 ++++++++++++++++++++++--------------------
 5 files changed, 95 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 6d0f67507b21..0055fbfeec7b 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -74,6 +74,10 @@ static inline int node_to_first_cpu(int node)
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
+	.wake_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 8f77e9f6bc23..fe8d80a15751 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -39,7 +39,11 @@ extern int __node_distance(int, int);
 	.busy_factor		= 32,			\
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
-	.cache_nice_tries	= 1,			\
+	.cache_nice_tries	= 2,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 2,			\
+	.newidle_idx		= 1, 			\
+	.wake_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c69682b0444..664981ac1fb6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -488,6 +488,10 @@ struct sched_domain {
 	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
 	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
+	unsigned int busy_idx;
+	unsigned int idle_idx;
+	unsigned int newidle_idx;
+	unsigned int wake_idx;
 	int flags;			/* See SD_* */
 
 	/* Runtime fields. */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d70e8972c67f..ae9c2216dfa6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -89,6 +89,10 @@
 	.cache_hot_time		= 0,			\
 	.cache_nice_tries	= 0,			\
 	.per_cpu_gain		= 25,			\
+	.busy_idx		= 0,			\
+	.idle_idx		= 0,			\
+	.newidle_idx		= 0,			\
+	.wake_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
@@ -115,6 +119,10 @@
 	.cache_hot_time		= (5*1000000/2),	\
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
+	.busy_idx		= 2,			\
+	.idle_idx		= 0,			\
+	.newidle_idx		= 1,			\
+	.wake_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
diff --git a/kernel/sched.c b/kernel/sched.c
index f665de34ed82..b597b07e7911 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,7 +206,7 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
-	unsigned long cpu_load;
+	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
 
@@ -886,23 +886,27 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	if (type == 0)
+		return load_now;
 
-	return min(rq->cpu_load, load_now);
+	return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	if (type == 0)
+		return load_now;
 
-	return max(rq->cpu_load, load_now);
+	return max(rq->cpu_load[type-1], load_now);
 }
 
 #endif
@@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	runqueue_t *rq;
 #ifdef CONFIG_SMP
 	unsigned long load, this_load;
-	struct sched_domain *sd;
+	struct sched_domain *sd, *this_sd = NULL;
 	int new_cpu;
 #endif
 
@@ -986,72 +990,64 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-#ifdef CONFIG_SCHEDSTATS
+	new_cpu = cpu;
+
 	schedstat_inc(rq, ttwu_cnt);
 	if (cpu == this_cpu) {
 		schedstat_inc(rq, ttwu_local);
-	} else {
-		for_each_domain(this_cpu, sd) {
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
+		goto out_set_cpu;
+	}
+
+	for_each_domain(this_cpu, sd) {
+		if (cpu_isset(cpu, sd->span)) {
+			schedstat_inc(sd, ttwu_wake_remote);
+			this_sd = sd;
+			break;
 		}
 	}
-#endif
 
-	new_cpu = cpu;
-	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
-	load = source_load(cpu);
-	this_load = target_load(this_cpu);
-
 	/*
-	 * If sync wakeup then subtract the (maximum possible) effect of
-	 * the currently running task from the load of the current CPU:
+	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (sync)
-		this_load -= SCHED_LOAD_SCALE;
-
-	/* Don't pull the task off an idle CPU to a busy one */
-	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-		goto out_set_cpu;
+	if (this_sd) {
+		int idx = this_sd->wake_idx;
+		unsigned int imbalance;
 
-	new_cpu = this_cpu; /* Wake to this CPU if we can */
+		load = source_load(cpu, idx);
+		this_load = target_load(this_cpu, idx);
 
-	/*
-	 * Scan domains for affine wakeup and passive balancing
-	 * possibilities.
-	 */
-	for_each_domain(this_cpu, sd) {
-		unsigned int imbalance;
 		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
+		 * If sync wakeup then subtract the (maximum possible) effect of
+		 * the currently running task from the load of the current CPU:
 		 */
-		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+		if (sync)
+			this_load -= SCHED_LOAD_SCALE;
+
+		 /* Don't pull the task off an idle CPU to a busy one */
+		if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
+			goto out_set_cpu;
 
-		if ((sd->flags & SD_WAKE_AFFINE) &&
-				!task_hot(p, rq->timestamp_last_tick, sd)) {
+		new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+		if ((this_sd->flags & SD_WAKE_AFFINE) &&
+			!task_hot(p, rq->timestamp_last_tick, this_sd)) {
 			/*
 			 * This domain has SD_WAKE_AFFINE and p is cache cold
 			 * in this domain.
 			 */
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_move_affine);
-				goto out_set_cpu;
-			}
-		} else if ((sd->flags & SD_WAKE_BALANCE) &&
+			schedstat_inc(this_sd, ttwu_move_affine);
+			goto out_set_cpu;
+		} else if ((this_sd->flags & SD_WAKE_BALANCE) &&
 				imbalance*this_load <= 100*load) {
 			/*
 			 * This domain has SD_WAKE_BALANCE and there is
 			 * an imbalance.
 			 */
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_move_balance);
-				goto out_set_cpu;
-			}
+			schedstat_inc(this_sd, ttwu_move_balance);
+			goto out_set_cpu;
 		}
 	}
 
@@ -1509,7 +1505,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 	cpus_and(mask, sd->span, p->cpus_allowed);
 
 	for_each_cpu_mask(i, mask) {
-		load = target_load(i);
+		load = target_load(i, sd->wake_idx);
 
 		if (load < min_load) {
 			min_cpu = i;
@@ -1522,7 +1518,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 	}
 
 	/* add +1 to account for the new task */
-	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+	this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
 
 	/*
 	 * Would with the addition of the new task to the
@@ -1767,8 +1763,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	if (idle == NOT_IDLE)
+		load_idx = sd->busy_idx;
+	else if (idle == NEWLY_IDLE)
+		load_idx = sd->newidle_idx;
+	else
+		load_idx = sd->idle_idx;
 
 	do {
 		unsigned long load;
@@ -1783,9 +1786,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		for_each_cpu_mask(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i);
+				load = target_load(i, load_idx);
 			else
-				load = source_load(i);
+				load = source_load(i, load_idx);
 
 			avg_load += load;
 		}
@@ -1895,7 +1898,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i);
+		load = source_load(i, 0);
 
 		if (load > max_load) {
 			max_load = load;
@@ -2150,18 +2153,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
+	int i;
 
-	/* Update our load */
-	old_load = this_rq->cpu_load;
 	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-	/*
-	 * Round up the averaging division if load is increasing. This
-	 * prevents us from getting stuck on 9 if the load is 10, for
-	 * example.
-	 */
-	if (this_load > old_load)
-		old_load++;
-	this_rq->cpu_load = (old_load + this_load) / 2;
+	/* Update our load */
+	for (i = 0; i < 3; i++) {
+		unsigned long new_load = this_load;
+		int scale = 1 << i;
+		old_load = this_rq->cpu_load[i];
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale-1;
+		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+	}
 
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -4921,13 +4929,15 @@ void __init sched_init(void)
 
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
+		rq->nr_running = 0;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_dummy;
-		rq->cpu_load = 0;
+		for (j = 1; j < 3; j++)
+			rq->cpu_load[j] = 0;
 		rq->active_balance = 0;
 		rq->push_cpu = 0;
 		rq->migration_thread = NULL;
-- 
cgit v1.3-14-g43fede


From a3f21bce1fefdf92a4d1705e888d390b10f3ac6f Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:15 -0700
Subject: [PATCH] sched: tweak affine wakeups

Do fewer affine wakeups.  We're trying to reduce dbt2-pgsql idle time
regressions here...  make sure we don't move tasks the wrong way in an
imbalance condition.  Also, remove the cache coldness requirement from the
calculation - this seems to induce sharp cutoff points where behaviour will
suddenly change on some workloads if the load creeps slightly over or under
some point.  It is good for periodic balancing because in that case we
otherwise have no other context to determine what task to move.

But also make a minor tweak to "wake balancing" - the imbalance tolerance is
now set at half the domain's imbalance, so we get the opportunity to do wake
balancing before the more random periodic rebalancing gets performed.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 57 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b597b07e7911..5ae3568eed0b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1016,38 +1016,45 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 		int idx = this_sd->wake_idx;
 		unsigned int imbalance;
 
+		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
 		load = source_load(cpu, idx);
 		this_load = target_load(this_cpu, idx);
 
-		/*
-		 * If sync wakeup then subtract the (maximum possible) effect of
-		 * the currently running task from the load of the current CPU:
-		 */
-		if (sync)
-			this_load -= SCHED_LOAD_SCALE;
-
-		 /* Don't pull the task off an idle CPU to a busy one */
-		if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-			goto out_set_cpu;
-
 		new_cpu = this_cpu; /* Wake to this CPU if we can */
 
-		if ((this_sd->flags & SD_WAKE_AFFINE) &&
-			!task_hot(p, rq->timestamp_last_tick, this_sd)) {
-			/*
-			 * This domain has SD_WAKE_AFFINE and p is cache cold
-			 * in this domain.
-			 */
-			schedstat_inc(this_sd, ttwu_move_affine);
-			goto out_set_cpu;
-		} else if ((this_sd->flags & SD_WAKE_BALANCE) &&
-				imbalance*this_load <= 100*load) {
+		if (this_sd->flags & SD_WAKE_AFFINE) {
+			unsigned long tl = this_load;
 			/*
-			 * This domain has SD_WAKE_BALANCE and there is
-			 * an imbalance.
+			 * If sync wakeup then subtract the (maximum possible)
+			 * effect of the currently running task from the load
+			 * of the current CPU:
 			 */
-			schedstat_inc(this_sd, ttwu_move_balance);
-			goto out_set_cpu;
+			if (sync)
+				tl -= SCHED_LOAD_SCALE;
+
+			if ((tl <= load &&
+				tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+				100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+				/*
+				 * This domain has SD_WAKE_AFFINE and
+				 * p is cache cold in this domain, and
+				 * there is no bad imbalance.
+				 */
+				schedstat_inc(this_sd, ttwu_move_affine);
+				goto out_set_cpu;
+			}
+		}
+
+		/*
+		 * Start passive balancing when half the imbalance_pct
+		 * limit is reached.
+		 */
+		if (this_sd->flags & SD_WAKE_BALANCE) {
+			if (imbalance*this_load <= 100*load) {
+				schedstat_inc(this_sd, ttwu_move_balance);
+				goto out_set_cpu;
+			}
 		}
 	}
 
-- 
cgit v1.3-14-g43fede


From cafb20c1f9976a70d633bb1e1c8c24eab00e4e80 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:17 -0700
Subject: [PATCH] sched: no aggressive idle balancing

Remove the very aggressive idle stuff that has recently gone into 2.6 - it is
going against the direction we are trying to go.  Hopefully we can regain
performance through other methods.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/topology.h   |  1 -
 include/asm-x86_64/topology.h |  1 -
 include/linux/topology.h      |  1 -
 kernel/sched.c                | 21 ++-------------------
 4 files changed, 2 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 0055fbfeec7b..5eb6f61dcefc 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -82,7 +82,6 @@ static inline int node_to_first_cpu(int node)
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_NEWIDLE	\
-				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index fe8d80a15751..9cb7459ce722 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -48,7 +48,6 @@ extern int __node_distance(int, int);
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
-				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/include/linux/topology.h b/include/linux/topology.h
index ae9c2216dfa6..b23ec64df7f1 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -127,7 +127,6 @@
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/kernel/sched.c b/kernel/sched.c
index 5ae3568eed0b..396724a2519f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -414,22 +414,6 @@ static inline runqueue_t *this_rq_lock(void)
 	return rq;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static int cpu_and_siblings_are_idle(int cpu)
-{
-	int sib;
-	for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
-		if (idle_cpu(sib))
-			continue;
-		return 0;
-	}
-
-	return 1;
-}
-#else
-#define cpu_and_siblings_are_idle(A) idle_cpu(A)
-#endif
-
 #ifdef CONFIG_SCHEDSTATS
 /*
  * Called when a process is dequeued from the active array and given
@@ -1652,12 +1636,11 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 
 	/*
 	 * Aggressive migration if:
-	 * 1) the [whole] cpu is idle, or
+	 * 1) task is cache cold, or
 	 * 2) too many balance attempts have failed.
 	 */
 
-	if (cpu_and_siblings_are_idle(this_cpu) || \
-			sd->nr_balance_failed > sd->cache_nice_tries)
+	if (sd->nr_balance_failed > sd->cache_nice_tries)
 		return 1;
 
 	if (task_hot(p, rq->timestamp_last_tick, sd))
-- 
cgit v1.3-14-g43fede


From 147cbb4bbe991452698f0772d8292f22825710ba Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:19 -0700
Subject: [PATCH] sched: balance on fork

Reimplement the balance on exec balancing to be sched-domains aware.  Use this
to also do balance on fork balancing.  Make x86_64 do balance on fork over the
NUMA domain.

The problem with the non-sched-domains-aware balancing became apparent on dual
core, multi socket Opterons.  What we want is for the new tasks to be sent to
a different socket, but more often than not, we would first load up our
sibling core, or fill two cores of a single remote socket before selecting a
new one.

This gives large improvements to STREAM on such systems.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/topology.h |   2 +
 include/linux/sched.h         |  10 +--
 include/linux/topology.h      |   2 +
 kernel/sched.c                | 164 ++++++++++++++++++++++++++++--------------
 4 files changed, 119 insertions(+), 59 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 9cb7459ce722..802d09b9c99f 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -44,9 +44,11 @@ extern int __node_distance(int, int);
 	.idle_idx		= 2,			\
 	.newidle_idx		= 1, 			\
 	.wake_idx		= 1,			\
+	.forkexec_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 664981ac1fb6..613491d3a875 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -460,10 +460,11 @@ enum idle_type
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
 #define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		4	/* Balance on exec */
-#define SD_WAKE_IDLE		8	/* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE		16	/* Wake task to waking CPU */
-#define SD_WAKE_BALANCE		32	/* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER	64	/* Domain members share cpu power */
+#define SD_BALANCE_FORK		8	/* Balance on fork, clone */
+#define SD_WAKE_IDLE		16	/* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE		32	/* Wake task to waking CPU */
+#define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -492,6 +493,7 @@ struct sched_domain {
 	unsigned int idle_idx;
 	unsigned int newidle_idx;
 	unsigned int wake_idx;
+	unsigned int forkexec_idx;
 	int flags;			/* See SD_* */
 
 	/* Runtime fields. */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index b23ec64df7f1..665597207def 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -93,6 +93,7 @@
 	.idle_idx		= 0,			\
 	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
+	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
@@ -123,6 +124,7 @@
 	.idle_idx		= 0,			\
 	.newidle_idx		= 1,			\
 	.wake_idx		= 1,			\
+	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
diff --git a/kernel/sched.c b/kernel/sched.c
index 396724a2519f..7ecc237e2aab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -893,6 +893,79 @@ static inline unsigned long target_load(int cpu, int type)
 	return max(rq->cpu_load[type-1], load_now);
 }
 
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long min_load = ULONG_MAX, this_load = 0;
+	int load_idx = sd->forkexec_idx;
+	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+	do {
+		unsigned long load, avg_load;
+		int local_group;
+		int i;
+
+		local_group = cpu_isset(this_cpu, group->cpumask);
+		/* XXX: put a cpus allowed check */
+
+		/* Tally up the load of all CPUs in the group */
+		avg_load = 0;
+
+		for_each_cpu_mask(i, group->cpumask) {
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = source_load(i, load_idx);
+			else
+				load = target_load(i, load_idx);
+
+			avg_load += load;
+		}
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+		} else if (avg_load < min_load) {
+			min_load = avg_load;
+			idlest = group;
+		}
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!idlest || 100*this_load < imbalance*min_load)
+		return NULL;
+	return idlest;
+}
+
+/*
+ * find_idlest_cpu - find the idlest runqueue among the cpus in group.
+ */
+static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+{
+	unsigned long load, min_load = ULONG_MAX;
+	int idlest = -1;
+	int i;
+
+	for_each_cpu_mask(i, group->cpumask) {
+		load = source_load(i, 0);
+
+		if (load < min_load || (load == min_load && i == this_cpu)) {
+			min_load = load;
+			idlest = i;
+		}
+	}
+
+	return idlest;
+}
+
+
 #endif
 
 /*
@@ -1107,11 +1180,6 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
 	return try_to_wake_up(p, state, 0);
 }
 
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-			   struct sched_domain *sd);
-#endif
-
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1181,12 +1249,38 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 	unsigned long flags;
 	int this_cpu, cpu;
 	runqueue_t *rq, *this_rq;
+#ifdef CONFIG_SMP
+	struct sched_domain *tmp, *sd = NULL;
+#endif
 
 	rq = task_rq_lock(p, &flags);
-	cpu = task_cpu(p);
+	BUG_ON(p->state != TASK_RUNNING);
 	this_cpu = smp_processor_id();
+	cpu = task_cpu(p);
 
-	BUG_ON(p->state != TASK_RUNNING);
+#ifdef CONFIG_SMP
+	for_each_domain(cpu, tmp)
+		if (tmp->flags & SD_BALANCE_FORK)
+			sd = tmp;
+
+	if (sd) {
+		struct sched_group *group;
+
+		cpu = task_cpu(p);
+		group = find_idlest_group(sd, p, cpu);
+		if (group) {
+			int new_cpu;
+			new_cpu = find_idlest_cpu(group, cpu);
+			if (new_cpu != -1 && new_cpu != cpu &&
+					cpu_isset(new_cpu, p->cpus_allowed)) {
+				set_task_cpu(p, new_cpu);
+				task_rq_unlock(rq, &flags);
+				rq = task_rq_lock(p, &flags);
+				cpu = task_cpu(p);
+			}
+		}
+	}
+#endif
 
 	/*
 	 * We decrease the sleep average of forking parents
@@ -1480,51 +1574,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 	}
 }
 
-/*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-			   struct sched_domain *sd)
-{
-	unsigned long load, min_load, this_load;
-	int i, min_cpu;
-	cpumask_t mask;
-
-	min_cpu = UINT_MAX;
-	min_load = ULONG_MAX;
-
-	cpus_and(mask, sd->span, p->cpus_allowed);
-
-	for_each_cpu_mask(i, mask) {
-		load = target_load(i, sd->wake_idx);
-
-		if (load < min_load) {
-			min_cpu = i;
-			min_load = load;
-
-			/* break out early on an idle CPU: */
-			if (!min_load)
-				break;
-		}
-	}
-
-	/* add +1 to account for the new task */
-	this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
-
-	/*
-	 * Would with the addition of the new task to the
-	 * current CPU there be an imbalance between this
-	 * CPU and the idlest CPU?
-	 *
-	 * Use half of the balancing threshold - new-context is
-	 * a good opportunity to balance.
-	 */
-	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
-		return min_cpu;
-
-	return this_cpu;
-}
-
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -1578,8 +1627,15 @@ void sched_exec(void)
 			sd = tmp;
 
 	if (sd) {
+		struct sched_group *group;
 		schedstat_inc(sd, sbe_attempts);
-		new_cpu = find_idlest_cpu(current, this_cpu, sd);
+		group = find_idlest_group(sd, current, this_cpu);
+		if (!group)
+			goto out;
+		new_cpu = find_idlest_cpu(group, this_cpu);
+		if (new_cpu == -1)
+			goto out;
+
 		if (new_cpu != this_cpu) {
 			schedstat_inc(sd, sbe_pushed);
 			put_cpu();
@@ -1792,12 +1848,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
-			goto nextgroup;
 		} else if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
 		}
-nextgroup:
 		group = group->next;
 	} while (group != sd->groups);
 
-- 
cgit v1.3-14-g43fede


From 68767a0ae428801649d510d9a65bb71feed44dd1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:20 -0700
Subject: [PATCH] sched: schedstats update for balance on fork

Add SCHEDSTAT statistics for sched-balance-fork.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 10 ++++++--
 kernel/sched.c        | 63 +++++++++++++++++++++++++++++----------------------
 2 files changed, 44 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 613491d3a875..36a10781c3f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -517,10 +517,16 @@ struct sched_domain {
 	unsigned long alb_failed;
 	unsigned long alb_pushed;
 
-	/* sched_balance_exec() stats */
-	unsigned long sbe_attempts;
+	/* SD_BALANCE_EXEC stats */
+	unsigned long sbe_cnt;
+	unsigned long sbe_balanced;
 	unsigned long sbe_pushed;
 
+	/* SD_BALANCE_FORK stats */
+	unsigned long sbf_cnt;
+	unsigned long sbf_balanced;
+	unsigned long sbf_pushed;
+
 	/* try_to_wake_up() stats */
 	unsigned long ttwu_wake_remote;
 	unsigned long ttwu_move_affine;
diff --git a/kernel/sched.c b/kernel/sched.c
index 7ecc237e2aab..2711130cd973 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,7 +309,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 11
+#define SCHEDSTAT_VERSION 12
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -356,9 +356,10 @@ static int show_schedstat(struct seq_file *seq, void *v)
 				    sd->lb_nobusyq[itype],
 				    sd->lb_nobusyg[itype]);
 			}
-			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
 			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
-			    sd->sbe_pushed, sd->sbe_attempts,
+			    sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+			    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
 		}
 #endif
@@ -1264,24 +1265,34 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 			sd = tmp;
 
 	if (sd) {
+		int new_cpu;
 		struct sched_group *group;
 
+		schedstat_inc(sd, sbf_cnt);
 		cpu = task_cpu(p);
 		group = find_idlest_group(sd, p, cpu);
-		if (group) {
-			int new_cpu;
-			new_cpu = find_idlest_cpu(group, cpu);
-			if (new_cpu != -1 && new_cpu != cpu &&
-					cpu_isset(new_cpu, p->cpus_allowed)) {
-				set_task_cpu(p, new_cpu);
-				task_rq_unlock(rq, &flags);
-				rq = task_rq_lock(p, &flags);
-				cpu = task_cpu(p);
-			}
+		if (!group) {
+			schedstat_inc(sd, sbf_balanced);
+			goto no_forkbalance;
+		}
+
+		new_cpu = find_idlest_cpu(group, cpu);
+		if (new_cpu == -1 || new_cpu == cpu) {
+			schedstat_inc(sd, sbf_balanced);
+			goto no_forkbalance;
+		}
+
+		if (cpu_isset(new_cpu, p->cpus_allowed)) {
+			schedstat_inc(sd, sbf_pushed);
+			set_task_cpu(p, new_cpu);
+			task_rq_unlock(rq, &flags);
+			rq = task_rq_lock(p, &flags);
+			cpu = task_cpu(p);
 		}
 	}
-#endif
 
+no_forkbalance:
+#endif
 	/*
 	 * We decrease the sleep average of forking parents
 	 * and children as well, to keep max-interactive tasks
@@ -1618,30 +1629,28 @@ void sched_exec(void)
 	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
 
-	/* Prefer the current CPU if there's only this task running */
-	if (this_rq()->nr_running <= 1)
-		goto out;
-
 	for_each_domain(this_cpu, tmp)
 		if (tmp->flags & SD_BALANCE_EXEC)
 			sd = tmp;
 
 	if (sd) {
 		struct sched_group *group;
-		schedstat_inc(sd, sbe_attempts);
+		schedstat_inc(sd, sbe_cnt);
 		group = find_idlest_group(sd, current, this_cpu);
-		if (!group)
+		if (!group) {
+			schedstat_inc(sd, sbe_balanced);
 			goto out;
+		}
 		new_cpu = find_idlest_cpu(group, this_cpu);
-		if (new_cpu == -1)
+		if (new_cpu == -1 || new_cpu == this_cpu) {
+			schedstat_inc(sd, sbe_balanced);
 			goto out;
-
-		if (new_cpu != this_cpu) {
-			schedstat_inc(sd, sbe_pushed);
-			put_cpu();
-			sched_migrate_task(current, new_cpu);
-			return;
 		}
+
+		schedstat_inc(sd, sbe_pushed);
+		put_cpu();
+		sched_migrate_task(current, new_cpu);
+		return;
 	}
 out:
 	put_cpu();
-- 
cgit v1.3-14-g43fede


From 48c08d3f8ff94fa118187e4d8d4a5707bb85e59d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 25 Jun 2005 14:57:22 -0700
Subject: [PATCH] sched: uninline task_timeslice

      "Chen, Kenneth W" <kenneth.w.chen@intel.com>

uninline task_timeslice() - reduces code footprint noticeably, and it's
slowpath code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2711130cd973..98bf1c091da5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
 #define SCALE_PRIO(x, prio) \
 	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
 
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
 {
 	if (p->static_prio < NICE_TO_PRIO(0))
 		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
-- 
cgit v1.3-14-g43fede


From 4866cde064afbb6c2a488c265e696879de616daa Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:23 -0700
Subject: [PATCH] sched: cleanup context switch locking

Instead of requiring architecture code to interact with the scheduler's
locking implementation, provide a couple of defines that can be used by the
architecture to request runqueue unlocked context switches, and ask for
interrupts to be enabled over the context switch.

Also replaces the "switch_lock" used by these architectures with an oncpu
flag (note, not a potentially slow bitflag).  This eliminates one bus
locked memory operation when context switching, and simplifies the
task_running function.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-arm/system.h     |  30 ++--------
 include/asm-ia64/system.h    |  10 +---
 include/asm-mips/system.h    |  10 +---
 include/asm-s390/system.h    |  17 +-----
 include/asm-sparc/system.h   |   4 +-
 include/asm-sparc64/system.h |  14 ++---
 include/linux/init_task.h    |   1 -
 include/linux/sched.h        |  10 +++-
 kernel/sched.c               | 132 +++++++++++++++++++++++++++++++++++--------
 9 files changed, 131 insertions(+), 97 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-arm/system.h b/include/asm-arm/system.h
index 39dd7008013c..3d0d2860b6db 100644
--- a/include/asm-arm/system.h
+++ b/include/asm-arm/system.h
@@ -145,34 +145,12 @@ extern unsigned int user_debug;
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)
 #define nop() __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t");
 
-#ifdef CONFIG_SMP
 /*
- * Define our own context switch locking.  This allows us to enable
- * interrupts over the context switch, otherwise we end up with high
- * interrupt latency.  The real problem area is switch_mm() which may
- * do a full cache flush.
+ * switch_mm() may do a full cache flush over the context switch,
+ * so enable interrupts over the context switch to avoid high
+ * latency.
  */
-#define prepare_arch_switch(rq,next)					\
-do {									\
-	spin_lock(&(next)->switch_lock);				\
-	spin_unlock_irq(&(rq)->lock);					\
-} while (0)
-
-#define finish_arch_switch(rq,prev)					\
-	spin_unlock(&(prev)->switch_lock)
-
-#define task_running(rq,p)						\
-	((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
-#else
-/*
- * Our UP-case is more simple, but we assume knowledge of how
- * spin_unlock_irq() and friends are implemented.  This avoids
- * us needlessly decrementing and incrementing the preempt count.
- */
-#define prepare_arch_switch(rq,next)	local_irq_enable()
-#define finish_arch_switch(rq,prev)	spin_unlock(&(rq)->lock)
-#define task_running(rq,p)		((rq)->curr == (p))
-#endif
+#define __ARCH_WANT_INTERRUPTS_ON_CTXSW
 
 /*
  * switch_to(prev, next) should switch from task `prev' to `next'
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h
index 6f516e76d1f0..cd2cf76b2db1 100644
--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -183,8 +183,6 @@ do {								\
 
 #ifdef __KERNEL__
 
-#define prepare_to_switch()    do { } while(0)
-
 #ifdef CONFIG_IA32_SUPPORT
 # define IS_IA32_PROCESS(regs)	(ia64_psr(regs)->is != 0)
 #else
@@ -274,13 +272,7 @@ extern void ia64_load_extra (struct task_struct *task);
  * of that CPU which will not be released, because there we wait for the
  * tasklist_lock to become available.
  */
-#define prepare_arch_switch(rq, next)		\
-do {						\
-	spin_lock(&(next)->switch_lock);	\
-	spin_unlock(&(rq)->lock);		\
-} while (0)
-#define finish_arch_switch(rq, prev)	spin_unlock_irq(&(prev)->switch_lock)
-#define task_running(rq, p) 		((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
+#define __ARCH_WANT_UNLOCKED_CTXSW
 
 #define ia64_platform_is(x) (strcmp(x, platform_name) == 0)
 
diff --git a/include/asm-mips/system.h b/include/asm-mips/system.h
index 888fd8908467..169f3d4265b1 100644
--- a/include/asm-mips/system.h
+++ b/include/asm-mips/system.h
@@ -422,16 +422,10 @@ extern void __die_if_kernel(const char *, struct pt_regs *, const char *file,
 extern int stop_a_enabled;
 
 /*
- * Taken from include/asm-ia64/system.h; prevents deadlock on SMP
+ * See include/asm-ia64/system.h; prevents deadlock on SMP
  * systems.
  */
-#define prepare_arch_switch(rq, next)		\
-do {						\
-	spin_lock(&(next)->switch_lock);	\
-	spin_unlock(&(rq)->lock);		\
-} while (0)
-#define finish_arch_switch(rq, prev)	spin_unlock_irq(&(prev)->switch_lock)
-#define task_running(rq, p) 		((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
+#define __ARCH_WANT_UNLOCKED_CTXSW
 
 #define arch_align_stack(x) (x)
 
diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h
index e3cb3ce1d24a..b4a9f05a93d6 100644
--- a/include/asm-s390/system.h
+++ b/include/asm-s390/system.h
@@ -104,29 +104,18 @@ static inline void restore_access_regs(unsigned int *acrs)
 	prev = __switch_to(prev,next);					     \
 } while (0)
 
-#define prepare_arch_switch(rq, next)	do { } while(0)
-#define task_running(rq, p)		((rq)->curr == (p))
-
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 extern void account_user_vtime(struct task_struct *);
 extern void account_system_vtime(struct task_struct *);
-
-#define finish_arch_switch(rq, prev) do {				     \
-	set_fs(current->thread.mm_segment);				     \
-	spin_unlock(&(rq)->lock);					     \
-	account_system_vtime(prev);					     \
-	local_irq_enable();						     \
-} while (0)
-
 #else
+#define account_system_vtime(prev) do { } while (0)
+#endif
 
 #define finish_arch_switch(rq, prev) do {				     \
 	set_fs(current->thread.mm_segment);				     \
-	spin_unlock_irq(&(rq)->lock);					     \
+	account_system_vtime(prev);					     \
 } while (0)
 
-#endif
-
 #define nop() __asm__ __volatile__ ("nop")
 
 #define xchg(ptr,x) \
diff --git a/include/asm-sparc/system.h b/include/asm-sparc/system.h
index 80cf20cfaee1..898562ebe94c 100644
--- a/include/asm-sparc/system.h
+++ b/include/asm-sparc/system.h
@@ -101,7 +101,7 @@ extern void fpsave(unsigned long *fpregs, unsigned long *fsr,
  * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work)
  * XXX WTF is the above comment? Found in late teen 2.4.x.
  */
-#define prepare_arch_switch(rq, next) do { \
+#define prepare_arch_switch(next) do { \
 	__asm__ __volatile__( \
 	".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \
 	"save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \
@@ -109,8 +109,6 @@ extern void fpsave(unsigned long *fpregs, unsigned long *fsr,
 	"save %sp, -0x40, %sp\n\t" \
 	"restore; restore; restore; restore; restore; restore; restore"); \
 } while(0)
-#define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-#define task_running(rq, p)		((rq)->curr == (p))
 
 	/* Much care has gone into this code, do not touch it.
 	 *
diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h
index fd12ca386f48..f9be2c5b4dc9 100644
--- a/include/asm-sparc64/system.h
+++ b/include/asm-sparc64/system.h
@@ -139,19 +139,13 @@ extern void __flushw_user(void);
 #define flush_user_windows flushw_user
 #define flush_register_windows flushw_all
 
-#define prepare_arch_switch(rq, next)		\
-do {	spin_lock(&(next)->switch_lock);	\
-	spin_unlock(&(rq)->lock);		\
+/* Don't hold the runqueue lock over context switch */
+#define __ARCH_WANT_UNLOCKED_CTXSW
+#define prepare_arch_switch(next)		\
+do {						\
 	flushw_all();				\
 } while (0)
 
-#define finish_arch_switch(rq, prev)		\
-do {	spin_unlock_irq(&(prev)->switch_lock);	\
-} while (0)
-
-#define task_running(rq, p) \
-	((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
-
 	/* See what happens when you design the chip correctly?
 	 *
 	 * We tell gcc we clobber all non-fixed-usage registers except
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a6a8c1a38d5e..03206a425d7a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,7 +108,6 @@ extern struct group_info init_groups;
 	.blocked	= {{0}},					\
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
-	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 36a10781c3f3..d27be9337425 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -368,6 +368,11 @@ struct signal_struct {
 #endif
 };
 
+/* Context switch must be unlocked if interrupts are to be enabled */
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+# define __ARCH_WANT_UNLOCKED_CTXSW
+#endif
+
 /*
  * Bits in flags field of signal_struct.
  */
@@ -594,6 +599,9 @@ struct task_struct {
 
 	int lock_depth;		/* BKL lock depth */
 
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	int oncpu;
+#endif
 	int prio, static_prio;
 	struct list_head run_list;
 	prio_array_t *array;
@@ -716,8 +724,6 @@ struct task_struct {
 	spinlock_t alloc_lock;
 /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
 	spinlock_t proc_lock;
-/* context-switch lock */
-	spinlock_t switch_lock;
 
 /* journalling filesystem info */
 	void *journal_info;
diff --git a/kernel/sched.c b/kernel/sched.c
index 98bf1c091da5..b1410577f9a8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -268,14 +268,71 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-/*
- * Default context-switch locking:
- */
 #ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next)	do { } while (0)
-# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)		((rq)->curr == (p))
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)	do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+	return rq->curr == p;
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+	spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+	return p->oncpu;
+#else
+	return rq->curr == p;
+#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	spin_unlock_irq(&rq->lock);
+#else
+	spin_unlock(&rq->lock);
 #endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -1196,17 +1253,14 @@ void fastcall sched_fork(task_t *p)
 	p->state = TASK_RUNNING;
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
-	spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	p->oncpu = 0;
+#endif
 #ifdef CONFIG_PREEMPT
-	/*
-	 * During context-switch we hold precisely one spinlock, which
-	 * schedule_tail drops. (in the common case it's this_rq()->lock,
-	 * but it also can be p->switch_lock.) So we compensate with a count
-	 * of 1. Also, we want to start with kernel preemption disabled.
-	 */
+	/* Want to start with kernel preemption disabled. */
 	p->thread_info->preempt_count = 1;
 #endif
 	/*
@@ -1387,23 +1441,41 @@ void fastcall sched_exit(task_t * p)
 	task_rq_unlock(rq, &flags);
 }
 
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+	prepare_lock_switch(rq, next);
+	prepare_arch_switch(next);
+}
+
 /**
  * finish_task_switch - clean up after a task-switch
  * @prev: the thread we just switched away from.
  *
- * We enter this with the runqueue still locked, and finish_arch_switch()
- * will unlock it along with doing any other architecture-specific cleanup
- * actions.
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
  * so, we finish that here outside of the runqueue lock.  (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
 	__releases(rq->lock)
 {
-	runqueue_t *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
 	unsigned long prev_task_flags;
 
@@ -1421,7 +1493,8 @@ static inline void finish_task_switch(task_t *prev)
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_task_flags = prev->flags;
-	finish_arch_switch(rq, prev);
+	finish_arch_switch(prev);
+	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_task_flags & PF_DEAD))
@@ -1435,8 +1508,12 @@ static inline void finish_task_switch(task_t *prev)
 asmlinkage void schedule_tail(task_t *prev)
 	__releases(rq->lock)
 {
-	finish_task_switch(prev);
-
+	runqueue_t *rq = this_rq();
+	finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+	/* In this case, finish_task_switch does not reenable preemption */
+	preempt_enable();
+#endif
 	if (current->set_child_tid)
 		put_user(current->pid, current->set_child_tid);
 }
@@ -2816,11 +2893,15 @@ switch_tasks:
 		rq->curr = next;
 		++*switch_count;
 
-		prepare_arch_switch(rq, next);
+		prepare_task_switch(rq, next);
 		prev = context_switch(rq, prev, next);
 		barrier();
-
-		finish_task_switch(prev);
+		/*
+		 * this_rq must be evaluated again because prev may have moved
+		 * CPUs since it called schedule(), thus the 'rq' on its stack
+		 * frame will be invalid.
+		 */
+		finish_task_switch(this_rq(), prev);
 	} else
 		spin_unlock_irq(&rq->lock);
 
@@ -4085,6 +4166,9 @@ void __devinit init_idle(task_t *idle, int cpu)
 
 	spin_lock_irqsave(&rq->lock, flags);
 	rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	idle->oncpu = 1;
+#endif
 	set_tsk_need_resched(idle);
 	spin_unlock_irqrestore(&rq->lock, flags);
 
-- 
cgit v1.3-14-g43fede


From 41c7ce9ad9a859871dffbe7dbc8b1f9571724e3c Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:24 -0700
Subject: [PATCH] sched: null domains

Fix the last 2 places that directly access a runqueue's sched-domain and
assume it cannot be NULL.

That allows the use of NULL for domain, instead of a dummy domain, to signify
no balancing is to happen.  No functional changes.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b1410577f9a8..77c07c2928b9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2579,11 +2579,15 @@ out:
 #ifdef CONFIG_SCHED_SMT
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = this_rq->sd;
+	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
 	int i;
 
-	if (!(sd->flags & SD_SHARE_CPUPOWER))
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_SHARE_CPUPOWER)
+			sd = tmp;
+
+	if (!sd)
 		return;
 
 	/*
@@ -2624,13 +2628,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 
 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = this_rq->sd;
+	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
 	prio_array_t *array;
 	int ret = 0, i;
 	task_t *p;
 
-	if (!(sd->flags & SD_SHARE_CPUPOWER))
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_SHARE_CPUPOWER)
+			sd = tmp;
+
+	if (!sd)
 		return 0;
 
 	/*
@@ -4617,6 +4625,11 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
 	int level = 0;
 
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
 	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
 	do {
@@ -4874,7 +4887,7 @@ static void __devinit arch_init_sched_domains(void)
 	cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
 
 	/*
-	 * Set up domains. Isolated domains just stay on the dummy domain.
+	 * Set up domains. Isolated domains just stay on the NULL domain.
 	 */
 	for_each_cpu_mask(i, cpu_default_map) {
 		int group;
@@ -4987,18 +5000,11 @@ static void __devinit arch_destroy_sched_domains(void)
 
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
-/*
- * Initial dummy domain for early boot and for hotplug cpu. Being static,
- * it is initialized to zero, so all balancing flags are cleared which is
- * what we want.
- */
-static struct sched_domain sched_domain_dummy;
-
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
  * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to a "dummy" domain
+ * code, so we temporarily attach all running cpus to the NULL domain
  * which will prevent rebalancing while the sched domains are recalculated.
  */
 static int update_sched_domains(struct notifier_block *nfb,
@@ -5010,7 +5016,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_UP_PREPARE:
 	case CPU_DOWN_PREPARE:
 		for_each_online_cpu(i)
-			cpu_attach_domain(&sched_domain_dummy, i);
+			cpu_attach_domain(NULL, i);
 		arch_destroy_sched_domains();
 		return NOTIFY_OK;
 
@@ -5072,7 +5078,7 @@ void __init sched_init(void)
 		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
-		rq->sd = &sched_domain_dummy;
+		rq->sd = NULL;
 		for (j = 1; j < 3; j++)
 			rq->cpu_load[j] = 0;
 		rq->active_balance = 0;
-- 
cgit v1.3-14-g43fede


From 245af2c7870bd5940f7bfad19a0a03b32751fbc5 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Sat, 25 Jun 2005 14:57:25 -0700
Subject: [PATCH] sched: remove degenerate domains

Remove degenerate scheduler domains during the sched-domain init.

For example, on x86_64 we always have NUMA configured in.  On Intel EM64T
systems, the topmost sched domain will be the NUMA domain, with only one
sched_group in it.

With fork/exec balancing (Nick's recent fixes in the -mm tree), we always end
up taking wrong decisions because of this topmost domain (as it contains only
one group and find_idlest_group always returns NULL).  We will end up loading
the HT package completely first, letting active load balancing kick in and
correct it.

In general, this patch also makes sense without Nick's recent fixes in -mm.

From: Nick Piggin <nickpiggin@yahoo.com.au>

Modified to account for more than just sched_groups when scanning for
degenerate domains by Nick Piggin.  And allow a runqueue's sd to go NULL
rather than keep a single degenerate domain around (this happens when you run
with maxcpus=1).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 77c07c2928b9..e75b301b5340 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4712,6 +4712,57 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 #define sched_domain_debug(sd, cpu) {}
 #endif
 
+static int __devinit sd_degenerate(struct sched_domain *sd)
+{
+	if (cpus_weight(sd->span) == 1)
+		return 1;
+
+	/* Following flags need at least 2 groups */
+	if (sd->flags & (SD_LOAD_BALANCE |
+			 SD_BALANCE_NEWIDLE |
+			 SD_BALANCE_FORK |
+			 SD_BALANCE_EXEC)) {
+		if (sd->groups != sd->groups->next)
+			return 0;
+	}
+
+	/* Following flags don't use groups */
+	if (sd->flags & (SD_WAKE_IDLE |
+			 SD_WAKE_AFFINE |
+			 SD_WAKE_BALANCE))
+		return 0;
+
+	return 1;
+}
+
+static int __devinit sd_parent_degenerate(struct sched_domain *sd,
+						struct sched_domain *parent)
+{
+	unsigned long cflags = sd->flags, pflags = parent->flags;
+
+	if (sd_degenerate(parent))
+		return 1;
+
+	if (!cpus_equal(sd->span, parent->span))
+		return 0;
+
+	/* Does parent contain flags not in child? */
+	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
+	if (cflags & SD_WAKE_AFFINE)
+		pflags &= ~SD_WAKE_BALANCE;
+	/* Flags needing groups don't count if only 1 group in parent */
+	if (parent->groups == parent->groups->next) {
+		pflags &= ~(SD_LOAD_BALANCE |
+				SD_BALANCE_NEWIDLE |
+				SD_BALANCE_FORK |
+				SD_BALANCE_EXEC);
+	}
+	if (~cflags & pflags)
+		return 0;
+
+	return 1;
+}
+
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
@@ -4722,6 +4773,19 @@ void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
 	unsigned long flags;
 	runqueue_t *rq = cpu_rq(cpu);
 	int local = 1;
+	struct sched_domain *tmp;
+
+	/* Remove the sched domains which do not contribute to scheduling. */
+	for (tmp = sd; tmp; tmp = tmp->parent) {
+		struct sched_domain *parent = tmp->parent;
+		if (!parent)
+			break;
+		if (sd_parent_degenerate(tmp, parent))
+			tmp->parent = parent->parent;
+	}
+
+	if (sd && sd_degenerate(sd))
+		sd = sd->parent;
 
 	sched_domain_debug(sd, cpu);
 
-- 
cgit v1.3-14-g43fede


From 3dbd5342074a1e570ec84edf859deb9be588006d Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:26 -0700
Subject: [PATCH] sched: multilevel sbe sbf

The fundamental problem that Suresh has with balance on exec and fork is that
it only tries to balance the top level domain with the flag set.

This was worked around by removing degenerate domains, but is still a problem
if people want to start using more complex sched-domains, especially
multilevel NUMA that ia64 is already using.

This patch makes balance on fork and exec try balancing over not just the top
most domain with the flag set, but all the way down the domain tree.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 45 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e75b301b5340..ef32389ee768 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1319,21 +1319,24 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 			sd = tmp;
 
 	if (sd) {
+		cpumask_t span;
 		int new_cpu;
 		struct sched_group *group;
 
+again:
 		schedstat_inc(sd, sbf_cnt);
+		span = sd->span;
 		cpu = task_cpu(p);
 		group = find_idlest_group(sd, p, cpu);
 		if (!group) {
 			schedstat_inc(sd, sbf_balanced);
-			goto no_forkbalance;
+			goto nextlevel;
 		}
 
 		new_cpu = find_idlest_cpu(group, cpu);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			schedstat_inc(sd, sbf_balanced);
-			goto no_forkbalance;
+			goto nextlevel;
 		}
 
 		if (cpu_isset(new_cpu, p->cpus_allowed)) {
@@ -1343,9 +1346,21 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 			rq = task_rq_lock(p, &flags);
 			cpu = task_cpu(p);
 		}
+
+		/* Now try balancing at a lower domain level */
+nextlevel:
+		sd = NULL;
+		for_each_domain(cpu, tmp) {
+			if (cpus_subset(span, tmp->span))
+				break;
+			if (tmp->flags & SD_BALANCE_FORK)
+				sd = tmp;
+		}
+
+		if (sd)
+			goto again;
 	}
 
-no_forkbalance:
 #endif
 	/*
 	 * We decrease the sleep average of forking parents
@@ -1711,25 +1726,41 @@ void sched_exec(void)
 			sd = tmp;
 
 	if (sd) {
+		cpumask_t span;
 		struct sched_group *group;
+again:
 		schedstat_inc(sd, sbe_cnt);
+		span = sd->span;
 		group = find_idlest_group(sd, current, this_cpu);
 		if (!group) {
 			schedstat_inc(sd, sbe_balanced);
-			goto out;
+			goto nextlevel;
 		}
 		new_cpu = find_idlest_cpu(group, this_cpu);
 		if (new_cpu == -1 || new_cpu == this_cpu) {
 			schedstat_inc(sd, sbe_balanced);
-			goto out;
+			goto nextlevel;
 		}
 
 		schedstat_inc(sd, sbe_pushed);
 		put_cpu();
 		sched_migrate_task(current, new_cpu);
-		return;
+
+		/* Now try balancing at a lower domain level */
+		this_cpu = get_cpu();
+nextlevel:
+		sd = NULL;
+		for_each_domain(this_cpu, tmp) {
+			if (cpus_subset(span, tmp->span))
+				break;
+			if (tmp->flags & SD_BALANCE_EXEC)
+				sd = tmp;
+		}
+
+		if (sd)
+			goto again;
 	}
-out:
+
 	put_cpu();
 }
 
-- 
cgit v1.3-14-g43fede


From 674311d5b411e9042df4fdf7aef0b3c8217b6240 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:27 -0700
Subject: [PATCH] sched: RCU domains

One of the problems with the multilevel balance-on-fork/exec is that it needs
to jump through hoops to satisfy sched-domain's locking semantics (that is,
you may traverse your own domain when not preemptable, and you may traverse
others' domains when holding their runqueue lock).

balance-on-exec had to potentially migrate between more than one CPU before
finding a final CPU to migrate to, and balance-on-fork needed to potentially
take multiple runqueue locks.

So bite the bullet and make sched-domains go completely RCU.  This actually
simplifies the code quite a bit.

From: Ingo Molnar <mingo@elte.hu>

schedstats RCU fix, and a nice comment on for_each_domain, from Ingo.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 60 +++++++++++++++-------------------------------------------
 1 file changed, 15 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ef32389ee768..54ce787b6207 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -260,8 +260,15 @@ struct runqueue {
 
 static DEFINE_PER_CPU(struct runqueue, runqueues);
 
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See update_sched_domains: synchronize_kernel for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
 #define for_each_domain(cpu, domain) \
-	for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		(&__get_cpu_var(runqueues))
@@ -395,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
 #ifdef CONFIG_SMP
 		/* domain-specific stats */
+		preempt_disable();
 		for_each_domain(cpu, sd) {
 			enum idle_type itype;
 			char mask_str[NR_CPUS];
@@ -419,6 +427,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 			    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
 		}
+		preempt_enable();
 #endif
 	}
 	return 0;
@@ -824,22 +833,12 @@ inline int task_curr(const task_t *p)
 }
 
 #ifdef CONFIG_SMP
-enum request_type {
-	REQ_MOVE_TASK,
-	REQ_SET_DOMAIN,
-};
-
 typedef struct {
 	struct list_head list;
-	enum request_type type;
 
-	/* For REQ_MOVE_TASK */
 	task_t *task;
 	int dest_cpu;
 
-	/* For REQ_SET_DOMAIN */
-	struct sched_domain *sd;
-
 	struct completion done;
 } migration_req_t;
 
@@ -861,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
 	}
 
 	init_completion(&req->done);
-	req->type = REQ_MOVE_TASK;
 	req->task = p;
 	req->dest_cpu = dest_cpu;
 	list_add(&req->list, &rq->migration_queue);
@@ -4378,17 +4376,9 @@ static int migration_thread(void * data)
 		req = list_entry(head->next, migration_req_t, list);
 		list_del_init(head->next);
 
-		if (req->type == REQ_MOVE_TASK) {
-			spin_unlock(&rq->lock);
-			__migrate_task(req->task, cpu, req->dest_cpu);
-			local_irq_enable();
-		} else if (req->type == REQ_SET_DOMAIN) {
-			rq->sd = req->sd;
-			spin_unlock_irq(&rq->lock);
-		} else {
-			spin_unlock_irq(&rq->lock);
-			WARN_ON(1);
-		}
+		spin_unlock(&rq->lock);
+		__migrate_task(req->task, cpu, req->dest_cpu);
+		local_irq_enable();
 
 		complete(&req->done);
 	}
@@ -4619,7 +4609,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
 			migration_req_t *req;
 			req = list_entry(rq->migration_queue.next,
 					 migration_req_t, list);
-			BUG_ON(req->type != REQ_MOVE_TASK);
 			list_del_init(&req->list);
 			complete(&req->done);
 		}
@@ -4800,10 +4789,7 @@ static int __devinit sd_parent_degenerate(struct sched_domain *sd,
  */
 void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
-	migration_req_t req;
-	unsigned long flags;
 	runqueue_t *rq = cpu_rq(cpu);
-	int local = 1;
 	struct sched_domain *tmp;
 
 	/* Remove the sched domains which do not contribute to scheduling. */
@@ -4820,24 +4806,7 @@ void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
 
 	sched_domain_debug(sd, cpu);
 
-	spin_lock_irqsave(&rq->lock, flags);
-
-	if (cpu == smp_processor_id() || !cpu_online(cpu)) {
-		rq->sd = sd;
-	} else {
-		init_completion(&req.done);
-		req.type = REQ_SET_DOMAIN;
-		req.sd = sd;
-		list_add(&req.list, &rq->migration_queue);
-		local = 0;
-	}
-
-	spin_unlock_irqrestore(&rq->lock, flags);
-
-	if (!local) {
-		wake_up_process(rq->migration_thread);
-		wait_for_completion(&req.done);
-	}
+	rcu_assign_pointer(rq->sd, sd);
 }
 
 /* cpus with isolated domains */
@@ -5112,6 +5081,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 		for_each_online_cpu(i)
 			cpu_attach_domain(NULL, i);
+		synchronize_kernel();
 		arch_destroy_sched_domains();
 		return NOTIFY_OK;
 
-- 
cgit v1.3-14-g43fede


From 476d139c218e44e045e4bc6d4cc02b010b343939 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:29 -0700
Subject: [PATCH] sched: consolidate sbe sbf

Consolidate balance-on-exec with balance-on-fork.  This is made easy by the
sched-domains RCU patches.

As well as the general goodness of code reduction, this allows the runqueues
to be unlocked during balance-on-fork.

schedstats is a problem.  Maybe just have balance-on-event instead of
distinguishing fork and exec?

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |   2 +-
 kernel/fork.c         |  21 +++---
 kernel/sched.c        | 174 ++++++++++++++++++++------------------------------
 3 files changed, 81 insertions(+), 116 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d27be9337425..edb2c69a8873 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -930,7 +930,7 @@ extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
 #else
  static inline void kick_process(struct task_struct *tsk) { }
 #endif
-extern void FASTCALL(sched_fork(task_t * p));
+extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
 extern void FASTCALL(sched_exit(task_t * p));
 
 extern int in_group_p(gid_t);
diff --git a/kernel/fork.c b/kernel/fork.c
index a28d11e10877..2c7806873bfd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1003,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags,
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
 
-	/* Perform scheduler related setup */
-	sched_fork(p);
-
 	/*
 	 * Ok, make it visible to the rest of the system.
 	 * We dont wake it up yet.
@@ -1014,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
+
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
 	/*
-	 * The task hasn't been attached yet, so cpus_allowed mask cannot
-	 * have changed. The cpus_allowed mask of the parent may have
-	 * changed after it was copied first time, and it may then move to
-	 * another CPU - so we re-copy it here and set the child's CPU to
-	 * the parent's CPU. This avoids alot of nasty races.
+	 * The task hasn't been attached yet, so its cpus_allowed mask will
+	 * not be changed, nor will its assigned CPU.
+	 *
+	 * The cpus_allowed mask of the parent may have changed after it was
+	 * copied first time - so re-copy it here, then check the child's CPU
+	 * to ensure it is on a valid CPU (and if not, just force it back to
+	 * parent's CPU). This avoids alot of nasty races.
 	 */
 	p->cpus_allowed = current->cpus_allowed;
-	set_task_cpu(p, smp_processor_id());
+	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
+		set_task_cpu(p, smp_processor_id());
 
 	/*
 	 * Check for pending SIGKILL! The new thread should not be allowed
diff --git a/kernel/sched.c b/kernel/sched.c
index 54ce787b6207..579da278e72f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1021,8 +1021,59 @@ static int find_idlest_cpu(struct sched_group *group, int this_cpu)
 	return idlest;
 }
 
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+	struct task_struct *t = current;
+	struct sched_domain *tmp, *sd = NULL;
 
-#endif
+	for_each_domain(cpu, tmp)
+		if (tmp->flags & flag)
+			sd = tmp;
+
+	while (sd) {
+		cpumask_t span;
+		struct sched_group *group;
+		int new_cpu;
+		int weight;
+
+		span = sd->span;
+		group = find_idlest_group(sd, t, cpu);
+		if (!group)
+			goto nextlevel;
+
+		new_cpu = find_idlest_cpu(group, cpu);
+		if (new_cpu == -1 || new_cpu == cpu)
+			goto nextlevel;
+
+		/* Now try balancing at a lower domain level */
+		cpu = new_cpu;
+nextlevel:
+		sd = NULL;
+		weight = cpus_weight(span);
+		for_each_domain(cpu, tmp) {
+			if (weight <= cpus_weight(tmp->span))
+				break;
+			if (tmp->flags & flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
+	}
+
+	return cpu;
+}
+
+#endif /* CONFIG_SMP */
 
 /*
  * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -1240,8 +1291,15 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
  */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
 {
+	int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+	set_task_cpu(p, cpu);
+
 	/*
 	 * We mark the process as running here, but have not actually
 	 * inserted it onto the runqueue yet. This guarantees that
@@ -1282,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
 		 * runqueue lock is not a problem.
 		 */
 		current->time_slice = 1;
-		preempt_disable();
 		scheduler_tick();
-		local_irq_enable();
-		preempt_enable();
-	} else
-		local_irq_enable();
+	}
+	local_irq_enable();
+	put_cpu();
 }
 
 /*
@@ -1302,64 +1358,12 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 	unsigned long flags;
 	int this_cpu, cpu;
 	runqueue_t *rq, *this_rq;
-#ifdef CONFIG_SMP
-	struct sched_domain *tmp, *sd = NULL;
-#endif
 
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
 
-#ifdef CONFIG_SMP
-	for_each_domain(cpu, tmp)
-		if (tmp->flags & SD_BALANCE_FORK)
-			sd = tmp;
-
-	if (sd) {
-		cpumask_t span;
-		int new_cpu;
-		struct sched_group *group;
-
-again:
-		schedstat_inc(sd, sbf_cnt);
-		span = sd->span;
-		cpu = task_cpu(p);
-		group = find_idlest_group(sd, p, cpu);
-		if (!group) {
-			schedstat_inc(sd, sbf_balanced);
-			goto nextlevel;
-		}
-
-		new_cpu = find_idlest_cpu(group, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
-			schedstat_inc(sd, sbf_balanced);
-			goto nextlevel;
-		}
-
-		if (cpu_isset(new_cpu, p->cpus_allowed)) {
-			schedstat_inc(sd, sbf_pushed);
-			set_task_cpu(p, new_cpu);
-			task_rq_unlock(rq, &flags);
-			rq = task_rq_lock(p, &flags);
-			cpu = task_cpu(p);
-		}
-
-		/* Now try balancing at a lower domain level */
-nextlevel:
-		sd = NULL;
-		for_each_domain(cpu, tmp) {
-			if (cpus_subset(span, tmp->span))
-				break;
-			if (tmp->flags & SD_BALANCE_FORK)
-				sd = tmp;
-		}
-
-		if (sd)
-			goto again;
-	}
-
-#endif
 	/*
 	 * We decrease the sleep average of forking parents
 	 * and children as well, to keep max-interactive tasks
@@ -1708,58 +1712,16 @@ out:
 }
 
 /*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
  */
 void sched_exec(void)
 {
-	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
-
-	for_each_domain(this_cpu, tmp)
-		if (tmp->flags & SD_BALANCE_EXEC)
-			sd = tmp;
-
-	if (sd) {
-		cpumask_t span;
-		struct sched_group *group;
-again:
-		schedstat_inc(sd, sbe_cnt);
-		span = sd->span;
-		group = find_idlest_group(sd, current, this_cpu);
-		if (!group) {
-			schedstat_inc(sd, sbe_balanced);
-			goto nextlevel;
-		}
-		new_cpu = find_idlest_cpu(group, this_cpu);
-		if (new_cpu == -1 || new_cpu == this_cpu) {
-			schedstat_inc(sd, sbe_balanced);
-			goto nextlevel;
-		}
-
-		schedstat_inc(sd, sbe_pushed);
-		put_cpu();
-		sched_migrate_task(current, new_cpu);
-
-		/* Now try balancing at a lower domain level */
-		this_cpu = get_cpu();
-nextlevel:
-		sd = NULL;
-		for_each_domain(this_cpu, tmp) {
-			if (cpus_subset(span, tmp->span))
-				break;
-			if (tmp->flags & SD_BALANCE_EXEC)
-				sd = tmp;
-		}
-
-		if (sd)
-			goto again;
-	}
-
+	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
 	put_cpu();
+	if (new_cpu != this_cpu)
+		sched_migrate_task(current, new_cpu);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 77391d71681d05d2f4502f91ad62618522abf624 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:30 -0700
Subject: [PATCH] sched: relax pinned balancing

The maximum rebalance interval allowed by the multiprocessor balancing
backoff is often not large enough to handle corner cases where there are
lots of tasks pinned on a CPU.  Suresh reported:

	I see system livelock's if for example I have 7000 processes
	pinned onto one cpu (this is on the fastest 8-way system I
	have access to).

After this patch, the machine is reported to go well above this number.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 579da278e72f..6e452eb95ac3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2030,6 +2030,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 	return busiest;
 }
 
+/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL	512
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2042,7 +2048,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	struct sched_group *group;
 	runqueue_t *busiest;
 	unsigned long imbalance;
-	int nr_moved, all_pinned;
+	int nr_moved, all_pinned = 0;
 	int active_balance = 0;
 
 	spin_lock(&this_rq->lock);
@@ -2133,7 +2139,8 @@ out_balanced:
 
 	sd->nr_balance_failed = 0;
 	/* tune up the balancing interval */
-	if (sd->balance_interval < sd->max_interval)
+	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
 	return 0;
-- 
cgit v1.3-14-g43fede


From a3464a102a69a4e00efb0a763e274ce290995b4b Mon Sep 17 00:00:00 2001
From: Chen Shang <shangcs@gmail.com>
Date: Sat, 25 Jun 2005 14:57:31 -0700
Subject: [PATCH] sched: micro-optimize task requeueing in schedule()

micro-optimize task requeueing in schedule() & clean up recalc_task_prio().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6e452eb95ac3..a3d1c8e43d34 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -673,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 	rq->nr_running++;
 }
 
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
 	unsigned long long __sleep_time = now - p->timestamp;
@@ -732,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
 		}
 	}
 
-	p->prio = effective_prio(p);
+	return effective_prio(p);
 }
 
 /*
@@ -755,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 	}
 #endif
 
-	recalc_task_prio(p, now);
+	p->prio = recalc_task_prio(p, now);
 
 	/*
 	 * This checks to make sure it's not an uninterruptible task
@@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu, idx, new_prio;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -2873,9 +2873,14 @@ go_idle:
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
-		dequeue_task(next, array);
-		recalc_task_prio(next, next->timestamp + delta);
-		enqueue_task(next, array);
+		new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+		if (unlikely(next->prio != new_prio)) {
+			dequeue_task(next, array);
+			next->prio = new_prio;
+			enqueue_task(next, array);
+		} else
+			requeue_task(next, array);
 	}
 	next->activated = 0;
 switch_tasks:
-- 
cgit v1.3-14-g43fede


From 37e4ab3f0cba13adf3535d373fd98e5ee47b5410 Mon Sep 17 00:00:00 2001
From: Olivier Croquette <ocroquette@free.fr>
Date: Sat, 25 Jun 2005 14:57:32 -0700
Subject: [PATCH] Changing RT priority without CAP_SYS_NICE

Presently, a process without the capability CAP_SYS_NICE can not change
its own policy, which is OK.

But it also cannot decrease its RT priority (if scheduled with policy
SCHED_RR or SCHED_FIFO), which is what this patch changes.

The rationale is the same as for the nice value: a process should be
able to request a lower priority for itself. Increasing the priority is
still not allowed.

This is for example useful if you give a multithreaded user process a RT
priority, and the process would like to organize its internal threads
using priorities also. Then you can give the process the highest
priority needed N, and the process starts its threads with lower
priorities: N-1, N-2...

The POSIX norm says that the permissions are implementation specific, so
I think we can do that.

In a sense, it makes the permissions consistent whatever the policy is:
with this patch, processes scheduled with SCHED_FIFO, SCHED_RR and
SCHED_OTHER can all decrease their priority.

From: Ingo Molnar <mingo@elte.hu>

cleaned up and merged to -mm.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a3d1c8e43d34..d3d81b82e378 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3531,13 +3531,24 @@ recheck:
 	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
 		return -EINVAL;
 
-	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
-	    param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
-	    !capable(CAP_SYS_NICE))
-		return -EPERM;
-	if ((current->euid != p->euid) && (current->euid != p->uid) &&
-	    !capable(CAP_SYS_NICE))
-		return -EPERM;
+	/*
+	 * Allow unprivileged RT tasks to decrease priority:
+	 */
+	if (!capable(CAP_SYS_NICE)) {
+		/* can't change policy */
+		if (policy != p->policy)
+			return -EPERM;
+		/* can't increase priority */
+		if (policy != SCHED_NORMAL &&
+		    param->sched_priority > p->rt_priority &&
+		    param->sched_priority >
+				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+			return -EPERM;
+		/* can't change other user's priorities */
+		if ((current->euid != p->euid) &&
+		    (current->euid != p->uid))
+			return -EPERM;
+	}
 
 	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
-- 
cgit v1.3-14-g43fede


From 1a20ff27ef75d866730ee796acd811a925af762f Mon Sep 17 00:00:00 2001
From: Dinakar Guniguntala <dino@in.ibm.com>
Date: Sat, 25 Jun 2005 14:57:33 -0700
Subject: [PATCH] Dynamic sched domains: sched changes

The following patches add dynamic sched domains functionality that was
extensively discussed on lkml and lse-tech.  I would like to see this added to
-mm

o The main advantage with this feature is that it ensures that the scheduler
  load balancing code only balances against the cpus that are in the sched
  domain as defined by an exclusive cpuset and not all of the cpus in the
  system. This removes any overhead due to load balancing code trying to
  pull tasks outside of the cpu exclusive cpuset only to be prevented by
  the tasks' cpus_allowed mask.
o cpu exclusive cpusets are useful for servers running orthogonal
  workloads such as RT applications requiring low latency and HPC
  applications that are throughput sensitive

o It provides a new API partition_sched_domains in sched.c
  that makes dynamic sched domains possible.
o cpu_exclusive cpusets are now associated with a sched domain,
  which means that users can dynamically modify the sched domains
  through the cpuset file system interface
o ia64 sched domain code has been updated to support this feature as well
o Currently, this does not support hotplug. (However some of my tests
  indicate hotplug+preempt is currently broken)
o I have tested it extensively on x86.
o This should have very minimal impact on performance as none of
  the fast paths are affected

Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |   2 +
 kernel/sched.c        | 132 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 88 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index edb2c69a8873..98c109e4f43d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -539,6 +539,8 @@ struct sched_domain {
 #endif
 };
 
+extern void partition_sched_domains(cpumask_t *partition1,
+				    cpumask_t *partition2);
 #ifdef ARCH_HAS_SCHED_DOMAIN
 /* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
 extern cpumask_t cpu_isolated_map;
diff --git a/kernel/sched.c b/kernel/sched.c
index d3d81b82e378..dee96b22635e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,7 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See update_sched_domains: synchronize_kernel for details.
+ * See detach_destroy_domains: synchronize_sched for details.
  *
  * The domain tree of any CPU may only be accessed from within
  * preempt-disabled sections.
@@ -4624,7 +4624,7 @@ int __init migration_init(void)
 #endif
 
 #ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
@@ -4717,7 +4717,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 #define sched_domain_debug(sd, cpu) {}
 #endif
 
-static int __devinit sd_degenerate(struct sched_domain *sd)
+static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpus_weight(sd->span) == 1)
 		return 1;
@@ -4740,7 +4740,7 @@ static int __devinit sd_degenerate(struct sched_domain *sd)
 	return 1;
 }
 
-static int __devinit sd_parent_degenerate(struct sched_domain *sd,
+static int sd_parent_degenerate(struct sched_domain *sd,
 						struct sched_domain *parent)
 {
 	unsigned long cflags = sd->flags, pflags = parent->flags;
@@ -4772,7 +4772,7 @@ static int __devinit sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4823,7 +4823,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
 			cpumask_t span, int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
@@ -4859,13 +4859,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
 
 
 #ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
 #else
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
 {
 	return cpu;
 }
@@ -4873,7 +4874,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
 {
 #ifdef CONFIG_SCHED_SMT
 	return first_cpu(cpu_sibling_map[cpu]);
@@ -4886,7 +4887,7 @@ static int __devinit cpu_to_phys_group(int cpu)
 
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
 {
 	return cpu_to_node(cpu);
 }
@@ -4917,39 +4918,28 @@ static void check_sibling_maps(void)
 #endif
 
 /*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
  */
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
-	cpumask_t cpu_default_map;
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
-	/*
-	 * Setup mask for cpus without special case scheduling requirements.
-	 * For now this just excludes isolated cpus, but could be used to
-	 * exclude other special cases in the future.
-	 */
-	cpus_complement(cpu_default_map, cpu_isolated_map);
-	cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
 
 	/*
-	 * Set up domains. Isolated domains just stay on the NULL domain.
+	 * Set up domains for cpus specified by the cpu_map.
 	 */
-	for_each_cpu_mask(i, cpu_default_map) {
+	for_each_cpu_mask(i, *cpu_map) {
 		int group;
 		struct sched_domain *sd = NULL, *p;
 		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
 
-		cpus_and(nodemask, nodemask, cpu_default_map);
+		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
 		sd = &per_cpu(node_domains, i);
 		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = cpu_default_map;
+		sd->span = *cpu_map;
 		sd->groups = &sched_group_nodes[group];
 #endif
 
@@ -4967,7 +4957,7 @@ static void __devinit arch_init_sched_domains(void)
 		group = cpu_to_cpu_group(i);
 		*sd = SD_SIBLING_INIT;
 		sd->span = cpu_sibling_map[i];
-		cpus_and(sd->span, sd->span, cpu_default_map);
+		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		sd->groups = &sched_group_cpus[group];
 #endif
@@ -4977,7 +4967,7 @@ static void __devinit arch_init_sched_domains(void)
 	/* Set up CPU (sibling) groups */
 	for_each_online_cpu(i) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
-		cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
 			continue;
 
@@ -4990,7 +4980,7 @@ static void __devinit arch_init_sched_domains(void)
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cpumask_t nodemask = node_to_cpumask(i);
 
-		cpus_and(nodemask, nodemask, cpu_default_map);
+		cpus_and(nodemask, nodemask, *cpu_map);
 		if (cpus_empty(nodemask))
 			continue;
 
@@ -5000,12 +4990,12 @@ static void __devinit arch_init_sched_domains(void)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, cpu_default_map,
+	init_sched_build_groups(sched_group_nodes, *cpu_map,
 					&cpu_to_node_group);
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
-	for_each_cpu_mask(i, cpu_default_map) {
+	for_each_cpu_mask(i, *cpu_map) {
 		int power;
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
@@ -5029,7 +5019,7 @@ static void __devinit arch_init_sched_domains(void)
 	}
 
 	/* Attach the domains */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
@@ -5039,16 +5029,71 @@ static void __devinit arch_init_sched_domains(void)
 		cpu_attach_domain(sd, i);
 	}
 }
+/*
+ * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(cpumask_t *cpu_map)
+{
+	cpumask_t cpu_default_map;
 
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+	check_sibling_maps();
+#endif
+	/*
+	 * Setup mask for cpus without special case scheduling requirements.
+	 * For now this just excludes isolated cpus, but could be used to
+	 * exclude other special cases in the future.
+	 */
+	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+	build_sched_domains(&cpu_default_map);
+}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
 	/* Do nothing: everything is statically allocated. */
 }
-#endif
 
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+	int i;
+
+	for_each_cpu_mask(i, *cpu_map)
+		cpu_attach_domain(NULL, i);
+	synchronize_sched();
+	arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for a RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+	cpumask_t change_map;
+
+	cpus_and(*partition1, *partition1, cpu_online_map);
+	cpus_and(*partition2, *partition2, cpu_online_map);
+	cpus_or(change_map, *partition1, *partition2);
+
+	/* Detach sched domains from all of the affected cpus */
+	detach_destroy_domains(&change_map);
+	if (!cpus_empty(*partition1))
+		build_sched_domains(partition1);
+	if (!cpus_empty(*partition2))
+		build_sched_domains(partition2);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
@@ -5059,15 +5104,10 @@ static void __devinit arch_destroy_sched_domains(void)
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
-	int i;
-
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_DOWN_PREPARE:
-		for_each_online_cpu(i)
-			cpu_attach_domain(NULL, i);
-		synchronize_kernel();
-		arch_destroy_sched_domains();
+		detach_destroy_domains(&cpu_online_map);
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
@@ -5083,7 +5123,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	}
 
 	/* The hotplug lock is already held by cpu_up/cpu_down */
-	arch_init_sched_domains();
+	arch_init_sched_domains(&cpu_online_map);
 
 	return NOTIFY_OK;
 }
@@ -5092,7 +5132,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 void __init sched_init_smp(void)
 {
 	lock_cpu_hotplug();
-	arch_init_sched_domains();
+	arch_init_sched_domains(&cpu_online_map);
 	unlock_cpu_hotplug();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
-- 
cgit v1.3-14-g43fede


From 85d7b94981e2e919697bc235aad7367b33c3864b Mon Sep 17 00:00:00 2001
From: Dinakar Guniguntala <dino@in.ibm.com>
Date: Sat, 25 Jun 2005 14:57:34 -0700
Subject: [PATCH] Dynamic sched domains: cpuset changes

Adds the core update_cpu_domains code and updated cpusets documentation

Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt | 16 +++++++++
 kernel/cpuset.c           | 89 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 92 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 2f8f24eaefd9..ad944c060312 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -51,6 +51,14 @@ mems_allowed vector.
 
 If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct
 ancestor or descendent, may share any of the same CPUs or Memory Nodes.
+A cpuset that is cpu exclusive has a sched domain associated with it.
+The sched domain consists of all cpus in the current cpuset that are not
+part of any exclusive child cpusets.
+This ensures that the scheduler load balancing code only balances
+against the cpus that are in the sched domain as defined above and not
+all of the cpus in the system. This removes any overhead due to
+load balancing code trying to pull tasks outside of the cpu exclusive
+cpuset only to be prevented by the tasks' cpus_allowed mask.
 
 User level code may create and destroy cpusets by name in the cpuset
 virtual file system, manage the attributes and permissions of these
@@ -84,6 +92,9 @@ This can be especially valuable on:
       and a database), or
     * NUMA systems running large HPC applications with demanding
       performance characteristics.
+    * Also cpu_exclusive cpusets are useful for servers running orthogonal
+      workloads such as RT applications requiring low latency and HPC
+      applications that are throughput sensitive
 
 These subsets, or "soft partitions" must be able to be dynamically
 adjusted, as the job mix changes, without impacting other concurrently
@@ -125,6 +136,8 @@ Cpusets extends these two mechanisms as follows:
  - A cpuset may be marked exclusive, which ensures that no other
    cpuset (except direct ancestors and descendents) may contain
    any overlapping CPUs or Memory Nodes.
+   Also a cpu_exclusive cpuset would be associated with a sched
+   domain.
  - You can list all the tasks (by pid) attached to any cpuset.
 
 The implementation of cpusets requires a few, simple hooks
@@ -136,6 +149,9 @@ into the rest of the kernel, none in performance critical paths:
    allowed in that tasks cpuset.
  - in sched.c migrate_all_tasks(), to keep migrating tasks within
    the CPUs allowed by their cpuset, if possible.
+ - in sched.c, a new API partition_sched_domains for handling
+   sched domain changes associated with cpu_exclusive cpusets
+   and related changes in both sched.c and arch/ia64/kernel/domain.c
  - in the mbind and set_mempolicy system calls, to mask the requested
    Memory Nodes by what's allowed in that tasks cpuset.
  - in page_alloc, to restrict memory to allowed nodes.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 79dd929f4084..984c0bf3807f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -595,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	return 0;
 }
 
+/*
+ * For a given cpuset cur, partition the system as follows
+ * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
+ *    exclusive child cpusets
+ * b. All cpus in the current cpuset's cpus_allowed that are not part of any
+ *    exclusive child cpusets
+ * Build these two partitions by calling partition_sched_domains
+ *
+ * Call with cpuset_sem held.  May nest a call to the
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ */
+static void update_cpu_domains(struct cpuset *cur)
+{
+	struct cpuset *c, *par = cur->parent;
+	cpumask_t pspan, cspan;
+
+	if (par == NULL || cpus_empty(cur->cpus_allowed))
+		return;
+
+	/*
+	 * Get all cpus from parent's cpus_allowed not part of exclusive
+	 * children
+	 */
+	pspan = par->cpus_allowed;
+	list_for_each_entry(c, &par->children, sibling) {
+		if (is_cpu_exclusive(c))
+			cpus_andnot(pspan, pspan, c->cpus_allowed);
+	}
+	if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+		cpus_or(pspan, pspan, cur->cpus_allowed);
+		if (cpus_equal(pspan, cur->cpus_allowed))
+			return;
+		cspan = CPU_MASK_NONE;
+	} else {
+		if (cpus_empty(pspan))
+			return;
+		cspan = cur->cpus_allowed;
+		/*
+		 * Get all cpus from current cpuset's cpus_allowed not part
+		 * of exclusive children
+		 */
+		list_for_each_entry(c, &cur->children, sibling) {
+			if (is_cpu_exclusive(c))
+				cpus_andnot(cspan, cspan, c->cpus_allowed);
+		}
+	}
+
+	lock_cpu_hotplug();
+	partition_sched_domains(&pspan, &cspan);
+	unlock_cpu_hotplug();
+}
+
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
-	int retval;
+	int retval, cpus_unchanged;
 
 	trialcs = *cs;
 	retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -608,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (cpus_empty(trialcs.cpus_allowed))
 		return -ENOSPC;
 	retval = validate_change(cs, &trialcs);
-	if (retval == 0)
-		cs->cpus_allowed = trialcs.cpus_allowed;
-	return retval;
+	if (retval < 0)
+		return retval;
+	cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	cs->cpus_allowed = trialcs.cpus_allowed;
+	if (is_cpu_exclusive(cs) && !cpus_unchanged)
+		update_cpu_domains(cs);
+	return 0;
 }
 
 static int update_nodemask(struct cpuset *cs, char *buf)
@@ -646,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 {
 	int turning_on;
 	struct cpuset trialcs;
-	int err;
+	int err, cpu_exclusive_changed;
 
 	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
 
@@ -657,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 		clear_bit(bit, &trialcs.flags);
 
 	err = validate_change(cs, &trialcs);
-	if (err == 0) {
-		if (turning_on)
-			set_bit(bit, &cs->flags);
-		else
-			clear_bit(bit, &cs->flags);
-	}
-	return err;
+	if (err < 0)
+		return err;
+	cpu_exclusive_changed =
+		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+	if (turning_on)
+		set_bit(bit, &cs->flags);
+	else
+		clear_bit(bit, &cs->flags);
+
+	if (cpu_exclusive_changed)
+                update_cpu_domains(cs);
+	return 0;
 }
 
 static int attach_task(struct cpuset *cs, char *buf)
@@ -1309,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		up(&cpuset_sem);
 		return -EBUSY;
 	}
-	spin_lock(&cs->dentry->d_lock);
 	parent = cs->parent;
 	set_bit(CS_REMOVED, &cs->flags);
+	if (is_cpu_exclusive(cs))
+		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
 	if (list_empty(&parent->children))
 		check_for_release(parent);
+	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
 	cs->dentry = NULL;
 	spin_unlock(&d->d_lock);
-- 
cgit v1.3-14-g43fede


From cc19ca86a023fcd552c78e77a7be6ce271f92a28 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 25 Jun 2005 14:57:36 -0700
Subject: [PATCH] consolidate PREEMPT options into kernel/Kconfig.preempt

This patch consolidates the CONFIG_PREEMPT and CONFIG_PREEMPT_BKL
preemption options into kernel/Kconfig.preempt.  This, besides reducing
source-code, also enables more centralized tweaking of preemption related
options.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig      | 23 +----------------------
 arch/ppc64/Kconfig     | 21 +--------------------
 arch/x86_64/Kconfig    | 29 ++---------------------------
 kernel/Kconfig.preempt | 24 ++++++++++++++++++++++++
 4 files changed, 28 insertions(+), 69 deletions(-)
 create mode 100644 kernel/Kconfig.preempt

(limited to 'kernel')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index b4cd11e58451..961ab20032f5 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -510,28 +510,7 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load.
-
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
-
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on PREEMPT
-	default y
-	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
-
-	  Say Y here if you are building a kernel for a desktop system.
-	  Say N if you are unsure.
+source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
 	bool "Local APIC support on uniprocessors"
diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
index cb27068bfcd4..5f40b438b584 100644
--- a/arch/ppc64/Kconfig
+++ b/arch/ppc64/Kconfig
@@ -270,26 +270,7 @@ config SCHED_SMT
 	  when dealing with POWER5 cpus at a cost of slightly increased
 	  overhead in some places. If unsure say N here.
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
-
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on PREEMPT
-	default y
-	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
-
-	  Say Y here if you are building a kernel for a desktop system.
-	  Say N if you are unsure.
+source "kernel/Kconfig.preempt"
 
 config EEH
 	bool "PCI Extended Error Handling (EEH)" if EMBEDDED
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 92f5a5266023..a853d87ff7e3 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -207,33 +207,6 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	---help---
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load. On contrary it may also break your drivers and add
-	  priority inheritance problems to your system. Don't select it if
-	  you rely on a stable system or have slightly obscure hardware.
-	  It's also not very well tested on x86-64 currently.
-	  You have been warned.
-
-	  Say Y here if you are feeling brave and building a kernel for a
-	  desktop, embedded or real-time system.  Say N if you are unsure.
-
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on PREEMPT
-	default y
-	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
-
-	  Say Y here if you are building a kernel for a desktop system.
-	  Say N if you are unsure.
-
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
 	depends on SMP
@@ -244,6 +217,8 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+source "kernel/Kconfig.preempt"
+
 config K8_NUMA
        bool "K8 NUMA support"
        select NUMA
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000000..587328be8216
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,24 @@
+
+config PREEMPT
+	bool "Preemptible Kernel"
+	help
+	  This option reduces the latency of the kernel when reacting to
+	  real-time or interactive events by allowing a low priority process to
+	  be preempted even if it is in kernel mode executing a system call.
+	  This allows applications to run more reliably even when the system is
+	  under load.
+
+	  Say Y here if you are building a kernel for a desktop, embedded
+	  or real-time system.  Say N if you are unsure.
+
+config PREEMPT_BKL
+	bool "Preempt The Big Kernel Lock"
+	depends on PREEMPT
+	default y
+	help
+	  This option reduces the latency of the kernel by making the
+	  big kernel lock preemptible.
+
+	  Say Y here if you are building a kernel for a desktop system.
+	  Say N if you are unsure.
+
-- 
cgit v1.3-14-g43fede


From f704f56af95bec3c1ca719d64d0becef74d40899 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 25 Jun 2005 14:57:38 -0700
Subject: [PATCH] enable PREEMPT_BKL on !PREEMPT+SMP too

The only sane way to clean up the current 3 lock_kernel() variants seems to
be to remove the spinlock-based BKL implementations altogether, and to keep
the semaphore-based one only.  If we don't want to do that for whatever
reason then I'm afraid we have to live with the current complexity.  (but
I'm open for other cleanup suggestions as well.)

To explore this possibility we'll (at a minimum) have to know whether the
semaphore-based BKL works fine on plain SMP too.  The patch below enables
this.

The patch may make sense in isolation as well, as it might bring
performance benefits: code that would formerly spin on the BKL spinlock
will now schedule away and give up the CPU.  It might introduce performance
regressions as well, if any performance-critical code uses the BKL heavily
and gets overscheduled due to the semaphore.  I very much hope there is no
such performance-critical codepath left though.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Kconfig.preempt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 587328be8216..34c631221aa3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -13,7 +13,7 @@ config PREEMPT
 
 config PREEMPT_BKL
 	bool "Preempt The Big Kernel Lock"
-	depends on PREEMPT
+	depends on SMP || PREEMPT
 	default y
 	help
 	  This option reduces the latency of the kernel by making the
-- 
cgit v1.3-14-g43fede


From f8cbd99bd3a023db8d6356d19a5f6f539d367327 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 25 Jun 2005 14:57:39 -0700
Subject: [PATCH] sched: voluntary kernel preemption

This patch adds a new preemption model: 'Voluntary Kernel Preemption'.  The
3 models can be selected from a new menu:

            (X) No Forced Preemption (Server)
            ( ) Voluntary Kernel Preemption (Desktop)
            ( ) Preemptible Kernel (Low-Latency Desktop)

we still default to the stock (Server) preemption model.

Voluntary preemption works by adding a cond_resched()
(reschedule-if-needed) call to every might_sleep() check.  It is lighter
than CONFIG_PREEMPT - at the cost of not having as tight latencies.  It
represents a different latency/complexity/overhead tradeoff.

It has no runtime impact at all if disabled.  Here are size stats that show
how the various preemption models impact the kernel's size:

    text    data     bss     dec     hex filename
 3618774  547184  179896 4345854  424ffe vmlinux.stock
 3626406  547184  179896 4353486  426dce vmlinux.voluntary   +0.2%
 3748414  548640  179896 4476950  445016 vmlinux.preempt     +3.5%

voluntary-preempt is +0.2% of .text, preempt is +3.5%.

This feature has been tested for many months by lots of people (and it's
also included in the RHEL4 distribution and earlier variants were in Fedora
as well), and it's intended for users and distributions who don't want to
use full-blown CONFIG_PREEMPT for one reason or another.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h | 18 +++++++++++-----
 kernel/Kconfig.preempt | 57 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 62 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e25b97062ce1..687ba8c9973d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -58,15 +58,23 @@ struct completion;
  * be biten later when the calling function happens to sleep when it is not
  * supposed to.
  */
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+extern int cond_resched(void);
+# define might_resched() cond_resched()
+#else
+# define might_resched() do { } while (0)
+#endif
+
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-#define might_sleep() __might_sleep(__FILE__, __LINE__)
-#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
-void __might_sleep(char *file, int line);
+  void __might_sleep(char *file, int line);
+# define might_sleep() \
+	do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
 #else
-#define might_sleep() do {} while(0)
-#define might_sleep_if(cond) do {} while (0)
+# define might_sleep() do { might_resched(); } while (0)
 #endif
 
+#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
+
 #define abs(x) ({				\
 		int __x = (x);			\
 		(__x < 0) ? -__x : __x;		\
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 34c631221aa3..0b46a5dff4c0 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,15 +1,56 @@
 
-config PREEMPT
-	bool "Preemptible Kernel"
+choice
+	prompt "Preemption Model"
+	default PREEMPT_NONE
+
+config PREEMPT_NONE
+	bool "No Forced Preemption (Server)"
+	help
+	  This is the traditional Linux preemption model, geared towards
+	  throughput. It will still provide good latencies most of the
+	  time, but there are no guarantees and occasional longer delays
+	  are possible.
+
+	  Select this option if you are building a kernel for a server or
+	  scientific/computation system, or if you want to maximize the
+	  raw processing power of the kernel, irrespective of scheduling
+	  latencies.
+
+config PREEMPT_VOLUNTARY
+	bool "Voluntary Kernel Preemption (Desktop)"
 	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
+	  This option reduces the latency of the kernel by adding more
+	  "explicit preemption points" to the kernel code. These new
+	  preemption points have been selected to reduce the maximum
+	  latency of rescheduling, providing faster application reactions,
+	  at the cost of slightly lower throughput.
+
+	  This allows reaction to interactive events by allowing a
+	  low priority process to voluntarily preempt itself even if it
+	  is in kernel mode executing a system call. This allows
+	  applications to run more 'smoothly' even when the system is
 	  under load.
 
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
+	  Select this if you are building a kernel for a desktop system.
+
+config PREEMPT
+	bool "Preemptible Kernel (Low-Latency Desktop)"
+	help
+	  This option reduces the latency of the kernel by making
+	  all kernel code (that is not executing in a critical section)
+	  preemptible.  This allows reaction to interactive events by
+	  permitting a low priority process to be preempted involuntarily
+	  even if it is in kernel mode executing a system call and would
+	  otherwise not be about to reach a natural preemption point.
+	  This allows applications to run more 'smoothly' even when the
+	  system is under load, at the cost of slightly lower throughput
+	  and a slight runtime overhead to kernel code.
+
+	  Select this if you are building a kernel for a desktop or
+	  embedded system with latency requirements in the milliseconds
+	  range.
+
+endchoice
 
 config PREEMPT_BKL
 	bool "Preempt The Big Kernel Lock"
-- 
cgit v1.3-14-g43fede


From dc009d92435f99498cbc579ce76bf28e837e2c14 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 25 Jun 2005 14:57:52 -0700
Subject: [PATCH] kexec: add kexec syscalls

This patch introduces the architecture independent implementation of the
sys_kexec_load and compat_sys_kexec_load system calls.

Kexec on panic support has been integrated into the core patch and is
relatively clean.

In addition the hopefully architecture independent option
crashkernel=size@location has been documented.  Its purpose is to reserve
space for the panic kernel to live, in a region that no DMA transfer will
ever be set up to access.

Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Alexander Nyberg <alexn@telia.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |    4 +
 MAINTAINERS                         |   10 +
 include/linux/kexec.h               |  127 +++++
 include/linux/reboot.h              |    3 +
 include/linux/syscalls.h            |    5 +-
 kernel/Makefile                     |    1 +
 kernel/kexec.c                      | 1036 +++++++++++++++++++++++++++++++++++
 kernel/panic.c                      |   23 +-
 kernel/sys.c                        |   20 +
 kernel/sys_ni.c                     |    2 +
 10 files changed, 1227 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/kexec.h
 create mode 100644 kernel/kexec.c

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 86db43fd6b0f..560ff5ae3fd9 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -358,6 +358,10 @@ running once the system is up.
 	cpia_pp=	[HW,PPT]
 			Format: { parport<nr> | auto | none }
 
+	crashkernel=nn[KMG]@ss[KMG]
+			[KNL] Reserve a chunk of physical memory to
+			hold a kernel to switch to with kexec on panic.
+
 	cs4232=		[HW,OSS]
 			Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq>
 
diff --git a/MAINTAINERS b/MAINTAINERS
index dbdd8494b2e6..81728572799e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1330,6 +1330,16 @@ M:	rml@novell.com
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 
+KEXEC
+P:	Eric Biederman
+P:	Randy Dunlap
+M:	ebiederm@xmission.com
+M:	rddunlap@osdl.org
+W:	http://www.xmission.com/~ebiederm/files/kexec/
+L:	linux-kernel@vger.kernel.org
+L:	fastboot@osdl.org
+S:	Maintained
+
 LANMEDIA WAN CARD DRIVER
 P:	Andrew Stanley-Jones
 M:	asj@lanmedia.com
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
new file mode 100644
index 000000000000..e3fc35f4e35f
--- /dev/null
+++ b/include/linux/kexec.h
@@ -0,0 +1,127 @@
+#ifndef LINUX_KEXEC_H
+#define LINUX_KEXEC_H
+
+#ifdef CONFIG_KEXEC
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <asm/kexec.h>
+
+/* Verify architecture specific macros are defined */
+
+#ifndef KEXEC_SOURCE_MEMORY_LIMIT
+#error KEXEC_SOURCE_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_DESTINATION_MEMORY_LIMIT
+#error KEXEC_DESTINATION_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_CONTROL_MEMORY_LIMIT
+#error KEXEC_CONTROL_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_CONTROL_CODE_SIZE
+#error KEXEC_CONTROL_CODE_SIZE not defined
+#endif
+
+#ifndef KEXEC_ARCH
+#error KEXEC_ARCH not defined
+#endif
+
+/*
+ * This structure is used to hold the arguments that are used when loading
+ * kernel binaries.
+ */
+
+typedef unsigned long kimage_entry_t;
+#define IND_DESTINATION  0x1
+#define IND_INDIRECTION  0x2
+#define IND_DONE         0x4
+#define IND_SOURCE       0x8
+
+#define KEXEC_SEGMENT_MAX 8
+struct kexec_segment {
+	void __user *buf;
+	size_t bufsz;
+	unsigned long mem;	/* User space sees this as a (void *) ... */
+	size_t memsz;
+};
+
+#ifdef CONFIG_COMPAT
+struct compat_kexec_segment {
+	compat_uptr_t buf;
+	compat_size_t bufsz;
+	compat_ulong_t mem;	/* User space sees this as a (void *) ... */
+	compat_size_t memsz;
+};
+#endif
+
+struct kimage {
+	kimage_entry_t head;
+	kimage_entry_t *entry;
+	kimage_entry_t *last_entry;
+
+	unsigned long destination;
+
+	unsigned long start;
+	struct page *control_code_page;
+
+	unsigned long nr_segments;
+	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+
+	struct list_head control_pages;
+	struct list_head dest_pages;
+	struct list_head unuseable_pages;
+
+	/* Address of next control page to allocate for crash kernels. */
+	unsigned long control_page;
+
+	/* Flags to indicate special processing */
+	unsigned int type : 1;
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+};
+
+
+
+/* kexec interface functions */
+extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+extern int machine_kexec_prepare(struct kimage *image);
+extern void machine_kexec_cleanup(struct kimage *image);
+extern asmlinkage long sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments,
+	unsigned long flags);
+#ifdef CONFIG_COMPAT
+extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
+	unsigned long flags);
+#endif
+extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
+extern void crash_kexec(void);
+extern struct kimage *kexec_image;
+
+#define KEXEC_ON_CRASH  0x00000001
+#define KEXEC_ARCH_MASK 0xffff0000
+
+/* These values match the ELF architecture values.
+ * Unless there is a good reason that should continue to be the case.
+ */
+#define KEXEC_ARCH_DEFAULT ( 0 << 16)
+#define KEXEC_ARCH_386     ( 3 << 16)
+#define KEXEC_ARCH_X86_64  (62 << 16)
+#define KEXEC_ARCH_PPC     (20 << 16)
+#define KEXEC_ARCH_PPC64   (21 << 16)
+#define KEXEC_ARCH_IA_64   (50 << 16)
+
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH)  /* List of defined/legal kexec flags */
+
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+
+#else /* !CONFIG_KEXEC */
+static inline void crash_kexec(void) { }
+#endif /* CONFIG_KEXEC */
+#endif /* LINUX_KEXEC_H */
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index d60fafc8bdc5..c5a05e16edb2 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -51,6 +51,9 @@ extern void machine_restart(char *cmd);
 extern void machine_halt(void);
 extern void machine_power_off(void);
 
+extern void machine_shutdown(void);
+extern void machine_crash_shutdown(void);
+
 #endif
 
 #endif /* _LINUX_REBOOT_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c39f6f72cbbc..7ba8f8f747aa 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -159,8 +159,9 @@ asmlinkage long sys_shutdown(int, int);
 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
 				void __user *arg);
 asmlinkage long sys_restart_syscall(void);
-asmlinkage long sys_kexec_load(void *entry, unsigned long nr_segments,
-			struct kexec_segment *segments, unsigned long flags);
+asmlinkage long sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments,
+	unsigned long flags);
 
 asmlinkage long sys_exit(int error_code);
 asmlinkage void sys_exit_group(int error_code);
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cfc8b0dea950 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..def9c73ec9a6
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1036 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/syscalls.h>
+#include <linux/ioport.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/semaphore.h>
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+
+/*
+ * Allocate a kimage and copy in and sanity check its nr_segments segment
+ * descriptors.  Returns 0 with *rimage set to the new image, or frees the
+ * image, leaves *rimage untouched and returns a negative errno.
+ */
+static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments)
+{
+	size_t segment_bytes;
+	struct kimage *image;
+	unsigned long i;
+	int result;
+
+	/* Allocate a controlling structure */
+	result = -ENOMEM;
+	image = kmalloc(sizeof(*image), GFP_KERNEL);
+	if (!image)
+		goto out;
+	memset(image, 0, sizeof(*image));
+	image->head = 0;
+	image->entry = &image->head;
+	image->last_entry = &image->head;
+	image->control_page = ~0; /* By default this does not apply */
+	image->start = entry;
+	image->type = KEXEC_TYPE_DEFAULT;
+
+	/* Initialize the control, destination and unuseable page lists */
+	INIT_LIST_HEAD(&image->control_pages);
+	INIT_LIST_HEAD(&image->dest_pages);
+	INIT_LIST_HEAD(&image->unuseable_pages);
+
+	/* Read in the segments.  copy_from_user() returns the number of
+	 * bytes left uncopied, not an errno, so report -EFAULT ourselves.
+	 */
+	image->nr_segments = nr_segments;
+	segment_bytes = nr_segments * sizeof(*segments);
+	if (copy_from_user(image->segment, segments, segment_bytes)) {
+		result = -EFAULT;
+		goto out;
+	}
+
+	/*
+	 * Verify we have good destination addresses.  The caller is
+	 * responsible for making certain we don't attempt to load
+	 * the new image into invalid or reserved areas of RAM.  This
+	 * just verifies it is an address we can use.
+	 *
+	 * Since the kernel does everything in page size chunks ensure
+	 * the destination addresses are page aligned.  Too many
+	 * special cases crop up when we don't do this.  The most
+	 * insidious is getting overlapping destination addresses
+	 * simply because addresses are changed to page size
+	 * granularity.
+	 */
+	result = -EADDRNOTAVAIL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+			goto out;
+		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+			goto out;
+	}
+
+	/* Verify our destination addresses do not overlap.
+	 * If we allowed overlapping destination addresses
+	 * through, very weird things can happen with no
+	 * easy explanation as one segment stomps on another.
+	 */
+	result = -EINVAL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		unsigned long j;
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		for (j = 0; j < i; j++) {
+			unsigned long pstart, pend;
+			pstart = image->segment[j].mem;
+			pend   = pstart + image->segment[j].memsz;
+			/* Do the segments overlap ? */
+			if ((mend > pstart) && (mstart < pend))
+				goto out;
+		}
+	}
+
+	/* Ensure our buffer sizes are no larger than
+	 * our memory sizes.  This should always be the case,
+	 * and it is easier to check up front than to be surprised
+	 * later on.
+	 */
+	result = -EINVAL;
+	for (i = 0; i < nr_segments; i++) {
+		if (image->segment[i].bufsz > image->segment[i].memsz)
+			goto out;
+	}
+
+	result = 0;
+ out:
+	if (result == 0)
+		*rimage = image;
+	else
+		kfree(image);
+	return result;
+}
+
+/*
+ * Allocate a kimage for a normal (non-crash) kexec together with its
+ * control code pages.  *rimage is only written on success, so a caller
+ * never ends up holding a pointer to an image that was freed in here.
+ */
+static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments)
+{
+	int result;
+	struct kimage *image;
+
+	/* Allocate and initialize a controlling structure */
+	image = NULL;
+	result = do_kimage_alloc(&image, entry, nr_segments, segments);
+	if (result)
+		goto out;
+
+	/* Find a location for the control code buffer, and add it to
+	 * the vector of segments so that its pages will also be
+	 * counted as destination pages.
+	 */
+	result = -ENOMEM;
+	image->control_code_page = kimage_alloc_control_pages(image,
+		get_order(KEXEC_CONTROL_CODE_SIZE));
+	if (!image->control_code_page) {
+		printk(KERN_ERR "Could not allocate control_code_buffer\n");
+		goto out;
+	}
+	result = 0;
+ out:
+	if (result == 0)
+		*rimage = image;
+	else
+		kfree(image);
+	return result;
+}
+
+static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment *segments)
+{
+	int result;
+	struct kimage *image;
+	unsigned long i;
+
+	image = NULL;
+	/* Verify we have a valid entry point */
+	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
+		result = -EADDRNOTAVAIL;
+		goto out;
+	}
+
+	/* Allocate and initialize a controlling structure */
+	result = do_kimage_alloc(&image, entry, nr_segments, segments);
+	if (result) {
+		goto out;
+	}
+
+	/* Enable the special crash kernel control page
+	 * allocation policy.
+	 */
+	image->control_page = crashk_res.start;
+	image->type = KEXEC_TYPE_CRASH;
+
+	/*
+	 * Verify we have good destination addresses.  Normally
+	 * the caller is responsible for making certain we don't
+	 * attempt to load the new image into invalid or reserved
+	 * areas of RAM.  But crash kernels are preloaded into a
+	 * reserved area of ram.  We must ensure the addresses
+	 * are in the reserved area otherwise preloading the
+	 * kernel could corrupt things.
+	 */
+	result = -EADDRNOTAVAIL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = image->segment[i].mem;
+		mend = mstart + image->segment[i].memsz;
+		/* Ensure we are within the crash kernel limits */
+		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
+			goto out;
+	}
+
+
+	/*
+	 * Find a location for the control code buffer, and add it to
+	 * the vector of segments so that its pages will also be
+	 * counted as destination pages.
+	 */
+	result = -ENOMEM;
+	image->control_code_page = kimage_alloc_control_pages(image,
+		get_order(KEXEC_CONTROL_CODE_SIZE));
+	if (!image->control_code_page) {
+		printk(KERN_ERR "Could not allocate control_code_buffer\n");
+		goto out;
+	}
+
+	result = 0;
+ out:
+	if (result == 0) {
+		*rimage = image;
+	} else {
+		kfree(image);
+	}
+	return result;
+}
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend)) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+	struct page *pages;
+	pages = alloc_pages(gfp_mask, order);
+	if (pages) {
+		unsigned int count, i;
+		pages->mapping = NULL;
+		pages->private = order;
+		count = 1 << order;
+		for(i = 0; i < count; i++) {
+			SetPageReserved(pages + i);
+		}
+	}
+	return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+	unsigned int order, count, i;
+	order = page->private;
+	count = 1 << order;
+	for(i = 0; i < count; i++) {
+		ClearPageReserved(page + i);
+	}
+	__free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+	struct list_head *pos, *next;
+	list_for_each_safe(pos, next, list) {
+		struct page *page;
+
+		page = list_entry(pos, struct page, lru);
+		list_del(&page->lru);
+
+		kimage_free_pages(page);
+	}
+}
+
+static struct page *kimage_alloc_normal_control_pages(
+	struct kimage *image, unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place.  As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * At worst this runs in O(N) of the image size.
+	 */
+	struct list_head extra_pages;
+	struct page *pages;
+	unsigned int count;
+
+	count = 1 << order;
+	INIT_LIST_HEAD(&extra_pages);
+
+	/* Loop while I can allocate a page and the page allocated
+	 * is a destination page.
+	 */
+	do {
+		unsigned long pfn, epfn, addr, eaddr;
+		pages = kimage_alloc_pages(GFP_KERNEL, order);
+		if (!pages)
+			break;
+		pfn   = page_to_pfn(pages);
+		epfn  = pfn + count;
+		addr  = pfn << PAGE_SHIFT;
+		eaddr = epfn << PAGE_SHIFT;
+		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+			kimage_is_destination_range(image, addr, eaddr))
+		{
+			list_add(&pages->lru, &extra_pages);
+			pages = NULL;
+		}
+	} while(!pages);
+	if (pages) {
+		/* Remember the allocated page... */
+		list_add(&pages->lru, &image->control_pages);
+
+		/* Because the page is already in its destination
+		 * location we will never allocate another page at
+		 * that address.  Therefore kimage_alloc_pages
+		 * will not return it (again) and we don't need
+		 * to give it an entry in image->segment[].
+		 */
+	}
+	/* Deal with the destination pages I have inadvertently allocated.
+	 *
+	 * Ideally I would convert multi-page allocations into single
+	 * page allocations, and add everything to image->dest_pages.
+	 *
+	 * For now it is simpler to just free the pages.
+	 */
+	kimage_free_page_list(&extra_pages);
+	return pages;
+
+}
+
+static struct page *kimage_alloc_crash_control_pages(
+	struct kimage *image, unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place.  As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * Control pages are also the only pages we must allocate
+	 * when loading a crash kernel.  All of the other pages
+	 * are specified by the segments and we just memcpy
+	 * into them directly.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * Given the low demand this implements a very simple
+	 * allocator that finds the first hole of the appropriate
+	 * size in the reserved memory region, and allocates all
+	 * of the memory up to and including the hole.
+	 */
+	unsigned long hole_start, hole_end, size;
+	struct page *pages;
+	pages = NULL;
+	size = (1 << order) << PAGE_SHIFT;
+	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+	hole_end   = hole_start + size - 1;
+	/* The loop condition keeps the hole inside the reserved region;
+	 * if no hole fits below the control limit, NULL is returned.
+	 */
+	while (hole_end <= crashk_res.end) {
+		unsigned long i;
+		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
+			break;
+		}
+		/* See if I overlap any of the segments */
+		for (i = 0; i < image->nr_segments; i++) {
+			unsigned long mstart, mend;
+			mstart = image->segment[i].mem;
+			mend   = mstart + image->segment[i].memsz - 1;
+			if ((hole_end >= mstart) && (hole_start <= mend)) {
+				/* Advance the hole to the end of the segment */
+				hole_start = (mend + (size - 1)) & ~(size - 1);
+				hole_end   = hole_start + size - 1;
+				break;
+			}
+		}
+		/* If I don't overlap any segments I have found my hole! */
+		if (i == image->nr_segments) {
+			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			break;
+		}
+	}
+	if (pages) {
+		image->control_page = hole_end;
+	}
+	return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(
+	struct kimage *image, unsigned int order)
+{
+	struct page *pages = NULL;
+	switch(image->type) {
+	case KEXEC_TYPE_DEFAULT:
+		pages = kimage_alloc_normal_control_pages(image, order);
+		break;
+	case KEXEC_TYPE_CRASH:
+		pages = kimage_alloc_crash_control_pages(image, order);
+		break;
+	}
+	return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+	if (*image->entry != 0) {
+		image->entry++;
+	}
+	if (image->entry == image->last_entry) {
+		kimage_entry_t *ind_page;
+		struct page *page;
+		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+		if (!page) {
+			return -ENOMEM;
+		}
+		ind_page = page_address(page);
+		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		image->entry = ind_page;
+		image->last_entry =
+			ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+	}
+	*image->entry = entry;
+	image->entry++;
+	*image->entry = 0;
+	return 0;
+}
+
+static int kimage_set_destination(
+	struct kimage *image, unsigned long destination)
+{
+	int result;
+
+	destination &= PAGE_MASK;
+	result = kimage_add_entry(image, destination | IND_DESTINATION);
+	if (result == 0) {
+		image->destination = destination;
+	}
+	return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+	int result;
+
+	page &= PAGE_MASK;
+	result = kimage_add_entry(image, page | IND_SOURCE);
+	if (result == 0) {
+		image->destination += PAGE_SIZE;
+	}
+	return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+	/* Walk through and free any extra destination pages I may have */
+	kimage_free_page_list(&image->dest_pages);
+
+	/* Walk through and free any unuseable pages I have cached */
+	kimage_free_page_list(&image->unuseable_pages);
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+	if (*image->entry != 0) {
+		image->entry++;
+	}
+	*image->entry = IND_DONE;
+	return 0;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+		ptr = (entry & IND_INDIRECTION)? \
+			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+	struct page *page;
+
+	page = pfn_to_page(entry >> PAGE_SHIFT);
+	kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+	kimage_entry_t ind = 0;
+
+	if (!image)
+		return;
+	kimage_free_extra_pages(image);
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_INDIRECTION) {
+			/* Free the previous indirection page */
+			if (ind & IND_INDIRECTION) {
+				kimage_free_entry(ind);
+			}
+			/* Save this indirection page until we are
+			 * done with it.
+			 */
+			ind = entry;
+		}
+		else if (entry & IND_SOURCE) {
+			kimage_free_entry(entry);
+		}
+	}
+	/* Free the final indirection page */
+	if (ind & IND_INDIRECTION) {
+		kimage_free_entry(ind);
+	}
+
+	/* Handle any machine specific cleanup */
+	machine_kexec_cleanup(image);
+
+	/* Free the kexec control pages... */
+	kimage_free_page_list(&image->control_pages);
+	kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+{
+	kimage_entry_t *ptr, entry;
+	unsigned long destination = 0;
+
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_DESTINATION) {
+			destination = entry & PAGE_MASK;
+		}
+		else if (entry & IND_SOURCE) {
+			if (page == destination) {
+				return ptr;
+			}
+			destination += PAGE_SIZE;
+		}
+	}
+	return 0;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+{
+	/*
+	 * Here we implement safeguards to ensure that a source page
+	 * is not copied to its destination page before the data on
+	 * the destination page is no longer useful.
+	 *
+	 * To do this we maintain the invariant that a source page is
+	 * either its own destination page, or it is not a
+	 * destination page at all.
+	 *
+	 * That is slightly stronger than required, but the proof
+	 * that no problems will occur is trivial, and the
+	 * implementation is simple to verify.
+	 *
+	 * When allocating all pages normally this algorithm will run
+	 * in O(N) time, but in the worst case it will run in O(N^2)
+	 * time.   If the runtime is a problem the data structures can
+	 * be fixed.
+	 */
+	struct page *page;
+	unsigned long addr;
+
+	/*
+	 * Walk through the list of destination pages, and see if I
+	 * have a match.
+	 */
+	list_for_each_entry(page, &image->dest_pages, lru) {
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+		if (addr == destination) {
+			list_del(&page->lru);
+			return page;
+		}
+	}
+	page = NULL;
+	while (1) {
+		kimage_entry_t *old;
+
+		/* Allocate a page, if we run out of memory give up */
+		page = kimage_alloc_pages(gfp_mask, 0);
+		if (!page) {
+			return 0;
+		}
+		/* If the page cannot be used file it away */
+		if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+			list_add(&page->lru, &image->unuseable_pages);
+			continue;
+		}
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+
+		/* If it is the destination page we want use it */
+		if (addr == destination)
+			break;
+
+		/* If the page is not a destination page use it */
+		if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+			break;
+
+		/*
+		 * I know that the page is someone's destination page.
+		 * See if there is already a source page for this
+		 * destination page.  And if so swap the source pages.
+		 */
+		old = kimage_dst_used(image, addr);
+		if (old) {
+			/* If so move it */
+			unsigned long old_addr;
+			struct page *old_page;
+
+			old_addr = *old & PAGE_MASK;
+			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			copy_highpage(page, old_page);
+			*old = addr | (*old & ~PAGE_MASK);
+
+			/* The old page I have found cannot be a
+			 * destination page, so return it.
+			 */
+			addr = old_addr;
+			page = old_page;
+			break;
+		}
+		else {
+			/* Place the page on the destination list; I
+			 * will use it later.
+			 */
+			list_add(&page->lru, &image->dest_pages);
+		}
+	}
+	return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{
+	unsigned long maddr;
+	unsigned long ubytes, mbytes;
+	int result;
+	unsigned char *buf;
+
+	result = 0;
+	buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+
+	result = kimage_set_destination(image, maddr);
+	if (result < 0) {
+		goto out;
+	}
+	while(mbytes) {
+		struct page *page;
+		char *ptr;
+		size_t uchunk, mchunk;
+		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+		if (page == 0) {
+			result  = -ENOMEM;
+			goto out;
+		}
+		result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
+		if (result < 0) {
+			goto out;
+		}
+		ptr = kmap(page);
+		/* Start with a clear page */
+		memset(ptr, 0, PAGE_SIZE);
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+		if (mchunk > mbytes) {
+			mchunk = mbytes;
+		}
+		uchunk = mchunk;
+		if (uchunk > ubytes) {
+			uchunk = ubytes;
+		}
+		result = copy_from_user(ptr, buf, uchunk);
+		kunmap(page);
+		if (result) {
+			result = (result < 0) ? result : -EIO;
+			goto out;
+		}
+		ubytes -= uchunk;
+		maddr  += mchunk;
+		buf    += mchunk;
+		mbytes -= mchunk;
+	}
+ out:
+	return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{
+	/* For crash dump kernels we simply copy the data from
+	 * user space to its destination.
+	 * We do things a page at a time for the sake of kmap.
+	 */
+	unsigned long maddr;
+	unsigned long ubytes, mbytes;
+	int result;
+	unsigned char *buf;
+
+	result = 0;
+	buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+	while(mbytes) {
+		struct page *page;
+		char *ptr;
+		size_t uchunk, mchunk;
+		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		if (page == 0) {
+			result  = -ENOMEM;
+			goto out;
+		}
+		ptr = kmap(page);
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+		if (mchunk > mbytes) {
+			mchunk = mbytes;
+		}
+		uchunk = mchunk;
+		if (uchunk > ubytes) {
+			uchunk = ubytes;
+			/* Zero the trailing part of the page */
+			memset(ptr + uchunk, 0, mchunk - uchunk);
+		}
+		result = copy_from_user(ptr, buf, uchunk);
+		kunmap(page);
+		if (result) {
+			result = (result < 0) ? result : -EIO;
+			goto out;
+		}
+		ubytes -= uchunk;
+		maddr  += mchunk;
+		buf    += mchunk;
+		mbytes -= mchunk;
+	}
+ out:
+	return result;
+}
+
+static int kimage_load_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{
+	int result = -ENOMEM;
+	switch(image->type) {
+	case KEXEC_TYPE_DEFAULT:
+		result = kimage_load_normal_segment(image, segment);
+		break;
+	case KEXEC_TYPE_CRASH:
+		result = kimage_load_crash_segment(image, segment);
+		break;
+	}
+	return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down.  Preventing on-going dmas, and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to its final destination.  And
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image = NULL;
+static struct kimage *kexec_crash_image = NULL;
+/*
+ * A home grown binary mutex.
+ * Nothing can wait so this mutex is safe to use
+ * in interrupt context :)
+ */
+static int kexec_lock = 0;
+
+asmlinkage long sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments,
+	unsigned long flags)
+{
+	struct kimage **dest_image, *image;
+	int locked;
+	int result;
+
+	/* We only trust the superuser with rebooting the system. */
+	if (!capable(CAP_SYS_BOOT))
+		return -EPERM;
+
+	/*
+	 * Verify we have a legal set of flags
+	 * This leaves us room for future extensions.
+	 */
+	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+		return -EINVAL;
+
+	/* Verify we are on the appropriate architecture */
+	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+	{
+		return -EINVAL;
+	}
+
+	/* Put an artificial cap on the number
+	 * of segments passed to kexec_load.
+	 */
+	if (nr_segments > KEXEC_SEGMENT_MAX)
+		return -EINVAL;
+
+	image = NULL;
+	result = 0;
+
+	/* Because we write directly to the reserved memory
+	 * region when loading crash kernels we need a mutex here to
+	 * prevent multiple crash kernels from attempting to load
+	 * simultaneously, and to prevent a crash kernel from loading
+	 * over the top of an in-use crash kernel.
+	 *
+	 * KISS: always take the mutex.
+	 */
+	locked = xchg(&kexec_lock, 1);
+	if (locked) {
+		return -EBUSY;
+	}
+	dest_image = &kexec_image;
+	if (flags & KEXEC_ON_CRASH) {
+		dest_image = &kexec_crash_image;
+	}
+	if (nr_segments > 0) {
+		unsigned long i;
+		/* Loading another kernel to reboot into */
+		if ((flags & KEXEC_ON_CRASH) == 0) {
+			result = kimage_normal_alloc(&image, entry, nr_segments, segments);
+		}
+		/* Loading another kernel to switch to if this one crashes */
+		else if (flags & KEXEC_ON_CRASH) {
+			/* Free any current crash dump kernel before
+			 * we corrupt it.
+			 */
+			kimage_free(xchg(&kexec_crash_image, NULL));
+			result = kimage_crash_alloc(&image, entry, nr_segments, segments);
+		}
+		if (result) {
+			goto out;
+		}
+		result = machine_kexec_prepare(image);
+		if (result) {
+			goto out;
+		}
+		for(i = 0; i < nr_segments; i++) {
+			result = kimage_load_segment(image, &image->segment[i]);
+			if (result) {
+				goto out;
+			}
+		}
+		result = kimage_terminate(image);
+		if (result) {
+			goto out;
+		}
+	}
+	/* Install the new kernel, and uninstall the old */
+	image = xchg(dest_image, image);
+
+ out:
+	xchg(&kexec_lock, 0); /* Release the mutex */
+	kimage_free(image);
+	return result;
+}
+
+#ifdef CONFIG_COMPAT
+asmlinkage long compat_sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
+	unsigned long flags)
+{
+	struct compat_kexec_segment in;
+	struct kexec_segment out, __user *ksegments;
+	unsigned long i, result;
+
+	/* Don't allow clients that don't understand the native
+	 * architecture to do anything.
+	 */
+	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) {
+		return -EINVAL;
+	}
+
+	if (nr_segments > KEXEC_SEGMENT_MAX) {
+		return -EINVAL;
+	}
+
+	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+	for (i=0; i < nr_segments; i++) {
+		result = copy_from_user(&in, &segments[i], sizeof(in));
+		if (result) {
+			return -EFAULT;
+		}
+
+		out.buf   = compat_ptr(in.buf);
+		out.bufsz = in.bufsz;
+		out.mem   = in.mem;
+		out.memsz = in.memsz;
+
+		result = copy_to_user(&ksegments[i], &out, sizeof(out));
+		if (result) {
+			return -EFAULT;
+		}
+	}
+
+	return sys_kexec_load(entry, nr_segments, ksegments, flags);
+}
+#endif
+
+void crash_kexec(void)
+{
+	struct kimage *image;
+	int locked;
+
+
+	/* Take the kexec_lock here to prevent sys_kexec_load
+	 * running on one cpu from replacing the crash kernel
+	 * we are using after a panic on a different cpu.
+	 *
+	 * If the crash kernel was not located in a fixed area
+	 * of memory the xchg(&kexec_crash_image) would be
+	 * sufficient.  But since I reuse the memory...
+	 */
+	locked = xchg(&kexec_lock, 1);
+	if (!locked) {
+		image = xchg(&kexec_crash_image, NULL);
+		if (image) {
+			machine_crash_shutdown();
+			machine_kexec(image);
+		}
+		xchg(&kexec_lock, 0);
+	}
+}
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..66f43d33cd80 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
 #include <linux/sysrq.h>
 #include <linux/interrupt.h>
 #include <linux/nmi.h>
+#include <linux/kexec.h>
 
 int panic_timeout;
 int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
         unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
 
+	/*
+	 * It's possible to come here directly from a panic-assertion and not
+	 * have preempt disabled. Some functions called from here want
+	 * preempt to be disabled. No point enabling it later though...
+	 */
+	preempt_disable();
+
 	bust_spinlocks(1);
 	va_start(args, fmt);
 	vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
 	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
 	bust_spinlocks(0);
 
+	/*
+	 * If we have crashed and we have a crash kernel loaded let it handle
+	 * everything else.
+	 * Do we want to call this before we try to display a message?
+	 */
+	crash_kexec();
+
 #ifdef CONFIG_SMP
+	/*
+	 * Note smp_send_stop is the usual smp shutdown function, which
+	 * unfortunately means it may not be hardened to work in a panic
+	 * situation.
+	 */
 	smp_send_stop();
 #endif
 
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	if (!panic_blink)
 		panic_blink = no_blink;
 
-	if (panic_timeout > 0)
-	{
+	if (panic_timeout > 0) {
 		/*
 	 	 * Delay timeout seconds before rebooting the machine. 
 		 * We can't use the "normal" timers since we just panicked..
diff --git a/kernel/sys.c b/kernel/sys.c
index dac10161ca23..9a24374c23bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
 #include <linux/workqueue.h>
 #include <linux/device.h>
 #include <linux/key.h>
@@ -439,6 +441,24 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		machine_restart(buffer);
 		break;
 
+#ifdef CONFIG_KEXEC
+	case LINUX_REBOOT_CMD_KEXEC:
+	{
+		struct kimage *image;
+		image = xchg(&kexec_image, 0);
+		if (!image) {
+			unlock_kernel();
+			return -EINVAL;
+		}
+		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+		system_state = SYSTEM_RESTART;
+		device_shutdown();
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_shutdown();
+		machine_kexec(image);
+		break;
+	}
+#endif
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 		{
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6f15bea7d1a8..29196ce9b40f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
 cond_syscall(sys_lookup_dcookie);
 cond_syscall(sys_swapon);
 cond_syscall(sys_swapoff);
+cond_syscall(sys_kexec_load);
+cond_syscall(compat_sys_kexec_load);
 cond_syscall(sys_init_module);
 cond_syscall(sys_delete_module);
 cond_syscall(sys_socketpair);
-- 
cgit v1.3-14-g43fede


From 50cccc699ed849d31c9e3f7643db33edade20e4e Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:57:55 -0700
Subject: [PATCH] Kexec on panic vmlinux initrd fix

This is a minor bug fix in kexec to resolve the problem of loading panic
kernel with initrd.

o Problem: Loading a capture kernel fails if initrd is also being loaded.
  This has been observed for vmlinux image for kexec on panic case.

o This patch fixes the problem by making a minor correction to the segment
  location and size verification logic: the segment memory end (mend) should
  be mstart + memsz - 1. This one-byte offset was the source of the failure
  for an initrd that was being loaded at a hole boundary.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index def9c73ec9a6..a0411b3bd54a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -264,7 +264,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
 		mstart = image->segment[i].mem;
-		mend = mstart + image->segment[i].memsz;
+		mend = mstart + image->segment[i].memsz - 1;
 		/* Ensure we are within the crash kernel limits */
 		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 			goto out;
-- 
cgit v1.3-14-g43fede


From 625f1c8219d95300ed32e4c67eb62a50ded095ba Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:12 -0700
Subject: [PATCH] Kdump: Export crash notes section address through sysfs

o Following patch exports kexec global variable "crash_notes" to user space
  through sysfs as kernel attribute in /sys/kernel.

Signed-off-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c   |  2 --
 arch/x86_64/kernel/crash.c |  3 ---
 include/asm-i386/kexec.h   |  5 +++++
 include/asm-x86_64/kexec.h |  5 +++++
 kernel/ksysfs.c            | 13 +++++++++++++
 5 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 59b92d217464..3645ad7ac200 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -26,8 +26,6 @@
 #include <asm/apic.h>
 #include <mach_ipi.h>
 
-#define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
 
 note_buf_t crash_notes[NR_CPUS];
 
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 7caf8a49d0cb..6183bcb85257 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -20,9 +20,6 @@
 #include <asm/nmi.h>
 #include <asm/hw_irq.h>
 
-#define MAX_NOTE_BYTES 1024
-typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
-
 note_buf_t crash_notes[NR_CPUS];
 
 void machine_crash_shutdown(void)
diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h
index a1599b55d62d..6ed2a03e37b3 100644
--- a/include/asm-i386/kexec.h
+++ b/include/asm-i386/kexec.h
@@ -25,4 +25,9 @@
 /* The native architecture */
 #define KEXEC_ARCH KEXEC_ARCH_386
 
+#define MAX_NOTE_BYTES 1024
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+
+extern note_buf_t crash_notes[];
+
 #endif /* _I386_KEXEC_H */
diff --git a/include/asm-x86_64/kexec.h b/include/asm-x86_64/kexec.h
index dc33646dc7dd..42d2ff15c592 100644
--- a/include/asm-x86_64/kexec.h
+++ b/include/asm-x86_64/kexec.h
@@ -25,4 +25,9 @@
 /* The native architecture */
 #define KEXEC_ARCH KEXEC_ARCH_X86_64
 
+#define MAX_NOTE_BYTES 1024
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+
+extern note_buf_t crash_notes[];
+
 #endif /* _X86_64_KEXEC_H */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1f064a63f8cf..015fb69ad94d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,12 +30,25 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
 KERNEL_ATTR_RO(hotplug_seqnum);
 #endif
 
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
+{
+	return sprintf(page, "%p\n", (void *)crash_notes);
+}
+KERNEL_ATTR_RO(crash_notes);
+#endif
+
 decl_subsys(kernel, NULL, NULL);
 EXPORT_SYMBOL_GPL(kernel_subsys);
 
 static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_HOTPLUG
 	&hotplug_seqnum_attr.attr,
+#endif
+#ifdef CONFIG_KEXEC
+	&crash_notes_attr.attr,
 #endif
 	NULL
 };
-- 
cgit v1.3-14-g43fede


From 60e64d46a58236e3c718074372cab6a5b56a3b15 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:19 -0700
Subject: [PATCH] kdump: Routines for copying dump pages

This patch provides the interfaces necessary to read the dump contents,
treating it as a high memory device.

Signed off by Hariprasad Nellitheertha <hari@in.ibm.com>
Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/highmem.c     | 18 +++++++++++++++++
 include/asm-i386/highmem.h |  1 +
 include/linux/crash_dump.h | 13 ++++++++++++
 include/linux/highmem.h    |  1 +
 kernel/Makefile            |  1 +
 kernel/crash_dump.c        | 49 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 83 insertions(+)
 create mode 100644 include/linux/crash_dump.h
 create mode 100644 kernel/crash_dump.c

(limited to 'kernel')

diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index 4b7aaf99d7ea..b6eb4dcb8777 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -75,6 +75,24 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 	preempt_check_resched();
 }
 
+/* This is the same as kmap_atomic() but can map memory that doesn't
+ * have a struct page associated with it.
+ */
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+{
+	enum fixed_addresses idx;
+	unsigned long vaddr;
+
+	inc_preempt_count();
+
+	idx = type + KM_TYPE_NR*smp_processor_id();
+	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+	set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
+	__flush_tlb_one(vaddr);
+
+	return (void*) vaddr;
+}
+
 struct page *kmap_atomic_to_page(void *ptr)
 {
 	unsigned long idx, vaddr = (unsigned long)ptr;
diff --git a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h
index 1df42bf347df..0fd331306b60 100644
--- a/include/asm-i386/highmem.h
+++ b/include/asm-i386/highmem.h
@@ -70,6 +70,7 @@ void *kmap(struct page *page);
 void kunmap(struct page *page);
 void *kmap_atomic(struct page *page, enum km_type type);
 void kunmap_atomic(void *kvaddr, enum km_type type);
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
 struct page *kmap_atomic_to_page(void *ptr);
 
 #define flush_cache_kmaps()	do { } while (0)
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
new file mode 100644
index 000000000000..7d983b817429
--- /dev/null
+++ b/include/linux/crash_dump.h
@@ -0,0 +1,13 @@
+#ifndef LINUX_CRASH_DUMP_H
+#define LINUX_CRASH_DUMP_H
+
+#ifdef CONFIG_CRASH_DUMP
+#include <linux/kexec.h>
+#include <linux/smp_lock.h>
+#include <linux/device.h>
+#include <linux/proc_fs.h>
+
+extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
+						unsigned long, int);
+#endif /* CONFIG_CRASH_DUMP */
+#endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 2a7e6c65c882..6bece9280eb7 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -28,6 +28,7 @@ static inline void *kmap(struct page *page)
 
 #define kmap_atomic(page, idx)		page_address(page)
 #define kunmap_atomic(addr, idx)	do { } while (0)
+#define kmap_atomic_pfn(pfn, idx)	page_address(pfn_to_page(pfn))
 #define kmap_atomic_to_page(ptr)	virt_to_page(ptr)
 
 #endif /* CONFIG_HIGHMEM */
diff --git a/kernel/Makefile b/kernel/Makefile
index cfc8b0dea950..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5a1e6d5d203e
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,49 @@
+/*
+ *	kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ *	Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *	Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+/*
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+				size_t csize, unsigned long offset, int userbuf)
+{
+	void *page, *vaddr;
+
+	if (!csize)
+		return 0;
+
+	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+	copy_page(page, vaddr);
+	kunmap_atomic(vaddr, KM_PTE0);
+
+	if (userbuf) {
+		if (copy_to_user(buf, (page + offset), csize)) {
+			kfree(page);
+			return -EFAULT;
+		}
+	} else {
+		memcpy(buf, (page + offset), csize);
+	}
+
+	kfree(page);
+	return csize;
+}
-- 
cgit v1.3-14-g43fede


From 2030eae52b416a9a9f0ffda74c982b7f1e19496d Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:20 -0700
Subject: [PATCH] Retrieve elfcorehdr address from command line

This patch adds support for retrieving the address of elf core header if one
is passed in command line.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt | 4 ++++
 arch/i386/kernel/setup.c            | 8 ++++++++
 include/linux/crash_dump.h          | 1 +
 kernel/crash_dump.c                 | 3 +++
 4 files changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 560ff5ae3fd9..f44bb5567c5b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -451,6 +451,10 @@ running once the system is up.
 			Format: {"as"|"cfq"|"deadline"|"noop"}
 			See Documentation/block/as-iosched.txt
 			and Documentation/block/deadline-iosched.txt for details.
+	elfcorehdr=	[IA-32]
+			Specifies physical address of start of kernel core image
+			elf header.
+			See Documentation/kdump.txt for details.
 
 	enforcing	[SELINUX] Set initial enforcing status.
 			Format: {"0" | "1"}
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 8d58a053e12e..7306353c520e 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -44,6 +44,7 @@
 #include <linux/edd.h>
 #include <linux/nodemask.h>
 #include <linux/kexec.h>
+#include <linux/crash_dump.h>
 
 #include <video/edid.h>
 
@@ -881,6 +882,13 @@ static void __init parse_cmdline_early (char ** cmdline_p)
 			}
 		}
 #endif
+#ifdef CONFIG_CRASH_DUMP
+		/* elfcorehdr= specifies the location of elf core header
+		 * stored by the crashed kernel.
+		 */
+		else if (!memcmp(from, "elfcorehdr=", 11))
+			elfcorehdr_addr = memparse(from+11, &from);
+#endif
 
 		/*
 		 * highmem=size forces highmem to be exactly 'size' bytes.
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 7d983b817429..3f25fd1eaa4b 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -7,6 +7,7 @@
 #include <linux/device.h>
 #include <linux/proc_fs.h>
 
+extern unsigned long long elfcorehdr_addr;
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
 #endif /* CONFIG_CRASH_DUMP */
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 5a1e6d5d203e..10b966c3744c 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -15,6 +15,9 @@
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr;
+
 /*
  * Copy a page from "oldmem". For this page, there is no pte mapped
  * in the current kernel. We stitch up a pte, similar to kmap_atomic.
-- 
cgit v1.3-14-g43fede


From 666bfddbe8b8fd4fd44617d6c55193d5ac7edb29 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:21 -0700
Subject: [PATCH] kdump: Access dump file in elf format (/proc/vmcore)

From: "Vivek Goyal" <vgoyal@in.ibm.com>

o Support for /proc/vmcore interface. This interface exports elf core image
  either in ELF32 or ELF64 format, depending on the format in which elf headers
  have been stored by crashed kernel.
o Added support for CONFIG_VMCORE config option.
o Removed the dependency on /proc/kcore.

From: "Eric W. Biederman" <ebiederm@xmission.com>

This patch has been refactored to more closely match the prevailing style in
the affected files, and to clearly indicate the dependency between
/proc/kcore and fs/proc/vmcore.c.

From: Hariprasad Nellitheertha <hari@in.ibm.com>

This patch contains the code that provides an ELF format interface to the
previous kernel's memory post kexec reboot.

Signed off by Hariprasad Nellitheertha <hari@in.ibm.com>
Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig                 |   6 +
 fs/proc/Makefile           |   1 +
 fs/proc/proc_misc.c        |   6 +
 fs/proc/vmcore.c           | 451 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/crash_dump.h |   4 +
 include/linux/proc_fs.h    |   7 +
 kernel/crash_dump.c        |   2 +-
 7 files changed, 476 insertions(+), 1 deletion(-)
 create mode 100644 fs/proc/vmcore.c

(limited to 'kernel')

diff --git a/fs/Kconfig b/fs/Kconfig
index 8157f2e2d515..062177956239 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -734,6 +734,12 @@ config PROC_KCORE
 	bool "/proc/kcore support" if !ARM
 	depends on PROC_FS && MMU
 
+config PROC_VMCORE
+        bool "/proc/vmcore support (EXPERIMENTAL)"
+        depends on PROC_FS && EMBEDDED && EXPERIMENTAL && CRASH_DUMP
+        help
+        Exports the dump image of crashed kernel in ELF format.
+
 config SYSFS
 	bool "sysfs file system support" if EMBEDDED
 	default y
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 738b9b602932..7431d7ba2d09 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -11,4 +11,5 @@ proc-y       += inode.o root.o base.o generic.o array.o \
 		kmsg.o proc_tty.o proc_misc.o
 
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
+proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PROC_DEVICETREE)	+= proc_devtree.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 94b570ad037d..a3453555a94e 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -44,6 +44,7 @@
 #include <linux/jiffies.h>
 #include <linux/sysrq.h>
 #include <linux/vmalloc.h>
+#include <linux/crash_dump.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
@@ -618,6 +619,11 @@ void __init proc_misc_init(void)
 				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
 	}
 #endif
+#ifdef CONFIG_PROC_VMCORE
+	proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL);
+	if (proc_vmcore)
+		proc_vmcore->proc_fops = &proc_vmcore_operations;
+#endif
 #ifdef CONFIG_MAGIC_SYSRQ
 	entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL);
 	if (entry)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
new file mode 100644
index 000000000000..8ad467855845
--- /dev/null
+++ b/fs/proc/vmcore.c
@@ -0,0 +1,451 @@
+/*
+ *	fs/proc/vmcore.c Interface for accessing the crash
+ * 				 dump from the system's previous life.
+ * 	Heavily borrowed from fs/proc/kcore.c
+ *	Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *	Copyright (C) IBM Corporation, 2004. All rights reserved
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/proc_fs.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/crash_dump.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+/* List representing chunks of contiguous memory areas and their offsets in
+ * vmcore file.
+ */
+static LIST_HEAD(vmcore_list);
+
+/* Stores the pointer to the buffer containing kernel elf core headers. */
+static char *elfcorebuf;
+static size_t elfcorebuf_sz;
+
+/* Total size of vmcore file. */
+static u64 vmcore_size;
+
+struct proc_dir_entry *proc_vmcore = NULL;
+
+/* Reads a page from the oldmem device from given offset. */
+static ssize_t read_from_oldmem(char *buf, size_t count,
+			     loff_t *ppos, int userbuf)
+{
+	unsigned long pfn, offset;
+	size_t nr_bytes;
+	ssize_t read = 0, tmp;
+
+	if (!count)
+		return 0;
+
+	offset = (unsigned long)(*ppos % PAGE_SIZE);
+	pfn = (unsigned long)(*ppos / PAGE_SIZE);
+	if (pfn > saved_max_pfn)
+		return -EINVAL;
+
+	do {
+		if (count > (PAGE_SIZE - offset))
+			nr_bytes = PAGE_SIZE - offset;
+		else
+			nr_bytes = count;
+
+		tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf);
+		if (tmp < 0)
+			return tmp;
+		*ppos += nr_bytes;
+		count -= nr_bytes;
+		buf += nr_bytes;
+		read += nr_bytes;
+		++pfn;
+		offset = 0;
+	} while (count);
+
+	return read;
+}
+
+/* Maps vmcore file offset to respective physical address in memroy. */
+static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
+					struct vmcore **m_ptr)
+{
+	struct vmcore *m;
+	u64 paddr;
+
+	list_for_each_entry(m, vc_list, list) {
+		u64 start, end;
+		start = m->offset;
+		end = m->offset + m->size - 1;
+		if (offset >= start && offset <= end) {
+			paddr = m->paddr + offset - start;
+			*m_ptr = m;
+			return paddr;
+		}
+	}
+	*m_ptr = NULL;
+	return 0;
+}
+
+/* Read from the ELF header and then the crash dump. On error, negative value is
+ * returned otherwise number of bytes read are returned.
+ */
+static ssize_t read_vmcore(struct file *file, char __user *buffer,
+				size_t buflen, loff_t *fpos)
+{
+	ssize_t acc = 0, tmp;
+	size_t tsz, nr_bytes;
+	u64 start;
+	struct vmcore *curr_m = NULL;
+
+	if (buflen == 0 || *fpos >= vmcore_size)
+		return 0;
+
+	/* trim buflen to not go beyond EOF */
+	if (buflen > vmcore_size - *fpos)
+		buflen = vmcore_size - *fpos;
+
+	/* Read ELF core header */
+	if (*fpos < elfcorebuf_sz) {
+		tsz = elfcorebuf_sz - *fpos;
+		if (buflen < tsz)
+			tsz = buflen;
+		if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
+			return -EFAULT;
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	}
+
+	start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);
+	if (!curr_m)
+        	return -EINVAL;
+	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+		tsz = buflen;
+
+	/* Calculate left bytes in current memory segment. */
+	nr_bytes = (curr_m->size - (start - curr_m->paddr));
+	if (tsz > nr_bytes)
+		tsz = nr_bytes;
+
+	while (buflen) {
+		tmp = read_from_oldmem(buffer, tsz, &start, 1);
+		if (tmp < 0)
+			return tmp;
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+		if (start >= (curr_m->paddr + curr_m->size)) {
+			if (curr_m->list.next == &vmcore_list)
+				return acc;	/*EOF*/
+			curr_m = list_entry(curr_m->list.next,
+						struct vmcore, list);
+			start = curr_m->paddr;
+		}
+		if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+			tsz = buflen;
+		/* Calculate left bytes in current memory segment. */
+		nr_bytes = (curr_m->size - (start - curr_m->paddr));
+		if (tsz > nr_bytes)
+			tsz = nr_bytes;
+	}
+	return acc;
+}
+
+static int open_vmcore(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+struct file_operations proc_vmcore_operations = {
+	.read		= read_vmcore,
+	.open		= open_vmcore,
+};
+
+static struct vmcore* __init get_new_element(void)
+{
+	struct vmcore *p;
+
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p)
+		memset(p, 0, sizeof(*p));
+	return p;
+}
+
+static u64 __init get_vmcore_size_elf64(char *elfptr)
+{
+	int i;
+	u64 size;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr *phdr_ptr;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+	size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
+	for (i = 0; i < ehdr_ptr->e_phnum; i++) {
+		size += phdr_ptr->p_memsz;
+		phdr_ptr++;
+	}
+	return size;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+						struct list_head *vc_list)
+{
+	int i, nr_ptnote=0, rc=0;
+	char *tmp;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr phdr, *phdr_ptr;
+	Elf64_Nhdr *nhdr_ptr;
+	u64 phdr_sz = 0, note_off;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		int j;
+		void *notes_section;
+		struct vmcore *new;
+		u64 offset, max_sz, sz, real_sz = 0;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		nr_ptnote++;
+		max_sz = phdr_ptr->p_memsz;
+		offset = phdr_ptr->p_offset;
+		notes_section = kmalloc(max_sz, GFP_KERNEL);
+		if (!notes_section)
+			return -ENOMEM;
+		rc = read_from_oldmem(notes_section, max_sz, &offset, 0);
+		if (rc < 0) {
+			kfree(notes_section);
+			return rc;
+		}
+		nhdr_ptr = notes_section;
+		for (j = 0; j < max_sz; j += sz) {
+			if (nhdr_ptr->n_namesz == 0)
+				break;
+			sz = sizeof(Elf64_Nhdr) +
+				((nhdr_ptr->n_namesz + 3) & ~3) +
+				((nhdr_ptr->n_descsz + 3) & ~3);
+			real_sz += sz;
+			nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
+		}
+
+		/* Add this contiguous chunk of notes section to vmcore list.*/
+		new = get_new_element();
+		if (!new) {
+			kfree(notes_section);
+			return -ENOMEM;
+		}
+		new->paddr = phdr_ptr->p_offset;
+		new->size = real_sz;
+		list_add_tail(&new->list, vc_list);
+		phdr_sz += real_sz;
+		kfree(notes_section);
+	}
+
+	/* Prepare merged PT_NOTE program header. */
+	phdr.p_type    = PT_NOTE;
+	phdr.p_flags   = 0;
+	note_off = sizeof(Elf64_Ehdr) +
+			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
+	phdr.p_offset  = note_off;
+	phdr.p_vaddr   = phdr.p_paddr = 0;
+	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
+	phdr.p_align   = 0;
+
+	/* Add merged PT_NOTE program header*/
+	tmp = elfptr + sizeof(Elf64_Ehdr);
+	memcpy(tmp, &phdr, sizeof(phdr));
+	tmp += sizeof(phdr);
+
+	/* Remove unwanted PT_NOTE program headers. */
+	i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
+	*elfsz = *elfsz - i;
+	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+
+	/* Modify e_phnum to reflect merged headers. */
+	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+
+	return 0;
+}
+
+/* Add memory chunks represented by program headers to vmcore list. Also update
+ * the new offset fields of exported program headers. */
+static int __init process_ptload_program_headers_elf64(char *elfptr,
+						size_t elfsz,
+						struct list_head *vc_list)
+{
+	int i;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr *phdr_ptr;
+	loff_t vmcore_off;
+	struct vmcore *new;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
+
+	/* First program header is PT_NOTE header. */
+	vmcore_off = sizeof(Elf64_Ehdr) +
+			(ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
+			phdr_ptr->p_memsz; /* Note sections */
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		if (phdr_ptr->p_type != PT_LOAD)
+			continue;
+
+		/* Add this contiguous chunk of memory to vmcore list.*/
+		new = get_new_element();
+		if (!new)
+			return -ENOMEM;
+		new->paddr = phdr_ptr->p_offset;
+		new->size = phdr_ptr->p_memsz;
+		list_add_tail(&new->list, vc_list);
+
+		/* Update the program header offset. */
+		phdr_ptr->p_offset = vmcore_off;
+		vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+	}
+	return 0;
+}
+
+/* Sets offset fields of vmcore elements. */
+static void __init set_vmcore_list_offsets_elf64(char *elfptr,
+						struct list_head *vc_list)
+{
+	loff_t vmcore_off;
+	Elf64_Ehdr *ehdr_ptr;
+	struct vmcore *m;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+	/* Skip Elf header and program headers. */
+	vmcore_off = sizeof(Elf64_Ehdr) +
+			(ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
+
+	list_for_each_entry(m, vc_list, list) {
+		m->offset = vmcore_off;
+		vmcore_off += m->size;
+	}
+}
+
+static int __init parse_crash_elf64_headers(void)
+{
+	int rc=0;
+	Elf64_Ehdr ehdr;
+	u64 addr;
+
+	addr = elfcorehdr_addr;
+
+	/* Read Elf header */
+	rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0);
+	if (rc < 0)
+		return rc;
+
+	/* Do some basic Verification. */
+	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
+		(ehdr.e_type != ET_CORE) ||
+		!elf_check_arch(&ehdr) ||
+		ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
+		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
+		ehdr.e_version != EV_CURRENT ||
+		ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||
+		ehdr.e_phentsize != sizeof(Elf64_Phdr) ||
+		ehdr.e_phnum == 0) {
+		printk(KERN_WARNING "Warning: Core image elf header is not"
+					"sane\n");
+		return -EINVAL;
+	}
+
+	/* Read in all elf headers. */
+	elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr);
+	elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+	if (!elfcorebuf)
+		return -ENOMEM;
+	addr = elfcorehdr_addr;
+	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
+	if (rc < 0) {
+		kfree(elfcorebuf);
+		return rc;
+	}
+
+	/* Merge all PT_NOTE headers into one. */
+	rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
+	if (rc) {
+		kfree(elfcorebuf);
+		return rc;
+	}
+	rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
+							&vmcore_list);
+	if (rc) {
+		kfree(elfcorebuf);
+		return rc;
+	}
+	set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
+	return 0;
+}
+
+static int __init parse_crash_elf_headers(void)
+{
+	unsigned char e_ident[EI_NIDENT];
+	u64 addr;
+	int rc=0;
+
+	addr = elfcorehdr_addr;
+	rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0);
+	if (rc < 0)
+		return rc;
+	if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
+		printk(KERN_WARNING "Warning: Core image elf header"
+					" not found\n");
+		return -EINVAL;
+	}
+
+	if (e_ident[EI_CLASS] == ELFCLASS64) {
+		rc = parse_crash_elf64_headers();
+		if (rc)
+			return rc;
+
+		/* Determine vmcore size. */
+		vmcore_size = get_vmcore_size_elf64(elfcorebuf);
+	} else {
+		printk(KERN_WARNING "Warning: Core image elf header is not"
+					" sane\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* Init function for vmcore module. */
+static int __init vmcore_init(void)
+{
+	int rc = 0;
+
+	/* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
+	if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX))
+		return rc;
+	rc = parse_crash_elf_headers();
+	if (rc) {
+		printk(KERN_WARNING "Kdump: vmcore not initialized\n");
+		return rc;
+	}
+
+	/* Initialize /proc/vmcore size if proc is already up. */
+	if (proc_vmcore)
+		proc_vmcore->size = vmcore_size;
+	return 0;
+}
+module_init(vmcore_init)
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 3f25fd1eaa4b..534d750d922d 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -7,8 +7,12 @@
 #include <linux/device.h>
 #include <linux/proc_fs.h>
 
+#define ELFCORE_ADDR_MAX	(-1ULL)
 extern unsigned long long elfcorehdr_addr;
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
+extern struct file_operations proc_vmcore_operations;
+extern struct proc_dir_entry *proc_vmcore;
+
 #endif /* CONFIG_CRASH_DUMP */
 #endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 59e505261fd6..0563581e3a02 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -74,6 +74,13 @@ struct kcore_list {
 	size_t size;
 };
 
+struct vmcore {
+	struct list_head list;
+	unsigned long long paddr;
+	unsigned long size;
+	loff_t offset;
+};
+
 #ifdef CONFIG_PROC_FS
 
 extern struct proc_dir_entry proc_root;
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 10b966c3744c..459ba49e376a 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -16,7 +16,7 @@
 #include <asm/uaccess.h>
 
 /* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr;
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
 
 /*
  * Copy a page from "oldmem". For this page, there is no pte mapped
-- 
cgit v1.3-14-g43fede


From 6e274d144302068a00794ec22e73520c0615cb6f Mon Sep 17 00:00:00 2001
From: Alexander Nyberg <alexn@telia.com>
Date: Sat, 25 Jun 2005 14:58:26 -0700
Subject: [PATCH] kdump: Use real pt_regs from exception

Makes kexec_crashdump() take a pt_regs * as an argument.  This allows the
exact register state at the point of the crash to be captured.  If we come
from a direct panic assertion, NULL will be passed and the current registers
are saved before the crash dump.

This hooks into two places:
die(): check the conditions under which we will panic when calling
do_exit and go there directly with the pt_regs that caused the fatal
fault.

die_nmi(): If we receive an NMI lockup while in the kernel, use the
pt_regs and go directly to crash_kexec(). We're probably badly nested
at this point, so this might be the only chance to escape with proper
information.

Signed-off-by: Alexander Nyberg <alexn@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c          | 36 ++++++++++++++++++++++++------------
 arch/i386/kernel/traps.c          | 17 +++++++++++++++++
 arch/ppc/kernel/machine_kexec.c   |  2 +-
 arch/ppc64/kernel/machine_kexec.c |  2 +-
 arch/s390/kernel/crash.c          |  2 +-
 arch/x86_64/kernel/crash.c        |  2 +-
 drivers/char/sysrq.c              |  2 +-
 include/linux/kexec.h             |  8 ++++++--
 include/linux/reboot.h            |  3 ++-
 kernel/kexec.c                    | 13 +++++++++++--
 kernel/panic.c                    |  2 +-
 11 files changed, 66 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index a021681d21f8..8bdb4b6af0ff 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -100,12 +100,31 @@ static void crash_get_current_regs(struct pt_regs *regs)
 	regs->eip = (unsigned long)current_text_addr();
 }
 
-static void crash_save_self(void)
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
+static void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs)
+{
+	memcpy(newregs, oldregs, sizeof(*newregs));
+	newregs->esp = (unsigned long)&(oldregs->esp);
+	__asm__ __volatile__("xorl %eax, %eax;");
+	__asm__ __volatile__ ("movw %%ss, %%ax;" :"=a"(newregs->xss));
+}
+
+/* We may have saved_regs from where the error came from
+ * or it is NULL if via a direct panic().
+ */
+static void crash_save_self(struct pt_regs *saved_regs)
 {
 	struct pt_regs regs;
 	int cpu;
 	cpu = smp_processor_id();
-	crash_get_current_regs(&regs);
+
+	if (saved_regs)
+		crash_setup_regs(&regs, saved_regs);
+	else
+		crash_get_current_regs(&regs);
 	crash_save_this_cpu(&regs, cpu);
 }
 
@@ -124,15 +143,8 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
 		return 1;
 	local_irq_disable();
 
-	/* CPU does not save ss and esp on stack if execution is already
-	 * running in kernel mode at the time of NMI occurrence. This code
-	 * fixes it.
-	 */
 	if (!user_mode(regs)) {
-		memcpy(&fixed_regs, regs, sizeof(*regs));
-		fixed_regs.esp = (unsigned long)&(regs->esp);
-		__asm__ __volatile__("xorl %eax, %eax;");
-		__asm__ __volatile__ ("movw %%ss, %%ax;" :"=a"(fixed_regs.xss));
+		crash_setup_regs(&fixed_regs, regs);
 		regs = &fixed_regs;
 	}
 	crash_save_this_cpu(regs, cpu);
@@ -184,7 +196,7 @@ static void nmi_shootdown_cpus(void)
 }
 #endif
 
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 	/* This function is only called after the system
 	 * has paniced or is otherwise in a critical state.
@@ -204,5 +216,5 @@ void machine_crash_shutdown(void)
 #if defined(CONFIG_X86_IO_APIC)
 	disable_IO_APIC();
 #endif
-	crash_save_self();
+	crash_save_self(regs);
 }
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 207ea8ba7169..e458463ebc05 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -27,6 +27,7 @@
 #include <linux/ptrace.h>
 #include <linux/utsname.h>
 #include <linux/kprobes.h>
+#include <linux/kexec.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -294,6 +295,9 @@ bug:
 	printk("Kernel BUG\n");
 }
 
+/* This is gone through when something in the kernel
+ * has done something bad and is about to be terminated.
+*/
 void die(const char * str, struct pt_regs * regs, long err)
 {
 	static struct {
@@ -341,6 +345,10 @@ void die(const char * str, struct pt_regs * regs, long err)
 	bust_spinlocks(0);
 	die.lock_owner = -1;
 	spin_unlock_irq(&die.lock);
+
+	if (kexec_should_crash(current))
+		crash_kexec(regs);
+
 	if (in_interrupt())
 		panic("Fatal exception in interrupt");
 
@@ -570,6 +578,15 @@ void die_nmi (struct pt_regs *regs, const char *msg)
 	console_silent();
 	spin_unlock(&nmi_print_lock);
 	bust_spinlocks(0);
+
+	/* If we are in kernel we are probably nested up pretty bad
+	 * and might aswell get out now while we still can.
+	*/
+	if (!user_mode(regs)) {
+		current->thread.trap_no = 2;
+		crash_kexec(regs);
+	}
+
 	do_exit(SIGSEGV);
 }
 
diff --git a/arch/ppc/kernel/machine_kexec.c b/arch/ppc/kernel/machine_kexec.c
index 435ad9ea0a83..b82535357d6d 100644
--- a/arch/ppc/kernel/machine_kexec.c
+++ b/arch/ppc/kernel/machine_kexec.c
@@ -34,7 +34,7 @@ void machine_shutdown(void)
 	}
 }
 
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 	if (ppc_md.machine_crash_shutdown) {
 		ppc_md.machine_crash_shutdown();
diff --git a/arch/ppc64/kernel/machine_kexec.c b/arch/ppc64/kernel/machine_kexec.c
index 217965d60a45..06b25b59c8a8 100644
--- a/arch/ppc64/kernel/machine_kexec.c
+++ b/arch/ppc64/kernel/machine_kexec.c
@@ -34,7 +34,7 @@ note_buf_t crash_notes[NR_CPUS];
  * and if what it will achieve. Letting it be now to compile the code
  * in generic kexec environment
  */
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 	/* do nothing right now */
 	/* smp_relase_cpus() if we want smp on panic kernel */
diff --git a/arch/s390/kernel/crash.c b/arch/s390/kernel/crash.c
index db38283c1f27..7bd169c58b0c 100644
--- a/arch/s390/kernel/crash.c
+++ b/arch/s390/kernel/crash.c
@@ -12,6 +12,6 @@
 
 note_buf_t crash_notes[NR_CPUS];
 
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 }
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 6183bcb85257..d7fa4248501c 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -22,7 +22,7 @@
 
 note_buf_t crash_notes[NR_CPUS];
 
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 	/* This function is only called after the system
 	 * has paniced or is otherwise in a critical state.
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 53b2c8fab00e..af79805b5576 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -100,7 +100,7 @@ static struct sysrq_key_op sysrq_unraw_op = {
 static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs,
 				struct tty_struct *tty)
 {
-	crash_kexec();
+	crash_kexec(pt_regs);
 }
 static struct sysrq_key_op sysrq_crashdump_op = {
 	.handler	= sysrq_handle_crashdump,
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0653a27c3d72..7383173a3a9c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -99,7 +99,8 @@ extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
 	unsigned long flags);
 #endif
 extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
-extern void crash_kexec(void);
+extern void crash_kexec(struct pt_regs *);
+int kexec_should_crash(struct task_struct *);
 extern struct kimage *kexec_image;
 
 #define KEXEC_ON_CRASH  0x00000001
@@ -123,6 +124,9 @@ extern struct kimage *kexec_image;
 extern struct resource crashk_res;
 
 #else /* !CONFIG_KEXEC */
-static inline void crash_kexec(void) { }
+struct pt_regs;
+struct task_struct;
+static inline void crash_kexec(struct pt_regs *regs) { }
+static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #endif /* CONFIG_KEXEC */
 #endif /* LINUX_KEXEC_H */
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index c5a05e16edb2..2d4dd23168dd 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -52,7 +52,8 @@ extern void machine_halt(void);
 extern void machine_power_off(void);
 
 extern void machine_shutdown(void);
-extern void machine_crash_shutdown(void);
+struct pt_regs;
+extern void machine_crash_shutdown(struct pt_regs *);
 
 #endif
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a0411b3bd54a..277f22afe74b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -18,6 +18,8 @@
 #include <linux/reboot.h>
 #include <linux/syscalls.h>
 #include <linux/ioport.h>
+#include <linux/hardirq.h>
+
 #include <asm/page.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -32,6 +34,13 @@ struct resource crashk_res = {
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+int kexec_should_crash(struct task_struct *p)
+{
+	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+		return 1;
+	return 0;
+}
+
 /*
  * When kexec transitions to the new kernel there is a one-to-one
  * mapping between physical and virtual addresses.  On processors
@@ -1010,7 +1019,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 }
 #endif
 
-void crash_kexec(void)
+void crash_kexec(struct pt_regs *regs)
 {
 	struct kimage *image;
 	int locked;
@@ -1028,7 +1037,7 @@ void crash_kexec(void)
 	if (!locked) {
 		image = xchg(&kexec_crash_image, NULL);
 		if (image) {
-			machine_crash_shutdown();
+			machine_crash_shutdown(regs);
 			machine_kexec(image);
 		}
 		xchg(&kexec_lock, 0);
diff --git a/kernel/panic.c b/kernel/panic.c
index 66f43d33cd80..74ba5f3e46c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -83,7 +83,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	 * everything else.
 	 * Do we want to call this before we try to display a message?
 	 */
-	crash_kexec();
+	crash_kexec(NULL);
 
 #ifdef CONFIG_SMP
 	/*
-- 
cgit v1.3-14-g43fede


From 72414d3f1d22fc3e311b162fca95c430048d38ce Mon Sep 17 00:00:00 2001
From: Maneesh Soni <maneesh@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:28 -0700
Subject: [PATCH] kexec code cleanup

o The following patch makes purely cosmetic changes, correcting CodingStyle
  guideline violations such as the following in kexec-related files:

  o braces for one line "if" statements, "for" loops,
  o more than 80 column wide lines,
  o No space after "while", "for" and "switch" key words

o Changes:
  o take-2: Removed the extra tab before "case" key words.
  o take-3: Put operator at the end of line and space before "*/"

Signed-off-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c           |  23 +--
 arch/i386/kernel/machine_kexec.c   |  16 +-
 arch/ppc/kernel/machine_kexec.c    |  30 ++--
 arch/ppc64/kernel/machine_kexec.c  |   9 +-
 arch/s390/kernel/machine_kexec.c   |   4 +-
 arch/x86_64/kernel/machine_kexec.c |  49 +++---
 drivers/char/mem.c                 |   2 +-
 include/linux/kexec.h              |  13 +-
 include/linux/syscalls.h           |   6 +-
 kernel/kexec.c                     | 302 ++++++++++++++++++++-----------------
 10 files changed, 243 insertions(+), 211 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 8bdb4b6af0ff..e5fab12f7926 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,10 +31,11 @@ note_buf_t crash_notes[NR_CPUS];
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu;
 
-static u32 *append_elf_note(u32 *buf,
-	char *name, unsigned type, void *data, size_t data_len)
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
 {
 	struct elf_note note;
+
 	note.n_namesz = strlen(name) + 1;
 	note.n_descsz = data_len;
 	note.n_type   = type;
@@ -44,26 +45,28 @@ static u32 *append_elf_note(u32 *buf,
 	buf += (note.n_namesz + 3)/4;
 	memcpy(buf, data, note.n_descsz);
 	buf += (note.n_descsz + 3)/4;
+
 	return buf;
 }
 
 static void final_note(u32 *buf)
 {
 	struct elf_note note;
+
 	note.n_namesz = 0;
 	note.n_descsz = 0;
 	note.n_type   = 0;
 	memcpy(buf, &note, sizeof(note));
 }
 
-
 static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
 {
 	struct elf_prstatus prstatus;
 	u32 *buf;
-	if ((cpu < 0) || (cpu >= NR_CPUS)) {
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
 		return;
-	}
+
 	/* Using ELF notes here is opportunistic.
 	 * I need a well defined structure format
 	 * for the data I pass, and I need tags
@@ -75,9 +78,8 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
 	memset(&prstatus, 0, sizeof(prstatus));
 	prstatus.pr_pid = current->pid;
 	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS,
-		&prstatus, sizeof(prstatus));
-
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
 	final_note(buf);
 }
 
@@ -119,8 +121,8 @@ static void crash_save_self(struct pt_regs *saved_regs)
 {
 	struct pt_regs regs;
 	int cpu;
-	cpu = smp_processor_id();
 
+	cpu = smp_processor_id();
 	if (saved_regs)
 		crash_setup_regs(&regs, saved_regs);
 	else
@@ -153,6 +155,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
 	/* Assume hlt works */
 	__asm__("hlt");
 	for(;;);
+
 	return 1;
 }
 
@@ -169,8 +172,8 @@ static void smp_send_nmi_allbutself(void)
 static void nmi_shootdown_cpus(void)
 {
 	unsigned long msecs;
-	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
 
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
 	/* Would it be better to replace the trap vector here? */
 	set_nmi_callback(crash_nmi_callback);
 	/* Ensure the new callback function is set before sending
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
index 671880415d1c..52ed18d8b511 100644
--- a/arch/i386/kernel/machine_kexec.c
+++ b/arch/i386/kernel/machine_kexec.c
@@ -80,7 +80,8 @@ static void identity_map_page(unsigned long address)
 	/* Identity map the page table entry */
 	pgtable_level1[level1_index] = address | L0_ATTR;
 	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-	set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR);
+	set_64bit(&pgtable_level3[level3_index],
+					       __pa(pgtable_level2) | L2_ATTR);
 
 	/* Flush the tlb so the new mapping takes effect.
 	 * Global tlb entries are not flushed but that is not an issue.
@@ -139,8 +140,10 @@ static void load_segments(void)
 }
 
 typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
-	unsigned long indirection_page, unsigned long reboot_code_buffer,
-	unsigned long start_address, unsigned int has_pae) ATTRIB_NORET;
+					unsigned long indirection_page,
+					unsigned long reboot_code_buffer,
+					unsigned long start_address,
+					unsigned int has_pae) ATTRIB_NORET;
 
 const extern unsigned char relocate_new_kernel[];
 extern void relocate_new_kernel_end(void);
@@ -180,20 +183,23 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list;
 	unsigned long reboot_code_buffer;
+
 	relocate_new_kernel_t rnk;
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
 	/* Compute some offsets */
-	reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	reboot_code_buffer = page_to_pfn(image->control_code_page)
+								<< PAGE_SHIFT;
 	page_list = image->head;
 
 	/* Set up an identity mapping for the reboot_code_buffer */
 	identity_map_page(reboot_code_buffer);
 
 	/* copy it out */
-	memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+						relocate_new_kernel_size);
 
 	/* The segment registers are funny things, they are
 	 * automatically loaded from a table, in memory wherever you
diff --git a/arch/ppc/kernel/machine_kexec.c b/arch/ppc/kernel/machine_kexec.c
index b82535357d6d..84d65a87191e 100644
--- a/arch/ppc/kernel/machine_kexec.c
+++ b/arch/ppc/kernel/machine_kexec.c
@@ -21,24 +21,23 @@
 #include <asm/machdep.h>
 
 typedef NORET_TYPE void (*relocate_new_kernel_t)(
-	unsigned long indirection_page, unsigned long reboot_code_buffer,
-	unsigned long start_address) ATTRIB_NORET;
+				unsigned long indirection_page,
+				unsigned long reboot_code_buffer,
+				unsigned long start_address) ATTRIB_NORET;
 
 const extern unsigned char relocate_new_kernel[];
 const extern unsigned int relocate_new_kernel_size;
 
 void machine_shutdown(void)
 {
-	if (ppc_md.machine_shutdown) {
+	if (ppc_md.machine_shutdown)
 		ppc_md.machine_shutdown();
-	}
 }
 
 void machine_crash_shutdown(struct pt_regs *regs)
 {
-	if (ppc_md.machine_crash_shutdown) {
+	if (ppc_md.machine_crash_shutdown)
 		ppc_md.machine_crash_shutdown();
-	}
 }
 
 /*
@@ -48,9 +47,8 @@ void machine_crash_shutdown(struct pt_regs *regs)
  */
 int machine_kexec_prepare(struct kimage *image)
 {
-	if (ppc_md.machine_kexec_prepare) {
+	if (ppc_md.machine_kexec_prepare)
 		return ppc_md.machine_kexec_prepare(image);
-	}
 	/*
 	 * Fail if platform doesn't provide its own machine_kexec_prepare
 	 * implementation.
@@ -60,9 +58,8 @@ int machine_kexec_prepare(struct kimage *image)
 
 void machine_kexec_cleanup(struct kimage *image)
 {
-	if (ppc_md.machine_kexec_cleanup) {
+	if (ppc_md.machine_kexec_cleanup)
 		ppc_md.machine_kexec_cleanup(image);
-	}
 }
 
 /*
@@ -71,9 +68,9 @@ void machine_kexec_cleanup(struct kimage *image)
  */
 NORET_TYPE void machine_kexec(struct kimage *image)
 {
-	if (ppc_md.machine_kexec) {
+	if (ppc_md.machine_kexec)
 		ppc_md.machine_kexec(image);
-	} else {
+	else {
 		/*
 		 * Fall back to normal restart if platform doesn't provide
 		 * its own kexec function, and user insist to kexec...
@@ -83,7 +80,6 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	for(;;);
 }
 
-
 /*
  * This is a generic machine_kexec function suitable at least for
  * non-OpenFirmware embedded platforms.
@@ -104,15 +100,15 @@ void machine_kexec_simple(struct kimage *image)
 
 	/* we need both effective and real address here */
 	reboot_code_buffer =
-		(unsigned long)page_address(image->control_code_page);
+			(unsigned long)page_address(image->control_code_page);
 	reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer);
 
 	/* copy our kernel relocation code to the control code page */
-	memcpy((void *)reboot_code_buffer,
-		relocate_new_kernel, relocate_new_kernel_size);
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+						relocate_new_kernel_size);
 
 	flush_icache_range(reboot_code_buffer,
-		reboot_code_buffer + KEXEC_CONTROL_CODE_SIZE);
+				reboot_code_buffer + KEXEC_CONTROL_CODE_SIZE);
 	printk(KERN_INFO "Bye!\n");
 
 	/* now call it */
diff --git a/arch/ppc64/kernel/machine_kexec.c b/arch/ppc64/kernel/machine_kexec.c
index 06b25b59c8a8..fdb2fc649d72 100644
--- a/arch/ppc64/kernel/machine_kexec.c
+++ b/arch/ppc64/kernel/machine_kexec.c
@@ -58,7 +58,7 @@ int machine_kexec_prepare(struct kimage *image)
 	 * handle the virtual mode, we must make sure no destination
 	 * overlaps kernel static data or bss.
 	 */
-	for(i = 0; i < image->nr_segments; i++)
+	for (i = 0; i < image->nr_segments; i++)
 		if (image->segment[i].mem < __pa(_end))
 			return -ETXTBSY;
 
@@ -76,7 +76,7 @@ int machine_kexec_prepare(struct kimage *image)
 		low = __pa(htab_address);
 		high = low + (htab_hash_mask + 1) * HASH_GROUP_SIZE;
 
-		for(i = 0; i < image->nr_segments; i++) {
+		for (i = 0; i < image->nr_segments; i++) {
 			begin = image->segment[i].mem;
 			end = begin + image->segment[i].memsz;
 
@@ -98,7 +98,7 @@ int machine_kexec_prepare(struct kimage *image)
 		low = *basep;
 		high = low + (*sizep);
 
-		for(i = 0; i < image->nr_segments; i++) {
+		for (i = 0; i < image->nr_segments; i++) {
 			begin = image->segment[i].mem;
 			end = begin + image->segment[i].memsz;
 
@@ -274,7 +274,8 @@ union thread_union kexec_stack
 
 /* Our assembly helper, in kexec_stub.S */
 extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
-	void *image, void *control, void (*clear_all)(void)) ATTRIB_NORET;
+					void *image, void *control,
+					void (*clear_all)(void)) ATTRIB_NORET;
 
 /* too late to fail here */
 void machine_kexec(struct kimage *image)
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 7a94db76df46..2721c3a32b84 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -67,7 +67,7 @@ machine_kexec(struct kimage *image)
 	ctl_clear_bit(0,28);
 
 	on_each_cpu(kexec_halt_all_cpus, image, 0, 0);
-	for(;;);
+	for (;;);
 }
 
 static void
@@ -85,7 +85,7 @@ kexec_halt_all_cpus(void *kernel_image)
 	for_each_online_cpu(cpu) {
 		if (cpu == smp_processor_id())
 			continue;
-		while(!smp_cpu_not_running(cpu))
+		while (!smp_cpu_not_running(cpu))
 			cpu_relax();
 	}
 
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 200b5993f8d9..60d1eff41567 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -32,29 +32,31 @@
 #define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
-static void init_level2_page(
-	u64 *level2p, unsigned long addr)
+static void init_level2_page(u64 *level2p, unsigned long addr)
 {
 	unsigned long end_addr;
+
 	addr &= PAGE_MASK;
 	end_addr = addr + LEVEL2_SIZE;
-	while(addr < end_addr) {
+	while (addr < end_addr) {
 		*(level2p++) = addr | L1_ATTR;
 		addr += LEVEL1_SIZE;
 	}
 }
 
-static int init_level3_page(struct kimage *image,
-	u64 *level3p, unsigned long addr, unsigned long last_addr)
+static int init_level3_page(struct kimage *image, u64 *level3p,
+				unsigned long addr, unsigned long last_addr)
 {
 	unsigned long end_addr;
 	int result;
+
 	result = 0;
 	addr &= PAGE_MASK;
 	end_addr = addr + LEVEL3_SIZE;
-	while((addr < last_addr) && (addr < end_addr)) {
+	while ((addr < last_addr) && (addr < end_addr)) {
 		struct page *page;
 		u64 *level2p;
+
 		page = kimage_alloc_control_pages(image, 0);
 		if (!page) {
 			result = -ENOMEM;
@@ -66,7 +68,7 @@ static int init_level3_page(struct kimage *image,
 		addr += LEVEL2_SIZE;
 	}
 	/* clear the unused entries */
-	while(addr < end_addr) {
+	while (addr < end_addr) {
 		*(level3p++) = 0;
 		addr += LEVEL2_SIZE;
 	}
@@ -75,17 +77,19 @@ out:
 }
 
 
-static int init_level4_page(struct kimage *image,
-	u64 *level4p, unsigned long addr, unsigned long last_addr)
+static int init_level4_page(struct kimage *image, u64 *level4p,
+				unsigned long addr, unsigned long last_addr)
 {
 	unsigned long end_addr;
 	int result;
+
 	result = 0;
 	addr &= PAGE_MASK;
 	end_addr = addr + LEVEL4_SIZE;
-	while((addr < last_addr) && (addr < end_addr)) {
+	while ((addr < last_addr) && (addr < end_addr)) {
 		struct page *page;
 		u64 *level3p;
+
 		page = kimage_alloc_control_pages(image, 0);
 		if (!page) {
 			result = -ENOMEM;
@@ -100,11 +104,11 @@ static int init_level4_page(struct kimage *image,
 		addr += LEVEL3_SIZE;
 	}
 	/* clear the unused entries */
-	while(addr < end_addr) {
+	while (addr < end_addr) {
 		*(level4p++) = 0;
 		addr += LEVEL3_SIZE;
 	}
- out:
+out:
 	return result;
 }
 
@@ -113,7 +117,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 {
 	u64 *level4p;
 	level4p = (u64 *)__va(start_pgtable);
-	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
 }
 
 static void set_idt(void *newidt, u16 limit)
@@ -159,9 +163,10 @@ static void load_segments(void)
 #undef __STR
 }
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
-	unsigned long indirection_page, unsigned long control_code_buffer,
-	unsigned long start_address, unsigned long pgtable) ATTRIB_NORET;
+typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+					unsigned long control_code_buffer,
+					unsigned long start_address,
+					unsigned long pgtable) ATTRIB_NORET;
 
 const extern unsigned char relocate_new_kernel[];
 const extern unsigned long relocate_new_kernel_size;
@@ -172,17 +177,17 @@ int machine_kexec_prepare(struct kimage *image)
 	int result;
 
 	/* Calculate the offsets */
-	start_pgtable       = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
 	control_code_buffer = start_pgtable + 4096UL;
 
 	/* Setup the identity mapped 64bit page table */
 	result = init_pgtable(image, start_pgtable);
-	if (result) {
+	if (result)
 		return result;
-	}
 
 	/* Place the code in the reboot code buffer */
-	memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size);
+	memcpy(__va(control_code_buffer), relocate_new_kernel,
+						relocate_new_kernel_size);
 
 	return 0;
 }
@@ -207,8 +212,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	local_irq_disable();
 
 	/* Calculate the offsets */
-	page_list           = image->head;
-	start_pgtable       = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	page_list = image->head;
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
 	control_code_buffer = start_pgtable + 4096UL;
 
 	/* Set the low half of the page table to my identity mapped
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index b64108dd765b..42187381506b 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -287,7 +287,7 @@ static ssize_t read_oldmem(struct file *file, char __user *buf,
 	size_t read = 0, csize;
 	int rc = 0;
 
-	while(count) {
+	while (count) {
 		pfn = *ppos / PAGE_SIZE;
 		if (pfn > saved_max_pfn)
 			return read;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 7383173a3a9c..c8468472aec0 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -91,14 +91,17 @@ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
 extern int machine_kexec_prepare(struct kimage *image);
 extern void machine_kexec_cleanup(struct kimage *image);
 extern asmlinkage long sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments,
-	unsigned long flags);
+					unsigned long nr_segments,
+					struct kexec_segment __user *segments,
+					unsigned long flags);
 #ifdef CONFIG_COMPAT
 extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
-	unsigned long flags);
+				unsigned long nr_segments,
+				struct compat_kexec_segment __user *segments,
+				unsigned long flags);
 #endif
-extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
+extern struct page *kimage_alloc_control_pages(struct kimage *image,
+						unsigned int order);
 extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
 extern struct kimage *kexec_image;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7ba8f8f747aa..52830b6d94e5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -159,9 +159,9 @@ asmlinkage long sys_shutdown(int, int);
 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
 				void __user *arg);
 asmlinkage long sys_restart_syscall(void);
-asmlinkage long sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments,
-	unsigned long flags);
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+				struct kexec_segment __user *segments,
+				unsigned long flags);
 
 asmlinkage long sys_exit(int error_code);
 asmlinkage void sys_exit_group(int error_code);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 277f22afe74b..7843548cf2d9 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -87,12 +87,15 @@ int kexec_should_crash(struct task_struct *p)
  */
 #define KIMAGE_NO_DEST (-1UL)
 
-static int kimage_is_destination_range(
-	struct kimage *image, unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+static int kimage_is_destination_range(struct kimage *image,
+				       unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image,
+				       unsigned int gfp_mask,
+				       unsigned long dest);
 
 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments)
+	                    unsigned long nr_segments,
+                            struct kexec_segment __user *segments)
 {
 	size_t segment_bytes;
 	struct kimage *image;
@@ -102,9 +105,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	/* Allocate a controlling structure */
 	result = -ENOMEM;
 	image = kmalloc(sizeof(*image), GFP_KERNEL);
-	if (!image) {
+	if (!image)
 		goto out;
-	}
+
 	memset(image, 0, sizeof(*image));
 	image->head = 0;
 	image->entry = &image->head;
@@ -145,6 +148,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	result = -EADDRNOTAVAIL;
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
+
 		mstart = image->segment[i].mem;
 		mend   = mstart + image->segment[i].memsz;
 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
@@ -159,12 +163,13 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	 * easy explanation as one segment stops on another.
 	 */
 	result = -EINVAL;
-	for(i = 0; i < nr_segments; i++) {
+	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
 		unsigned long j;
+
 		mstart = image->segment[i].mem;
 		mend   = mstart + image->segment[i].memsz;
-		for(j = 0; j < i; j++) {
+		for (j = 0; j < i; j++) {
 			unsigned long pstart, pend;
 			pstart = image->segment[j].mem;
 			pend   = pstart + image->segment[j].memsz;
@@ -180,25 +185,25 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	 * later on.
 	 */
 	result = -EINVAL;
-	for(i = 0; i < nr_segments; i++) {
+	for (i = 0; i < nr_segments; i++) {
 		if (image->segment[i].bufsz > image->segment[i].memsz)
 			goto out;
 	}
 
-
 	result = 0;
- out:
-	if (result == 0) {
+out:
+	if (result == 0)
 		*rimage = image;
-	} else {
+	else
 		kfree(image);
-	}
+
 	return result;
 
 }
 
 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments)
+				unsigned long nr_segments,
+				struct kexec_segment __user *segments)
 {
 	int result;
 	struct kimage *image;
@@ -206,9 +211,9 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 	/* Allocate and initialize a controlling structure */
 	image = NULL;
 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
-	if (result) {
+	if (result)
 		goto out;
-	}
+
 	*rimage = image;
 
 	/*
@@ -218,7 +223,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 	 */
 	result = -ENOMEM;
 	image->control_code_page = kimage_alloc_control_pages(image,
-		get_order(KEXEC_CONTROL_CODE_SIZE));
+					   get_order(KEXEC_CONTROL_CODE_SIZE));
 	if (!image->control_code_page) {
 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
 		goto out;
@@ -226,16 +231,17 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 
 	result = 0;
  out:
-	if (result == 0) {
+	if (result == 0)
 		*rimage = image;
-	} else {
+	else
 		kfree(image);
-	}
+
 	return result;
 }
 
 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment *segments)
+				unsigned long nr_segments,
+				struct kexec_segment *segments)
 {
 	int result;
 	struct kimage *image;
@@ -250,9 +256,8 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 
 	/* Allocate and initialize a controlling structure */
 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
-	if (result) {
+	if (result)
 		goto out;
-	}
 
 	/* Enable the special crash kernel control page
 	 * allocation policy.
@@ -272,6 +277,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 	result = -EADDRNOTAVAIL;
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
+
 		mstart = image->segment[i].mem;
 		mend = mstart + image->segment[i].memsz - 1;
 		/* Ensure we are within the crash kernel limits */
@@ -279,7 +285,6 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 			goto out;
 	}
 
-
 	/*
 	 * Find a location for the control code buffer, and add
 	 * the vector of segments so that it's pages will also be
@@ -287,80 +292,84 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 	 */
 	result = -ENOMEM;
 	image->control_code_page = kimage_alloc_control_pages(image,
-		get_order(KEXEC_CONTROL_CODE_SIZE));
+					   get_order(KEXEC_CONTROL_CODE_SIZE));
 	if (!image->control_code_page) {
 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
 		goto out;
 	}
 
 	result = 0;
- out:
-	if (result == 0) {
+out:
+	if (result == 0)
 		*rimage = image;
-	} else {
+	else
 		kfree(image);
-	}
+
 	return result;
 }
 
-static int kimage_is_destination_range(
-	struct kimage *image, unsigned long start, unsigned long end)
+static int kimage_is_destination_range(struct kimage *image,
+					unsigned long start,
+					unsigned long end)
 {
 	unsigned long i;
 
 	for (i = 0; i < image->nr_segments; i++) {
 		unsigned long mstart, mend;
+
 		mstart = image->segment[i].mem;
-		mend   = mstart + image->segment[i].memsz;
-		if ((end > mstart) && (start < mend)) {
+		mend = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend))
 			return 1;
-		}
 	}
+
 	return 0;
 }
 
-static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(unsigned int gfp_mask,
+					unsigned int order)
 {
 	struct page *pages;
+
 	pages = alloc_pages(gfp_mask, order);
 	if (pages) {
 		unsigned int count, i;
 		pages->mapping = NULL;
 		pages->private = order;
 		count = 1 << order;
-		for(i = 0; i < count; i++) {
+		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
-		}
 	}
+
 	return pages;
 }
 
 static void kimage_free_pages(struct page *page)
 {
 	unsigned int order, count, i;
+
 	order = page->private;
 	count = 1 << order;
-	for(i = 0; i < count; i++) {
+	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
-	}
 	__free_pages(page, order);
 }
 
 static void kimage_free_page_list(struct list_head *list)
 {
 	struct list_head *pos, *next;
+
 	list_for_each_safe(pos, next, list) {
 		struct page *page;
 
 		page = list_entry(pos, struct page, lru);
 		list_del(&page->lru);
-
 		kimage_free_pages(page);
 	}
 }
 
-static struct page *kimage_alloc_normal_control_pages(
-	struct kimage *image, unsigned int order)
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+							unsigned int order)
 {
 	/* Control pages are special, they are the intermediaries
 	 * that are needed while we copy the rest of the pages
@@ -387,6 +396,7 @@ static struct page *kimage_alloc_normal_control_pages(
 	 */
 	do {
 		unsigned long pfn, epfn, addr, eaddr;
+
 		pages = kimage_alloc_pages(GFP_KERNEL, order);
 		if (!pages)
 			break;
@@ -395,12 +405,12 @@ static struct page *kimage_alloc_normal_control_pages(
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
 		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
-			kimage_is_destination_range(image, addr, eaddr))
-		{
+			      kimage_is_destination_range(image, addr, eaddr)) {
 			list_add(&pages->lru, &extra_pages);
 			pages = NULL;
 		}
-	} while(!pages);
+	} while (!pages);
+
 	if (pages) {
 		/* Remember the allocated page... */
 		list_add(&pages->lru, &image->control_pages);
@@ -420,12 +430,12 @@ static struct page *kimage_alloc_normal_control_pages(
 	 * For now it is simpler to just free the pages.
 	 */
 	kimage_free_page_list(&extra_pages);
-	return pages;
 
+	return pages;
 }
 
-static struct page *kimage_alloc_crash_control_pages(
-	struct kimage *image, unsigned int order)
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+						      unsigned int order)
 {
 	/* Control pages are special, they are the intermediaries
 	 * that are needed while we copy the rest of the pages
@@ -450,21 +460,22 @@ static struct page *kimage_alloc_crash_control_pages(
 	 */
 	unsigned long hole_start, hole_end, size;
 	struct page *pages;
+
 	pages = NULL;
 	size = (1 << order) << PAGE_SHIFT;
 	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 	hole_end   = hole_start + size - 1;
-	while(hole_end <= crashk_res.end) {
+	while (hole_end <= crashk_res.end) {
 		unsigned long i;
-		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
+
+		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
 			break;
-		}
-		if (hole_end > crashk_res.end) {
+		if (hole_end > crashk_res.end)
 			break;
-		}
 		/* See if I overlap any of the segments */
-		for(i = 0; i < image->nr_segments; i++) {
+		for (i = 0; i < image->nr_segments; i++) {
 			unsigned long mstart, mend;
+
 			mstart = image->segment[i].mem;
 			mend   = mstart + image->segment[i].memsz - 1;
 			if ((hole_end >= mstart) && (hole_start <= mend)) {
@@ -480,18 +491,19 @@ static struct page *kimage_alloc_crash_control_pages(
 			break;
 		}
 	}
-	if (pages) {
+	if (pages)
 		image->control_page = hole_end;
-	}
+
 	return pages;
 }
 
 
-struct page *kimage_alloc_control_pages(
-	struct kimage *image, unsigned int order)
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
 {
 	struct page *pages = NULL;
-	switch(image->type) {
+
+	switch (image->type) {
 	case KEXEC_TYPE_DEFAULT:
 		pages = kimage_alloc_normal_control_pages(image, order);
 		break;
@@ -499,43 +511,46 @@ struct page *kimage_alloc_control_pages(
 		pages = kimage_alloc_crash_control_pages(image, order);
 		break;
 	}
+
 	return pages;
 }
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
-	if (*image->entry != 0) {
+	if (*image->entry != 0)
 		image->entry++;
-	}
+
 	if (image->entry == image->last_entry) {
 		kimage_entry_t *ind_page;
 		struct page *page;
+
 		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
-		if (!page) {
+		if (!page)
 			return -ENOMEM;
-		}
+
 		ind_page = page_address(page);
 		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
-		image->last_entry =
-			ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+		image->last_entry = ind_page +
+				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 	}
 	*image->entry = entry;
 	image->entry++;
 	*image->entry = 0;
+
 	return 0;
 }
 
-static int kimage_set_destination(
-	struct kimage *image, unsigned long destination)
+static int kimage_set_destination(struct kimage *image,
+				   unsigned long destination)
 {
 	int result;
 
 	destination &= PAGE_MASK;
 	result = kimage_add_entry(image, destination | IND_DESTINATION);
-	if (result == 0) {
+	if (result == 0)
 		image->destination = destination;
-	}
+
 	return result;
 }
 
@@ -546,9 +561,9 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
 
 	page &= PAGE_MASK;
 	result = kimage_add_entry(image, page | IND_SOURCE);
-	if (result == 0) {
+	if (result == 0)
 		image->destination += PAGE_SIZE;
-	}
+
 	return result;
 }
 
@@ -564,10 +579,11 @@ static void kimage_free_extra_pages(struct kimage *image)
 }
 static int kimage_terminate(struct kimage *image)
 {
-	if (*image->entry != 0) {
+	if (*image->entry != 0)
 		image->entry++;
-	}
+
 	*image->entry = IND_DONE;
+
 	return 0;
 }
 
@@ -591,26 +607,24 @@ static void kimage_free(struct kimage *image)
 
 	if (!image)
 		return;
+
 	kimage_free_extra_pages(image);
 	for_each_kimage_entry(image, ptr, entry) {
 		if (entry & IND_INDIRECTION) {
 			/* Free the previous indirection page */
-			if (ind & IND_INDIRECTION) {
+			if (ind & IND_INDIRECTION)
 				kimage_free_entry(ind);
-			}
 			/* Save this indirection page until we are
 			 * done with it.
 			 */
 			ind = entry;
 		}
-		else if (entry & IND_SOURCE) {
+		else if (entry & IND_SOURCE)
 			kimage_free_entry(entry);
-		}
 	}
 	/* Free the final indirection page */
-	if (ind & IND_INDIRECTION) {
+	if (ind & IND_INDIRECTION)
 		kimage_free_entry(ind);
-	}
 
 	/* Handle any machine specific cleanup */
 	machine_kexec_cleanup(image);
@@ -620,26 +634,28 @@ static void kimage_free(struct kimage *image)
 	kfree(image);
 }
 
-static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+					unsigned long page)
 {
 	kimage_entry_t *ptr, entry;
 	unsigned long destination = 0;
 
 	for_each_kimage_entry(image, ptr, entry) {
-		if (entry & IND_DESTINATION) {
+		if (entry & IND_DESTINATION)
 			destination = entry & PAGE_MASK;
-		}
 		else if (entry & IND_SOURCE) {
-			if (page == destination) {
+			if (page == destination)
 				return ptr;
-			}
 			destination += PAGE_SIZE;
 		}
 	}
+
 	return 0;
 }
 
-static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+static struct page *kimage_alloc_page(struct kimage *image,
+					unsigned int gfp_mask,
+					unsigned long destination)
 {
 	/*
 	 * Here we implement safeguards to ensure that a source page
@@ -679,11 +695,11 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas
 
 		/* Allocate a page, if we run out of memory give up */
 		page = kimage_alloc_pages(gfp_mask, 0);
-		if (!page) {
+		if (!page)
 			return 0;
-		}
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+		if (page_to_pfn(page) >
+				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unuseable_pages);
 			continue;
 		}
@@ -694,7 +710,8 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas
 			break;
 
 		/* If the page is not a destination page use it */
-		if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+		if (!kimage_is_destination_range(image, addr,
+						  addr + PAGE_SIZE))
 			break;
 
 		/*
@@ -727,11 +744,12 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas
 			list_add(&page->lru, &image->dest_pages);
 		}
 	}
+
 	return page;
 }
 
 static int kimage_load_normal_segment(struct kimage *image,
-	struct kexec_segment *segment)
+					 struct kexec_segment *segment)
 {
 	unsigned long maddr;
 	unsigned long ubytes, mbytes;
@@ -745,34 +763,36 @@ static int kimage_load_normal_segment(struct kimage *image,
 	maddr = segment->mem;
 
 	result = kimage_set_destination(image, maddr);
-	if (result < 0) {
+	if (result < 0)
 		goto out;
-	}
-	while(mbytes) {
+
+	while (mbytes) {
 		struct page *page;
 		char *ptr;
 		size_t uchunk, mchunk;
+
 		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 		if (page == 0) {
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
-		if (result < 0) {
+		result = kimage_add_page(image, page_to_pfn(page)
+								<< PAGE_SHIFT);
+		if (result < 0)
 			goto out;
-		}
+
 		ptr = kmap(page);
 		/* Start with a clear page */
 		memset(ptr, 0, PAGE_SIZE);
 		ptr += maddr & ~PAGE_MASK;
 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
-		if (mchunk > mbytes) {
+		if (mchunk > mbytes)
 			mchunk = mbytes;
-		}
+
 		uchunk = mchunk;
-		if (uchunk > ubytes) {
+		if (uchunk > ubytes)
 			uchunk = ubytes;
-		}
+
 		result = copy_from_user(ptr, buf, uchunk);
 		kunmap(page);
 		if (result) {
@@ -784,12 +804,12 @@ static int kimage_load_normal_segment(struct kimage *image,
 		buf    += mchunk;
 		mbytes -= mchunk;
 	}
- out:
+out:
 	return result;
 }
 
 static int kimage_load_crash_segment(struct kimage *image,
-	struct kexec_segment *segment)
+					struct kexec_segment *segment)
 {
 	/* For crash dumps kernels we simply copy the data from
 	 * user space to it's destination.
@@ -805,10 +825,11 @@ static int kimage_load_crash_segment(struct kimage *image,
 	ubytes = segment->bufsz;
 	mbytes = segment->memsz;
 	maddr = segment->mem;
-	while(mbytes) {
+	while (mbytes) {
 		struct page *page;
 		char *ptr;
 		size_t uchunk, mchunk;
+
 		page = pfn_to_page(maddr >> PAGE_SHIFT);
 		if (page == 0) {
 			result  = -ENOMEM;
@@ -817,9 +838,9 @@ static int kimage_load_crash_segment(struct kimage *image,
 		ptr = kmap(page);
 		ptr += maddr & ~PAGE_MASK;
 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
-		if (mchunk > mbytes) {
+		if (mchunk > mbytes)
 			mchunk = mbytes;
-		}
+
 		uchunk = mchunk;
 		if (uchunk > ubytes) {
 			uchunk = ubytes;
@@ -837,15 +858,16 @@ static int kimage_load_crash_segment(struct kimage *image,
 		buf    += mchunk;
 		mbytes -= mchunk;
 	}
- out:
+out:
 	return result;
 }
 
 static int kimage_load_segment(struct kimage *image,
-	struct kexec_segment *segment)
+				struct kexec_segment *segment)
 {
 	int result = -ENOMEM;
-	switch(image->type) {
+
+	switch (image->type) {
 	case KEXEC_TYPE_DEFAULT:
 		result = kimage_load_normal_segment(image, segment);
 		break;
@@ -853,6 +875,7 @@ static int kimage_load_segment(struct kimage *image,
 		result = kimage_load_crash_segment(image, segment);
 		break;
 	}
+
 	return result;
 }
 
@@ -885,9 +908,9 @@ static struct kimage *kexec_crash_image = NULL;
  */
 static int kexec_lock = 0;
 
-asmlinkage long sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments,
-	unsigned long flags)
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+				struct kexec_segment __user *segments,
+				unsigned long flags)
 {
 	struct kimage **dest_image, *image;
 	int locked;
@@ -907,9 +930,7 @@ asmlinkage long sys_kexec_load(unsigned long entry,
 	/* Verify we are on the appropriate architecture */
 	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
-	{
 		return -EINVAL;
-	}
 
 	/* Put an artificial cap on the number
 	 * of segments passed to kexec_load.
@@ -929,58 +950,59 @@ asmlinkage long sys_kexec_load(unsigned long entry,
 	 * KISS: always take the mutex.
 	 */
 	locked = xchg(&kexec_lock, 1);
-	if (locked) {
+	if (locked)
 		return -EBUSY;
-	}
+
 	dest_image = &kexec_image;
-	if (flags & KEXEC_ON_CRASH) {
+	if (flags & KEXEC_ON_CRASH)
 		dest_image = &kexec_crash_image;
-	}
 	if (nr_segments > 0) {
 		unsigned long i;
+
 		/* Loading another kernel to reboot into */
-		if ((flags & KEXEC_ON_CRASH) == 0) {
-			result = kimage_normal_alloc(&image, entry, nr_segments, segments);
-		}
+		if ((flags & KEXEC_ON_CRASH) == 0)
+			result = kimage_normal_alloc(&image, entry,
+							nr_segments, segments);
 		/* Loading another kernel to switch to if this one crashes */
 		else if (flags & KEXEC_ON_CRASH) {
 			/* Free any current crash dump kernel before
 			 * we corrupt it.
 			 */
 			kimage_free(xchg(&kexec_crash_image, NULL));
-			result = kimage_crash_alloc(&image, entry, nr_segments, segments);
+			result = kimage_crash_alloc(&image, entry,
+						     nr_segments, segments);
 		}
-		if (result) {
+		if (result)
 			goto out;
-		}
+
 		result = machine_kexec_prepare(image);
-		if (result) {
+		if (result)
 			goto out;
-		}
-		for(i = 0; i < nr_segments; i++) {
+
+		for (i = 0; i < nr_segments; i++) {
 			result = kimage_load_segment(image, &image->segment[i]);
-			if (result) {
+			if (result)
 				goto out;
-			}
 		}
 		result = kimage_terminate(image);
-		if (result) {
+		if (result)
 			goto out;
-		}
 	}
 	/* Install the new kernel, and  Uninstall the old */
 	image = xchg(dest_image, image);
 
- out:
+out:
 	xchg(&kexec_lock, 0); /* Release the mutex */
 	kimage_free(image);
+
 	return result;
 }
 
 #ifdef CONFIG_COMPAT
 asmlinkage long compat_sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
-	unsigned long flags)
+				unsigned long nr_segments,
+				struct compat_kexec_segment __user *segments,
+				unsigned long flags)
 {
 	struct compat_kexec_segment in;
 	struct kexec_segment out, __user *ksegments;
@@ -989,20 +1011,17 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 	/* Don't allow clients that don't understand the native
 	 * architecture to do anything.
 	 */
-	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) {
+	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
 		return -EINVAL;
-	}
 
-	if (nr_segments > KEXEC_SEGMENT_MAX) {
+	if (nr_segments > KEXEC_SEGMENT_MAX)
 		return -EINVAL;
-	}
 
 	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
 	for (i=0; i < nr_segments; i++) {
 		result = copy_from_user(&in, &segments[i], sizeof(in));
-		if (result) {
+		if (result)
 			return -EFAULT;
-		}
 
 		out.buf   = compat_ptr(in.buf);
 		out.bufsz = in.bufsz;
@@ -1010,9 +1029,8 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 		out.memsz = in.memsz;
 
 		result = copy_to_user(&ksegments[i], &out, sizeof(out));
-		if (result) {
+		if (result)
 			return -EFAULT;
-		}
 	}
 
 	return sys_kexec_load(entry, nr_segments, ksegments, flags);
-- 
cgit v1.3-14-g43fede


From 96ec3efdcbaea4f403f2a5f1204edbf903a01961 Mon Sep 17 00:00:00 2001
From: Domen Puncer <domen@coderock.org>
Date: Sat, 25 Jun 2005 14:58:43 -0700
Subject: [PATCH] kernel/timer: fix msleep_interruptible() comment

The comment for msleep_interruptible() is wrong, as it will ignore
wait-queue events, but will wake up early for signals.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 51ff917c9590..f2a11887a726 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1597,7 +1597,7 @@ void msleep(unsigned int msecs)
 EXPORT_SYMBOL(msleep);
 
 /**
- * msleep_interruptible - sleep waiting for waitqueue interruptions
+ * msleep_interruptible - sleep waiting for signals
  * @msecs: Time in milliseconds to sleep for
  */
 unsigned long msleep_interruptible(unsigned int msecs)
-- 
cgit v1.3-14-g43fede


From 5a6b454f8024bac68495b6cd51615feb0b54baa9 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl-lkml@dif.dk>
Date: Sat, 25 Jun 2005 14:58:48 -0700
Subject: [PATCH] remove redundant NULL check before kfree() in
 kernel/sysctl.c

Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 24a4d12d5aa9..270ee7fadbd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1000,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 		int error = parse_table(name, nlen, oldval, oldlenp, 
 					newval, newlen, head->ctl_table,
 					&context);
-		if (context)
-			kfree(context);
+		kfree(context);
 		if (error != -ENOTDIR)
 			return error;
 		tmp = tmp->next;
-- 
cgit v1.3-14-g43fede


From 8c0e33c133021ee241e9d51255b9fb18eb34ef0e Mon Sep 17 00:00:00 2001
From: Nick Wilson <njw@osdl.org>
Date: Sat, 25 Jun 2005 14:59:00 -0700
Subject: [PATCH] Use ALIGN to remove duplicate code

This patch makes use of ALIGN() to remove duplicate round-up code.

Signed-off-by: Nick Wilson <njw@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/a.out.h | 2 +-
 kernel/resource.c     | 2 +-
 lib/bitmap.c          | 3 +--
 mm/bootmem.c          | 6 +++---
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/a.out.h b/include/linux/a.out.h
index af8a1dfa5c32..f913cc3e1b0d 100644
--- a/include/linux/a.out.h
+++ b/include/linux/a.out.h
@@ -138,7 +138,7 @@ enum machine_type {
 #endif
 #endif
 
-#define _N_SEGMENT_ROUND(x) (((x) + SEGMENT_SIZE - 1) & ~(SEGMENT_SIZE - 1))
+#define _N_SEGMENT_ROUND(x) ALIGN(x, SEGMENT_SIZE)
 
 #define _N_TXTENDADDR(x) (N_TXTADDR(x)+(x).a_text)
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 52f696f11adf..26967e042201 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new,
 			new->start = min;
 		if (new->end > max)
 			new->end = max;
-		new->start = (new->start + align - 1) & ~(align - 1);
+		new->start = ALIGN(new->start, align);
 		if (alignf)
 			alignf(alignf_data, new, size, align);
 		if (new->start < new->end && new->end - new->start >= size - 1) {
diff --git a/lib/bitmap.c b/lib/bitmap.c
index d1388a5ce89c..fb9371fdd44a 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -289,7 +289,6 @@ EXPORT_SYMBOL(__bitmap_weight);
 
 #define CHUNKSZ				32
 #define nbits_to_hold_value(val)	fls(val)
-#define roundup_power2(val,modulus)	(((val) + (modulus) - 1) & ~((modulus) - 1))
 #define unhex(c)			(isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10))
 #define BASEDEC 10		/* fancier cpuset lists input in decimal */
 
@@ -316,7 +315,7 @@ int bitmap_scnprintf(char *buf, unsigned int buflen,
 	if (chunksz == 0)
 		chunksz = CHUNKSZ;
 
-	i = roundup_power2(nmaskbits, CHUNKSZ) - CHUNKSZ;
+	i = ALIGN(nmaskbits, CHUNKSZ) - CHUNKSZ;
 	for (; i >= 0; i -= CHUNKSZ) {
 		chunkmask = ((1ULL << chunksz) - 1);
 		word = i / BITS_PER_LONG;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 45275f1f8947..c1330cc19783 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -65,7 +65,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 	pgdat->pgdat_next = pgdat_list;
 	pgdat_list = pgdat;
 
-	mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
+	mapsize = ALIGN(mapsize, sizeof(long));
 	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
 	bdata->node_boot_start = (start << PAGE_SHIFT);
 	bdata->node_low_pfn = end;
@@ -186,7 +186,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	} else
 		preferred = 0;
 
-	preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
+	preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
 	preferred += offset;
 	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
 	incr = align >> PAGE_SHIFT ? : 1;
@@ -227,7 +227,7 @@ found:
 	 */
 	if (align < PAGE_SIZE &&
 	    bdata->last_offset && bdata->last_pos+1 == start) {
-		offset = (bdata->last_offset+align-1) & ~(align-1);
+		offset = ALIGN(bdata->last_offset, align);
 		BUG_ON(offset > PAGE_SIZE);
 		remaining_size = PAGE_SIZE-offset;
 		if (size < remaining_size) {
-- 
cgit v1.3-14-g43fede


From 3e1d1d28d99dabe63c64f7f40f1ca1d646de1f73 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Fri, 24 Jun 2005 23:13:50 -0700
Subject: [PATCH] Cleanup patch for process freezing

1. Establish a simple API for process freezing defined in include/linux/sched.h:

   frozen(process)		Check for frozen process
   freezing(process)		Check if a process is being frozen
   freeze(process)		Tell a process to freeze (go to refrigerator)
   thaw_process(process)	Restart process
   frozen_process(process)	Process is frozen now

2. Remove all references to PF_FREEZE and PF_FROZEN from all
   kernel sources except sched.h

3. Fix numerous locations where try_to_freeze is manually done by a driver

4. Remove the argument that is no longer necessary from two function calls.

5. Some whitespace cleanup

6. Clear potential race in refrigerator (provides an open window of PF_FREEZE
   cleared before setting PF_FROZEN, recalc_sigpending does not check
   PF_FROZEN).

This patch does not address the problem of freeze_processes() violating the rule
that a task may only modify its own flags by setting PF_FREEZE. This is not clean
in an SMP environment. freeze(process) is therefore not SMP safe!

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/kernel_threads.txt    |  3 +-
 Documentation/power/swsusp.txt            |  3 +-
 arch/frv/kernel/signal.c                  |  4 +-
 arch/h8300/kernel/signal.c                |  4 +-
 arch/i386/kernel/io_apic.c                |  2 +-
 arch/i386/kernel/signal.c                 |  4 +-
 arch/m32r/kernel/signal.c                 |  4 +-
 arch/ppc/kernel/signal.c                  |  3 +-
 arch/x86_64/kernel/signal.c               |  2 +-
 drivers/block/pktcdvd.c                   |  3 +-
 drivers/ieee1394/ieee1394_core.c          |  4 +-
 drivers/ieee1394/nodemgr.c                |  2 +-
 drivers/input/gameport/gameport.c         |  2 +-
 drivers/input/serio/serio.c               |  2 +-
 drivers/macintosh/therm_adt746x.c         |  4 +-
 drivers/md/md.c                           |  3 +-
 drivers/media/dvb/dvb-core/dvb_frontend.c |  3 +-
 drivers/media/video/msp3400.c             |  3 +-
 drivers/media/video/video-buf-dvb.c       |  3 +-
 drivers/net/8139too.c                     |  2 +-
 drivers/net/irda/sir_kthread.c            |  3 +-
 drivers/net/irda/stir4200.c               |  4 +-
 drivers/net/wireless/airo.c               |  2 +-
 drivers/pcmcia/cs.c                       |  2 +-
 drivers/pnp/pnpbios/core.c                |  2 +-
 drivers/usb/core/hub.c                    |  2 +-
 drivers/usb/gadget/file_storage.c         |  3 +-
 drivers/usb/storage/usb.c                 |  4 +-
 drivers/w1/w1.c                           |  4 +-
 fs/afs/kafsasyncd.c                       |  2 +-
 fs/afs/kafstimod.c                        |  2 +-
 fs/jbd/journal.c                          |  4 +-
 fs/jfs/jfs_logmgr.c                       |  4 +-
 fs/jfs/jfs_txnmgr.c                       |  8 ++--
 fs/lockd/clntproc.c                       |  2 +-
 fs/xfs/linux-2.6/xfs_buf.c                |  4 +-
 fs/xfs/linux-2.6/xfs_super.c              |  2 +-
 include/linux/sched.h                     | 73 +++++++++++++++++++++++++------
 kernel/power/process.c                    | 26 +++++------
 kernel/sched.c                            |  3 +-
 kernel/signal.c                           |  5 +--
 mm/pdflush.c                              |  2 +-
 mm/vmscan.c                               |  4 +-
 net/rxrpc/krxiod.c                        |  2 +-
 net/rxrpc/krxsecd.c                       |  2 +-
 net/rxrpc/krxtimod.c                      |  2 +-
 net/sunrpc/svcsock.c                      |  6 +--
 47 files changed, 126 insertions(+), 113 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/power/kernel_threads.txt b/Documentation/power/kernel_threads.txt
index 60b548105edf..fb57784986b1 100644
--- a/Documentation/power/kernel_threads.txt
+++ b/Documentation/power/kernel_threads.txt
@@ -12,8 +12,7 @@ refrigerator. Code to do this looks like this:
 	do {
 		hub_events();
 		wait_event_interruptible(khubd_wait, !list_empty(&hub_event_list));
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 	} while (!signal_pending(current));
 
 from drivers/usb/core/hub.c::hub_thread()
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index c7c3459fde43..4e1627cc5b51 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -164,8 +164,7 @@ place where the thread is safe to be frozen (no kernel semaphores
 should be held at that point and it must be safe to sleep there), and
 add:
 
-            if (current->flags & PF_FREEZE)
-                    refrigerator(PF_FREEZE);
+            try_to_freeze();
 
 If the thread is needed for writing the image to storage, you should
 instead set the PF_NOFREEZE process flag when creating the thread.
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index d8d8f3d4304d..36a2dffc8ebd 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -536,10 +536,8 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
+	if (try_to_freeze())
 		goto no_signal;
-	}
 
 	if (!oldset)
 		oldset = &current->blocked;
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index a4799d633ef4..5aab87eae1f9 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -517,10 +517,8 @@ asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if ((regs->ccr & 0x10))
 		return 1;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
+	if (try_to_freeze())
 		goto no_signal;
-	}
 
 	current->thread.esp0 = (unsigned long) regs;
 
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 08540bc4ba3e..2451a3a99440 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -573,7 +573,7 @@ static int balanced_irq(void *unused)
 	for ( ; ; ) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		time_remaining = schedule_timeout(time_remaining);
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		if (time_after(jiffies,
 				prev_balance_time+balanced_irq_interval)) {
 			do_irq_balance();
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index b9b8f4e20fad..ac5b1e975c5c 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -608,10 +608,8 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
+	if (try_to_freeze())
 		goto no_signal;
-	}
 
 	if (!oldset)
 		oldset = &current->blocked;
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 50311eb07a24..5aef7e406ef5 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -371,10 +371,8 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
+	if (try_to_freeze()) 
 		goto no_signal;
-	}
 
 	if (!oldset)
 		oldset = &current->blocked;
diff --git a/arch/ppc/kernel/signal.c b/arch/ppc/kernel/signal.c
index 7c8437da09d5..8aaeb6f4e750 100644
--- a/arch/ppc/kernel/signal.c
+++ b/arch/ppc/kernel/signal.c
@@ -705,8 +705,7 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs)
 	unsigned long frame, newsp;
 	int signr, ret;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(PF_FREEZE);
+	if (try_to_freeze()) {
 		signr = 0;
 		if (!signal_pending(current))
 			goto no_signal;
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 98b7ba95d581..98590a989f3d 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -425,7 +425,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (try_to_freeze(0))
+	if (try_to_freeze())
 		goto no_signal;
 
 	if (!oldset)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 7f3d78de265c..7b838342f0a3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1251,8 +1251,7 @@ static int kcdrwd(void *foobar)
 			VPRINTK("kcdrwd: wake up\n");
 
 			/* make swsusp happy with our thread */
-			if (current->flags & PF_FREEZE)
-				refrigerator(PF_FREEZE);
+			try_to_freeze();
 
 			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
 				if (!pkt->sleep_time)
diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index 2d9a9b74e687..629070b83a33 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -1041,10 +1041,8 @@ static int hpsbpkt_thread(void *__hi)
 
 	while (1) {
 		if (down_interruptible(&khpsbpkt_sig)) {
-			if (current->flags & PF_FREEZE) {
-				refrigerator(0);
+			if (try_to_freeze())
 				continue;
-			}
 			printk("khpsbpkt: received unexpected signal?!\n" );
 			break;
 		}
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 32abb6dda888..9a46c3b44bf8 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -1510,7 +1510,7 @@ static int nodemgr_host_thread(void *__hi)
 
 		if (down_interruptible(&hi->reset_sem) ||
 		    down_interruptible(&nodemgr_serialize)) {
-			if (try_to_freeze(PF_FREEZE))
+			if (try_to_freeze())
 				continue;
 			printk("NodeMgr: received unexpected signal?!\n" );
 			break;
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index e152d0fa0cdd..c77a82e46055 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -439,7 +439,7 @@ static int gameport_thread(void *nothing)
 	do {
 		gameport_handle_events();
 		wait_event_interruptible(gameport_wait, !list_empty(&gameport_event_list));
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 	} while (!signal_pending(current));
 
 	printk(KERN_DEBUG "gameport: kgameportd exiting\n");
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index feab4970406e..341824c48529 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -344,7 +344,7 @@ static int serio_thread(void *nothing)
 	do {
 		serio_handle_events();
 		wait_event_interruptible(serio_wait, !list_empty(&serio_event_list));
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 	} while (!signal_pending(current));
 
 	printk(KERN_DEBUG "serio: kseriod exiting\n");
diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index 5ba190ce14a0..c9ca1118e449 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -328,9 +328,7 @@ static int monitor_task(void *arg)
 	struct thermostat* th = arg;
 
 	while(!kthread_should_stop()) {
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
-
+		try_to_freeze();
 		msleep_interruptible(2000);
 
 #ifndef DEBUG
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0c6b5b6baff6..3802f7a17f16 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2976,8 +2976,7 @@ static int md_thread(void * arg)
 		wait_event_interruptible_timeout(thread->wqueue,
 						 test_bit(THREAD_WAKEUP, &thread->flags),
 						 thread->timeout);
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index d6b7a9de471e..f11daae91cd4 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -391,8 +391,7 @@ static int dvb_frontend_thread(void *data)
 			break;
 		}
 
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		if (down_interruptible(&fepriv->sem))
 			break;
diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c
index 1b7d38e96f14..b4ee9dfe6d42 100644
--- a/drivers/media/video/msp3400.c
+++ b/drivers/media/video/msp3400.c
@@ -750,8 +750,7 @@ static int msp34xx_sleep(struct msp3400c *msp, int timeout)
 #endif
 		}
 	}
-	if (current->flags & PF_FREEZE)
-		refrigerator(PF_FREEZE);
+	try_to_freeze();
 	remove_wait_queue(&msp->wq, &wait);
 	return msp->restart;
 }
diff --git a/drivers/media/video/video-buf-dvb.c b/drivers/media/video/video-buf-dvb.c
index 5f870075b55e..15f5bb486963 100644
--- a/drivers/media/video/video-buf-dvb.c
+++ b/drivers/media/video/video-buf-dvb.c
@@ -62,8 +62,7 @@ static int videobuf_dvb_thread(void *data)
 			break;
 		if (kthread_should_stop())
 			break;
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		/* feed buffer data to demux */
 		if (buf->state == STATE_DONE)
diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index 047202c4d9a8..5a4a08a7c951 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -1606,7 +1606,7 @@ static int rtl8139_thread (void *data)
 		do {
 			timeout = interruptible_sleep_on_timeout (&tp->thr_wait, timeout);
 			/* make swsusp happy with our thread */
-			try_to_freeze(PF_FREEZE);
+			try_to_freeze();
 		} while (!signal_pending (current) && (timeout > 0));
 
 		if (signal_pending (current)) {
diff --git a/drivers/net/irda/sir_kthread.c b/drivers/net/irda/sir_kthread.c
index 18cea1099530..c65054364bca 100644
--- a/drivers/net/irda/sir_kthread.c
+++ b/drivers/net/irda/sir_kthread.c
@@ -135,8 +135,7 @@ static int irda_thread(void *startup)
 		remove_wait_queue(&irda_rq_queue.kick, &wait);
 
 		/* make swsusp happy with our thread */
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		run_irda_queue();
 	}
diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c
index 66f488c13717..15f207323d97 100644
--- a/drivers/net/irda/stir4200.c
+++ b/drivers/net/irda/stir4200.c
@@ -763,7 +763,7 @@ static int stir_transmit_thread(void *arg)
 	{
 #ifdef CONFIG_PM
 		/* if suspending, then power off and wait */
-		if (unlikely(current->flags & PF_FREEZE)) {
+		if (unlikely(freezing(current))) {
 			if (stir->receiving)
 				receive_stop(stir);
 			else
@@ -771,7 +771,7 @@ static int stir_transmit_thread(void *arg)
 
 			write_reg(stir, REG_CTRL1, CTRL1_TXPWD|CTRL1_RXPWD);
 
-			refrigerator(PF_FREEZE);
+			refrigerator();
 
 			if (change_speed(stir, stir->speed))
 				break;
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index fb10a2db63ad..d72e0385e4f2 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -2918,7 +2918,7 @@ static int airo_thread(void *data) {
 			flush_signals(current);
 
 		/* make swsusp happy with our thread */
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		if (test_bit(JOB_DIE, &ai->flags))
 			break;
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index d136b3c8fac9..48e4f04530d8 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -718,7 +718,7 @@ static int pccardd(void *__skt)
 		}
 
 		schedule();
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		if (!skt->thread)
 			break;
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index e939c93a931c..778a324028f4 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -182,7 +182,7 @@ static int pnp_dock_thread(void * unused)
 		msleep_interruptible(2000);
 
 		if(signal_pending(current)) {
-			if (try_to_freeze(PF_FREEZE))
+			if (try_to_freeze())
 				continue;
 			break;
 		}
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index d2d648ee8640..a8d879a85d04 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2808,7 +2808,7 @@ static int hub_thread(void *__unused)
 	do {
 		hub_events();
 		wait_event_interruptible(khubd_wait, !list_empty(&hub_event_list)); 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 	} while (!signal_pending(current));
 
 	pr_debug ("%s: khubd exiting\n", usbcore_name);
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 037a7f163822..a9be85103d23 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -1554,8 +1554,7 @@ static int sleep_thread(struct fsg_dev *fsg)
 	rc = wait_event_interruptible(fsg->thread_wqh,
 			fsg->thread_wakeup_needed);
 	fsg->thread_wakeup_needed = 0;
-	if (current->flags & PF_FREEZE)
-		refrigerator(PF_FREEZE);
+	try_to_freeze();
 	return (rc ? -EINTR : 0);
 }
 
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index 35c1ca6b5a8e..77e7fc258aa2 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -847,10 +847,8 @@ retry:
 		wait_event_interruptible_timeout(us->delay_wait,
 				test_bit(US_FLIDX_DISCONNECTING, &us->flags),
 				delay_use * HZ);
-		if (current->flags & PF_FREEZE) {
-			refrigerator(PF_FREEZE);
+		if (try_to_freeze())
 			goto retry;
-		}
 	}
 
 	/* If the device is still connected, perform the scanning */
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index b460927ec32a..312cf3220f12 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -646,7 +646,7 @@ static int w1_control(void *data)
 	while (!control_needs_exit || have_to_wait) {
 		have_to_wait = 0;
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		msleep_interruptible(w1_timeout * 1000);
 
 		if (signal_pending(current))
@@ -725,7 +725,7 @@ int w1_process(void *data)
 	allow_signal(SIGTERM);
 
 	while (!test_bit(W1_MASTER_NEED_EXIT, &dev->flags)) {
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		msleep_interruptible(w1_timeout * 1000);
 
 		if (signal_pending(current))
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
index 6fc88ae8ad94..7ac07d0d47b9 100644
--- a/fs/afs/kafsasyncd.c
+++ b/fs/afs/kafsasyncd.c
@@ -116,7 +116,7 @@ static int kafsasyncd(void *arg)
 		remove_wait_queue(&kafsasyncd_sleepq, &myself);
 		set_current_state(TASK_RUNNING);
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		/* discard pending signals */
 		afs_discard_my_signals();
diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c
index 86e710dd057e..65bc05ab8182 100644
--- a/fs/afs/kafstimod.c
+++ b/fs/afs/kafstimod.c
@@ -91,7 +91,7 @@ static int kafstimod(void *arg)
 			complete_and_exit(&kafstimod_dead, 0);
 		}
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		/* discard pending signals */
 		afs_discard_my_signals();
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 1e6f2e2ad4a3..5e7b43949517 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -167,7 +167,7 @@ loop:
 	}
 
 	wake_up(&journal->j_wait_done_commit);
-	if (current->flags & PF_FREEZE) {
+	if (freezing(current)) {
 		/*
 		 * The simpler the better. Flushing journal isn't a
 		 * good idea, because that depends on threads that may
@@ -175,7 +175,7 @@ loop:
 		 */
 		jbd_debug(1, "Now suspending kjournald\n");
 		spin_unlock(&journal->j_state_lock);
-		refrigerator(PF_FREEZE);
+		refrigerator();
 		spin_lock(&journal->j_state_lock);
 	} else {
 		/*
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 7c8387ed4192..79d07624bfe1 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2359,9 +2359,9 @@ int jfsIOWait(void *arg)
 			lbmStartIO(bp);
 			spin_lock_irq(&log_redrive_lock);
 		}
-		if (current->flags & PF_FREEZE) {
+		if (freezing(current)) {
 			spin_unlock_irq(&log_redrive_lock);
-			refrigerator(PF_FREEZE);
+			refrigerator();
 		} else {
 			add_wait_queue(&jfs_IO_thread_wait, &wq);
 			set_current_state(TASK_INTERRUPTIBLE);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 8cbaaff1d5fa..121c981ff453 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2788,9 +2788,9 @@ int jfs_lazycommit(void *arg)
 		/* In case a wakeup came while all threads were active */
 		jfs_commit_thread_waking = 0;
 
-		if (current->flags & PF_FREEZE) {
+		if (freezing(current)) {
 			LAZY_UNLOCK(flags);
-			refrigerator(PF_FREEZE);
+			refrigerator();
 		} else {
 			DECLARE_WAITQUEUE(wq, current);
 
@@ -2987,9 +2987,9 @@ int jfs_sync(void *arg)
 		/* Add anon_list2 back to anon_list */
 		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
 
-		if (current->flags & PF_FREEZE) {
+		if (freezing(current)) {
 			TXN_UNLOCK();
-			refrigerator(PF_FREEZE);
+			refrigerator();
 		} else {
 			DECLARE_WAITQUEUE(wq, current);
 
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index fd77ed1d710d..14b3ce87fa29 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -313,7 +313,7 @@ static int nlm_wait_on_grace(wait_queue_head_t *queue)
 	prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE);
 	if (!signalled ()) {
 		schedule_timeout(NLMCLNT_GRACE_WAIT);
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		if (!signalled ())
 			status = 0;
 	}
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index c60e69431e11..df0cba239dd5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1771,9 +1771,9 @@ xfsbufd(
 
 	INIT_LIST_HEAD(&tmp);
 	do {
-		if (unlikely(current->flags & PF_FREEZE)) {
+		if (unlikely(freezing(current))) {
 			xfsbufd_force_sleep = 1;
-			refrigerator(PF_FREEZE);
+			refrigerator();
 		} else {
 			xfsbufd_force_sleep = 0;
 		}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 5fe9af38aa20..f6dd7de25927 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -483,7 +483,7 @@ xfssyncd(
 		set_current_state(TASK_INTERRUPTIBLE);
 		timeleft = schedule_timeout(timeleft);
 		/* swsusp */
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		if (vfsp->vfs_flag & VFS_UMOUNT)
 			break;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c69682b0444..e7fd09b0557f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1245,33 +1245,78 @@ extern void normalize_rt_tasks(void);
 
 #endif
 
-/* try_to_freeze
- *
- * Checks whether we need to enter the refrigerator
- * and returns 1 if we did so.
- */
 #ifdef CONFIG_PM
-extern void refrigerator(unsigned long);
+/*
+ * Check if a process has been frozen
+ */
+static inline int frozen(struct task_struct *p)
+{
+	return p->flags & PF_FROZEN;
+}
+
+/*
+ * Check if there is a request to freeze a process
+ */
+static inline int freezing(struct task_struct *p)
+{
+	return p->flags & PF_FREEZE;
+}
+
+/*
+ * Request that a process be frozen
+ * FIXME: SMP problem. We may not modify other process' flags!
+ */
+static inline void freeze(struct task_struct *p)
+{
+	p->flags |= PF_FREEZE;
+}
+
+/*
+ * Wake up a frozen process
+ */
+static inline int thaw_process(struct task_struct *p)
+{
+	if (frozen(p)) {
+		p->flags &= ~PF_FROZEN;
+		wake_up_process(p);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * freezing is complete, mark process as frozen
+ */
+static inline void frozen_process(struct task_struct *p)
+{
+	p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN;
+}
+
+extern void refrigerator(void);
 extern int freeze_processes(void);
 extern void thaw_processes(void);
 
-static inline int try_to_freeze(unsigned long refrigerator_flags)
+static inline int try_to_freeze(void)
 {
-	if (unlikely(current->flags & PF_FREEZE)) {
-		refrigerator(refrigerator_flags);
+	if (freezing(current)) {
+		refrigerator();
 		return 1;
 	} else
 		return 0;
 }
 #else
-static inline void refrigerator(unsigned long flag) {}
+static inline int frozen(struct task_struct *p) { return 0; }
+static inline int freezing(struct task_struct *p) { return 0; }
+static inline void freeze(struct task_struct *p) { BUG(); }
+static inline int thaw_process(struct task_struct *p) { return 1; }
+static inline void frozen_process(struct task_struct *p) { BUG(); }
+
+static inline void refrigerator(void) {}
 static inline int freeze_processes(void) { BUG(); return 0; }
 static inline void thaw_processes(void) {}
 
-static inline int try_to_freeze(unsigned long refrigerator_flags)
-{
-	return 0;
-}
+static inline int try_to_freeze(void) { return 0; }
+
 #endif /* CONFIG_PM */
 #endif /* __KERNEL__ */
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78d92dc6a1ed..0a086640bcfc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p)
 }
 
 /* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(unsigned long flag)
+void refrigerator(void)
 {
 	/* Hmm, should we be allowed to suspend when there are realtime
 	   processes around? */
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag)
 	current->state = TASK_UNINTERRUPTIBLE;
 	pr_debug("%s entered refrigerator\n", current->comm);
 	printk("=");
-	current->flags &= ~PF_FREEZE;
 
+	frozen_process(current);
 	spin_lock_irq(&current->sighand->siglock);
 	recalc_sigpending(); /* We sent fake signal, clean it up */
 	spin_unlock_irq(&current->sighand->siglock);
 
-	current->flags |= PF_FROZEN;
-	while (current->flags & PF_FROZEN)
+	while (frozen(current))
 		schedule();
 	pr_debug("%s left refrigerator\n", current->comm);
 	current->state = save;
@@ -57,10 +56,10 @@ void refrigerator(unsigned long flag)
 /* 0 = success, else # of processes that we failed to stop */
 int freeze_processes(void)
 {
-       int todo;
-       unsigned long start_time;
+	int todo;
+	unsigned long start_time;
 	struct task_struct *g, *p;
-	
+
 	printk( "Stopping tasks: " );
 	start_time = jiffies;
 	do {
@@ -70,14 +69,12 @@ int freeze_processes(void)
 			unsigned long flags;
 			if (!freezeable(p))
 				continue;
-			if ((p->flags & PF_FROZEN) ||
+			if ((frozen(p)) ||
 			    (p->state == TASK_TRACED) ||
 			    (p->state == TASK_STOPPED))
 				continue;
 
-			/* FIXME: smp problem here: we may not access other process' flags
-			   without locking */
-			p->flags |= PF_FREEZE;
+			freeze(p);
 			spin_lock_irqsave(&p->sighand->siglock, flags);
 			signal_wake_up(p, 0);
 			spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -91,7 +88,7 @@ int freeze_processes(void)
 			return todo;
 		}
 	} while(todo);
-	
+
 	printk( "|\n" );
 	BUG_ON(in_atomic());
 	return 0;
@@ -106,10 +103,7 @@ void thaw_processes(void)
 	do_each_thread(g, p) {
 		if (!freezeable(p))
 			continue;
-		if (p->flags & PF_FROZEN) {
-			p->flags &= ~PF_FROZEN;
-			wake_up_process(p);
-		} else
+		if (!thaw_process(p))
 			printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
 	} while_each_thread(g, p);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 76080d142e3d..6fa9ea4ae44c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4174,8 +4174,7 @@ static int migration_thread(void * data)
 		struct list_head *head;
 		migration_req_t *req;
 
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		spin_lock_irq(&rq->lock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index d1258729a5f9..ca1186eef938 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -213,7 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 fastcall void recalc_sigpending_tsk(struct task_struct *t)
 {
 	if (t->signal->group_stop_count > 0 ||
-	    (t->flags & PF_FREEZE) ||
+	    (freezing(t)) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked))
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -2231,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 			current->state = TASK_INTERRUPTIBLE;
 			timeout = schedule_timeout(timeout);
 
-			if (current->flags & PF_FREEZE)
-				refrigerator(PF_FREEZE);
+			try_to_freeze();
 			spin_lock_irq(&current->sighand->siglock);
 			sig = dequeue_signal(current, &these, &info);
 			current->blocked = current->real_blocked;
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 38ce279cc8cd..d6781951267e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -105,7 +105,7 @@ static int __pdflush(struct pdflush_work *my_work)
 		spin_unlock_irq(&pdflush_lock);
 
 		schedule();
-		if (try_to_freeze(PF_FREEZE)) {
+		if (try_to_freeze()) {
 			spin_lock_irq(&pdflush_lock);
 			continue;
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4b8e62a19370..1fa312a8db77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1216,8 +1216,8 @@ static int kswapd(void *p)
 	order = 0;
 	for ( ; ; ) {
 		unsigned long new_order;
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+
+		try_to_freeze();
 
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 		new_order = pgdat->kswapd_max_order;
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a17..dada34a77b21 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
 
 		_debug("### End Work");
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
                 /* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d9228..1aadd026d354 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
 
 		_debug("### End Inbound Calls");
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
                 /* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290bb..3ac81cdd1211 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
 			complete_and_exit(&krxtimod_dead, 0);
 		}
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		/* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc96..56db8f13e6cb 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
 	arg->page_len = (pages-2)*PAGE_SIZE;
 	arg->len = (pages-1)*PAGE_SIZE;
 	arg->tail[0].iov_len = 0;
-	
-	try_to_freeze(PF_FREEZE);
+
+	try_to_freeze();
 	if (signalled())
 		return -EINTR;
 
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
 
 		schedule_timeout(timeout);
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		spin_lock_bh(&serv->sv_lock);
 		remove_wait_queue(&rqstp->rq_wait, &wait);
-- 
cgit v1.3-14-g43fede