From 1b2439dbb703ae8d95a9ce7ece6b7800b80f41f0 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 15 Aug 2008 15:29:38 -0700
Subject: debug: add notifier chain debugging

During some development we suspected a case where we left something
in a notifier chain that was from a module that was unloaded already...
and that sort of thing is rather hard to track down.

This patch adds a very simple sanity check (which isn't all that
expensive) to make sure the notifier we're about to call is
actually from either the kernel itself or from a still-loaded
module, avoiding a hard-to-chase-down crash.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/Kconfig.debug | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8b5a7d304a5f..342858fbabbc 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -536,6 +536,16 @@ config DEBUG_SG
 
 	  If unsure, say N.
 
+config DEBUG_NOTIFIERS
+	bool "Debug notifier call chains"
+	depends on DEBUG_KERNEL
+	help
+	  Enable this to turn on sanity checking for notifier call chains.
+	  This is most useful for kernel developers to make sure that
+	  modules properly unregister themselves from notifier chains.
+	  This is a relatively cheap check but if you care about maximum
+	  performance, say N.
+
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
 	depends on DEBUG_KERNEL && \
-- 
cgit v1.3-14-g43fede


From ab7476cf76e560f0efda2a631a70aabe93009025 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 15 Aug 2008 15:29:38 -0700
Subject: debug: add notifier chain debugging, v2

- unbreak ia64 (and powerpc) where function pointers don't
  point at code but at data (reported by Tony Luck)

[ mingo@elte.hu: various cleanups ]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h |  3 +++
 kernel/extable.c       | 16 ++++++++++++++++
 kernel/notifier.c      | 10 +---------
 lib/vsprintf.c         |  2 +-
 4 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'lib')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2651f805ba6d..4e1366b552ae 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -187,6 +187,9 @@ extern unsigned long long memparse(char *ptr, char **retptr);
 extern int core_kernel_text(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
+extern int func_ptr_is_kernel_text(void *ptr);
+extern void *dereference_function_descriptor(void *ptr);
+
 struct pid;
 extern struct pid *session_of_pgrp(struct pid *pgrp);
 
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e17023..adf0cc9c02d6 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -66,3 +66,19 @@ int kernel_text_address(unsigned long addr)
 		return 1;
 	return module_text_address(addr) != NULL;
 }
+
+/*
+ * On some architectures (PPC64, IA64) function pointers
+ * are actually only tokens to some data that then holds the
+ * real function address. As a result, to find if a function
+ * pointer is part of the kernel text, we need to do some
+ * special dereferencing first.
+ */
+int func_ptr_is_kernel_text(void *ptr)
+{
+	unsigned long addr;
+	addr = (unsigned long) dereference_function_descriptor(ptr);
+	if (core_kernel_text(addr))
+		return 1;
+	return module_text_address(addr) != NULL;
+}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 143fdd77dbf7..0f39e398ef60 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -21,10 +21,6 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
 static int notifier_chain_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
-	if (!kernel_text_address((unsigned long)n->notifier_call)) {
-		WARN(1, "Invalid notifier registered!");
-		return 0;
-	}
 	while ((*nl) != NULL) {
 		if (n->priority > (*nl)->priority)
 			break;
@@ -38,10 +34,6 @@ static int notifier_chain_register(struct notifier_block **nl,
 static int notifier_chain_cond_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
-	if (!kernel_text_address((unsigned long)n->notifier_call)) {
-		WARN(1, "Invalid notifier registered!");
-		return 0;
-	}
 	while ((*nl) != NULL) {
 		if ((*nl) == n)
 			return 0;
@@ -92,7 +84,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
 		next_nb = rcu_dereference(nb->next);
 
 #ifdef CONFIG_DEBUG_NOTIFIERS
-		if (!kernel_text_address((unsigned long)nb->notifier_call)) {
+		if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
 			WARN(1, "Invalid notifier called!");
 			nb = next_nb;
 			continue;
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index d8d1d1142248..f5e5ffb9942f 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -513,7 +513,7 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio
 	return buf;
 }
 
-static inline void *dereference_function_descriptor(void *ptr)
+void *dereference_function_descriptor(void *ptr)
 {
 #if defined(CONFIG_IA64) || defined(CONFIG_PPC64)
 	void *p;
-- 
cgit v1.3-14-g43fede


From dd45c9cf687682c9ce256ab14bd8914db77410bb Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 27 Oct 2008 15:47:12 -0700
Subject: printk: add %pM format specifier for MAC addresses

Add format specifiers for printing out six colon-separated bytes:

MAC addresses (%pM):
xx:xx:xx:xx:xx:xx

%#pM is also supported and omits the colon separators.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/vsprintf.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index a013bbc23717..0deaaaf2b14e 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -581,6 +581,23 @@ static char *resource_string(char *buf, char *end, struct resource *res, int fie
 	return string(buf, end, sym, field_width, precision, flags);
 }
 
+static char *mac_address_string(char *buf, char *end, u8 *addr, int field_width,
+				int precision, int flags)
+{
+	char mac_addr[6 * 3]; /* (6 * 2 hex digits), 5 colons and trailing zero */
+	char *p = mac_addr;
+	int i;
+
+	for (i = 0; i < 6; i++) {
+		p = pack_hex_byte(p, addr[i]);
+		if (!(flags & SPECIAL) && i != 5)
+			*p++ = ':';
+	}
+	*p = '\0';
+
+	return string(buf, end, mac_addr, field_width, precision, flags & ~SPECIAL);
+}
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
@@ -592,6 +609,8 @@ static char *resource_string(char *buf, char *end, struct resource *res, int fie
  * - 'S' For symbolic direct pointers
  * - 'R' For a struct resource pointer, it prints the range of
  *       addresses (not the name nor the flags)
+ * - 'M' For a 6-byte MAC address, it prints the address in the
+ *       usual colon-separated hex notation
  *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
@@ -607,6 +626,8 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
 		return symbol_string(buf, end, ptr, field_width, precision, flags);
 	case 'R':
 		return resource_string(buf, end, ptr, field_width, precision, flags);
+	case 'M':
+		return mac_address_string(buf, end, ptr, field_width, precision, flags);
 	}
 	flags |= SMALL;
 	if (field_width == -1) {
-- 
cgit v1.3-14-g43fede


From 689afa7da106032a3e859ae35494f80dd6eac640 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 28 Oct 2008 16:04:44 -0700
Subject: printk: add %p6 format specifier for IPv6 addresses

Takes a pointer to an IPv6 address and formats it in the usual
colon-separated hex format:
xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx

Each 16 bit word is printed in network-endian byteorder.

%#p6 is also supported and will omit the colons.

%p6 is a replacement for NIP6_FMT and NIP6()
%#p6 is a replacement for NIP6_SEQFMT and NIP6()

Note that NIP6() took a struct in6_addr whereas this takes a pointer
to a struct in6_addr.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/vsprintf.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 0deaaaf2b14e..cb5bc04ff82b 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -598,6 +598,24 @@ static char *mac_address_string(char *buf, char *end, u8 *addr, int field_width,
 	return string(buf, end, mac_addr, field_width, precision, flags & ~SPECIAL);
 }
 
+static char *ip6_addr_string(char *buf, char *end, u8 *addr, int field_width,
+			 int precision, int flags)
+{
+	char ip6_addr[8 * 5]; /* (8 * 4 hex digits), 7 colons and trailing zero */
+	char *p = ip6_addr;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		p = pack_hex_byte(p, addr[2 * i]);
+		p = pack_hex_byte(p, addr[2 * i + 1]);
+		if (!(flags & SPECIAL) && i != 7)
+			*p++ = ':';
+	}
+	*p = '\0';
+
+	return string(buf, end, ip6_addr, field_width, precision, flags & ~SPECIAL);
+}
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
@@ -611,6 +629,8 @@ static char *mac_address_string(char *buf, char *end, u8 *addr, int field_width,
  *       addresses (not the name nor the flags)
  * - 'M' For a 6-byte MAC address, it prints the address in the
  *       usual colon-separated hex notation
+ * - '6' For a IPv6 address prints the address in network-ordered 16 bit hex
+ *       with colon separators
  *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
@@ -628,6 +648,8 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
 		return resource_string(buf, end, ptr, field_width, precision, flags);
 	case 'M':
 		return mac_address_string(buf, end, ptr, field_width, precision, flags);
+	case '6':
+		return ip6_addr_string(buf, end, ptr, field_width, precision, flags);
 	}
 	flags |= SMALL;
 	if (field_width == -1) {
-- 
cgit v1.3-14-g43fede


From 4aa996066ffc0dba24036c961ee38dfdfbfc061c Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 29 Oct 2008 12:49:58 -0700
Subject: printk: add %I4, %I6, %i4, %i6 format specifiers

For use in printing IPv4 or IPv6 addresses in the usual way:

%i4 and %I4 are currently equivalent and print the address in
dot-separated decimal x.x.x.x

%I6 prints 16-bit network order hex with colon separators:
xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx

%i6 omits the colons.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/vsprintf.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index cb5bc04ff82b..a8630063d14d 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -616,6 +616,23 @@ static char *ip6_addr_string(char *buf, char *end, u8 *addr, int field_width,
 	return string(buf, end, ip6_addr, field_width, precision, flags & ~SPECIAL);
 }
 
+static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width,
+			 int precision, int flags)
+{
+	char ip4_addr[4 * 4]; /* (4 * 3 decimal digits), 3 dots and trailing zero */
+	char *p = ip4_addr;
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		p = put_dec_trunc(p, addr[i]);
+		if (i != 3)
+			*p++ = '.';
+	}
+	*p = '\0';
+
+	return string(buf, end, ip4_addr, field_width, precision, flags & ~SPECIAL);
+}
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
@@ -629,6 +646,10 @@ static char *ip6_addr_string(char *buf, char *end, u8 *addr, int field_width,
  *       addresses (not the name nor the flags)
  * - 'M' For a 6-byte MAC address, it prints the address in the
  *       usual colon-separated hex notation
+ * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way (dot-separated
+ *       decimal for v4 and colon separated network-order 16 bit hex for v6)
+ * - 'i' [46] for 'raw' IPv4/IPv6 addresses, IPv6 omits the colons, IPv4 is
+         currently the same
  * - '6' For a IPv6 address prints the address in network-ordered 16 bit hex
  *       with colon separators
  *
@@ -650,6 +671,16 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
 		return mac_address_string(buf, end, ptr, field_width, precision, flags);
 	case '6':
 		return ip6_addr_string(buf, end, ptr, field_width, precision, flags);
+	case 'i':
+		flags |= SPECIAL;
+		/* Fallthrough */
+	case 'I':
+		if (fmt[1] == '6')
+			return ip6_addr_string(buf, end, ptr, field_width, precision, flags);
+		if (fmt[1] == '4')
+			return ip4_addr_string(buf, end, ptr, field_width, precision, flags);
+		flags &= ~SPECIAL;
+		break;
 	}
 	flags |= SMALL;
 	if (field_width == -1) {
-- 
cgit v1.3-14-g43fede


From 6b9a1066f8d4c57c1b778affd91fc38e1121b6f8 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 29 Oct 2008 12:53:10 -0700
Subject: printk: remove %p6 format specifier, fix up comments

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/vsprintf.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index a8630063d14d..dd7cc7fa3e77 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -649,9 +649,7 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width,
  * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way (dot-separated
  *       decimal for v4 and colon separated network-order 16 bit hex for v6)
  * - 'i' [46] for 'raw' IPv4/IPv6 addresses, IPv6 omits the colons, IPv4 is
-         currently the same
- * - '6' For a IPv6 address prints the address in network-ordered 16 bit hex
- *       with colon separators
+ *       currently the same
  *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
@@ -669,8 +667,6 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
 		return resource_string(buf, end, ptr, field_width, precision, flags);
 	case 'M':
 		return mac_address_string(buf, end, ptr, field_width, precision, flags);
-	case '6':
-		return ip6_addr_string(buf, end, ptr, field_width, precision, flags);
 	case 'i':
 		flags |= SPECIAL;
 		/* Fallthrough */
-- 
cgit v1.3-14-g43fede


From b9ac99855dcc0316ba2feee2b0d3e578f8315b75 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 3 Nov 2008 17:09:55 -0800
Subject: printk: ipv4 address digits printed in reverse order

put_dec_trunc prints the digits in reverse order and is reversed
inside number(). Continue using put_dec_trunc, but reverse each quad
in ip4_addr_string.

[Noticed by Julius Volz]

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/vsprintf.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index dd7cc7fa3e77..6897724ff5df 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -620,11 +620,15 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width,
 			 int precision, int flags)
 {
 	char ip4_addr[4 * 4]; /* (4 * 3 decimal digits), 3 dots and trailing zero */
+	char temp[3];	/* hold each IP quad in reverse order */
 	char *p = ip4_addr;
-	int i;
+	int i, digits;
 
 	for (i = 0; i < 4; i++) {
-		p = put_dec_trunc(p, addr[i]);
+		digits = put_dec_trunc(temp, addr[i]) - temp;
+		/* reverse the digits in the quad */
+		while (digits--)
+			*p++ = temp[digits];
 		if (i != 3)
 			*p++ = '.';
 	}
-- 
cgit v1.3-14-g43fede


From 6cc88bc45ce8043171089c9592da223dfab91823 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:21 +1100
Subject: CRED: Rename is_single_threaded() to is_wq_single_threaded()

Rename is_single_threaded() to is_wq_single_threaded() so that a new
is_single_threaded() can be created that refers to tasks rather than
workqueues.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/workqueue.c       |  8 ++++----
 lib/is_single_threaded.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 4 deletions(-)
 create mode 100644 lib/is_single_threaded.c

(limited to 'lib')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f928f2a87b9b..f12ab5c4dec4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -84,21 +84,21 @@ static cpumask_t cpu_singlethread_map __read_mostly;
 static cpumask_t cpu_populated_map __read_mostly;
 
 /* If it's single threaded, it isn't in the list of workqueues. */
-static inline int is_single_threaded(struct workqueue_struct *wq)
+static inline int is_wq_single_threaded(struct workqueue_struct *wq)
 {
 	return wq->singlethread;
 }
 
 static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
 {
-	return is_single_threaded(wq)
+	return is_wq_single_threaded(wq)
 		? &cpu_singlethread_map : &cpu_populated_map;
 }
 
 static
 struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
 {
-	if (unlikely(is_single_threaded(wq)))
+	if (unlikely(is_wq_single_threaded(wq)))
 		cpu = singlethread_cpu;
 	return per_cpu_ptr(wq->cpu_wq, cpu);
 }
@@ -769,7 +769,7 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct workqueue_struct *wq = cwq->wq;
-	const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
+	const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
 	struct task_struct *p;
 
 	p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c
new file mode 100644
index 000000000000..f1ed2fe76c65
--- /dev/null
+++ b/lib/is_single_threaded.c
@@ -0,0 +1,45 @@
+/* Function to determine if a thread group is single threaded or not
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * - Derived from security/selinux/hooks.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+
+/**
+ * is_single_threaded - Determine if a thread group is single-threaded or not
+ * @p: A task in the thread group in question
+ *
+ * This returns true if the thread group to which a task belongs is single
+ * threaded, false if it is not.
+ */
+bool is_single_threaded(struct task_struct *p)
+{
+	struct task_struct *g, *t;
+	struct mm_struct *mm = p->mm;
+
+	if (atomic_read(&p->signal->count) != 1)
+		goto no;
+
+	if (atomic_read(&p->mm->mm_users) != 1) {
+		read_lock(&tasklist_lock);
+		do_each_thread(g, t) {
+			if (t->mm == mm && t != p)
+				goto no_unlock;
+		} while_each_thread(g, t);
+		read_unlock(&tasklist_lock);
+	}
+
+	return true;
+
+no_unlock:
+	read_unlock(&tasklist_lock);
+no:
+	return false;
+}
-- 
cgit v1.3-14-g43fede


From d84f4f992cbd76e8f39c488cf0c5d123843923b1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:23 +1100
Subject: CRED: Inaugurate COW credentials

Inaugurate copy-on-write credentials management.  This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.

A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().

With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:

	struct cred *new = prepare_creds();
	int ret = blah(new);
	if (ret < 0) {
		abort_creds(new);
		return ret;
	}
	return commit_creds(new);

There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.

To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const.  The purpose of this is compile-time
discouragement of altering credentials through those pointers.  Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:

  (1) Its reference count may be incremented and decremented.

  (2) The keyrings to which it points may be modified, but not replaced.

The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).

This patch and the preceding patches have been tested with the LTP SELinux
testsuite.

This patch makes several logical sets of alteration:

 (1) execve().

     This now prepares and commits credentials in various places in the
     security code rather than altering the current creds directly.

 (2) Temporary credential overrides.

     do_coredump() and sys_faccessat() now prepare their own credentials and
     temporarily override the ones currently on the acting thread, whilst
     preventing interference from other threads by holding cred_replace_mutex
     on the thread being dumped.

     This will be replaced in a future patch by something that hands down the
     credentials directly to the functions being called, rather than altering
     the task's objective credentials.

 (3) LSM interface.

     A number of functions have been changed, added or removed:

     (*) security_capset_check(), ->capset_check()
     (*) security_capset_set(), ->capset_set()

     	 Removed in favour of security_capset().

     (*) security_capset(), ->capset()

     	 New.  This is passed a pointer to the new creds, a pointer to the old
     	 creds and the proposed capability sets.  It should fill in the new
     	 creds or return an error.  All pointers, barring the pointer to the
     	 new creds, are now const.

     (*) security_bprm_apply_creds(), ->bprm_apply_creds()

     	 Changed; now returns a value, which will cause the process to be
     	 killed if it's an error.

     (*) security_task_alloc(), ->task_alloc_security()

     	 Removed in favour of security_prepare_creds().

     (*) security_cred_free(), ->cred_free()

     	 New.  Free security data attached to cred->security.

     (*) security_prepare_creds(), ->cred_prepare()

     	 New. Duplicate any security data attached to cred->security.

     (*) security_commit_creds(), ->cred_commit()

     	 New. Apply any security effects for the upcoming installation of new
     	 security by commit_creds().

     (*) security_task_post_setuid(), ->task_post_setuid()

     	 Removed in favour of security_task_fix_setuid().

     (*) security_task_fix_setuid(), ->task_fix_setuid()

     	 Fix up the proposed new credentials for setuid().  This is used by
     	 cap_set_fix_setuid() to implicitly adjust capabilities in line with
     	 setuid() changes.  Changes are made to the new credentials, rather
     	 than the task itself as in security_task_post_setuid().

     (*) security_task_reparent_to_init(), ->task_reparent_to_init()

     	 Removed.  Instead the task being reparented to init is referred
     	 directly to init's credentials.

	 NOTE!  This results in the loss of some state: SELinux's osid no
	 longer records the sid of the thread that forked it.

     (*) security_key_alloc(), ->key_alloc()
     (*) security_key_permission(), ->key_permission()

     	 Changed.  These now take cred pointers rather than task pointers to
     	 refer to the security context.

 (4) sys_capset().

     This has been simplified and uses less locking.  The LSM functions it
     calls have been merged.

 (5) reparent_to_kthreadd().

     This gives the current thread the same credentials as init by simply using
     commit_thread() to point that way.

 (6) __sigqueue_alloc() and switch_uid()

     __sigqueue_alloc() can't stop the target task from changing its creds
     beneath it, so this function gets a reference to the currently applicable
     user_struct which it then passes into the sigqueue struct it returns if
     successful.

     switch_uid() is now called from commit_creds(), and possibly should be
     folded into that.  commit_creds() should take care of protecting
     __sigqueue_alloc().

 (7) [sg]et[ug]id() and co and [sg]et_current_groups.

     The set functions now all use prepare_creds(), commit_creds() and
     abort_creds() to build and check a new set of credentials before applying
     it.

     security_task_set[ug]id() is called inside the prepared section.  This
     guarantees that nothing else will affect the creds until we've finished.

     The calling of set_dumpable() has been moved into commit_creds().

     Much of the functionality of set_user() has been moved into
     commit_creds().

     The get functions all simply access the data directly.

 (8) security_task_prctl() and cap_task_prctl().

     security_task_prctl() has been modified to return -ENOSYS if it doesn't
     want to handle a function, or otherwise return the return value directly
     rather than through an argument.

     Additionally, cap_task_prctl() now prepares a new set of credentials, even
     if it doesn't end up using it.

 (9) Keyrings.

     A number of changes have been made to the keyrings code:

     (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
     	 all been dropped and built in to the credentials functions directly.
     	 They may want separating out again later.

     (b) key_alloc() and search_process_keyrings() now take a cred pointer
     	 rather than a task pointer to specify the security context.

     (c) copy_creds() gives a new thread within the same thread group a new
     	 thread keyring if its parent had one, otherwise it discards the thread
     	 keyring.

     (d) The authorisation key now points directly to the credentials to extend
     	 the search into rather than pointing to the task that carries them.

     (e) Installing thread, process or session keyrings causes a new set of
     	 credentials to be created, even though it's not strictly necessary for
     	 process or session keyrings (they're shared).

(10) Usermode helper.

     The usermode helper code now carries a cred struct pointer in its
     subprocess_info struct instead of a new session keyring pointer.  This set
     of credentials is derived from init_cred and installed on the new process
     after it has been cloned.

     call_usermodehelper_setup() allocates the new credentials and
     call_usermodehelper_freeinfo() discards them if they haven't been used.  A
     special cred function (prepare_usermodeinfo_creds()) is provided
     specifically for call_usermodehelper_setup() to call.

     call_usermodehelper_setkeys() adjusts the credentials to sport the
     supplied keyring as the new session keyring.

(11) SELinux.

     SELinux has a number of changes, in addition to those to support the LSM
     interface changes mentioned above:

     (a) selinux_setprocattr() no longer does its check for whether the
     	 current ptracer can access processes with the new SID inside the lock
     	 that covers getting the ptracer's SID.  Whilst this lock ensures that
     	 the check is done with the ptracer pinned, the result is only valid
     	 until the lock is released, so there's no point doing it inside the
     	 lock.

(12) is_single_threaded().

     This function has been extracted from selinux_setprocattr() and put into
     a file of its own in the lib/ directory as join_session_keyring() now
     wants to use it too.

     The code in SELinux just checked to see whether a task shared mm_structs
     with other tasks (CLONE_VM), but that isn't good enough.  We really want
     to know if they're part of the same thread group (CLONE_THREAD).

(13) nfsd.

     The NFS server daemon now has to use the COW credentials to set the
     credentials it is going to use.  It really needs to pass the credentials
     down to the functions it calls, but it can't do that until other patches
     in this series have been applied.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/exec.c                        |  31 ++-
 fs/nfsd/auth.c                   |  92 ++++----
 fs/nfsd/nfs4recover.c            |  68 +++---
 fs/nfsd/nfsfh.c                  |  11 +-
 fs/open.c                        |  31 ++-
 include/linux/audit.h            |  22 +-
 include/linux/capability.h       |   2 -
 include/linux/cred.h             |  44 +++-
 include/linux/init_task.h        |   2 +
 include/linux/key.h              |  22 +-
 include/linux/sched.h            |   6 +-
 include/linux/security.h         | 178 +++++++---------
 init/main.c                      |   1 +
 kernel/auditsc.c                 |  42 ++--
 kernel/capability.c              |  78 +++----
 kernel/cred-internals.h          |  21 ++
 kernel/cred.c                    | 321 ++++++++++++++++++++++++----
 kernel/exit.c                    |   9 +-
 kernel/fork.c                    |   7 +-
 kernel/kmod.c                    |  30 ++-
 kernel/ptrace.c                  |   9 +
 kernel/signal.c                  |  10 +-
 kernel/sys.c                     | 450 +++++++++++++++++++++------------------
 kernel/user.c                    |  37 +---
 kernel/user_namespace.c          |  12 +-
 lib/Makefile                     |   2 +-
 net/rxrpc/ar-key.c               |   6 +-
 security/capability.c            |  21 +-
 security/commoncap.c             | 265 +++++++++++------------
 security/keys/internal.h         |  17 +-
 security/keys/key.c              |  25 +--
 security/keys/keyctl.c           |  95 ++++++---
 security/keys/keyring.c          |  14 +-
 security/keys/permission.c       |  24 ++-
 security/keys/proc.c             |   8 +-
 security/keys/process_keys.c     | 333 ++++++++++++++---------------
 security/keys/request_key.c      |  29 ++-
 security/keys/request_key_auth.c |  41 ++--
 security/security.c              |  58 +++--
 security/selinux/hooks.c         | 286 ++++++++++++-------------
 security/smack/smack_lsm.c       |  82 ++++---
 41 files changed, 1603 insertions(+), 1239 deletions(-)
 create mode 100644 kernel/cred-internals.h

(limited to 'lib')

diff --git a/fs/exec.c b/fs/exec.c
index a5330e1a2216..9bd3559ddece 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1007,13 +1007,12 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 */
 	current->mm->task_size = TASK_SIZE;
 
-	if (bprm->e_uid != current_euid() || bprm->e_gid != current_egid()) {
-		suid_keys(current);
+	if (bprm->e_uid != current_euid() ||
+	    bprm->e_gid != current_egid()) {
 		set_dumpable(current->mm, suid_dumpable);
 		current->pdeath_signal = 0;
 	} else if (file_permission(bprm->file, MAY_READ) ||
 			(bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
-		suid_keys(current);
 		set_dumpable(current->mm, suid_dumpable);
 	}
 
@@ -1096,10 +1095,8 @@ void compute_creds(struct linux_binprm *bprm)
 {
 	int unsafe;
 
-	if (bprm->e_uid != current_uid()) {
-		suid_keys(current);
+	if (bprm->e_uid != current_uid())
 		current->pdeath_signal = 0;
-	}
 	exec_keys(current);
 
 	task_lock(current);
@@ -1709,8 +1706,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	struct linux_binfmt * binfmt;
 	struct inode * inode;
 	struct file * file;
+	const struct cred *old_cred;
+	struct cred *cred;
 	int retval = 0;
-	int fsuid = current_fsuid();
 	int flag = 0;
 	int ispipe = 0;
 	unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
@@ -1723,12 +1721,20 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	binfmt = current->binfmt;
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
+
+	cred = prepare_creds();
+	if (!cred) {
+		retval = -ENOMEM;
+		goto fail;
+	}
+
 	down_write(&mm->mmap_sem);
 	/*
 	 * If another thread got here first, or we are not dumpable, bail out.
 	 */
 	if (mm->core_state || !get_dumpable(mm)) {
 		up_write(&mm->mmap_sem);
+		put_cred(cred);
 		goto fail;
 	}
 
@@ -1739,12 +1745,16 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	 */
 	if (get_dumpable(mm) == 2) {	/* Setuid core dump mode */
 		flag = O_EXCL;		/* Stop rewrite attacks */
-		current->cred->fsuid = 0;	/* Dump root private */
+		cred->fsuid = 0;	/* Dump root private */
 	}
 
 	retval = coredump_wait(exit_code, &core_state);
-	if (retval < 0)
+	if (retval < 0) {
+		put_cred(cred);
 		goto fail;
+	}
+
+	old_cred = override_creds(cred);
 
 	/*
 	 * Clear any false indication of pending signals that might
@@ -1835,7 +1845,8 @@ fail_unlock:
 	if (helper_argv)
 		argv_free(helper_argv);
 
-	current->cred->fsuid = fsuid;
+	revert_creds(old_cred);
+	put_cred(cred);
 	coredump_finish(mm);
 fail:
 	return retval;
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 808fc03a6fbd..836ffa1047d9 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -27,55 +27,82 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
 
+/*
+ * Install the credentials used to service @rqstp on export @exp: start from a
+ * copy of the current credentials, take fsuid/fsgid and the supplementary
+ * groups from the request (subject to the export's squash flags), adjust the
+ * effective capability set and return the result of commit_creds().  On
+ * failure a negative error code is returned and any prepared creds discarded.
+ */
 int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 {
-	struct cred *act_as = current->cred ;
-	struct svc_cred	cred = rqstp->rq_cred;
+	struct group_info *rqgi;
+	struct group_info *gi;
+	struct cred *new;
 	int i;
 	int flags = nfsexp_flags(rqstp, exp);
 	int ret;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	new->fsuid = rqstp->rq_cred.cr_uid;
+	new->fsgid = rqstp->rq_cred.cr_gid;
+
+	rqgi = rqstp->rq_cred.cr_group_info;
+
 	if (flags & NFSEXP_ALLSQUASH) {
-		cred.cr_uid = exp->ex_anon_uid;
-		cred.cr_gid = exp->ex_anon_gid;
-		cred.cr_group_info = groups_alloc(0);
+		new->fsuid = exp->ex_anon_uid;
+		new->fsgid = exp->ex_anon_gid;
+		gi = groups_alloc(0);
+		/* groups_alloc() may fail; never hand NULL to set_groups() */
+		if (!gi)
+			goto oom;
 	} else if (flags & NFSEXP_ROOTSQUASH) {
-		struct group_info *gi;
-		if (!cred.cr_uid)
-			cred.cr_uid = exp->ex_anon_uid;
-		if (!cred.cr_gid)
-			cred.cr_gid = exp->ex_anon_gid;
-		gi = groups_alloc(cred.cr_group_info->ngroups);
-		if (gi)
-			for (i = 0; i < cred.cr_group_info->ngroups; i++) {
-				if (!GROUP_AT(cred.cr_group_info, i))
-					GROUP_AT(gi, i) = exp->ex_anon_gid;
-				else
-					GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i);
-			}
-		cred.cr_group_info = gi;
-	} else
-		get_group_info(cred.cr_group_info);
-
-	if (cred.cr_uid != (uid_t) -1)
-		act_as->fsuid = cred.cr_uid;
-	else
-		act_as->fsuid = exp->ex_anon_uid;
-	if (cred.cr_gid != (gid_t) -1)
-		act_as->fsgid = cred.cr_gid;
-	else
-		act_as->fsgid = exp->ex_anon_gid;
+		if (!new->fsuid)
+			new->fsuid = exp->ex_anon_uid;
+		if (!new->fsgid)
+			new->fsgid = exp->ex_anon_gid;
 
-	if (!cred.cr_group_info)
-		return -ENOMEM;
-	ret = set_groups(act_as, cred.cr_group_info);
-	put_group_info(cred.cr_group_info);
-	if ((cred.cr_uid)) {
-		act_as->cap_effective =
-			cap_drop_nfsd_set(act_as->cap_effective);
+		gi = groups_alloc(rqgi->ngroups);
+		if (!gi)
+			goto oom;
+
+		for (i = 0; i < rqgi->ngroups; i++) {
+			if (!GROUP_AT(rqgi, i))
+				GROUP_AT(gi, i) = exp->ex_anon_gid;
+			else
+				GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
+		}
 	} else {
-		act_as->cap_effective =
-			cap_raise_nfsd_set(act_as->cap_effective,
-					   act_as->cap_permitted);
+		gi = get_group_info(rqgi);
 	}
+
+	if (new->fsuid == (uid_t) -1)
+		new->fsuid = exp->ex_anon_uid;
+	if (new->fsgid == (gid_t) -1)
+		new->fsgid = exp->ex_anon_gid;
+
+	ret = set_groups(new, gi);
+	put_group_info(gi);
+	/* set_groups() returns 0 on success and a negative errno on failure */
+	if (ret < 0)
+		goto error;
+
+	/*
+	 * Only the fs identity was changed above (new->uid is untouched), so
+	 * fsuid decides whether the nfsd capability set is dropped or raised.
+	 */
+	if (new->fsuid)
+		new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
+	else
+		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
+							new->cap_permitted);
+	return commit_creds(new);
+
+oom:
+	ret = -ENOMEM;
+error:
+	abort_creds(new);
 	return ret;
 }
 
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 632a50b4b371..9371ea12d7fa 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -54,20 +54,26 @@
 static struct path rec_dir;
 static int rec_dir_init = 0;
 
-static void
-nfs4_save_user(uid_t *saveuid, gid_t *savegid)
+static int
+nfs4_save_creds(const struct cred **original_creds)
 {
-	*saveuid = current->cred->fsuid;
-	*savegid = current->cred->fsgid;
-	current->cred->fsuid = 0;
-	current->cred->fsgid = 0;
+	struct cred *new;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	new->fsuid = 0;
+	new->fsgid = 0;
+	*original_creds = override_creds(new);
+	put_cred(new);
+	return 0;
 }
 
 static void
-nfs4_reset_user(uid_t saveuid, gid_t savegid)
+nfs4_reset_creds(const struct cred *original)
 {
-	current->cred->fsuid = saveuid;
-	current->cred->fsgid = savegid;
+	revert_creds(original);
 }
 
 static void
@@ -129,10 +135,9 @@ nfsd4_sync_rec_dir(void)
 int
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
+	const struct cred *original_cred;
 	char *dname = clp->cl_recdir;
 	struct dentry *dentry;
-	uid_t uid;
-	gid_t gid;
 	int status;
 
 	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
@@ -140,7 +145,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	if (!rec_dir_init || clp->cl_firststate)
 		return 0;
 
-	nfs4_save_user(&uid, &gid);
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		return status;
 
 	/* lock the parent */
 	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
@@ -168,7 +175,7 @@ out_unlock:
 		clp->cl_firststate = 1;
 		nfsd4_sync_rec_dir();
 	}
-	nfs4_reset_user(uid, gid);
+	nfs4_reset_creds(original_cred);
 	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
 	return status;
 }
@@ -211,20 +218,21 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
 static int
 nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 {
+	const struct cred *original_cred;
 	struct file *filp;
 	struct dentry_list_arg dla = {
 		.parent = dir,
 	};
 	struct list_head *dentries = &dla.dentries;
 	struct dentry_list *child;
-	uid_t uid;
-	gid_t gid;
 	int status;
 
 	if (!rec_dir_init)
 		return 0;
 
-	nfs4_save_user(&uid, &gid);
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		return status;
 
 	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
 			   current_cred());
@@ -250,7 +258,7 @@ out:
 		dput(child->dentry);
 		kfree(child);
 	}
-	nfs4_reset_user(uid, gid);
+	nfs4_reset_creds(original_cred);
 	return status;
 }
 
@@ -312,8 +320,7 @@ out:
 void
 nfsd4_remove_clid_dir(struct nfs4_client *clp)
 {
-	uid_t uid;
-	gid_t gid;
+	const struct cred *original_cred;
 	int status;
 
 	if (!rec_dir_init || !clp->cl_firststate)
@@ -323,9 +330,13 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
-	nfs4_save_user(&uid, &gid);
+
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		goto out;
+
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
-	nfs4_reset_user(uid, gid);
+	nfs4_reset_creds(original_cred);
 	if (status == 0)
 		nfsd4_sync_rec_dir();
 	mnt_drop_write(rec_dir.mnt);
@@ -402,16 +413,21 @@ nfsd4_recdir_load(void) {
 void
 nfsd4_init_recdir(char *rec_dirname)
 {
-	uid_t			uid = 0;
-	gid_t			gid = 0;
-	int 			status;
+	const struct cred *original_cred;
+	int status;
 
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
 			rec_dirname);
 
 	BUG_ON(rec_dir_init);
 
-	nfs4_save_user(&uid, &gid);
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0) {
+		printk("NFSD: Unable to change credentials to find recovery"
+		       " directory: error %d\n",
+		       status);
+		return;
+	}
 
 	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
 			&rec_dir);
@@ -421,7 +437,7 @@ nfsd4_init_recdir(char *rec_dirname)
 
 	if (!status)
 		rec_dir_init = 1;
-	nfs4_reset_user(uid, gid);
+	nfs4_reset_creds(original_cred);
 }
 
 void
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e67cfaea0865..f0da7d9c3a92 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -186,9 +186,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		 * access control settings being in effect, we cannot
 		 * fix that case easily.
 		 */
-		current->cred->cap_effective =
-			cap_raise_nfsd_set(current->cred->cap_effective,
-					   current->cred->cap_permitted);
+		struct cred *new = prepare_creds();
+		if (!new)
+			return nfserrno(-ENOMEM);
+		new->cap_effective =
+			cap_raise_nfsd_set(new->cap_effective,
+					   new->cap_permitted);
+		put_cred(override_creds(new));
+		put_cred(new);
 	} else {
 		error = nfsd_setuser_and_check_port(rqstp, exp);
 		if (error)
diff --git a/fs/open.c b/fs/open.c
index f96eaab280a3..c0a426d5766c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -425,30 +425,33 @@ out:
  */
 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 {
-	struct cred *cred = current->cred;
+	const struct cred *old_cred;
+	struct cred *override_cred;
 	struct path path;
 	struct inode *inode;
-	int old_fsuid, old_fsgid;
-	kernel_cap_t uninitialized_var(old_cap);  /* !SECURE_NO_SETUID_FIXUP */
 	int res;
 
 	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
 		return -EINVAL;
 
-	old_fsuid = cred->fsuid;
-	old_fsgid = cred->fsgid;
+	override_cred = prepare_creds();
+	if (!override_cred)
+		return -ENOMEM;
 
-	cred->fsuid = cred->uid;
-	cred->fsgid = cred->gid;
+	override_cred->fsuid = override_cred->uid;
+	override_cred->fsgid = override_cred->gid;
 
 	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 		/* Clear the capabilities if we switch to a non-root user */
-		if (current->cred->uid)
-			old_cap = cap_set_effective(__cap_empty_set);
+		if (override_cred->uid)
+			cap_clear(override_cred->cap_effective);
 		else
-			old_cap = cap_set_effective(cred->cap_permitted);
+			override_cred->cap_effective =
+				override_cred->cap_permitted;
 	}
 
+	old_cred = override_creds(override_cred);
+
 	res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
 	if (res)
 		goto out;
@@ -485,12 +488,8 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 out_path_release:
 	path_put(&path);
 out:
-	cred->fsuid = old_fsuid;
-	cred->fsgid = old_fsgid;
-
-	if (!issecure(SECURE_NO_SETUID_FIXUP))
-		cap_set_effective(old_cap);
-
+	revert_creds(old_cred);
+	put_cred(override_cred);
 	return res;
 }
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6fbebac7b1bf..0b2fcb698a63 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -454,8 +454,10 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr
 extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout);
 extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification);
 extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
-extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE);
-extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm);
+extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
+				  const struct cred *new,
+				  const struct cred *old);
+extern int __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old);
 
 static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -522,16 +524,20 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
  *
  * -Eric
  */
-static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE)
+static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
+				       const struct cred *new,
+				       const struct cred *old)
 {
 	if (unlikely(!audit_dummy_context()))
-		__audit_log_bprm_fcaps(bprm, pP, pE);
+		return __audit_log_bprm_fcaps(bprm, new, old);
+	return 0;
 }
 
-static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm)
+static inline int audit_log_capset(pid_t pid, const struct cred *new,
+				   const struct cred *old)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_log_capset(pid, eff, inh, perm);
+		return __audit_log_capset(pid, new, old);
 	return 0;
 }
 
@@ -566,8 +572,8 @@ extern int audit_signals;
 #define audit_mq_timedreceive(d,l,p,t) ({ 0; })
 #define audit_mq_notify(d,n) ({ 0; })
 #define audit_mq_getsetattr(d,s) ({ 0; })
-#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0)
-#define audit_log_capset(pid, e, i, p) ({ 0; })
+#define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; })
+#define audit_log_capset(pid, ncr, ocr) ({ 0; })
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7f26580a5a4d..e22f48c2a46f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -519,8 +519,6 @@ extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_full_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
-kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
-
 /**
  * has_capability - Determine if a task has a superior capability available
  * @t: The task in question
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 62b9e532422d..eaf6fa695a04 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,6 +84,8 @@ struct thread_group_cred {
 	struct key	*process_keyring;	/* keyring private to this process */
 	struct rcu_head	rcu;			/* RCU deletion hook */
 };
+
+extern void release_tgcred(struct cred *cred);
 #endif
 
 /*
@@ -137,11 +139,30 @@ struct cred {
 	struct user_struct *user;	/* real user ID subscription */
 	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
 	struct rcu_head	rcu;		/* RCU deletion hook */
-	spinlock_t	lock;		/* lock for pointer changes */
 };
 
 extern void __put_cred(struct cred *);
 extern int copy_creds(struct task_struct *, unsigned long);
+extern struct cred *prepare_creds(void);
+extern struct cred *prepare_usermodehelper_creds(void);
+extern int commit_creds(struct cred *);
+extern void abort_creds(struct cred *);
+extern const struct cred *override_creds(const struct cred *) __deprecated;
+extern void revert_creds(const struct cred *) __deprecated;
+extern void __init cred_init(void);
+
+/**
+ * get_new_cred - Get a reference on a new set of credentials
+ * @cred: The new credentials to reference
+ *
+ * Get a reference on the specified set of new credentials.  The caller must
+ * release the reference.
+ */
+static inline struct cred *get_new_cred(struct cred *cred)
+{
+	atomic_inc(&cred->usage);
+	return cred;
+}
 
 /**
  * get_cred - Get a reference on a set of credentials
@@ -150,10 +171,9 @@ extern int copy_creds(struct task_struct *, unsigned long);
  * Get a reference on the specified set of credentials.  The caller must
  * release the reference.
  */
-static inline struct cred *get_cred(struct cred *cred)
+static inline const struct cred *get_cred(const struct cred *cred)
 {
-	atomic_inc(&cred->usage);
-	return cred;
+	return get_new_cred((struct cred *) cred);
 }
 
 /**
@@ -166,6 +186,8 @@ static inline struct cred *get_cred(struct cred *cred)
 static inline void put_cred(const struct cred *_cred)
 {
 	struct cred *cred = (struct cred *) _cred;
+
+	BUG_ON(atomic_read(&(cred)->usage) <= 0);
 	if (atomic_dec_and_test(&(cred)->usage))
 		__put_cred(cred);
 }
@@ -250,13 +272,13 @@ static inline void put_cred(const struct cred *_cred)
 	__groups;					\
 })
 
-#define task_cred_xxx(task, xxx)		\
-({						\
-	__typeof__(task->cred->xxx) ___val;	\
-	rcu_read_lock();			\
-	___val = __task_cred((task))->xxx;	\
-	rcu_read_unlock();			\
-	___val;					\
+#define task_cred_xxx(task, xxx)			\
+({							\
+	__typeof__(((struct cred *)NULL)->xxx) ___val;	\
+	rcu_read_lock();				\
+	___val = __task_cred((task))->xxx;		\
+	rcu_read_unlock();				\
+	___val;						\
 })
 
 #define task_uid(task)		(task_cred_xxx((task), uid))
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5e24c54b6dfd..08c3b24ad9a8 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -150,6 +150,8 @@ extern struct cred init_cred;
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
 	.group_leader	= &tsk,						\
 	.cred		= &init_cred,					\
+	.cred_exec_mutex =						\
+		 __MUTEX_INITIALIZER(tsk.cred_exec_mutex),		\
 	.comm		= "swapper",					\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
diff --git a/include/linux/key.h b/include/linux/key.h
index 0836cc838b0c..69ecf0934b02 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -73,6 +73,7 @@ struct key;
 struct seq_file;
 struct user_struct;
 struct signal_struct;
+struct cred;
 
 struct key_type;
 struct key_owner;
@@ -181,7 +182,7 @@ struct key {
 extern struct key *key_alloc(struct key_type *type,
 			     const char *desc,
 			     uid_t uid, gid_t gid,
-			     struct task_struct *ctx,
+			     const struct cred *cred,
 			     key_perm_t perm,
 			     unsigned long flags);
 
@@ -249,7 +250,7 @@ extern int key_unlink(struct key *keyring,
 		      struct key *key);
 
 extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
-				 struct task_struct *ctx,
+				 const struct cred *cred,
 				 unsigned long flags,
 				 struct key *dest);
 
@@ -276,22 +277,12 @@ extern ctl_table key_sysctls[];
 /*
  * the userspace interface
  */
-extern void switch_uid_keyring(struct user_struct *new_user);
-extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk);
-extern void exit_keys(struct task_struct *tsk);
-extern int suid_keys(struct task_struct *tsk);
+extern int install_thread_keyring_to_cred(struct cred *cred);
 extern int exec_keys(struct task_struct *tsk);
 extern void key_fsuid_changed(struct task_struct *tsk);
 extern void key_fsgid_changed(struct task_struct *tsk);
 extern void key_init(void);
 
-#define __install_session_keyring(keyring)				\
-({									\
-	struct key *old_session = current->cred->tgcred->session_keyring; \
-	current->cred->tgcred->session_keyring = keyring;		\
-	old_session;							\
-})
-
 #else /* CONFIG_KEYS */
 
 #define key_validate(k)			0
@@ -303,11 +294,6 @@ extern void key_init(void);
 #define make_key_ref(k, p)		NULL
 #define key_ref_to_ptr(k)		NULL
 #define is_key_possessed(k)		0
-#define switch_uid_keyring(u)		do { } while(0)
-#define __install_session_keyring(k)	({ NULL; })
-#define copy_keys(f,t)			0
-#define exit_keys(t)			do { } while(0)
-#define suid_keys(t)			do { } while(0)
 #define exec_keys(t)			do { } while(0)
 #define key_fsuid_changed(t)		do { } while(0)
 #define key_fsgid_changed(t)		do { } while(0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2913252989b3..121d655e460d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1145,7 +1145,8 @@ struct task_struct {
 	struct list_head cpu_timers[3];
 
 /* process credentials */
-	struct cred *cred;	/* actual/objective task credentials */
+	const struct cred *cred;	/* actual/objective task credentials (COW) */
+	struct mutex cred_exec_mutex;	/* execve vs ptrace cred calculation mutex */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
 				     - access with [gs]et_task_comm (which lock
@@ -1720,7 +1721,6 @@ static inline struct user_struct *get_uid(struct user_struct *u)
 	return u;
 }
 extern void free_uid(struct user_struct *);
-extern void switch_uid(struct user_struct *);
 extern void release_uids(struct user_namespace *ns);
 
 #include <asm/current.h>
@@ -1870,6 +1870,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 #define for_each_process(p) \
 	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
 
+extern bool is_single_threaded(struct task_struct *);
+
 /*
  * Careful: do_each_thread/while_each_thread is a double loop so
  *          'break' will not work as expected - use goto instead.
diff --git a/include/linux/security.h b/include/linux/security.h
index 7e9fe046a0d1..68be11251447 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,24 +53,21 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
 extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
-extern int cap_capset_check(const kernel_cap_t *effective,
-			    const kernel_cap_t *inheritable,
-			    const kernel_cap_t *permitted);
-extern void cap_capset_set(const kernel_cap_t *effective,
-			   const kernel_cap_t *inheritable,
-			   const kernel_cap_t *permitted);
+extern int cap_capset(struct cred *new, const struct cred *old,
+		      const kernel_cap_t *effective,
+		      const kernel_cap_t *inheritable,
+		      const kernel_cap_t *permitted);
 extern int cap_bprm_set_security(struct linux_binprm *bprm);
-extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
+extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
 extern int cap_bprm_secureexec(struct linux_binprm *bprm);
 extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
 			      const void *value, size_t size, int flags);
 extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
 extern int cap_inode_need_killpriv(struct dentry *dentry);
 extern int cap_inode_killpriv(struct dentry *dentry);
-extern int cap_task_post_setuid(uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags);
-extern void cap_task_reparent_to_init(struct task_struct *p);
+extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
 extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-			  unsigned long arg4, unsigned long arg5, long *rc_p);
+			  unsigned long arg4, unsigned long arg5);
 extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp);
 extern int cap_task_setioprio(struct task_struct *p, int ioprio);
 extern int cap_task_setnice(struct task_struct *p, int nice);
@@ -170,8 +167,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Compute and set the security attributes of a process being transformed
  *	by an execve operation based on the old attributes (current->security)
  *	and the information saved in @bprm->security by the set_security hook.
- *	Since this hook function (and its caller) are void, this hook can not
- *	return an error.  However, it can leave the security attributes of the
+ *	This hook may return an error, in which case the process will be
+ *	killed.  However, it can leave the security attributes of the
  *	process unchanged if an access failure occurs at this point.
  *	bprm_apply_creds is called under task_lock.  @unsafe indicates various
  *	reasons why it may be unsafe to change security state.
@@ -593,15 +590,18 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	manual page for definitions of the @clone_flags.
  *	@clone_flags contains the flags indicating what should be shared.
  *	Return 0 if permission is granted.
- * @cred_alloc_security:
- *	@cred contains the cred struct for child process.
- *	Allocate and attach a security structure to the cred->security field.
- *	The security field is initialized to NULL when the task structure is
- *	allocated.
- *	Return 0 if operation was successful.
  * @cred_free:
  *	@cred points to the credentials.
  *	Deallocate and clear the cred->security field in a set of credentials.
+ * @cred_prepare:
+ *	@new points to the new credentials.
+ *	@old points to the original credentials.
+ *	@gfp indicates the atomicity of any memory allocations.
+ *	Prepare a new set of credentials by copying the data from the old set.
+ * @cred_commit:
+ *	@new points to the new credentials.
+ *	@old points to the original credentials.
+ *	Install a new set of credentials.
  * @task_setuid:
  *	Check permission before setting one or more of the user identity
  *	attributes of the current process.  The @flags parameter indicates
@@ -614,15 +614,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@id2 contains a uid.
  *	@flags contains one of the LSM_SETID_* values.
  *	Return 0 if permission is granted.
- * @task_post_setuid:
+ * @task_fix_setuid:
  *	Update the module's state after setting one or more of the user
  *	identity attributes of the current process.  The @flags parameter
  *	indicates which of the set*uid system calls invoked this hook.  If
- *	@flags is LSM_SETID_FS, then @old_ruid is the old fs uid and the other
- *	parameters are not used.
- *	@old_ruid contains the old real uid (or fs uid if LSM_SETID_FS).
- *	@old_euid contains the old effective uid (or -1 if LSM_SETID_FS).
- *	@old_suid contains the old saved uid (or -1 if LSM_SETID_FS).
+ *	@new is the set of credentials that will be installed.  Modifications
+ *	should be made to this rather than to @current->cred.
+ *	@old is the set of credentials that are being replaced.
  *	@flags contains one of the LSM_SETID_* values.
  *	Return 0 on success.
  * @task_setgid:
@@ -725,13 +723,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@arg3 contains a argument.
  *	@arg4 contains a argument.
  *	@arg5 contains a argument.
- *      @rc_p contains a pointer to communicate back the forced return code
- *	Return 0 if permission is granted, and non-zero if the security module
- *      has taken responsibility (setting *rc_p) for the prctl call.
- * @task_reparent_to_init:
- *	Set the security attributes in @p->security for a kernel thread that
- *	is being reparented to the init task.
- *	@p contains the task_struct for the kernel thread.
+ *	Return -ENOSYS if no-one wanted to handle this op, any other value to
+ *	cause prctl() to return immediately with that value.
  * @task_to_inode:
  *	Set the security attributes for an inode based on an associated task's
  *	security attributes, e.g. for /proc/pid inodes.
@@ -1008,7 +1001,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	See whether a specific operational right is granted to a process on a
  *	key.
  *	@key_ref refers to the key (key pointer + possession attribute bit).
- *	@context points to the process to provide the context against which to
+ *	@cred points to the credentials to provide the context against which to
  *	evaluate the security data on the key.
  *	@perm describes the combination of permissions required of this key.
  *	Return 1 if permission granted, 0 if permission denied and -ve it the
@@ -1170,6 +1163,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@child process.
  *	Security modules may also want to perform a process tracing check
  *	during an execve in the set_security or apply_creds hooks of
+ *	(or, in trees that provide it, the bprm_set_creds hook of)
  *	binprm_security_ops if the process is being traced and its security
  *	attributes would be changed by the execve.
  *	@child contains the task_struct structure for the target process.
@@ -1193,19 +1187,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@inheritable contains the inheritable capability set.
  *	@permitted contains the permitted capability set.
  *	Return 0 if the capability sets were successfully obtained.
- * @capset_check:
- *	Check permission before setting the @effective, @inheritable, and
- *	@permitted capability sets for the current process.
- *	@effective contains the effective capability set.
- *	@inheritable contains the inheritable capability set.
- *	@permitted contains the permitted capability set.
- *	Return 0 if permission is granted.
- * @capset_set:
+ * @capset:
  *	Set the @effective, @inheritable, and @permitted capability sets for
  *	the current process.
+ *	@new contains the new credentials structure for target process.
+ *	@old contains the current credentials structure for target process.
  *	@effective contains the effective capability set.
  *	@inheritable contains the inheritable capability set.
  *	@permitted contains the permitted capability set.
+ *	Return 0 and update @new if permission is granted.
  * @capable:
  *	Check whether the @tsk process has the @cap capability.
  *	@tsk contains the task_struct for the process.
@@ -1297,12 +1287,11 @@ struct security_operations {
 	int (*capget) (struct task_struct *target,
 		       kernel_cap_t *effective,
 		       kernel_cap_t *inheritable, kernel_cap_t *permitted);
-	int (*capset_check) (const kernel_cap_t *effective,
-			     const kernel_cap_t *inheritable,
-			     const kernel_cap_t *permitted);
-	void (*capset_set) (const kernel_cap_t *effective,
-			    const kernel_cap_t *inheritable,
-			    const kernel_cap_t *permitted);
+	int (*capset) (struct cred *new,
+		       const struct cred *old,
+		       const kernel_cap_t *effective,
+		       const kernel_cap_t *inheritable,
+		       const kernel_cap_t *permitted);
 	int (*capable) (struct task_struct *tsk, int cap, int audit);
 	int (*acct) (struct file *file);
 	int (*sysctl) (struct ctl_table *table, int op);
@@ -1314,7 +1303,7 @@ struct security_operations {
 
 	int (*bprm_alloc_security) (struct linux_binprm *bprm);
 	void (*bprm_free_security) (struct linux_binprm *bprm);
-	void (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe);
+	int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe);
 	void (*bprm_post_apply_creds) (struct linux_binprm *bprm);
 	int (*bprm_set_security) (struct linux_binprm *bprm);
 	int (*bprm_check_security) (struct linux_binprm *bprm);
@@ -1405,11 +1394,13 @@ struct security_operations {
 	int (*dentry_open) (struct file *file, const struct cred *cred);
 
 	int (*task_create) (unsigned long clone_flags);
-	int (*cred_alloc_security) (struct cred *cred);
 	void (*cred_free) (struct cred *cred);
+	int (*cred_prepare)(struct cred *new, const struct cred *old,
+			    gfp_t gfp);
+	void (*cred_commit)(struct cred *new, const struct cred *old);
 	int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags);
-	int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ ,
-				 uid_t old_euid, uid_t old_suid, int flags);
+	int (*task_fix_setuid) (struct cred *new, const struct cred *old,
+				int flags);
 	int (*task_setgid) (gid_t id0, gid_t id1, gid_t id2, int flags);
 	int (*task_setpgid) (struct task_struct *p, pid_t pgid);
 	int (*task_getpgid) (struct task_struct *p);
@@ -1429,8 +1420,7 @@ struct security_operations {
 	int (*task_wait) (struct task_struct *p);
 	int (*task_prctl) (int option, unsigned long arg2,
 			   unsigned long arg3, unsigned long arg4,
-			   unsigned long arg5, long *rc_p);
-	void (*task_reparent_to_init) (struct task_struct *p);
+			   unsigned long arg5);
 	void (*task_to_inode) (struct task_struct *p, struct inode *inode);
 
 	int (*ipc_permission) (struct kern_ipc_perm *ipcp, short flag);
@@ -1535,10 +1525,10 @@ struct security_operations {
 
 	/* key management security hooks */
 #ifdef CONFIG_KEYS
-	int (*key_alloc) (struct key *key, struct task_struct *tsk, unsigned long flags);
+	int (*key_alloc) (struct key *key, const struct cred *cred, unsigned long flags);
 	void (*key_free) (struct key *key);
 	int (*key_permission) (key_ref_t key_ref,
-			       struct task_struct *context,
+			       const struct cred *cred,
 			       key_perm_t perm);
 	int (*key_getsecurity)(struct key *key, char **_buffer);
 #endif	/* CONFIG_KEYS */
@@ -1564,12 +1554,10 @@ int security_capget(struct task_struct *target,
 		    kernel_cap_t *effective,
 		    kernel_cap_t *inheritable,
 		    kernel_cap_t *permitted);
-int security_capset_check(const kernel_cap_t *effective,
-			  const kernel_cap_t *inheritable,
-			  const kernel_cap_t *permitted);
-void security_capset_set(const kernel_cap_t *effective,
-			 const kernel_cap_t *inheritable,
-			 const kernel_cap_t *permitted);
+int security_capset(struct cred *new, const struct cred *old,
+		    const kernel_cap_t *effective,
+		    const kernel_cap_t *inheritable,
+		    const kernel_cap_t *permitted);
 int security_capable(struct task_struct *tsk, int cap);
 int security_capable_noaudit(struct task_struct *tsk, int cap);
 int security_acct(struct file *file);
@@ -1583,7 +1571,7 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_vm_enough_memory_kern(long pages);
 int security_bprm_alloc(struct linux_binprm *bprm);
 void security_bprm_free(struct linux_binprm *bprm);
-void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
+int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
 void security_bprm_post_apply_creds(struct linux_binprm *bprm);
 int security_bprm_set(struct linux_binprm *bprm);
 int security_bprm_check(struct linux_binprm *bprm);
@@ -1660,11 +1648,12 @@ int security_file_send_sigiotask(struct task_struct *tsk,
 int security_file_receive(struct file *file);
 int security_dentry_open(struct file *file, const struct cred *cred);
 int security_task_create(unsigned long clone_flags);
-int security_cred_alloc(struct cred *cred);
 void security_cred_free(struct cred *cred);
+int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp);
+void security_commit_creds(struct cred *new, const struct cred *old);
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags);
-int security_task_post_setuid(uid_t old_ruid, uid_t old_euid,
-			      uid_t old_suid, int flags);
+int security_task_fix_setuid(struct cred *new, const struct cred *old,
+			     int flags);
 int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags);
 int security_task_setpgid(struct task_struct *p, pid_t pgid);
 int security_task_getpgid(struct task_struct *p);
@@ -1683,8 +1672,7 @@ int security_task_kill(struct task_struct *p, struct siginfo *info,
 			int sig, u32 secid);
 int security_task_wait(struct task_struct *p);
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-			 unsigned long arg4, unsigned long arg5, long *rc_p);
-void security_task_reparent_to_init(struct task_struct *p);
+			unsigned long arg4, unsigned long arg5);
 void security_task_to_inode(struct task_struct *p, struct inode *inode);
 int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag);
 void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid);
@@ -1759,18 +1747,13 @@ static inline int security_capget(struct task_struct *target,
 	return cap_capget(target, effective, inheritable, permitted);
 }
 
-static inline int security_capset_check(const kernel_cap_t *effective,
-					const kernel_cap_t *inheritable,
-					const kernel_cap_t *permitted)
+static inline int security_capset(struct cred *new,
+				   const struct cred *old,
+				   const kernel_cap_t *effective,
+				   const kernel_cap_t *inheritable,
+				   const kernel_cap_t *permitted)
 {
-	return cap_capset_check(effective, inheritable, permitted);
-}
-
-static inline void security_capset_set(const kernel_cap_t *effective,
-				       const kernel_cap_t *inheritable,
-				       const kernel_cap_t *permitted)
-{
-	cap_capset_set(effective, inheritable, permitted);
+	return cap_capset(new, old, effective, inheritable, permitted);
 }
 
 static inline int security_capable(struct task_struct *tsk, int cap)
@@ -1837,9 +1820,9 @@ static inline int security_bprm_alloc(struct linux_binprm *bprm)
 static inline void security_bprm_free(struct linux_binprm *bprm)
 { }
 
-static inline void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
+static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 {
-	cap_bprm_apply_creds(bprm, unsafe);
+	return cap_bprm_apply_creds(bprm, unsafe);
 }
 
 static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm)
@@ -2182,13 +2165,20 @@ static inline int security_task_create(unsigned long clone_flags)
 	return 0;
 }
 
-static inline int security_cred_alloc(struct cred *cred)
+static inline void security_cred_free(struct cred *cred)
+{ }
+
+static inline int security_prepare_creds(struct cred *new,
+					 const struct cred *old,
+					 gfp_t gfp)
 {
 	return 0;
 }
 
-static inline void security_cred_free(struct cred *cred)
-{ }
+static inline void security_commit_creds(struct cred *new,
+					 const struct cred *old)
+{
+}
 
 static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2,
 				       int flags)
@@ -2196,10 +2186,11 @@ static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2,
 	return 0;
 }
 
-static inline int security_task_post_setuid(uid_t old_ruid, uid_t old_euid,
-					    uid_t old_suid, int flags)
+static inline int security_task_fix_setuid(struct cred *new,
+					   const struct cred *old,
+					   int flags)
 {
-	return cap_task_post_setuid(old_ruid, old_euid, old_suid, flags);
+	return cap_task_fix_setuid(new, old, flags);
 }
 
 static inline int security_task_setgid(gid_t id0, gid_t id1, gid_t id2,
@@ -2286,14 +2277,9 @@ static inline int security_task_wait(struct task_struct *p)
 static inline int security_task_prctl(int option, unsigned long arg2,
 				      unsigned long arg3,
 				      unsigned long arg4,
-				      unsigned long arg5, long *rc_p)
-{
-	return cap_task_prctl(option, arg2, arg3, arg3, arg5, rc_p);
-}
-
-static inline void security_task_reparent_to_init(struct task_struct *p)
+				      unsigned long arg5)
 {
-	cap_task_reparent_to_init(p);
+	return cap_task_prctl(option, arg2, arg3, arg4, arg5);
 }
 
 static inline void security_task_to_inode(struct task_struct *p, struct inode *inode)
@@ -2719,16 +2705,16 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi
 #ifdef CONFIG_KEYS
 #ifdef CONFIG_SECURITY
 
-int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags);
+int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags);
 void security_key_free(struct key *key);
 int security_key_permission(key_ref_t key_ref,
-			    struct task_struct *context, key_perm_t perm);
+			    const struct cred *cred, key_perm_t perm);
 int security_key_getsecurity(struct key *key, char **_buffer);
 
 #else
 
 static inline int security_key_alloc(struct key *key,
-				     struct task_struct *tsk,
+				     const struct cred *cred,
 				     unsigned long flags)
 {
 	return 0;
@@ -2739,7 +2725,7 @@ static inline void security_key_free(struct key *key)
 }
 
 static inline int security_key_permission(key_ref_t key_ref,
-					  struct task_struct *context,
+					  const struct cred *cred,
 					  key_perm_t perm)
 {
 	return 0;
diff --git a/init/main.c b/init/main.c
index 7e117a231af1..db843bff5732 100644
--- a/init/main.c
+++ b/init/main.c
@@ -669,6 +669,7 @@ asmlinkage void __init start_kernel(void)
 		efi_enter_virtual_mode();
 #endif
 	thread_info_cache_init();
+	cred_init();
 	fork_init(num_physpages);
 	proc_caches_init();
 	buffer_init();
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ae8ef88ade3f..bc1e2d854bf6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2546,18 +2546,17 @@ int __audit_signal_info(int sig, struct task_struct *t)
 
 /**
  * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
- * @bprm pointer to the bprm being processed
- * @caps the caps read from the disk
+ * @bprm: pointer to the bprm being processed
+ * @new: the proposed new credentials
+ * @old: the old credentials
  *
  * Simply check if the proc already has the caps given by the file and if not
  * store the priv escalation info for later auditing at the end of the syscall
  *
- * this can fail and we don't care.  See the note in audit.h for
- * audit_log_bprm_fcaps() for my explaination....
- *
  * -Eric
  */
-void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE)
+int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
+			   const struct cred *new, const struct cred *old)
 {
 	struct audit_aux_data_bprm_fcaps *ax;
 	struct audit_context *context = current->audit_context;
@@ -2566,7 +2565,7 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 
 	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
-		return;
+		return -ENOMEM;
 
 	ax->d.type = AUDIT_BPRM_FCAPS;
 	ax->d.next = context->aux;
@@ -2581,26 +2580,27 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 	ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
 	ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
 
-	ax->old_pcap.permitted = *pP;
-	ax->old_pcap.inheritable = current->cred->cap_inheritable;
-	ax->old_pcap.effective = *pE;
+	ax->old_pcap.permitted   = old->cap_permitted;
+	ax->old_pcap.inheritable = old->cap_inheritable;
+	ax->old_pcap.effective   = old->cap_effective;
 
-	ax->new_pcap.permitted = current->cred->cap_permitted;
-	ax->new_pcap.inheritable = current->cred->cap_inheritable;
-	ax->new_pcap.effective = current->cred->cap_effective;
+	ax->new_pcap.permitted   = new->cap_permitted;
+	ax->new_pcap.inheritable = new->cap_inheritable;
+	ax->new_pcap.effective   = new->cap_effective;
+	return 0;
 }
 
 /**
  * __audit_log_capset - store information about the arguments to the capset syscall
- * @pid target pid of the capset call
- * @eff effective cap set
- * @inh inheritible cap set
- * @perm permited cap set
+ * @pid: target pid of the capset call
+ * @new: the new credentials
+ * @old: the old (current) credentials
  *
  * Record the aguments userspace sent to sys_capset for later printing by the
  * audit system if applicable
  */
-int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm)
+int __audit_log_capset(pid_t pid,
+		       const struct cred *new, const struct cred *old)
 {
 	struct audit_aux_data_capset *ax;
 	struct audit_context *context = current->audit_context;
@@ -2617,9 +2617,9 @@ int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_c
 	context->aux = (void *)ax;
 
 	ax->pid = pid;
-	ax->cap.effective = *eff;
-	ax->cap.inheritable = *eff;
-	ax->cap.permitted = *perm;
+	ax->cap.effective   = new->cap_effective;
+	ax->cap.inheritable = new->cap_inheritable;
+	ax->cap.permitted   = new->cap_permitted;
 
 	return 0;
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index a404b980b1bd..36b4b4daebec 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,12 +15,7 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-
-/*
- * This lock protects task->cap_* for all tasks including current.
- * Locking rule: acquire this prior to tasklist_lock.
- */
-static DEFINE_SPINLOCK(task_capability_lock);
+#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
@@ -128,12 +123,11 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
 }
 
 /*
- * If we have configured with filesystem capability support, then the
- * only thing that can change the capabilities of the current process
- * is the current process. As such, we can't be in this code at the
- * same time as we are in the process of setting capabilities in this
- * process. The net result is that we can limit our use of locks to
- * when we are reading the caps of another process.
+ * The only thing that can change the capabilities of the current
+ * process is the current process. As such, we can't be in this code
+ * at the same time as we are in the process of setting capabilities
+ * in this process. The net result is that we can limit our use of
+ * locks to when we are reading the caps of another process.
  */
 static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 				     kernel_cap_t *pIp, kernel_cap_t *pPp)
@@ -143,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 	if (pid && (pid != task_pid_vnr(current))) {
 		struct task_struct *target;
 
-		spin_lock(&task_capability_lock);
 		read_lock(&tasklist_lock);
 
 		target = find_task_by_vpid(pid);
@@ -153,34 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 			ret = security_capget(target, pEp, pIp, pPp);
 
 		read_unlock(&tasklist_lock);
-		spin_unlock(&task_capability_lock);
 	} else
 		ret = security_capget(current, pEp, pIp, pPp);
 
 	return ret;
 }
 
-/*
- * Atomically modify the effective capabilities returning the original
- * value. No permission check is performed here - it is assumed that the
- * caller is permitted to set the desired effective capabilities.
- */
-kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
-{
-	kernel_cap_t pE_old;
-
-	spin_lock(&task_capability_lock);
-
-	pE_old = current->cred->cap_effective;
-	current->cred->cap_effective = pE_new;
-
-	spin_unlock(&task_capability_lock);
-
-	return pE_old;
-}
-
-EXPORT_SYMBOL(cap_set_effective);
-
 /**
  * sys_capget - get the capabilities of a given process.
  * @header: pointer to struct that contains capability version and
@@ -208,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 		return -EINVAL;
 
 	ret = cap_get_target_pid(pid, &pE, &pI, &pP);
-
 	if (!ret) {
 		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 		unsigned i;
@@ -270,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
+	struct cred *new;
 	int ret;
 	pid_t pid;
 
@@ -284,8 +255,8 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (pid != 0 && pid != task_pid_vnr(current))
 		return -EPERM;
 
-	if (copy_from_user(&kdata, data, tocopy
-			   * sizeof(struct __user_cap_data_struct)))
+	if (copy_from_user(&kdata, data,
+			   tocopy * sizeof(struct __user_cap_data_struct)))
 		return -EFAULT;
 
 	for (i = 0; i < tocopy; i++) {
@@ -300,24 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		i++;
 	}
 
-	ret = audit_log_capset(pid, &effective, &inheritable, &permitted);
-	if (ret)
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = security_capset(new, current_cred(),
+			      &effective, &inheritable, &permitted);
+	if (ret < 0)
+		goto error;
+
+	ret = audit_log_capset(pid, new, current_cred());
+	if (ret < 0)
-		return ret;
+		goto error;
 
-	/* This lock is required even when filesystem capability support is
-	 * configured - it protects the sys_capget() call from returning
-	 * incorrect data in the case that the targeted process is not the
-	 * current one.
-	 */
-	spin_lock(&task_capability_lock);
-
-	ret = security_capset_check(&effective, &inheritable, &permitted);
-	/* Having verified that the proposed changes are legal, we now put them
-	 * into effect.
-	 */
-	if (!ret)
-		security_capset_set(&effective, &inheritable, &permitted);
-	spin_unlock(&task_capability_lock);
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
 	return ret;
 }
 
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
new file mode 100644
index 000000000000..2dc4fc2d0bf1
--- /dev/null
+++ b/kernel/cred-internals.h
@@ -0,0 +1,21 @@
+/* Internal credentials stuff
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+/*
+ * user.c
+ */
+static inline void sched_switch_user(struct task_struct *p)
+{
+#ifdef CONFIG_USER_SCHED
+	sched_move_task(p);
+#endif	/* CONFIG_USER_SCHED */
+}
+
diff --git a/kernel/cred.c b/kernel/cred.c
index ac73e3617684..cb6b5eda978d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -15,6 +15,10 @@
 #include <linux/keyctl.h>
 #include <linux/init_task.h>
 #include <linux/security.h>
+#include <linux/cn_proc.h>
+#include "cred-internals.h"
+
+static struct kmem_cache *cred_jar;
 
 /*
  * The common credentials for the initial task's thread group
@@ -64,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu)
 /*
  * Release a set of thread group credentials.
  */
-static void release_tgcred(struct cred *cred)
+void release_tgcred(struct cred *cred)
 {
 #ifdef CONFIG_KEYS
 	struct thread_group_cred *tgcred = cred->tgcred;
@@ -81,79 +85,322 @@ static void put_cred_rcu(struct rcu_head *rcu)
 {
 	struct cred *cred = container_of(rcu, struct cred, rcu);
 
-	BUG_ON(atomic_read(&cred->usage) != 0);
+	if (atomic_read(&cred->usage) != 0)
+		panic("CRED: put_cred_rcu() sees %p with usage %d\n",
+		      cred, atomic_read(&cred->usage));
 
+	security_cred_free(cred);
 	key_put(cred->thread_keyring);
 	key_put(cred->request_key_auth);
 	release_tgcred(cred);
 	put_group_info(cred->group_info);
 	free_uid(cred->user);
-	security_cred_free(cred);
-	kfree(cred);
+	kmem_cache_free(cred_jar, cred);
 }
 
 /**
  * __put_cred - Destroy a set of credentials
- * @sec: The record to release
+ * @cred: The record to release
  *
  * Destroy a set of credentials on which no references remain.
  */
 void __put_cred(struct cred *cred)
 {
+	BUG_ON(atomic_read(&cred->usage) != 0);
+
 	call_rcu(&cred->rcu, put_cred_rcu);
 }
 EXPORT_SYMBOL(__put_cred);
 
+/**
+ * prepare_creds - Prepare a new set of credentials for modification
+ *
+ * Prepare a new set of task credentials for modification.  A task's creds
+ * shouldn't generally be modified directly, therefore this function is used to
+ * prepare a new copy, which the caller then modifies and then commits by
+ * calling commit_creds().
+ *
+ * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
+ *
+ * Call commit_creds() or abort_creds() to clean up.
+ */
+struct cred *prepare_creds(void)
+{
+	struct task_struct *task = current;
+	const struct cred *old;
+	struct cred *new;
+
+	BUG_ON(atomic_read(&task->cred->usage) < 1);
+
+	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	old = task->cred;
+	memcpy(new, old, sizeof(struct cred));
+
+	atomic_set(&new->usage, 1);
+	get_group_info(new->group_info);
+	get_uid(new->user);
+
+#ifdef CONFIG_KEYS
+	key_get(new->thread_keyring);
+	key_get(new->request_key_auth);
+	atomic_inc(&new->tgcred->usage);
+#endif
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL;
+#endif
+
+	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+		goto error;
+	return new;
+
+error:
+	abort_creds(new);
+	return NULL;
+}
+EXPORT_SYMBOL(prepare_creds);
+
+/*
+ * prepare new credentials for the usermode helper dispatcher
+ */
+struct cred *prepare_usermodehelper_creds(void)
+{
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred = NULL;
+#endif
+	struct cred *new;
+
+#ifdef CONFIG_KEYS
+	tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
+	if (!tgcred)
+		return NULL;
+#endif
+
+	new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
+	if (!new)
+		return NULL;
+
+	memcpy(new, &init_cred, sizeof(struct cred));
+
+	atomic_set(&new->usage, 1);
+	get_group_info(new->group_info);
+	get_uid(new->user);
+
+#ifdef CONFIG_KEYS
+	new->thread_keyring = NULL;
+	new->request_key_auth = NULL;
+	new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
+
+	atomic_set(&tgcred->usage, 1);
+	spin_lock_init(&tgcred->lock);
+	new->tgcred = tgcred;
+#endif
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL;
+#endif
+	if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
+		goto error;
+
+	BUG_ON(atomic_read(&new->usage) != 1);
+	return new;
+
+error:
+	put_cred(new);
+	return NULL;
+}
+
 /*
  * Copy credentials for the new process created by fork()
+ *
+ * We share if we can, but under some circumstances we have to generate a new
+ * set.
  */
 int copy_creds(struct task_struct *p, unsigned long clone_flags)
 {
-	struct cred *pcred;
-	int ret;
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred;
+#endif
+	struct cred *new;
+
+	mutex_init(&p->cred_exec_mutex);
 
-	pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL);
-	if (!pcred)
+	if (
+#ifdef CONFIG_KEYS
+		!p->cred->thread_keyring &&
+#endif
+		clone_flags & CLONE_THREAD
+	    ) {
+		get_cred(p->cred);
+		atomic_inc(&p->cred->user->processes);
+		return 0;
+	}
+
+	new = prepare_creds();
+	if (!new)
 		return -ENOMEM;
 
 #ifdef CONFIG_KEYS
-	if (clone_flags & CLONE_THREAD) {
-		atomic_inc(&pcred->tgcred->usage);
-	} else {
-		pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL);
-		if (!pcred->tgcred) {
-			kfree(pcred);
+	/* new threads get their own thread keyrings if their parent already
+	 * had one */
+	if (new->thread_keyring) {
+		key_put(new->thread_keyring);
+		new->thread_keyring = NULL;
+		if (clone_flags & CLONE_THREAD)
+			install_thread_keyring_to_cred(new);
+	}
+
+	/* we share the process and session keyrings between all the threads in
+	 * a process - this is slightly icky as we violate COW credentials a
+	 * bit */
+	if (!(clone_flags & CLONE_THREAD)) {
+		tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
+		if (!tgcred) {
+			put_cred(new);
 			return -ENOMEM;
 		}
-		atomic_set(&pcred->tgcred->usage, 1);
-		spin_lock_init(&pcred->tgcred->lock);
-		pcred->tgcred->process_keyring = NULL;
-		pcred->tgcred->session_keyring =
-			key_get(p->cred->tgcred->session_keyring);
+		atomic_set(&tgcred->usage, 1);
+		spin_lock_init(&tgcred->lock);
+		tgcred->process_keyring = NULL;
+		tgcred->session_keyring = key_get(new->tgcred->session_keyring);
+
+		release_tgcred(new);
+		new->tgcred = tgcred;
 	}
 #endif
 
-#ifdef CONFIG_SECURITY
-	pcred->security = NULL;
-#endif
+	atomic_inc(&new->user->processes);
+	p->cred = new;
+	return 0;
+}
 
-	ret = security_cred_alloc(pcred);
-	if (ret < 0) {
-		release_tgcred(pcred);
-		kfree(pcred);
-		return ret;
+/**
+ * commit_creds - Install new credentials upon the current task
+ * @new: The credentials to be assigned
+ *
+ * Install a new set of credentials to the current task, using RCU to replace
+ * the old set.
+ *
+ * This function eats the caller's reference to the new credentials.
+ *
+ * Always returns 0 thus allowing this function to be tail-called at the end
+ * of, say, sys_setgid().
+ */
+int commit_creds(struct cred *new)
+{
+	struct task_struct *task = current;
+	const struct cred *old;
+
+	BUG_ON(atomic_read(&new->usage) < 1);
+	BUG_ON(atomic_read(&task->cred->usage) < 1);
+
+	old = task->cred;
+	security_commit_creds(new, old);
+
+	/* dumpability changes */
+	if (old->euid != new->euid ||
+	    old->egid != new->egid ||
+	    old->fsuid != new->fsuid ||
+	    old->fsgid != new->fsgid ||
+	    !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+		set_dumpable(task->mm, suid_dumpable);
+		task->pdeath_signal = 0;
+		smp_wmb();
 	}
 
-	atomic_set(&pcred->usage, 1);
-	get_group_info(pcred->group_info);
-	get_uid(pcred->user);
-	key_get(pcred->thread_keyring);
-	key_get(pcred->request_key_auth);
+	/* alter the thread keyring */
+	if (new->fsuid != old->fsuid)
+		key_fsuid_changed(task);
+	if (new->fsgid != old->fsgid)
+		key_fsgid_changed(task);
+
+	/* do it
+	 * - What if a process setreuid()'s and this brings the
+	 *   new uid over his NPROC rlimit?  We can check this now
+	 *   cheaply with the new uid cache, so if it matters
+	 *   we should be checking for it.  -DaveM
+	 */
+	if (new->user != old->user)
+		atomic_inc(&new->user->processes);
+	rcu_assign_pointer(task->cred, new);
+	if (new->user != old->user)
+		atomic_dec(&old->user->processes);
+
+	sched_switch_user(task);
+
+	/* send notifications */
+	if (new->uid   != old->uid  ||
+	    new->euid  != old->euid ||
+	    new->suid  != old->suid ||
+	    new->fsuid != old->fsuid)
+		proc_id_connector(task, PROC_EVENT_UID);
 
-	atomic_inc(&pcred->user->processes);
+	if (new->gid   != old->gid  ||
+	    new->egid  != old->egid ||
+	    new->sgid  != old->sgid ||
+	    new->fsgid != old->fsgid)
+		proc_id_connector(task, PROC_EVENT_GID);
 
-	/* RCU assignment is unneeded here as no-one can have accessed this
-	 * pointer yet, barring us */
-	p->cred = pcred;
+	put_cred(old);
 	return 0;
 }
+EXPORT_SYMBOL(commit_creds);
+
+/**
+ * abort_creds - Discard a set of credentials under construction
+ * @new: The credentials that were going to be applied
+ *
+ * Discard a set of credentials that were under construction, dropping the
+ * caller's reference to them.
+ */
+void abort_creds(struct cred *new)
+{
+	BUG_ON(atomic_read(&new->usage) < 1);
+	put_cred(new);
+}
+EXPORT_SYMBOL(abort_creds);
+
+/**
+ * override_creds - Temporarily override the current process's credentials
+ * @new: The credentials to be assigned
+ *
+ * Install a set of temporary override credentials on the current process,
+ * returning the old set for later reversion.
+ */
+const struct cred *override_creds(const struct cred *new)
+{
+	const struct cred *old = current->cred;
+
+	rcu_assign_pointer(current->cred, get_cred(new));
+	return old;
+}
+EXPORT_SYMBOL(override_creds);
+
+/**
+ * revert_creds - Revert a temporary credentials override
+ * @old: The credentials to be restored
+ *
+ * Revert a temporary set of override credentials to an old set, discarding the
+ * override set.
+ */
+void revert_creds(const struct cred *old)
+{
+	const struct cred *override = current->cred;
+
+	rcu_assign_pointer(current->cred, old);
+	put_cred(override);
+}
+EXPORT_SYMBOL(revert_creds);
+
+/*
+ * initialise the credentials stuff
+ */
+void __init cred_init(void)
+{
+	/* allocate a slab in which we can store credentials */
+	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
+				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+}
diff --git a/kernel/exit.c b/kernel/exit.c
index bbc22530f2c1..c0711da15486 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,12 +47,14 @@
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
+#include <linux/init_task.h>
 #include <trace/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
+#include "cred-internals.h"
 
 static void exit_mm(struct task_struct * tsk);
 
@@ -338,12 +340,12 @@ static void reparent_to_kthreadd(void)
 	/* cpus_allowed? */
 	/* rt_priority? */
 	/* signals? */
-	security_task_reparent_to_init(current);
 	memcpy(current->signal->rlim, init_task.signal->rlim,
 	       sizeof(current->signal->rlim));
-	atomic_inc(&(INIT_USER->__count));
+
+	atomic_inc(&init_cred.usage);
+	commit_creds(&init_cred);
 	write_unlock_irq(&tasklist_lock);
-	switch_uid(INIT_USER);
 }
 
 void __set_special_pids(struct pid *pid)
@@ -1085,7 +1087,6 @@ NORET_TYPE void do_exit(long code)
 	check_stack_usage();
 	exit_thread();
 	cgroup_exit(tsk, 1);
-	exit_keys(tsk);
 
 	if (group_dead && tsk->signal->leader)
 		disassociate_ctty(1);
diff --git a/kernel/fork.c b/kernel/fork.c
index ded1972672a3..82a7948a664e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1084,10 +1084,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_sighand;
 	if ((retval = copy_mm(clone_flags, p)))
 		goto bad_fork_cleanup_signal;
-	if ((retval = copy_keys(clone_flags, p)))
-		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
-		goto bad_fork_cleanup_keys;
+		goto bad_fork_cleanup_mm;
 	if ((retval = copy_io(clone_flags, p)))
 		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
@@ -1252,8 +1250,6 @@ bad_fork_cleanup_io:
 	put_io_context(p->io_context);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
-bad_fork_cleanup_keys:
-	exit_keys(p);
 bad_fork_cleanup_mm:
 	if (p->mm)
 		mmput(p->mm);
@@ -1281,6 +1277,7 @@ bad_fork_cleanup_cgroup:
 bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
+	atomic_dec(&p->cred->user->processes);
 	put_cred(p->cred);
 bad_fork_free:
 	free_task(p);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index f044f8f57703..b46dbb908669 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module);
 struct subprocess_info {
 	struct work_struct work;
 	struct completion *complete;
+	struct cred *cred;
 	char *path;
 	char **argv;
 	char **envp;
-	struct key *ring;
 	enum umh_wait wait;
 	int retval;
 	struct file *stdin;
@@ -134,19 +134,20 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
-	struct key *new_session, *old_session;
 	int retval;
 
-	/* Unblock all signals and set the session keyring. */
-	new_session = key_get(sub_info->ring);
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
+	/* Unblock all signals */
 	spin_lock_irq(&current->sighand->siglock);
-	old_session = __install_session_keyring(new_session);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	key_put(old_session);
+	/* Install the credentials */
+	commit_creds(sub_info->cred);
+	sub_info->cred = NULL;
 
 	/* Install input pipe when needed */
 	if (sub_info->stdin) {
@@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
 	if (info->cleanup)
 		(*info->cleanup)(info->argv, info->envp);
+	if (info->cred)
+		put_cred(info->cred);
 	kfree(info);
 }
 EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work)
 	pid_t pid;
 	enum umh_wait wait = sub_info->wait;
 
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
 	/* CLONE_VFORK: wait until the usermode helper has execve'd
 	 * successfully We need the data structures to stay around
 	 * until that is done.  */
@@ -362,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 	sub_info->path = path;
 	sub_info->argv = argv;
 	sub_info->envp = envp;
+	sub_info->cred = prepare_usermodehelper_creds();
+	if (!sub_info->cred)
+		return NULL;
 
   out:
 	return sub_info;
@@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
 void call_usermodehelper_setkeys(struct subprocess_info *info,
 				 struct key *session_keyring)
 {
-	info->ring = session_keyring;
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred = info->cred->tgcred;
+	key_put(tgcred->session_keyring);
+	tgcred->session_keyring = key_get(session_keyring);
+#else
+	BUG();
+#endif
 }
 EXPORT_SYMBOL(call_usermodehelper_setkeys);
 
@@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
 
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
 	helper_lock();
 	if (sub_info->path[0] == '\0')
 		goto out;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b9d5f4e4f6a4..f764b8806955 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -171,6 +171,14 @@ int ptrace_attach(struct task_struct *task)
 	if (same_thread_group(task, current))
 		goto out;
 
+	/* Protect the tracee's exec credential calculations from interference;
+	 * SUID, SGID and LSM creds get determined differently under ptrace.
+	 */
+	retval = mutex_lock_interruptible(&task->cred_exec_mutex);
+	if (retval < 0)
+		goto out;
+
+	retval = -EPERM;
 repeat:
 	/*
 	 * Nasty, nasty.
@@ -210,6 +218,7 @@ repeat:
 bad:
 	write_unlock_irqrestore(&tasklist_lock, flags);
 	task_unlock(task);
+	mutex_unlock(&task->cred_exec_mutex);
 out:
 	return retval;
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 84989124bafb..2a64304ed54b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -180,7 +180,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
 /*
  * allocate a new signal queue record
  * - this may be called without locks if and only if t == current, otherwise an
- *   appopriate lock must be held to protect t's user_struct
+ *   appropriate lock must be held to stop the target task from exiting
  */
 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
@@ -194,7 +194,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	 * caller must be holding the RCU readlock (by way of a spinlock) and
 	 * we use RCU protection here
 	 */
-	user = __task_cred(t)->user;
+	user = get_uid(__task_cred(t)->user);
 	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
 	    atomic_read(&user->sigpending) <=
@@ -202,12 +202,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	if (unlikely(q == NULL)) {
 		atomic_dec(&user->sigpending);
+		free_uid(user);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
-		q->user = get_uid(user);
+		q->user = user;
 	}
-	return(q);
+
+	return q;
 }
 
 static void __sigqueue_free(struct sigqueue *q)
diff --git a/kernel/sys.c b/kernel/sys.c
index ccc9eb736d35..ab735040468a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -180,7 +180,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = cred->user;
+			user = (struct user_struct *) cred->user;
 			if (!who)
 				who = cred->uid;
 			else if ((who != cred->uid) &&
@@ -479,47 +479,48 @@ void ctrl_alt_del(void)
  */
 asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 {
-	struct cred *cred = current->cred;
-	int old_rgid = cred->gid;
-	int old_egid = cred->egid;
-	int new_rgid = old_rgid;
-	int new_egid = old_egid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
 	if (retval)
-		return retval;
+		goto error;
 
+	retval = -EPERM;
 	if (rgid != (gid_t) -1) {
-		if ((old_rgid == rgid) ||
-		    (cred->egid == rgid) ||
+		if (old->gid == rgid ||
+		    old->egid == rgid ||
 		    capable(CAP_SETGID))
-			new_rgid = rgid;
+			new->gid = rgid;
 		else
-			return -EPERM;
+			goto error;
 	}
 	if (egid != (gid_t) -1) {
-		if ((old_rgid == egid) ||
-		    (cred->egid == egid) ||
-		    (cred->sgid == egid) ||
+		if (old->gid == egid ||
+		    old->egid == egid ||
+		    old->sgid == egid ||
 		    capable(CAP_SETGID))
-			new_egid = egid;
+			new->egid = egid;
 		else
-			return -EPERM;
-	}
-	if (new_egid != old_egid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
+			goto error;
 	}
+
 	if (rgid != (gid_t) -1 ||
-	    (egid != (gid_t) -1 && egid != old_rgid))
-		cred->sgid = new_egid;
-	cred->fsgid = new_egid;
-	cred->egid = new_egid;
-	cred->gid = new_rgid;
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	    (egid != (gid_t) -1 && egid != old->gid))
+		new->sgid = new->egid;
+	new->fsgid = new->egid;
+
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 /*
@@ -529,40 +530,42 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
  */
 asmlinkage long sys_setgid(gid_t gid)
 {
-	struct cred *cred = current->cred;
-	int old_egid = cred->egid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
 	if (retval)
-		return retval;
+		goto error;
 
-	if (capable(CAP_SETGID)) {
-		if (old_egid != gid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->gid = cred->egid = cred->sgid = cred->fsgid = gid;
-	} else if ((gid == cred->gid) || (gid == cred->sgid)) {
-		if (old_egid != gid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->egid = cred->fsgid = gid;
-	}
+	retval = -EPERM;
+	if (capable(CAP_SETGID))
+		new->gid = new->egid = new->sgid = new->fsgid = gid;
+	else if (gid == old->gid || gid == old->sgid)
+		new->egid = new->fsgid = gid;
 	else
-		return -EPERM;
+		goto error;
 
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
   
-static int set_user(uid_t new_ruid, int dumpclear)
+/*
+ * change the user struct in a credentials set to match the new UID
+ */
+static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
+	new_user = alloc_uid(current->nsproxy->user_ns, new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
@@ -573,13 +576,8 @@ static int set_user(uid_t new_ruid, int dumpclear)
 		return -EAGAIN;
 	}
 
-	switch_uid(new_user);
-
-	if (dumpclear) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
-	}
-	current->cred->uid = new_ruid;
+	free_uid(new->user);
+	new->user = new_user;
 	return 0;
 }
 
@@ -600,55 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear)
  */
 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 {
-	struct cred *cred = current->cred;
-	int old_ruid, old_euid, old_suid, new_ruid, new_euid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
 	if (retval)
-		return retval;
-
-	new_ruid = old_ruid = cred->uid;
-	new_euid = old_euid = cred->euid;
-	old_suid = cred->suid;
+		goto error;
 
+	retval = -EPERM;
 	if (ruid != (uid_t) -1) {
-		new_ruid = ruid;
-		if ((old_ruid != ruid) &&
-		    (cred->euid != ruid) &&
+		new->uid = ruid;
+		if (old->uid != ruid &&
+		    old->euid != ruid &&
 		    !capable(CAP_SETUID))
-			return -EPERM;
+			goto error;
 	}
 
 	if (euid != (uid_t) -1) {
-		new_euid = euid;
-		if ((old_ruid != euid) &&
-		    (cred->euid != euid) &&
-		    (cred->suid != euid) &&
+		new->euid = euid;
+		if (old->uid != euid &&
+		    old->euid != euid &&
+		    old->suid != euid &&
 		    !capable(CAP_SETUID))
-			return -EPERM;
+			goto error;
 	}
 
-	if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
-		return -EAGAIN;
+	retval = -EAGAIN;
+	if (new->uid != old->uid && set_user(new) < 0)
+		goto error;
 
-	if (new_euid != old_euid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
-	}
-	cred->fsuid = cred->euid = new_euid;
 	if (ruid != (uid_t) -1 ||
-	    (euid != (uid_t) -1 && euid != old_ruid))
-		cred->suid = cred->euid;
-	cred->fsuid = cred->euid;
-
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	    (euid != (uid_t) -1 && euid != old->uid))
+		new->suid = new->euid;
+	new->fsuid = new->euid;
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
-}
+	retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
+	if (retval < 0)
+		goto error;
 
+	return commit_creds(new);
 
+error:
+	abort_creds(new);
+	return retval;
+}
 		
 /*
  * setuid() is implemented like SysV with SAVED_IDS 
@@ -663,37 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
  */
 asmlinkage long sys_setuid(uid_t uid)
 {
-	struct cred *cred = current->cred;
-	int old_euid = cred->euid;
-	int old_ruid, old_suid, new_suid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
 	if (retval)
-		return retval;
+		goto error;
 
-	old_ruid = cred->uid;
-	old_suid = cred->suid;
-	new_suid = old_suid;
-	
+	retval = -EPERM;
 	if (capable(CAP_SETUID)) {
-		if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
-			return -EAGAIN;
-		new_suid = uid;
-	} else if ((uid != cred->uid) && (uid != new_suid))
-		return -EPERM;
-
-	if (old_euid != uid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
+		new->suid = new->uid = uid;
+		if (uid != old->uid && set_user(new) < 0) {
+			retval = -EAGAIN;
+			goto error;
+		}
+	} else if (uid != old->uid && uid != new->suid) {
+		goto error;
 	}
-	cred->fsuid = cred->euid = uid;
-	cred->suid = new_suid;
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	new->fsuid = new->euid = uid;
+
+	retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
+	if (retval < 0)
+		goto error;
+
+	return commit_creds(new);
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
+error:
+	abort_creds(new);
+	return retval;
 }
 
 
@@ -703,47 +706,53 @@ asmlinkage long sys_setuid(uid_t uid)
  */
 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 {
-	struct cred *cred = current->cred;
-	int old_ruid = cred->uid;
-	int old_euid = cred->euid;
-	int old_suid = cred->suid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
 	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
 	if (retval)
-		return retval;
+		goto error;
+	old = current_cred();
 
+	retval = -EPERM;
 	if (!capable(CAP_SETUID)) {
-		if ((ruid != (uid_t) -1) && (ruid != cred->uid) &&
-		    (ruid != cred->euid) && (ruid != cred->suid))
-			return -EPERM;
-		if ((euid != (uid_t) -1) && (euid != cred->uid) &&
-		    (euid != cred->euid) && (euid != cred->suid))
-			return -EPERM;
-		if ((suid != (uid_t) -1) && (suid != cred->uid) &&
-		    (suid != cred->euid) && (suid != cred->suid))
-			return -EPERM;
+		if (ruid != (uid_t) -1 && ruid != old->uid &&
+		    ruid != old->euid  && ruid != old->suid)
+			goto error;
+		if (euid != (uid_t) -1 && euid != old->uid &&
+		    euid != old->euid  && euid != old->suid)
+			goto error;
+		if (suid != (uid_t) -1 && suid != old->uid &&
+		    suid != old->euid  && suid != old->suid)
+			goto error;
 	}
+
+	retval = -EAGAIN;
 	if (ruid != (uid_t) -1) {
-		if (ruid != cred->uid &&
-		    set_user(ruid, euid != cred->euid) < 0)
-			return -EAGAIN;
+		new->uid = ruid;
+		if (ruid != old->uid && set_user(new) < 0)
+			goto error;
 	}
-	if (euid != (uid_t) -1) {
-		if (euid != cred->euid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->euid = euid;
-	}
-	cred->fsuid = cred->euid;
+	if (euid != (uid_t) -1)
+		new->euid = euid;
 	if (suid != (uid_t) -1)
-		cred->suid = suid;
+		new->suid = suid;
+	new->fsuid = new->euid;
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
+	if (retval < 0)
+		goto error;
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
@@ -763,40 +772,45 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
  */
 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 {
-	struct cred *cred = current->cred;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
 	if (retval)
-		return retval;
+		goto error;
 
+	retval = -EPERM;
 	if (!capable(CAP_SETGID)) {
-		if ((rgid != (gid_t) -1) && (rgid != cred->gid) &&
-		    (rgid != cred->egid) && (rgid != cred->sgid))
-			return -EPERM;
-		if ((egid != (gid_t) -1) && (egid != cred->gid) &&
-		    (egid != cred->egid) && (egid != cred->sgid))
-			return -EPERM;
-		if ((sgid != (gid_t) -1) && (sgid != cred->gid) &&
-		    (sgid != cred->egid) && (sgid != cred->sgid))
-			return -EPERM;
+		if (rgid != (gid_t) -1 && rgid != old->gid &&
+		    rgid != old->egid  && rgid != old->sgid)
+			goto error;
+		if (egid != (gid_t) -1 && egid != old->gid &&
+		    egid != old->egid  && egid != old->sgid)
+			goto error;
+		if (sgid != (gid_t) -1 && sgid != old->gid &&
+		    sgid != old->egid  && sgid != old->sgid)
+			goto error;
 	}
-	if (egid != (gid_t) -1) {
-		if (egid != cred->egid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->egid = egid;
-	}
-	cred->fsgid = cred->egid;
+
 	if (rgid != (gid_t) -1)
-		cred->gid = rgid;
+		new->gid = rgid;
+	if (egid != (gid_t) -1)
+		new->egid = egid;
 	if (sgid != (gid_t) -1)
-		cred->sgid = sgid;
+		new->sgid = sgid;
+	new->fsgid = new->egid;
 
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
@@ -820,28 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
  */
 asmlinkage long sys_setfsuid(uid_t uid)
 {
-	struct cred *cred = current->cred;
-	int old_fsuid;
+	const struct cred *old;
+	struct cred *new;
+	uid_t old_fsuid;
+
+	new = prepare_creds();
+	if (!new)
+		return current_fsuid();
+	old = current_cred();
+	old_fsuid = old->fsuid;
 
-	old_fsuid = cred->fsuid;
-	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
-		return old_fsuid;
+	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
+		goto error;
 
-	if (uid == cred->uid || uid == cred->euid ||
-	    uid == cred->suid || uid == cred->fsuid ||
+	if (uid == old->uid  || uid == old->euid  ||
+	    uid == old->suid || uid == old->fsuid ||
 	    capable(CAP_SETUID)) {
 		if (uid != old_fsuid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
+			new->fsuid = uid;
+			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
+				goto change_okay;
 		}
-		cred->fsuid = uid;
 	}
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
-
-	security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
+error:
+	abort_creds(new);
+	return old_fsuid;
 
+change_okay:
+	commit_creds(new);
 	return old_fsuid;
 }
 
@@ -850,24 +871,34 @@ asmlinkage long sys_setfsuid(uid_t uid)
  */
 asmlinkage long sys_setfsgid(gid_t gid)
 {
-	struct cred *cred = current->cred;
-	int old_fsgid;
+	const struct cred *old;
+	struct cred *new;
+	gid_t old_fsgid;
+
+	new = prepare_creds();
+	if (!new)
+		return current_fsgid();
+	old = current_cred();
+	old_fsgid = old->fsgid;
 
-	old_fsgid = cred->fsgid;
 	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
-		return old_fsgid;
+		goto error;
 
-	if (gid == cred->gid || gid == cred->egid ||
-	    gid == cred->sgid || gid == cred->fsgid ||
+	if (gid == old->gid  || gid == old->egid  ||
+	    gid == old->sgid || gid == old->fsgid ||
 	    capable(CAP_SETGID)) {
 		if (gid != old_fsgid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
+			new->fsgid = gid;
+			goto change_okay;
 		}
-		cred->fsgid = gid;
-		key_fsgid_changed(current);
-		proc_id_connector(current, PROC_EVENT_GID);
 	}
+
+error:
+	abort_creds(new);
+	return old_fsgid;
+
+change_okay:
+	commit_creds(new);
 	return old_fsgid;
 }
 
@@ -1136,7 +1167,7 @@ EXPORT_SYMBOL(groups_free);
 
 /* export the group_info to a user-space array */
 static int groups_to_user(gid_t __user *grouplist,
-    struct group_info *group_info)
+			  const struct group_info *group_info)
 {
 	int i;
 	unsigned int count = group_info->ngroups;
@@ -1227,31 +1258,25 @@ int groups_search(const struct group_info *group_info, gid_t grp)
 }
 
 /**
- * set_groups - Change a group subscription in a security record
- * @sec: The security record to alter
- * @group_info: The group list to impose
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
  *
- * Validate a group subscription and, if valid, impose it upon a task security
- * record.
+ * Validate a group subscription and, if valid, insert it into a set
+ * of credentials.
  */
-int set_groups(struct cred *cred, struct group_info *group_info)
+int set_groups(struct cred *new, struct group_info *group_info)
 {
 	int retval;
-	struct group_info *old_info;
 
 	retval = security_task_setgroups(group_info);
 	if (retval)
 		return retval;
 
+	put_group_info(new->group_info);
 	groups_sort(group_info);
 	get_group_info(group_info);
-
-	spin_lock(&cred->lock);
-	old_info = cred->group_info;
-	cred->group_info = group_info;
-	spin_unlock(&cred->lock);
-
-	put_group_info(old_info);
+	new->group_info = group_info;
 	return 0;
 }
 
@@ -1266,7 +1291,20 @@ EXPORT_SYMBOL(set_groups);
  */
 int set_current_groups(struct group_info *group_info)
 {
-	return set_groups(current->cred, group_info);
+	struct cred *new;
+	int ret;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = set_groups(new, group_info);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
+	}
+
+	return commit_creds(new);
 }
 
 EXPORT_SYMBOL(set_current_groups);
@@ -1666,9 +1704,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 	unsigned char comm[sizeof(me->comm)];
 	long error;
 
-	if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
+	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
+	if (error != -ENOSYS)
 		return error;
 
+	error = 0;
 	switch (option) {
 		case PR_SET_PDEATHSIG:
 			if (!valid_signal(arg2)) {
diff --git a/kernel/user.c b/kernel/user.c
index 104d22ac84d5..d476307dd4b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/user_namespace.h>
+#include "cred-internals.h"
 
 struct user_namespace init_user_ns = {
 	.kref = {
@@ -104,16 +105,10 @@ static int sched_create_user(struct user_struct *up)
 	return rc;
 }
 
-static void sched_switch_user(struct task_struct *p)
-{
-	sched_move_task(p);
-}
-
 #else	/* CONFIG_USER_SCHED */
 
 static void sched_destroy_user(struct user_struct *up) { }
 static int sched_create_user(struct user_struct *up) { return 0; }
-static void sched_switch_user(struct task_struct *p) { }
 
 #endif	/* CONFIG_USER_SCHED */
 
@@ -448,36 +443,6 @@ out_unlock:
 	return NULL;
 }
 
-void switch_uid(struct user_struct *new_user)
-{
-	struct user_struct *old_user;
-
-	/* What if a process setreuid()'s and this brings the
-	 * new uid over his NPROC rlimit?  We can check this now
-	 * cheaply with the new uid cache, so if it matters
-	 * we should be checking for it.  -DaveM
-	 */
-	old_user = current->cred->user;
-	atomic_inc(&new_user->processes);
-	atomic_dec(&old_user->processes);
-	switch_uid_keyring(new_user);
-	current->cred->user = new_user;
-	sched_switch_user(current);
-
-	/*
-	 * We need to synchronize with __sigqueue_alloc()
-	 * doing a get_uid(p->user).. If that saw the old
-	 * user value, we need to wait until it has exited
-	 * its critical region before we can free the old
-	 * structure.
-	 */
-	smp_mb();
-	spin_unlock_wait(&current->sighand->siglock);
-
-	free_uid(old_user);
-	suid_keys(current);
-}
-
 #ifdef CONFIG_USER_NS
 void release_uids(struct user_namespace *ns)
 {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f82730adea00..0d9c51d67333 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -19,6 +19,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 {
 	struct user_namespace *ns;
 	struct user_struct *new_user;
+	struct cred *new;
 	int n;
 
 	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
@@ -45,7 +46,18 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	switch_uid(new_user);
+	/* Install the new user */
+	new = prepare_creds();
+	if (!new) {
+		/* unwind and bail; falling through would dereference NULL */
+		free_uid(new_user);
+		free_uid(ns->root_user);
+		kfree(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+	free_uid(new->user);
+	new->user = new_user;
+	commit_creds(new);
 	return ns;
 }
 
diff --git a/lib/Makefile b/lib/Makefile
index 7cb65d85aeb0..80fe8a3ec12a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o prio_heap.o ratelimit.o show_mem.o
+	 proportions.o prio_heap.o ratelimit.o show_mem.o is_single_threaded.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 9a8ff684da79..ad8c7a782da1 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -287,6 +287,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
 			      time_t expiry,
 			      u32 kvno)
 {
+	const struct cred *cred = current_cred();
 	struct key *key;
 	int ret;
 
@@ -297,7 +298,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
 
 	_enter("");
 
-	key = key_alloc(&key_type_rxrpc, "x", 0, 0, current, 0,
+	key = key_alloc(&key_type_rxrpc, "x", 0, 0, cred, 0,
 			KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(key)) {
 		_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
@@ -340,10 +341,11 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key);
  */
 struct key *rxrpc_get_null_key(const char *keyname)
 {
+	const struct cred *cred = current_cred();
 	struct key *key;
 	int ret;
 
-	key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current,
+	key = key_alloc(&key_type_rxrpc, keyname, 0, 0, cred,
 			KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(key))
 		return key;
diff --git a/security/capability.c b/security/capability.c
index fac2f61b69a9..efeb6d9e0e6a 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -340,12 +340,16 @@ static int cap_task_create(unsigned long clone_flags)
 	return 0;
 }
 
-static int cap_cred_alloc_security(struct cred *cred)
+static void cap_cred_free(struct cred *cred)
+{
+}
+
+static int cap_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp)
 {
 	return 0;
 }
 
-static void cap_cred_free(struct cred *cred)
+static void cap_cred_commit(struct cred *new, const struct cred *old)
 {
 }
 
@@ -750,7 +754,7 @@ static void cap_release_secctx(char *secdata, u32 seclen)
 }
 
 #ifdef CONFIG_KEYS
-static int cap_key_alloc(struct key *key, struct task_struct *ctx,
+static int cap_key_alloc(struct key *key, const struct cred *cred,
 			 unsigned long flags)
 {
 	return 0;
@@ -760,7 +764,7 @@ static void cap_key_free(struct key *key)
 {
 }
 
-static int cap_key_permission(key_ref_t key_ref, struct task_struct *context,
+static int cap_key_permission(key_ref_t key_ref, const struct cred *cred,
 			      key_perm_t perm)
 {
 	return 0;
@@ -814,8 +818,7 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, ptrace_may_access);
 	set_to_cap_if_null(ops, ptrace_traceme);
 	set_to_cap_if_null(ops, capget);
-	set_to_cap_if_null(ops, capset_check);
-	set_to_cap_if_null(ops, capset_set);
+	set_to_cap_if_null(ops, capset);
 	set_to_cap_if_null(ops, acct);
 	set_to_cap_if_null(ops, capable);
 	set_to_cap_if_null(ops, quotactl);
@@ -890,10 +893,11 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, file_receive);
 	set_to_cap_if_null(ops, dentry_open);
 	set_to_cap_if_null(ops, task_create);
-	set_to_cap_if_null(ops, cred_alloc_security);
 	set_to_cap_if_null(ops, cred_free);
+	set_to_cap_if_null(ops, cred_prepare);
+	set_to_cap_if_null(ops, cred_commit);
 	set_to_cap_if_null(ops, task_setuid);
-	set_to_cap_if_null(ops, task_post_setuid);
+	set_to_cap_if_null(ops, task_fix_setuid);
 	set_to_cap_if_null(ops, task_setgid);
 	set_to_cap_if_null(ops, task_setpgid);
 	set_to_cap_if_null(ops, task_getpgid);
@@ -910,7 +914,6 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, task_wait);
 	set_to_cap_if_null(ops, task_kill);
 	set_to_cap_if_null(ops, task_prctl);
-	set_to_cap_if_null(ops, task_reparent_to_init);
 	set_to_cap_if_null(ops, task_to_inode);
 	set_to_cap_if_null(ops, ipc_permission);
 	set_to_cap_if_null(ops, ipc_getsecid);
diff --git a/security/commoncap.c b/security/commoncap.c
index 0384bf95db68..b5419273f92d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -72,8 +72,8 @@ int cap_ptrace_may_access(struct task_struct *child, unsigned int mode)
 	int ret = 0;
 
 	rcu_read_lock();
-	if (!cap_issubset(child->cred->cap_permitted,
-			  current->cred->cap_permitted) &&
+	if (!cap_issubset(__task_cred(child)->cap_permitted,
+			  current_cred()->cap_permitted) &&
 	    !capable(CAP_SYS_PTRACE))
 		ret = -EPERM;
 	rcu_read_unlock();
@@ -85,8 +85,8 @@ int cap_ptrace_traceme(struct task_struct *parent)
 	int ret = 0;
 
 	rcu_read_lock();
-	if (!cap_issubset(current->cred->cap_permitted,
-			 parent->cred->cap_permitted) &&
+	if (!cap_issubset(current_cred()->cap_permitted,
+			  __task_cred(parent)->cap_permitted) &&
 	    !has_capability(parent, CAP_SYS_PTRACE))
 		ret = -EPERM;
 	rcu_read_unlock();
@@ -117,7 +117,7 @@ static inline int cap_inh_is_capped(void)
 	 * to the old permitted set. That is, if the current task
 	 * does *not* possess the CAP_SETPCAP capability.
 	 */
-	return (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0);
+	return cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0;
 }
 
 static inline int cap_limit_ptraced_target(void) { return 1; }
@@ -132,52 +132,39 @@ static inline int cap_limit_ptraced_target(void)
 
 #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
 
-int cap_capset_check(const kernel_cap_t *effective,
-		     const kernel_cap_t *inheritable,
-		     const kernel_cap_t *permitted)
+int cap_capset(struct cred *new,
+	       const struct cred *old,
+	       const kernel_cap_t *effective,
+	       const kernel_cap_t *inheritable,
+	       const kernel_cap_t *permitted)
 {
-	const struct cred *cred = current->cred;
-
-	if (cap_inh_is_capped()
-	    && !cap_issubset(*inheritable,
-			     cap_combine(cred->cap_inheritable,
-					 cred->cap_permitted))) {
+	if (cap_inh_is_capped() &&
+	    !cap_issubset(*inheritable,
+			  cap_combine(old->cap_inheritable,
+				      old->cap_permitted)))
 		/* incapable of using this inheritable set */
 		return -EPERM;
-	}
+
 	if (!cap_issubset(*inheritable,
-			   cap_combine(cred->cap_inheritable,
-				       cred->cap_bset))) {
+			  cap_combine(old->cap_inheritable,
+				      old->cap_bset)))
 		/* no new pI capabilities outside bounding set */
 		return -EPERM;
-	}
 
 	/* verify restrictions on target's new Permitted set */
-	if (!cap_issubset (*permitted,
-			   cap_combine (cred->cap_permitted,
-					cred->cap_permitted))) {
+	if (!cap_issubset(*permitted, old->cap_permitted))
 		return -EPERM;
-	}
 
 	/* verify the _new_Effective_ is a subset of the _new_Permitted_ */
-	if (!cap_issubset (*effective, *permitted)) {
+	if (!cap_issubset(*effective, *permitted))
 		return -EPERM;
-	}
 
+	new->cap_effective   = *effective;
+	new->cap_inheritable = *inheritable;
+	new->cap_permitted   = *permitted;
 	return 0;
 }
 
-void cap_capset_set(const kernel_cap_t *effective,
-		    const kernel_cap_t *inheritable,
-		    const kernel_cap_t *permitted)
-{
-	struct cred *cred = current->cred;
-
-	cred->cap_effective   = *effective;
-	cred->cap_inheritable = *inheritable;
-	cred->cap_permitted   = *permitted;
-}
-
 static inline void bprm_clear_caps(struct linux_binprm *bprm)
 {
 	cap_clear(bprm->cap_post_exec_permitted);
@@ -382,41 +369,46 @@ int cap_bprm_set_security (struct linux_binprm *bprm)
 	return ret;
 }
 
-void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
+int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 {
-	struct cred *cred = current->cred;
+	const struct cred *old = current_cred();
+	struct cred *new;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-	if (bprm->e_uid != cred->uid || bprm->e_gid != cred->gid ||
+	if (bprm->e_uid != old->uid || bprm->e_gid != old->gid ||
 	    !cap_issubset(bprm->cap_post_exec_permitted,
-			  cred->cap_permitted)) {
+			  old->cap_permitted)) {
 		set_dumpable(current->mm, suid_dumpable);
 		current->pdeath_signal = 0;
 
 		if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
 			if (!capable(CAP_SETUID)) {
-				bprm->e_uid = cred->uid;
-				bprm->e_gid = cred->gid;
+				bprm->e_uid = old->uid;
+				bprm->e_gid = old->gid;
 			}
 			if (cap_limit_ptraced_target()) {
 				bprm->cap_post_exec_permitted = cap_intersect(
 					bprm->cap_post_exec_permitted,
-					cred->cap_permitted);
+					new->cap_permitted);
 			}
 		}
 	}
 
-	cred->suid = cred->euid = cred->fsuid = bprm->e_uid;
-	cred->sgid = cred->egid = cred->fsgid = bprm->e_gid;
+	new->suid = new->euid = new->fsuid = bprm->e_uid;
+	new->sgid = new->egid = new->fsgid = bprm->e_gid;
 
 	/* For init, we want to retain the capabilities set
 	 * in the init_task struct. Thus we skip the usual
 	 * capability rules */
 	if (!is_global_init(current)) {
-		cred->cap_permitted = bprm->cap_post_exec_permitted;
+		new->cap_permitted = bprm->cap_post_exec_permitted;
 		if (bprm->cap_effective)
-			cred->cap_effective = bprm->cap_post_exec_permitted;
+			new->cap_effective = bprm->cap_post_exec_permitted;
 		else
-			cap_clear(cred->cap_effective);
+			cap_clear(new->cap_effective);
 	}
 
 	/*
@@ -431,15 +423,15 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 	 * Number 1 above might fail if you don't have a full bset, but I think
 	 * that is interesting information to audit.
 	 */
-	if (!cap_isclear(cred->cap_effective)) {
-		if (!cap_issubset(CAP_FULL_SET, cred->cap_effective) ||
-		    (bprm->e_uid != 0) || (cred->uid != 0) ||
+	if (!cap_isclear(new->cap_effective)) {
+		if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
+		    bprm->e_uid != 0 || new->uid != 0 ||
 		    issecure(SECURE_NOROOT))
-			audit_log_bprm_fcaps(bprm, &cred->cap_permitted,
-					     &cred->cap_effective);
+			audit_log_bprm_fcaps(bprm, new, old);
 	}
 
-	cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+	return commit_creds(new);
 }
 
 int cap_bprm_secureexec (struct linux_binprm *bprm)
@@ -514,65 +506,49 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
  * files..
  * Thanks to Olaf Kirch and Peter Benie for spotting this.
  */
-static inline void cap_emulate_setxuid (int old_ruid, int old_euid,
-					int old_suid)
+static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
 {
-	struct cred *cred = current->cred;
-
-	if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) &&
-	    (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) &&
+	if ((old->uid == 0 || old->euid == 0 || old->suid == 0) &&
+	    (new->uid != 0 && new->euid != 0 && new->suid != 0) &&
 	    !issecure(SECURE_KEEP_CAPS)) {
-		cap_clear(cred->cap_permitted);
-		cap_clear(cred->cap_effective);
-	}
-	if (old_euid == 0 && cred->euid != 0) {
-		cap_clear(cred->cap_effective);
-	}
-	if (old_euid != 0 && cred->euid == 0) {
-		cred->cap_effective = cred->cap_permitted;
+		cap_clear(new->cap_permitted);
+		cap_clear(new->cap_effective);
 	}
+	if (old->euid == 0 && new->euid != 0)
+		cap_clear(new->cap_effective);
+	if (old->euid != 0 && new->euid == 0)
+		new->cap_effective = new->cap_permitted;
 }
 
-int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid,
-			  int flags)
+int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
 {
-	struct cred *cred = current->cred;
-
 	switch (flags) {
 	case LSM_SETID_RE:
 	case LSM_SETID_ID:
 	case LSM_SETID_RES:
 		/* Copied from kernel/sys.c:setreuid/setuid/setresuid. */
-		if (!issecure (SECURE_NO_SETUID_FIXUP)) {
-			cap_emulate_setxuid (old_ruid, old_euid, old_suid);
-		}
+		if (!issecure(SECURE_NO_SETUID_FIXUP))
+			cap_emulate_setxuid(new, old);
 		break;
 	case LSM_SETID_FS:
-		{
-			uid_t old_fsuid = old_ruid;
-
-			/* Copied from kernel/sys.c:setfsuid. */
+		/* Copied from kernel/sys.c:setfsuid. */
 
-			/*
-			 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
-			 *          if not, we might be a bit too harsh here.
-			 */
-
-			if (!issecure (SECURE_NO_SETUID_FIXUP)) {
-				if (old_fsuid == 0 && cred->fsuid != 0) {
-					cred->cap_effective =
-						cap_drop_fs_set(
-							cred->cap_effective);
-				}
-				if (old_fsuid != 0 && cred->fsuid == 0) {
-					cred->cap_effective =
-						cap_raise_fs_set(
-						    cred->cap_effective,
-						    cred->cap_permitted);
-				}
+		/*
+		 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
+		 *          if not, we might be a bit too harsh here.
+		 */
+		if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+			if (old->fsuid == 0 && new->fsuid != 0) {
+				new->cap_effective =
+					cap_drop_fs_set(new->cap_effective);
+			}
+			if (old->fsuid != 0 && new->fsuid == 0) {
+				new->cap_effective =
+					cap_raise_fs_set(new->cap_effective,
+							 new->cap_permitted);
 			}
-			break;
 		}
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -628,13 +604,14 @@ int cap_task_setnice (struct task_struct *p, int nice)
  * this task could get inconsistent info.  There can be no
  * racing writer bc a task can only change its own caps.
  */
-static long cap_prctl_drop(unsigned long cap)
+static long cap_prctl_drop(struct cred *new, unsigned long cap)
 {
 	if (!capable(CAP_SETPCAP))
 		return -EPERM;
 	if (!cap_valid(cap))
 		return -EINVAL;
-	cap_lower(current->cred->cap_bset, cap);
+
+	cap_lower(new->cap_bset, cap);
 	return 0;
 }
 
@@ -655,22 +632,29 @@ int cap_task_setnice (struct task_struct *p, int nice)
 #endif
 
 int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-		   unsigned long arg4, unsigned long arg5, long *rc_p)
+		   unsigned long arg4, unsigned long arg5)
 {
-	struct cred *cred = current_cred();
+	struct cred *new;
 	long error = 0;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
 	switch (option) {
 	case PR_CAPBSET_READ:
+		error = -EINVAL;
 		if (!cap_valid(arg2))
-			error = -EINVAL;
-		else
-			error = !!cap_raised(cred->cap_bset, arg2);
-		break;
+			goto error;
+		error = !!cap_raised(new->cap_bset, arg2);
+		goto no_change;
+
 #ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 	case PR_CAPBSET_DROP:
-		error = cap_prctl_drop(arg2);
-		break;
+		error = cap_prctl_drop(new, arg2);
+		if (error < 0)
+			goto error;
+		goto changed;
 
 	/*
 	 * The next four prctl's remain to assist with transitioning a
@@ -692,12 +676,12 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 	 * capability-based-privilege environment.
 	 */
 	case PR_SET_SECUREBITS:
-		if ((((cred->securebits & SECURE_ALL_LOCKS) >> 1)
-		     & (cred->securebits ^ arg2))                  /*[1]*/
-		    || ((cred->securebits & SECURE_ALL_LOCKS
-			 & ~arg2))                                    /*[2]*/
-		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/
-		    || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0)) { /*[4]*/
+		error = -EPERM;
+		if ((((new->securebits & SECURE_ALL_LOCKS) >> 1)
+		     & (new->securebits ^ arg2))			/*[1]*/
+		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
+		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
+		    || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0) /*[4]*/
 			/*
 			 * [1] no changing of bits that are locked
 			 * [2] no unlocking of locks
@@ -705,50 +689,51 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			 * [4] doing anything requires privilege (go read about
 			 *     the "sendmail capabilities bug")
 			 */
-			error = -EPERM;  /* cannot change a locked bit */
-		} else {
-			cred->securebits = arg2;
-		}
-		break;
+		    )
+			/* cannot change a locked bit */
+			goto error;
+		new->securebits = arg2;
+		goto changed;
+
 	case PR_GET_SECUREBITS:
-		error = cred->securebits;
-		break;
+		error = new->securebits;
+		goto no_change;
 
 #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
 
 	case PR_GET_KEEPCAPS:
 		if (issecure(SECURE_KEEP_CAPS))
 			error = 1;
-		break;
+		goto no_change;
+
 	case PR_SET_KEEPCAPS:
+		error = -EINVAL;
 		if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
-			error = -EINVAL;
-		else if (issecure(SECURE_KEEP_CAPS_LOCKED))
-			error = -EPERM;
-		else if (arg2)
-			cred->securebits |= issecure_mask(SECURE_KEEP_CAPS);
+			goto error;
+		error = -EPERM;
+		if (issecure(SECURE_KEEP_CAPS_LOCKED))
+			goto error;
+		if (arg2)
+			new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
 		else
-			cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
-		break;
+			new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+		goto changed;
 
 	default:
 		/* No functionality available - continue with default */
-		return 0;
+		error = -ENOSYS;
+		goto error;
 	}
 
 	/* Functionality provided */
-	*rc_p = error;
-	return 1;
-}
-
-void cap_task_reparent_to_init (struct task_struct *p)
-{
-	struct cred *cred = p->cred;
-
-	cap_set_init_eff(cred->cap_effective);
-	cap_clear(cred->cap_inheritable);
-	cap_set_full(cred->cap_permitted);
-	p->cred->securebits = SECUREBITS_DEFAULT;
+changed:
+	return commit_creds(new);
+
+no_change:
+	/* nothing was modified: 'error' carries the queried value (>= 0) */
+error:
+	abort_creds(new);
+	return error;
 }
 
 int cap_syslog (int type)
diff --git a/security/keys/internal.h b/security/keys/internal.h
index d1586c629788..81932abefe7b 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -12,6 +12,7 @@
 #ifndef _INTERNAL_H
 #define _INTERNAL_H
 
+#include <linux/sched.h>
 #include <linux/key-type.h>
 
 static inline __attribute__((format(printf, 1, 2)))
@@ -25,7 +26,7 @@ void no_printk(const char *fmt, ...)
 #define kleave(FMT, ...) \
 	printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) \
-	printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
+	printk(KERN_DEBUG "   "FMT"\n", ##__VA_ARGS__)
 #else
 #define kenter(FMT, ...) \
 	no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
@@ -97,7 +98,7 @@ extern struct key *keyring_search_instkey(struct key *keyring,
 typedef int (*key_match_func_t)(const struct key *, const void *);
 
 extern key_ref_t keyring_search_aux(key_ref_t keyring_ref,
-				    struct task_struct *tsk,
+				    const struct cred *cred,
 				    struct key_type *type,
 				    const void *description,
 				    key_match_func_t match);
@@ -105,13 +106,13 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref,
 extern key_ref_t search_process_keyrings(struct key_type *type,
 					 const void *description,
 					 key_match_func_t match,
-					 struct task_struct *tsk);
+					 const struct cred *cred);
 
 extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check);
 
 extern int install_user_keyrings(void);
-extern int install_thread_keyring(void);
-extern int install_process_keyring(void);
+extern int install_thread_keyring_to_cred(struct cred *);
+extern int install_process_keyring_to_cred(struct cred *);
 
 extern struct key *request_key_and_link(struct key_type *type,
 					const char *description,
@@ -130,12 +131,12 @@ extern long join_session_keyring(const char *name);
  * check to see whether permission is granted to use a key in the desired way
  */
 extern int key_task_permission(const key_ref_t key_ref,
-			       struct task_struct *context,
+			       const struct cred *cred,
 			       key_perm_t perm);
 
 static inline int key_permission(const key_ref_t key_ref, key_perm_t perm)
 {
-	return key_task_permission(key_ref, current, perm);
+	return key_task_permission(key_ref, current_cred(), perm);
 }
 
 /* required permissions */
@@ -153,7 +154,7 @@ static inline int key_permission(const key_ref_t key_ref, key_perm_t perm)
 struct request_key_auth {
 	struct key		*target_key;
 	struct key		*dest_keyring;
-	struct task_struct	*context;
+	const struct cred	*cred;
 	void			*callout_info;
 	size_t			callout_len;
 	pid_t			pid;
diff --git a/security/keys/key.c b/security/keys/key.c
index a6ca39ed3b0e..f76c8a546fd3 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -218,7 +218,7 @@ serial_exists:
  *   instantiate the key or discard it before returning
  */
 struct key *key_alloc(struct key_type *type, const char *desc,
-		      uid_t uid, gid_t gid, struct task_struct *ctx,
+		      uid_t uid, gid_t gid, const struct cred *cred,
 		      key_perm_t perm, unsigned long flags)
 {
 	struct key_user *user = NULL;
@@ -294,7 +294,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 #endif
 
 	/* let the security module know about the key */
-	ret = security_key_alloc(key, ctx, flags);
+	ret = security_key_alloc(key, cred, flags);
 	if (ret < 0)
 		goto security_error;
 
@@ -391,7 +391,7 @@ static int __key_instantiate_and_link(struct key *key,
 				      const void *data,
 				      size_t datalen,
 				      struct key *keyring,
-				      struct key *instkey)
+				      struct key *authkey)
 {
 	int ret, awaken;
 
@@ -421,8 +421,8 @@ static int __key_instantiate_and_link(struct key *key,
 				ret = __key_link(keyring, key);
 
 			/* disable the authorisation key */
-			if (instkey)
-				key_revoke(instkey);
+			if (authkey)
+				key_revoke(authkey);
 		}
 	}
 
@@ -444,14 +444,14 @@ int key_instantiate_and_link(struct key *key,
 			     const void *data,
 			     size_t datalen,
 			     struct key *keyring,
-			     struct key *instkey)
+			     struct key *authkey)
 {
 	int ret;
 
 	if (keyring)
 		down_write(&keyring->sem);
 
-	ret = __key_instantiate_and_link(key, data, datalen, keyring, instkey);
+	ret = __key_instantiate_and_link(key, data, datalen, keyring, authkey);
 
 	if (keyring)
 		up_write(&keyring->sem);
@@ -469,7 +469,7 @@ EXPORT_SYMBOL(key_instantiate_and_link);
 int key_negate_and_link(struct key *key,
 			unsigned timeout,
 			struct key *keyring,
-			struct key *instkey)
+			struct key *authkey)
 {
 	struct timespec now;
 	int ret, awaken;
@@ -504,8 +504,8 @@ int key_negate_and_link(struct key *key,
 			ret = __key_link(keyring, key);
 
 		/* disable the authorisation key */
-		if (instkey)
-			key_revoke(instkey);
+		if (authkey)
+			key_revoke(authkey);
 	}
 
 	mutex_unlock(&key_construction_mutex);
@@ -743,6 +743,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			       key_perm_t perm,
 			       unsigned long flags)
 {
+	const struct cred *cred = current_cred();
 	struct key_type *ktype;
 	struct key *keyring, *key = NULL;
 	key_ref_t key_ref;
@@ -802,8 +803,8 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	}
 
 	/* allocate a new key */
-	key = key_alloc(ktype, description, current_fsuid(), current_fsgid(),
-			current, perm, flags);
+	key = key_alloc(ktype, description, cred->fsuid, cred->fsgid, cred,
+			perm, flags);
 	if (IS_ERR(key)) {
 		key_ref = ERR_CAST(key);
 		goto error_3;
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 8833b447adef..7c72baa02f2e 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -866,6 +866,23 @@ static long get_instantiation_keyring(key_serial_t ringid,
 	return -ENOKEY;
 }
 
+/*
+ * change the request_key authorisation key on the current process
+ */
+static int keyctl_change_reqkey_auth(struct key *key)
+{
+	struct cred *new;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	key_put(new->request_key_auth);
+	new->request_key_auth = key_get(key);
+
+	return commit_creds(new);
+}
+
 /*****************************************************************************/
 /*
  * instantiate the key with the specified payload, and, if one is given, link
@@ -876,12 +893,15 @@ long keyctl_instantiate_key(key_serial_t id,
 			    size_t plen,
 			    key_serial_t ringid)
 {
+	const struct cred *cred = current_cred();
 	struct request_key_auth *rka;
 	struct key *instkey, *dest_keyring;
 	void *payload;
 	long ret;
 	bool vm = false;
 
+	kenter("%d,,%zu,%d", id, plen, ringid);
+
 	ret = -EINVAL;
 	if (plen > 1024 * 1024 - 1)
 		goto error;
@@ -889,7 +909,7 @@ long keyctl_instantiate_key(key_serial_t id,
 	/* the appropriate instantiation authorisation key must have been
 	 * assumed before calling this */
 	ret = -EPERM;
-	instkey = current->cred->request_key_auth;
+	instkey = cred->request_key_auth;
 	if (!instkey)
 		goto error;
 
@@ -931,10 +951,8 @@ long keyctl_instantiate_key(key_serial_t id,
 
 	/* discard the assumed authority if it's just been disabled by
 	 * instantiation of the key */
-	if (ret == 0) {
-		key_put(current->cred->request_key_auth);
-		current->cred->request_key_auth = NULL;
-	}
+	if (ret == 0)
+		keyctl_change_reqkey_auth(NULL);
 
 error2:
 	if (!vm)
@@ -953,14 +971,17 @@ error:
  */
 long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 {
+	const struct cred *cred = current_cred();
 	struct request_key_auth *rka;
 	struct key *instkey, *dest_keyring;
 	long ret;
 
+	kenter("%d,%u,%d", id, timeout, ringid);
+
 	/* the appropriate instantiation authorisation key must have been
 	 * assumed before calling this */
 	ret = -EPERM;
-	instkey = current->cred->request_key_auth;
+	instkey = cred->request_key_auth;
 	if (!instkey)
 		goto error;
 
@@ -982,10 +1003,8 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 
 	/* discard the assumed authority if it's just been disabled by
 	 * instantiation of the key */
-	if (ret == 0) {
-		key_put(current->cred->request_key_auth);
-		current->cred->request_key_auth = NULL;
-	}
+	if (ret == 0)
+		keyctl_change_reqkey_auth(NULL);
 
 error:
 	return ret;
@@ -999,36 +1018,56 @@ error:
  */
 long keyctl_set_reqkey_keyring(int reqkey_defl)
 {
-	struct cred *cred = current->cred;
-	int ret;
+	struct cred *new;
+	int ret, old_setting;
+
+	old_setting = current_cred_xxx(jit_keyring);
+
+	if (reqkey_defl == KEY_REQKEY_DEFL_NO_CHANGE)
+		return old_setting;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
 	switch (reqkey_defl) {
 	case KEY_REQKEY_DEFL_THREAD_KEYRING:
-		ret = install_thread_keyring();
+		ret = install_thread_keyring_to_cred(new);
 		if (ret < 0)
-			return ret;
+			goto error;
 		goto set;
 
 	case KEY_REQKEY_DEFL_PROCESS_KEYRING:
-		ret = install_process_keyring();
-		if (ret < 0)
-			return ret;
+		ret = install_process_keyring_to_cred(new);
+		if (ret < 0) {
+			if (ret != -EEXIST)
+				goto error;
+			ret = 0;
+		}
+		goto set;
 
 	case KEY_REQKEY_DEFL_DEFAULT:
 	case KEY_REQKEY_DEFL_SESSION_KEYRING:
 	case KEY_REQKEY_DEFL_USER_KEYRING:
 	case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
-	set:
-		cred->jit_keyring = reqkey_defl;
+	case KEY_REQKEY_DEFL_REQUESTOR_KEYRING:
+		goto set;
 
 	case KEY_REQKEY_DEFL_NO_CHANGE:
-		return cred->jit_keyring;
-
 	case KEY_REQKEY_DEFL_GROUP_KEYRING:
 	default:
-		return -EINVAL;
+		ret = -EINVAL;
+		goto error;
 	}
 
+set:
+	new->jit_keyring = reqkey_defl;
+	commit_creds(new);
+	return old_setting;
+error:
+	/* discard the unused creds and propagate the actual failure code */
+	abort_creds(new);
+	return ret;
 } /* end keyctl_set_reqkey_keyring() */
 
 /*****************************************************************************/
@@ -1087,9 +1126,7 @@ long keyctl_assume_authority(key_serial_t id)
 
 	/* we divest ourselves of authority if given an ID of 0 */
 	if (id == 0) {
-		key_put(current->cred->request_key_auth);
-		current->cred->request_key_auth = NULL;
-		ret = 0;
+		ret = keyctl_change_reqkey_auth(NULL);
 		goto error;
 	}
 
@@ -1104,10 +1141,12 @@ long keyctl_assume_authority(key_serial_t id)
 		goto error;
 	}
 
-	key_put(current->cred->request_key_auth);
-	current->cred->request_key_auth = authkey;
-	ret = authkey->serial;
+	/* the new creds take their own ref on authkey; always drop ours */
+	ret = keyctl_change_reqkey_auth(authkey);
+	if (ret == 0)
+		ret = authkey->serial;
 
+	key_put(authkey);
 error:
 	return ret;
 
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index fdf75f901991..ed851574d073 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -245,14 +245,14 @@ static long keyring_read(const struct key *keyring,
  * allocate a keyring and link into the destination keyring
  */
 struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
-			  struct task_struct *ctx, unsigned long flags,
+			  const struct cred *cred, unsigned long flags,
 			  struct key *dest)
 {
 	struct key *keyring;
 	int ret;
 
 	keyring = key_alloc(&key_type_keyring, description,
-			    uid, gid, ctx,
+			    uid, gid, cred,
 			    (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
 			    flags);
 
@@ -281,7 +281,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
  * - we propagate the possession attribute from the keyring ref to the key ref
  */
 key_ref_t keyring_search_aux(key_ref_t keyring_ref,
-			     struct task_struct *context,
+			     const struct cred *cred,
 			     struct key_type *type,
 			     const void *description,
 			     key_match_func_t match)
@@ -304,7 +304,7 @@ key_ref_t keyring_search_aux(key_ref_t keyring_ref,
 	key_check(keyring);
 
 	/* top keyring must have search permission to begin the search */
-        err = key_task_permission(keyring_ref, context, KEY_SEARCH);
+        err = key_task_permission(keyring_ref, cred, KEY_SEARCH);
 	if (err < 0) {
 		key_ref = ERR_PTR(err);
 		goto error;
@@ -377,7 +377,7 @@ descend:
 
 		/* key must have search permissions */
 		if (key_task_permission(make_key_ref(key, possessed),
-					context, KEY_SEARCH) < 0)
+					cred, KEY_SEARCH) < 0)
 			continue;
 
 		/* we set a different error code if we pass a negative key */
@@ -404,7 +404,7 @@ ascend:
 			continue;
 
 		if (key_task_permission(make_key_ref(key, possessed),
-					context, KEY_SEARCH) < 0)
+					cred, KEY_SEARCH) < 0)
 			continue;
 
 		/* stack the current position */
@@ -459,7 +459,7 @@ key_ref_t keyring_search(key_ref_t keyring,
 	if (!type->match)
 		return ERR_PTR(-ENOKEY);
 
-	return keyring_search_aux(keyring, current,
+	return keyring_search_aux(keyring, current->cred,
 				  type, description, type->match);
 
 } /* end keyring_search() */
diff --git a/security/keys/permission.c b/security/keys/permission.c
index 13c36164f284..5d9fc7b93f2e 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -14,24 +14,27 @@
 #include "internal.h"
 
 /*****************************************************************************/
-/*
- * check to see whether permission is granted to use a key in the desired way,
- * but permit the security modules to override
+/**
+ * key_task_permission - Check a key can be used
+ * @key_ref: The key to check
+ * @cred: The credentials to use
+ * @perm: The permissions to check for
+ *
+ * Check to see whether permission is granted to use a key in the desired way,
+ * but permit the security modules to override.
+ *
+ * The caller must hold either a ref on cred or must hold the RCU readlock or a
+ * spinlock.
  */
-int key_task_permission(const key_ref_t key_ref,
-			struct task_struct *context,
+int key_task_permission(const key_ref_t key_ref, const struct cred *cred,
 			key_perm_t perm)
 {
-	const struct cred *cred;
 	struct key *key;
 	key_perm_t kperm;
 	int ret;
 
 	key = key_ref_to_ptr(key_ref);
 
-	rcu_read_lock();
-	cred = __task_cred(context);
-
 	/* use the second 8-bits of permissions for keys the caller owns */
 	if (key->uid == cred->fsuid) {
 		kperm = key->perm >> 16;
@@ -57,7 +60,6 @@ int key_task_permission(const key_ref_t key_ref,
 	kperm = key->perm;
 
 use_these_perms:
-	rcu_read_lock();
 
 	/* use the top 8-bits of permissions for keys the caller possesses
 	 * - possessor permissions are additive with other permissions
@@ -71,7 +73,7 @@ use_these_perms:
 		return -EACCES;
 
 	/* let LSM be the final arbiter */
-	return security_key_permission(key_ref, context, perm);
+	return security_key_permission(key_ref, cred, perm);
 
 } /* end key_task_permission() */
 
diff --git a/security/keys/proc.c b/security/keys/proc.c
index f619170da760..7f508def50e3 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -136,8 +136,12 @@ static int proc_keys_show(struct seq_file *m, void *v)
 	int rc;
 
 	/* check whether the current task is allowed to view the key (assuming
-	 * non-possession) */
-	rc = key_task_permission(make_key_ref(key, 0), current, KEY_VIEW);
+	 * non-possession)
+	 * - the caller holds a spinlock, and thus the RCU read lock, making our
+	 *   access to __current_cred() safe
+	 */
+	rc = key_task_permission(make_key_ref(key, 0), current_cred(),
+				 KEY_VIEW);
 	if (rc < 0)
 		return 0;
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 70ee93406f30..df329f684a65 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -42,11 +42,15 @@ struct key_user root_key_user = {
  */
 int install_user_keyrings(void)
 {
-	struct user_struct *user = current->cred->user;
+	struct user_struct *user;
+	const struct cred *cred;
 	struct key *uid_keyring, *session_keyring;
 	char buf[20];
 	int ret;
 
+	cred = current_cred();
+	user = cred->user;
+
 	kenter("%p{%u}", user, user->uid);
 
 	if (user->uid_keyring) {
@@ -67,7 +71,7 @@ int install_user_keyrings(void)
 		uid_keyring = find_keyring_by_name(buf, true);
 		if (IS_ERR(uid_keyring)) {
 			uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1,
-						    current, KEY_ALLOC_IN_QUOTA,
+						    cred, KEY_ALLOC_IN_QUOTA,
 						    NULL);
 			if (IS_ERR(uid_keyring)) {
 				ret = PTR_ERR(uid_keyring);
@@ -83,8 +87,7 @@ int install_user_keyrings(void)
 		if (IS_ERR(session_keyring)) {
 			session_keyring =
 				keyring_alloc(buf, user->uid, (gid_t) -1,
-					      current, KEY_ALLOC_IN_QUOTA,
-					      NULL);
+					      cred, KEY_ALLOC_IN_QUOTA, NULL);
 			if (IS_ERR(session_keyring)) {
 				ret = PTR_ERR(session_keyring);
 				goto error_release;
@@ -116,142 +119,128 @@ error:
 	return ret;
 }
 
-/*****************************************************************************/
 /*
- * deal with the UID changing
+ * install a fresh thread keyring directly to new credentials
  */
-void switch_uid_keyring(struct user_struct *new_user)
+int install_thread_keyring_to_cred(struct cred *new)
 {
-#if 0 /* do nothing for now */
-	struct key *old;
-
-	/* switch to the new user's session keyring if we were running under
-	 * root's default session keyring */
-	if (new_user->uid != 0 &&
-	    current->session_keyring == &root_session_keyring
-	    ) {
-		atomic_inc(&new_user->session_keyring->usage);
-
-		task_lock(current);
-		old = current->session_keyring;
-		current->session_keyring = new_user->session_keyring;
-		task_unlock(current);
+	struct key *keyring;
 
-		key_put(old);
-	}
-#endif
+	keyring = keyring_alloc("_tid", new->uid, new->gid, new,
+				KEY_ALLOC_QUOTA_OVERRUN, NULL);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
 
-} /* end switch_uid_keyring() */
+	new->thread_keyring = keyring;
+	return 0;
+}
 
-/*****************************************************************************/
 /*
  * install a fresh thread keyring, discarding the old one
  */
-int install_thread_keyring(void)
+static int install_thread_keyring(void)
 {
-	struct task_struct *tsk = current;
-	struct key *keyring, *old;
-	char buf[20];
+	struct cred *new;
 	int ret;
 
-	sprintf(buf, "_tid.%u", tsk->pid);
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-	keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
-				KEY_ALLOC_QUOTA_OVERRUN, NULL);
-	if (IS_ERR(keyring)) {
-		ret = PTR_ERR(keyring);
-		goto error;
+	BUG_ON(new->thread_keyring);
+
+	ret = install_thread_keyring_to_cred(new);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
 	}
 
-	task_lock(tsk);
-	old = tsk->cred->thread_keyring;
-	tsk->cred->thread_keyring = keyring;
-	task_unlock(tsk);
+	return commit_creds(new);
+}
 
-	ret = 0;
+/*
+ * install a process keyring directly to a credentials struct
+ * - returns -EEXIST if there was already a process keyring, 0 if one installed,
+ *   and other -ve on any other error
+ */
+int install_process_keyring_to_cred(struct cred *new)
+{
+	struct key *keyring;
+	int ret;
 
-	key_put(old);
-error:
+	if (new->tgcred->process_keyring)
+		return -EEXIST;
+
+	keyring = keyring_alloc("_pid", new->uid, new->gid,
+				new, KEY_ALLOC_QUOTA_OVERRUN, NULL);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
+
+	spin_lock_irq(&new->tgcred->lock);
+	if (!new->tgcred->process_keyring) {
+		new->tgcred->process_keyring = keyring;
+		keyring = NULL;
+		ret = 0;
+	} else {
+		ret = -EEXIST;
+	}
+	spin_unlock_irq(&new->tgcred->lock);
+	key_put(keyring);
 	return ret;
+}
 
-} /* end install_thread_keyring() */
-
-/*****************************************************************************/
 /*
  * make sure a process keyring is installed
+ * - it is not an error if the process already has a process keyring
  */
-int install_process_keyring(void)
+static int install_process_keyring(void)
 {
-	struct task_struct *tsk = current;
-	struct key *keyring;
-	char buf[20];
+	struct cred *new;
 	int ret;
 
-	might_sleep();
-
-	if (!tsk->cred->tgcred->process_keyring) {
-		sprintf(buf, "_pid.%u", tsk->tgid);
-
-		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
-					KEY_ALLOC_QUOTA_OVERRUN, NULL);
-		if (IS_ERR(keyring)) {
-			ret = PTR_ERR(keyring);
-			goto error;
-		}
-
-		/* attach keyring */
-		spin_lock_irq(&tsk->cred->tgcred->lock);
-		if (!tsk->cred->tgcred->process_keyring) {
-			tsk->cred->tgcred->process_keyring = keyring;
-			keyring = NULL;
-		}
-		spin_unlock_irq(&tsk->cred->tgcred->lock);
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-		key_put(keyring);
+	ret = install_process_keyring_to_cred(new);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret != -EEXIST ? ret : 0;
 	}
 
-	ret = 0;
-error:
-	return ret;
-
-} /* end install_process_keyring() */
+	return commit_creds(new);
+}
 
-/*****************************************************************************/
 /*
- * install a session keyring, discarding the old one
- * - if a keyring is not supplied, an empty one is invented
+ * install a session keyring directly to a credentials struct
  */
-static int install_session_keyring(struct key *keyring)
+static int install_session_keyring_to_cred(struct cred *cred,
+					   struct key *keyring)
 {
-	struct task_struct *tsk = current;
 	unsigned long flags;
 	struct key *old;
-	char buf[20];
 
 	might_sleep();
 
 	/* create an empty session keyring */
 	if (!keyring) {
-		sprintf(buf, "_ses.%u", tsk->tgid);
-
 		flags = KEY_ALLOC_QUOTA_OVERRUN;
-		if (tsk->cred->tgcred->session_keyring)
+		if (cred->tgcred->session_keyring)
 			flags = KEY_ALLOC_IN_QUOTA;
 
-		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid,
-					tsk, flags, NULL);
+		keyring = keyring_alloc("_ses", cred->uid, cred->gid,
+					cred, flags, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
-	}
-	else {
+	} else {
 		atomic_inc(&keyring->usage);
 	}
 
 	/* install the keyring */
-	spin_lock_irq(&tsk->cred->tgcred->lock);
-	old = tsk->cred->tgcred->session_keyring;
-	rcu_assign_pointer(tsk->cred->tgcred->session_keyring, keyring);
-	spin_unlock_irq(&tsk->cred->tgcred->lock);
+	spin_lock_irq(&cred->tgcred->lock);
+	old = cred->tgcred->session_keyring;
+	rcu_assign_pointer(cred->tgcred->session_keyring, keyring);
+	spin_unlock_irq(&cred->tgcred->lock);
 
 	/* we're using RCU on the pointer, but there's no point synchronising
 	 * on it if it didn't previously point to anything */
@@ -261,38 +250,29 @@ static int install_session_keyring(struct key *keyring)
 	}
 
 	return 0;
+}
 
-} /* end install_session_keyring() */
-
-/*****************************************************************************/
 /*
- * copy the keys for fork
+ * install a session keyring, discarding the old one
+ * - if a keyring is not supplied, an empty one is invented
  */
-int copy_keys(unsigned long clone_flags, struct task_struct *tsk)
+static int install_session_keyring(struct key *keyring)
 {
-	key_check(tsk->cred->thread_keyring);
-	key_check(tsk->cred->request_key_auth);
-
-	/* no thread keyring yet */
-	tsk->cred->thread_keyring = NULL;
-
-	/* copy the request_key() authorisation for this thread */
-	key_get(tsk->cred->request_key_auth);
-
-	return 0;
+	struct cred *new;
+	int ret;
 
-} /* end copy_keys() */
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-/*****************************************************************************/
-/*
- * dispose of per-thread keys upon thread exit
- */
-void exit_keys(struct task_struct *tsk)
-{
-	key_put(tsk->cred->thread_keyring);
-	key_put(tsk->cred->request_key_auth);
+	ret = install_session_keyring_to_cred(new, keyring);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
+	}
 
-} /* end exit_keys() */
+	return commit_creds(new);
+}
 
 /*****************************************************************************/
 /*
@@ -300,38 +280,41 @@ void exit_keys(struct task_struct *tsk)
  */
 int exec_keys(struct task_struct *tsk)
 {
-	struct key *old;
+	struct thread_group_cred *tgcred = NULL;
+	struct cred *new;
 
-	/* newly exec'd tasks don't get a thread keyring */
-	task_lock(tsk);
-	old = tsk->cred->thread_keyring;
-	tsk->cred->thread_keyring = NULL;
-	task_unlock(tsk);
+	tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
+	if (!tgcred)
+		return -ENOMEM;
 
-	key_put(old);
+	new = prepare_creds();
+	if (!new) {
+		kfree(tgcred);
+		return -ENOMEM;
+	}
 
-	/* discard the process keyring from a newly exec'd task */
-	spin_lock_irq(&tsk->cred->tgcred->lock);
-	old = tsk->cred->tgcred->process_keyring;
-	tsk->cred->tgcred->process_keyring = NULL;
-	spin_unlock_irq(&tsk->cred->tgcred->lock);
+	/* newly exec'd tasks don't get a thread keyring */
+	key_put(new->thread_keyring);
+	new->thread_keyring = NULL;
 
-	key_put(old);
+	/* create a new per-thread-group creds for all this set of threads to
+	 * share */
+	memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
 
-	return 0;
+	atomic_set(&tgcred->usage, 1);
+	spin_lock_init(&tgcred->lock);
 
-} /* end exec_keys() */
+	/* inherit the session keyring; new process keyring */
+	key_get(tgcred->session_keyring);
+	tgcred->process_keyring = NULL;
 
-/*****************************************************************************/
-/*
- * deal with SUID programs
- * - we might want to make this invent a new session keyring
- */
-int suid_keys(struct task_struct *tsk)
-{
+	release_tgcred(new);
+	new->tgcred = tgcred;
+
+	commit_creds(new);
 	return 0;
 
-} /* end suid_keys() */
+} /* end exec_keys() */
 
 /*****************************************************************************/
 /*
@@ -376,16 +359,13 @@ void key_fsgid_changed(struct task_struct *tsk)
 key_ref_t search_process_keyrings(struct key_type *type,
 				  const void *description,
 				  key_match_func_t match,
-				  struct task_struct *context)
+				  const struct cred *cred)
 {
 	struct request_key_auth *rka;
-	struct cred *cred;
 	key_ref_t key_ref, ret, err;
 
 	might_sleep();
 
-	cred = get_task_cred(context);
-
 	/* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
 	 * searchable, but we failed to find a key or we found a negative key;
 	 * otherwise we want to return a sample error (probably -EACCES) if
@@ -401,7 +381,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	if (cred->thread_keyring) {
 		key_ref = keyring_search_aux(
 			make_key_ref(cred->thread_keyring, 1),
-			context, type, description, match);
+			cred, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -422,7 +402,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	if (cred->tgcred->process_keyring) {
 		key_ref = keyring_search_aux(
 			make_key_ref(cred->tgcred->process_keyring, 1),
-			context, type, description, match);
+			cred, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -446,7 +426,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 			make_key_ref(rcu_dereference(
 					     cred->tgcred->session_keyring),
 				     1),
-			context, type, description, match);
+			cred, type, description, match);
 		rcu_read_unlock();
 
 		if (!IS_ERR(key_ref))
@@ -468,7 +448,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	else if (cred->user->session_keyring) {
 		key_ref = keyring_search_aux(
 			make_key_ref(cred->user->session_keyring, 1),
-			context, type, description, match);
+			cred, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -490,7 +470,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	 * - we don't permit access to request_key auth keys via this method
 	 */
 	if (cred->request_key_auth &&
-	    context == current &&
+	    cred == current_cred() &&
 	    type != &key_type_request_key_auth
 	    ) {
 		/* defend against the auth key being revoked */
@@ -500,7 +480,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 			rka = cred->request_key_auth->payload.data;
 
 			key_ref = search_process_keyrings(type, description,
-							  match, rka->context);
+							  match, rka->cred);
 
 			up_read(&cred->request_key_auth->sem);
 
@@ -527,7 +507,6 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	key_ref = ret ? ret : err;
 
 found:
-	put_cred(cred);
 	return key_ref;
 
 } /* end search_process_keyrings() */
@@ -552,8 +531,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 			  key_perm_t perm)
 {
 	struct request_key_auth *rka;
-	struct task_struct *t = current;
-	struct cred *cred;
+	const struct cred *cred;
 	struct key *key;
 	key_ref_t key_ref, skey_ref;
 	int ret;
@@ -608,6 +586,7 @@ try_again:
 				goto error;
 			ret = install_session_keyring(
 				cred->user->session_keyring);
+
 			if (ret < 0)
 				goto error;
 			goto reget_creds;
@@ -693,7 +672,7 @@ try_again:
 		/* check to see if we possess the key */
 		skey_ref = search_process_keyrings(key->type, key,
 						   lookup_user_key_possessed,
-						   current);
+						   cred);
 
 		if (!IS_ERR(skey_ref)) {
 			key_put(key);
@@ -725,7 +704,7 @@ try_again:
 		goto invalid_key;
 
 	/* check the permissions */
-	ret = key_task_permission(key_ref, t, perm);
+	ret = key_task_permission(key_ref, cred, perm);
 	if (ret < 0)
 		goto invalid_key;
 
@@ -755,21 +734,33 @@ reget_creds:
  */
 long join_session_keyring(const char *name)
 {
-	struct task_struct *tsk = current;
-	struct cred *cred = current->cred;
+	const struct cred *old;
+	struct cred *new;
 	struct key *keyring;
-	long ret;
+	long ret, serial;
+
+	/* only permit this if there's a single thread in the thread group -
+	 * this avoids us having to adjust the creds on all threads and risking
+	 * ENOMEM */
+	if (!is_single_threaded(current))
+		return -EMLINK;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
 
 	/* if no name is provided, install an anonymous keyring */
 	if (!name) {
-		ret = install_session_keyring(NULL);
+		ret = install_session_keyring_to_cred(new, NULL);
 		if (ret < 0)
 			goto error;
 
-		rcu_read_lock();
-		ret = rcu_dereference(cred->tgcred->session_keyring)->serial;
-		rcu_read_unlock();
-		goto error;
+		serial = new->tgcred->session_keyring->serial;
+		ret = commit_creds(new);
+		if (ret == 0)
+			ret = serial;
+		goto okay;
 	}
 
 	/* allow the user to join or create a named keyring */
@@ -779,29 +770,33 @@ long join_session_keyring(const char *name)
 	keyring = find_keyring_by_name(name, false);
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
-		keyring = keyring_alloc(name, cred->uid, cred->gid, tsk,
+		keyring = keyring_alloc(name, old->uid, old->gid, old,
 					KEY_ALLOC_IN_QUOTA, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
 		}
-	}
-	else if (IS_ERR(keyring)) {
+	} else if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error2;
 	}
 
 	/* we've got a keyring - now to install it */
-	ret = install_session_keyring(keyring);
+	ret = install_session_keyring_to_cred(new, keyring);
 	if (ret < 0)
 		goto error2;
 
+	commit_creds(new);
+	mutex_unlock(&key_session_mutex);
+
 	ret = keyring->serial;
 	key_put(keyring);
+okay:
+	return ret;
 
 error2:
 	mutex_unlock(&key_session_mutex);
 error:
+	abort_creds(new);
 	return ret;
-
-} /* end join_session_keyring() */
+}
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 3d12558362df..0e04f72ef2d4 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -83,8 +83,10 @@ static int call_sbin_request_key(struct key_construction *cons,
 	/* allocate a new session keyring */
 	sprintf(desc, "_req.%u", key->serial);
 
-	keyring = keyring_alloc(desc, current_fsuid(), current_fsgid(), current,
+	cred = get_current_cred();
+	keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred,
 				KEY_ALLOC_QUOTA_OVERRUN, NULL);
+	put_cred(cred);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error_alloc;
@@ -104,8 +106,7 @@ static int call_sbin_request_key(struct key_construction *cons,
 
 	/* we specify the process's default keyrings */
 	sprintf(keyring_str[0], "%d",
-		cred->thread_keyring ?
-		cred->thread_keyring->serial : 0);
+		cred->thread_keyring ? cred->thread_keyring->serial : 0);
 
 	prkey = 0;
 	if (cred->tgcred->process_keyring)
@@ -155,8 +156,8 @@ error_link:
 	key_put(keyring);
 
 error_alloc:
-	kleave(" = %d", ret);
 	complete_request_key(cons, ret);
+	kleave(" = %d", ret);
 	return ret;
 }
 
@@ -295,6 +296,7 @@ static int construct_alloc_key(struct key_type *type,
 			       struct key_user *user,
 			       struct key **_key)
 {
+	const struct cred *cred = current_cred();
 	struct key *key;
 	key_ref_t key_ref;
 
@@ -302,9 +304,8 @@ static int construct_alloc_key(struct key_type *type,
 
 	mutex_lock(&user->cons_lock);
 
-	key = key_alloc(type, description,
-			current_fsuid(), current_fsgid(), current, KEY_POS_ALL,
-			flags);
+	key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred,
+			KEY_POS_ALL, flags);
 	if (IS_ERR(key))
 		goto alloc_failed;
 
@@ -317,8 +318,7 @@ static int construct_alloc_key(struct key_type *type,
 	 * waited for locks */
 	mutex_lock(&key_construction_mutex);
 
-	key_ref = search_process_keyrings(type, description, type->match,
-					  current);
+	key_ref = search_process_keyrings(type, description, type->match, cred);
 	if (!IS_ERR(key_ref))
 		goto key_already_present;
 
@@ -363,6 +363,8 @@ static struct key *construct_key_and_link(struct key_type *type,
 	struct key *key;
 	int ret;
 
+	kenter("");
+
 	user = key_user_lookup(current_fsuid());
 	if (!user)
 		return ERR_PTR(-ENOMEM);
@@ -376,17 +378,21 @@ static struct key *construct_key_and_link(struct key_type *type,
 	if (ret == 0) {
 		ret = construct_key(key, callout_info, callout_len, aux,
 				    dest_keyring);
-		if (ret < 0)
+		if (ret < 0) {
+			kdebug("cons failed");
 			goto construction_failed;
+		}
 	}
 
 	key_put(dest_keyring);
+	kleave(" = key %d", key_serial(key));
 	return key;
 
 construction_failed:
 	key_negate_and_link(key, key_negative_timeout, NULL, NULL);
 	key_put(key);
 	key_put(dest_keyring);
+	kleave(" = %d", ret);
 	return ERR_PTR(ret);
 }
 
@@ -405,6 +411,7 @@ struct key *request_key_and_link(struct key_type *type,
 				 struct key *dest_keyring,
 				 unsigned long flags)
 {
+	const struct cred *cred = current_cred();
 	struct key *key;
 	key_ref_t key_ref;
 
@@ -414,7 +421,7 @@ struct key *request_key_and_link(struct key_type *type,
 
 	/* search all the process keyrings for a key */
 	key_ref = search_process_keyrings(type, description, type->match,
-					  current);
+					  cred);
 
 	if (!IS_ERR(key_ref)) {
 		key = key_ref_to_ptr(key_ref);
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 2125579d5d73..86747151ee5b 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -105,9 +105,9 @@ static void request_key_auth_revoke(struct key *key)
 
 	kenter("{%d}", key->serial);
 
-	if (rka->context) {
-		put_task_struct(rka->context);
-		rka->context = NULL;
+	if (rka->cred) {
+		put_cred(rka->cred);
+		rka->cred = NULL;
 	}
 
 } /* end request_key_auth_revoke() */
@@ -122,9 +122,9 @@ static void request_key_auth_destroy(struct key *key)
 
 	kenter("{%d}", key->serial);
 
-	if (rka->context) {
-		put_task_struct(rka->context);
-		rka->context = NULL;
+	if (rka->cred) {
+		put_cred(rka->cred);
+		rka->cred = NULL;
 	}
 
 	key_put(rka->target_key);
@@ -143,6 +143,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 				 size_t callout_len, struct key *dest_keyring)
 {
 	struct request_key_auth *rka, *irka;
+	const struct cred *cred = current_cred();
 	struct key *authkey = NULL;
 	char desc[20];
 	int ret;
@@ -164,28 +165,25 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 
 	/* see if the calling process is already servicing the key request of
 	 * another process */
-	if (current->cred->request_key_auth) {
+	if (cred->request_key_auth) {
 		/* it is - use that instantiation context here too */
-		down_read(&current->cred->request_key_auth->sem);
+		down_read(&cred->request_key_auth->sem);
 
 		/* if the auth key has been revoked, then the key we're
 		 * servicing is already instantiated */
-		if (test_bit(KEY_FLAG_REVOKED,
-			     &current->cred->request_key_auth->flags))
+		if (test_bit(KEY_FLAG_REVOKED, &cred->request_key_auth->flags))
 			goto auth_key_revoked;
 
-		irka = current->cred->request_key_auth->payload.data;
-		rka->context = irka->context;
+		irka = cred->request_key_auth->payload.data;
+		rka->cred = get_cred(irka->cred);
 		rka->pid = irka->pid;
-		get_task_struct(rka->context);
 
-		up_read(&current->cred->request_key_auth->sem);
+		up_read(&cred->request_key_auth->sem);
 	}
 	else {
 		/* it isn't - use this process as the context */
-		rka->context = current;
+		rka->cred = get_cred(cred);
 		rka->pid = current->pid;
-		get_task_struct(rka->context);
 	}
 
 	rka->target_key = key_get(target);
@@ -197,7 +195,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 	sprintf(desc, "%x", target->serial);
 
 	authkey = key_alloc(&key_type_request_key_auth, desc,
-			    current_fsuid(), current_fsgid(), current,
+			    cred->fsuid, cred->fsgid, cred,
 			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
 			    KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(authkey)) {
@@ -205,16 +203,16 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 		goto error_alloc;
 	}
 
-	/* construct and attach to the keyring */
+	/* construct the auth key */
 	ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL);
 	if (ret < 0)
 		goto error_inst;
 
-	kleave(" = {%d}", authkey->serial);
+	kleave(" = {%d,%d}", authkey->serial, atomic_read(&authkey->usage));
 	return authkey;
 
 auth_key_revoked:
-	up_read(&current->cred->request_key_auth->sem);
+	up_read(&cred->request_key_auth->sem);
 	kfree(rka->callout_info);
 	kfree(rka);
 	kleave("= -EKEYREVOKED");
@@ -257,6 +255,7 @@ static int key_get_instantiation_authkey_match(const struct key *key,
  */
 struct key *key_get_instantiation_authkey(key_serial_t target_id)
 {
+	const struct cred *cred = current_cred();
 	struct key *authkey;
 	key_ref_t authkey_ref;
 
@@ -264,7 +263,7 @@ struct key *key_get_instantiation_authkey(key_serial_t target_id)
 		&key_type_request_key_auth,
 		(void *) (unsigned long) target_id,
 		key_get_instantiation_authkey_match,
-		current);
+		cred);
 
 	if (IS_ERR(authkey_ref)) {
 		authkey = ERR_CAST(authkey_ref);
diff --git a/security/security.c b/security/security.c
index f40a0a04c3c2..a55d739c6864 100644
--- a/security/security.c
+++ b/security/security.c
@@ -145,18 +145,13 @@ int security_capget(struct task_struct *target,
 	return security_ops->capget(target, effective, inheritable, permitted);
 }
 
-int security_capset_check(const kernel_cap_t *effective,
-			  const kernel_cap_t *inheritable,
-			  const kernel_cap_t *permitted)
+int security_capset(struct cred *new, const struct cred *old,
+		    const kernel_cap_t *effective,
+		    const kernel_cap_t *inheritable,
+		    const kernel_cap_t *permitted)
 {
-	return security_ops->capset_check(effective, inheritable, permitted);
-}
-
-void security_capset_set(const kernel_cap_t *effective,
-			 const kernel_cap_t *inheritable,
-			 const kernel_cap_t *permitted)
-{
-	security_ops->capset_set(effective, inheritable, permitted);
+	return security_ops->capset(new, old,
+				    effective, inheritable, permitted);
 }
 
 int security_capable(struct task_struct *tsk, int cap)
@@ -228,9 +223,9 @@ void security_bprm_free(struct linux_binprm *bprm)
 	security_ops->bprm_free_security(bprm);
 }
 
-void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
+int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 {
-	security_ops->bprm_apply_creds(bprm, unsafe);
+	return security_ops->bprm_apply_creds(bprm, unsafe);
 }
 
 void security_bprm_post_apply_creds(struct linux_binprm *bprm)
@@ -616,14 +611,19 @@ int security_task_create(unsigned long clone_flags)
 	return security_ops->task_create(clone_flags);
 }
 
-int security_cred_alloc(struct cred *cred)
+void security_cred_free(struct cred *cred)
 {
-	return security_ops->cred_alloc_security(cred);
+	security_ops->cred_free(cred);
 }
 
-void security_cred_free(struct cred *cred)
+int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
 {
-	security_ops->cred_free(cred);
+	return security_ops->cred_prepare(new, old, gfp);
+}
+
+void security_commit_creds(struct cred *new, const struct cred *old)
+{
+	security_ops->cred_commit(new, old);
 }
 
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
@@ -631,10 +631,10 @@ int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
 	return security_ops->task_setuid(id0, id1, id2, flags);
 }
 
-int security_task_post_setuid(uid_t old_ruid, uid_t old_euid,
-			       uid_t old_suid, int flags)
+int security_task_fix_setuid(struct cred *new, const struct cred *old,
+			     int flags)
 {
-	return security_ops->task_post_setuid(old_ruid, old_euid, old_suid, flags);
+	return security_ops->task_fix_setuid(new, old, flags);
 }
 
 int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags)
@@ -716,14 +716,9 @@ int security_task_wait(struct task_struct *p)
 }
 
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-			 unsigned long arg4, unsigned long arg5, long *rc_p)
-{
-	return security_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p);
-}
-
-void security_task_reparent_to_init(struct task_struct *p)
+			 unsigned long arg4, unsigned long arg5)
 {
-	security_ops->task_reparent_to_init(p);
+	return security_ops->task_prctl(option, arg2, arg3, arg4, arg5);
 }
 
 void security_task_to_inode(struct task_struct *p, struct inode *inode)
@@ -1123,9 +1118,10 @@ EXPORT_SYMBOL(security_skb_classify_flow);
 
 #ifdef CONFIG_KEYS
 
-int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags)
+int security_key_alloc(struct key *key, const struct cred *cred,
+		       unsigned long flags)
 {
-	return security_ops->key_alloc(key, tsk, flags);
+	return security_ops->key_alloc(key, cred, flags);
 }
 
 void security_key_free(struct key *key)
@@ -1134,9 +1130,9 @@ void security_key_free(struct key *key)
 }
 
 int security_key_permission(key_ref_t key_ref,
-			    struct task_struct *context, key_perm_t perm)
+			    const struct cred *cred, key_perm_t perm)
 {
-	return security_ops->key_permission(key_ref, context, perm);
+	return security_ops->key_permission(key_ref, cred, perm);
 }
 
 int security_key_getsecurity(struct key *key, char **_buffer)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index f20cbd681ba6..c71bba78872f 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -156,20 +156,20 @@ static int selinux_secmark_enabled(void)
 	return (atomic_read(&selinux_secmark_refcount) > 0);
 }
 
-/* Allocate and free functions for each kind of security blob. */
-
-static int cred_alloc_security(struct cred *cred)
+/*
+ * initialise the security for the init task
+ */
+static void cred_init_security(void)
 {
+	struct cred *cred = (struct cred *) current->cred;
 	struct task_security_struct *tsec;
 
 	tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL);
 	if (!tsec)
-		return -ENOMEM;
+		panic("SELinux:  Failed to initialize initial task.\n");
 
-	tsec->osid = tsec->sid = SECINITSID_UNLABELED;
+	tsec->osid = tsec->sid = SECINITSID_KERNEL;
 	cred->security = tsec;
-
-	return 0;
 }
 
 /*
@@ -1378,6 +1378,19 @@ static inline u32 signal_to_av(int sig)
 	return perm;
 }
 
+/*
+ * Check permission between a pair of credentials, e.g. the
+ * fork check, ptrace check, etc.
+ */
+static int cred_has_perm(const struct cred *actor,
+			 const struct cred *target,
+			 u32 perms)
+{
+	u32 asid = cred_sid(actor), tsid = cred_sid(target);
+
+	return avc_has_perm(asid, tsid, SECCLASS_PROCESS, perms, NULL);
+}
+
 /*
  * Check permission between a pair of tasks, e.g. signal checks,
  * fork check, ptrace check, etc.
@@ -1820,24 +1833,19 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
 	return secondary_ops->capget(target, effective, inheritable, permitted);
 }
 
-static int selinux_capset_check(const kernel_cap_t *effective,
-				const kernel_cap_t *inheritable,
-				const kernel_cap_t *permitted)
+static int selinux_capset(struct cred *new, const struct cred *old,
+			  const kernel_cap_t *effective,
+			  const kernel_cap_t *inheritable,
+			  const kernel_cap_t *permitted)
 {
 	int error;
 
-	error = secondary_ops->capset_check(effective, inheritable, permitted);
+	error = secondary_ops->capset(new, old,
+				      effective, inheritable, permitted);
 	if (error)
 		return error;
 
-	return task_has_perm(current, current, PROCESS__SETCAP);
-}
-
-static void selinux_capset_set(const kernel_cap_t *effective,
-			       const kernel_cap_t *inheritable,
-			       const kernel_cap_t *permitted)
-{
-	secondary_ops->capset_set(effective, inheritable, permitted);
+	return cred_has_perm(old, new, PROCESS__SETCAP);
 }
 
 static int selinux_capable(struct task_struct *tsk, int cap, int audit)
@@ -2244,16 +2252,23 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 	spin_unlock(&files->file_lock);
 }
 
-static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
+static int selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 {
 	struct task_security_struct *tsec;
 	struct bprm_security_struct *bsec;
+	struct cred *new;
 	u32 sid;
 	int rc;
 
-	secondary_ops->bprm_apply_creds(bprm, unsafe);
+	rc = secondary_ops->bprm_apply_creds(bprm, unsafe);
+	if (rc < 0)
+		return rc;
 
-	tsec = current_security();
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	tsec = new->security;
 
 	bsec = bprm->security;
 	sid = bsec->sid;
@@ -2268,7 +2283,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 					PROCESS__SHARE, NULL);
 			if (rc) {
 				bsec->unsafe = 1;
-				return;
+				goto out;
 			}
 		}
 
@@ -2292,12 +2307,16 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 						  PROCESS__PTRACE, NULL);
 				if (rc) {
 					bsec->unsafe = 1;
-					return;
+					goto out;
 				}
 			}
 		}
 		tsec->sid = sid;
 	}
+
+out:
+	commit_creds(new);
+	return 0;
 }
 
 /*
@@ -3021,6 +3040,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
 static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
 {
 	const struct cred *cred = current_cred();
+	int rc = 0;
 
 #ifndef CONFIG_PPC32
 	if ((prot & PROT_EXEC) && (!file || (!shared && (prot & PROT_WRITE)))) {
@@ -3029,9 +3049,9 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared
 		 * private file mapping that will also be writable.
 		 * This has an additional check.
 		 */
-		int rc = task_has_perm(current, current, PROCESS__EXECMEM);
+		rc = cred_has_perm(cred, cred, PROCESS__EXECMEM);
 		if (rc)
-			return rc;
+			goto error;
 	}
 #endif
 
@@ -3048,7 +3068,9 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared
 
 		return file_has_perm(cred, file, av);
 	}
-	return 0;
+
+error:
+	return rc;
 }
 
 static int selinux_file_mmap(struct file *file, unsigned long reqprot,
@@ -3090,8 +3112,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
 		rc = 0;
 		if (vma->vm_start >= vma->vm_mm->start_brk &&
 		    vma->vm_end <= vma->vm_mm->brk) {
-			rc = task_has_perm(current, current,
-					   PROCESS__EXECHEAP);
+			rc = cred_has_perm(cred, cred, PROCESS__EXECHEAP);
 		} else if (!vma->vm_file &&
 			   vma->vm_start <= vma->vm_mm->start_stack &&
 			   vma->vm_end >= vma->vm_mm->start_stack) {
@@ -3104,8 +3125,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
 			 * modified content.  This typically should only
 			 * occur for text relocations.
 			 */
-			rc = file_has_perm(cred, vma->vm_file,
-					   FILE__EXECMOD);
+			rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD);
 		}
 		if (rc)
 			return rc;
@@ -3211,6 +3231,7 @@ static int selinux_dentry_open(struct file *file, const struct cred *cred)
 	struct file_security_struct *fsec;
 	struct inode *inode;
 	struct inode_security_struct *isec;
+
 	inode = file->f_path.dentry->d_inode;
 	fsec = file->f_security;
 	isec = inode->i_security;
@@ -3247,38 +3268,41 @@ static int selinux_task_create(unsigned long clone_flags)
 	return task_has_perm(current, current, PROCESS__FORK);
 }
 
-static int selinux_cred_alloc_security(struct cred *cred)
+/*
+ * detach and free the LSM part of a set of credentials
+ */
+static void selinux_cred_free(struct cred *cred)
 {
-	struct task_security_struct *tsec1, *tsec2;
-	int rc;
-
-	tsec1 = current_security();
+	struct task_security_struct *tsec = cred->security;
+	cred->security = NULL;
+	kfree(tsec);
+}
 
-	rc = cred_alloc_security(cred);
-	if (rc)
-		return rc;
-	tsec2 = cred->security;
+/*
+ * prepare a new set of credentials for modification
+ */
+static int selinux_cred_prepare(struct cred *new, const struct cred *old,
+				gfp_t gfp)
+{
+	const struct task_security_struct *old_tsec;
+	struct task_security_struct *tsec;
 
-	tsec2->osid = tsec1->osid;
-	tsec2->sid = tsec1->sid;
+	old_tsec = old->security;
 
-	/* Retain the exec, fs, key, and sock SIDs across fork */
-	tsec2->exec_sid = tsec1->exec_sid;
-	tsec2->create_sid = tsec1->create_sid;
-	tsec2->keycreate_sid = tsec1->keycreate_sid;
-	tsec2->sockcreate_sid = tsec1->sockcreate_sid;
+	tsec = kmemdup(old_tsec, sizeof(struct task_security_struct), gfp);
+	if (!tsec)
+		return -ENOMEM;
 
+	new->security = tsec;
 	return 0;
 }
 
 /*
- * detach and free the LSM part of a set of credentials
+ * commit new credentials
  */
-static void selinux_cred_free(struct cred *cred)
+static void selinux_cred_commit(struct cred *new, const struct cred *old)
 {
-	struct task_security_struct *tsec = cred->security;
-	cred->security = NULL;
-	kfree(tsec);
+	secondary_ops->cred_commit(new, old);
 }
 
 static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
@@ -3292,9 +3316,10 @@ static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
 	return 0;
 }
 
-static int selinux_task_post_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
+static int selinux_task_fix_setuid(struct cred *new, const struct cred *old,
+				   int flags)
 {
-	return secondary_ops->task_post_setuid(id0, id1, id2, flags);
+	return secondary_ops->task_fix_setuid(new, old, flags);
 }
 
 static int selinux_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags)
@@ -3368,7 +3393,7 @@ static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim
 	/* Control the ability to change the hard limit (whether
 	   lowering or raising it), so that the hard limit can
 	   later be used as a safe reset point for the soft limit
-	   upon context transitions. See selinux_bprm_apply_creds. */
+	   upon context transitions.  See selinux_bprm_committing_creds. */
 	if (old_rlim->rlim_max != new_rlim->rlim_max)
 		return task_has_perm(current, current, PROCESS__SETRLIMIT);
 
@@ -3422,13 +3447,12 @@ static int selinux_task_prctl(int option,
 			      unsigned long arg2,
 			      unsigned long arg3,
 			      unsigned long arg4,
-			      unsigned long arg5,
-			      long *rc_p)
+			      unsigned long arg5)
 {
 	/* The current prctl operations do not appear to require
 	   any SELinux controls since they merely observe or modify
 	   the state of the current process. */
-	return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p);
+	return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5);
 }
 
 static int selinux_task_wait(struct task_struct *p)
@@ -3436,18 +3460,6 @@ static int selinux_task_wait(struct task_struct *p)
 	return task_has_perm(p, current, PROCESS__SIGCHLD);
 }
 
-static void selinux_task_reparent_to_init(struct task_struct *p)
-{
-	struct task_security_struct *tsec;
-
-	secondary_ops->task_reparent_to_init(p);
-
-	tsec = p->cred->security;
-	tsec->osid = tsec->sid;
-	tsec->sid = SECINITSID_KERNEL;
-	return;
-}
-
 static void selinux_task_to_inode(struct task_struct *p,
 				  struct inode *inode)
 {
@@ -5325,7 +5337,8 @@ static int selinux_setprocattr(struct task_struct *p,
 {
 	struct task_security_struct *tsec;
 	struct task_struct *tracer;
-	u32 sid = 0;
+	struct cred *new;
+	u32 sid = 0, ptsid;
 	int error;
 	char *str = value;
 
@@ -5372,86 +5385,75 @@ static int selinux_setprocattr(struct task_struct *p,
 			return error;
 	}
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
 	/* Permission checking based on the specified context is
 	   performed during the actual operation (execve,
 	   open/mkdir/...), when we know the full context of the
-	   operation.  See selinux_bprm_set_security for the execve
+	   operation.  See selinux_bprm_set_creds for the execve
 	   checks and may_create for the file creation checks. The
 	   operation will then fail if the context is not permitted. */
-	tsec = p->cred->security;
-	if (!strcmp(name, "exec"))
+	tsec = new->security;
+	if (!strcmp(name, "exec")) {
 		tsec->exec_sid = sid;
-	else if (!strcmp(name, "fscreate"))
+	} else if (!strcmp(name, "fscreate")) {
 		tsec->create_sid = sid;
-	else if (!strcmp(name, "keycreate")) {
+	} else if (!strcmp(name, "keycreate")) {
 		error = may_create_key(sid, p);
 		if (error)
-			return error;
+			goto abort_change;
 		tsec->keycreate_sid = sid;
-	} else if (!strcmp(name, "sockcreate"))
+	} else if (!strcmp(name, "sockcreate")) {
 		tsec->sockcreate_sid = sid;
-	else if (!strcmp(name, "current")) {
-		struct av_decision avd;
-
+	} else if (!strcmp(name, "current")) {
+		error = -EINVAL;
 		if (sid == 0)
-			return -EINVAL;
-		/*
-		 * SELinux allows to change context in the following case only.
-		 *  - Single threaded processes.
-		 *  - Multi threaded processes intend to change its context into
-		 *    more restricted domain (defined by TYPEBOUNDS statement).
-		 */
-		if (atomic_read(&p->mm->mm_users) != 1) {
-			struct task_struct *g, *t;
-			struct mm_struct *mm = p->mm;
-			read_lock(&tasklist_lock);
-			do_each_thread(g, t) {
-				if (t->mm == mm && t != p) {
-					read_unlock(&tasklist_lock);
-					error = security_bounded_transition(tsec->sid, sid);
-					if (!error)
-						goto boundary_ok;
-
-					return error;
-				}
-			} while_each_thread(g, t);
-			read_unlock(&tasklist_lock);
+			goto abort_change;
+
+		/* Multithreaded processes may only make bounded transitions */
+		error = -EPERM;
+		if (!is_single_threaded(p)) {
+			error = security_bounded_transition(tsec->sid, sid);
+			if (error)
+				goto abort_change;
 		}
-boundary_ok:
 
 		/* Check permissions for the transition. */
 		error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS,
 				     PROCESS__DYNTRANSITION, NULL);
 		if (error)
-			return error;
+			goto abort_change;
 
 		/* Check for ptracing, and update the task SID if ok.
 		   Otherwise, leave SID unchanged and fail. */
+		ptsid = 0;
 		task_lock(p);
-		rcu_read_lock();
 		tracer = tracehook_tracer_task(p);
-		if (tracer != NULL) {
-			u32 ptsid = task_sid(tracer);
-			rcu_read_unlock();
-			error = avc_has_perm_noaudit(ptsid, sid,
-						     SECCLASS_PROCESS,
-						     PROCESS__PTRACE, 0, &avd);
-			if (!error)
-				tsec->sid = sid;
-			task_unlock(p);
-			avc_audit(ptsid, sid, SECCLASS_PROCESS,
-				  PROCESS__PTRACE, &avd, error, NULL);
+		if (tracer)
+			ptsid = task_sid(tracer);
+		task_unlock(p);
+
+		if (tracer) {
+			error = avc_has_perm(ptsid, sid, SECCLASS_PROCESS,
+					     PROCESS__PTRACE, NULL);
 			if (error)
-				return error;
-		} else {
-			rcu_read_unlock();
-			tsec->sid = sid;
-			task_unlock(p);
+				goto abort_change;
 		}
-	} else
-		return -EINVAL;
 
+		tsec->sid = sid;
+	} else {
+		error = -EINVAL;
+		goto abort_change;
+	}
+
+	commit_creds(new);
 	return size;
+
+abort_change:
+	abort_creds(new);
+	return error;
 }
 
 static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
@@ -5471,23 +5473,21 @@ static void selinux_release_secctx(char *secdata, u32 seclen)
 
 #ifdef CONFIG_KEYS
 
-static int selinux_key_alloc(struct key *k, struct task_struct *tsk,
+static int selinux_key_alloc(struct key *k, const struct cred *cred,
 			     unsigned long flags)
 {
-	const struct task_security_struct *__tsec;
+	const struct task_security_struct *tsec;
 	struct key_security_struct *ksec;
 
 	ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL);
 	if (!ksec)
 		return -ENOMEM;
 
-	rcu_read_lock();
-	__tsec = __task_cred(tsk)->security;
-	if (__tsec->keycreate_sid)
-		ksec->sid = __tsec->keycreate_sid;
+	tsec = cred->security;
+	if (tsec->keycreate_sid)
+		ksec->sid = tsec->keycreate_sid;
 	else
-		ksec->sid = __tsec->sid;
-	rcu_read_unlock();
+		ksec->sid = tsec->sid;
 
 	k->security = ksec;
 	return 0;
@@ -5502,8 +5502,8 @@ static void selinux_key_free(struct key *k)
 }
 
 static int selinux_key_permission(key_ref_t key_ref,
-			    struct task_struct *ctx,
-			    key_perm_t perm)
+				  const struct cred *cred,
+				  key_perm_t perm)
 {
 	struct key *key;
 	struct key_security_struct *ksec;
@@ -5515,7 +5515,7 @@ static int selinux_key_permission(key_ref_t key_ref,
 	if (perm == 0)
 		return 0;
 
-	sid = task_sid(ctx);
+	sid = cred_sid(cred);
 
 	key = key_ref_to_ptr(key_ref);
 	ksec = key->security;
@@ -5545,8 +5545,7 @@ static struct security_operations selinux_ops = {
 	.ptrace_may_access =		selinux_ptrace_may_access,
 	.ptrace_traceme =		selinux_ptrace_traceme,
 	.capget =			selinux_capget,
-	.capset_check =			selinux_capset_check,
-	.capset_set =			selinux_capset_set,
+	.capset =			selinux_capset,
 	.sysctl =			selinux_sysctl,
 	.capable =			selinux_capable,
 	.quotactl =			selinux_quotactl,
@@ -5621,10 +5620,11 @@ static struct security_operations selinux_ops = {
 	.dentry_open =			selinux_dentry_open,
 
 	.task_create =			selinux_task_create,
-	.cred_alloc_security =		selinux_cred_alloc_security,
 	.cred_free =			selinux_cred_free,
+	.cred_prepare =			selinux_cred_prepare,
+	.cred_commit =			selinux_cred_commit,
 	.task_setuid =			selinux_task_setuid,
-	.task_post_setuid =		selinux_task_post_setuid,
+	.task_fix_setuid =		selinux_task_fix_setuid,
 	.task_setgid =			selinux_task_setgid,
 	.task_setpgid =			selinux_task_setpgid,
 	.task_getpgid =			selinux_task_getpgid,
@@ -5641,7 +5641,6 @@ static struct security_operations selinux_ops = {
 	.task_kill =			selinux_task_kill,
 	.task_wait =			selinux_task_wait,
 	.task_prctl =			selinux_task_prctl,
-	.task_reparent_to_init =	selinux_task_reparent_to_init,
 	.task_to_inode =		selinux_task_to_inode,
 
 	.ipc_permission =		selinux_ipc_permission,
@@ -5737,8 +5736,6 @@ static struct security_operations selinux_ops = {
 
 static __init int selinux_init(void)
 {
-	struct task_security_struct *tsec;
-
 	if (!security_module_enable(&selinux_ops)) {
 		selinux_enabled = 0;
 		return 0;
@@ -5752,10 +5749,7 @@ static __init int selinux_init(void)
 	printk(KERN_INFO "SELinux:  Initializing.\n");
 
 	/* Set the security state for the initial task. */
-	if (cred_alloc_security(current->cred))
-		panic("SELinux:  Failed to initialize initial task.\n");
-	tsec = current->cred->security;
-	tsec->osid = tsec->sid = SECINITSID_KERNEL;
+	cred_init_security();
 
 	sel_inode_cache = kmem_cache_create("selinux_inode_security",
 					    sizeof(struct inode_security_struct),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 11167fd567b9..e952b397153d 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -104,8 +104,7 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(current->cred->security, ctp->cred->security,
-			MAY_READWRITE);
+	rc = smk_access(current_security(), task_security(ctp), MAY_READWRITE);
 	if (rc != 0 && capable(CAP_MAC_OVERRIDE))
 		return 0;
 	return rc;
@@ -127,8 +126,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(ptp->cred->security, current->cred->security,
-			MAY_READWRITE);
+	rc = smk_access(task_security(ptp), current_security(), MAY_READWRITE);
 	if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE))
 		return 0;
 	return rc;
@@ -976,22 +974,6 @@ static int smack_file_receive(struct file *file)
  * Task hooks
  */
 
-/**
- * smack_cred_alloc_security - "allocate" a task cred blob
- * @cred: the task creds in need of a blob
- *
- * Smack isn't using copies of blobs. Everyone
- * points to an immutable list. No alloc required.
- * No data copy required.
- *
- * Always returns 0
- */
-static int smack_cred_alloc_security(struct cred *cred)
-{
-	cred->security = current_security();
-	return 0;
-}
-
 /**
  * smack_cred_free - "free" task-level security credentials
  * @cred: the credentials in question
@@ -1005,6 +987,30 @@ static void smack_cred_free(struct cred *cred)
 	cred->security = NULL;
 }
 
+/**
+ * smack_cred_prepare - prepare new set of credentials for modification
+ * @new: the new credentials
+ * @old: the original credentials
+ * @gfp: the atomicity of any memory allocations
+ *
+ * Prepare a new set of credentials for modification.
+ */
+static int smack_cred_prepare(struct cred *new, const struct cred *old,
+			      gfp_t gfp)
+{
+	new->security = old->security;
+	return 0;
+}
+
+/**
+ * smack_cred_commit - commit new credentials
+ * @new: the new credentials
+ * @old: the original credentials
+ */
+static void smack_cred_commit(struct cred *new, const struct cred *old)
+{
+}
+
 /**
  * smack_task_setpgid - Smack check on setting pgid
  * @p: the task object
@@ -2036,6 +2042,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
 static int smack_setprocattr(struct task_struct *p, char *name,
 			     void *value, size_t size)
 {
+	struct cred *new;
 	char *newsmack;
 
 	/*
@@ -2058,7 +2065,11 @@ static int smack_setprocattr(struct task_struct *p, char *name,
 	if (newsmack == NULL)
 		return -EINVAL;
 
-	p->cred->security = newsmack;
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	new->security = newsmack;
+	commit_creds(new);
 	return size;
 }
 
@@ -2354,17 +2365,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 /**
  * smack_key_alloc - Set the key security blob
  * @key: object
- * @tsk: the task associated with the key
+ * @cred: the credentials to use
  * @flags: unused
  *
  * No allocation required
  *
  * Returns 0
  */
-static int smack_key_alloc(struct key *key, struct task_struct *tsk,
+static int smack_key_alloc(struct key *key, const struct cred *cred,
 			   unsigned long flags)
 {
-	key->security = tsk->cred->security;
+	key->security = cred->security;
 	return 0;
 }
 
@@ -2382,14 +2393,14 @@ static void smack_key_free(struct key *key)
 /*
  * smack_key_permission - Smack access on a key
  * @key_ref: gets to the object
- * @context: task involved
+ * @cred: the credentials to use
  * @perm: unused
  *
  * Return 0 if the task has read and write to the object,
  * an error code otherwise
  */
 static int smack_key_permission(key_ref_t key_ref,
-				struct task_struct *context, key_perm_t perm)
+				const struct cred *cred, key_perm_t perm)
 {
 	struct key *keyp;
 
@@ -2405,11 +2416,10 @@ static int smack_key_permission(key_ref_t key_ref,
 	/*
 	 * This should not occur
 	 */
-	if (context->cred->security == NULL)
+	if (cred->security == NULL)
 		return -EACCES;
 
-	return smk_access(context->cred->security, keyp->security,
-			  MAY_READWRITE);
+	return smk_access(cred->security, keyp->security, MAY_READWRITE);
 }
 #endif /* CONFIG_KEYS */
 
@@ -2580,8 +2590,7 @@ struct security_operations smack_ops = {
 	.ptrace_may_access =		smack_ptrace_may_access,
 	.ptrace_traceme =		smack_ptrace_traceme,
 	.capget = 			cap_capget,
-	.capset_check = 		cap_capset_check,
-	.capset_set = 			cap_capset_set,
+	.capset = 			cap_capset,
 	.capable = 			cap_capable,
 	.syslog = 			smack_syslog,
 	.settime = 			cap_settime,
@@ -2630,9 +2639,10 @@ struct security_operations smack_ops = {
 	.file_send_sigiotask = 		smack_file_send_sigiotask,
 	.file_receive = 		smack_file_receive,
 
-	.cred_alloc_security = 		smack_cred_alloc_security,
 	.cred_free =			smack_cred_free,
-	.task_post_setuid =		cap_task_post_setuid,
+	.cred_prepare =			smack_cred_prepare,
+	.cred_commit =			smack_cred_commit,
+	.task_fix_setuid =		cap_task_fix_setuid,
 	.task_setpgid = 		smack_task_setpgid,
 	.task_getpgid = 		smack_task_getpgid,
 	.task_getsid = 			smack_task_getsid,
@@ -2645,7 +2655,6 @@ struct security_operations smack_ops = {
 	.task_movememory = 		smack_task_movememory,
 	.task_kill = 			smack_task_kill,
 	.task_wait = 			smack_task_wait,
-	.task_reparent_to_init =	cap_task_reparent_to_init,
 	.task_to_inode = 		smack_task_to_inode,
 	.task_prctl =			cap_task_prctl,
 
@@ -2721,6 +2730,8 @@ struct security_operations smack_ops = {
  */
 static __init int smack_init(void)
 {
+	struct cred *cred;
+
 	if (!security_module_enable(&smack_ops))
 		return 0;
 
@@ -2729,7 +2740,8 @@ static __init int smack_init(void)
 	/*
 	 * Set the security state for the initial task.
 	 */
-	current->cred->security = &smack_known_floor.smk_known;
+	cred = (struct cred *) current->cred;
+	cred->security = &smack_known_floor.smk_known;
 
 	/*
 	 * Initialize locks
-- 
cgit v1.3-14-g43fede


From 411c41eea58bd3500cf897e2c27dd5330935a3a8 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 25 Nov 2008 00:40:37 -0800
Subject: aoe: remove private mac address format function

Add %pm to omit the colons when printing a mac address.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/block/aoe/aoe.h    |  1 -
 drivers/block/aoe/aoeblk.c |  2 +-
 drivers/block/aoe/aoecmd.c | 22 ++++++++++------------
 drivers/block/aoe/aoenet.c | 11 -----------
 lib/vsprintf.c             |  3 +++
 5 files changed, 14 insertions(+), 25 deletions(-)

(limited to 'lib')

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 93f3690396a5..c237527b1aa5 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -200,4 +200,3 @@ void aoenet_xmit(struct sk_buff_head *);
 int is_aoe_netif(struct net_device *ifp);
 int set_aoe_iflist(const char __user *str, size_t size);
 
-unsigned long long mac_addr(char addr[6]);
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 1747dd272cd4..2307a271bdc9 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -37,7 +37,7 @@ static ssize_t aoedisk_show_mac(struct device *dev,
 
 	if (t == NULL)
 		return snprintf(page, PAGE_SIZE, "none\n");
-	return snprintf(page, PAGE_SIZE, "%012llx\n", mac_addr(t->addr));
+	return snprintf(page, PAGE_SIZE, "%pm\n", t->addr);
 }
 static ssize_t aoedisk_show_netif(struct device *dev,
 				  struct device_attribute *attr, char *page)
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 71ff78c9e4d6..45c5a33daf49 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -349,11 +349,9 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
 	ah = (struct aoe_atahdr *) (h+1);
 
 	snprintf(buf, sizeof buf,
-		"%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x "
-		"s=%012llx d=%012llx nout=%d\n",
+		"%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
 		"retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
-		mac_addr(h->src),
-		mac_addr(h->dst), t->nout);
+		h->src, h->dst, t->nout);
 	aoechr_error(buf);
 
 	f->tag = n;
@@ -544,10 +542,10 @@ rexmit_timer(ulong vp)
 				printk(KERN_INFO
 					"aoe: e%ld.%d: "
 					"too many lost jumbo on "
-					"%s:%012llx - "
+					"%s:%pm - "
 					"falling back to %d frames.\n",
 					d->aoemajor, d->aoeminor,
-					ifp->nd->name, mac_addr(t->addr),
+					ifp->nd->name, t->addr,
 					DEFAULTBCNT);
 				ifp->maxbcnt = 0;
 			}
@@ -672,8 +670,8 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
 
 	if (d->ssize != ssize)
 		printk(KERN_INFO
-			"aoe: %012llx e%ld.%d v%04x has %llu sectors\n",
-			mac_addr(t->addr),
+			"aoe: %pm e%ld.%d v%04x has %llu sectors\n",
+			t->addr,
 			d->aoemajor, d->aoeminor,
 			d->fw_ver, (long long)ssize);
 	d->ssize = ssize;
@@ -775,8 +773,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
 	n = get_unaligned_be32(&hin->tag);
 	t = gettgt(d, hin->src);
 	if (t == NULL) {
-		printk(KERN_INFO "aoe: can't find target e%ld.%d:%012llx\n",
-			d->aoemajor, d->aoeminor, mac_addr(hin->src));
+		printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
+			d->aoemajor, d->aoeminor, hin->src);
 		spin_unlock_irqrestore(&d->lock, flags);
 		return;
 	}
@@ -1036,10 +1034,10 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
 		n = n ? n * 512 : DEFAULTBCNT;
 		if (n != ifp->maxbcnt) {
 			printk(KERN_INFO
-				"aoe: e%ld.%d: setting %d%s%s:%012llx\n",
+				"aoe: e%ld.%d: setting %d%s%s:%pm\n",
 				d->aoemajor, d->aoeminor, n,
 				" byte data frames on ", ifp->nd->name,
-				mac_addr(t->addr));
+				t->addr);
 			ifp->maxbcnt = n;
 		}
 	}
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index 9157d64270cb..30de5b1c647e 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -83,17 +83,6 @@ set_aoe_iflist(const char __user *user_str, size_t size)
 	return 0;
 }
 
-unsigned long long
-mac_addr(char addr[6])
-{
-	__be64 n = 0;
-	char *p = (char *) &n;
-
-	memcpy(p + 2, addr, 6);	/* (sizeof addr != 6) */
-
-	return (unsigned long long) __be64_to_cpu(n);
-}
-
 void
 aoenet_xmit(struct sk_buff_head *queue)
 {
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 6897724ff5df..3b777025d876 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -669,6 +669,9 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
 		return symbol_string(buf, end, ptr, field_width, precision, flags);
 	case 'R':
 		return resource_string(buf, end, ptr, field_width, precision, flags);
+	case 'm':
+		flags |= SPECIAL;
+		/* Fallthrough */
 	case 'M':
 		return mac_address_string(buf, end, ptr, field_width, precision, flags);
 	case 'i':
-- 
cgit v1.3-14-g43fede


From 3ae7020543db0b769538e64d1ce8d51fceff60ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 26 Nov 2008 10:02:00 +0100
Subject: debugobjects: add boot parameter default value

Impact: add .config driven boot parameter default value

Right now debugobjects can only be activated if the debug_objects
boot parameter is passed in via the boot command line.

Make this more convenient (and randomizable) by also providing
a .config method. Enable it by default. (DEBUG_OBJECTS itself
is default-off)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/Kconfig.debug  | 8 ++++++++
 lib/debugobjects.c | 4 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e443bc..124ece1e67ad 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -252,6 +252,14 @@ config DEBUG_OBJECTS_TIMERS
 	  timer routines to track the life time of timer objects and
 	  validate the timer operations.
 
+config DEBUG_OBJECTS_ENABLE_DEFAULT
+	int "debug_objects bootup default value (0-1)"
+        range 0 1
+        default "1"
+        depends on DEBUG_OBJECTS
+        help
+          Debug objects boot parameter default value
+
 config DEBUG_SLAB
 	bool "Debug slab memory allocations"
 	depends on DEBUG_KERNEL && SLAB
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index e3ab374e1334..5d99be1fd988 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -45,7 +45,9 @@ static struct kmem_cache	*obj_cache;
 static int			debug_objects_maxchain __read_mostly;
 static int			debug_objects_fixups __read_mostly;
 static int			debug_objects_warnings __read_mostly;
-static int			debug_objects_enabled __read_mostly;
+static int			debug_objects_enabled __read_mostly
+				= CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT;
+
 static struct debug_obj_descr	*descr_test  __read_mostly;
 
 static int __init enable_object_debug(char *str)
-- 
cgit v1.3-14-g43fede


From aab46da0520af9c99b7802cebe4f14a81ff39415 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:20:27 +1030
Subject: cpumask: Add CONFIG_CPUMASK_OFFSTACK

Impact: Add config option to enable code in cpumask.h

Currently it can be set if DEBUG_PER_CPU_MAPS is enabled, or selected
explicitly by an architecture.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 lib/Kconfig | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'lib')

diff --git a/lib/Kconfig b/lib/Kconfig
index 85cf7ea978aa..7823f8342abf 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -157,4 +157,11 @@ config CHECK_SIGNATURE
 config HAVE_LMB
 	boolean
 
+config CPUMASK_OFFSTACK
+	bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
+	help
+	  Use dynamic allocation for cpumask_var_t, instead of putting
+	  them on the stack.  This is a bit more expensive, but avoids
+	  stack overflow.
+
 endmenu
-- 
cgit v1.3-14-g43fede


From b93a531e315e97ef00367099e6b5f19651936e20 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 16 Dec 2008 11:40:27 +0000
Subject: allow bug table entries to use relative pointers (and use it on
 x86-64)

Impact: reduce bug table size

This allows reducing the bug table size by half. Perhaps there are
other 64-bit architectures that could also make use of this.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig           |  4 ++++
 arch/x86/include/asm/bug.h |  2 +-
 include/asm-generic/bug.h  |  8 ++++++++
 lib/bug.c                  | 19 +++++++++++++++++--
 4 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ac22bb7719f7..ab98cca84e1b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -87,6 +87,10 @@ config GENERIC_IOMAP
 config GENERIC_BUG
 	def_bool y
 	depends on BUG
+	select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+
+config GENERIC_BUG_RELATIVE_POINTERS
+	bool
 
 config GENERIC_HWEIGHT
 	def_bool y
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 3def2065fcea..d9cf1cd156d2 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -9,7 +9,7 @@
 #ifdef CONFIG_X86_32
 # define __BUG_C0	"2:\t.long 1b, %c0\n"
 #else
-# define __BUG_C0	"2:\t.quad 1b, %c0\n"
+# define __BUG_C0	"2:\t.long 1b - 2b, %c0 - 2b\n"
 #endif
 
 #define BUG()							\
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 12c07c1866b2..4c794d73fb84 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -8,9 +8,17 @@
 #ifdef CONFIG_GENERIC_BUG
 #ifndef __ASSEMBLY__
 struct bug_entry {
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
 	unsigned long	bug_addr;
+#else
+	signed int	bug_addr_disp;
+#endif
 #ifdef CONFIG_DEBUG_BUGVERBOSE
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
 	const char	*file;
+#else
+	signed int	file_disp;
+#endif
 	unsigned short	line;
 #endif
 	unsigned short	flags;
diff --git a/lib/bug.c b/lib/bug.c
index bfeafd60ee9f..300e41afbf97 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -5,6 +5,8 @@
 
   CONFIG_BUG - emit BUG traps.  Nothing happens without this.
   CONFIG_GENERIC_BUG - enable this code.
+  CONFIG_GENERIC_BUG_RELATIVE_POINTERS - use 32-bit pointers relative to
+	the containing struct bug_entry for bug_addr and file.
   CONFIG_DEBUG_BUGVERBOSE - emit full file+line information for each BUG
 
   CONFIG_BUG and CONFIG_DEBUG_BUGVERBOSE are potentially user-settable
@@ -43,6 +45,15 @@
 
 extern const struct bug_entry __start___bug_table[], __stop___bug_table[];
 
+static inline unsigned long bug_addr(const struct bug_entry *bug)
+{
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
+	return bug->bug_addr;
+#else
+	return (unsigned long)bug + bug->bug_addr_disp;
+#endif
+}
+
 #ifdef CONFIG_MODULES
 static LIST_HEAD(module_bug_list);
 
@@ -55,7 +66,7 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr)
 		unsigned i;
 
 		for (i = 0; i < mod->num_bugs; ++i, ++bug)
-			if (bugaddr == bug->bug_addr)
+			if (bugaddr == bug_addr(bug))
 				return bug;
 	}
 	return NULL;
@@ -108,7 +119,7 @@ const struct bug_entry *find_bug(unsigned long bugaddr)
 	const struct bug_entry *bug;
 
 	for (bug = __start___bug_table; bug < __stop___bug_table; ++bug)
-		if (bugaddr == bug->bug_addr)
+		if (bugaddr == bug_addr(bug))
 			return bug;
 
 	return module_find_bug(bugaddr);
@@ -133,7 +144,11 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 
 	if (bug) {
 #ifdef CONFIG_DEBUG_BUGVERBOSE
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
 		file = bug->file;
+#else
+		file = (const char *)bug + bug->file_disp;
+#endif
 		line = bug->line;
 #endif
 		warning = (bug->flags & BUGFLAG_WARNING) != 0;
-- 
cgit v1.3-14-g43fede


From 8c5df16bec8a60bb8589fc232b9e26cac0ed4b2c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 16 Dec 2008 12:17:26 -0800
Subject: swiotlb: allow architectures to override swiotlb pool allocation

Impact: generalize swiotlb allocation code

Architectures may need to allocate memory specially for use with
the swiotlb.  Create the weak functions swiotlb_alloc_boot() and
swiotlb_alloc(), defaulting to the current behaviour.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h |  3 +++
 lib/swiotlb.c           | 16 +++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b18ec5533e8c..b8c5fc766a56 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -10,6 +10,9 @@ struct scatterlist;
 extern void
 swiotlb_init(void);
 
+extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs);
+extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
+
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			dma_addr_t *dma_handle, gfp_t flags);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 5f6c629a924d..abecb2857556 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/swiotlb.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
@@ -126,6 +127,16 @@ setup_io_tlb_npages(char *str)
 __setup("swiotlb=", setup_io_tlb_npages);
 /* make io_tlb_overflow tunable too? */
 
+void * __weak swiotlb_alloc_boot(size_t size, unsigned long nslabs)
+{
+	return alloc_bootmem_low_pages(size);
+}
+
+void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs)
+{
+	return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
+}
+
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -145,7 +156,7 @@ swiotlb_init_with_default_size(size_t default_size)
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	io_tlb_start = alloc_bootmem_low_pages(bytes);
+	io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs);
 	if (!io_tlb_start)
 		panic("Cannot allocate SWIOTLB buffer");
 	io_tlb_end = io_tlb_start + bytes;
@@ -202,8 +213,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
 	bytes = io_tlb_nslabs << IO_TLB_SHIFT;
 
 	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
-		io_tlb_start = (char *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
-		                                        order);
+		io_tlb_start = swiotlb_alloc(order, io_tlb_nslabs);
 		if (io_tlb_start)
 			break;
 		order--;
-- 
cgit v1.3-14-g43fede


From 0016fdee927f7aa0f428494bcf11ae60c7470a02 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 16 Dec 2008 12:17:27 -0800
Subject: swiotlb: move some definitions to header

Impact: cleanup

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h | 14 ++++++++++++++
 lib/swiotlb.c           | 14 +-------------
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'lib')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b8c5fc766a56..58b996a642f9 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -7,6 +7,20 @@ struct device;
 struct dma_attrs;
 struct scatterlist;
 
+/*
+ * Maximum allowable number of contiguous slabs to map,
+ * must be a power of 2.  What is the appropriate value ?
+ * The complexity of {map,unmap}_single is linearly dependent on this value.
+ */
+#define IO_TLB_SEGSIZE	128
+
+
+/*
+ * log of the size of each IO TLB slab.  The number of slabs is command line
+ * controllable.
+ */
+#define IO_TLB_SHIFT 11
+
 extern void
 swiotlb_init(void);
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index abecb2857556..db724ba7ebf6 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -23,6 +23,7 @@
 #include <linux/spinlock.h>
 #include <linux/swiotlb.h>
 #include <linux/string.h>
+#include <linux/swiotlb.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
 
@@ -40,19 +41,6 @@
 #define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))
 #define SG_ENT_PHYS_ADDRESS(sg)	virt_to_bus(SG_ENT_VIRT_ADDRESS(sg))
 
-/*
- * Maximum allowable number of contiguous slabs to map,
- * must be a power of 2.  What is the appropriate value ?
- * The complexity of {map,unmap}_single is linearly dependent on this value.
- */
-#define IO_TLB_SEGSIZE	128
-
-/*
- * log of the size of each IO TLB slab.  The number of slabs is command line
- * controllable.
- */
-#define IO_TLB_SHIFT 11
-
 #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
 
 /*
-- 
cgit v1.3-14-g43fede


From a5ddde4a558b3bd1e3dc50e274a0db2ea7a8fd06 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 16 Dec 2008 12:17:29 -0800
Subject: swiotlb: add comment where we handle the overflow of a dma mask on 32
 bit

Impact: cleanup

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index db724ba7ebf6..1272b23e4769 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -301,6 +301,10 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 	start_dma_addr = virt_to_bus(io_tlb_start) & mask;
 
 	offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+
+	/*
+ 	 * Carefully handle integer overflow which can occur when mask == ~0UL.
+ 	 */
 	max_slots = mask + 1
 		    ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
 		    : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
-- 
cgit v1.3-14-g43fede


From e08e1f7adba522378e8d2ae941bf25443866136d Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 16 Dec 2008 12:17:30 -0800
Subject: swiotlb: allow architectures to override phys<->bus<->phys
 conversions

Impact: generalize phys<->bus<->phys conversions in the swiotlb code

Architectures may need to override these conversions. Implement
__weak hook points containing the default implementations.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h |  3 +++
 lib/swiotlb.c           | 52 ++++++++++++++++++++++++++++++++++---------------
 2 files changed, 39 insertions(+), 16 deletions(-)

(limited to 'lib')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 58b996a642f9..694f1839cbc0 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -27,6 +27,9 @@ swiotlb_init(void);
 extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs);
 extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
 
+extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address);
+extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
+
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			dma_addr_t *dma_handle, gfp_t flags);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1272b23e4769..3494263cdd9a 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -125,6 +125,26 @@ void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs)
 	return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
 }
 
+dma_addr_t __weak swiotlb_phys_to_bus(phys_addr_t paddr)
+{
+	return paddr;
+}
+
+phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr)
+{
+	return baddr;
+}
+
+static dma_addr_t swiotlb_virt_to_bus(volatile void *address)
+{
+	return swiotlb_phys_to_bus(virt_to_phys(address));
+}
+
+static void *swiotlb_bus_to_virt(dma_addr_t address)
+{
+	return phys_to_virt(swiotlb_bus_to_phys(address));
+}
+
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -168,7 +188,7 @@ swiotlb_init_with_default_size(size_t default_size)
 		panic("Cannot allocate SWIOTLB overflow buffer!\n");
 
 	printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
-	       virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
+	       swiotlb_virt_to_bus(io_tlb_start), swiotlb_virt_to_bus(io_tlb_end));
 }
 
 void __init
@@ -250,7 +270,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
 
 	printk(KERN_INFO "Placing %luMB software IO TLB between 0x%lx - "
 	       "0x%lx\n", bytes >> 20,
-	       virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
+	       swiotlb_virt_to_bus(io_tlb_start), swiotlb_virt_to_bus(io_tlb_end));
 
 	return 0;
 
@@ -298,7 +318,7 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 	unsigned long max_slots;
 
 	mask = dma_get_seg_boundary(hwdev);
-	start_dma_addr = virt_to_bus(io_tlb_start) & mask;
+	start_dma_addr = swiotlb_virt_to_bus(io_tlb_start) & mask;
 
 	offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
 
@@ -475,7 +495,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		dma_mask = hwdev->coherent_dma_mask;
 
 	ret = (void *)__get_free_pages(flags, order);
-	if (ret && !is_buffer_dma_capable(dma_mask, virt_to_bus(ret), size)) {
+	if (ret && !is_buffer_dma_capable(dma_mask, swiotlb_virt_to_bus(ret), size)) {
 		/*
 		 * The allocated memory isn't reachable by the device.
 		 * Fall back on swiotlb_map_single().
@@ -496,7 +516,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	}
 
 	memset(ret, 0, size);
-	dev_addr = virt_to_bus(ret);
+	dev_addr = swiotlb_virt_to_bus(ret);
 
 	/* Confirm address can be DMA'd by device */
 	if (!is_buffer_dma_capable(dma_mask, dev_addr, size)) {
@@ -556,7 +576,7 @@ dma_addr_t
 swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 			 int dir, struct dma_attrs *attrs)
 {
-	dma_addr_t dev_addr = virt_to_bus(ptr);
+	dma_addr_t dev_addr = swiotlb_virt_to_bus(ptr);
 	void *map;
 
 	BUG_ON(dir == DMA_NONE);
@@ -577,7 +597,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 		map = io_tlb_overflow_buffer;
 	}
 
-	dev_addr = virt_to_bus(map);
+	dev_addr = swiotlb_virt_to_bus(map);
 
 	/*
 	 * Ensure that the address returned is DMA'ble
@@ -607,7 +627,7 @@ void
 swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
 			   size_t size, int dir, struct dma_attrs *attrs)
 {
-	char *dma_addr = bus_to_virt(dev_addr);
+	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 	if (is_swiotlb_buffer(dma_addr))
@@ -637,7 +657,7 @@ static void
 swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 		    size_t size, int dir, int target)
 {
-	char *dma_addr = bus_to_virt(dev_addr);
+	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 	if (is_swiotlb_buffer(dma_addr))
@@ -668,7 +688,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
 			  unsigned long offset, size_t size,
 			  int dir, int target)
 {
-	char *dma_addr = bus_to_virt(dev_addr) + offset;
+	char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset;
 
 	BUG_ON(dir == DMA_NONE);
 	if (is_swiotlb_buffer(dma_addr))
@@ -724,7 +744,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 
 	for_each_sg(sgl, sg, nelems, i) {
 		addr = SG_ENT_VIRT_ADDRESS(sg);
-		dev_addr = virt_to_bus(addr);
+		dev_addr = swiotlb_virt_to_bus(addr);
 		if (swiotlb_force ||
 		    address_needs_mapping(hwdev, dev_addr, sg->length)) {
 			void *map = map_single(hwdev, addr, sg->length, dir);
@@ -737,7 +757,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 				sgl[0].dma_length = 0;
 				return 0;
 			}
-			sg->dma_address = virt_to_bus(map);
+			sg->dma_address = swiotlb_virt_to_bus(map);
 		} else
 			sg->dma_address = dev_addr;
 		sg->dma_length = sg->length;
@@ -768,7 +788,7 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 
 	for_each_sg(sgl, sg, nelems, i) {
 		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
-			unmap_single(hwdev, bus_to_virt(sg->dma_address),
+			unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				     sg->dma_length, dir);
 		else if (dir == DMA_FROM_DEVICE)
 			dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
@@ -801,7 +821,7 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
 
 	for_each_sg(sgl, sg, nelems, i) {
 		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
-			sync_single(hwdev, bus_to_virt(sg->dma_address),
+			sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				    sg->dma_length, dir, target);
 		else if (dir == DMA_FROM_DEVICE)
 			dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
@@ -825,7 +845,7 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
-	return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
+	return (dma_addr == swiotlb_virt_to_bus(io_tlb_overflow_buffer));
 }
 
 /*
@@ -837,7 +857,7 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 int
 swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-	return virt_to_bus(io_tlb_end - 1) <= mask;
+	return swiotlb_virt_to_bus(io_tlb_end - 1) <= mask;
 }
 
 EXPORT_SYMBOL(swiotlb_map_single);
-- 
cgit v1.3-14-g43fede


From b81ea27b2329bf44b30c427800954f845896d476 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 16 Dec 2008 12:17:31 -0800
Subject: swiotlb: add arch hook to force mapping

Impact: generalize the sw-IOTLB range checks

Some architectures require special rules to determine whether a range
needs mapping or not.  This adds a weak function for architectures to
override.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h |  2 ++
 lib/swiotlb.c           | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 694f1839cbc0..325af1de0351 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -30,6 +30,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
 extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address);
 extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
 
+extern int swiotlb_arch_range_needs_mapping(void *ptr, size_t size);
+
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			dma_addr_t *dma_handle, gfp_t flags);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 3494263cdd9a..d8b09051c455 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -145,6 +145,11 @@ static void *swiotlb_bus_to_virt(dma_addr_t address)
 	return phys_to_virt(swiotlb_bus_to_phys(address));
 }
 
+int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+{
+	return 0;
+}
+
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -297,6 +302,11 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
 	return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
 }
 
+static inline int range_needs_mapping(void *ptr, size_t size)
+{
+	return swiotlb_force || swiotlb_arch_range_needs_mapping(ptr, size);
+}
+
 static int is_swiotlb_buffer(char *addr)
 {
 	return addr >= io_tlb_start && addr < io_tlb_end;
@@ -585,7 +595,8 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 	 * we can safely return the device addr and not worry about bounce
 	 * buffering it.
 	 */
-	if (!address_needs_mapping(hwdev, dev_addr, size) && !swiotlb_force)
+	if (!address_needs_mapping(hwdev, dev_addr, size) &&
+	    !range_needs_mapping(ptr, size))
 		return dev_addr;
 
 	/*
@@ -745,7 +756,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 	for_each_sg(sgl, sg, nelems, i) {
 		addr = SG_ENT_VIRT_ADDRESS(sg);
 		dev_addr = swiotlb_virt_to_bus(addr);
-		if (swiotlb_force ||
+		if (range_needs_mapping(sg_virt(sg), sg->length) ||
 		    address_needs_mapping(hwdev, dev_addr, sg->length)) {
 			void *map = map_single(hwdev, addr, sg->length, dir);
 			if (!map) {
-- 
cgit v1.3-14-g43fede


From 1b548f667c1487d92e794a9f7a67788f49b952d8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 16 Dec 2008 12:17:32 -0800
Subject: swiotlb: factor out copy to/from device

Impact: generalize IO bounce memcpys

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index d8b09051c455..a0b4039e2880 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -312,6 +312,15 @@ static int is_swiotlb_buffer(char *addr)
 	return addr >= io_tlb_start && addr < io_tlb_end;
 }
 
+static void
+__sync_single(char *buffer, char *dma_addr, size_t size, int dir)
+{
+	if (dir == DMA_TO_DEVICE)
+		memcpy(dma_addr, buffer, size);
+	else
+		memcpy(buffer, dma_addr, size);
+}
+
 /*
  * Allocates bounce buffer and returns its kernel virtual address.
  */
@@ -413,7 +422,7 @@ found:
 	for (i = 0; i < nslots; i++)
 		io_tlb_orig_addr[index+i] = buffer + (i << IO_TLB_SHIFT);
 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
-		memcpy(dma_addr, buffer, size);
+		__sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
 
 	return dma_addr;
 }
@@ -437,7 +446,7 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 		 * bounce... copy the data back into the original buffer * and
 		 * delete the bounce buffer.
 		 */
-		memcpy(buffer, dma_addr, size);
+		__sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
 
 	/*
 	 * Return the buffer to the free list by setting the corresponding
@@ -477,13 +486,13 @@ sync_single(struct device *hwdev, char *dma_addr, size_t size,
 	switch (target) {
 	case SYNC_FOR_CPU:
 		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-			memcpy(buffer, dma_addr, size);
+			__sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
 		else
 			BUG_ON(dir != DMA_TO_DEVICE);
 		break;
 	case SYNC_FOR_DEVICE:
 		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-			memcpy(dma_addr, buffer, size);
+			__sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
 		else
 			BUG_ON(dir != DMA_FROM_DEVICE);
 		break;
-- 
cgit v1.3-14-g43fede


From ef9b189352f2eb78f14e52996f4780a523b04a49 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 16 Dec 2008 12:17:33 -0800
Subject: swiotlb: support bouncing of HighMem pages

Impact: prepare the swiotlb code for HighMem struct pages

This requires us to treat DMA regions in terms of page+offset rather
than virtual addressing since a HighMem page may not have a mapping.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 122 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 33 deletions(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index a0b4039e2880..1661af593914 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -26,6 +26,7 @@
 #include <linux/swiotlb.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
+#include <linux/highmem.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -38,9 +39,6 @@
 #define OFFSET(val,align) ((unsigned long)	\
 	                   ( (val) & ( (align) - 1)))
 
-#define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))
-#define SG_ENT_PHYS_ADDRESS(sg)	virt_to_bus(SG_ENT_VIRT_ADDRESS(sg))
-
 #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
 
 /*
@@ -91,7 +89,10 @@ static unsigned int io_tlb_index;
  * We need to save away the original address corresponding to a mapped entry
  * for the sync operations.
  */
-static unsigned char **io_tlb_orig_addr;
+static struct swiotlb_phys_addr {
+	struct page *page;
+	unsigned int offset;
+} *io_tlb_orig_addr;
 
 /*
  * Protect the above data structures in the map and unmap calls
@@ -150,6 +151,11 @@ int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
 	return 0;
 }
 
+static dma_addr_t swiotlb_sg_to_bus(struct scatterlist *sg)
+{
+	return swiotlb_phys_to_bus(page_to_phys(sg_page(sg)) + sg->offset);
+}
+
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -183,7 +189,7 @@ swiotlb_init_with_default_size(size_t default_size)
 	for (i = 0; i < io_tlb_nslabs; i++)
  		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
 	io_tlb_index = 0;
-	io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *));
+	io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(struct swiotlb_phys_addr));
 
 	/*
 	 * Get the overflow emergency buffer
@@ -258,12 +264,12 @@ swiotlb_late_init_with_default_size(size_t default_size)
  		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
 	io_tlb_index = 0;
 
-	io_tlb_orig_addr = (unsigned char **)__get_free_pages(GFP_KERNEL,
-	                           get_order(io_tlb_nslabs * sizeof(char *)));
+	io_tlb_orig_addr = (struct swiotlb_phys_addr *)__get_free_pages(GFP_KERNEL,
+	                           get_order(io_tlb_nslabs * sizeof(struct swiotlb_phys_addr)));
 	if (!io_tlb_orig_addr)
 		goto cleanup3;
 
-	memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(char *));
+	memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(struct swiotlb_phys_addr));
 
 	/*
 	 * Get the overflow emergency buffer
@@ -312,20 +318,59 @@ static int is_swiotlb_buffer(char *addr)
 	return addr >= io_tlb_start && addr < io_tlb_end;
 }
 
-static void
-__sync_single(char *buffer, char *dma_addr, size_t size, int dir)
+static struct swiotlb_phys_addr swiotlb_bus_to_phys_addr(char *dma_addr)
 {
-	if (dir == DMA_TO_DEVICE)
-		memcpy(dma_addr, buffer, size);
-	else
-		memcpy(buffer, dma_addr, size);
+	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+	struct swiotlb_phys_addr buffer = io_tlb_orig_addr[index];
+	buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
+	buffer.page += buffer.offset >> PAGE_SHIFT;
+	buffer.offset &= PAGE_SIZE - 1;
+	return buffer;
+}
+
+static void
+__sync_single(struct swiotlb_phys_addr buffer, char *dma_addr, size_t size, int dir)
+{
+	if (PageHighMem(buffer.page)) {
+		size_t len, bytes;
+		char *dev, *host, *kmp;
+
+		len = size;
+		while (len != 0) {
+			unsigned long flags;
+
+			bytes = len;
+			if ((bytes + buffer.offset) > PAGE_SIZE)
+				bytes = PAGE_SIZE - buffer.offset;
+			local_irq_save(flags); /* protects KM_BOUNCE_READ */
+			kmp  = kmap_atomic(buffer.page, KM_BOUNCE_READ);
+			dev  = dma_addr + size - len;
+			host = kmp + buffer.offset;
+			if (dir == DMA_FROM_DEVICE)
+				memcpy(host, dev, bytes);
+			else
+				memcpy(dev, host, bytes);
+			kunmap_atomic(kmp, KM_BOUNCE_READ);
+			local_irq_restore(flags);
+			len -= bytes;
+			buffer.page++;
+			buffer.offset = 0;
+		}
+	} else {
+		void *v = page_address(buffer.page) + buffer.offset;
+
+		if (dir == DMA_TO_DEVICE)
+			memcpy(dma_addr, v, size);
+		else
+			memcpy(v, dma_addr, size);
+	}
 }
 
 /*
  * Allocates bounce buffer and returns its kernel virtual address.
  */
 static void *
-map_single(struct device *hwdev, char *buffer, size_t size, int dir)
+map_single(struct device *hwdev, struct swiotlb_phys_addr buffer, size_t size, int dir)
 {
 	unsigned long flags;
 	char *dma_addr;
@@ -335,6 +380,7 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 	unsigned long mask;
 	unsigned long offset_slots;
 	unsigned long max_slots;
+	struct swiotlb_phys_addr slot_buf;
 
 	mask = dma_get_seg_boundary(hwdev);
 	start_dma_addr = swiotlb_virt_to_bus(io_tlb_start) & mask;
@@ -419,8 +465,13 @@ found:
 	 * This is needed when we sync the memory.  Then we sync the buffer if
 	 * needed.
 	 */
-	for (i = 0; i < nslots; i++)
-		io_tlb_orig_addr[index+i] = buffer + (i << IO_TLB_SHIFT);
+	slot_buf = buffer;
+	for (i = 0; i < nslots; i++) {
+		slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
+		slot_buf.offset &= PAGE_SIZE - 1;
+		io_tlb_orig_addr[index+i] = slot_buf;
+		slot_buf.offset += 1 << IO_TLB_SHIFT;
+	}
 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
 		__sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
 
@@ -436,12 +487,12 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 	unsigned long flags;
 	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
 	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
-	char *buffer = io_tlb_orig_addr[index];
+	struct swiotlb_phys_addr buffer = swiotlb_bus_to_phys_addr(dma_addr);
 
 	/*
 	 * First, sync the memory before unmapping the entry
 	 */
-	if (buffer && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+	if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
 		/*
 		 * bounce... copy the data back into the original buffer * and
 		 * delete the bounce buffer.
@@ -478,10 +529,7 @@ static void
 sync_single(struct device *hwdev, char *dma_addr, size_t size,
 	    int dir, int target)
 {
-	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
-	char *buffer = io_tlb_orig_addr[index];
-
-	buffer += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
+	struct swiotlb_phys_addr buffer = swiotlb_bus_to_phys_addr(dma_addr);
 
 	switch (target) {
 	case SYNC_FOR_CPU:
@@ -529,7 +577,10 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		 * swiotlb_map_single(), which will grab memory from
 		 * the lowest available address range.
 		 */
-		ret = map_single(hwdev, NULL, size, DMA_FROM_DEVICE);
+		struct swiotlb_phys_addr buffer;
+		buffer.page = virt_to_page(NULL);
+		buffer.offset = 0;
+		ret = map_single(hwdev, buffer, size, DMA_FROM_DEVICE);
 		if (!ret)
 			return NULL;
 	}
@@ -597,6 +648,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 {
 	dma_addr_t dev_addr = swiotlb_virt_to_bus(ptr);
 	void *map;
+	struct swiotlb_phys_addr buffer;
 
 	BUG_ON(dir == DMA_NONE);
 	/*
@@ -611,7 +663,9 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 	/*
 	 * Oh well, have to allocate and map a bounce buffer.
 	 */
-	map = map_single(hwdev, ptr, size, dir);
+	buffer.page   = virt_to_page(ptr);
+	buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
+	map = map_single(hwdev, buffer, size, dir);
 	if (!map) {
 		swiotlb_full(hwdev, size, dir, 1);
 		map = io_tlb_overflow_buffer;
@@ -756,18 +810,20 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 		     int dir, struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
-	void *addr;
+	struct swiotlb_phys_addr buffer;
 	dma_addr_t dev_addr;
 	int i;
 
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		addr = SG_ENT_VIRT_ADDRESS(sg);
-		dev_addr = swiotlb_virt_to_bus(addr);
+		dev_addr = swiotlb_sg_to_bus(sg);
 		if (range_needs_mapping(sg_virt(sg), sg->length) ||
 		    address_needs_mapping(hwdev, dev_addr, sg->length)) {
-			void *map = map_single(hwdev, addr, sg->length, dir);
+			void *map;
+			buffer.page   = sg_page(sg);
+			buffer.offset = sg->offset;
+			map = map_single(hwdev, buffer, sg->length, dir);
 			if (!map) {
 				/* Don't panic here, we expect map_sg users
 				   to do proper error handling. */
@@ -807,11 +863,11 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
+		if (sg->dma_address != swiotlb_sg_to_bus(sg))
 			unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				     sg->dma_length, dir);
 		else if (dir == DMA_FROM_DEVICE)
-			dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
+			dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
 	}
 }
 EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
@@ -840,11 +896,11 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
+		if (sg->dma_address != swiotlb_sg_to_bus(sg))
 			sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				    sg->dma_length, dir, target);
 		else if (dir == DMA_FROM_DEVICE)
-			dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
+			dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From 2e5b2b86b65fe36bb8401e5608eb0b7603aa1cab Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 16 Dec 2008 12:17:34 -0800
Subject: swiotlb: consolidate swiotlb info message printing

Impact: clean up swiotlb printks

Remove duplicated swiotlb info printing, and make it more detailed.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1661af593914..fa2dc4e5f9ba 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -156,6 +156,32 @@ static dma_addr_t swiotlb_sg_to_bus(struct scatterlist *sg)
 	return swiotlb_phys_to_bus(page_to_phys(sg_page(sg)) + sg->offset);
 }
 
+static void swiotlb_print_info(unsigned long bytes)
+{
+	phys_addr_t pstart, pend;
+	dma_addr_t bstart, bend;
+
+	pstart = virt_to_phys(io_tlb_start);
+	pend = virt_to_phys(io_tlb_end);
+
+	bstart = swiotlb_phys_to_bus(pstart);
+	bend = swiotlb_phys_to_bus(pend);
+
+	printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
+	       bytes >> 20, io_tlb_start, io_tlb_end);
+	if (pstart != bstart || pend != bend)
+		printk(KERN_INFO "software IO TLB at phys %#llx - %#llx"
+		       " bus %#llx - %#llx\n",
+		       (unsigned long long)pstart,
+		       (unsigned long long)pend,
+		       (unsigned long long)bstart,
+		       (unsigned long long)bend);
+	else
+		printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
+		       (unsigned long long)pstart,
+		       (unsigned long long)pend);
+}
+
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -198,8 +224,7 @@ swiotlb_init_with_default_size(size_t default_size)
 	if (!io_tlb_overflow_buffer)
 		panic("Cannot allocate SWIOTLB overflow buffer!\n");
 
-	printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
-	       swiotlb_virt_to_bus(io_tlb_start), swiotlb_virt_to_bus(io_tlb_end));
+	swiotlb_print_info(bytes);
 }
 
 void __init
@@ -279,9 +304,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
 	if (!io_tlb_overflow_buffer)
 		goto cleanup4;
 
-	printk(KERN_INFO "Placing %luMB software IO TLB between 0x%lx - "
-	       "0x%lx\n", bytes >> 20,
-	       swiotlb_virt_to_bus(io_tlb_start), swiotlb_virt_to_bus(io_tlb_end));
+	swiotlb_print_info(bytes);
 
 	return 0;
 
-- 
cgit v1.3-14-g43fede


From 64db4cfff99c04cd5f550357edcc8780f96b54a2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 21:55:32 +0100
Subject: "Tree RCU": scalable classic RCU implementation

This patch fixes a long-standing performance bug in classic RCU that
results in massive internal-to-RCU lock contention on systems with
more than a few hundred CPUs.  Although this patch creates a separate
flavor of RCU for ease of review and patch maintenance, it is intended
to replace classic RCU.

This patch still handles stress better than does mainline, so I am still
calling it ready for inclusion.  This patch is against the -tip tree.
Nevertheless, experience on an actual 1000+ CPU machine would still be
most welcome.

Most of the changes noted below were found while creating an rcutiny
(which should permit ejecting the current rcuclassic) and while doing
detailed line-by-line documentation.

Updates from v9 (http://lkml.org/lkml/2008/12/2/334):

o	Fixes from remainder of line-by-line code walkthrough,
	including comment spelling, initialization, undesirable
	narrowing due to type conversion, removing redundant memory
	barriers, removing redundant local-variable initialization,
	and removing redundant local variables.

	I do not believe that any of these fixes address the CPU-hotplug
	issues that Andi Kleen was seeing, but please do give it a whirl
	in case the machine is smarter than I am.

	A writeup from the walkthrough may be found at the following
	URL, in case you are suffering from terminal insomnia or
	masochism:

	http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf

o	Made rcutree tracing use seq_file, as suggested some time
	ago by Lai Jiangshan.

o	Added a .csv variant of the rcudata debugfs trace file, to allow
	people having thousands of CPUs to drop the data into
	a spreadsheet.	Tested with oocalc and gnumeric.  Updated
	documentation to suit.

Updates from v8 (http://lkml.org/lkml/2008/11/15/139):

o	Fix a theoretical race between grace-period initialization and
	force_quiescent_state() that could occur if more than three
	jiffies were required to carry out the grace-period
	initialization.  Which it might, if you had enough CPUs.

o	Apply Ingo's printk-standardization patch.

o	Substitute local variables for repeated accesses to global
	variables.

o	Fix comment misspellings and redundant (but harmless) increments
	of ->n_rcu_pending (this latter after having explicitly added it).

o	Apply checkpatch fixes.

Updates from v7 (http://lkml.org/lkml/2008/10/10/291):

o	Fixed a number of problems noted by Gautham Shenoy, including
	the cpu-stall-detection bug that he was having difficulty
	convincing me was real.  ;-)

o	Changed cpu-stall detection to wait for ten seconds rather than
	three in order to reduce false positives, as suggested by Ingo
	Molnar.

o	Produced a design document (http://lwn.net/Articles/305782/).
	The act of writing this document uncovered a number of both
	theoretical and "here and now" bugs as noted below.

o	Fix dynticks_nesting accounting confusion, simplify WARN_ON()
	condition, fix kerneldoc comments, and add memory barriers
	in dynticks interface functions.

o	Add more data to tracing.

o	Remove unused "rcu_barrier" field from rcu_data structure.

o	Count calls to rcu_pending() from scheduling-clock interrupt
	to use as a surrogate timebase should jiffies stop counting.

o	Fix a theoretical race between force_quiescent_state() and
	grace-period initialization.  Yes, initialization does have to
	go on for some jiffies for this race to occur, but given enough
	CPUs...

Updates from v6 (http://lkml.org/lkml/2008/9/23/448):

o	Fix a number of checkpatch.pl complaints.

o	Apply review comments from Ingo Molnar and Lai Jiangshan
	on the stall-detection code.

o	Fix several bugs in !CONFIG_SMP builds.

o	Fix a misspelled config-parameter name so that RCU now announces
	at boot time if stall detection is configured.

o	Run tests on numerous combinations of configuration parameters,
	which, after the fixes above, now build and run correctly.

Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line):

o	Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a
	changeset some time ago, and finally got around to retesting
	this option).

o	Fix some tracing bugs in rcupreempt that caused incorrect
	totals to be printed.

o	I now test with a more brutal random-selection online/offline
	script (attached).  Probably more brutal than it needs to be
	on the people reading it as well, but so it goes.

o	A number of optimizations and usability improvements:

	o	Make rcu_pending() ignore the grace-period timeout when
		there is no grace period in progress.

	o	Make force_quiescent_state() avoid going for a global
		lock in the case where there is no grace period in
		progress.

	o	Rearrange struct fields to improve struct layout.

	o	Make call_rcu() initiate a grace period if RCU was
		idle, rather than waiting for the next scheduling
		clock interrupt.

	o	Invoke rcu_irq_enter() and rcu_irq_exit() only when
		idle, as suggested by Andi Kleen.  I still don't
		completely trust this change, and might back it out.

	o	Make CONFIG_RCU_TRACE be the single config variable
		manipulated for all forms of RCU, instead of the prior
		confusion.

	o	Document tracing files and formats for both rcupreempt
		and rcutree.

Updates from v4 for those missing v5 given its bad subject line:

o	Separated dynticks interface so that NMIs and irqs call separate
	functions, greatly simplifying it.  In particular, this code
	no longer requires a proof of correctness.  ;-)

o	Separated dynticks state out into its own per-CPU structure,
	avoiding the duplicated accounting.

o	The case where a dynticks-idle CPU runs an irq handler that
	invokes call_rcu() is now correctly handled, forcing that CPU
	out of dynticks-idle mode.

o	Review comments have been applied (thank you all!!!).
	For but one example, fixed the dynticks-ordering issue that
	Manfred pointed out, saving me much debugging.  ;-)

o	Adjusted rcuclassic and rcupreempt to handle dynticks changes.

Attached is an updated patch to Classic RCU that applies a hierarchy,
greatly reducing the contention on the top-level lock for large machines.
This passes 10-hour concurrent rcutorture and online-offline testing on
128-CPU ppc64 without dynticks enabled, and exposes some timekeeping
bugs in presence of dynticks (exciting working on a system where
"sleep 1" hangs until interrupted...), which were fixed in the
2.6.27 kernel.  It is getting more reliable than mainline by some
measures, so the next version will be against -tip for inclusion.
See also Manfred Spraul's recent patches (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
We will converge onto a common patch in the fullness of time, but are
currently exploring different regions of the design space.  That said,
I have already gratefully stolen quite a few of Manfred's ideas.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 16,384 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	More bugs will probably surface as a result of an ongoing
	line-by-line code inspection.

	Patches will be provided as required.

o	There are probably hangs, rcutorture failures, &c.  Seems
	quite stable on a 128-CPU machine, but that is kind of small
	compared to 4096 CPUs.  However, seems to do better than
	mainline.

	Patches will be provided as required.

o	The memory footprint of this version is several KB larger
	than rcuclassic.

	A separate UP-only rcutiny patch will be provided, which will
	reduce the memory footprint significantly, even compared
	to the old rcuclassic.  One such patch passes light testing,
	and has a memory footprint smaller even than rcuclassic.
	Initial reaction from various embedded guys was "it is not
	worth it", so am putting it aside.

Credits:

o	Manfred Spraul for ideas, review comments, and bugs spotted,
	as well as some good friendly competition.  ;-)

o	Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton
	for reviews and comments.

o	Thomas Gleixner for much-needed help with some timer issues
	(see patches below).

o	Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos,
	Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton
	Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines
	alive despite my heavy abuse^Wtesting.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/RCU/00-INDEX             |    2 +
 Documentation/RCU/trace.txt            |  413 +++++++++
 arch/powerpc/platforms/pseries/rtasd.c |    4 +
 include/linux/hardirq.h                |   14 +-
 include/linux/rcupdate.h               |   10 +-
 include/linux/rcutree.h                |  329 +++++++
 init/Kconfig                           |   18 +-
 kernel/Kconfig.preempt                 |   62 +-
 kernel/Makefile                        |    6 +-
 kernel/rcupreempt.c                    |   10 +
 kernel/rcupreempt_trace.c              |   10 +-
 kernel/rcutree.c                       | 1535 ++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c                 |  271 ++++++
 kernel/softirq.c                       |    5 +-
 lib/Kconfig.debug                      |   13 +
 15 files changed, 2671 insertions(+), 31 deletions(-)
 create mode 100644 Documentation/RCU/trace.txt
 create mode 100644 include/linux/rcutree.h
 create mode 100644 kernel/rcutree.c
 create mode 100644 kernel/rcutree_trace.c

(limited to 'lib')

diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 461481dfb7c3..7dc0695a8f90 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -16,6 +16,8 @@ RTFP.txt
 	- List of RCU papers (bibliography) going back to 1980.
 torture.txt
 	- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
+trace.txt
+	- CONFIG_RCU_TRACE debugfs files and formats
 UP.txt
 	- RCU on Uniprocessor Systems
 whatisRCU.txt
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
new file mode 100644
index 000000000000..068848240a8b
--- /dev/null
+++ b/Documentation/RCU/trace.txt
@@ -0,0 +1,413 @@
+CONFIG_RCU_TRACE debugfs Files and Formats
+
+
+The rcupreempt and rcutree implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+Note that the rcuclassic implementation of RCU does not provide debugfs
+trace output.
+
+The following sections describe the debugfs files and formats for
+preemptable RCU (rcupreempt) and hierarchical RCU (rcutree).
+
+
+Preemptable RCU debugfs Files and Formats
+
+This implementation of RCU provides three debugfs files under the
+top-level directory RCU: rcu/rcuctrs (which displays the per-CPU
+counters used by preemptable RCU) rcu/rcugp (which displays grace-period
+counters), and rcu/rcustats (which displays internal counters for debugging RCU).
+
+The output of "cat rcu/rcuctrs" looks as follows:
+
+CPU last cur F M
+  0    5  -5 0 0
+  1   -1   0 0 0
+  2    0   1 0 0
+  3    0   1 0 0
+  4    0   1 0 0
+  5    0   1 0 0
+  6    0   2 0 0
+  7    0  -1 0 0
+  8    0   1 0 0
+ggp = 26226, state = waitzero
+
+The per-CPU fields are as follows:
+
+o	"CPU" gives the CPU number.  Offline CPUs are not displayed.
+
+o	"last" gives the value of the counter that is being decremented
+	for the current grace period phase.  In the example above,
+	the counters sum to 4, indicating that there are four
+	RCU read-side critical sections still running that started
+	before the last counter flip.
+
+o	"cur" gives the value of the counter that is currently being
+	both incremented (by rcu_read_lock()) and decremented (by
+	rcu_read_unlock()).  In the example above, the counters sum to
+	1, indicating that there is only one RCU read-side critical section
+	still running that started after the last counter flip.
+
+o	"F" indicates whether RCU is waiting for this CPU to acknowledge
+	a counter flip.  In the above example, RCU is not waiting on any,
+	which is consistent with the state being "waitzero" rather than
+	"waitack".
+
+o	"M" indicates whether RCU is waiting for this CPU to execute a
+	memory barrier.  In the above example, RCU is not waiting on any,
+	which is consistent with the state being "waitzero" rather than
+	"waitmb".
+
+o	"ggp" is the global grace-period counter.
+
+o	"state" is the RCU state, which can be one of the following:
+
+	o	"idle": there is no grace period in progress.
+
+	o	"waitack": RCU just incremented the global grace-period
+		counter, which has the effect of reversing the roles of
+		the "last" and "cur" counters above, and is waiting for
+		all the CPUs to acknowledge the flip.  Once the flip has
+		been acknowledged, CPUs will no longer be incrementing
+		what are now the "last" counters, so that their sum will
+		decrease monotonically down to zero.
+
+	o	"waitzero": RCU is waiting for the sum of the "last" counters
+		to decrease to zero.
+
+	o	"waitmb": RCU is waiting for each CPU to execute a memory
+		barrier, which ensures that instructions from a given CPU's
+		last RCU read-side critical section cannot be reordered
+		with instructions following the memory-barrier instruction.
+
+The output of "cat rcu/rcugp" looks as follows:
+
+oldggp=48870  newggp=48873
+
+Note that reading from this file provokes a synchronize_rcu().  The
+"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before
+executing the synchronize_rcu(), and the "newggp" value is also the
+"ggp" value, but taken after the synchronize_rcu() command returns.
+
+
+The output of "cat rcu/rcustats" looks as follows:
+
+na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871
+1=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640
+z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639
+
+These are counters tracking internal preemptable-RCU events; however,
+some of them may be useful for debugging algorithms using RCU.  In
+particular, the "nl", "wl", and "dl" values track the number of RCU
+callbacks in various states.  The fields are as follows:
+
+o	"na" is the total number of RCU callbacks that have been enqueued
+	since boot.
+
+o	"nl" is the number of RCU callbacks waiting for the previous
+	grace period to end so that they can start waiting on the next
+	grace period.
+
+o	"wa" is the total number of RCU callbacks that have started waiting
+	for a grace period since boot.  "na" should be roughly equal to
+	"nl" plus "wa".
+
+o	"wl" is the number of RCU callbacks currently waiting for their
+	grace period to end.
+
+o	"da" is the total number of RCU callbacks whose grace periods
+	have completed since boot.  "wa" should be roughly equal to
+	"wl" plus "da".
+
+o	"dr" is the total number of RCU callbacks that have been removed
+	from the list of callbacks ready to invoke.  "dr" should be roughly
+	equal to "da".
+
+o	"di" is the total number of RCU callbacks that have been invoked
+	since boot.  "di" should be roughly equal to "da", though some
+	early versions of preemptable RCU had a bug so that only the
+	last CPU's count of invocations was displayed, rather than the
+	sum of all CPU's counts.
+
+o	"1" is the number of calls to rcu_try_flip().  This should be
+	roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1"
+	described below.  In other words, the number of times that
+	the state machine is visited should be equal to the sum of the
+	number of times that each state is visited plus the number of
+	times that the state-machine lock acquisition failed.
+
+o	"e1" is the number of times that rcu_try_flip() was unable to
+	acquire the fliplock.
+
+o	"i1" is the number of calls to rcu_try_flip_idle().
+
+o	"ie1" is the number of times rcu_try_flip_idle() exited early
+	due to the calling CPU having no work for RCU.
+
+o	"g1" is the number of times that rcu_try_flip_idle() decided
+	to start a new grace period.  "i1" should be roughly equal to
+	"ie1" plus "g1".
+
+o	"a1" is the number of calls to rcu_try_flip_waitack().
+
+o	"ae1" is the number of times that rcu_try_flip_waitack() found
+	that at least one CPU had not yet acknowledged the new grace period
+	(AKA "counter flip").
+
+o	"a2" is the number of times rcu_try_flip_waitack() found that
+	all CPUs had acknowledged.  "a1" should be roughly equal to
+	"ae1" plus "a2".  (This particular output was collected on
+	a 128-CPU machine, hence the smaller-than-usual fraction of
+	calls to rcu_try_flip_waitack() finding all CPUs having already
+	acknowledged.)
+
+o	"z1" is the number of calls to rcu_try_flip_waitzero().
+
+o	"ze1" is the number of times that rcu_try_flip_waitzero() found
+	that not all of the old RCU read-side critical sections had
+	completed.
+
+o	"z2" is the number of times that rcu_try_flip_waitzero() finds
+	the sum of the counters equal to zero, in other words, that
+	all of the old RCU read-side critical sections had completed.
+	The value of "z1" should be roughly equal to "ze1" plus
+	"z2".
+
+o	"m1" is the number of calls to rcu_try_flip_waitmb().
+
+o	"me1" is the number of times that rcu_try_flip_waitmb() finds
+	that at least one CPU has not yet executed a memory barrier.
+
+o	"m2" is the number of times that rcu_try_flip_waitmb() finds that
+	all CPUs have executed a memory barrier.
+
+
+Hierarchical RCU debugfs Files and Formats
+
+This implementation of RCU provides three debugfs files under the
+top-level directory RCU: rcu/rcudata (which displays fields in struct
+rcu_data), rcu/rcugp (which displays grace-period counters), and
+rcu/rcuhier (which displays the struct rcu_node hierarchy).
+
+The output of "cat rcu/rcudata" looks as follows:
+
+rcu:
+  0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10
+  1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10
+  2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10
+  3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10
+  4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10
+  5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10
+  6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10
+  7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10
+rcu_bh:
+  0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10
+  1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10
+  2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10
+  3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10
+  4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
+  5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
+  6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
+  7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
+
+The first section lists the rcu_data structures for rcu, the second for
+rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
+The fields are as follows:
+
+o	The number at the beginning of each line is the CPU number.
+	CPU numbers followed by an exclamation mark are offline,
+	but have been online at least once since boot.	There will be
+	no output for CPUs that have never been online, which can be
+	a good thing in the surprisingly common case where NR_CPUS is
+	substantially larger than the number of actual CPUs.
+
+o	"c" is the count of grace periods that this CPU believes have
+	completed.  CPUs in dynticks idle mode may lag quite a ways
+	behind, for example, CPU 4 under "rcu" above, which has slept
+	through the past 25 RCU grace periods.	It is not unusual to
+	see CPUs lagging by thousands of grace periods.
+
+o	"g" is the count of grace periods that this CPU believes have
+	started.  Again, CPUs in dynticks idle mode may lag behind.
+	If the "c" and "g" values are equal, this CPU has already
+	reported a quiescent state for the last RCU grace period that
+	it is aware of, otherwise, the CPU believes that it owes RCU a
+	quiescent state.
+
+o	"pq" indicates that this CPU has passed through a quiescent state
+	for the current grace period.  It is possible for "pq" to be
+	"1" and "c" different than "g", which indicates that although
+	the CPU has passed through a quiescent state, either (1) this
+	CPU has not yet reported that fact, (2) some other CPU has not
+	yet reported for this grace period, or (3) both.
+
+o	"pqc" indicates which grace period the last-observed quiescent
+	state for this CPU corresponds to.  This is important for handling
+	the race between CPU 0 reporting an extended dynticks-idle
+	quiescent state for CPU 1 and CPU 1 suddenly waking up and
+	reporting its own quiescent state.  If CPU 1 was the last CPU
+	for the current grace period, then the CPU that loses this race
+	will attempt to incorrectly mark CPU 1 as having checked in for
+	the next grace period!
+
+o	"qp" indicates that RCU still expects a quiescent state from
+	this CPU.
+
+o	"rpfq" is the number of rcu_pending() calls on this CPU required
+	to induce this CPU to invoke force_quiescent_state().
+
+o	"rp" is the low-order four hex digits of the count of how many times
+	rcu_pending() has been invoked on this CPU.
+
+o	"dt" is the current value of the dyntick counter that is incremented
+	when entering or leaving dynticks idle state, either by the
+	scheduler or by irq.  The number after the "/" is the interrupt
+	nesting depth when in dyntick-idle state, or one greater than
+	the interrupt-nesting depth otherwise.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"dn" is the current value of the dyntick counter that is incremented
+	when entering or leaving dynticks idle state via NMI.  If both
+	the "dt" and "dn" values are even, then this CPU is in dynticks
+	idle mode and may be ignored by RCU.  If either of these two
+	counters is odd, then RCU must be alert to the possibility of
+	an RCU read-side critical section running on this CPU.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"df" is the number of times that some other CPU has forced a
+	quiescent state on behalf of this CPU due to this CPU being in
+	dynticks-idle state.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"of" is the number of times that some other CPU has forced a
+	quiescent state on behalf of this CPU due to this CPU being
+	offline.  In a perfect world, this might never happen, but it
+	turns out that offlining and onlining a CPU can take several grace
+	periods, and so there is likely to be an extended period of time
+	when RCU believes that the CPU is online when it really is not.
+	Please note that erring in the other direction (RCU believing a
+	CPU is offline when it is really alive and kicking) is a fatal
+	error, so it makes sense to err conservatively.
+
+o	"ri" is the number of times that RCU has seen fit to send a
+	reschedule IPI to this CPU in order to get it to report a
+	quiescent state.
+
+o	"ql" is the number of RCU callbacks currently residing on
+	this CPU.  This is the total number of callbacks, regardless
+	of what state they are in (new, waiting for grace period to
+	start, waiting for grace period to end, ready to invoke).
+
+o	"b" is the batch limit for this CPU.  If more than this number
+	of RCU callbacks is ready to invoke, then the remainder will
+	be deferred.
+
+
+The output of "cat rcu/rcugp" looks as follows:
+
+rcu: completed=33062  gpnum=33063
+rcu_bh: completed=464  gpnum=464
+
+Again, this output is for both "rcu" and "rcu_bh".  The fields are
+taken from the rcu_state structure, and are as follows:
+
+o	"completed" is the number of grace periods that have completed.
+	It is comparable to the "c" field from rcu/rcudata in that a
+	CPU whose "c" field matches the value of "completed" is aware
+	that the corresponding RCU grace period has completed.
+
+o	"gpnum" is the number of grace periods that have started.  It is
+	comparable to the "g" field from rcu/rcudata in that a CPU
+	whose "g" field matches the value of "gpnum" is aware that the
+	corresponding RCU grace period has started.
+
+	If these two fields are equal (as they are for "rcu_bh" above),
+	then there is no grace period in progress, in other words, RCU
+	is idle.  On the other hand, if the two fields differ (as they
+	do for "rcu" above), then an RCU grace period is in progress.
+
+
+The output of "cat rcu/rcuhier" looks as follows, with very long lines:
+
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
+1/1 0:127 ^0    
+3/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+3/3f 0:5 ^0    2/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3    
+rcu_bh:
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
+0/1 0:127 ^0    
+0/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+0/3f 0:5 ^0    0/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3
+
+This is once again split into "rcu" and "rcu_bh" portions.  The fields are
+as follows:
+
+o	"c" is exactly the same as "completed" under rcu/rcugp.
+
+o	"g" is exactly the same as "gpnum" under rcu/rcugp.
+
+o	"s" is the "signaled" state that drives force_quiescent_state()'s
+	state machine.
+
+o	"jfq" is the number of jiffies remaining for this grace period
+	before force_quiescent_state() is invoked to help push things
+	along.  Note that CPUs in dyntick-idle mode throughout the grace
+	period will not report on their own, but rather must be checked by
+	some other CPU via force_quiescent_state().
+
+o	"j" is the low-order four hex digits of the jiffies counter.
+	Yes, Paul did run into a number of problems that turned out to
+	be due to the jiffies counter no longer counting.  Why do you ask?
+
+o	"nfqs" is the number of calls to force_quiescent_state() since
+	boot.
+
+o	"nfqsng" is the number of useless calls to force_quiescent_state(),
+	where there wasn't actually a grace period active.  This can
+	happen due to races.  The number in parentheses is the difference
+	between "nfqs" and "nfqsng", or the number of times that
+	force_quiescent_state() actually did some real work.
+
+o	"fqlh" is the number of calls to force_quiescent_state() that
+	exited immediately (without even being counted in nfqs above)
+	due to contention on ->fqslock.
+
+o	Each element of the form "1/1 0:127 ^0" represents one struct
+	rcu_node.  Each line represents one level of the hierarchy, from
+	root to leaves.  It is best to think of the rcu_data structures
+	as forming yet another level after the leaves.  Note that there
+	might be either one, two, or three levels of rcu_node structures,
+	depending on the relationship between CONFIG_RCU_FANOUT and
+	CONFIG_NR_CPUS.
+	
+	o	The numbers separated by the "/" are the qsmask followed
+		by the qsmaskinit.  The qsmask will have one bit
+		set for each entity in the next lower level that
+		has not yet checked in for the current grace period.
+		The qsmaskinit will have one bit for each entity that is
+		currently expected to check in during each grace period.
+		The value of qsmaskinit is assigned to that of qsmask
+		at the beginning of each grace period.
+
+		For example, for "rcu", the qsmask of the first entry
+		of the lowest level is 0x3, meaning that we are still
+		waiting for CPUs 0 and 1 to check in for the current
+		grace period.
+
+	o	The numbers separated by the ":" are the range of CPUs
+		served by this struct rcu_node.  This can be helpful
+		in working out how the hierarchy is wired together.
+
+		For example, the first entry at the lowest level shows
+		"0:5", indicating that it covers CPUs 0 through 5.
+
+	o	The number after the "^" indicates the bit in the
+		next higher level rcu_node structure that this
+		rcu_node structure corresponds to.
+
+		For example, the first entry at the lowest level shows
+		"^0", indicating that it corresponds to bit zero in
+		the first entry at the middle level.
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
index f4e55be2eea9..afad9f5ac0ac 100644
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -208,6 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -227,6 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 	/* Check to see if we need to or have stopped logging */
 	if (fatal || !logging_enabled) {
 		logging_enabled = 0;
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -249,11 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		else
 			rtas_log_start += 1;
 
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		wake_up_interruptible(&rtas_log_wait);
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006cc94a0..9b70b9231693 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -118,13 +118,17 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
+extern void rcu_nmi_enter(void);
+extern void rcu_nmi_exit(void);
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
-#endif /* CONFIG_PREEMPT_RCU */
+# define rcu_nmi_enter() do { } while (0)
+# define rcu_nmi_exit() do { } while (0)
+#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
@@ -134,7 +138,6 @@ extern void rcu_irq_exit(void);
  */
 #define __irq_enter()					\
 	do {						\
-		rcu_irq_enter();			\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
@@ -153,7 +156,6 @@ extern void irq_enter(void);
 		trace_hardirq_exit();			\
 		account_system_vtime(current);		\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
-		rcu_irq_exit();				\
 	} while (0)
 
 /*
@@ -161,7 +163,7 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()		do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0)
+#define nmi_exit()		do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0)
 
 #endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 86f1f5e43e33..bfd289aff576 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,15 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
-#ifdef CONFIG_CLASSIC_RCU
+#if defined(CONFIG_CLASSIC_RCU)
 #include <linux/rcuclassic.h>
-#else /* #ifdef CONFIG_CLASSIC_RCU */
+#elif defined(CONFIG_TREE_RCU)
+#include <linux/rcutree.h>
+#elif defined(CONFIG_PREEMPT_RCU)
 #include <linux/rcupreempt.h>
-#endif /* #else #ifdef CONFIG_CLASSIC_RCU */
+#else
+#error "Unknown RCU implementation specified to kernel configuration"
+#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
 
 #define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
new file mode 100644
index 000000000000..d4368b7975c3
--- /dev/null
+++ b/include/linux/rcutree.h
@@ -0,0 +1,329 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+
+#ifndef __LINUX_RCUTREE_H
+#define __LINUX_RCUTREE_H
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif NR_CPUS <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif NR_CPUS <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+	int dynticks_nesting;	/* Track nesting level, sort of. */
+	int dynticks;		/* Even value for dynticks-idle, else odd. */
+	int dynticks_nmi;	/* Even value for either dynticks-idle or */
+				/*  not in nmi handler, else odd.  So this */
+				/*  remains even for nmi from irq handler. */
+};
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long qsmask;	/* CPUs or groups that need to switch in */
+				/*  order for current grace period to proceed.*/
+	unsigned long qsmaskinit;
+				/* Per-GP initialization for qsmask. */
+	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
+	int	grplo;		/* lowest-numbered CPU or group here. */
+	int	grphi;		/* highest-numbered CPU or group here. */
+	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	level;		/* root is at level 0. */
+	struct rcu_node *parent;
+} ____cacheline_internodealigned_in_smp;
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	bool		beenonline;	/* CPU online at least once. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
+	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
+
+	/* 2) batch handling */
+	/*
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries that batch # <= ->completed
+	 *	The grace period for these entries has completed, and
+	 *	the other grace-period-completed entries may be moved
+	 *	here temporarily in rcu_process_callbacks().
+	 */
+	struct rcu_head *nxtlist;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+#ifdef CONFIG_NO_HZ
+	/* 3) dynticks interface. */
+	struct rcu_dynticks *dynticks;	/* Shared per-CPU dynticks state. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+	int dynticks_nmi_snap;		/* Per-GP tracking for dynticks_nmi. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long resched_ipi;	/* Sent a resched IPI. */
+
+	/* 5) state to allow this CPU to force_quiescent_state on others */
+	long n_rcu_pending;		/* rcu_pending() calls since boot. */
+	long n_rcu_pending_force_qs;	/* when to force quiescent states. */
+
+	int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_GP_INIT		0	/* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK	1	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		2	/* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ)  /* for rsp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ)  /* for rsp->jiffies_stall */
+#define RCU_STALL_RAT_DELAY		2	  /* Allow other CPUs time */
+						  /*  to take at least one */
+						  /*  scheduling clock irq */
+						  /*  before ratting on them. */
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
+	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* Force QS state. */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+	spinlock_t fqslock;			/* Only one task forcing */
+						/*  quiescent states. */
+	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs;		/* Number of calls to */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs_lh;		/* ~Number of calls leaving */
+						/*  due to lock unavailable. */
+	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
+						/*  due to no GP active. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;			/* Time at which GP started, */
+						/*  but in jiffies. */
+	unsigned long jiffies_stall;		/* Time at which to check */
+						/*  for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+static inline void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+static inline void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+extern int rcu_pending(int cpu);
+extern int rcu_needs_cpu(int cpu);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire()	\
+			lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
+# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire()	do { } while (0)
+# define rcu_read_release()	do { } while (0)
+#endif
+
+static inline void __rcu_read_lock(void)
+{
+	preempt_disable();
+	__acquire(RCU);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock(void)
+{
+	rcu_read_release();
+	__release(RCU);
+	preempt_enable();
+}
+static inline void __rcu_read_lock_bh(void)
+{
+	local_bh_disable();
+	__acquire(RCU_BH);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock_bh(void)
+{
+	rcu_read_release();
+	__release(RCU_BH);
+	local_bh_enable();
+}
+
+#define __synchronize_sched() synchronize_rcu()
+
+#define call_rcu_sched(head, func) call_rcu(head, func)
+
+static inline void rcu_init_sched(void)
+{
+}
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
+
+#ifdef CONFIG_NO_HZ
+void rcu_enter_nohz(void);
+void rcu_exit_nohz(void);
+#else /* CONFIG_NO_HZ */
+static inline void rcu_enter_nohz(void)
+{
+}
+static inline void rcu_exit_nohz(void)
+{
+}
+#endif /* CONFIG_NO_HZ */
+
+#endif /* __LINUX_RCUTREE_H */
diff --git a/init/Kconfig b/init/Kconfig
index f763762d544a..9dd7958a71f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -928,10 +928,16 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
-config CLASSIC_RCU
-	def_bool !PREEMPT_RCU
+config TREE_RCU_TRACE
+	def_bool RCU_TRACE && TREE_RCU
+	select DEBUG_FS
 	help
-	  This option selects the classic RCU implementation that is
-	  designed for best read-side performance on non-realtime
-	  systems.  Classic RCU is the default.  Note that the
-	  PREEMPT_RCU symbol is used to select/deselect this option.
+	  This option provides tracing for the TREE_RCU implementation,
+	  permitting Makefile to trivially select kernel/rcutree_trace.c.
+
+config PREEMPT_RCU_TRACE
+	def_bool RCU_TRACE && PREEMPT_RCU
+	select DEBUG_FS
+	help
+	  This option provides tracing for the PREEMPT_RCU implementation,
+	  permitting Makefile to trivially select kernel/rcupreempt_trace.c.
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1fc..463f29743ea0 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,10 +52,29 @@ config PREEMPT
 
 endchoice
 
+choice
+	prompt "RCU Implementation"
+	default CLASSIC_RCU
+
+config CLASSIC_RCU
+	bool "Classic RCU"
+	help
+	  This option selects the classic RCU implementation that is
+	  designed for best read-side performance on non-realtime
+	  systems.
+	  
+	  Select this option if you are unsure.
+
+config TREE_RCU
+	bool "Tree-based hierarchical RCU"
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP systems with hundreds or
+	  thousands of CPUs.
+
 config PREEMPT_RCU
 	bool "Preemptible RCU"
 	depends on PREEMPT
-	default n
 	help
 	  This option reduces the latency of the kernel by making certain
 	  RCU sections preemptible. Normally RCU code is non-preemptible, if
@@ -64,16 +83,47 @@ config PREEMPT_RCU
 	  now-naive assumptions about each RCU read-side critical section
 	  remaining on a given CPU through its execution.
 
-	  Say N if you are unsure.
+endchoice
 
 config RCU_TRACE
-	bool "Enable tracing for RCU - currently stats in debugfs"
-	depends on PREEMPT_RCU
-	select DEBUG_FS
-	default y
+	bool "Enable tracing for RCU"
+	depends on TREE_RCU || PREEMPT_RCU
 	help
 	  This option provides tracing in RCU which presents stats
 	  in debugfs for debugging RCU implementation.
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Tree-based hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on TREE_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable tree-based hierarchical RCU auto-balancing"
+	depends on TREE_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
+
+	
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19d..b4fdbbff5ec0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
-endif
+obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9daa..04982659875a 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -551,6 +551,16 @@ void rcu_irq_exit(void)
 	}
 }
 
+void rcu_nmi_enter(void)
+{
+	rcu_irq_enter();
+}
+
+void rcu_nmi_exit(void)
+{
+	rcu_irq_exit();
+}
+
 static void dyntick_save_progress_counter(int cpu)
 {
 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ecf..7c2665cac172 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
 		sp->done_length += cp->done_length;
 		sp->done_add += cp->done_add;
 		sp->done_remove += cp->done_remove;
-		atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
+		atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
 		sp->rcu_check_callbacks += cp->rcu_check_callbacks;
-		atomic_set(&sp->rcu_try_flip_1,
-			   atomic_read(&cp->rcu_try_flip_1));
-		atomic_set(&sp->rcu_try_flip_e1,
-			   atomic_read(&cp->rcu_try_flip_e1));
+		atomic_add(atomic_read(&cp->rcu_try_flip_1),
+			   &sp->rcu_try_flip_1);
+		atomic_add(atomic_read(&cp->rcu_try_flip_e1),
+			   &sp->rcu_try_flip_e1);
 		sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
 		sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
 		sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 000000000000..a342b032112c
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1535 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
+		NUM_RCU_LVL_1, \
+		NUM_RCU_LVL_2, \
+		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+	}, \
+	.signaled = RCU_SIGNAL_INIT, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
+	.n_force_qs = 0, \
+	.n_force_qs_ngp = 0, \
+}
+
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_data);
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
+#endif /* #ifdef CONFIG_NO_HZ */
+
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU BH batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the current CPU require an as-yet-unscheduled grace period?
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* ACCESS_ONCE() because we are accessing outside of lock. */
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * If the specified CPU is offline, tell the caller that it is in
+ * a quiescent state.  Otherwise, whack it with a reschedule IPI.
+ * Grace periods can end up waiting on an offline CPU when that
+ * CPU is in the process of coming online -- it will be added to the
+ * rcu_node bitmasks before it actually makes it online.  The same thing
+ * can happen while a CPU is in the process of going offline.  Because this
+ * race is quite rare, we check for it after detecting that the grace
+ * period has been delayed rather than checking each and every CPU
+ * each and every time we start a new grace period.
+ */
+static int rcu_implicit_offline_qs(struct rcu_data *rdp)
+{
+	/*
+	 * If the CPU is offline, it is in a quiescent state.  We can
+	 * trust its state not to change because interrupts are disabled.
+	 */
+	if (cpu_is_offline(rdp->cpu)) {
+		rdp->offline_fqs++;
+		return 1;
+	}
+
+	/* The CPU is online, so send it a reschedule IPI. */
+	if (rdp->cpu != smp_processor_id())
+		smp_send_reschedule(rdp->cpu);
+	else
+		set_need_resched();
+	rdp->resched_ipi++;
+	return 0;
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#ifdef CONFIG_NO_HZ
+static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
+
+/**
+ * rcu_enter_nohz - inform RCU that current CPU is entering nohz
+ *
+ * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in nohz mode, a possibility
+ * handled by rcu_irq_enter() and rcu_irq_exit()).
+ */
+void rcu_enter_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting--;
+	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+	local_irq_restore(flags);
+}
+
+/**
+ * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
+ *
+ * Exit nohz mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections normally occur.
+ */
+void rcu_exit_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+	local_irq_restore(flags);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is active.
+ */
+void rcu_nmi_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is no longer active.
+ */
+void rcu_nmi_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
+}
+
+/**
+ * rcu_irq_enter - inform RCU of entry to hard irq context
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks_nesting++)
+		return;
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_irq_exit - inform RCU of exit from hard irq context
+ *
+ * If the CPU was idle with dynamic ticks active, update rdtp->dynticks
+ * to let the RCU handling be aware that the CPU is going back to idle
+ * with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (--rdtp->dynticks_nesting)
+		return;
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+
+	/* If the interrupt queued a callback, get out of dyntick mode. */
+	if (__get_cpu_var(rcu_data).nxtlist ||
+	    __get_cpu_var(rcu_bh_data).nxtlist)
+		set_need_resched();
+}
+
+/*
+ * Record the specified "completed" value, which is later used to validate
+ * dynticks counter manipulations.  Specify "rsp->completed - 1" to
+ * unconditionally invalidate any future dynticks manipulations (which is
+ * useful at the beginning of a grace period).
+ */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+	rsp->dynticks_completed = comp;
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Recall the previously recorded value of the completion for dynticks.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->dynticks_completed;
+}
+
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state.  Return 1 if this CPU
+ * is already in a quiescent state courtesy of dynticks idle mode.
+ */
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	int ret;
+	int snap;
+	int snap_nmi;
+
+	snap = rdp->dynticks->dynticks;
+	snap_nmi = rdp->dynticks->dynticks_nmi;
+	smp_mb();	/* Order sampling of snap with end of grace period. */
+	rdp->dynticks_snap = snap;
+	rdp->dynticks_nmi_snap = snap_nmi;
+	ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
+	if (ret)
+		rdp->dynticks_fqs++;
+	return ret;
+}
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through a dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU.
+ */
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	long curr;
+	long curr_nmi;
+	long snap;
+	long snap_nmi;
+
+	curr = rdp->dynticks->dynticks;
+	snap = rdp->dynticks_snap;
+	curr_nmi = rdp->dynticks->dynticks_nmi;
+	snap_nmi = rdp->dynticks_nmi_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq/NMI handlers, then we can safely pretend that the CPU
+	 * already acknowledged the request to pass through a quiescent
+	 * state.  Either way, that CPU cannot possibly be in an RCU
+	 * read-side critical section that started before the beginning
+	 * of the current RCU grace period.
+	 */
+	if ((curr != snap || (curr & 0x1) == 0) &&
+	    (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+		rdp->dynticks_fqs++;
+		return 1;
+	}
+
+	/* Go check for the CPU being offline. */
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * If there are no dynticks, then the only way that a CPU can passively
+ * be in a quiescent state is to be offline.  Unlike dynticks idle, which
+ * is a point in time during the prior (already finished) grace period,
+ * an offline CPU is always in a quiescent state, and thus can be
+ * unconditionally applied.  So just return the current value of completed.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->completed;
+}
+
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	return 0;
+}
+
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+	rsp->gp_start = jiffies;
+	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_state *rsp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock_irqsave(&rnp->lock, flags);
+	delta = jiffies - rsp->jiffies_stall;
+	if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		if (rnp_cur->qsmask == 0)
+			continue;
+		for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
+			if (rnp_cur->qsmask & (1UL << cpu))
+				printk(" %d", rnp_cur->grplo + cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
+	force_quiescent_state(rsp, 0);  /* Kick them all. */
+}
+
+static void print_cpu_stall(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
+			smp_processor_id(), jiffies - rsp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rnp->lock, flags);
+	if ((long)(jiffies - rsp->jiffies_stall) >= 0)
+		rsp->jiffies_stall =
+			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long delta;
+	struct rcu_node *rnp;
+
+	delta = jiffies - rsp->jiffies_stall;
+	rnp = rdp->mynode;
+	if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rsp);
+
+	} else if (rsp->gpnum != rsp->completed &&
+		   delta >= RCU_STALL_RAT_DELAY) {
+
+		/* They had two time units to dump stack, so complain. */
+		print_other_cpu_stall(rsp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
+ */
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+}
+
+/*
+ * Did someone else start a new RCU grace period since we last
+ * checked?  Update local state appropriately if so.  Must be called
+ * on the CPU corresponding to rdp.
+ */
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock, which is released before return.  Hard irqs must
+ * be disabled.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
+	__releases(rcu_get_root(rsp)->lock)
+{
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	if (!cpu_needs_another_gp(rsp, rdp)) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize state. */
+	rsp->gpnum++;
+	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+	record_gp_stall_check_time(rsp);
+	dyntick_record_completed(rsp, rsp->completed - 1);
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Special-case the common single-level case. */
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	spin_unlock(&rnp->lock);  /* leave irqs disabled. */
+
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock(&rsp->onofflock);  /* irqs already disabled. */
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LVLS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
+	}
+
+	rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long completed_snap;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
+
+	/* Did another grace period end? */
+	if (rdp->completed != completed_snap) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = completed_snap;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * group must be represented by the same leaf rcu_node structure.
+ * That structure's lock must be held upon entry, and it is released
+ * before return.
+ */
+static void
+cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
+	      unsigned long flags)
+	__releases(rnp->lock)
+{
+	/* Walk up the rcu_node hierarchy. */
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		mask = rnp->grpmask;
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
+	}
+
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+	rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU, which must either be
+ * the current CPU or an offline CPU.  The lastcomp argument is used to
+ * make sure we are still in the grace period of interest.  We don't want
+ * to end the current grace period based on quiescent states detected in
+ * an earlier grace period!
+ */
+static void
+cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp;
+
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);
+	if (lastcomp != ACCESS_ONCE(rsp->completed)) {
+
+		/*
+		 * Someone beat us to it for this grace period, so leave.
+		 * The race with GP start is resolved by the fact that we
+		 * hold the leaf rcu_node lock, so that the per-CPU bits
+		 * cannot yet be initialized -- so we would simply find our
+		 * CPU's bit already cleared in cpu_quiet_msk() if this race
+		 * occurred.
+		 */
+		rdp->passed_quiesc = 0;	/* try again later! */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	mask = rdp->grpmask;
+	if ((rnp->qsmask & mask) == 0) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+	} else {
+		rdp->qs_pending = 0;
+
+		/*
+		 * This GP can't end until cpu checks in, so all of our
+		 * callbacks can be processed during the next GP.
+		 */
+		rdp = rsp->rda[smp_processor_id()];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+	}
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* If there is now a new grace period, record and return. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
+
+	/*
+	 * Does this CPU still need to do its part for current grace period?
+	 * If no, return and let the other CPUs do their part as well.
+	 */
+	if (!rdp->qs_pending)
+		return;
+
+	/*
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
+	 */
+	if (!rdp->passed_quiesc)
+		return;
+
+	/* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
+	cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
+ */
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
+{
+	int i;
+	unsigned long flags;
+	long lastcomp;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp_me;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;	/* rnp->grplo is constant. */
+	do {
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL);
+	lastcomp = rsp->completed;
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp, lastcomp);
+
+	/*
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiescent, so it is now
+	 * (uncharacteristically) safe to access its rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
+	 */
+	rdp_me = rsp->rda[smp_processor_id()];
+	if (rdp->nxtlist != NULL) {
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+		rdp_me->qlen += rdp->qlen;
+		rdp->qlen = 0;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ */
+static void rcu_offline_cpu(int cpu)
+{
+	__rcu_offline_cpu(cpu, &rcu_state);
+	__rcu_offline_cpu(cpu, &rcu_bh_state);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.  Throttle as specified by rdp->blimit.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
+
+	/* If no callbacks are ready, just return.*/
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
+
+	/*
+	 * Extract the list of ready callbacks, disabling irqs to prevent
+	 * races with call_rcu() from interrupt handlers.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
+	tail = rdp->nxttail[RCU_DONE_TAIL];
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
+
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	local_irq_save(flags);
+
+	/* Update count, and requeue any remaining callbacks. */
+	rdp->qlen -= count;
+	if (list != NULL) {
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
+
+	local_irq_restore(flags);
+
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+		/*
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * No memory barrier is required here because both
+		 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
+		 * only CPU-local variables that other CPUs neither
+		 * access nor modify, at least not while the corresponding
+		 * CPU is online.
+		 */
+
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+
+	} else if (!in_softirq()) {
+
+		/*
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * an rcu_bh read-side critical section.  This is an rcu_bh
+		 * quiescent state, so count it.
+		 */
+
+		rcu_bh_qsctr_inc(cpu);
+	}
+	raise_softirq(RCU_SOFTIRQ);
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Returns 1 if the current grace period ends while scanning (possibly
+ * because we made it end).
+ */
+static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
+			       int (*f)(struct rcu_data *))
+{
+	unsigned long bit;
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		mask = 0;
+		spin_lock_irqsave(&rnp_cur->lock, flags);
+		if (rsp->completed != lastcomp) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			return 1;
+		}
+		if (rnp_cur->qsmask == 0) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			continue;
+		}
+		cpu = rnp_cur->grplo;
+		bit = 1;
+		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
+			if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+				mask |= bit;
+		}
+		if (mask != 0 && rsp->completed == lastcomp) {
+
+			/* cpu_quiet_msk() releases rnp_cur->lock. */
+			cpu_quiet_msk(mask, rsp, rnp_cur, flags);
+			continue;
+		}
+		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+	}
+	return 0;
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ */
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	unsigned long flags;
+	long lastcomp;
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	u8 signaled;
+
+	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
+		return;  /* No grace period in progress, nothing to force. */
+	if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
+		rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
+		return;	/* Someone else is already on the job. */
+	}
+	if (relaxed &&
+	    (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
+	    (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
+		goto unlock_ret; /* no emergency and done recently. */
+	rsp->n_force_qs++;
+	spin_lock(&rnp->lock);
+	lastcomp = rsp->completed;
+	signaled = rsp->signaled;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+	if (lastcomp == rsp->gpnum) {
+		rsp->n_force_qs_ngp++;
+		spin_unlock(&rnp->lock);
+		goto unlock_ret;  /* no GP in progress, time updated. */
+	}
+	spin_unlock(&rnp->lock);
+	switch (signaled) {
+	case RCU_GP_INIT:
+
+		break; /* grace period still initializing, ignore. */
+
+	case RCU_SAVE_DYNTICK:
+
+		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
+			break; /* So gcc recognizes the dead code. */
+
+		/* Record dyntick-idle state. */
+		if (rcu_process_dyntick(rsp, lastcomp,
+					dyntick_save_progress_counter))
+			goto unlock_ret;
+
+		/* Update state, record completion counter. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed) {
+			rsp->signaled = RCU_FORCE_QS;
+			dyntick_record_completed(rsp, lastcomp);
+		}
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_FORCE_QS:
+
+		/* Check dyntick-idle state, send IPI to laggards. */
+		if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
+					rcu_implicit_dynticks_qs))
+			goto unlock_ret;
+
+		/* Leave state in case more forcing is required. */
+
+		break;
+	}
+unlock_ret:
+	spin_unlock_irqrestore(&rsp->fqslock, flags);
+}
+
+#else /* #ifdef CONFIG_SMP */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	set_need_resched();
+}
+
+#endif /* #else #ifdef CONFIG_SMP */
+
+/*
+ * This does the RCU processing work from softirq context for the
+ * specified rcu_state and rcu_data structures.  This may be called
+ * only from the CPU to whom the rdp belongs.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+
+	/*
+	 * If an RCU GP has gone long enough, go check for dyntick
+	 * idle CPUs and, if needed, send resched IPIs.
+	 */
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+	    (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+		force_quiescent_state(rsp, 1);
+
+	/*
+	 * Advance callbacks in response to end of earlier grace
+	 * period that some other CPU ended.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Update RCU state based on any recent quiescent states. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Does this CPU require a not-yet-started grace period? */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+		rcu_start_gp(rsp, flags);  /* releases above lock */
+	}
+
+	/* If there are callbacks ready, invoke them. */
+	rcu_do_batch(rdp);
+}
+
+/*
+ * Do softirq processing for the current CPU.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	/*
+	 * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be seen before any RCU
+	 * grace-period manipulations below.
+	 */
+	smp_mb(); /* See above block comment. */
+
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+
+	/*
+	 * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be seen after any RCU
+	 * grace-period manipulations above.
+	 */
+	smp_mb(); /* See above block comment. */
+}
+
+static void
+__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+	   struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
+
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * Note that we might see a beginning right after we see an
+	 * end, but never vice versa, since this CPU has to pass through
+	 * a quiescent state betweentimes.
+	 */
+	local_irq_save(flags);
+	rdp = rsp->rda[smp_processor_id()];
+	rcu_process_gp_end(rsp, rdp);
+	check_for_new_grace_period(rsp, rdp);
+
+	/* Add the callback to our list. */
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	/* Start a new grace period if one not already started. */
+	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
+		unsigned long nestflag;
+		struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+		spin_lock_irqsave(&rnp_root->lock, nestflag);
+		rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
+	}
+
+	/* Force the grace period if too many callbacks or too long waiting. */
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = LONG_MAX;
+		force_quiescent_state(rsp, 0);
+	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+		   (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+		force_quiescent_state(rsp, 1);
+	local_irq_restore(flags);
+}
+
+/*
+ * Queue an RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Queue an RCU callback for invocation after a quicker grace period.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first.  However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->n_rcu_pending++;
+
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rsp, rdp);
+
+	/* Is the RCU core waiting for a quiescent state from this CPU? */
+	if (rdp->qs_pending)
+		return 1;
+
+	/* Does this CPU have callbacks ready to invoke? */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		return 1;
+
+	/* Has RCU gone idle with this CPU needing another grace period? */
+	if (cpu_needs_another_gp(rsp, rdp))
+		return 1;
+
+	/* Has another RCU grace period completed?  */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+		return 1;
+
+	/* Has a new RCU grace period started? */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+		return 1;
+
+	/* Has an RCU GP gone long enough to send resched IPIs &c? */
+	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
+	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+	     (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
+		return 1;
+
+	/* nothing to do */
+	return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so.  This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	/* RCU callbacks either ready or pending? */
+	return per_cpu(rcu_data, cpu).nxtlist ||
+	       per_cpu(rcu_bh_data, cpu).nxtlist;
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	int i;
+	long lastcomp;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	/* Set up local state, ensuring consistent view of global state. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	lastcomp = rsp->completed;
+	rdp->completed = lastcomp;
+	rdp->gpnum = lastcomp;
+	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
+	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
+	rdp->beenonline = 1;	 /* We have now been online. */
+	rdp->passed_quiesc_completed = lastcomp - 1;
+	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rdp->qlen = 0;
+	rdp->blimit = blimit;
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we won't be part
+	 * of it, but that is OK, as we are currently in a quiescent state.
+	 */
+
+	/* Exclude any attempts to start a new GP on large systems. */
+	spin_lock(&rsp->onofflock);		/* irqs already disabled. */
+
+	/* Add CPU to rcu_node bitmasks. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;
+	do {
+		/* Exclude any attempts to start a new GP on small systems. */
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+		rnp->qsmaskinit |= mask;
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we will be part of
+	 * it, and its gpnum will be greater than ours, so we will
+	 * participate.  It is also possible for the gpnum to have been
+	 * incremented before this function was called, and the bitmasks
+	 * to not be filled out until now, in which case we will also
+	 * participate due to our gpnum being behind.
+	 */
+
+	/* Since it is coming online, the CPU is in a quiescent state. */
+	cpu_quiet(cpu, rsp, rdp, lastcomp);
+	local_irq_restore(flags);
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	rdtp->dynticks_nesting = 1;
+	rdtp->dynticks |= 1; 	/* need consecutive #s even for hotplug. */
+	rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
+#endif /* #ifdef CONFIG_NO_HZ */
+	rcu_init_percpu_data(cpu, &rcu_state);
+	rcu_init_percpu_data(cpu, &rcu_bh_state);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+/*
+ * Handle CPU online/offline notification events.
+ */
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int ccur;
+	int cprv;
+	int i;
+
+	cprv = NR_CPUS;
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		ccur = rsp->levelcnt[i];
+		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+		cprv = ccur;
+	}
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int cpustride = 1;
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/* Initialize the level-tracking arrays. */
+
+	for (i = 1; i < NUM_RCU_LVLS; i++)
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	rcu_init_levelspread(rsp);
+
+	/* Initialize the elements themselves, starting from the leaves. */
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		cpustride *= rsp->levelspread[i];
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->qsmaskinit = 0;
+			rnp->grplo = j * cpustride;
+			rnp->grphi = (j + 1) * cpustride - 1;
+			if (rnp->grphi >= NR_CPUS)
+				rnp->grphi = NR_CPUS - 1;
+			if (i == 0) {
+				rnp->grpnum = 0;
+				rnp->grpmask = 0;
+				rnp->parent = NULL;
+			} else {
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->grpmask = 1UL << rnp->grpnum;
+				rnp->parent = rsp->level[i - 1] +
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for __rcu_init().  To be used nowhere else!
+ * Assigns leaf node pointers into each CPU's rcu_data structure.
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		if (i > rnp[j].grphi) \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+	} \
+} while (0)
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,
+};
+
+void __init __rcu_init(void)
+{
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+	printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
+}
+
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 000000000000..d6db3e837826
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
+/*
+ * Read-Copy Update tracing for hierarchical implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
+{
+	if (!rdp->beenonline)
+		return;
+	seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
+		   rdp->cpu,
+		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
+		   rdp->completed, rdp->gpnum,
+		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->qs_pending,
+		   rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
+		   (int)(rdp->n_rcu_pending & 0xffff));
+#ifdef CONFIG_NO_HZ
+	seq_printf(m, " dt=%d/%d dn=%d df=%lu",
+		   rdp->dynticks->dynticks,
+		   rdp->dynticks->dynticks_nesting,
+		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
+	seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+}
+
+#define PRINT_RCU_DATA(name, func, m) \
+	do { \
+		int _p_r_d_i; \
+		\
+		for_each_possible_cpu(_p_r_d_i) \
+			func(m, &per_cpu(name, _p_r_d_i)); \
+	} while (0)
+
+static int show_rcudata(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "rcu:\n");
+	PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
+	seq_puts(m, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
+	return 0;
+}
+
+static int rcudata_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcudata, NULL);
+}
+
+static struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.open = rcudata_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
+{
+	if (!rdp->beenonline)
+		return;
+	seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
+		   rdp->cpu,
+		   cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
+		   rdp->completed, rdp->gpnum,
+		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->qs_pending,
+		   rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
+		   rdp->n_rcu_pending);
+#ifdef CONFIG_NO_HZ
+	seq_printf(m, ",%d,%d,%d,%lu",
+		   rdp->dynticks->dynticks,
+		   rdp->dynticks->dynticks_nesting,
+		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
+	seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+}
+
+static int show_rcudata_csv(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
+#ifdef CONFIG_NO_HZ
+	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+	seq_puts(m, "\"rcu:\"\n");
+	PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
+	seq_puts(m, "\"rcu_bh:\"\n");
+	PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
+	return 0;
+}
+
+static int rcudata_csv_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcudata_csv, NULL);
+}
+
+static struct file_operations rcudata_csv_fops = {
+	.owner = THIS_MODULE,
+	.open = rcudata_csv_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
+{
+	int level = 0;
+	struct rcu_node *rnp;
+
+	seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
+	              "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+		   rsp->completed, rsp->gpnum, rsp->signaled,
+		   (long)(rsp->jiffies_force_qs - jiffies),
+		   (int)(jiffies & 0xffff),
+		   rsp->n_force_qs, rsp->n_force_qs_ngp,
+		   rsp->n_force_qs - rsp->n_force_qs_ngp,
+		   rsp->n_force_qs_lh);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {
+			seq_puts(m, "\n");
+			level = rnp->level;
+		}
+		seq_printf(m, "%lx/%lx %d:%d ^%d    ",
+			   rnp->qsmask, rnp->qsmaskinit,
+			   rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	seq_puts(m, "\n");
+}
+
+static int show_rcuhier(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "rcu:\n");
+	print_one_rcu_state(m, &rcu_state);
+	seq_puts(m, "rcu_bh:\n");
+	print_one_rcu_state(m, &rcu_bh_state);
+	return 0;
+}
+
+static int rcuhier_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcuhier, NULL);
+}
+
+static struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.open = rcuhier_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int show_rcugp(struct seq_file *m, void *unused)
+{
+	seq_printf(m, "rcu: completed=%ld  gpnum=%ld\n",
+		   rcu_state.completed, rcu_state.gpnum);
+	seq_printf(m, "rcu_bh: completed=%ld  gpnum=%ld\n",
+		   rcu_bh_state.completed, rcu_bh_state.gpnum);
+	return 0;
+}
+
+static int rcugp_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcugp, NULL);
+}
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.open = rcugp_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+static int __init rcuclassic_trace_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
+						NULL, &rcudata_csv_fops);
+	if (!datadir_csv)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:
+	if (datadir)
+		debugfs_remove(datadir);
+	if (datadir_csv)
+		debugfs_remove(datadir_csv);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return 1;
+}
+
+static void __exit rcuclassic_trace_cleanup(void)
+{
+	debugfs_remove(datadir);
+	debugfs_remove(datadir_csv);
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);
+}
+
+
+module_init(rcuclassic_trace_init);
+module_exit(rcuclassic_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
+MODULE_LICENSE("GPL");
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d69..80d323e6f61a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,6 +269,7 @@ void irq_enter(void)
 {
 	int cpu = smp_processor_id();
 
+	rcu_irq_enter();
 	if (idle_cpu(cpu) && !in_interrupt()) {
 		__irq_enter();
 		tick_check_idle(cpu);
@@ -295,9 +296,9 @@ void irq_exit(void)
 
 #ifdef CONFIG_NO_HZ
 	/* Make sure that timer wheel updates are propagated */
-	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
-		tick_nohz_stop_sched_tick(0);
 	rcu_irq_exit();
+	if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
+		tick_nohz_stop_sched_tick(0);
 #endif
 	preempt_enable_no_resched();
 }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e443bc..465d822f3f5d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -619,6 +619,19 @@ config RCU_CPU_STALL_DETECTOR
 
 	  Say N if you are unsure.
 
+config RCU_CPU_STALL_DETECTOR
+	bool "Check for stalled CPUs delaying RCU grace periods"
+	depends on CLASSIC_RCU || TREE_RCU
+	default n
+	help
+	  This option causes RCU to printk information on which
+	  CPUs are delaying the current grace period, but only when
+	  the grace period extends for excessive time periods.
+
+	  Say Y if you want RCU to perform such checks.
+
+	  Say N if you are unsure.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL
-- 
cgit v1.3-14-g43fede


From 7b4967c532045a1983d6d4af5c69cc7c5109f62b Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Fri, 19 Dec 2008 16:56:37 +1030
Subject: cpumask: Add alloc_cpumask_var_node()

Impact: New API

This will be needed in x86 code to allocate the domain and old_domain
cpumasks on the same node as where the containing irq_cfg struct is
allocated.

(Also fixes double-dump_stack on rare CONFIG_DEBUG_PER_CPU_MAPS case)

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (re-impl alloc_cpumask_var)
---
 include/linux/cpumask.h |  7 +++++++
 lib/cpumask.c           | 11 ++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d4bf52603e6b..b5ad19a6f43f 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1025,6 +1025,7 @@ static inline size_t cpumask_size(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 typedef struct cpumask *cpumask_var_t;
 
+bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
 void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
@@ -1038,6 +1039,12 @@ static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 	return true;
 }
 
+static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
+					  int node)
+{
+	return true;
+}
+
 static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
 {
 }
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 8d03f22c6ced..3f258f58c85b 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -76,15 +76,14 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 
 /* These are not inline because of header tangles. */
 #ifdef CONFIG_CPUMASK_OFFSTACK
-bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 {
 	if (likely(slab_is_available()))
-		*mask = kmalloc(cpumask_size(), flags);
+		*mask = kmalloc_node(cpumask_size(), flags, node);
 	else {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 		printk(KERN_ERR
 			"=> alloc_cpumask_var: kmalloc not available!\n");
-		dump_stack();
 #endif
 		*mask = NULL;
 	}
@@ -96,6 +95,12 @@ bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 #endif
 	return *mask != NULL;
 }
+EXPORT_SYMBOL(alloc_cpumask_var_node);
+
+bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return alloc_cpumask_var_node(mask, flags, numa_node_id());
+}
 EXPORT_SYMBOL(alloc_cpumask_var);
 
 void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
-- 
cgit v1.3-14-g43fede


From ec26b805879c7e77865b39ee91b737985e80006d Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Fri, 19 Dec 2008 16:56:52 +1030
Subject: cpumask: documentation for cpumask_var_t

Impact: New kerneldoc comments

Additional documentation added to all the alloc_cpumask and free_cpumask
functions.

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (minor additions)
---
 lib/cpumask.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'lib')

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 3f258f58c85b..a24edf137f41 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -76,6 +76,20 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 
 /* These are not inline because of header tangles. */
 #ifdef CONFIG_CPUMASK_OFFSTACK
+/**
+ * alloc_cpumask_var_node - allocate a struct cpumask on a given node
+ * @mask: pointer to cpumask_var_t where the cpumask is returned
+ * @flags: GFP_ flags
+ *
+ * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
+ * a nop returning a constant 1 (in <linux/cpumask.h>).
+ * Returns TRUE if memory allocation succeeded, FALSE otherwise.
+ *
+ * In addition, mask will be NULL if this fails.  Note that gcc is
+ * usually smart enough to know that mask can never be NULL if
+ * CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case
+ * too.
+ */
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 {
 	if (likely(slab_is_available()))
@@ -97,23 +111,52 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 }
 EXPORT_SYMBOL(alloc_cpumask_var_node);
 
+/**
+ * alloc_cpumask_var - allocate a struct cpumask
+ * @mask: pointer to cpumask_var_t where the cpumask is returned
+ * @flags: GFP_ flags
+ *
+ * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
+ * a nop returning a constant 1 (in <linux/cpumask.h>).
+ *
+ * See alloc_cpumask_var_node.
+ */
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 {
 	return alloc_cpumask_var_node(mask, flags, numa_node_id());
 }
 EXPORT_SYMBOL(alloc_cpumask_var);
 
+/**
+ * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
+ * @mask: pointer to cpumask_var_t where the cpumask is returned
+ *
+ * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
+ * a nop (in <linux/cpumask.h>).
+ * Either returns an allocated (zero-filled) cpumask, or causes the
+ * system to panic.
+ */
 void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
 {
 	*mask = alloc_bootmem(cpumask_size());
 }
 
+/**
+ * free_cpumask_var - frees memory allocated for a struct cpumask.
+ * @mask: cpumask to free
+ *
+ * This is safe on a NULL mask.
+ */
 void free_cpumask_var(cpumask_var_t mask)
 {
 	kfree(mask);
 }
 EXPORT_SYMBOL(free_cpumask_var);
 
+/**
+ * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var
+ * @mask: cpumask to free
+ */
 void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 {
 	free_bootmem((unsigned long)mask, cpumask_size());
-- 
cgit v1.3-14-g43fede


From 69c35efcf1576ab5f00cba83e8ca740923afb6c9 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 7 Nov 2008 15:11:47 +0800
Subject: libcrc32c: Move implementation to crypto crc32c

This patch swaps the role of libcrc32c and crc32c.  Previously
the implementation was in libcrc32c and crc32c was a wrapper.
Now the code is in crc32c and libcrc32c just calls the crypto
layer.

The reason for the change is to tap into the algorithm selection
capability of the crypto API so that optimised implementations
such as the one utilising Intel's CRC32C instruction can be
used where available.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig         |   4 +-
 crypto/crc32c.c        | 113 +++++++++++++++++++++++++++++-
 include/linux/crc32c.h |   5 +-
 lib/Kconfig            |   1 +
 lib/libcrc32c.c        | 182 +++++++++----------------------------------------
 5 files changed, 146 insertions(+), 159 deletions(-)

(limited to 'lib')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index dc20a34ba5ef..aede80246df2 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -256,12 +256,10 @@ comment "Digest"
 config CRYPTO_CRC32C
 	tristate "CRC32c CRC algorithm"
 	select CRYPTO_HASH
-	select LIBCRC32C
 	help
 	  Castagnoli, et al Cyclic Redundancy-Check Algorithm.  Used
 	  by iSCSI for header and data digests and by others.
-	  See Castagnoli93.  This implementation uses lib/libcrc32c.
-	  Module will be crc32c.
+	  See Castagnoli93.  Module will be crc32c.
 
 config CRYPTO_CRC32C_INTEL
 	tristate "CRC32c INTEL hardware acceleration"
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index b21b93f2bb90..973bc2cfab2e 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -3,8 +3,29 @@
  *
  * CRC32C chksum
  *
- * This module file is a wrapper to invoke the lib/crc32c routines.
+ *@Article{castagnoli-crc,
+ * author =       { Guy Castagnoli and Stefan Braeuer and Martin Herrman},
+ * title =        {{Optimization of Cyclic Redundancy-Check Codes with 24
+ *                 and 32 Parity Bits}},
+ * journal =      IEEE Transactions on Communication,
+ * year =         {1993},
+ * volume =       {41},
+ * number =       {6},
+ * pages =        {},
+ * month =        {June},
+ *}
+ * Used by the iSCSI driver, possibly others, and derived from the
+ * iscsi-crc.c module of the linux-iscsi driver at
+ * http://linux-iscsi.sourceforge.net.
  *
+ * Following the example of lib/crc32, this function is intended to be
+ * flexible and useful for all users.  Modules that currently have their
+ * own crc32c, but hopefully may be able to use this one are:
+ *  net/sctp (please add all your doco to here if you change to
+ *            use this one!)
+ *  <endoflist>
+ *
+ * Copyright (c) 2004 Cisco Systems, Inc.
  * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
@@ -18,7 +39,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/string.h>
-#include <linux/crc32c.h>
 #include <linux/kernel.h>
 
 #define CHKSUM_BLOCK_SIZE	1
@@ -32,6 +52,95 @@ struct chksum_desc_ctx {
 	u32 crc;
 };
 
+/*
+ * This is the CRC-32C table
+ * Generated with:
+ * width = 32 bits
+ * poly = 0x1EDC6F41
+ * reflect input bytes = true
+ * reflect output bytes = true
+ */
+
+static const u32 crc32c_table[256] = {
+	0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+	0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+	0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+	0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+	0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+	0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+	0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+	0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+	0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+	0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+	0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+	0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+	0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+	0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+	0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+	0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+	0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+	0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+	0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+	0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+	0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+	0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+	0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+	0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+	0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+	0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+	0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+	0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+	0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+	0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+	0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+	0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+	0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+	0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+	0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+	0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+	0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+	0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+	0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+	0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+	0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+	0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+	0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+	0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+	0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+	0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+	0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+	0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+	0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+	0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+	0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+	0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+	0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+	0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+	0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+	0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+	0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+/*
+ * Steps through buffer one byte at a time, calculates reflected
+ * crc using table.
+ */
+
+static u32 crc32c(u32 crc, const u8 *data, unsigned int length)
+{
+	while (length--)
+		crc = crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+
+	return crc;
+}
+
 /*
  * Steps through buffer one byte at at time, calculates reflected 
  * crc using table.
diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
index 508f512e5a2f..66fa8ff795ec 100644
--- a/include/linux/crc32c.h
+++ b/include/linux/crc32c.h
@@ -3,9 +3,6 @@
 
 #include <linux/types.h>
 
-extern u32 crc32c_le(u32 crc, unsigned char const *address, size_t length);
-extern u32 crc32c_be(u32 crc, unsigned char const *address, size_t length);
-
-#define crc32c(seed, data, length)  crc32c_le(seed, (unsigned char const *)data, length)
+extern u32 crc32c(u32 crc, const void *address, unsigned int length);
 
 #endif	/* _LINUX_CRC32C_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 85cf7ea978aa..ce303f13ed92 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -64,6 +64,7 @@ config CRC7
 
 config LIBCRC32C
 	tristate "CRC32c (Castagnoli, et al) Cyclic Redundancy-Check"
+	select CRYPTO_CRC32C
 	help
 	  This option is provided for the case where no in-kernel-tree
 	  modules require CRC32c functions, but a module built outside the
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index b5c3287d8ea4..38b17ab52ff9 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -30,168 +30,50 @@
  * any later version.
  *
  */
-#include <linux/crc32c.h>
-#include <linux/compiler.h>
-#include <linux/module.h>
-
-MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
-MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
-MODULE_LICENSE("GPL");
-
-#define CRC32C_POLY_BE 0x1EDC6F41
-#define CRC32C_POLY_LE 0x82F63B78
 
-#ifndef CRC_LE_BITS 
-# define CRC_LE_BITS 8
-#endif
-
-
-/*
- * Haven't generated a big-endian table yet, but the bit-wise version
- * should at least work.
- */
-#if defined CRC_BE_BITS && CRC_BE_BITS != 1
-#undef CRC_BE_BITS
-#endif
-#ifndef CRC_BE_BITS
-# define CRC_BE_BITS 1
-#endif
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
-EXPORT_SYMBOL(crc32c_le);
+static struct crypto_shash *tfm;
 
-#if CRC_LE_BITS == 1
-/*
- * Compute things bit-wise, as done in crc32.c.  We could share the tight 
- * loop below with crc32 and vary the POLY if we don't find value in terms
- * of space and maintainability in keeping the two modules separate.
- */
-u32 __pure
-crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 crc32c(u32 crc, const void *address, unsigned int length)
 {
-	int i;
-	while (len--) {
-		crc ^= *p++;
-		for (i = 0; i < 8; i++)
-			crc = (crc >> 1) ^ ((crc & 1) ? CRC32C_POLY_LE : 0);
-	}
-	return crc;
-}
-#else
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(tfm)];
+	} desc;
+	int err;
 
-/*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
+	desc.shash.tfm = tfm;
+	desc.shash.flags = 0;
+	*(u32 *)desc.ctx = crc;
 
-static const u32 crc32c_table[256] = {
-	0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-	0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-	0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-	0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-	0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-	0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-	0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-	0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-	0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-	0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-	0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-	0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-	0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-	0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-	0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-	0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-	0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-	0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-	0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-	0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-	0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-	0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-	0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-	0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-	0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-	0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
-	0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
-	0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-	0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
-	0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
-	0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
-	0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
-	0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
-	0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
-	0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
-	0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
-	0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
-	0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
-	0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
-	0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
-	0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
-	0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
-	0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
-	0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
-	0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
-	0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
-	0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
-	0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
-	0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
-	0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
-	0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
-	0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
-	0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
-	0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
-	0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
-	0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
-	0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
-	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
-	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
-	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
-	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
-	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
-	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
-	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
-};
+	err = crypto_shash_update(&desc.shash, address, length);
+	BUG_ON(err);
 
-/*
- * Steps through buffer one byte at at time, calculates reflected 
- * crc using table.
- */
+	return *(u32 *)desc.ctx;
+}
 
-u32 __pure
-crc32c_le(u32 crc, unsigned char const *data, size_t length)
+static int __init libcrc32c_mod_init(void)
 {
-	while (length--)
-		crc =
-		    crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+	tfm = crypto_alloc_shash("crc32c", 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
 
-	return crc;
+	return 0;
 }
 
-#endif	/* CRC_LE_BITS == 8 */
-
-EXPORT_SYMBOL(crc32c_be);
-
-#if CRC_BE_BITS == 1
-u32 __pure
-crc32c_be(u32 crc, unsigned char const *p, size_t len)
+static void __exit libcrc32c_mod_fini(void)
 {
-	int i;
-	while (len--) {
-		crc ^= *p++ << 24;
-		for (i = 0; i < 8; i++)
-			crc =
-			    (crc << 1) ^ ((crc & 0x80000000) ? CRC32C_POLY_BE :
-					  0);
-	}
-	return crc;
+	crypto_free_shash(tfm);
 }
-#endif
 
-/*
- * Unit test
- *
- * A small unit test suite is implemented as part of the crypto suite.
- * Select CRYPTO_CRC32C and use the tcrypt module to run the tests.
- */
+module_init(libcrc32c_mod_init);
+module_exit(libcrc32c_mod_fini);
+
+MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
+MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
+MODULE_LICENSE("GPL");
-- 
cgit v1.3-14-g43fede


From 53b146ae598268edbe2bf7ea7dfec721d51adddd Mon Sep 17 00:00:00 2001
From: Adrian-Ken Rueegsegger <rueegsegger@swiss-it.ch>
Date: Tue, 11 Nov 2008 12:14:00 +0800
Subject: libcrc32c: Fix "crc32c undefined" compilation error

The latest shash changes leave crc32c undefined:

[...]
Building modules, stage 2.
  MODPOST 1381 modules
  ERROR: "crc32c" [net/sctp/sctp.ko] undefined!
  ERROR: "crc32c" [net/ipv4/netfilter/nf_nat_proto_sctp.ko] undefined!

Adding EXPORT_SYMBOL(crc32c) to lib/libcrc32c.c fixes the compile error.
This patch has been compile-tested only.

Signed-off-by: Adrian-Ken Rueegsegger <rueegsegger@swiss-it.ch>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 lib/libcrc32c.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'lib')

diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 38b17ab52ff9..244f5480c898 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -57,6 +57,8 @@ u32 crc32c(u32 crc, const void *address, unsigned int length)
 	return *(u32 *)desc.ctx;
 }
 
+EXPORT_SYMBOL(crc32c);
+
 static int __init libcrc32c_mod_init(void)
 {
 	tfm = crypto_alloc_shash("crc32c", 0, 0);
-- 
cgit v1.3-14-g43fede


From 93027354d6e8a66a12dedb89d59b21cf7c2d35ed Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 13 Nov 2008 22:05:13 +0800
Subject: libcrc32c: Select CRYPTO in Kconfig

Selecting CRYPTO_CRC32C is not enough as CRYPTO which CRYPTO_CRC32C
depends on may be disabled.  This patch adds the select on CRYPTO.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 lib/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib')

diff --git a/lib/Kconfig b/lib/Kconfig
index ce303f13ed92..fd4118e097f0 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -64,6 +64,7 @@ config CRC7
 
 config LIBCRC32C
 	tristate "CRC32c (Castagnoli, et al) Cyclic Redundancy-Check"
+	select CRYPTO
 	select CRYPTO_CRC32C
 	help
 	  This option is provided for the case where no in-kernel-tree
-- 
cgit v1.3-14-g43fede


From 70a7d3cc1308a55104fbe505d76f2aca8a4cf53e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 22 Dec 2008 10:26:05 -0800
Subject: swiotlb: add hwdev to swiotlb_phys_to_bus() / swiotlb_sg_to_bus()

Impact: extend functions with a (yet unused) parameter, update callsites

Some architectures need it - in preparation for highmem swiotlb.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-swiotlb_64.c |  2 +-
 include/linux/swiotlb.h          |  3 ++-
 lib/swiotlb.c                    | 53 +++++++++++++++++-----------------------
 3 files changed, 25 insertions(+), 33 deletions(-)

(limited to 'lib')

diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 242c3440687f..6cf8a816dc29 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -23,7 +23,7 @@ void *swiotlb_alloc(unsigned order, unsigned long nslabs)
 	return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
 }
 
-dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
+dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
 {
 	return paddr;
 }
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 325af1de0351..dedd3c0cfe30 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -27,7 +27,8 @@ swiotlb_init(void);
 extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs);
 extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
 
-extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address);
+extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev,
+				      phys_addr_t address);
 extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
 
 extern int swiotlb_arch_range_needs_mapping(void *ptr, size_t size);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index fa2dc4e5f9ba..3657da8ebbc3 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -126,7 +126,7 @@ void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs)
 	return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
 }
 
-dma_addr_t __weak swiotlb_phys_to_bus(phys_addr_t paddr)
+dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
 {
 	return paddr;
 }
@@ -136,9 +136,10 @@ phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr)
 	return baddr;
 }
 
-static dma_addr_t swiotlb_virt_to_bus(volatile void *address)
+static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
+				      volatile void *address)
 {
-	return swiotlb_phys_to_bus(virt_to_phys(address));
+	return swiotlb_phys_to_bus(hwdev, virt_to_phys(address));
 }
 
 static void *swiotlb_bus_to_virt(dma_addr_t address)
@@ -151,35 +152,23 @@ int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
 	return 0;
 }
 
-static dma_addr_t swiotlb_sg_to_bus(struct scatterlist *sg)
+static dma_addr_t swiotlb_sg_to_bus(struct device *hwdev, struct scatterlist *sg)
 {
-	return swiotlb_phys_to_bus(page_to_phys(sg_page(sg)) + sg->offset);
+	return swiotlb_phys_to_bus(hwdev, page_to_phys(sg_page(sg)) + sg->offset);
 }
 
 static void swiotlb_print_info(unsigned long bytes)
 {
 	phys_addr_t pstart, pend;
-	dma_addr_t bstart, bend;
 
 	pstart = virt_to_phys(io_tlb_start);
 	pend = virt_to_phys(io_tlb_end);
 
-	bstart = swiotlb_phys_to_bus(pstart);
-	bend = swiotlb_phys_to_bus(pend);
-
 	printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
 	       bytes >> 20, io_tlb_start, io_tlb_end);
-	if (pstart != bstart || pend != bend)
-		printk(KERN_INFO "software IO TLB at phys %#llx - %#llx"
-		       " bus %#llx - %#llx\n",
-		       (unsigned long long)pstart,
-		       (unsigned long long)pend,
-		       (unsigned long long)bstart,
-		       (unsigned long long)bend);
-	else
-		printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
-		       (unsigned long long)pstart,
-		       (unsigned long long)pend);
+	printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
+	       (unsigned long long)pstart,
+	       (unsigned long long)pend);
 }
 
 /*
@@ -406,7 +395,7 @@ map_single(struct device *hwdev, struct swiotlb_phys_addr buffer, size_t size, i
 	struct swiotlb_phys_addr slot_buf;
 
 	mask = dma_get_seg_boundary(hwdev);
-	start_dma_addr = swiotlb_virt_to_bus(io_tlb_start) & mask;
+	start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask;
 
 	offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
 
@@ -585,7 +574,9 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		dma_mask = hwdev->coherent_dma_mask;
 
 	ret = (void *)__get_free_pages(flags, order);
-	if (ret && !is_buffer_dma_capable(dma_mask, swiotlb_virt_to_bus(ret), size)) {
+	if (ret &&
+	    !is_buffer_dma_capable(dma_mask, swiotlb_virt_to_bus(hwdev, ret),
+				   size)) {
 		/*
 		 * The allocated memory isn't reachable by the device.
 		 * Fall back on swiotlb_map_single().
@@ -609,7 +600,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	}
 
 	memset(ret, 0, size);
-	dev_addr = swiotlb_virt_to_bus(ret);
+	dev_addr = swiotlb_virt_to_bus(hwdev, ret);
 
 	/* Confirm address can be DMA'd by device */
 	if (!is_buffer_dma_capable(dma_mask, dev_addr, size)) {
@@ -669,7 +660,7 @@ dma_addr_t
 swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 			 int dir, struct dma_attrs *attrs)
 {
-	dma_addr_t dev_addr = swiotlb_virt_to_bus(ptr);
+	dma_addr_t dev_addr = swiotlb_virt_to_bus(hwdev, ptr);
 	void *map;
 	struct swiotlb_phys_addr buffer;
 
@@ -694,7 +685,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 		map = io_tlb_overflow_buffer;
 	}
 
-	dev_addr = swiotlb_virt_to_bus(map);
+	dev_addr = swiotlb_virt_to_bus(hwdev, map);
 
 	/*
 	 * Ensure that the address returned is DMA'ble
@@ -840,7 +831,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		dev_addr = swiotlb_sg_to_bus(sg);
+		dev_addr = swiotlb_sg_to_bus(hwdev, sg);
 		if (range_needs_mapping(sg_virt(sg), sg->length) ||
 		    address_needs_mapping(hwdev, dev_addr, sg->length)) {
 			void *map;
@@ -856,7 +847,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 				sgl[0].dma_length = 0;
 				return 0;
 			}
-			sg->dma_address = swiotlb_virt_to_bus(map);
+			sg->dma_address = swiotlb_virt_to_bus(hwdev, map);
 		} else
 			sg->dma_address = dev_addr;
 		sg->dma_length = sg->length;
@@ -886,7 +877,7 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != swiotlb_sg_to_bus(sg))
+		if (sg->dma_address != swiotlb_sg_to_bus(hwdev, sg))
 			unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				     sg->dma_length, dir);
 		else if (dir == DMA_FROM_DEVICE)
@@ -919,7 +910,7 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != swiotlb_sg_to_bus(sg))
+		if (sg->dma_address != swiotlb_sg_to_bus(hwdev, sg))
 			sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				    sg->dma_length, dir, target);
 		else if (dir == DMA_FROM_DEVICE)
@@ -944,7 +935,7 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
-	return (dma_addr == swiotlb_virt_to_bus(io_tlb_overflow_buffer));
+	return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer));
 }
 
 /*
@@ -956,7 +947,7 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 int
 swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-	return swiotlb_virt_to_bus(io_tlb_end - 1) <= mask;
+	return swiotlb_virt_to_bus(hwdev, io_tlb_end - 1) <= mask;
 }
 
 EXPORT_SYMBOL(swiotlb_map_single);
-- 
cgit v1.3-14-g43fede


From bc40ac66988a7721f2a244b6df65f8c13d16479c Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Mon, 22 Dec 2008 10:26:08 -0800
Subject: swiotlb: store phys address in io_tlb_orig_addr array

Impact: refactor code, cleanup

When we enable swiotlb for platforms that support HIGHMEM, we
can no longer store the virtual address of the original dma
buffer, because that buffer might not have a permanent mapping.

Change the swiotlb code to instead store the physical address of
the original buffer.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 120 +++++++++++++++-------------------------------------------
 1 file changed, 30 insertions(+), 90 deletions(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 3657da8ebbc3..98a7a4450e02 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -89,10 +89,7 @@ static unsigned int io_tlb_index;
  * We need to save away the original address corresponding to a mapped entry
  * for the sync operations.
  */
-static struct swiotlb_phys_addr {
-	struct page *page;
-	unsigned int offset;
-} *io_tlb_orig_addr;
+static phys_addr_t *io_tlb_orig_addr;
 
 /*
  * Protect the above data structures in the map and unmap calls
@@ -204,7 +201,7 @@ swiotlb_init_with_default_size(size_t default_size)
 	for (i = 0; i < io_tlb_nslabs; i++)
  		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
 	io_tlb_index = 0;
-	io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(struct swiotlb_phys_addr));
+	io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
 
 	/*
 	 * Get the overflow emergency buffer
@@ -278,12 +275,14 @@ swiotlb_late_init_with_default_size(size_t default_size)
  		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
 	io_tlb_index = 0;
 
-	io_tlb_orig_addr = (struct swiotlb_phys_addr *)__get_free_pages(GFP_KERNEL,
-	                           get_order(io_tlb_nslabs * sizeof(struct swiotlb_phys_addr)));
+	io_tlb_orig_addr = (phys_addr_t *)
+		__get_free_pages(GFP_KERNEL,
+				 get_order(io_tlb_nslabs *
+					   sizeof(phys_addr_t)));
 	if (!io_tlb_orig_addr)
 		goto cleanup3;
 
-	memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(struct swiotlb_phys_addr));
+	memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t));
 
 	/*
 	 * Get the overflow emergency buffer
@@ -298,8 +297,8 @@ swiotlb_late_init_with_default_size(size_t default_size)
 	return 0;
 
 cleanup4:
-	free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
-	                                                      sizeof(char *)));
+	free_pages((unsigned long)io_tlb_orig_addr,
+		   get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
 	io_tlb_orig_addr = NULL;
 cleanup3:
 	free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
@@ -330,59 +329,11 @@ static int is_swiotlb_buffer(char *addr)
 	return addr >= io_tlb_start && addr < io_tlb_end;
 }
 
-static struct swiotlb_phys_addr swiotlb_bus_to_phys_addr(char *dma_addr)
-{
-	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
-	struct swiotlb_phys_addr buffer = io_tlb_orig_addr[index];
-	buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1);
-	buffer.page += buffer.offset >> PAGE_SHIFT;
-	buffer.offset &= PAGE_SIZE - 1;
-	return buffer;
-}
-
-static void
-__sync_single(struct swiotlb_phys_addr buffer, char *dma_addr, size_t size, int dir)
-{
-	if (PageHighMem(buffer.page)) {
-		size_t len, bytes;
-		char *dev, *host, *kmp;
-
-		len = size;
-		while (len != 0) {
-			unsigned long flags;
-
-			bytes = len;
-			if ((bytes + buffer.offset) > PAGE_SIZE)
-				bytes = PAGE_SIZE - buffer.offset;
-			local_irq_save(flags); /* protects KM_BOUNCE_READ */
-			kmp  = kmap_atomic(buffer.page, KM_BOUNCE_READ);
-			dev  = dma_addr + size - len;
-			host = kmp + buffer.offset;
-			if (dir == DMA_FROM_DEVICE)
-				memcpy(host, dev, bytes);
-			else
-				memcpy(dev, host, bytes);
-			kunmap_atomic(kmp, KM_BOUNCE_READ);
-			local_irq_restore(flags);
-			len -= bytes;
-			buffer.page++;
-			buffer.offset = 0;
-		}
-	} else {
-		void *v = page_address(buffer.page) + buffer.offset;
-
-		if (dir == DMA_TO_DEVICE)
-			memcpy(dma_addr, v, size);
-		else
-			memcpy(v, dma_addr, size);
-	}
-}
-
 /*
  * Allocates bounce buffer and returns its kernel virtual address.
  */
 static void *
-map_single(struct device *hwdev, struct swiotlb_phys_addr buffer, size_t size, int dir)
+map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
 {
 	unsigned long flags;
 	char *dma_addr;
@@ -392,7 +343,6 @@ map_single(struct device *hwdev, struct swiotlb_phys_addr buffer, size_t size, i
 	unsigned long mask;
 	unsigned long offset_slots;
 	unsigned long max_slots;
-	struct swiotlb_phys_addr slot_buf;
 
 	mask = dma_get_seg_boundary(hwdev);
 	start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask;
@@ -477,15 +427,10 @@ found:
 	 * This is needed when we sync the memory.  Then we sync the buffer if
 	 * needed.
 	 */
-	slot_buf = buffer;
-	for (i = 0; i < nslots; i++) {
-		slot_buf.page += slot_buf.offset >> PAGE_SHIFT;
-		slot_buf.offset &= PAGE_SIZE - 1;
-		io_tlb_orig_addr[index+i] = slot_buf;
-		slot_buf.offset += 1 << IO_TLB_SHIFT;
-	}
+	for (i = 0; i < nslots; i++)
+		io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
-		__sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
+		memcpy(dma_addr, phys_to_virt(phys), size);
 
 	return dma_addr;
 }
@@ -499,17 +444,17 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 	unsigned long flags;
 	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
 	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
-	struct swiotlb_phys_addr buffer = swiotlb_bus_to_phys_addr(dma_addr);
+	phys_addr_t phys = io_tlb_orig_addr[index];
 
 	/*
 	 * First, sync the memory before unmapping the entry
 	 */
-	if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
+	if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
 		/*
 		 * bounce... copy the data back into the original buffer * and
 		 * delete the bounce buffer.
 		 */
-		__sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
+		memcpy(phys_to_virt(phys), dma_addr, size);
 
 	/*
 	 * Return the buffer to the free list by setting the corresponding
@@ -541,18 +486,21 @@ static void
 sync_single(struct device *hwdev, char *dma_addr, size_t size,
 	    int dir, int target)
 {
-	struct swiotlb_phys_addr buffer = swiotlb_bus_to_phys_addr(dma_addr);
+	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+	phys_addr_t phys = io_tlb_orig_addr[index];
+
+	phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
 
 	switch (target) {
 	case SYNC_FOR_CPU:
 		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-			__sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
+			memcpy(phys_to_virt(phys), dma_addr, size);
 		else
 			BUG_ON(dir != DMA_TO_DEVICE);
 		break;
 	case SYNC_FOR_DEVICE:
 		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-			__sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
+			memcpy(dma_addr, phys_to_virt(phys), size);
 		else
 			BUG_ON(dir != DMA_FROM_DEVICE);
 		break;
@@ -591,10 +539,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		 * swiotlb_map_single(), which will grab memory from
 		 * the lowest available address range.
 		 */
-		struct swiotlb_phys_addr buffer;
-		buffer.page = virt_to_page(NULL);
-		buffer.offset = 0;
-		ret = map_single(hwdev, buffer, size, DMA_FROM_DEVICE);
+		ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
 		if (!ret)
 			return NULL;
 	}
@@ -662,7 +607,6 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 {
 	dma_addr_t dev_addr = swiotlb_virt_to_bus(hwdev, ptr);
 	void *map;
-	struct swiotlb_phys_addr buffer;
 
 	BUG_ON(dir == DMA_NONE);
 	/*
@@ -677,9 +621,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 	/*
 	 * Oh well, have to allocate and map a bounce buffer.
 	 */
-	buffer.page   = virt_to_page(ptr);
-	buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
-	map = map_single(hwdev, buffer, size, dir);
+	map = map_single(hwdev, virt_to_phys(ptr), size, dir);
 	if (!map) {
 		swiotlb_full(hwdev, size, dir, 1);
 		map = io_tlb_overflow_buffer;
@@ -824,20 +766,18 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 		     int dir, struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
-	struct swiotlb_phys_addr buffer;
-	dma_addr_t dev_addr;
 	int i;
 
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		dev_addr = swiotlb_sg_to_bus(hwdev, sg);
-		if (range_needs_mapping(sg_virt(sg), sg->length) ||
+		void *addr = sg_virt(sg);
+		dma_addr_t dev_addr = swiotlb_virt_to_bus(hwdev, addr);
+
+		if (range_needs_mapping(addr, sg->length) ||
 		    address_needs_mapping(hwdev, dev_addr, sg->length)) {
-			void *map;
-			buffer.page   = sg_page(sg);
-			buffer.offset = sg->offset;
-			map = map_single(hwdev, buffer, sg->length, dir);
+			void *map = map_single(hwdev, sg_phys(sg),
+					       sg->length, dir);
 			if (!map) {
 				/* Don't panic here, we expect map_sg users
 				   to do proper error handling. */
-- 
cgit v1.3-14-g43fede


From fb05a37929e0cd99016b4f5e5a5ef077fb10a947 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Mon, 22 Dec 2008 10:26:09 -0800
Subject: swiotlb: add support for systems with highmem

Impact: extend code for highmem - existing users unaffected

On highmem systems, the original dma buffer might not
have a virtual mapping - we need to kmap it in to perform
the bounce.  Extract the code that does the actual
copy into a function that does the kmap if highmem
is enabled, and default to the normal swiotlb memcpy
if not.

[ ported by Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> ]

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 68 ++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 51 insertions(+), 17 deletions(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 98a7a4450e02..785046e4c3ab 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -14,6 +14,7 @@
  * 04/07/.. ak		Better overflow handling. Assorted fixes.
  * 05/09/10 linville	Add support for syncing ranges, support syncing for
  *			DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
+ * 08/12/11 beckyb	Add highmem support
  */
 
 #include <linux/cache.h>
@@ -24,6 +25,7 @@
 #include <linux/swiotlb.h>
 #include <linux/string.h>
 #include <linux/swiotlb.h>
+#include <linux/pfn.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
 #include <linux/highmem.h>
@@ -149,11 +151,6 @@ int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
 	return 0;
 }
 
-static dma_addr_t swiotlb_sg_to_bus(struct device *hwdev, struct scatterlist *sg)
-{
-	return swiotlb_phys_to_bus(hwdev, page_to_phys(sg_page(sg)) + sg->offset);
-}
-
 static void swiotlb_print_info(unsigned long bytes)
 {
 	phys_addr_t pstart, pend;
@@ -329,6 +326,47 @@ static int is_swiotlb_buffer(char *addr)
 	return addr >= io_tlb_start && addr < io_tlb_end;
 }
 
+/*
+ * Bounce: copy the swiotlb buffer back to the original dma location
+ */
+static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+			   enum dma_data_direction dir)
+{
+	unsigned long pfn = PFN_DOWN(phys);
+
+	if (PageHighMem(pfn_to_page(pfn))) {
+		/* The buffer does not have a mapping.  Map it in and copy */
+		unsigned int offset = phys & ~PAGE_MASK;
+		char *buffer;
+		unsigned int sz = 0;
+		unsigned long flags;
+
+		while (size) {
+			sz = min(PAGE_SIZE - offset, size);
+
+			local_irq_save(flags);
+			buffer = kmap_atomic(pfn_to_page(pfn),
+					     KM_BOUNCE_READ);
+			if (dir == DMA_TO_DEVICE)
+				memcpy(dma_addr, buffer + offset, sz);
+			else
+				memcpy(buffer + offset, dma_addr, sz);
+			kunmap_atomic(buffer, KM_BOUNCE_READ);
+			local_irq_restore(flags);
+
+			size -= sz;
+			pfn++;
+			dma_addr += sz;
+			offset = 0;
+		}
+	} else {
+		if (dir == DMA_TO_DEVICE)
+			memcpy(dma_addr, phys_to_virt(phys), size);
+		else
+			memcpy(phys_to_virt(phys), dma_addr, size);
+	}
+}
+
 /*
  * Allocates bounce buffer and returns its kernel virtual address.
  */
@@ -430,7 +468,7 @@ found:
 	for (i = 0; i < nslots; i++)
 		io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
-		memcpy(dma_addr, phys_to_virt(phys), size);
+		swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
 
 	return dma_addr;
 }
@@ -450,11 +488,7 @@ unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 	 * First, sync the memory before unmapping the entry
 	 */
 	if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
-		/*
-		 * bounce... copy the data back into the original buffer * and
-		 * delete the bounce buffer.
-		 */
-		memcpy(phys_to_virt(phys), dma_addr, size);
+		swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
 
 	/*
 	 * Return the buffer to the free list by setting the corresponding
@@ -494,13 +528,13 @@ sync_single(struct device *hwdev, char *dma_addr, size_t size,
 	switch (target) {
 	case SYNC_FOR_CPU:
 		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-			memcpy(phys_to_virt(phys), dma_addr, size);
+			swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
 		else
 			BUG_ON(dir != DMA_TO_DEVICE);
 		break;
 	case SYNC_FOR_DEVICE:
 		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-			memcpy(dma_addr, phys_to_virt(phys), size);
+			swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
 		else
 			BUG_ON(dir != DMA_FROM_DEVICE);
 		break;
@@ -817,11 +851,11 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != swiotlb_sg_to_bus(hwdev, sg))
+		if (sg->dma_address != swiotlb_virt_to_bus(hwdev, sg_virt(sg)))
 			unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				     sg->dma_length, dir);
 		else if (dir == DMA_FROM_DEVICE)
-			dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
+			dma_mark_clean(sg_virt(sg), sg->dma_length);
 	}
 }
 EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
@@ -850,11 +884,11 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != swiotlb_sg_to_bus(hwdev, sg))
+		if (sg->dma_address != swiotlb_virt_to_bus(hwdev, sg_virt(sg)))
 			sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				    sg->dma_length, dir, target);
 		else if (dir == DMA_FROM_DEVICE)
-			dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
+			dma_mark_clean(sg_virt(sg), sg->dma_length);
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From ac86ccc637458dea53077823f98bc3e1b874d478 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Sun, 28 Dec 2008 15:02:06 +0900
Subject: swiotlb: remove unnecessary declaration

Impact: cleanup

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 785046e4c3ab..5a7d0a4a9da3 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/swiotlb.h>
 #include <linux/string.h>
 #include <linux/swiotlb.h>
 #include <linux/pfn.h>
@@ -777,8 +776,6 @@ swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
 				  SYNC_FOR_DEVICE);
 }
 
-void swiotlb_unmap_sg_attrs(struct device *, struct scatterlist *, int, int,
-			    struct dma_attrs *);
 /*
  * Map a set of buffers described by scatterlist in streaming mode for DMA.
  * This is the scatter-gather version of the above swiotlb_map_single
-- 
cgit v1.3-14-g43fede


From 874d6a9555516cbac828131d573ce3d16ce47164 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Sun, 28 Dec 2008 15:02:07 +0900
Subject: swiotlb: clean up EXPORT_SYMBOL usage

Impact: cleanup

swiotlb uses EXPORT_SYMBOL in an inconsistent way. Some functions use
EXPORT_SYMBOL at the end of functions. Some use it at the end of
swiotlb.c.

This cleans up swiotlb to use EXPORT_SYMBOL in a consistent way (at
the end of functions).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 5a7d0a4a9da3..ef3cfe58a157 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -593,6 +593,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	*dma_handle = dev_addr;
 	return ret;
 }
+EXPORT_SYMBOL(swiotlb_alloc_coherent);
 
 void
 swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
@@ -605,6 +606,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
 		unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
 }
+EXPORT_SYMBOL(swiotlb_free_coherent);
 
 static void
 swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
@@ -677,6 +679,7 @@ swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
 {
 	return swiotlb_map_single_attrs(hwdev, ptr, size, dir, NULL);
 }
+EXPORT_SYMBOL(swiotlb_map_single);
 
 /*
  * Unmap a single streaming mode DMA translation.  The dma_addr and size must
@@ -706,6 +709,8 @@ swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
 {
 	return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL);
 }
+EXPORT_SYMBOL(swiotlb_unmap_single);
+
 /*
  * Make physical memory consistent for a single streaming mode DMA translation
  * after a transfer.
@@ -735,6 +740,7 @@ swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
 {
 	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
 }
+EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
 
 void
 swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
@@ -742,6 +748,7 @@ swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
 {
 	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
 }
+EXPORT_SYMBOL(swiotlb_sync_single_for_device);
 
 /*
  * Same as above, but for a sub-range of the mapping.
@@ -767,6 +774,7 @@ swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
 	swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
 				  SYNC_FOR_CPU);
 }
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu);
 
 void
 swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
@@ -775,6 +783,7 @@ swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
 	swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
 				  SYNC_FOR_DEVICE);
 }
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
 
 /*
  * Map a set of buffers described by scatterlist in streaming mode for DMA.
@@ -833,6 +842,7 @@ swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
 {
 	return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
 }
+EXPORT_SYMBOL(swiotlb_map_sg);
 
 /*
  * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
@@ -863,6 +873,7 @@ swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
 {
 	return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
 }
+EXPORT_SYMBOL(swiotlb_unmap_sg);
 
 /*
  * Make physical memory consistent for a set of streaming mode DMA translations
@@ -895,6 +906,7 @@ swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
 {
 	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
 }
+EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
 
 void
 swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
@@ -902,12 +914,14 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 {
 	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
 }
+EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
 
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
 	return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer));
 }
+EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 
 /*
  * Return whether the given device DMA address mask can be supported
@@ -920,18 +934,4 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
 	return swiotlb_virt_to_bus(hwdev, io_tlb_end - 1) <= mask;
 }
-
-EXPORT_SYMBOL(swiotlb_map_single);
-EXPORT_SYMBOL(swiotlb_unmap_single);
-EXPORT_SYMBOL(swiotlb_map_sg);
-EXPORT_SYMBOL(swiotlb_unmap_sg);
-EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
-EXPORT_SYMBOL(swiotlb_sync_single_for_device);
-EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu);
-EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
-EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
-EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
-EXPORT_SYMBOL(swiotlb_dma_mapping_error);
-EXPORT_SYMBOL(swiotlb_alloc_coherent);
-EXPORT_SYMBOL(swiotlb_free_coherent);
 EXPORT_SYMBOL(swiotlb_dma_supported);
-- 
cgit v1.3-14-g43fede


From 773ff60e841461cb1f9374a713ffcda029b8c317 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 23 Dec 2008 19:37:01 +0900
Subject: SLUB: failslab support

Currently the failslab fault-injection capability is only available
when the SLAB allocator is used. This patch makes it available to SLUB, too.

[penberg@cs.helsinki.fi: unify slab and slub implementations]
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/fault-inject.h |  9 ++++++
 lib/Kconfig.debug            |  1 +
 mm/Makefile                  |  1 +
 mm/failslab.c                | 59 ++++++++++++++++++++++++++++++++++
 mm/slab.c                    | 75 +++-----------------------------------------
 mm/slub.c                    |  4 +++
 6 files changed, 79 insertions(+), 70 deletions(-)
 create mode 100644 mm/failslab.c

(limited to 'lib')

diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 32368c4f0326..06ca9b21dad2 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -81,4 +81,13 @@ static inline void cleanup_fault_attr_dentries(struct fault_attr *attr)
 
 #endif /* CONFIG_FAULT_INJECTION */
 
+#ifdef CONFIG_FAILSLAB
+extern bool should_failslab(size_t size, gfp_t gfpflags);
+#else
+static inline bool should_failslab(size_t size, gfp_t gfpflags)
+{
+	return false;
+}
+#endif /* CONFIG_FAILSLAB */
+
 #endif /* _LINUX_FAULT_INJECT_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e443bc..af65ae7f0549 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -699,6 +699,7 @@ config FAULT_INJECTION
 config FAILSLAB
 	bool "Fault-injection capability for kmalloc"
 	depends on FAULT_INJECTION
+	depends on SLAB || SLUB
 	help
 	  Provide fault-injection capability for kmalloc.
 
diff --git a/mm/Makefile b/mm/Makefile
index c06b45a1ff5f..51c27709cc7c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
diff --git a/mm/failslab.c b/mm/failslab.c
new file mode 100644
index 000000000000..7c6ea6493f80
--- /dev/null
+++ b/mm/failslab.c
@@ -0,0 +1,59 @@
+#include <linux/fault-inject.h>
+
+static struct {
+	struct fault_attr attr;
+	u32 ignore_gfp_wait;
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+	struct dentry *ignore_gfp_wait_file;
+#endif
+} failslab = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_wait = 1,
+};
+
+bool should_failslab(size_t size, gfp_t gfpflags)
+{
+	if (gfpflags & __GFP_NOFAIL)
+		return false;
+
+        if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+		return false;
+
+	return should_fail(&failslab.attr, size);
+}
+
+static int __init setup_failslab(char *str)
+{
+	return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init failslab_debugfs_init(void)
+{
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+	int err;
+
+	err = init_fault_attr_dentries(&failslab.attr, "failslab");
+	if (err)
+		return err;
+	dir = failslab.attr.dentries.dir;
+
+	failslab.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				      &failslab.ignore_gfp_wait);
+
+	if (!failslab.ignore_gfp_wait_file) {
+		err = -ENOMEM;
+		debugfs_remove(failslab.ignore_gfp_wait_file);
+		cleanup_fault_attr_dentries(&failslab.attr);
+	}
+
+	return err;
+}
+
+late_initcall(failslab_debugfs_init);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/mm/slab.c b/mm/slab.c
index 09187517f9dc..c347dd8480cc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3106,79 +3106,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
 #endif
 
-#ifdef CONFIG_FAILSLAB
-
-static struct failslab_attr {
-
-	struct fault_attr attr;
-
-	u32 ignore_gfp_wait;
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-	struct dentry *ignore_gfp_wait_file;
-#endif
-
-} failslab = {
-	.attr = FAULT_ATTR_INITIALIZER,
-	.ignore_gfp_wait = 1,
-};
-
-static int __init setup_failslab(char *str)
-{
-	return setup_fault_attr(&failslab.attr, str);
-}
-__setup("failslab=", setup_failslab);
-
-static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
 {
 	if (cachep == &cache_cache)
-		return 0;
-	if (flags & __GFP_NOFAIL)
-		return 0;
-	if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
-		return 0;
+		return false;
 
-	return should_fail(&failslab.attr, obj_size(cachep));
+	return should_failslab(obj_size(cachep), flags);
 }
 
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init failslab_debugfs(void)
-{
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
-	struct dentry *dir;
-	int err;
-
-	err = init_fault_attr_dentries(&failslab.attr, "failslab");
-	if (err)
-		return err;
-	dir = failslab.attr.dentries.dir;
-
-	failslab.ignore_gfp_wait_file =
-		debugfs_create_bool("ignore-gfp-wait", mode, dir,
-				      &failslab.ignore_gfp_wait);
-
-	if (!failslab.ignore_gfp_wait_file) {
-		err = -ENOMEM;
-		debugfs_remove(failslab.ignore_gfp_wait_file);
-		cleanup_fault_attr_dentries(&failslab.attr);
-	}
-
-	return err;
-}
-
-late_initcall(failslab_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else /* CONFIG_FAILSLAB */
-
-static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
-{
-	return 0;
-}
-
-#endif /* CONFIG_FAILSLAB */
-
 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *objp;
@@ -3381,7 +3316,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	unsigned long save_flags;
 	void *ptr;
 
-	if (should_failslab(cachep, flags))
+	if (slab_should_failslab(cachep, flags))
 		return NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
@@ -3457,7 +3392,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 	unsigned long save_flags;
 	void *objp;
 
-	if (should_failslab(cachep, flags))
+	if (slab_should_failslab(cachep, flags))
 		return NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
diff --git a/mm/slub.c b/mm/slub.c
index a2cd47d89e0a..640fde7e354c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -24,6 +24,7 @@
 #include <linux/kallsyms.h>
 #include <linux/memory.h>
 #include <linux/math64.h>
+#include <linux/fault-inject.h>
 
 /*
  * Lock order:
@@ -1591,6 +1592,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 	unsigned long flags;
 	unsigned int objsize;
 
+	if (should_failslab(s->objsize, gfpflags))
+		return NULL;
+
 	local_irq_save(flags);
 	c = get_cpu_slab(s, smp_processor_id());
 	objsize = c->objsize;
-- 
cgit v1.3-14-g43fede


From ea319518ba3de282c13ae1cf4bf2215c5e03e67e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Dec 2008 15:08:55 +0100
Subject: locking, percpu counters: introduce separate lock classes

Impact: fix lockdep false positives

Classify percpu_counter instances similar to regular lock objects --
that is, per instantiation site.

The networking code has increased its use of percpu_counters, which
leads to false positives if they are treated as a single class.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/percpu_counter.h | 14 ++++++++++----
 lib/percpu_counter.c           | 18 ++++--------------
 lib/proportions.c              |  6 +++---
 mm/backing-dev.c               |  2 +-
 4 files changed, 18 insertions(+), 22 deletions(-)

(limited to 'lib')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccdfc112..96bdde36599f 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -30,8 +30,16 @@ struct percpu_counter {
 #define FBC_BATCH	(NR_CPUS*4)
 #endif
 
-int percpu_counter_init(struct percpu_counter *fbc, s64 amount);
-int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
+int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
+			  struct lock_class_key *key);
+
+#define percpu_counter_init(fbc, value)					\
+	({								\
+		static struct lock_class_key __key;			\
+									\
+		__percpu_counter_init(fbc, value, &__key);		\
+	})
+
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
@@ -85,8 +93,6 @@ static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 	return 0;
 }
 
-#define percpu_counter_init_irq percpu_counter_init
-
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 {
 }
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index a8663890a88c..c7fe2e4e8ed1 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -71,11 +71,11 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-static struct lock_class_key percpu_counter_irqsafe;
-
-int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
+			  struct lock_class_key *key)
 {
 	spin_lock_init(&fbc->lock);
+	lockdep_set_class(&fbc->lock, key);
 	fbc->count = amount;
 	fbc->counters = alloc_percpu(s32);
 	if (!fbc->counters)
@@ -87,17 +87,7 @@ int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(percpu_counter_init);
-
-int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)
-{
-	int err;
-
-	err = percpu_counter_init(fbc, amount);
-	if (!err)
-		lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
-	return err;
-}
+EXPORT_SYMBOL(__percpu_counter_init);
 
 void percpu_counter_destroy(struct percpu_counter *fbc)
 {
diff --git a/lib/proportions.c b/lib/proportions.c
index 4f387a643d72..7367f2b727d0 100644
--- a/lib/proportions.c
+++ b/lib/proportions.c
@@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift)
 	pd->index = 0;
 	pd->pg[0].shift = shift;
 	mutex_init(&pd->mutex);
-	err = percpu_counter_init_irq(&pd->pg[0].events, 0);
+	err = percpu_counter_init(&pd->pg[0].events, 0);
 	if (err)
 		goto out;
 
-	err = percpu_counter_init_irq(&pd->pg[1].events, 0);
+	err = percpu_counter_init(&pd->pg[1].events, 0);
 	if (err)
 		percpu_counter_destroy(&pd->pg[0].events);
 
@@ -191,7 +191,7 @@ int prop_local_init_percpu(struct prop_local_percpu *pl)
 	spin_lock_init(&pl->lock);
 	pl->shift = 0;
 	pl->period = 0;
-	return percpu_counter_init_irq(&pl->events, 0);
+	return percpu_counter_init(&pl->events, 0);
 }
 
 void prop_local_destroy_percpu(struct prop_local_percpu *pl)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f2e574dbc300..f3b125857827 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -220,7 +220,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_prop_frac = PROP_FRAC_BASE;
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
 		if (err)
 			goto err;
 	}
-- 
cgit v1.3-14-g43fede


From e9690a6e4b1615cb0102e425e04b7ce29e7858e2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 31 Dec 2008 16:45:50 +0800
Subject: cpumask: fix bogus kernel-doc

Impact: fix kernel-doc

alloc_bootmem_cpumask_var() returns void.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 lib/cpumask.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/cpumask.c b/lib/cpumask.c
index a24edf137f41..8e1496cb63f7 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -132,7 +132,7 @@ EXPORT_SYMBOL(alloc_cpumask_var);
  * @mask: pointer to cpumask_var_t where the cpumask is returned
  *
  * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
- * a nop returning a constant 1 (in <linux/cpumask.h>)
+ * a nop (in <linux/cpumask.h>).
  * Either returns an allocated (zero-filled) cpumask, or causes the
  * system to panic.
  */
-- 
cgit v1.3-14-g43fede


From ab53d472e785e51fdfc08fc1d66252c1153e6c0f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:19 +1030
Subject: bitmap: find_last_bit()

Impact: New API

As the name suggests.  For the moment everyone uses the generic one.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/bitops.h | 13 ++++++++++++-
 lib/Kconfig            |  4 ++++
 lib/Makefile           |  1 +
 lib/find_last_bit.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 lib/find_last_bit.c

(limited to 'lib')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 024f2b027244..61829139795a 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -134,9 +134,20 @@ extern unsigned long find_first_bit(const unsigned long *addr,
  */
 extern unsigned long find_first_zero_bit(const unsigned long *addr,
 					 unsigned long size);
-
 #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
+#ifdef CONFIG_GENERIC_FIND_LAST_BIT
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+extern unsigned long find_last_bit(const unsigned long *addr,
+				   unsigned long size);
+#endif /* CONFIG_GENERIC_FIND_LAST_BIT */
+
 #ifdef CONFIG_GENERIC_FIND_NEXT_BIT
 
 /**
diff --git a/lib/Kconfig b/lib/Kconfig
index 2ba43c4a5b07..fc5f5ee50bc2 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -13,6 +13,10 @@ config GENERIC_FIND_FIRST_BIT
 config GENERIC_FIND_NEXT_BIT
 	bool
 
+config GENERIC_FIND_LAST_BIT
+	bool
+	default y
+
 config CRC_CCITT
 	tristate "CRC-CCITT functions"
 	help
diff --git a/lib/Makefile b/lib/Makefile
index 80fe8a3ec12a..32b0e64ded27 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -37,6 +37,7 @@ lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
+lib-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
 obj-$(CONFIG_PLIST) += plist.o
diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
new file mode 100644
index 000000000000..5d202e36bdd8
--- /dev/null
+++ b/lib/find_last_bit.c
@@ -0,0 +1,45 @@
+/* find_last_bit.c: fallback find last bit implementation
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * Written by Rusty Russell <rusty@rustcorp.com.au>
+ * (Inspired by David Howells' find_next_bit implementation)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <asm/types.h>
+#include <asm/byteorder.h>
+
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+	unsigned long words;
+	unsigned long tmp;
+
+	/* Start at final word. */
+	words = size / BITS_PER_LONG;
+
+	/* Partial final word? */
+	if (size & (BITS_PER_LONG-1)) {
+		tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
+					 - (size & (BITS_PER_LONG-1)))));
+		if (tmp)
+			goto found;
+	}
+
+	while (words) {
+		tmp = addr[--words];
+		if (tmp) {
+found:
+			return words * BITS_PER_LONG + __fls(tmp);
+		}
+	}
+
+	/* Not found */
+	return size;
+}
+EXPORT_SYMBOL(find_last_bit);
-- 
cgit v1.3-14-g43fede


From 2a53008033189ed09bfe241c6b33811ba4ce980d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:30 +1030
Subject: cpumask: zero extra bits in alloc_cpumask_var_node

Impact: extra safety checks during transition

When CONFIG_CPUMASK_OFFSTACK is set, the new cpumask_ operators only
use bits up to nr_cpu_ids, not NR_CPUS.  Using the old cpus_ operators
on these masks can mean accessing undefined bits.

After some discussion, Mike and I decided to err on the side of caution;
we zero the "undefined" bits in alloc_cpumask_var_node() until all the
old cpumask functions are removed.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 lib/cpumask.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'lib')

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 8e1496cb63f7..3389e2440da0 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -107,6 +107,14 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 		dump_stack();
 	}
 #endif
+	/* FIXME: Bandaid to save us from old primitives which go to NR_CPUS. */
+	if (*mask) {
+		char *ptr = (char *)cpumask_bits(*mask); /* offsets are bytes */
+		unsigned int tail;
+		tail = BITS_TO_LONGS(NR_CPUS - nr_cpumask_bits) * sizeof(long);
+		memset(ptr + cpumask_size() - tail, 0, tail);
+	}
+
 	return *mask != NULL;
 }
 EXPORT_SYMBOL(alloc_cpumask_var_node);
-- 
cgit v1.3-14-g43fede


From 8c384cdee3e04d6194a2c2b192b624754f990835 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:30 +1030
Subject: cpumask: CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS

Impact: new debug CONFIG options

This helps find unconverted code.  It currently breaks compile horribly,
but we never wanted a flag day so that's expected.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/cpumask.h | 11 ++++++++++-
 lib/Kconfig             |  4 ++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 7c178a6baae3..9f315382610b 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -144,6 +144,7 @@
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
 extern cpumask_t _unused_cpumask_arg_;
 
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 #define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
 static inline void __cpu_set(int cpu, volatile cpumask_t *dstp)
 {
@@ -267,6 +268,7 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
 {
 	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
 }
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
@@ -304,6 +306,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 	return to_cpumask(p);
 }
 
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 /*
  * In cases where we take the address of the cpumask immediately,
  * gcc optimizes it out (it's a constant) and there's no huge stack
@@ -389,19 +392,22 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 {
 	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
 }
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 
 #if NR_CPUS == 1
 
 #define nr_cpu_ids		1
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 #define first_cpu(src)		({ (void)(src); 0; })
 #define next_cpu(n, src)	({ (void)(src); 1; })
 #define any_online_cpu(mask)	0
 #define for_each_cpu_mask(cpu, mask)	\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 #else /* NR_CPUS > 1 */
 
 extern int nr_cpu_ids;
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 int __first_cpu(const cpumask_t *srcp);
 int __next_cpu(int n, const cpumask_t *srcp);
 int __any_online_cpu(const cpumask_t *mask);
@@ -413,8 +419,10 @@ int __any_online_cpu(const cpumask_t *mask);
 	for ((cpu) = -1;				\
 		(cpu) = next_cpu((cpu), (mask)),	\
 		(cpu) < NR_CPUS; )
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 #endif
 
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 #if NR_CPUS <= 64
 
 #define next_cpu_nr(n, src)		next_cpu(n, src)
@@ -432,6 +440,7 @@ int __next_cpu_nr(int n, const cpumask_t *srcp);
 		(cpu) < nr_cpu_ids; )
 
 #endif /* NR_CPUS > 64 */
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 
 /*
  * The following particular system cpumasks and operations manage
diff --git a/lib/Kconfig b/lib/Kconfig
index fc5f5ee50bc2..03c2c24b9083 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -170,4 +170,8 @@ config CPUMASK_OFFSTACK
 	  them on the stack.  This is a bit more expensive, but avoids
 	  stack overflow.
 
+config DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
+       bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
+       depends on EXPERIMENTAL && BROKEN
+
 endmenu
-- 
cgit v1.3-14-g43fede


From 79ff56ebd3edfb16f8badc558cb439b203a3298f Mon Sep 17 00:00:00 2001
From: Roland Dreier <rdreier@cisco.com>
Date: Tue, 30 Dec 2008 20:18:00 -0800
Subject: swiotlb: add missing __init annotations

Impact: cleanup, reduce kernel size a bit

The current kernel build warns:

    WARNING: vmlinux.o(.text+0x11458): Section mismatch in reference from the function swiotlb_alloc_boot() to the function .init.text:__alloc_bootmem_low()
    The function swiotlb_alloc_boot() references
    the function __init __alloc_bootmem_low().
    This is often because swiotlb_alloc_boot lacks a __init
    annotation or the annotation of __alloc_bootmem_low is wrong.

    WARNING: vmlinux.o(.text+0x1011f2): Section mismatch in reference from the function swiotlb_late_init_with_default_size() to the function .init.text:__alloc_bootmem_low()
    The function swiotlb_late_init_with_default_size() references
    the function __init __alloc_bootmem_low().
    This is often because swiotlb_late_init_with_default_size lacks a __init
    annotation or the annotation of __alloc_bootmem_low is wrong.

and indeed the functions calling __alloc_bootmem_low() can be marked
__init as well.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-swiotlb_64.c | 2 +-
 lib/swiotlb.c                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 242c3440687f..8cba3749a511 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -13,7 +13,7 @@
 
 int swiotlb __read_mostly;
 
-void *swiotlb_alloc_boot(size_t size, unsigned long nslabs)
+void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
 {
 	return alloc_bootmem_low_pages(size);
 }
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index fa2dc4e5f9ba..b6d0aae4fd31 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -116,7 +116,7 @@ setup_io_tlb_npages(char *str)
 __setup("swiotlb=", setup_io_tlb_npages);
 /* make io_tlb_overflow tunable too? */
 
-void * __weak swiotlb_alloc_boot(size_t size, unsigned long nslabs)
+void * __weak __init swiotlb_alloc_boot(size_t size, unsigned long nslabs)
 {
 	return alloc_bootmem_low_pages(size);
 }
-- 
cgit v1.3-14-g43fede


From d97106ab53f812910a62d18afb9dbe882819c1ba Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 3 Jan 2009 11:46:17 -0800
Subject: Make %p print '(null)' for NULL pointers

Before, when we only ever printed out the pointer value itself, a NULL
pointer would never cause issues and might as well be printed out as
just its numeric value.

However, with the extended %p formats, especially %pR, we might validly
want to print out resources for debugging.  And sometimes they don't
even exist, and the resource pointer is just NULL.  Print it out as
such, rather than oopsing.

This is a more generic version of a patch done by Trent Piepho (catching
all %p cases rather than just %pR, and using "(null)" instead of
"[NULL]" to match glibc).

Requested-by: Trent Piepho <xyzzy@speakeasy.org>
Acked-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/vsprintf.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3b777025d876..98d632277ca8 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -661,6 +661,9 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width,
  */
 static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field_width, int precision, int flags)
 {
+	if (!ptr)
+		return string(buf, end, "(null)", field_width, precision, flags);
+
 	switch (*fmt) {
 	case 'F':
 		ptr = dereference_function_descriptor(ptr);
-- 
cgit v1.3-14-g43fede


From 52942b6b16c6ebb25f4dd4df0208d840ba0cbc5c Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sat, 3 Jan 2009 00:16:03 +0100
Subject: swiotlb: Don't include linux/swiotlb.h twice in lib/swiotlb.c

There's no point in including the linux/swiotlb.h header twice in
lib/swiotlb.c - this patch gets rid of the unneeded include.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index b6d0aae4fd31..7f5e21b9c16b 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -23,7 +23,6 @@
 #include <linux/spinlock.h>
 #include <linux/swiotlb.h>
 #include <linux/string.h>
-#include <linux/swiotlb.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
 #include <linux/highmem.h>
-- 
cgit v1.3-14-g43fede


From 8e6bdb7f8b478323525444ecee7a1477ff5b6639 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 27 Nov 2008 11:42:22 +0100
Subject: trivial: radix-tree: document wrap-around issue of
 radix_tree_next_hole()

And some 80-line cleanups.

Signed-off-by: Wu Fengguang <wfg@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 lib/radix-tree.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'lib')

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index be86b32bc874..a7f5217dd67b 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -640,13 +640,14 @@ EXPORT_SYMBOL(radix_tree_tag_get);
  *
  *	Returns: the index of the hole if found, otherwise returns an index
  *	outside of the set specified (in which case 'return - index >= max_scan'
- *	will be true).
+ *	will be true). In rare cases of index wrap-around, 0 will be returned.
  *
  *	radix_tree_next_hole may be called under rcu_read_lock. However, like
- *	radix_tree_gang_lookup, this will not atomically search a snapshot of the
- *	tree at a single point in time. For example, if a hole is created at index
- *	5, then subsequently a hole is created at index 10, radix_tree_next_hole
- *	covering both indexes may return 10 if called under rcu_read_lock.
+ *	radix_tree_gang_lookup, this will not atomically search a snapshot of
+ *	the tree at a single point in time. For example, if a hole is created
+ *	at index 5, then subsequently a hole is created at index 10,
+ *	radix_tree_next_hole covering both indexes may return 10 if called
+ *	under rcu_read_lock.
  */
 unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan)
-- 
cgit v1.3-14-g43fede


From 210272a28465a7a31bcd580d2f9529f924965aa5 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 16 Oct 2008 14:57:54 -0600
Subject: driver core: Remove completion from struct klist_node

Removing the completion from klist_node reduces its size from 64 bytes
to 28 on x86-64.  To maintain the semantics of klist_remove(), we add
a single list of klist nodes which are pending deletion and scan them.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/klist.h |  2 --
 lib/klist.c           | 43 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 40 insertions(+), 5 deletions(-)

(limited to 'lib')

diff --git a/include/linux/klist.h b/include/linux/klist.h
index 8ea98db223e5..d5a27af9dba5 100644
--- a/include/linux/klist.h
+++ b/include/linux/klist.h
@@ -13,7 +13,6 @@
 #define _LINUX_KLIST_H
 
 #include <linux/spinlock.h>
-#include <linux/completion.h>
 #include <linux/kref.h>
 #include <linux/list.h>
 
@@ -41,7 +40,6 @@ struct klist_node {
 	void			*n_klist;	/* never access directly */
 	struct list_head	n_node;
 	struct kref		n_ref;
-	struct completion	n_removed;
 };
 
 extern void klist_add_tail(struct klist_node *n, struct klist *k);
diff --git a/lib/klist.c b/lib/klist.c
index bbdd3015c2c7..573d6068a42e 100644
--- a/lib/klist.c
+++ b/lib/klist.c
@@ -36,6 +36,7 @@
 
 #include <linux/klist.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 
 /*
  * Use the lowest bit of n_klist to mark deleted nodes and exclude
@@ -108,7 +109,6 @@ static void add_tail(struct klist *k, struct klist_node *n)
 static void klist_node_init(struct klist *k, struct klist_node *n)
 {
 	INIT_LIST_HEAD(&n->n_node);
-	init_completion(&n->n_removed);
 	kref_init(&n->n_ref);
 	knode_set_klist(n, k);
 	if (k->get)
@@ -171,13 +171,34 @@ void klist_add_before(struct klist_node *n, struct klist_node *pos)
 }
 EXPORT_SYMBOL_GPL(klist_add_before);
 
+struct klist_waiter {
+	struct list_head list;
+	struct klist_node *node;
+	struct task_struct *process;
+	int woken;
+};
+
+static DEFINE_SPINLOCK(klist_remove_lock);
+static LIST_HEAD(klist_remove_waiters);
+
 static void klist_release(struct kref *kref)
 {
+	struct klist_waiter *waiter, *tmp;
 	struct klist_node *n = container_of(kref, struct klist_node, n_ref);
 
 	WARN_ON(!knode_dead(n));
 	list_del(&n->n_node);
-	complete(&n->n_removed);
+	spin_lock(&klist_remove_lock);
+	list_for_each_entry_safe(waiter, tmp, &klist_remove_waiters, list) {
+		if (waiter->node != n)
+			continue;
+		/* Unlink before waking: waiter is on klist_remove()'s stack. */
+		list_del(&waiter->list);
+		waiter->woken = 1;
+		mb();	/* make woken visible before the wakeup */
+		wake_up_process(waiter->process);
+	}
+	spin_unlock(&klist_remove_lock);
 	knode_set_klist(n, NULL);
 }
 
@@ -217,8 +238,24 @@ EXPORT_SYMBOL_GPL(klist_del);
  */
 void klist_remove(struct klist_node *n)
 {
+	struct klist_waiter waiter;
+
+	waiter.node = n;
+	waiter.process = current;
+	waiter.woken = 0;
+	spin_lock(&klist_remove_lock);
+	list_add(&waiter.list, &klist_remove_waiters);
+	spin_unlock(&klist_remove_lock);
+
 	klist_del(n);
-	wait_for_completion(&n->n_removed);
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (waiter.woken)
+			break;
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
 }
 EXPORT_SYMBOL_GPL(klist_remove);
 
-- 
cgit v1.3-14-g43fede


From c65b9145f40da99cad000f81823265dc70e5fcf9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Nov 2008 13:20:00 +0900
Subject: uevent: don't pass envp_ext[] as format string in
 kobject_uevent_env()

kobject_uevent_env() passes each envp_ext[] entry verbatim as a format
string, which can cause problems ranging from an unexpectedly mangled
string to an oops if an entry contains a substring that can be
interpreted as a format specifier.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 lib/kobject_uevent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 3f914725bda8..ca215bc2329a 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -165,7 +165,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 	/* keys passed in from the caller */
 	if (envp_ext) {
 		for (i = 0; envp_ext[i]; i++) {
-			retval = add_uevent_var(env, envp_ext[i]);
+			retval = add_uevent_var(env, "%s", envp_ext[i]);
 			if (retval)
 				goto exit;
 		}
-- 
cgit v1.3-14-g43fede


From e0d7bf5d580c20ff14d0200b6ab47bc77f99b152 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 16 Nov 2008 18:23:27 +0800
Subject: kobject: return the result of uevent sending by netlink

We need to return the result of uevent sending by netlink
to caller, when uevent_helper is disabled and CONFIG_NET
is defined.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 lib/kobject_uevent.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index ca215bc2329a..318328ddbd1c 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -225,8 +225,10 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 			}
 
 			NETLINK_CB(skb).dst_group = 1;
-			netlink_broadcast(uevent_sock, skb, 0, 1, GFP_KERNEL);
-		}
+			retval = netlink_broadcast(uevent_sock, skb, 0, 1,
+						   GFP_KERNEL);
+		} else
+			retval = -ENOMEM;
 	}
 #endif
 
-- 
cgit v1.3-14-g43fede


From 2e5ba26a483218b51b7b327fb235b348890aea15 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 11 Dec 2008 07:43:17 +0800
Subject: dynamic_printk: reduce one level of indentation

Cleanup pr_debug_write() to reduce one level of indentation.

Cc: Marcel Holtmann <marcel@holtmann.org>
Acked-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 lib/dynamic_printk.c | 58 +++++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

(limited to 'lib')

diff --git a/lib/dynamic_printk.c b/lib/dynamic_printk.c
index 8e30295e8566..165a19763dc9 100644
--- a/lib/dynamic_printk.c
+++ b/lib/dynamic_printk.c
@@ -277,40 +277,34 @@ static ssize_t pr_debug_write(struct file *file, const char __user *buf,
 				dynamic_enabled = DYNAMIC_ENABLED_NONE;
 			}
 			err = 0;
-		} else {
-			if (elem) {
-				if (value && (elem->enable == 0)) {
-					dynamic_printk_enabled |=
-							(1LL << elem->hash1);
-					dynamic_printk_enabled2 |=
-							(1LL << elem->hash2);
-					elem->enable = 1;
-					num_enabled++;
-					dynamic_enabled = DYNAMIC_ENABLED_SOME;
-					err = 0;
-					printk(KERN_DEBUG
-					       "debugging enabled for module %s\n",
-					       elem->name);
-				} else if (!value && (elem->enable == 1)) {
-					elem->enable = 0;
-					num_enabled--;
-					if (disabled_hash(elem->hash1, true))
-						dynamic_printk_enabled &=
+		} else if (elem) {
+			if (value && (elem->enable == 0)) {
+				dynamic_printk_enabled |= (1LL << elem->hash1);
+				dynamic_printk_enabled2 |= (1LL << elem->hash2);
+				elem->enable = 1;
+				num_enabled++;
+				dynamic_enabled = DYNAMIC_ENABLED_SOME;
+				err = 0;
+				printk(KERN_DEBUG
+					"debugging enabled for module %s\n",
+					elem->name);
+			} else if (!value && (elem->enable == 1)) {
+				elem->enable = 0;
+				num_enabled--;
+				if (disabled_hash(elem->hash1, true))
+					dynamic_printk_enabled &=
 							~(1LL << elem->hash1);
-					if (disabled_hash(elem->hash2, false))
-						dynamic_printk_enabled2 &=
+				if (disabled_hash(elem->hash2, false))
+					dynamic_printk_enabled2 &=
 							~(1LL << elem->hash2);
-					if (num_enabled)
-						dynamic_enabled =
-							DYNAMIC_ENABLED_SOME;
-					else
-						dynamic_enabled =
-							DYNAMIC_ENABLED_NONE;
-					err = 0;
-					printk(KERN_DEBUG
-					       "debugging disabled for module "
-					       "%s\n", elem->name);
-				}
+				if (num_enabled)
+					dynamic_enabled = DYNAMIC_ENABLED_SOME;
+				else
+					dynamic_enabled = DYNAMIC_ENABLED_NONE;
+				err = 0;
+				printk(KERN_DEBUG
+					"debugging disabled for module %s\n",
+					elem->name);
 			}
 		}
 	}
-- 
cgit v1.3-14-g43fede


From 94b324864ef2a8e461f3933ab99638255299e9f0 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Tue, 6 Jan 2009 10:44:37 -0800
Subject: swiotlb: struct device - replace bus_id with dev_name(),
 dev_set_name()

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 lib/swiotlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 7f5e21b9c16b..c2a4e6401456 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -647,7 +647,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
 	 * the damage, or panic when the transfer is too big.
 	 */
 	printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at "
-	       "device %s\n", size, dev ? dev->bus_id : "?");
+	       "device %s\n", size, dev ? dev_name(dev) : "?");
 
 	if (size > io_tlb_overflow && do_panic) {
 		if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
-- 
cgit v1.3-14-g43fede


From b61312d353da1871778711040464b10f5cd904df Mon Sep 17 00:00:00 2001
From: Viktor Rosendahl <viktor.rosendahl@nokia.com>
Date: Tue, 6 Jan 2009 14:40:42 -0800
Subject: oops handling: ensure that any oops is flushed to the mtdoops console

This used to work unpatched with older kernels, during the development
phase of mtdoops.  Before commit e3e8a75d2acfc61ebf25524666a0a2c6abb0620c
a space was printed with console_loglevel set to 15, which probably
flushed the oops message as a side effect.

This is another patch from the Nokia N810 kernel.

Signed-off-by: Viktor Rosendahl <viktor.rosendahl@nokia.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/bust_spinlocks.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'lib')

diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c
index 486da62b2b07..9681d54b95d1 100644
--- a/lib/bust_spinlocks.c
+++ b/lib/bust_spinlocks.c
@@ -12,6 +12,7 @@
 #include <linux/tty.h>
 #include <linux/wait.h>
 #include <linux/vt_kern.h>
+#include <linux/console.h>
 
 
 void __attribute__((weak)) bust_spinlocks(int yes)
@@ -22,6 +23,7 @@ void __attribute__((weak)) bust_spinlocks(int yes)
 #ifdef CONFIG_VT
 		unblank_screen();
 #endif
+		console_unblank();
 		if (--oops_in_progress == 0)
 			wake_up_klogd();
 	}
-- 
cgit v1.3-14-g43fede


From f1883f86dea84fe47a71a39fc1afccc005915ed8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Jan 2009 14:40:45 -0800
Subject: Remove remaining unwinder code

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Gabor Gombas <gombasg@sztaki.hu>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/unwind.h | 13 ---------
 arch/x86/kernel/traps.c       |  2 --
 include/linux/module.h        |  3 --
 include/linux/unwind.h        | 68 -------------------------------------------
 init/main.c                   |  3 --
 kernel/module.c               | 15 ----------
 lib/fault-inject.c            |  1 -
 7 files changed, 105 deletions(-)
 delete mode 100644 arch/x86/include/asm/unwind.h
 delete mode 100644 include/linux/unwind.h

(limited to 'lib')

diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
deleted file mode 100644
index 8b064bd9c553..000000000000
--- a/arch/x86/include/asm/unwind.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _ASM_X86_UNWIND_H
-#define _ASM_X86_UNWIND_H
-
-#define UNW_PC(frame) ((void)(frame), 0UL)
-#define UNW_SP(frame) ((void)(frame), 0UL)
-#define UNW_FP(frame) ((void)(frame), 0UL)
-
-static inline int arch_unw_user_mode(const void *info)
-{
-	return 0;
-}
-
-#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ce6650eb64e9..c9a666cdd3db 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -20,7 +20,6 @@
 #include <linux/module.h>
 #include <linux/ptrace.h>
 #include <linux/string.h>
-#include <linux/unwind.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/kexec.h>
@@ -51,7 +50,6 @@
 #include <asm/debugreg.h>
 #include <asm/atomic.h>
 #include <asm/system.h>
-#include <asm/unwind.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
diff --git a/include/linux/module.h b/include/linux/module.h
index 3bfed013350b..03cb93d1865a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -294,9 +294,6 @@ struct module
 	/* The size of the executable code in each section.  */
 	unsigned int init_text_size, core_text_size;
 
-	/* The handle returned from unwind_add_table. */
-	void *unwind_info;
-
 	/* Arch-specific module values */
 	struct mod_arch_specific arch;
 
diff --git a/include/linux/unwind.h b/include/linux/unwind.h
deleted file mode 100644
index 7760860fa170..000000000000
--- a/include/linux/unwind.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef _LINUX_UNWIND_H
-#define _LINUX_UNWIND_H
-
-/*
- * Copyright (C) 2002-2006 Novell, Inc.
- *	Jan Beulich <jbeulich@novell.com>
- * This code is released under version 2 of the GNU GPL.
- *
- * A simple API for unwinding kernel stacks.  This is used for
- * debugging and error reporting purposes.  The kernel doesn't need
- * full-blown stack unwinding with all the bells and whistles, so there
- * is not much point in implementing the full Dwarf2 unwind API.
- */
-
-struct module;
-
-struct unwind_frame_info {};
-
-static inline void unwind_init(void) {}
-static inline void unwind_setup(void) {}
-
-#ifdef CONFIG_MODULES
-
-static inline void *unwind_add_table(struct module *mod,
-                                     const void *table_start,
-                                     unsigned long table_size)
-{
-	return NULL;
-}
-
-static inline void unwind_remove_table(void *handle, int init_only)
-{
-}
-
-#endif
-
-static inline int unwind_init_frame_info(struct unwind_frame_info *info,
-                                         struct task_struct *tsk,
-                                         const struct pt_regs *regs)
-{
-	return -ENOSYS;
-}
-
-static inline int unwind_init_blocked(struct unwind_frame_info *info,
-                                      struct task_struct *tsk)
-{
-	return -ENOSYS;
-}
-
-static inline int unwind_init_running(struct unwind_frame_info *info,
-                                      asmlinkage int (*cb)(struct unwind_frame_info *,
-                                                           void *arg),
-                                      void *arg)
-{
-	return -ENOSYS;
-}
-
-static inline int unwind(struct unwind_frame_info *info)
-{
-	return -ENOSYS;
-}
-
-static inline int unwind_to_user(struct unwind_frame_info *info)
-{
-	return -ENOSYS;
-}
-
-#endif /* _LINUX_UNWIND_H */
diff --git a/init/main.c b/init/main.c
index 90926dadc20d..e119dd28dd7d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -50,7 +50,6 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
-#include <linux/unwind.h>
 #include <linux/buffer_head.h>
 #include <linux/page_cgroup.h>
 #include <linux/debug_locks.h>
@@ -537,7 +536,6 @@ asmlinkage void __init start_kernel(void)
 	 * Need to run as early as possible, to initialize the
 	 * lockdep hash:
 	 */
-	unwind_init();
 	lockdep_init();
 	debug_objects_early_init();
 	cgroup_init_early();
@@ -559,7 +557,6 @@ asmlinkage void __init start_kernel(void)
 	setup_arch(&command_line);
 	mm_init_owner(&init_mm, &init_task);
 	setup_command_line(command_line);
-	unwind_setup();
 	setup_per_cpu_areas();
 	setup_nr_cpu_ids();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
diff --git a/kernel/module.c b/kernel/module.c
index f47cce910f25..34b56cf06615 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
 #include <linux/device.h>
 #include <linux/string.h>
 #include <linux/mutex.h>
-#include <linux/unwind.h>
 #include <linux/rculist.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1449,8 +1448,6 @@ static void free_module(struct module *mod)
 	remove_sect_attrs(mod);
 	mod_kobject_remove(mod);
 
-	unwind_remove_table(mod->unwind_info, 0);
-
 	/* Arch-specific cleanup. */
 	module_arch_cleanup(mod);
 
@@ -1867,7 +1864,6 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int symindex = 0;
 	unsigned int strindex = 0;
 	unsigned int modindex, versindex, infoindex, pcpuindex;
-	unsigned int unwindex = 0;
 	unsigned int num_kp, num_mcount;
 	struct kernel_param *kp;
 	struct module *mod;
@@ -1957,9 +1953,6 @@ static noinline struct module *load_module(void __user *umod,
 	versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
 	infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
 	pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
-#ifdef ARCH_UNWIND_SECTION_NAME
-	unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
-#endif
 
 	/* Don't keep modinfo and version sections. */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1969,8 +1962,6 @@ static noinline struct module *load_module(void __user *umod,
 	sechdrs[symindex].sh_flags |= SHF_ALLOC;
 	sechdrs[strindex].sh_flags |= SHF_ALLOC;
 #endif
-	if (unwindex)
-		sechdrs[unwindex].sh_flags |= SHF_ALLOC;
 
 	/* Check module struct version now, before we try to use module. */
 	if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2267,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod,
 	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
 	add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
 
-	/* Size of section 0 is 0, so this works well if no unwind info. */
-	mod->unwind_info = unwind_add_table(mod,
-					    (void *)sechdrs[unwindex].sh_addr,
-					    sechdrs[unwindex].sh_size);
-
 	/* Get rid of temporary copy */
 	vfree(hdr);
 
@@ -2370,7 +2356,6 @@ sys_init_module(void __user *umod,
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
-	unwind_remove_table(mod->unwind_info, 1);
 	module_free(mod, mod->module_init);
 	mod->module_init = NULL;
 	mod->init_size = 0;
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index a50a311554cc..f97af55bdd96 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -6,7 +6,6 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
-#include <linux/unwind.h>
 #include <linux/stacktrace.h>
 #include <linux/kallsyms.h>
 #include <linux/fault-inject.h>
-- 
cgit v1.3-14-g43fede


From 40bc1f2dbc29ab88176a650e51f2246526105093 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 6 Jan 2009 14:40:49 -0800
Subject: lib: fix sparse shadowed variable warning

pos is always set before being used, no need to declare a
second one inside the if() block.

lib/prio_heap.c:34:7: warning: symbol 'pos' shadows an earlier one
lib/prio_heap.c:30:6: originally declared here

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/prio_heap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/prio_heap.c b/lib/prio_heap.c
index 471944a54e23..a7af6f85eca8 100644
--- a/lib/prio_heap.c
+++ b/lib/prio_heap.c
@@ -31,7 +31,7 @@ void *heap_insert(struct ptr_heap *heap, void *p)
 
 	if (heap->size < heap->max) {
 		/* Heap insertion */
-		int pos = heap->size++;
+		pos = heap->size++;
 		while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
 			ptrs[pos] = ptrs[(pos-1)/2];
 			pos = (pos-1)/2;
-- 
cgit v1.3-14-g43fede


From 8cef7d57a4a47ef7166acde05eea0bc4f723691c Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 6 Jan 2009 14:40:50 -0800
Subject: lib: radix_tree.c make percpu variable static

radix_tree_preloads is unused outside of this file, make it static.

Noticed by sparse:
lib/radix-tree.c:84:1: warning: symbol 'per_cpu__radix_tree_preloads' was not declared. Should it be static?

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/radix-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index be86b32bc874..8d3fb0bd1288 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -81,7 +81,7 @@ struct radix_tree_preload {
 	int nr;
 	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
 };
-DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
+static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
 {
-- 
cgit v1.3-14-g43fede


From 30079677428ae1349d5063b96f677270dfd54309 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 6 Jan 2009 14:40:50 -0800
Subject: lib: proportions.c trivial sparse lock annotation

Suppresses sparse warning:
lib/proportions.c:159:16: warning: context imbalance in 'prop_get_global': wrong count at exit
lib/proportions.c:159:16:    context 'RCU': wanted 0, got 1
lib/proportions.c:164:2: warning: context imbalance in 'prop_put_global': unexpected unlock
lib/proportions.c:164:2:    context 'RCU': wanted 0, got -1

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/proportions.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'lib')

diff --git a/lib/proportions.c b/lib/proportions.c
index 4f387a643d72..3fda810faf0d 100644
--- a/lib/proportions.c
+++ b/lib/proportions.c
@@ -147,6 +147,7 @@ out:
  * this is used to track the active references.
  */
 static struct prop_global *prop_get_global(struct prop_descriptor *pd)
+__acquires(RCU)
 {
 	int index;
 
@@ -160,6 +161,7 @@ static struct prop_global *prop_get_global(struct prop_descriptor *pd)
 }
 
 static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
+__releases(RCU)
 {
 	rcu_read_unlock();
 }
-- 
cgit v1.3-14-g43fede


From e899aa823ad74860a17b541b0ad38f513728923d Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Tue, 6 Jan 2009 14:40:53 -0800
Subject: strict_strto* is not strict enough

It decodes "\n" as 0, which is bad, because stray echo into backlight
will turn your backlight off, etc...

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: Yi Yang <yi.y.yang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/vsprintf.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 98d632277ca8..0fbd0121d91d 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -170,6 +170,8 @@ int strict_strtoul(const char *cp, unsigned int base, unsigned long *res)
 		return -EINVAL;
 
 	val = simple_strtoul(cp, &tail, base);
+	if (tail == cp)
+		return -EINVAL;
 	if ((*tail == '\0') ||
 		((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) {
 		*res = val;
@@ -241,6 +243,8 @@ int strict_strtoull(const char *cp, unsigned int base, unsigned long long *res)
 		return -EINVAL;
 
 	val = simple_strtoull(cp, &tail, base);
+	if (tail == cp)
+		return -EINVAL;
 	if ((*tail == '\0') ||
 		((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) {
 		*res = val;
-- 
cgit v1.3-14-g43fede


From 179f7ebff6be45738c6e2fa68c8d2cc5c2c6308e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 6 Jan 2009 14:41:04 -0800
Subject: percpu_counter: FBC_BATCH should be a variable

For NR_CPUS >= 16 values, FBC_BATCH is 2*NR_CPUS

Considering more and more distros are using high NR_CPUS values, it makes
sense to use a more sensible value for FBC_BATCH, and get rid of NR_CPUS.

A sensible value is 2*num_online_cpus(), with a minimum value of 32 (This
minimum value helps branch prediction in __percpu_counter_add())

We already have a hotcpu notifier, so we can adjust FBC_BATCH dynamically.

We rename FBC_BATCH to percpu_counter_batch since it's not a constant
anymore.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/ext4.h                 |  6 +++---
 fs/ext4/inode.c                |  2 +-
 include/linux/percpu_counter.h |  8 ++------
 lib/percpu_counter.c           | 18 ++++++++++++++----
 4 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'lib')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c827024..6c46c648430d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1225,11 +1225,11 @@ do {								\
 } while (0)
 
 #ifdef CONFIG_SMP
-/* Each CPU can accumulate FBC_BATCH blocks in their local
+/* Each CPU can accumulate percpu_counter_batch blocks in their local
  * counters. So we need to make sure we have free blocks more
- * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
+ * than percpu_counter_batch  * nr_cpu_ids. Also add a window of 4 times.
  */
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
 #else
 #define EXT4_FREEBLOCKS_WATERMARK 0
 #endif
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6702a49992a6..98d3fe7057ef 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2498,7 +2498,7 @@ static int ext4_nonda_switch(struct super_block *sb)
 	/*
 	 * switch to non delalloc mode if we are running low
 	 * on free block. The free block accounting via percpu
-	 * counters can get slightly wrong with FBC_BATCH getting
+	 * counters can get slightly wrong with percpu_counter_batch getting
 	 * accumulated on each CPU without updating global counters
 	 * Delalloc need an accurate free block accounting. So switch
 	 * to non delalloc when we are near to error range.
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccdfc112..99de7a31bab8 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -24,11 +24,7 @@ struct percpu_counter {
 	s32 *counters;
 };
 
-#if NR_CPUS >= 16
-#define FBC_BATCH	(NR_CPUS*2)
-#else
-#define FBC_BATCH	(NR_CPUS*4)
-#endif
+extern int percpu_counter_batch;
 
 int percpu_counter_init(struct percpu_counter *fbc, s64 amount);
 int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
@@ -39,7 +35,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
-	__percpu_counter_add(fbc, amount, FBC_BATCH);
+	__percpu_counter_add(fbc, amount, percpu_counter_batch);
 }
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index b255b939bc1b..a60bd8046095 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -9,10 +9,8 @@
 #include <linux/cpu.h>
 #include <linux/module.h>
 
-#ifdef CONFIG_HOTPLUG_CPU
 static LIST_HEAD(percpu_counters);
 static DEFINE_MUTEX(percpu_counters_lock);
-#endif
 
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 {
@@ -111,13 +109,24 @@ void percpu_counter_destroy(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(percpu_counter_destroy);
 
-#ifdef CONFIG_HOTPLUG_CPU
+int percpu_counter_batch __read_mostly = 32;
+EXPORT_SYMBOL(percpu_counter_batch);
+
+static void compute_batch_value(void)
+{
+	int nr = num_online_cpus();
+
+	percpu_counter_batch = max(32, nr*2);
+}
+
 static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
 					unsigned long action, void *hcpu)
 {
+#ifdef CONFIG_HOTPLUG_CPU
 	unsigned int cpu;
 	struct percpu_counter *fbc;
 
+	compute_batch_value();
 	if (action != CPU_DEAD)
 		return NOTIFY_OK;
 
@@ -134,13 +143,14 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
 		spin_unlock_irqrestore(&fbc->lock, flags);
 	}
 	mutex_unlock(&percpu_counters_lock);
+#endif
 	return NOTIFY_OK;
 }
 
 static int __init percpu_counter_startup(void)
 {
+	compute_batch_value();
 	hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
 	return 0;
 }
 module_init(percpu_counter_startup);
-#endif
-- 
cgit v1.3-14-g43fede


From da4276b8299a6544dc41ac2485d3ffca5811b3fb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 7 Jan 2009 11:05:10 +0100
Subject: x86: offer frame pointers in all build modes

CONFIG_FRAME_POINTER=y results in much better debug info for the
kernel (clear and precise backtraces), with the only drawback being
a ~1% increase in kernel size.

So offer it unconditionally and enable it by default.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig  |  1 +
 lib/Kconfig.debug | 16 ++++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'lib')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 862adb9bf0d4..73f7fe8fd4d1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
+	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_KRETPROBES
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2e75478e9c69..2d0f14490174 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -566,14 +566,14 @@ config DEBUG_NOTIFIERS
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
 	depends on DEBUG_KERNEL && \
-		(X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390 || \
-		 AVR32 || SUPERH || BLACKFIN || MN10300)
-	default y if DEBUG_INFO && UML
-	help
-	  If you say Y here the resulting kernel image will be slightly larger
-	  and slower, but it might give very useful debugging information on
-	  some architectures or if you use external debuggers.
-	  If you don't debug the kernel, you can say N.
+		(CRIS || M68K || M68KNOMMU || FRV || UML || S390 || \
+		 AVR32 || SUPERH || BLACKFIN || MN10300) || \
+		ARCH_WANT_FRAME_POINTERS
+	default y if (DEBUG_INFO && UML) || ARCH_WANT_FRAME_POINTERS
+	help
+	  If you say Y here the resulting kernel image will be slightly
+	  larger and slower, but it gives very useful debugging information
+	  in case of kernel bugs. (precise oopses/stacktraces/warnings)
 
 config BOOT_PRINTK_DELAY
 	bool "Delay each boot printk message by N milliseconds"
-- 
cgit v1.3-14-g43fede


From 8feae13110d60cc6287afabc2887366b0eb226c2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 12:04:47 +0000
Subject: NOMMU: Make VMAs per MM as for MMU-mode linux

Make VMAs per mm_struct as for MMU-mode linux.  This solves two problems:

 (1) In SYSV SHM where nattch for a segment does not reflect the number of
     shmat's (and forks) done.

 (2) In mmap() where the VMA's vm_mm is set to point to the parent mm by an
     exec'ing process when VM_EXECUTABLE is specified, regardless of the fact
     that a VMA might be shared and already have its vm_mm assigned to another
     process or a dead process.

A new struct (vm_region) is introduced to track a mapped region and to remember
the circumstances under which it may be shared and the vm_list_struct structure
is discarded as it's no longer required.

This patch makes the following additional changes:

 (1) Regions are now allocated with alloc_pages() rather than kmalloc() and
     with no recourse to __GFP_COMP, so the pages are not composite.  Instead,
     each page has a reference on it held by the region.  Anything else that is
     interested in such a page will have to get a reference on it to retain it.
     When the pages are released due to unmapping, each page is passed to
     put_page() and will be freed when the page usage count reaches zero.

 (2) Excess pages are trimmed after an allocation as the allocation must be
     made as a power-of-2 quantity of pages.

 (3) VMAs are added to the parent MM's R/B tree and mmap lists.  As an MM may
     end up with overlapping VMAs within the tree, the VMA struct address is
     appended to the sort key.

 (4) Non-anonymous VMAs are now added to the backing inode's prio list.

 (5) Holes may be punched in anonymous VMAs with munmap(), releasing parts of
     the backing region.  The VMA and region structs will be split if
     necessary.

 (6) sys_shmdt() only releases one attachment to a SYSV IPC shared memory
     segment instead of all the attachments at that address.  Multiple
     shmat()'s return the same address under NOMMU-mode instead of different
     virtual addresses as under MMU-mode.

 (7) Core dumping for ELF-FDPIC requires fewer exceptions for NOMMU-mode.

 (8) /proc/maps is now the global list of mapped regions, and may list bits
     that aren't actually mapped anywhere.

 (9) /proc/meminfo gains a line (tagged "MmapCopy") that indicates the amount
     of RAM currently allocated by mmap to hold mappable regions that can't be
     mapped directly.  These are copies of the backing device or file if not
     anonymous.

These changes make NOMMU mode more similar to MMU mode.  The downside is that
NOMMU mode requires some extra memory to track things over NOMMU without this
patch (VMAs are no longer shared, and there are now region structs).

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
---
 Documentation/nommu-mmap.txt     |  18 +-
 arch/arm/include/asm/mmu.h       |   1 -
 arch/blackfin/include/asm/mmu.h  |   1 -
 arch/blackfin/kernel/ptrace.c    |   6 +-
 arch/blackfin/kernel/traps.c     |  11 +-
 arch/frv/kernel/ptrace.c         |  11 +-
 arch/h8300/include/asm/mmu.h     |   1 -
 arch/m68knommu/include/asm/mmu.h |   1 -
 arch/sh/include/asm/mmu.h        |   1 -
 fs/binfmt_elf_fdpic.c            |  27 +-
 fs/proc/internal.h               |   2 -
 fs/proc/meminfo.c                |   6 +
 fs/proc/nommu.c                  |  71 ++-
 fs/proc/task_nommu.c             | 108 +++--
 include/asm-frv/mmu.h            |   1 -
 include/asm-m32r/mmu.h           |   1 -
 include/linux/mm.h               |  18 +-
 include/linux/mm_types.h         |  18 +-
 ipc/shm.c                        |  12 +
 kernel/fork.c                    |   4 +-
 lib/Kconfig.debug                |   7 +
 mm/mmap.c                        |  10 +
 mm/nommu.c                       | 960 +++++++++++++++++++++++++++------------
 23 files changed, 860 insertions(+), 436 deletions(-)

(limited to 'lib')

diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt
index 7714f57caad5..02b89dcf38ac 100644
--- a/Documentation/nommu-mmap.txt
+++ b/Documentation/nommu-mmap.txt
@@ -109,12 +109,18 @@ and it's also much more restricted in the latter case:
 FURTHER NOTES ON NO-MMU MMAP
 ============================
 
- (*) A request for a private mapping of less than a page in size may not return
-     a page-aligned buffer. This is because the kernel calls kmalloc() to
-     allocate the buffer, not get_free_page().
-
- (*) A list of all the mappings on the system is visible through /proc/maps in
-     no-MMU mode.
+ (*) A request for a private mapping of a file may return a buffer that is not
+     page-aligned.  This is because XIP may take place, and the data may not be
+     page-aligned in the backing store.
+
+ (*) A request for an anonymous mapping will always be page aligned.  If
+     possible the size of the request should be a power of two otherwise some
+     of the space may be wasted as the kernel must allocate a power-of-2
+     granule but will only discard the excess if appropriately configured as
+     this has an effect on fragmentation.
+
+ (*) A list of all the private copy and anonymous mappings on the system is
+     visible through /proc/maps in no-MMU mode.
 
  (*) A list of all the mappings in use by a process is visible through
      /proc/<pid>/maps in no-MMU mode.
diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h
index 53099d4ee421..b561584d04a1 100644
--- a/arch/arm/include/asm/mmu.h
+++ b/arch/arm/include/asm/mmu.h
@@ -24,7 +24,6 @@ typedef struct {
  *  modified for 2.6 by Hyok S. Choi <hyok.choi@samsung.com>
  */
 typedef struct {
-	struct vm_list_struct	*vmlist;
 	unsigned long		end_brk;
 } mm_context_t;
 
diff --git a/arch/blackfin/include/asm/mmu.h b/arch/blackfin/include/asm/mmu.h
index 757e43906ed4..dbfd686360e6 100644
--- a/arch/blackfin/include/asm/mmu.h
+++ b/arch/blackfin/include/asm/mmu.h
@@ -10,7 +10,6 @@ struct sram_list_struct {
 };
 
 typedef struct {
-	struct vm_list_struct *vmlist;
 	unsigned long end_brk;
 	unsigned long stack_start;
 
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index d2d388536630..594e325b40e4 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -160,15 +160,15 @@ put_reg(struct task_struct *task, int regno, unsigned long data)
 static inline int is_user_addr_valid(struct task_struct *child,
 				     unsigned long start, unsigned long len)
 {
-	struct vm_list_struct *vml;
+	struct vm_area_struct *vma;
 	struct sram_list_struct *sraml;
 
 	/* overflow */
 	if (start + len < start)
 		return -EIO;
 
-	for (vml = child->mm->context.vmlist; vml; vml = vml->next)
-		if (start >= vml->vma->vm_start && start + len < vml->vma->vm_end)
+	vma = find_vma(child->mm, start);
+	if (vma && start >= vma->vm_start && start + len <= vma->vm_end)
 			return 0;
 
 	for (sraml = child->mm->context.sram_list; sraml; sraml = sraml->next)
diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c
index 17d8e4172896..5b0667da8d05 100644
--- a/arch/blackfin/kernel/traps.c
+++ b/arch/blackfin/kernel/traps.c
@@ -32,6 +32,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/fs.h>
+#include <linux/rbtree.h>
 #include <asm/traps.h>
 #include <asm/cacheflush.h>
 #include <asm/cplb.h>
@@ -83,6 +84,7 @@ static void decode_address(char *buf, unsigned long address)
 	struct mm_struct *mm;
 	unsigned long flags, offset;
 	unsigned char in_atomic = (bfin_read_IPEND() & 0x10) || in_atomic();
+	struct rb_node *n;
 
 #ifdef CONFIG_KALLSYMS
 	unsigned long symsize;
@@ -128,9 +130,10 @@ static void decode_address(char *buf, unsigned long address)
 		if (!mm)
 			continue;
 
-		vml = mm->context.vmlist;
-		while (vml) {
-			struct vm_area_struct *vma = vml->vma;
+		for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
+			struct vm_area_struct *vma;
+
+			vma = rb_entry(n, struct vm_area_struct, vm_rb);
 
 			if (address >= vma->vm_start && address < vma->vm_end) {
 				char _tmpbuf[256];
@@ -176,8 +179,6 @@ static void decode_address(char *buf, unsigned long address)
 
 				goto done;
 			}
-
-			vml = vml->next;
 		}
 		if (!in_atomic)
 			mmput(mm);
diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c
index 709e9bdc6126..5e7d401d21e7 100644
--- a/arch/frv/kernel/ptrace.c
+++ b/arch/frv/kernel/ptrace.c
@@ -69,7 +69,8 @@ static inline int put_reg(struct task_struct *task, int regno,
 }
 
 /*
- * check that an address falls within the bounds of the target process's memory mappings
+ * check that an address falls within the bounds of the target process's memory
+ * mappings
  */
 static inline int is_user_addr_valid(struct task_struct *child,
 				     unsigned long start, unsigned long len)
@@ -79,11 +80,11 @@ static inline int is_user_addr_valid(struct task_struct *child,
 		return -EIO;
 	return 0;
 #else
-	struct vm_list_struct *vml;
+	struct vm_area_struct *vma;
 
-	for (vml = child->mm->context.vmlist; vml; vml = vml->next)
-		if (start >= vml->vma->vm_start && start + len <= vml->vma->vm_end)
-			return 0;
+	vma = find_vma(child->mm, start);
+	if (vma && start >= vma->vm_start && start + len <= vma->vm_end)
+		return 0;
 
 	return -EIO;
 #endif
diff --git a/arch/h8300/include/asm/mmu.h b/arch/h8300/include/asm/mmu.h
index 2ce06ea46104..31309969df70 100644
--- a/arch/h8300/include/asm/mmu.h
+++ b/arch/h8300/include/asm/mmu.h
@@ -4,7 +4,6 @@
 /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */
 
 typedef struct {
-	struct vm_list_struct	*vmlist;
 	unsigned long		end_brk;
 } mm_context_t;
 
diff --git a/arch/m68knommu/include/asm/mmu.h b/arch/m68knommu/include/asm/mmu.h
index 5fa6b68353ba..e2da1e6f09fe 100644
--- a/arch/m68knommu/include/asm/mmu.h
+++ b/arch/m68knommu/include/asm/mmu.h
@@ -4,7 +4,6 @@
 /* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */
 
 typedef struct {
-	struct vm_list_struct	*vmlist;
 	unsigned long		end_brk;
 } mm_context_t;
 
diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h
index fdcb93bc6d11..6c43625bb1a5 100644
--- a/arch/sh/include/asm/mmu.h
+++ b/arch/sh/include/asm/mmu.h
@@ -9,7 +9,6 @@ typedef struct {
 	mm_context_id_t		id;
 	void			*vdso;
 #else
-	struct vm_list_struct	*vmlist;
 	unsigned long		end_brk;
 #endif
 #ifdef CONFIG_BINFMT_ELF_FDPIC
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e37..22baf1b13493 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1567,11 +1567,9 @@ end_coredump:
 static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 			   unsigned long *limit, unsigned long mm_flags)
 {
-	struct vm_list_struct *vml;
-
-	for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
-	struct vm_area_struct *vma = vml->vma;
+	struct vm_area_struct *vma;
 
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		if (!maydump(vma, mm_flags))
 			continue;
 
@@ -1617,9 +1615,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	elf_fpxregset_t *xfpu = NULL;
 #endif
 	int thread_status_size = 0;
-#ifndef CONFIG_MMU
-	struct vm_list_struct *vml;
-#endif
 	elf_addr_t *auxv;
 	unsigned long mm_flags;
 
@@ -1685,13 +1680,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	fill_prstatus(prstatus, current, signr);
 	elf_core_copy_regs(&prstatus->pr_reg, regs);
 
-#ifdef CONFIG_MMU
 	segs = current->mm->map_count;
-#else
-	segs = 0;
-	for (vml = current->mm->context.vmlist; vml; vml = vml->next)
-	    segs++;
-#endif
 #ifdef ELF_CORE_EXTRA_PHDRS
 	segs += ELF_CORE_EXTRA_PHDRS;
 #endif
@@ -1766,20 +1755,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	mm_flags = current->mm->flags;
 
 	/* write program headers for segments dump */
-	for (
-#ifdef CONFIG_MMU
-		vma = current->mm->mmap; vma; vma = vma->vm_next
-#else
-			vml = current->mm->context.vmlist; vml; vml = vml->next
-#endif
-	     ) {
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		struct elf_phdr phdr;
 		size_t sz;
 
-#ifndef CONFIG_MMU
-		vma = vml->vma;
-#endif
-
 		sz = vma->vm_end - vma->vm_start;
 
 		phdr.p_type = PT_LOAD;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e8aeb8b61ce..cd53ff838498 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -41,8 +41,6 @@ do {						\
 	(vmi)->used = 0;			\
 	(vmi)->largest_chunk = 0;		\
 } while(0)
-
-extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
 #endif
 
 extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b1675c4e66da..43d23948384a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -73,6 +73,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"HighFree:       %8lu kB\n"
 		"LowTotal:       %8lu kB\n"
 		"LowFree:        %8lu kB\n"
+#endif
+#ifndef CONFIG_MMU
+		"MmapCopy:       %8lu kB\n"
 #endif
 		"SwapTotal:      %8lu kB\n"
 		"SwapFree:       %8lu kB\n"
@@ -115,6 +118,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freehigh),
 		K(i.totalram-i.totalhigh),
 		K(i.freeram-i.freehigh),
+#endif
+#ifndef CONFIG_MMU
+		K((unsigned long) atomic_read(&mmap_pages_allocated)),
 #endif
 		K(i.totalswap),
 		K(i.freeswap),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3f87d2632947..b446d7ad0b0d 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,33 +33,33 @@
 #include "internal.h"
 
 /*
- * display a single VMA to a sequenced file
+ * display a single region to a sequenced file
  */
-int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 {
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
 	int flags, len;
 
-	flags = vma->vm_flags;
-	file = vma->vm_file;
+	flags = region->vm_flags;
+	file = region->vm_file;
 
 	if (file) {
-		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		struct inode *inode = region->vm_file->f_path.dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
 
 	seq_printf(m,
 		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
-		   vma->vm_start,
-		   vma->vm_end,
+		   region->vm_start,
+		   region->vm_end,
 		   flags & VM_READ ? 'r' : '-',
 		   flags & VM_WRITE ? 'w' : '-',
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-		   ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
+		   ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 }
 
 /*
- * display a list of all the VMAs the kernel knows about
+ * display a list of all the REGIONs the kernel knows about
  * - nommu kernals have a single flat list
  */
-static int nommu_vma_list_show(struct seq_file *m, void *v)
+static int nommu_region_list_show(struct seq_file *m, void *_p)
 {
-	struct vm_area_struct *vma;
+	struct rb_node *p = _p;
 
-	vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb);
-	return nommu_vma_show(m, vma);
+	return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
 }
 
-static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos)
+static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
 {
-	struct rb_node *_rb;
+	struct rb_node *p;
 	loff_t pos = *_pos;
-	void *next = NULL;
 
-	down_read(&nommu_vma_sem);
+	down_read(&nommu_region_sem);
 
-	for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) {
-		if (pos == 0) {
-			next = _rb;
-			break;
-		}
-		pos--;
-	}
-
-	return next;
+	for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
+		if (pos-- == 0)
+			return p;
+	return NULL;
 }
 
-static void nommu_vma_list_stop(struct seq_file *m, void *v)
+static void nommu_region_list_stop(struct seq_file *m, void *v)
 {
-	up_read(&nommu_vma_sem);
+	up_read(&nommu_region_sem);
 }
 
-static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos)
+static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	(*pos)++;
 	return rb_next((struct rb_node *) v);
 }
 
-static const struct seq_operations proc_nommu_vma_list_seqop = {
-	.start	= nommu_vma_list_start,
-	.next	= nommu_vma_list_next,
-	.stop	= nommu_vma_list_stop,
-	.show	= nommu_vma_list_show
+static struct seq_operations proc_nommu_region_list_seqop = {
+	.start	= nommu_region_list_start,
+	.next	= nommu_region_list_next,
+	.stop	= nommu_region_list_stop,
+	.show	= nommu_region_list_show
 };
 
-static int proc_nommu_vma_list_open(struct inode *inode, struct file *file)
+static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &proc_nommu_vma_list_seqop);
+	return seq_open(file, &proc_nommu_region_list_seqop);
 }
 
-static const struct file_operations proc_nommu_vma_list_operations = {
-	.open    = proc_nommu_vma_list_open,
+static const struct file_operations proc_nommu_region_list_operations = {
+	.open    = proc_nommu_region_list_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
 	.release = seq_release,
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = {
 
 static int __init proc_nommu_init(void)
 {
-	proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations);
+	proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
 	return 0;
 }
 
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index d4a8be32b902..ca4a48d0d311 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -15,25 +15,25 @@
  */
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	struct vm_list_struct *vml;
+	struct vm_area_struct *vma;
+	struct rb_node *p;
 	unsigned long bytes = 0, sbytes = 0, slack = 0;
         
 	down_read(&mm->mmap_sem);
-	for (vml = mm->context.vmlist; vml; vml = vml->next) {
-		if (!vml->vma)
-			continue;
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
 
-		bytes += kobjsize(vml);
+		bytes += kobjsize(vma);
 		if (atomic_read(&mm->mm_count) > 1 ||
-		    atomic_read(&vml->vma->vm_usage) > 1
-		    ) {
-			sbytes += kobjsize((void *) vml->vma->vm_start);
-			sbytes += kobjsize(vml->vma);
+		    vma->vm_region ||
+		    vma->vm_flags & VM_MAYSHARE) {
+			sbytes += kobjsize((void *) vma->vm_start);
+			if (vma->vm_region)
+				sbytes += kobjsize(vma->vm_region);
 		} else {
-			bytes += kobjsize((void *) vml->vma->vm_start);
-			bytes += kobjsize(vml->vma);
-			slack += kobjsize((void *) vml->vma->vm_start) -
-				(vml->vma->vm_end - vml->vma->vm_start);
+			bytes += kobjsize((void *) vma->vm_start);
+			slack += kobjsize((void *) vma->vm_start) -
+				(vma->vm_end - vma->vm_start);
 		}
 	}
 
@@ -70,13 +70,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 
 unsigned long task_vsize(struct mm_struct *mm)
 {
-	struct vm_list_struct *tbp;
+	struct vm_area_struct *vma;
+	struct rb_node *p;
 	unsigned long vsize = 0;
 
 	down_read(&mm->mmap_sem);
-	for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
-		if (tbp->vma)
-			vsize += kobjsize((void *) tbp->vma->vm_start);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		vsize += vma->vm_region->vm_end - vma->vm_region->vm_start;
 	}
 	up_read(&mm->mmap_sem);
 	return vsize;
@@ -85,16 +86,15 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	struct vm_list_struct *tbp;
+	struct vm_area_struct *vma;
+	struct rb_node *p;
 	int size = kobjsize(mm);
 
 	down_read(&mm->mmap_sem);
-	for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
-		size += kobjsize(tbp);
-		if (tbp->vma) {
-			size += kobjsize(tbp->vma);
-			size += kobjsize((void *) tbp->vma->vm_start);
-		}
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		size += kobjsize(vma);
+		size += kobjsize((void *) vma->vm_start);
 	}
 
 	size += (*text = mm->end_code - mm->start_code);
@@ -104,21 +104,63 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	return size;
 }
 
+/*
+ * display a single VMA to a sequenced file as one maps-style line; returns 0
+ */
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+{
+	unsigned long flags = vma->vm_flags;
+	struct file *file = vma->vm_file;
+	unsigned long long pgoff;
+	unsigned long ino = 0;
+	dev_t dev = 0;
+	int len;
+
+	if (file) {
+		struct inode *inode = file->f_path.dentry->d_inode;
+		dev = inode->i_sb->s_dev;
+		ino = inode->i_ino;
+	}
+
+	/* widen before shifting so that a large page offset doesn't wrap */
+	pgoff = (unsigned long long) vma->vm_pgoff << PAGE_SHIFT;
+	seq_printf(m,
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+		   vma->vm_start, vma->vm_end,
+		   flags & VM_READ ? 'r' : '-',
+		   flags & VM_WRITE ? 'w' : '-',
+		   flags & VM_EXEC ? 'x' : '-',
+		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+		   pgoff, MAJOR(dev), MINOR(dev), ino, &len);
+
+	if (file) {
+		/* %n stored the width so far; pad out to the path column */
+		len = 25 + sizeof(void *) * 6 - len;
+		if (len < 1)
+			len = 1;
+		seq_printf(m, "%*c", len, ' ');
+		seq_path(m, &file->f_path, "");
+	}
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
 /*
  * display mapping lines for a particular process's /proc/pid/maps
  */
-static int show_map(struct seq_file *m, void *_vml)
+static int show_map(struct seq_file *m, void *_p)
 {
-	struct vm_list_struct *vml = _vml;
+	struct rb_node *p = _p;
 
-	return nommu_vma_show(m, vml->vma);
+	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
 }
 
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
-	struct vm_list_struct *vml;
 	struct mm_struct *mm;
+	struct rb_node *p;
 	loff_t n = *pos;
 
 	/* pin the task and mm whilst we play with them */
@@ -134,9 +176,9 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	}
 
 	/* start from the Nth VMA */
-	for (vml = mm->context.vmlist; vml; vml = vml->next)
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
 		if (n-- == 0)
-			return vml;
+			return p;
 	return NULL;
 }
 
@@ -152,12 +194,12 @@ static void m_stop(struct seq_file *m, void *_vml)
 	}
 }
 
-static void *m_next(struct seq_file *m, void *_vml, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
 {
-	struct vm_list_struct *vml = _vml;
+	struct rb_node *p = _p;
 
 	(*pos)++;
-	return vml ? vml->next : NULL;
+	return p ? rb_next(p) : NULL;
 }
 
 static const struct seq_operations proc_pid_maps_ops = {
diff --git a/include/asm-frv/mmu.h b/include/asm-frv/mmu.h
index 22c03714fb14..86ca0e86e7d2 100644
--- a/include/asm-frv/mmu.h
+++ b/include/asm-frv/mmu.h
@@ -22,7 +22,6 @@ typedef struct {
 	unsigned long	dtlb_ptd_mapping;	/* [DAMR5] PTD mapping for dtlb cached PGE */
 
 #else
-	struct vm_list_struct	*vmlist;
 	unsigned long		end_brk;
 
 #endif
diff --git a/include/asm-m32r/mmu.h b/include/asm-m32r/mmu.h
index d9bd724479cf..150cb92bb666 100644
--- a/include/asm-m32r/mmu.h
+++ b/include/asm-m32r/mmu.h
@@ -4,7 +4,6 @@
 #if !defined(CONFIG_MMU)
 
 typedef struct {
-	struct vm_list_struct	*vmlist;
 	unsigned long		end_brk;
 } mm_context_t;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4a3d28c86443..b91a73fd1bcc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -56,19 +56,9 @@ extern unsigned long mmap_min_addr;
 
 extern struct kmem_cache *vm_area_cachep;
 
-/*
- * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is
- * disabled, then there's a single shared list of VMAs maintained by the
- * system, and mm's subscribe to these individually
- */
-struct vm_list_struct {
-	struct vm_list_struct	*next;
-	struct vm_area_struct	*vma;
-};
-
 #ifndef CONFIG_MMU
-extern struct rb_root nommu_vma_tree;
-extern struct rw_semaphore nommu_vma_sem;
+extern struct rb_root nommu_region_tree;
+extern struct rw_semaphore nommu_region_sem;
 
 extern unsigned int kobjsize(const void *objp);
 #endif
@@ -1061,6 +1051,7 @@ extern void memmap_init_zone(unsigned long, int, unsigned long,
 				unsigned long, enum memmap_context);
 extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
+extern void __init mmap_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
@@ -1072,6 +1063,9 @@ extern void setup_per_cpu_pageset(void);
 static inline void setup_per_cpu_pageset(void) {}
 #endif
 
+/* nommu.c */
+extern atomic_t mmap_pages_allocated;
+
 /* prio_tree.c */
 void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
 void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9cfc9b627fdd..1c1e0d3a1714 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -96,6 +96,22 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+/*
+ * A region containing a mapping of a non-memory backed file under NOMMU
+ * conditions.  These are held in a global tree and are pinned by the VMAs that
+ * map parts of them.
+ */
+struct vm_region {
+	struct rb_node	vm_rb;		/* link in global region tree */
+	unsigned long	vm_flags;	/* VMA vm_flags */
+	unsigned long	vm_start;	/* start address of region */
+	unsigned long	vm_end;		/* region initialised to here */
+	unsigned long	vm_pgoff;	/* the offset in vm_file corresponding to vm_start */
+	struct file	*vm_file;	/* the backing file or NULL */
+
+	atomic_t	vm_usage;	/* region usage count */
+};
+
 /*
  * This struct defines a memory VMM memory area. There is one of these
  * per VM-area/task.  A VM area is any part of the process virtual memory
@@ -152,7 +168,7 @@ struct vm_area_struct {
 	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
 #ifndef CONFIG_MMU
-	atomic_t vm_usage;		/* refcount (VMAs shared if !MMU) */
+	struct vm_region *vm_region;	/* NOMMU mapping region */
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
diff --git a/ipc/shm.c b/ipc/shm.c
index b125b560240e..d0ab5527bf45 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -990,6 +990,7 @@ asmlinkage long sys_shmdt(char __user *shmaddr)
 	 */
 	vma = find_vma(mm, addr);
 
+#ifdef CONFIG_MMU
 	while (vma) {
 		next = vma->vm_next;
 
@@ -1034,6 +1035,17 @@ asmlinkage long sys_shmdt(char __user *shmaddr)
 		vma = next;
 	}
 
+#else /* CONFIG_MMU */
+	/* under NOMMU conditions, the exact address to be destroyed must be
+	 * given; find_vma() returns NULL if no VMA contains addr */
+	retval = -EINVAL;
+	if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
+		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+		retval = 0;
+	}
+
+#endif
+
 	up_write(&mm->mmap_sem);
 	return retval;
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 7b8f2a78be3d..0bce4a43bb37 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1481,12 +1481,10 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-	vm_area_cachep = kmem_cache_create("vm_area_struct",
-			sizeof(struct vm_area_struct), 0,
-			SLAB_PANIC, NULL);
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	mmap_init();
 }
 
 /*
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2e75478e9c69..d0a32aab03ff 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -512,6 +512,13 @@ config DEBUG_VIRTUAL
 
 	  If unsure, say N.
 
+config DEBUG_NOMMU_REGIONS
+	bool "Debug the global anon/private NOMMU mapping region tree"
+	depends on DEBUG_KERNEL && !MMU
+	help
+	  This option causes the global tree of anonymous and private mapping
+	  regions to be regularly checked for invalid topology.
+
 config DEBUG_WRITECOUNT
 	bool "Debug filesystem writers count"
 	depends on DEBUG_KERNEL
diff --git a/mm/mmap.c b/mm/mmap.c
index a910c045cfd4..749623196cb9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2472,3 +2472,13 @@ void mm_drop_all_locks(struct mm_struct *mm)
 
 	mutex_unlock(&mm_all_locks_mutex);
 }
+
+/*
+ * initialise the VMA slab
+ */
+void __init mmap_init(void)
+{
+	vm_area_cachep = kmem_cache_create("vm_area_struct",
+			sizeof(struct vm_area_struct), 0,
+			SLAB_PANIC, NULL);
+}
diff --git a/mm/nommu.c b/mm/nommu.c
index 23f355bbe262..0d363dfcf10e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -6,7 +6,7 @@
  *
  *  See Documentation/nommu-mmap.txt
  *
- *  Copyright (c) 2004-2005 David Howells <dhowells@redhat.com>
+ *  Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
  *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
  *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
  *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
@@ -33,6 +33,28 @@
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+/* debug tracing: change the #if 0 below to #if 1 to have these printk();
+ * otherwise no_printk() just type-checks the arguments and emits nothing */
+static inline __attribute__((format(printf, 1, 2)))
+void no_printk(const char *fmt, ...)
+{
+}
+
+#if 0
+#define kenter(FMT, ...) \
+	printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) \
+	printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+	printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
+#else
+#define kenter(FMT, ...) \
+	no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) \
+	no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+	no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
+#endif
 
 #include "internal.h"
 
@@ -46,12 +68,15 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int heap_stack_gap = 0;
 
+atomic_t mmap_pages_allocated;
+
 EXPORT_SYMBOL(mem_map);
 EXPORT_SYMBOL(num_physpages);
 
-/* list of shareable VMAs */
-struct rb_root nommu_vma_tree = RB_ROOT;
-DECLARE_RWSEM(nommu_vma_sem);
+/* list of mapped, potentially shareable regions */
+static struct kmem_cache *vm_region_jar;
+struct rb_root nommu_region_tree = RB_ROOT;
+DECLARE_RWSEM(nommu_region_sem);
 
 struct vm_operations_struct generic_file_vm_ops = {
 };
@@ -400,129 +425,174 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
 	return mm->brk = brk;
 }
 
-#ifdef DEBUG
-static void show_process_blocks(void)
+/*
+ * initialise the VMA and region record slabs
+ */
+void __init mmap_init(void)
 {
-	struct vm_list_struct *vml;
-
-	printk("Process blocks %d:", current->pid);
-
-	for (vml = &current->mm->context.vmlist; vml; vml = vml->next) {
-		printk(" %p: %p", vml, vml->vma);
-		if (vml->vma)
-			printk(" (%d @%lx #%d)",
-			       kobjsize((void *) vml->vma->vm_start),
-			       vml->vma->vm_start,
-			       atomic_read(&vml->vma->vm_usage));
-		printk(vml->next ? " ->" : ".\n");
-	}
+	vm_region_jar = kmem_cache_create("vm_region_jar",
+					  sizeof(struct vm_region), 0,
+					  SLAB_PANIC, NULL);
+	vm_area_cachep = kmem_cache_create("vm_area_struct",
+					   sizeof(struct vm_area_struct), 0,
+					   SLAB_PANIC, NULL);
 }
-#endif /* DEBUG */
 
 /*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * - should be called with mm->mmap_sem held writelocked
+ * validate the region tree
+ * - the caller must hold the region lock
  */
-static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
+#ifdef CONFIG_DEBUG_NOMMU_REGIONS
+static noinline void validate_nommu_regions(void)
 {
-	struct vm_list_struct **ppv;
+	struct vm_region *region, *last;
+	struct rb_node *p, *lastp;
 
-	for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
-		if ((*ppv)->vma->vm_start > vml->vma->vm_start)
-			break;
+	lastp = rb_first(&nommu_region_tree);
+	if (!lastp)
+		return;
+
+	last = rb_entry(lastp, struct vm_region, vm_rb);
+	if (unlikely(last->vm_end <= last->vm_start))
+		BUG();
+
+	while ((p = rb_next(lastp))) {
+		region = rb_entry(p, struct vm_region, vm_rb);
+		last = rb_entry(lastp, struct vm_region, vm_rb);
+
+		if (unlikely(region->vm_end <= region->vm_start))
+			BUG();
+		if (unlikely(region->vm_start < last->vm_end))
+			BUG();
 
-	vml->next = *ppv;
-	*ppv = vml;
+		lastp = p;
+	}
 }
+#else
+#define validate_nommu_regions() do {} while(0)
+#endif
 
 /*
- * look up the first VMA in which addr resides, NULL if none
- * - should be called with mm->mmap_sem at least held readlocked
+ * add a region into the global tree
  */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+static void add_nommu_region(struct vm_region *region)
 {
-	struct vm_list_struct *loop, *vml;
+	struct vm_region *pregion;
+	struct rb_node **p, *parent;
 
-	/* search the vm_start ordered list */
-	vml = NULL;
-	for (loop = mm->context.vmlist; loop; loop = loop->next) {
-		if (loop->vma->vm_start > addr)
-			break;
-		vml = loop;
+	validate_nommu_regions();
+
+	BUG_ON(region->vm_start & ~PAGE_MASK);
+
+	parent = NULL;
+	p = &nommu_region_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		pregion = rb_entry(parent, struct vm_region, vm_rb);
+		if (region->vm_start < pregion->vm_start)
+			p = &(*p)->rb_left;
+		else if (region->vm_start > pregion->vm_start)
+			p = &(*p)->rb_right;
+		else if (pregion == region)
+			return;
+		else
+			BUG();
 	}
 
-	if (vml && vml->vma->vm_end > addr)
-		return vml->vma;
+	rb_link_node(&region->vm_rb, parent, p);
+	rb_insert_color(&region->vm_rb, &nommu_region_tree);
 
-	return NULL;
+	validate_nommu_regions();
 }
-EXPORT_SYMBOL(find_vma);
 
 /*
- * find a VMA
- * - we don't extend stack VMAs under NOMMU conditions
+ * delete a region from the global tree
  */
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+static void delete_nommu_region(struct vm_region *region)
 {
-	return find_vma(mm, addr);
-}
+	BUG_ON(!nommu_region_tree.rb_node);
 
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
-{
-	return -ENOMEM;
+	validate_nommu_regions();
+	rb_erase(&region->vm_rb, &nommu_region_tree);
+	validate_nommu_regions();
 }
 
 /*
- * look up the first VMA exactly that exactly matches addr
- * - should be called with mm->mmap_sem at least held readlocked
+ * free a contiguous series of pages
  */
-static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
-						    unsigned long addr)
+static void free_page_series(unsigned long from, unsigned long to)
 {
-	struct vm_list_struct *vml;
-
-	/* search the vm_start ordered list */
-	for (vml = mm->context.vmlist; vml; vml = vml->next) {
-		if (vml->vma->vm_start == addr)
-			return vml->vma;
-		if (vml->vma->vm_start > addr)
-			break;
+	for (; from < to; from += PAGE_SIZE) {
+		struct page *page = virt_to_page(from);
+
+		kdebug("- free %lx", from);
+		atomic_dec(&mmap_pages_allocated);
+		if (page_count(page) != 1)
+			kdebug("free page %p [%d]", page, page_count(page));
+		put_page(page);
 	}
-
-	return NULL;
 }
 
 /*
- * find a VMA in the global tree
+ * release a reference to a region
+ * - the caller must hold the region semaphore, which this releases
+ * - the region may not have been added to the tree yet, in which case vm_end
+ *   will equal vm_start
  */
-static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
+static void __put_nommu_region(struct vm_region *region)
+	__releases(nommu_region_sem)
 {
-	struct vm_area_struct *vma;
-	struct rb_node *n = nommu_vma_tree.rb_node;
+	kenter("%p{%d}", region, atomic_read(&region->vm_usage));
 
-	while (n) {
-		vma = rb_entry(n, struct vm_area_struct, vm_rb);
+	BUG_ON(!nommu_region_tree.rb_node);
 
-		if (start < vma->vm_start)
-			n = n->rb_left;
-		else if (start > vma->vm_start)
-			n = n->rb_right;
-		else
-			return vma;
+	if (atomic_dec_and_test(&region->vm_usage)) {
+		if (region->vm_end > region->vm_start)
+			delete_nommu_region(region);
+		up_write(&nommu_region_sem);
+
+		if (region->vm_file)
+			fput(region->vm_file);
+
+		/* IO memory and memory shared directly out of the pagecache
+		 * from ramfs/tmpfs mustn't be released here */
+		if (region->vm_flags & VM_MAPPED_COPY) {
+			kdebug("free series");
+			free_page_series(region->vm_start, region->vm_end);
+		}
+		kmem_cache_free(vm_region_jar, region);
+	} else {
+		up_write(&nommu_region_sem);
 	}
+}
 
-	return NULL;
+/*
+ * release a reference to a region
+ */
+static void put_nommu_region(struct vm_region *region)
+{
+	down_write(&nommu_region_sem);
+	__put_nommu_region(region);
 }
 
 /*
- * add a VMA in the global tree
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * and tree and add to the address space's page tree also if not an anonymous
+ * page
+ * - should be called with mm->mmap_sem held writelocked
  */
-static void add_nommu_vma(struct vm_area_struct *vma)
+static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	struct vm_area_struct *pvma;
+	struct vm_area_struct *pvma, **pp;
 	struct address_space *mapping;
-	struct rb_node **p = &nommu_vma_tree.rb_node;
-	struct rb_node *parent = NULL;
+	struct rb_node **p, *parent;
+
+	kenter(",%p", vma);
+
+	BUG_ON(!vma->vm_region);
+
+	mm->map_count++;
+	vma->vm_mm = mm;
 
 	/* add the VMA to the mapping */
 	if (vma->vm_file) {
@@ -533,42 +603,62 @@ static void add_nommu_vma(struct vm_area_struct *vma)
 		flush_dcache_mmap_unlock(mapping);
 	}
 
-	/* add the VMA to the master list */
+	/* add the VMA to the tree */
+	parent = NULL;
+	p = &mm->mm_rb.rb_node;
 	while (*p) {
 		parent = *p;
 		pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
 
-		if (vma->vm_start < pvma->vm_start) {
+		/* sort by: start addr, end addr, VMA struct addr in that order
+		 * (the latter is necessary as we may get identical VMAs) */
+		if (vma->vm_start < pvma->vm_start)
 			p = &(*p)->rb_left;
-		}
-		else if (vma->vm_start > pvma->vm_start) {
+		else if (vma->vm_start > pvma->vm_start)
 			p = &(*p)->rb_right;
-		}
-		else {
-			/* mappings are at the same address - this can only
-			 * happen for shared-mem chardevs and shared file
-			 * mappings backed by ramfs/tmpfs */
-			BUG_ON(!(pvma->vm_flags & VM_SHARED));
-
-			if (vma < pvma)
-				p = &(*p)->rb_left;
-			else if (vma > pvma)
-				p = &(*p)->rb_right;
-			else
-				BUG();
-		}
+		else if (vma->vm_end < pvma->vm_end)
+			p = &(*p)->rb_left;
+		else if (vma->vm_end > pvma->vm_end)
+			p = &(*p)->rb_right;
+		else if (vma < pvma)
+			p = &(*p)->rb_left;
+		else if (vma > pvma)
+			p = &(*p)->rb_right;
+		else
+			BUG();
 	}
 
 	rb_link_node(&vma->vm_rb, parent, p);
-	rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
+	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+
+	/* add VMA to the VMA list also */
+	for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) {
+		if (pvma->vm_start > vma->vm_start)
+			break;
+		if (pvma->vm_start < vma->vm_start)
+			continue;
+		if (pvma->vm_end < vma->vm_end)
+			break;
+	}
+
+	vma->vm_next = *pp;
+	*pp = vma;
 }
 
 /*
- * delete a VMA from the global list
+ * delete a VMA from its owning mm_struct and address space
  */
-static void delete_nommu_vma(struct vm_area_struct *vma)
+static void delete_vma_from_mm(struct vm_area_struct *vma)
 {
+	struct vm_area_struct **pp;
 	struct address_space *mapping;
+	struct mm_struct *mm = vma->vm_mm;
+
+	kenter("%p", vma);
+
+	mm->map_count--;
+	if (mm->mmap_cache == vma)
+		mm->mmap_cache = NULL;
 
 	/* remove the VMA from the mapping */
 	if (vma->vm_file) {
@@ -579,8 +669,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma)
 		flush_dcache_mmap_unlock(mapping);
 	}
 
-	/* remove from the master list */
-	rb_erase(&vma->vm_rb, &nommu_vma_tree);
+	/* remove from the MM's tree and list */
+	rb_erase(&vma->vm_rb, &mm->mm_rb);
+	for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) {
+		if (*pp == vma) {
+			*pp = vma->vm_next;
+			break;
+		}
+	}
+
+	vma->vm_mm = NULL;
+}
+
+/*
+ * destroy a VMA record, closing it and dropping its file and region refs
+ */
+static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	kenter("%p", vma);
+	if (vma->vm_ops && vma->vm_ops->close)
+		vma->vm_ops->close(vma);
+	if (vma->vm_file) {
+		fput(vma->vm_file);
+		if (vma->vm_flags & VM_EXECUTABLE)
+			removed_exe_file_vma(mm);
+	}
+	put_nommu_region(vma->vm_region);
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+/*
+ * look up the first VMA in which addr resides, NULL if none
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	struct rb_node *n;
+
+	/* check the cache first */
+	vma = mm->mmap_cache;
+	if (vma && vma->vm_start <= addr && vma->vm_end > addr)
+		return vma;
+
+	/* trawl the tree in start address order, stopping once past addr
+	 * (there may be multiple mappings in which addr resides) */
+	for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
+		vma = rb_entry(n, struct vm_area_struct, vm_rb);
+		if (vma->vm_start > addr)
+			return NULL;
+		if (vma->vm_end > addr) {
+			mm->mmap_cache = vma;
+			return vma;
+		}
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * find a VMA
+ * - we don't extend stack VMAs under NOMMU conditions
+ */
+struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+	return find_vma(mm, addr);
+}
+
+/*
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	return -ENOMEM;
+}
+
+/*
+ * look up the first VMA that exactly matches the range addr to addr + len
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+					     unsigned long addr,
+					     unsigned long len)
+{
+	struct vm_area_struct *vma;
+	struct rb_node *n;
+	unsigned long end = addr + len;
+
+	/* check the cache first */
+	vma = mm->mmap_cache;
+	if (vma && vma->vm_start == addr && vma->vm_end == end)
+		return vma;
+
+	/* trawl the tree (there may be multiple mappings in which addr
+	 * resides) */
+	for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
+		vma = rb_entry(n, struct vm_area_struct, vm_rb);
+		if (vma->vm_start < addr)
+			continue;
+		if (vma->vm_start > addr)
+			return NULL;
+		if (vma->vm_end == end) {
+			mm->mmap_cache = vma;
+			return vma;
+		}
+	}
+
+	return NULL;
 }
 
 /*
@@ -595,7 +792,7 @@ static int validate_mmap_request(struct file *file,
 				 unsigned long pgoff,
 				 unsigned long *_capabilities)
 {
-	unsigned long capabilities;
+	unsigned long capabilities, rlen;
 	unsigned long reqprot = prot;
 	int ret;
 
@@ -615,12 +812,12 @@ static int validate_mmap_request(struct file *file,
 		return -EINVAL;
 
 	/* Careful about overflows.. */
-	len = PAGE_ALIGN(len);
-	if (!len || len > TASK_SIZE)
+	rlen = PAGE_ALIGN(len);
+	if (!rlen || rlen > TASK_SIZE)
 		return -ENOMEM;
 
 	/* offset overflow? */
-	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+	if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
 		return -EOVERFLOW;
 
 	if (file) {
@@ -794,9 +991,10 @@ static unsigned long determine_vm_flags(struct file *file,
 }
 
 /*
- * set up a shared mapping on a file
+ * set up a shared mapping on a file (the driver or filesystem provides and
+ * pins the storage)
  */
-static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
+static int do_mmap_shared_file(struct vm_area_struct *vma)
 {
 	int ret;
 
@@ -814,10 +1012,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
 /*
  * set up a private mapping or an anonymous shared mapping
  */
-static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
+static int do_mmap_private(struct vm_area_struct *vma,
+			   struct vm_region *region,
+			   unsigned long len)
 {
+	struct page *pages;
+	unsigned long total, point, n, rlen;
 	void *base;
-	int ret;
+	int ret, order;
 
 	/* invoke the file's mapping function so that it can keep track of
 	 * shared mappings on devices or memory
@@ -836,23 +1038,46 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
 		 * make a private copy of the data and map that instead */
 	}
 
+	rlen = PAGE_ALIGN(len);
+
 	/* allocate some memory to hold the mapping
 	 * - note that this may not return a page-aligned address if the object
 	 *   we're allocating is smaller than a page
 	 */
-	base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
-	if (!base)
+	order = get_order(rlen);
+	kdebug("alloc order %d for %lx", order, len);
+
+	pages = alloc_pages(GFP_KERNEL, order);
+	if (!pages)
 		goto enomem;
 
-	vma->vm_start = (unsigned long) base;
-	vma->vm_end = vma->vm_start + len;
-	vma->vm_flags |= VM_MAPPED_COPY;
+	/* we allocated a power-of-2 sized page set, so we need to trim off the
+	 * excess */
+	total = 1 << order;
+	atomic_add(total, &mmap_pages_allocated);
+
+	point = rlen >> PAGE_SHIFT;
+	while (total > point) {
+		order = ilog2(total - point);
+		n = 1 << order;
+		kdebug("shave %lu/%lu @%lu", n, total - point, total);
+		atomic_sub(n, &mmap_pages_allocated);
+		total -= n;
+		set_page_refcounted(pages + total);
+		__free_pages(pages + total, order);
+	}
+
+	total = rlen >> PAGE_SHIFT;
+	for (point = 1; point < total; point++)
+		set_page_refcounted(&pages[point]);
 
-#ifdef WARN_ON_SLACK
-	if (len + WARN_ON_SLACK <= kobjsize(result))
-		printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
-		       len, current->pid, kobjsize(result) - len);
-#endif
+	base = page_address(pages);
+	region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
+	region->vm_start = (unsigned long) base;
+	region->vm_end   = region->vm_start + rlen;
+
+	vma->vm_start = region->vm_start;
+	vma->vm_end   = region->vm_start + len;
 
 	if (vma->vm_file) {
 		/* read the contents of a file into the copy */
@@ -864,26 +1089,27 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
 
 		old_fs = get_fs();
 		set_fs(KERNEL_DS);
-		ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
+		ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos);
 		set_fs(old_fs);
 
 		if (ret < 0)
 			goto error_free;
 
 		/* clear the last little bit */
-		if (ret < len)
-			memset(base + ret, 0, len - ret);
+		if (ret < rlen)
+			memset(base + ret, 0, rlen - ret);
 
 	} else {
 		/* if it's an anonymous mapping, then just clear it */
-		memset(base, 0, len);
+		memset(base, 0, rlen);
 	}
 
 	return 0;
 
 error_free:
-	kfree(base);
-	vma->vm_start = 0;
+	free_page_series(region->vm_start, region->vm_end);
+	region->vm_start = vma->vm_start = 0;
+	region->vm_end   = vma->vm_end = 0;
 	return ret;
 
 enomem:
@@ -903,13 +1129,14 @@ unsigned long do_mmap_pgoff(struct file *file,
 			    unsigned long flags,
 			    unsigned long pgoff)
 {
-	struct vm_list_struct *vml = NULL;
-	struct vm_area_struct *vma = NULL;
+	struct vm_area_struct *vma;
+	struct vm_region *region;
 	struct rb_node *rb;
-	unsigned long capabilities, vm_flags;
-	void *result;
+	unsigned long capabilities, vm_flags, result;
 	int ret;
 
+	kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
+
 	if (!(flags & MAP_FIXED))
 		addr = round_hint_to_min(addr);
 
@@ -917,73 +1144,120 @@ unsigned long do_mmap_pgoff(struct file *file,
 	 * mapping */
 	ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
 				    &capabilities);
-	if (ret < 0)
+	if (ret < 0) {
+		kleave(" = %d [val]", ret);
 		return ret;
+	}
 
 	/* we've determined that we can make the mapping, now translate what we
 	 * now know into VMA flags */
 	vm_flags = determine_vm_flags(file, prot, flags, capabilities);
 
-	/* we're going to need to record the mapping if it works */
-	vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
-	if (!vml)
-		goto error_getting_vml;
+	/* we're going to need to record the mapping */
+	region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
+	if (!region)
+		goto error_getting_region;
+
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	if (!vma)
+		goto error_getting_vma;
+
+	atomic_set(&region->vm_usage, 1);
+	region->vm_flags = vm_flags;
+	region->vm_pgoff = pgoff;
 
-	down_write(&nommu_vma_sem);
+	INIT_LIST_HEAD(&vma->anon_vma_node);
+	vma->vm_flags = vm_flags;
+	vma->vm_pgoff = pgoff;
 
-	/* if we want to share, we need to check for VMAs created by other
+	if (file) {
+		region->vm_file = file;
+		get_file(file);
+		vma->vm_file = file;
+		get_file(file);
+		if (vm_flags & VM_EXECUTABLE) {
+			added_exe_file_vma(current->mm);
+			vma->vm_mm = current->mm;
+		}
+	}
+
+	down_write(&nommu_region_sem);
+
+	/* if we want to share, we need to check for regions created by other
 	 * mmap() calls that overlap with our proposed mapping
-	 * - we can only share with an exact match on most regular files
+	 * - we can only share with a superset match on most regular files
 	 * - shared mappings on character devices and memory backed files are
 	 *   permitted to overlap inexactly as far as we are concerned for in
 	 *   these cases, sharing is handled in the driver or filesystem rather
 	 *   than here
 	 */
 	if (vm_flags & VM_MAYSHARE) {
-		unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		unsigned long vmpglen;
+		struct vm_region *pregion;
+		unsigned long pglen, rpglen, pgend, rpgend, start;
 
-		/* suppress VMA sharing for shared regions */
-		if (vm_flags & VM_SHARED &&
-		    capabilities & BDI_CAP_MAP_DIRECT)
-			goto dont_share_VMAs;
+		pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		pgend = pgoff + pglen;
 
-		for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
-			vma = rb_entry(rb, struct vm_area_struct, vm_rb);
+		for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
+			pregion = rb_entry(rb, struct vm_region, vm_rb);
 
-			if (!(vma->vm_flags & VM_MAYSHARE))
+			if (!(pregion->vm_flags & VM_MAYSHARE))
 				continue;
 
 			/* search for overlapping mappings on the same file */
-			if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
+			if (pregion->vm_file->f_path.dentry->d_inode !=
+			    file->f_path.dentry->d_inode)
 				continue;
 
-			if (vma->vm_pgoff >= pgoff + pglen)
+			if (pregion->vm_pgoff >= pgend)
 				continue;
 
-			vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1;
-			vmpglen >>= PAGE_SHIFT;
-			if (pgoff >= vma->vm_pgoff + vmpglen)
+			rpglen = pregion->vm_end - pregion->vm_start;
+			rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+			rpgend = pregion->vm_pgoff + rpglen;
+			if (pgoff >= rpgend)
 				continue;
 
-			/* handle inexactly overlapping matches between mappings */
-			if (vma->vm_pgoff != pgoff || vmpglen != pglen) {
+			/* handle inexactly overlapping matches between
+			 * mappings */
+			if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
+			    !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
+				/* new mapping is not a subset of the region */
 				if (!(capabilities & BDI_CAP_MAP_DIRECT))
 					goto sharing_violation;
 				continue;
 			}
 
-			/* we've found a VMA we can share */
-			atomic_inc(&vma->vm_usage);
-
-			vml->vma = vma;
-			result = (void *) vma->vm_start;
-			goto shared;
+			/* we've found a region we can share */
+			atomic_inc(&pregion->vm_usage);
+			vma->vm_region = pregion;
+			start = pregion->vm_start;
+			start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
+			vma->vm_start = start;
+			vma->vm_end = start + len;
+
+			if (pregion->vm_flags & VM_MAPPED_COPY) {
+				kdebug("share copy");
+				vma->vm_flags |= VM_MAPPED_COPY;
+			} else {
+				kdebug("share mmap");
+				ret = do_mmap_shared_file(vma);
+				if (ret < 0) {
+					vma->vm_region = NULL;
+					vma->vm_start = 0;
+					vma->vm_end = 0;
+					atomic_dec(&pregion->vm_usage);
+					pregion = NULL;
+					goto error_just_free;
+				}
+			}
+			fput(region->vm_file);
+			kmem_cache_free(vm_region_jar, region);
+			region = pregion;
+			result = start;
+			goto share;
 		}
 
-	dont_share_VMAs:
-		vma = NULL;
-
 		/* obtain the address at which to make a shared mapping
 		 * - this is the hook for quasi-memory character devices to
 		 *   tell us the location of a shared mapping
@@ -994,102 +1268,93 @@ unsigned long do_mmap_pgoff(struct file *file,
 			if (IS_ERR((void *) addr)) {
 				ret = addr;
 				if (ret != (unsigned long) -ENOSYS)
-					goto error;
+					goto error_just_free;
 
 				/* the driver refused to tell us where to site
 				 * the mapping so we'll have to attempt to copy
 				 * it */
 				ret = (unsigned long) -ENODEV;
 				if (!(capabilities & BDI_CAP_MAP_COPY))
-					goto error;
+					goto error_just_free;
 
 				capabilities &= ~BDI_CAP_MAP_DIRECT;
+			} else {
+				vma->vm_start = region->vm_start = addr;
+				vma->vm_end = region->vm_end = addr + len;
 			}
 		}
 	}
 
-	/* we're going to need a VMA struct as well */
-	vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
-	if (!vma)
-		goto error_getting_vma;
-
-	INIT_LIST_HEAD(&vma->anon_vma_node);
-	atomic_set(&vma->vm_usage, 1);
-	if (file) {
-		get_file(file);
-		if (vm_flags & VM_EXECUTABLE) {
-			added_exe_file_vma(current->mm);
-			vma->vm_mm = current->mm;
-		}
-	}
-	vma->vm_file	= file;
-	vma->vm_flags	= vm_flags;
-	vma->vm_start	= addr;
-	vma->vm_end	= addr + len;
-	vma->vm_pgoff	= pgoff;
-
-	vml->vma = vma;
+	vma->vm_region = region;
 
 	/* set up the mapping */
 	if (file && vma->vm_flags & VM_SHARED)
-		ret = do_mmap_shared_file(vma, len);
+		ret = do_mmap_shared_file(vma);
 	else
-		ret = do_mmap_private(vma, len);
+		ret = do_mmap_private(vma, region, len);
 	if (ret < 0)
-		goto error;
+		goto error_put_region;
+
+	add_nommu_region(region);
 
 	/* okay... we have a mapping; now we have to register it */
-	result = (void *) vma->vm_start;
+	result = vma->vm_start;
 
 	current->mm->total_vm += len >> PAGE_SHIFT;
 
-	add_nommu_vma(vma);
+share:
+	add_vma_to_mm(current->mm, vma);
 
- shared:
-	add_vma_to_mm(current->mm, vml);
-
-	up_write(&nommu_vma_sem);
+	up_write(&nommu_region_sem);
 
 	if (prot & PROT_EXEC)
-		flush_icache_range((unsigned long) result,
-				   (unsigned long) result + len);
+		flush_icache_range(result, result + len);
 
-#ifdef DEBUG
-	printk("do_mmap:\n");
-	show_process_blocks();
-#endif
+	kleave(" = %lx", result);
+	return result;
 
-	return (unsigned long) result;
-
- error:
-	up_write(&nommu_vma_sem);
-	kfree(vml);
+error_put_region:
+	__put_nommu_region(region);
 	if (vma) {
 		if (vma->vm_file) {
 			fput(vma->vm_file);
 			if (vma->vm_flags & VM_EXECUTABLE)
 				removed_exe_file_vma(vma->vm_mm);
 		}
-		kfree(vma);
+		kmem_cache_free(vm_area_cachep, vma);
 	}
+	kleave(" = %d [pr]", ret);
 	return ret;
 
- sharing_violation:
-	up_write(&nommu_vma_sem);
-	printk("Attempt to share mismatched mappings\n");
-	kfree(vml);
-	return -EINVAL;
+error_just_free:
+	up_write(&nommu_region_sem);
+error:
+	fput(region->vm_file);
+	kmem_cache_free(vm_region_jar, region);
+	fput(vma->vm_file);
+	if (vma->vm_flags & VM_EXECUTABLE)
+		removed_exe_file_vma(vma->vm_mm);
+	kmem_cache_free(vm_area_cachep, vma);
+	kleave(" = %d", ret);
+	return ret;
+
+sharing_violation:
+	up_write(&nommu_region_sem);
+	printk(KERN_WARNING "Attempt to share mismatched mappings\n");
+	ret = -EINVAL;
+	goto error;
 
- error_getting_vma:
-	up_write(&nommu_vma_sem);
-	kfree(vml);
-	printk("Allocation of vma for %lu byte allocation from process %d failed\n",
+error_getting_vma:
+	kmem_cache_free(vm_region_jar, region);
+	printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
+	       " from process %d failed\n",
 	       len, current->pid);
 	show_free_areas();
 	return -ENOMEM;
 
- error_getting_vml:
-	printk("Allocation of vml for %lu byte allocation from process %d failed\n",
+error_getting_region:
+	printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
+	       " from process %d failed\n",
 	       len, current->pid);
 	show_free_areas();
 	return -ENOMEM;
@@ -1097,77 +1362,180 @@ unsigned long do_mmap_pgoff(struct file *file,
 EXPORT_SYMBOL(do_mmap_pgoff);
 
 /*
- * handle mapping disposal for uClinux
+ * split a vma into two pieces at address 'addr', a new vma is allocated either
+ * for the first part or the tail.
  */
-static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+	      unsigned long addr, int new_below)
 {
-	if (vma) {
-		down_write(&nommu_vma_sem);
+	struct vm_area_struct *new;
+	struct vm_region *region;
+	unsigned long npages;
 
-		if (atomic_dec_and_test(&vma->vm_usage)) {
-			delete_nommu_vma(vma);
+	kenter("");
 
-			if (vma->vm_ops && vma->vm_ops->close)
-				vma->vm_ops->close(vma);
+	/* we're only permitted to split anonymous regions that have a single
+	 * owner */
+	if (vma->vm_file ||
+	    atomic_read(&vma->vm_region->vm_usage) != 1)
+		return -ENOMEM;
 
-			/* IO memory and memory shared directly out of the pagecache from
-			 * ramfs/tmpfs mustn't be released here */
-			if (vma->vm_flags & VM_MAPPED_COPY)
-				kfree((void *) vma->vm_start);
+	if (mm->map_count >= sysctl_max_map_count)
+		return -ENOMEM;
 
-			if (vma->vm_file) {
-				fput(vma->vm_file);
-				if (vma->vm_flags & VM_EXECUTABLE)
-					removed_exe_file_vma(mm);
-			}
-			kfree(vma);
-		}
+	region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	if (!new) {
+		kmem_cache_free(vm_region_jar, region);
+		return -ENOMEM;
+	}
+
+	/* most fields are the same, copy all, and then fixup */
+	*new = *vma;
+	*region = *vma->vm_region;
+	new->vm_region = region;
+
+	npages = (addr - vma->vm_start) >> PAGE_SHIFT;
+
+	if (new_below) {
+		region->vm_end = new->vm_end = addr;
+	} else {
+		region->vm_start = new->vm_start = addr;
+		region->vm_pgoff = new->vm_pgoff += npages;
+	}
 
-		up_write(&nommu_vma_sem);
+	if (new->vm_ops && new->vm_ops->open)
+		new->vm_ops->open(new);
+
+	delete_vma_from_mm(vma);
+	down_write(&nommu_region_sem);
+	delete_nommu_region(vma->vm_region);
+	if (new_below) {
+		vma->vm_region->vm_start = vma->vm_start = addr;
+		vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
+	} else {
+		vma->vm_region->vm_end = vma->vm_end = addr;
 	}
+	add_nommu_region(vma->vm_region);
+	add_nommu_region(new->vm_region);
+	up_write(&nommu_region_sem);
+	add_vma_to_mm(mm, vma);
+	add_vma_to_mm(mm, new);
+	return 0;
 }
 
 /*
- * release a mapping
- * - under NOMMU conditions the parameters must match exactly to the mapping to
- *   be removed
+ * shrink a VMA by removing the specified chunk from either the beginning or
+ * the end
  */
-int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
+static int shrink_vma(struct mm_struct *mm,
+		      struct vm_area_struct *vma,
+		      unsigned long from, unsigned long to)
 {
-	struct vm_list_struct *vml, **parent;
-	unsigned long end = addr + len;
+	struct vm_region *region;
 
-#ifdef DEBUG
-	printk("do_munmap:\n");
-#endif
+	kenter("");
 
-	for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
-		if ((*parent)->vma->vm_start > addr)
-			break;
-		if ((*parent)->vma->vm_start == addr &&
-		    ((len == 0) || ((*parent)->vma->vm_end == end)))
-			goto found;
-	}
+	/* adjust the VMA's pointers, which may reposition it in the MM's tree
+	 * and list */
+	delete_vma_from_mm(vma);
+	if (from > vma->vm_start)
+		vma->vm_end = from;
+	else
+		vma->vm_start = to;
+	add_vma_to_mm(mm, vma);
 
-	printk("munmap of non-mmaped memory by process %d (%s): %p\n",
-	       current->pid, current->comm, (void *) addr);
-	return -EINVAL;
+	/* cut the backing region down to size */
+	region = vma->vm_region;
+	BUG_ON(atomic_read(&region->vm_usage) != 1);
 
- found:
-	vml = *parent;
+	down_write(&nommu_region_sem);
+	delete_nommu_region(region);
+	if (from > region->vm_start)
+		region->vm_end = from;
+	else
+		region->vm_start = to;
+	add_nommu_region(region);
+	up_write(&nommu_region_sem);
 
-	put_vma(mm, vml->vma);
+	free_page_series(from, to);
+	return 0;
+}
 
-	*parent = vml->next;
-	kfree(vml);
+/*
+ * release a mapping
+ * - under NOMMU conditions the chunk to be unmapped must be backed by a single
+ *   VMA, though it need not cover the whole VMA
+ */
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+{
+	struct vm_area_struct *vma;
+	struct rb_node *rb;
+	unsigned long end = start + len;
+	int ret;
 
-	update_hiwater_vm(mm);
-	mm->total_vm -= len >> PAGE_SHIFT;
+	kenter(",%lx,%zx", start, len);
 
-#ifdef DEBUG
-	show_process_blocks();
-#endif
+	if (len == 0)
+		return -EINVAL;
+
+	/* find the first potentially overlapping VMA */
+	vma = find_vma(mm, start);
+	if (!vma) {
+		printk(KERN_WARNING
+		       "munmap of memory not mmapped by process %d (%s):"
+		       " 0x%lx-0x%lx\n",
+		       current->pid, current->comm, start, start + len - 1);
+		return -EINVAL;
+	}
 
+	/* we're allowed to split an anonymous VMA but not a file-backed one */
+	if (vma->vm_file) {
+		do {
+			if (start > vma->vm_start) {
+				kleave(" = -EINVAL [miss]");
+				return -EINVAL;
+			}
+			if (end == vma->vm_end)
+				goto erase_whole_vma;
+			rb = rb_next(&vma->vm_rb);
+			vma = rb_entry(rb, struct vm_area_struct, vm_rb);
+		} while (rb);
+		kleave(" = -EINVAL [split file]");
+		return -EINVAL;
+	} else {
+		/* the chunk must be a subset of the VMA found */
+		if (start == vma->vm_start && end == vma->vm_end)
+			goto erase_whole_vma;
+		if (start < vma->vm_start || end > vma->vm_end) {
+			kleave(" = -EINVAL [superset]");
+			return -EINVAL;
+		}
+		if (start & ~PAGE_MASK) {
+			kleave(" = -EINVAL [unaligned start]");
+			return -EINVAL;
+		}
+		if (end != vma->vm_end && end & ~PAGE_MASK) {
+			kleave(" = -EINVAL [unaligned split]");
+			return -EINVAL;
+		}
+		if (start != vma->vm_start && end != vma->vm_end) {
+			ret = split_vma(mm, vma, start, 1);
+			if (ret < 0) {
+				kleave(" = %d [split]", ret);
+				return ret;
+			}
+		}
+		return shrink_vma(mm, vma, start, end);
+	}
+
+erase_whole_vma:
+	delete_vma_from_mm(vma);
+	delete_vma(mm, vma);
+	kleave(" = 0");
 	return 0;
 }
 EXPORT_SYMBOL(do_munmap);
@@ -1184,29 +1552,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
 }
 
 /*
- * Release all mappings
+ * release all the mappings made in a process's VM space
  */
-void exit_mmap(struct mm_struct * mm)
+void exit_mmap(struct mm_struct *mm)
 {
-	struct vm_list_struct *tmp;
+	struct vm_area_struct *vma;
 
-	if (mm) {
-#ifdef DEBUG
-		printk("Exit_mmap:\n");
-#endif
+	if (!mm)
+		return;
 
-		mm->total_vm = 0;
+	kenter("");
 
-		while ((tmp = mm->context.vmlist)) {
-			mm->context.vmlist = tmp->next;
-			put_vma(mm, tmp->vma);
-			kfree(tmp);
-		}
+	mm->total_vm = 0;
 
-#ifdef DEBUG
-		show_process_blocks();
-#endif
+	while ((vma = mm->mmap)) {
+		mm->mmap = vma->vm_next;
+		delete_vma_from_mm(vma);
+		delete_vma(mm, vma);
 	}
+
+	kleave("");
 }
 
 unsigned long do_brk(unsigned long addr, unsigned long len)
@@ -1219,8 +1584,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
  * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
  *
  * under NOMMU conditions, we only permit changing a mapping's size, and only
- * as long as it stays within the hole allocated by the kmalloc() call in
- * do_mmap_pgoff() and the block is not shareable
+ * as long as it stays within the region allocated by do_mmap_private() and the
+ * block is not shareable
  *
  * MREMAP_FIXED is not supported under NOMMU conditions
  */
@@ -1231,13 +1596,16 @@ unsigned long do_mremap(unsigned long addr,
 	struct vm_area_struct *vma;
 
 	/* insanity checks first */
-	if (new_len == 0)
+	if (old_len == 0 || new_len == 0)
 		return (unsigned long) -EINVAL;
 
+	if (addr & ~PAGE_MASK)
+		return -EINVAL;
+
 	if (flags & MREMAP_FIXED && new_addr != addr)
 		return (unsigned long) -EINVAL;
 
-	vma = find_vma_exact(current->mm, addr);
+	vma = find_vma_exact(current->mm, addr, old_len);
 	if (!vma)
 		return (unsigned long) -EINVAL;
 
@@ -1247,19 +1615,19 @@ unsigned long do_mremap(unsigned long addr,
 	if (vma->vm_flags & VM_MAYSHARE)
 		return (unsigned long) -EPERM;
 
-	if (new_len > kobjsize((void *) addr))
+	if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
 		return (unsigned long) -ENOMEM;
 
 	/* all checks complete - do it */
 	vma->vm_end = vma->vm_start + new_len;
-
 	return vma->vm_start;
 }
 EXPORT_SYMBOL(do_mremap);
 
-asmlinkage unsigned long sys_mremap(unsigned long addr,
-	unsigned long old_len, unsigned long new_len,
-	unsigned long flags, unsigned long new_addr)
+asmlinkage
+unsigned long sys_mremap(unsigned long addr,
+			 unsigned long old_len, unsigned long new_len,
+			 unsigned long flags, unsigned long new_addr)
 {
 	unsigned long ret;
 
-- 
cgit v1.3-14-g43fede


From b53907c0100a353a7ac53bed260e735e5ccbbbcc Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 7 Jan 2009 18:09:11 -0800
Subject: generic swap(): lib/sort.c: rename swap to swap_func

This is to avoid name clashes for the introduction of a global swap()
macro.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/sort.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'lib')

diff --git a/lib/sort.c b/lib/sort.c
index 6abbaf3d5858..926d00429ed2 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -32,11 +32,11 @@ static void generic_swap(void *a, void *b, int size)
  * @base: pointer to data to sort
  * @num: number of elements
  * @size: size of each element
- * @cmp: pointer to comparison function
- * @swap: pointer to swap function or NULL
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
  *
  * This function does a heapsort on the given array. You may provide a
- * swap function optimized to your element type.
+ * swap_func function optimized to your element type.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * qsort is about 20% faster on average, it suffers from exploitable
@@ -45,37 +45,39 @@ static void generic_swap(void *a, void *b, int size)
  */
 
 void sort(void *base, size_t num, size_t size,
-	  int (*cmp)(const void *, const void *),
-	  void (*swap)(void *, void *, int size))
+	  int (*cmp_func)(const void *, const void *),
+	  void (*swap_func)(void *, void *, int size))
 {
 	/* pre-scale counters for performance */
 	int i = (num/2 - 1) * size, n = num * size, c, r;
 
-	if (!swap)
-		swap = (size == 4 ? u32_swap : generic_swap);
+	if (!swap_func)
+		swap_func = (size == 4 ? u32_swap : generic_swap);
 
 	/* heapify */
 	for ( ; i >= 0; i -= size) {
 		for (r = i; r * 2 + size < n; r  = c) {
 			c = r * 2 + size;
-			if (c < n - size && cmp(base + c, base + c + size) < 0)
+			if (c < n - size &&
+					cmp_func(base + c, base + c + size) < 0)
 				c += size;
-			if (cmp(base + r, base + c) >= 0)
+			if (cmp_func(base + r, base + c) >= 0)
 				break;
-			swap(base + r, base + c, size);
+			swap_func(base + r, base + c, size);
 		}
 	}
 
 	/* sort */
 	for (i = n - size; i > 0; i -= size) {
-		swap(base, base + i, size);
+		swap_func(base, base + i, size);
 		for (r = 0; r * 2 + size < i; r = c) {
 			c = r * 2 + size;
-			if (c < i - size && cmp(base + c, base + c + size) < 0)
+			if (c < i - size &&
+					cmp_func(base + c, base + c + size) < 0)
 				c += size;
-			if (cmp(base + r, base + c) >= 0)
+			if (cmp_func(base + r, base + c) >= 0)
 				break;
-			swap(base + r, base + c, size);
+			swap_func(base + r, base + c, size);
 		}
 	}
 }
-- 
cgit v1.3-14-g43fede


From f4b477c47332367d35686bd2b808c2156b96d7c7 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sat, 10 Jan 2009 11:12:09 +0000
Subject: rbtree: add const qualifier to some functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'rb_first()', 'rb_last()', 'rb_next()' and 'rb_prev()' calls
take a pointer to an RB node or RB root. They do not change the
pointed objects, so add a 'const' qualifier in order to make life
of the users of these functions easier.

Indeed, if I have my own constant pointer 'const struct my_type *p',
and I call 'rb_next(&p->rb)', I get a GCC warning:

warning: passing argument 1 of ‘rb_next’ discards qualifiers from pointer target type

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rbtree.h |  8 ++++----
 lib/rbtree.c           | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'lib')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 344bc3495ddb..9c295411d01f 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -140,10 +140,10 @@ extern void rb_insert_color(struct rb_node *, struct rb_root *);
 extern void rb_erase(struct rb_node *, struct rb_root *);
 
 /* Find logical next and previous nodes in a tree */
-extern struct rb_node *rb_next(struct rb_node *);
-extern struct rb_node *rb_prev(struct rb_node *);
-extern struct rb_node *rb_first(struct rb_root *);
-extern struct rb_node *rb_last(struct rb_root *);
+extern struct rb_node *rb_next(const struct rb_node *);
+extern struct rb_node *rb_prev(const struct rb_node *);
+extern struct rb_node *rb_first(const struct rb_root *);
+extern struct rb_node *rb_last(const struct rb_root *);
 
 /* Fast replacement of a single node without remove/rebalance/add/rebalance */
 extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 48499c2d88cc..9956b99649f0 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -292,7 +292,7 @@ EXPORT_SYMBOL(rb_erase);
 /*
  * This function returns the first node (in sort order) of the tree.
  */
-struct rb_node *rb_first(struct rb_root *root)
+struct rb_node *rb_first(const struct rb_root *root)
 {
 	struct rb_node	*n;
 
@@ -305,7 +305,7 @@ struct rb_node *rb_first(struct rb_root *root)
 }
 EXPORT_SYMBOL(rb_first);
 
-struct rb_node *rb_last(struct rb_root *root)
+struct rb_node *rb_last(const struct rb_root *root)
 {
 	struct rb_node	*n;
 
@@ -318,7 +318,7 @@ struct rb_node *rb_last(struct rb_root *root)
 }
 EXPORT_SYMBOL(rb_last);
 
-struct rb_node *rb_next(struct rb_node *node)
+struct rb_node *rb_next(const struct rb_node *node)
 {
 	struct rb_node *parent;
 
@@ -331,7 +331,7 @@ struct rb_node *rb_next(struct rb_node *node)
 		node = node->rb_right; 
 		while (node->rb_left)
 			node=node->rb_left;
-		return node;
+		return (struct rb_node *)node;
 	}
 
 	/* No right-hand children.  Everything down and left is
@@ -347,7 +347,7 @@ struct rb_node *rb_next(struct rb_node *node)
 }
 EXPORT_SYMBOL(rb_next);
 
-struct rb_node *rb_prev(struct rb_node *node)
+struct rb_node *rb_prev(const struct rb_node *node)
 {
 	struct rb_node *parent;
 
@@ -360,7 +360,7 @@ struct rb_node *rb_prev(struct rb_node *node)
 		node = node->rb_left; 
 		while (node->rb_right)
 			node=node->rb_right;
-		return node;
+		return (struct rb_node *)node;
 	}
 
 	/* No left-hand children. Go up till we find an ancestor which
-- 
cgit v1.3-14-g43fede


From b098161b4d0231f7dc5306111d576c0bfe0c8eba Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 15 Jan 2009 13:51:00 -0800
Subject: idr: fix wrong kernel-doc

idr_get_new_above() and ida_get_new_above() return an id in the range of
@starting_id ... 0x7fffffff, not 0 ... 0x7fffffff.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/idr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/idr.c b/lib/idr.c
index 1c4f9281f412..6b7cfa6508fe 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -292,7 +292,7 @@ static int idr_get_new_above_int(struct idr *idp, void *ptr, int starting_id)
  * and go back to the idr_pre_get() call.  If the idr is full, it will
  * return -ENOSPC.
  *
- * @id returns a value in the range 0 ... 0x7fffffff
+ * @id returns a value in the range @starting_id ... 0x7fffffff
  */
 int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id)
 {
@@ -723,7 +723,7 @@ EXPORT_SYMBOL(ida_pre_get);
  * and go back to the ida_pre_get() call.  If the ida is full, it will
  * return -ENOSPC.
  *
- * @p_id returns a value in the range 0 ... 0x7fffffff.
+ * @p_id returns a value in the range @starting_id ... 0x7fffffff.
  */
 int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
 {
-- 
cgit v1.3-14-g43fede


From 5b019e99016f3a692ba45bf68fba73a402d7c01a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 15 Jan 2009 13:51:21 -0800
Subject: lib/idr.c: use kmem_cache_zalloc() for the idr_layer cache

David points out that the idr_remove_all() function returns unused slabs
to the kmem cache, but needs to zero them first or else they will be
uninitialized upon next use.  This causes crashes which have been observed
in the firewire subsystem.

He fixed this by zeroing the object before freeing it in idr_remove_all().

But we agree that simply removing the constructor and zeroing the object
at allocation time is simpler than relying upon slab constructor machinery
and might even be faster.

This problem was introduced by "idr: make idr_remove rcu-safe" (commit
cf481c20c476ad2c0febdace9ce23f5a4db19582), which was first released in
2.6.27.

There are no known codesites which trigger this bug in 2.6.27 or 2.6.28.
The post-2.6.28 firewire changes are the only known triggerer.

There might of course be not-yet-discovered triggerers in 2.6.27 and
2.6.28, and there might be out-of-tree triggerers which are added to those
kernel versions.  I'll let the -stable guys decide whether they want to
backport this fix.

Reported-by: David Moore <dcm@acm.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Nadia Derbey <Nadia.Derbey@bull.net>
Cc: Paul E. McKenney <paulmck@us.ibm.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Kristian Høgsberg <krh@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/idr.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'lib')

diff --git a/lib/idr.c b/lib/idr.c
index 6b7cfa6508fe..c11c5765cdef 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -121,7 +121,7 @@ int idr_pre_get(struct idr *idp, gfp_t gfp_mask)
 {
 	while (idp->id_free_cnt < IDR_FREE_MAX) {
 		struct idr_layer *new;
-		new = kmem_cache_alloc(idr_layer_cache, gfp_mask);
+		new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
 		if (new == NULL)
 			return (0);
 		move_to_free_list(idp, new);
@@ -623,16 +623,10 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 }
 EXPORT_SYMBOL(idr_replace);
 
-static void idr_cache_ctor(void *idr_layer)
-{
-	memset(idr_layer, 0, sizeof(struct idr_layer));
-}
-
 void __init idr_init_cache(void)
 {
 	idr_layer_cache = kmem_cache_create("idr_layer_cache",
-				sizeof(struct idr_layer), 0, SLAB_PANIC,
-				idr_cache_ctor);
+				sizeof(struct idr_layer), 0, SLAB_PANIC, NULL);
 }
 
 /**
-- 
cgit v1.3-14-g43fede