From dae06ac43d56d23e50a2300d511b32a9e38cd657 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 3 Sep 2005 15:54:42 -0700
Subject: [PATCH] swap: update swsusp use of swap_info

Aha, swsusp dips into swap_info[], better update it to swap_lock.  It's
bitflipping flags with 0xFF, so get_swap_page will allocate from only the one
chosen device: let's change that to flip SWP_WRITEOK.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f2bc71b9fe8b..975b1648a806 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -179,9 +179,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
 
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	for (i=0; i<MAX_SWAPFILES; i++) {
-		if (swap_info[i].flags == 0) {
+		if (!(swap_info[i].flags & SWP_WRITEOK)) {
 			swapfile_used[i]=SWAPFILE_UNUSED;
 		} else {
 			if (!len) {
@@ -202,7 +202,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 			}
 		}
 	}
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
 	return (root_swap != 0xffff) ? 0 : -ENODEV;
 }
 
@@ -216,12 +216,12 @@ static void lock_swapdevices(void)
 {
 	int i;
 
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	for (i = 0; i< MAX_SWAPFILES; i++)
 		if (swapfile_used[i] == SWAPFILE_IGNORED) {
-			swap_info[i].flags ^= 0xFF;
+			swap_info[i].flags ^= SWP_WRITEOK;
 		}
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 2a23b5d1e119fd10e25b8e93464c8d549f5a5c5d Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 3 Sep 2005 15:56:53 -0700
Subject: [PATCH] remove busywait in refrigerator

This should make refrigerator sleep properly, not busywait after the first
schedule() returns.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3bd0d261818f..f7da5bfc914e 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -38,7 +38,6 @@ void refrigerator(void)
 	   processes around? */
 	long save;
 	save = current->state;
-	current->state = TASK_UNINTERRUPTIBLE;
 	pr_debug("%s entered refrigerator\n", current->comm);
 	printk("=");
 
@@ -47,8 +46,10 @@ void refrigerator(void)
 	recalc_sigpending(); /* We sent fake signal, clean it up */
 	spin_unlock_irq(&current->sighand->siglock);
 
-	while (frozen(current))
+	while (frozen(current)) {
+		current->state = TASK_UNINTERRUPTIBLE;
 		schedule();
+	}
 	pr_debug("%s left refrigerator\n", current->comm);
 	current->state = save;
 }
-- 
cgit v1.2.3-59-g8ed1b


From c2ff18f4070f6303a81fd7d9d967d7c9e01b588f Mon Sep 17 00:00:00 2001
From: Andreas Steinmetz <ast@domdv.de>
Date: Sat, 3 Sep 2005 15:56:59 -0700
Subject: [PATCH] encrypt suspend data for easy wiping

The patch protects against leaking sensitive data after resume from suspend.
During suspend a temporary key is created and this key is used to encrypt the
data written to disk.  Once the data has been read back into memory during
resume, the temporary key is destroyed, which simply means that all data
written to disk during suspend are then inaccessible so they can't be stolen
later on.

Think of the following: you suspend while an application is running that keeps
sensitive data in memory.  The application itself prevents the data from being
swapped out.  Suspend, however, must write these data to swap to be able to
resume later on.  Without suspend encryption your sensitive data are then
stored in plaintext on disk.  This means that after resume your sensitive data
are accessible to all applications having direct access to the swap device
which was used for suspend.  If you don't need swap after resume these data
can remain on disk virtually forever.  Thus it can happen that your system
gets broken into weeks later, and sensitive data which you thought were
encrypted and protected are retrieved and stolen from the swap device.

Signed-off-by: Andreas Steinmetz <ast@domdv.de>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/Kconfig  |  12 ++++
 kernel/power/swsusp.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 171 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2c7121d9bff1..917066a5767c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -72,6 +72,18 @@ config PM_STD_PARTITION
 	  suspended image to. It will simply pick the first available swap 
 	  device.
 
+config SWSUSP_ENCRYPT
+	bool "Encrypt suspend image"
+	depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
+	default ""
+	---help---
+	  To prevent data gathering from swap after resume you can encrypt
+	  the suspend image with a temporary key that is deleted on
+	  resume.
+
+	  Note that the temporary key is stored unencrypted on disk while the
+	  system is suspended.
+
 config SUSPEND_SMP
 	bool
 	depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 975b1648a806..b041cea2e878 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,6 +31,9 @@
  * Alex Badea <vampire@go.ro>:
  * Fixed runaway init
  *
+ * Andreas Steinmetz <ast@domdv.de>:
+ * Added encrypted suspend option
+ *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -71,8 +74,16 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
+#include <linux/random.h>
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+
 #include "power.h"
 
+#define CIPHER "aes"
+#define MAXKEY 32
+#define MAXIV  32
+
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
@@ -103,7 +114,8 @@ static suspend_pagedir_t *pagedir_save;
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
+	char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)];
+	u8 key_iv[MAXKEY+MAXIV];
 	swp_entry_t swsusp_info;
 	char	orig_sig[10];
 	char	sig[10];
@@ -129,6 +141,131 @@ static struct swsusp_info swsusp_info;
 static unsigned short swapfile_used[MAX_SWAPFILES];
 static unsigned short root_swap;
 
+static int write_page(unsigned long addr, swp_entry_t * loc);
+static int bio_read_page(pgoff_t page_off, void * page);
+
+static u8 key_iv[MAXKEY+MAXIV];
+
+#ifdef CONFIG_SWSUSP_ENCRYPT
+
+static int crypto_init(int mode, void **mem)
+{
+	int error = 0;
+	int len;
+	char *modemsg;
+	struct crypto_tfm *tfm;
+
+	modemsg = mode ? "suspend not possible" : "resume not possible";
+
+	tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
+	if(!tfm) {
+		printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
+		error = -EINVAL;
+		goto out;
+	}
+
+	if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
+		printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
+		error = -ENOKEY;
+		goto fail;
+	}
+
+	if (mode)
+		get_random_bytes(key_iv, MAXKEY+MAXIV);
+
+	len = crypto_tfm_alg_max_keysize(tfm);
+	if (len > MAXKEY)
+		len = MAXKEY;
+
+	if (crypto_cipher_setkey(tfm, key_iv, len)) {
+		printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
+		error = -EKEYREJECTED;
+		goto fail;
+	}
+
+	len = crypto_tfm_alg_ivsize(tfm);
+
+	if (MAXIV < len) {
+		printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
+		error = -EOVERFLOW;
+		goto fail;
+	}
+
+	crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
+
+	*mem=(void *)tfm;
+
+	goto out;
+
+fail:	crypto_free_tfm(tfm);
+out:	return error;
+}
+
+static __inline__ void crypto_exit(void *mem)
+{
+	crypto_free_tfm((struct crypto_tfm *)mem);
+}
+
+static __inline__ int crypto_write(struct pbe *p, void *mem)
+{
+	int error = 0;
+	struct scatterlist src, dst;
+
+	src.page   = virt_to_page(p->address);
+	src.offset = 0;
+	src.length = PAGE_SIZE;
+	dst.page   = virt_to_page((void *)&swsusp_header);
+	dst.offset = 0;
+	dst.length = PAGE_SIZE;
+
+	error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
+					PAGE_SIZE);
+
+	if (!error)
+		error = write_page((unsigned long)&swsusp_header,
+				&(p->swap_address));
+	return error;
+}
+
+static __inline__ int crypto_read(struct pbe *p, void *mem)
+{
+	int error = 0;
+	struct scatterlist src, dst;
+
+	error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
+	if (!error) {
+		src.offset = 0;
+		src.length = PAGE_SIZE;
+		dst.offset = 0;
+		dst.length = PAGE_SIZE;
+		src.page = dst.page = virt_to_page((void *)p->address);
+
+		error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
+						&src, PAGE_SIZE);
+	}
+	return error;
+}
+#else
+static __inline__ int crypto_init(int mode, void *mem)
+{
+	return 0;
+}
+
+static __inline__ void crypto_exit(void *mem)
+{
+}
+
+static __inline__ int crypto_write(struct pbe *p, void *mem)
+{
+	return write_page(p->address, &(p->swap_address));
+}
+
+static __inline__ int crypto_read(struct pbe *p, void *mem)
+{
+	return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
+}
+#endif
+
 static int mark_swapfiles(swp_entry_t prev)
 {
 	int error;
@@ -140,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev)
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
+		memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV);
 		swsusp_header.swsusp_info = prev;
 		error = rw_swap_page_sync(WRITE,
 					  swp_entry(root_swap, 0),
@@ -286,6 +424,10 @@ static int data_write(void)
 	int error = 0, i = 0;
 	unsigned int mod = nr_copy_pages / 100;
 	struct pbe *p;
+	void *tfm;
+
+	if ((error = crypto_init(1, &tfm)))
+		return error;
 
 	if (!mod)
 		mod = 1;
@@ -294,11 +436,14 @@ static int data_write(void)
 	for_each_pbe (p, pagedir_nosave) {
 		if (!(i%mod))
 			printk( "\b\b\b\b%3d%%", i / mod );
-		if ((error = write_page(p->address, &(p->swap_address))))
+		if ((error = crypto_write(p, tfm))) {
+			crypto_exit(tfm);
 			return error;
+		}
 		i++;
 	}
 	printk("\b\b\b\bdone\n");
+	crypto_exit(tfm);
 	return error;
 }
 
@@ -400,6 +545,7 @@ static int write_suspend_image(void)
 	if ((error = close_swap()))
 		goto FreePagedir;
  Done:
+	memset(key_iv, 0, MAXKEY+MAXIV);
 	return error;
  FreePagedir:
 	free_pagedir_entries();
@@ -1212,6 +1358,8 @@ static int check_sig(void)
 		return error;
 	if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+		memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
+		memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
 
 		/*
 		 * Reset swap signature now.
@@ -1239,6 +1387,10 @@ static int data_read(struct pbe *pblist)
 	int error = 0;
 	int i = 0;
 	int mod = swsusp_info.image_pages / 100;
+	void *tfm;
+
+	if ((error = crypto_init(0, &tfm)))
+		return error;
 
 	if (!mod)
 		mod = 1;
@@ -1250,14 +1402,15 @@ static int data_read(struct pbe *pblist)
 		if (!(i % mod))
 			printk("\b\b\b\b%3d%%", i / mod);
 
-		error = bio_read_page(swp_offset(p->swap_address),
-				  (void *)p->address);
-		if (error)
+		if ((error = crypto_read(p, tfm))) {
+			crypto_exit(tfm);
 			return error;
+		}
 
 		i++;
 	}
 	printk("\b\b\b\bdone\n");
+	crypto_exit(tfm);
 	return error;
 }
 
@@ -1385,6 +1538,7 @@ int swsusp_read(void)
 
 	error = read_suspend_image();
 	blkdev_put(resume_bdev);
+	memset(key_iv, 0, MAXKEY+MAXIV);
 
 	if (!error)
 		pr_debug("swsusp: Reading resume file was successful\n");
-- 
cgit v1.2.3-59-g8ed1b


From 56057e1a128a9aab516350500e5b154e70577929 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <xschmi00@stud.feec.vutbr.cz>
Date: Sat, 3 Sep 2005 15:57:02 -0700
Subject: [PATCH] swsusp: simpler calculation of number of pages in PBE list

The function calc_nr uses an iterative algorithm to calculate the number of
pages needed for the image and the pagedir.  Exactly the same result can be
obtained with a one-line expression.

Note that this was even proved correct ;-).

Signed-off-by: Michal Schmidt <xschmi00@stud.feec.vutbr.cz>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index b041cea2e878..1681e8a3fe51 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -737,18 +737,7 @@ static void copy_data_pages(void)
 
 static int calc_nr(int nr_copy)
 {
-	int extra = 0;
-	int mod = !!(nr_copy % PBES_PER_PAGE);
-	int diff = (nr_copy / PBES_PER_PAGE) + mod;
-
-	do {
-		extra += diff;
-		nr_copy += diff;
-		mod = !!(nr_copy % PBES_PER_PAGE);
-		diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
-	} while (diff > 0);
-
-	return nr_copy;
+	return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From dd5d666b7995e542b7f81a4bb1c7ad634f4f6c51 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Sat, 3 Sep 2005 15:57:04 -0700
Subject: [PATCH] swsusp: add locking to software_resume

It is trying to protect swsusp_resume_device and software_resume() from two
users banging it from userspace at the same time.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 664eb0469b6e..88beec6dcd11 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -233,9 +233,12 @@ static int software_resume(void)
 {
 	int error;
 
+	down(&pm_sem);
 	if (!swsusp_resume_device) {
-		if (!strlen(resume_file))
+		if (!strlen(resume_file)) {
+			up(&pm_sem);
 			return -ENOENT;
+		}
 		swsusp_resume_device = name_to_dev_t(resume_file);
 		pr_debug("swsusp: Resume From Partition %s\n", resume_file);
 	} else {
@@ -248,6 +251,7 @@ static int software_resume(void)
 		 * FIXME: If noresume is specified, we need to find the partition
 		 * and reset it back to normal swap space.
 		 */
+		up(&pm_sem);
 		return 0;
 	}
 
@@ -284,6 +288,8 @@ static int software_resume(void)
  Cleanup:
 	unprepare_processes();
  Done:
+	/* For success case, the suspend path will release the lock */
+	up(&pm_sem);
 	pr_debug("PM: Resume from disk failed.\n");
 	return 0;
 }
@@ -390,7 +396,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t
 	if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
 		res = MKDEV(maj,min);
 		if (maj == MAJOR(res) && min == MINOR(res)) {
+			down(&pm_sem);
 			swsusp_resume_device = res;
+			up(&pm_sem);
 			printk("Attempting manual resume\n");
 			noresume = 0;
 			software_resume();
-- 
cgit v1.2.3-59-g8ed1b


From 99dc7d63e0dcb457580241055b2a39d011309db8 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 3 Sep 2005 15:57:05 -0700
Subject: [PATCH] swsusp: fix error handling and cleanups

Drop printing during normal boot (when no image exists in swap), print
message when drivers fail, fix error paths and consolidate near-identical
functions in disk.c (and functions with just one statement).

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c   | 45 ++++++++++++++++-----------------------------
 kernel/power/swsusp.c | 12 ++++++------
 2 files changed, 22 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 88beec6dcd11..2d8bf054d036 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -112,24 +112,12 @@ static inline void platform_finish(void)
 	}
 }
 
-static void finish(void)
-{
-	device_resume();
-	platform_finish();
-	thaw_processes();
-	enable_nonboot_cpus();
-	pm_restore_console();
-}
-
-
 static int prepare_processes(void)
 {
 	int error;
 
 	pm_prepare_console();
-
 	sys_sync();
-
 	disable_nonboot_cpus();
 
 	if (freeze_processes()) {
@@ -162,15 +150,6 @@ static void unprepare_processes(void)
 	pm_restore_console();
 }
 
-static int prepare_devices(void)
-{
-	int error;
-
-	if ((error = device_suspend(PMSG_FREEZE)))
-		printk("Some devices failed to suspend\n");
-	return error;
-}
-
 /**
  *	pm_suspend_disk - The granpappy of power management.
  *
@@ -187,17 +166,14 @@ int pm_suspend_disk(void)
 	error = prepare_processes();
 	if (error)
 		return error;
-	error = prepare_devices();
 
+	error = device_suspend(PMSG_FREEZE);
 	if (error) {
+		printk("Some devices failed to suspend\n");
 		unprepare_processes();
 		return error;
 	}
 
-	pr_debug("PM: Attempting to suspend to disk.\n");
-	if (pm_disk_mode == PM_DISK_FIRMWARE)
-		return pm_ops->enter(PM_SUSPEND_DISK);
-
 	pr_debug("PM: snapshotting memory.\n");
 	in_suspend = 1;
 	if ((error = swsusp_suspend()))
@@ -208,11 +184,20 @@ int pm_suspend_disk(void)
 		error = swsusp_write();
 		if (!error)
 			power_down(pm_disk_mode);
+		else {
+		/* swsusp_write can not fail in device_resume,
+		   no need to do second device_resume */
+			swsusp_free();
+			unprepare_processes();
+			return error;
+		}
 	} else
 		pr_debug("PM: Image restored successfully.\n");
+
 	swsusp_free();
  Done:
-	finish();
+	device_resume();
+	unprepare_processes();
 	return error;
 }
 
@@ -274,15 +259,17 @@ static int software_resume(void)
 
 	pr_debug("PM: Preparing devices for restore.\n");
 
-	if ((error = prepare_devices()))
+	if ((error = device_suspend(PMSG_FREEZE))) {
+		printk("Some devices failed to suspend\n");
 		goto Free;
+	}
 
 	mb();
 
 	pr_debug("PM: Restoring saved image.\n");
 	swsusp_resume();
 	pr_debug("PM: Restore failed, recovering.n");
-	finish();
+	device_resume();
  Free:
 	swsusp_free();
  Cleanup:
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 1681e8a3fe51..eaacd5cb5889 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -530,7 +530,6 @@ static int write_pagedir(void)
  *	write_suspend_image - Write entire image and metadata.
  *
  */
-
 static int write_suspend_image(void)
 {
 	int error;
@@ -1021,20 +1020,21 @@ int swsusp_suspend(void)
 	 * at resume time, and evil weirdness ensues.
 	 */
 	if ((error = device_power_down(PMSG_FREEZE))) {
+		printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
 		local_irq_enable();
 		return error;
 	}
 
 	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
-				"swapon -a!\n");
+		printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
+		device_power_up();
 		local_irq_enable();
 		return error;
 	}
 
 	save_processor_state();
 	if ((error = swsusp_arch_suspend()))
-		printk("Error %d suspending\n", error);
+		printk(KERN_ERR "Error %d suspending\n", error);
 	/* Restore control flow magically appears here */
 	restore_processor_state();
 	BUG_ON (nr_copy_pages_check != nr_copy_pages);
@@ -1314,7 +1314,8 @@ static const char * sanity_check(void)
 	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
 		return "machine";
 #if 0
-	if(swsusp_info.cpus != num_online_cpus())
+	/* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
+	if (swsusp_info.cpus != num_possible_cpus())
 		return "number of cpus";
 #endif
 	return NULL;
@@ -1355,7 +1356,6 @@ static int check_sig(void)
 		 */
 		error = bio_write_page(0, &swsusp_header);
 	} else { 
-		printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
 		return -EINVAL;
 	}
 	if (!error)
-- 
cgit v1.2.3-59-g8ed1b


From 6161b2ce8116b9a623260ab811e2c035b3fac2e5 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 3 Sep 2005 15:57:05 -0700
Subject: [PATCH] pm: fix process freezing

If process freezing fails, some processes are frozen, and the rest are left in
the "were asked to be frozen" state.  That's wrong; we should leave them in a
consistent state.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index f7da5bfc914e..28de118f7a0b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -81,13 +81,33 @@ int freeze_processes(void)
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 		yield();			/* Yield is okay here */
-		if (time_after(jiffies, start_time + TIMEOUT)) {
+		if (todo && time_after(jiffies, start_time + TIMEOUT)) {
 			printk( "\n" );
 			printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
-			return todo;
+			break;
 		}
 	} while(todo);
 
+	/* This does not unfreeze processes that are already frozen
+	 * (we have slightly ugly calling convention in that respect,
+	 * and caller must call thaw_processes() if something fails),
+	 * but it cleans up leftover PF_FREEZE requests.
+	 */
+	if (todo) {
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p)
+			if (freezing(p)) {
+				pr_debug("  clean up: %s\n", p->comm);
+				p->flags &= ~PF_FREEZE;
+				spin_lock_irqsave(&p->sighand->siglock, flags);
+				recalc_sigpending_tsk(p);
+				spin_unlock_irqrestore(&p->sighand->siglock, flags);
+			}
+		while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+		return todo;
+	}
+
 	printk( "|\n" );
 	BUG_ON(in_atomic());
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 57c4ce3cbfba1bb0da7f37b9328a713cbd5d0919 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 3 Sep 2005 15:57:06 -0700
Subject: [PATCH] pm: clean up /sys/power/disk

Clean code up a bit, and only show suspend to disk as available when
it is configured in.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 71aa0fd22007..22bdc93cc038 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state)
 
 
-static char * pm_states[] = {
+static char *pm_states[PM_SUSPEND_MAX] = {
 	[PM_SUSPEND_STANDBY]	= "standby",
 	[PM_SUSPEND_MEM]	= "mem",
+#ifdef CONFIG_SOFTWARE_SUSPEND
 	[PM_SUSPEND_DISK]	= "disk",
-	NULL,
+#endif
 };
 
 
-- 
cgit v1.2.3-59-g8ed1b


From ed75e8d58010fdc06e2c3a81bfbebae92314c7e3 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <LaurentVivier@wanadoo.fr>
Date: Sat, 3 Sep 2005 15:57:18 -0700
Subject: [PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML
 and general usage

      Jeff Dike <jdike@addtoit.com>,
      Paolo 'Blaisorblade' Giarrusso <blaisorblade_spam@yahoo.it>,
      Bodo Stroesser <bstroesser@fujitsu-siemens.com>

Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL
except that the kernel does not execute the requested syscall; this is useful
to improve performance for virtual environments, like UML, which want to run
the syscall on their own.

In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry
and on exit, and each time you also have two context switches; with SYSEMU you
avoid the 2nd stop and so save two context switches per syscall.

Also, some architectures don't have support in the host for changing the
syscall number via ptrace(), which is currently needed to skip syscall
execution (UML turns any syscall into getpid() to avoid it being executed on
the host).  Fixing that is hard, while SYSEMU is easier to implement.

* This version of the patch includes some suggestions of Jeff Dike to avoid
  adding any instructions to the syscall fast path, plus some other little
  changes, by myself, to make it work even when the syscall is executed with
  SYSENTER (but I'm unsure about them). It has been widely tested for quite a
  lot of time.

* Various fixes were included to handle the various switches between
  various states, i.e. when for instance a syscall entry is traced with one of
  PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit.
  Basically, this is done by remembering which one of them was used even after
  the call to ptrace_notify().

* We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP
  to make do_syscall_trace() notice that the current syscall was started with
  SYSEMU on entry, so that no notification ought to be done in the exit path;
  this is a bit of a hack, so this problem is solved in another way in next
  patches.

* Also, the effects of the patch:
"Ptrace - i386: fix Syscall Audit interaction with singlestep"
are cancelled; they are restored back in the last patch of this series.

Detailed descriptions of the patches doing this kind of processing follow (but
I've already summed everything up).

* Fix behaviour when changing interception kind #1.

  In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag
  only after doing the debugger notification; but the debugger might have
  changed the status of this flag because it continued execution with
  PTRACE_SYSCALL, so this is wrong.  This patch fixes it by saving the flag
  status before calling ptrace_notify().

* Fix behaviour when changing interception kind #2:
  avoid intercepting syscall on return when using SYSCALL again.

  A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL
  crashes.

  The problem is in arch/i386/kernel/entry.S.  The current SYSEMU patch
  prevents the syscall handler from being called, but does not prevent
  do_syscall_trace() from being called after this for syscall completion
  interception.

  The appended patch fixes this.  It reuses the flag TIF_SYSCALL_EMU to
  remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since
  the flag is unused in the depicted situation.

* Fix behaviour when changing interception kind #3:
  avoid intercepting syscall on return when using SINGLESTEP.

  When testing 2.6.9 and the skas3.v6 patch together with my latest patch, I
  had problems with singlestepping on UML in SKAS with SYSEMU.  It looped
  receiving SIGTRAPs without moving forward.  EIP of the traced process was
  the same for all SIGTRAPs.

What's missing is to handle switching from PTRACE_SYSEMU to
PTRACE_SINGLESTEP in a way very similar to what is done for the change from
PTRACE_SYSEMU to PTRACE_SYSCALL.

I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is
notified and then wakes up the process; the syscall is executed (or skipped,
when do_syscall_trace() returns 0, i.e.  when using PTRACE_SYSEMU), and
do_syscall_trace() is called again.  Since we are on the return path of a
SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL),
we must still avoid notifying the parent of the syscall exit.  Now, this
behaviour is extended even to resuming with PTRACE_SINGLESTEP.

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/entry.S       |  9 ++++---
 arch/i386/kernel/ptrace.c      | 57 +++++++++++++++++++++++++++++-------------
 include/asm-i386/thread_info.h |  5 +++-
 include/linux/ptrace.h         |  1 +
 kernel/fork.c                  |  3 +++
 5 files changed, 53 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index a991d4e5edd2..b389e5f3bdee 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -203,7 +203,7 @@ sysenter_past_esp:
 	GET_THREAD_INFO(%ebp)
 
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
+	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -226,9 +226,9 @@ ENTRY(system_call)
 	pushl %eax			# save orig_eax
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-					# system call tracing in operation
+					# system call tracing in operation / emulation
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
+	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -338,6 +338,9 @@ syscall_trace_entry:
 	movl %esp, %eax
 	xorl %edx,%edx
 	call do_syscall_trace
+	cmpl $0, %eax
+	jne syscall_exit		# ret != 0 -> running under PTRACE_SYSEMU,
+					# so must skip actual syscall
 	movl ORIG_EAX(%esp), %eax
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 5ee9e1d60653..5b569dc1c227 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -509,15 +509,27 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		  }
 		  break;
 
+	case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
 	case PTRACE_SYSCALL:	/* continue and stop at next (return from) syscall */
 	case PTRACE_CONT:	/* restart after signal. */
 		ret = -EIO;
 		if (!valid_signal(data))
 			break;
+		/* If we came here with PTRACE_SYSEMU and now continue with
+		 * PTRACE_SYSCALL, entry.S used to intercept the syscall return.
+		 * But it shouldn't!
+		 * So we don't clear TIF_SYSCALL_EMU, which is always unused in
+		 * this special case, to remember, we came from SYSEMU. That
+		 * flag will be cleared by do_syscall_trace().
+		 */
+		if (request == PTRACE_SYSEMU) {
+			set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+		} else if (request == PTRACE_CONT) {
+			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+		}
 		if (request == PTRACE_SYSCALL) {
 			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		}
-		else {
+		} else {
 			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 		}
 		child->exit_code = data;
@@ -546,6 +558,8 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		ret = -EIO;
 		if (!valid_signal(data))
 			break;
+		/*See do_syscall_trace to know why we don't clear
+		 * TIF_SYSCALL_EMU.*/
 		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 		set_singlestep(child);
 		child->exit_code = data;
@@ -678,37 +692,43 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
  * - triggered by current->work.syscall_trace
  */
 __attribute__((regparm(3)))
-void do_syscall_trace(struct pt_regs *regs, int entryexit)
+int do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
+	int is_sysemu, is_systrace, is_singlestep, ret = 0;
 	/* do the secure computing check first */
 	secure_computing(regs->orig_eax);
 
-	if (unlikely(current->audit_context)) {
-		if (entryexit)
-			audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
-
-		/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
-		 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
-		 * not used, entry.S will call us only on syscall exit, not
-		 * entry ; so when TIF_SYSCALL_AUDIT is used we must avoid
-		 * calling send_sigtrap() on syscall entry.
-		 */
-		else if (is_singlestep)
-			goto out;
-	}
+	if (unlikely(current->audit_context) && entryexit)
+		audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
 
 	if (!(current->ptrace & PT_PTRACED))
 		goto out;
 
+	is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
+	is_systrace = test_thread_flag(TIF_SYSCALL_TRACE);
+	is_singlestep = test_thread_flag(TIF_SINGLESTEP);
+
+	/* We can detect the case of coming from PTRACE_SYSEMU and now running
+	 * with PTRACE_SYSCALL or PTRACE_SINGLESTEP, by TIF_SYSCALL_EMU being
+	 * set additionally.
+	 * If so let's reset the flag and return without action (no singlestep
+	 * nor syscall tracing, since no actual step has been executed).
+	 */
+	if (is_sysemu && (is_systrace || is_singlestep)) {
+		clear_thread_flag(TIF_SYSCALL_EMU);
+		goto out;
+	}
+
 	/* Fake a debug trap */
 	if (test_thread_flag(TIF_SINGLESTEP))
 		send_sigtrap(current, regs, 0);
 
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
+	if (!is_systrace && !is_sysemu)
 		goto out;
 
 	/* the 0x80 provides a way for the tracing parent to distinguish
 	   between a syscall stop and SIGTRAP delivery */
+	/* Note that the debugger could change the result of test_thread_flag!*/
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
 
 	/*
@@ -720,9 +740,10 @@ void do_syscall_trace(struct pt_regs *regs, int entryexit)
 		send_sig(current->exit_code, current, 1);
 		current->exit_code = 0;
 	}
+	ret = is_sysemu;
  out:
 	if (unlikely(current->audit_context) && !entryexit)
 		audit_syscall_entry(current, AUDIT_ARCH_I386, regs->orig_eax,
 				    regs->ebx, regs->ecx, regs->edx, regs->esi);
-
+	return ret;
 }
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 95add81237ea..e2cb9fa6f563 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -139,6 +139,7 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__;
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
+#define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
@@ -150,13 +151,15 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__;
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_SYSCALL_EMU	(1<<TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
-  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP))
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|\
+		  _TIF_SECCOMP|_TIF_SYSCALL_EMU))
 /* work to do on any return to u-space */
 #define _TIF_ALLWORK_MASK	(0x0000FFFF & ~_TIF_SECCOMP)
 
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index a373fc254df2..7528afb6b2ad 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -20,6 +20,7 @@
 #define PTRACE_DETACH		0x11
 
 #define PTRACE_SYSCALL		  24
+#define PTRACE_SYSEMU		  31
 
 /* 0x4200-0x4300 are reserved for architecture-independent additions.  */
 #define PTRACE_SETOPTIONS	0x4200
diff --git a/kernel/fork.c b/kernel/fork.c
index b65187f0c74e..7e1ead9a6ba4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -994,6 +994,9 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * of CLONE_PTRACE.
 	 */
 	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+#endif
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
-- 
cgit v1.2.3-59-g8ed1b


From 344babaa9d39b10b85cadec4e5335d43b52b4ec0 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Wed, 7 Sep 2005 01:15:17 -0400
Subject: [kernel-doc] fix various DocBook build problems/warnings

Most serious is fixing include/sound/pcm.h, which breaks the DocBook
build.

The other stuff is just filling in things that cause warnings.
---
 Documentation/DocBook/mcabook.tmpl | 2 +-
 drivers/net/wan/syncppp.c          | 1 +
 drivers/scsi/libata-core.c         | 4 ++--
 include/sound/pcm.h                | 2 +-
 kernel/sched.c                     | 1 +
 5 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl
index 4367f4642f3d..42a760cd7467 100644
--- a/Documentation/DocBook/mcabook.tmpl
+++ b/Documentation/DocBook/mcabook.tmpl
@@ -96,7 +96,7 @@
 
   <chapter id="pubfunctions">
      <title>Public Functions Provided</title>
-!Earch/i386/kernel/mca.c
+!Edrivers/mca/mca-legacy.c
   </chapter>
 
   <chapter id="dmafunctions">
diff --git a/drivers/net/wan/syncppp.c b/drivers/net/wan/syncppp.c
index f58c794a963a..b56a7b516d24 100644
--- a/drivers/net/wan/syncppp.c
+++ b/drivers/net/wan/syncppp.c
@@ -1440,6 +1440,7 @@ static void sppp_print_bytes (u_char *p, u16 len)
  *	@skb:	The buffer to process
  *	@dev:	The device it arrived on
  *	@p: Unused
+ *	@orig_dev: Unused
  *
  *	Protocol glue. This drives the deferred processing mode the poorer
  *	cards use. This can be called directly by cards that do not have
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9fb9814525a3..5cc53cd9323e 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2531,7 +2531,7 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
  *	@ap: port to read/write
  *	@buf: data buffer
  *	@buflen: buffer length
- *	@do_write: read/write
+ *	@write_data: read/write
  *
  *	Transfer data from/to the device data register by MMIO.
  *
@@ -2577,7 +2577,7 @@ static void ata_mmio_data_xfer(struct ata_port *ap, unsigned char *buf,
  *	@ap: port to read/write
  *	@buf: data buffer
  *	@buflen: buffer length
- *	@do_write: read/write
+ *	@write_data: read/write
  *
  *	Transfer data from/to the device data register by PIO.
  *
diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index fa23ebfb857a..389e8ebe9c19 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -903,7 +903,7 @@ int snd_pcm_format_unsigned(snd_pcm_format_t format);
 int snd_pcm_format_linear(snd_pcm_format_t format);
 int snd_pcm_format_little_endian(snd_pcm_format_t format);
 int snd_pcm_format_big_endian(snd_pcm_format_t format);
-/**
+/*
  * snd_pcm_format_cpu_endian - Check the PCM format is CPU-endian
  * @format: the format to check
  *
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..f41fa94d2070 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1478,6 +1478,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
 
 /**
  * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
-- 
cgit v1.2.3-59-g8ed1b


From 54d5d42404e7705cf3804593189e963350d470e5 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Tue, 6 Sep 2005 15:16:15 -0700
Subject: [PATCH] x86/x86_64: deferred handling of writes to
 /proc/irqxx/smp_affinity

When handling writes to /proc/irq, the current code is re-programming rte
entries directly. This is not recommended and could potentially cause
chipsets to lock up, or cause missing interrupts.

CONFIG_IRQBALANCE does this correctly, where it re-programs only when the
interrupt is pending. The same needs to be done for /proc/irq handling as well.
Otherwise user space irq balancers are really not doing the right thing.

- Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for
  lack of a generic name.
- Moved move_irq out of IRQ_BALANCE, and added the same to X86_64
- Added new proc handler for write, so we can do deferred write at irq
  handling time.
- /proc/irq/XX/smp_affinity used to display CPU_MASK_ALL; instead, it now
  shows only active cpu masks, or exactly what was set.
- Provided a common move_irq implementation, instead of duplicating
  when using generic irq framework.

Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off.
Tested UP builds as well.

MSI testing: tbd: I have cards, need to look for a crossover cable, although
I did test an earlier version of this patch.  Will test in a couple of days.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Acked-by: Zwane Mwaikambo <zwane@holomorphy.com>
Grudgingly-acked-by: Andi Kleen <ak@muc.de>
Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig            |   5 ++
 arch/i386/kernel/io_apic.c   |  55 ++++++++++---------
 arch/ia64/Kconfig            |   5 ++
 arch/ia64/kernel/irq.c       |  39 +-------------
 arch/x86_64/Kconfig          |   5 ++
 arch/x86_64/kernel/io_apic.c | 102 ++++++++++++++++++++++-------------
 drivers/pci/msi.c            |  17 ++----
 drivers/pci/msi.h            |   5 --
 include/asm-ia64/hw_irq.h    |   7 ---
 include/asm-ia64/irq.h       |   6 ---
 include/linux/irq.h          | 123 +++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/manage.c          |   4 ++
 kernel/irq/proc.c            |  14 ++++-
 13 files changed, 253 insertions(+), 134 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 3b3b017e1c15..4b7de3e1e57b 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1318,6 +1318,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 config X86_SMP
 	bool
 	depends on SMP && !X86_VOYAGER
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 6578f40bd501..4a5940431579 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -33,6 +33,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
+
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
@@ -222,13 +223,21 @@ static void clear_IO_APIC (void)
 			clear_IO_APIC_pin(apic, pin);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 {
 	unsigned long flags;
 	int pin;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int apicid_value;
+	cpumask_t tmp;
 	
+	cpus_and(tmp, cpumask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(cpumask, tmp, CPU_MASK_ALL);
+
 	apicid_value = cpu_mask_to_apicid(cpumask);
 	/* Prepare to do the io_apic_write */
 	apicid_value = apicid_value << 24;
@@ -242,6 +251,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 			break;
 		entry = irq_2_pin + entry->next;
 	}
+	set_irq_info(irq, cpumask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -259,7 +269,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 #  define Dprintk(x...) 
 # endif
 
-cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
 
 #define IRQBALANCE_CHECK_ARCH -999
 static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
@@ -328,12 +337,7 @@ static inline void balance_irq(int cpu, int irq)
 	cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
 	new_cpu = move(cpu, allowed_mask, now, 1);
 	if (cpu != new_cpu) {
-		irq_desc_t *desc = irq_desc + irq;
-		unsigned long flags;
-
-		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
 	}
 }
 
@@ -528,16 +532,12 @@ tryanotherirq:
 	cpus_and(tmp, target_cpu_mask, allowed_mask);
 
 	if (!cpus_empty(tmp)) {
-		irq_desc_t *desc = irq_desc + selected_irq;
-		unsigned long flags;
 
 		Dprintk("irq = %d moved to cpu = %d\n",
 				selected_irq, min_loaded);
 		/* mark for change destination */
-		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_cpumask[selected_irq] =
-					cpumask_of_cpu(min_loaded);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
+
 		/* Since we made a change, come back sooner to 
 		 * check for more variation.
 		 */
@@ -568,7 +568,8 @@ static int balanced_irq(void *unused)
 	
 	/* push everything to CPU 0 to give us a starting point.  */
 	for (i = 0 ; i < NR_IRQS ; i++) {
-		pending_irq_balance_cpumask[i] = cpumask_of_cpu(0);
+		pending_irq_cpumask[i] = cpumask_of_cpu(0);
+		set_pending_irq(i, cpumask_of_cpu(0));
 	}
 
 	for ( ; ; ) {
@@ -647,20 +648,9 @@ int __init irqbalance_disable(char *str)
 
 __setup("noirqbalance", irqbalance_disable);
 
-static inline void move_irq(int irq)
-{
-	/* note - we hold the desc->lock */
-	if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) {
-		set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]);
-		cpus_clear(pending_irq_balance_cpumask[irq]);
-	}
-}
-
 late_initcall(balanced_irq_init);
-
-#else /* !CONFIG_IRQBALANCE */
-static inline void move_irq(int irq) { }
 #endif /* CONFIG_IRQBALANCE */
+#endif /* CONFIG_SMP */
 
 #ifndef CONFIG_SMP
 void fastcall send_IPI_self(int vector)
@@ -820,6 +810,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
+#ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
@@ -838,6 +829,7 @@ void __init setup_ioapic_dest(void)
 
 	}
 }
+#endif
 
 /*
  * EISA Edge/Level control register, ELCR
@@ -1249,6 +1241,7 @@ static void __init setup_IO_APIC_irqs(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
@@ -1944,6 +1937,7 @@ static void ack_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_irq(vector);
 	ack_edge_ioapic_irq(irq);
 }
 
@@ -1958,6 +1952,7 @@ static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_irq(vector);
 	end_level_ioapic_irq(irq);
 }
 
@@ -1975,14 +1970,17 @@ static void unmask_IO_APIC_vector (unsigned int vector)
 	unmask_IO_APIC_irq(irq);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
 	int irq = vector_to_irq(vector);
 
+	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
 }
 #endif
+#endif
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -2000,7 +1998,9 @@ static struct hw_interrupt_type ioapic_edge_type = {
 	.disable 	= disable_edge_ioapic,
 	.ack 		= ack_edge_ioapic,
 	.end 		= end_edge_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
+#endif
 };
 
 static struct hw_interrupt_type ioapic_level_type = {
@@ -2011,7 +2011,9 @@ static struct hw_interrupt_type ioapic_level_type = {
 	.disable 	= disable_level_ioapic,
 	.ack 		= mask_and_ack_level_ioapic,
 	.end 		= end_level_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
+#endif
 };
 
 static inline void init_IO_APIC_traps(void)
@@ -2569,6 +2571,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 3deced637f07..17b5dbf8c311 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -434,6 +434,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 source "arch/ia64/hp/sim/Kconfig"
 
 source "arch/ia64/oprofile/Kconfig"
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 28f2aadc38d0..205d98028261 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -91,23 +91,8 @@ skip:
 }
 
 #ifdef CONFIG_SMP
-/*
- * This is updated when the user sets irq affinity via /proc
- */
-static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
-static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)];
-
 static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
 
-/*
- * Arch specific routine for deferred write to iosapic rte to reprogram
- * intr destination.
- */
-void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
-{
-	pending_irq_cpumask[irq] = mask_val;
-}
-
 void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 {
 	cpumask_t mask = CPU_MASK_NONE;
@@ -116,32 +101,10 @@ void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 
 	if (irq < NR_IRQS) {
 		irq_affinity[irq] = mask;
+		set_irq_info(irq, mask);
 		irq_redir[irq] = (char) (redir & 0xff);
 	}
 }
-
-
-void move_irq(int irq)
-{
-	/* note - we hold desc->lock */
-	cpumask_t tmp;
-	irq_desc_t *desc = irq_descp(irq);
-	int redir = test_bit(irq, pending_irq_redir);
-
-	if (unlikely(!desc->handler->set_affinity))
-		return;
-
-	if (!cpus_empty(pending_irq_cpumask[irq])) {
-		cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
-		if (unlikely(!cpus_empty(tmp))) {
-			desc->handler->set_affinity(irq | (redir ? IA64_IRQ_REDIRECTED : 0),
-						    pending_irq_cpumask[irq]);
-		}
-		cpus_clear(pending_irq_cpumask[irq]);
-	}
-}
-
-
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 75e52c57f19c..251ce7cf1a38 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -441,6 +441,11 @@ config ISA_DMA_API
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 menu "Power management options"
 
 source kernel/power/Kconfig
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index d206d7e49cf5..76bcc4e6979d 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -78,6 +78,54 @@ int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
 #define vector_to_irq(vector)	(vector)
 #endif
 
+#define __DO_ACTION(R, ACTION, FINAL)					\
+									\
+{									\
+	int pin;							\
+	struct irq_pin_list *entry = irq_2_pin + irq;			\
+									\
+	for (;;) {							\
+		unsigned int reg;					\
+		pin = entry->pin;					\
+		if (pin == -1)						\
+			break;						\
+		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
+		reg ACTION;						\
+		io_apic_modify(entry->apic, reg);			\
+		if (!entry->next)					\
+			break;						\
+		entry = irq_2_pin + entry->next;			\
+	}								\
+	FINAL;								\
+}
+
+#ifdef CONFIG_SMP
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	unsigned long flags;
+	unsigned int dest;
+	cpumask_t tmp;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(mask, tmp, CPU_MASK_ALL);
+
+	dest = cpu_mask_to_apicid(mask);
+
+	/*
+	 * Only the high 8 bits are valid.
+	 */
+	dest = SET_APIC_LOGICAL_ID(dest);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__DO_ACTION(1, = dest, )
+	set_irq_info(irq, mask);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+#endif
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -101,26 +149,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	entry->pin = pin;
 }
 
-#define __DO_ACTION(R, ACTION, FINAL)					\
-									\
-{									\
-	int pin;							\
-	struct irq_pin_list *entry = irq_2_pin + irq;			\
-									\
-	for (;;) {							\
-		unsigned int reg;					\
-		pin = entry->pin;					\
-		if (pin == -1)						\
-			break;						\
-		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
-		reg ACTION;						\
-		io_apic_modify(entry->apic, reg);			\
-		if (!entry->next)					\
-			break;						\
-		entry = irq_2_pin + entry->next;			\
-	}								\
-	FINAL;								\
-}
 
 #define DO_ACTION(name,R,ACTION, FINAL)					\
 									\
@@ -767,6 +795,7 @@ static void __init setup_IO_APIC_irqs(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
@@ -1314,6 +1343,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq)
  */
 static void ack_edge_ioapic_irq(unsigned int irq)
 {
+	move_irq(irq);
 	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
 					== (IRQ_PENDING | IRQ_DISABLED))
 		mask_IO_APIC_irq(irq);
@@ -1343,26 +1373,10 @@ static unsigned int startup_level_ioapic_irq (unsigned int irq)
 
 static void end_level_ioapic_irq (unsigned int irq)
 {
+	move_irq(irq);
 	ack_APIC_irq();
 }
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
-{
-	unsigned long flags;
-	unsigned int dest;
-
-	dest = cpu_mask_to_apicid(mask);
-
-	/*
-	 * Only the high 8 bits are valid.
-	 */
-	dest = SET_APIC_LOGICAL_ID(dest);
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__DO_ACTION(1, = dest, )
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 #ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
@@ -1375,6 +1389,7 @@ static void ack_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_native_irq(vector);
 	ack_edge_ioapic_irq(irq);
 }
 
@@ -1389,6 +1404,7 @@ static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_native_irq(vector);
 	end_level_ioapic_irq(irq);
 }
 
@@ -1406,14 +1422,17 @@ static void unmask_IO_APIC_vector (unsigned int vector)
 	unmask_IO_APIC_irq(irq);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
 	int irq = vector_to_irq(vector);
 
+	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
 }
-#endif
+#endif // CONFIG_SMP
+#endif // CONFIG_PCI_MSI
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1432,7 +1451,9 @@ static struct hw_interrupt_type ioapic_edge_type = {
 	.disable 	= disable_edge_ioapic,
 	.ack 		= ack_edge_ioapic,
 	.end 		= end_edge_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
+#endif
 };
 
 static struct hw_interrupt_type ioapic_level_type = {
@@ -1443,7 +1464,9 @@ static struct hw_interrupt_type ioapic_level_type = {
 	.disable 	= disable_level_ioapic,
 	.ack 		= mask_and_ack_level_ioapic,
 	.end 		= end_level_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
+#endif
 };
 
 static inline void init_IO_APIC_traps(void)
@@ -1918,6 +1941,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	set_native_irq_info(use_pci_vector() ?  entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
@@ -1931,6 +1955,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
+#ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
@@ -1949,3 +1974,4 @@ void __init setup_ioapic_dest(void)
 
 	}
 }
+#endif
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 2b85aa39f954..532f73bb2224 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -91,6 +91,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 {
 	struct msi_desc *entry;
 	struct msg_address address;
+	unsigned int irq = vector;
 
 	entry = (struct msi_desc *)msi_desc[vector];
 	if (!entry || !entry->dev)
@@ -112,6 +113,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		pci_write_config_dword(entry->dev, msi_lower_address_reg(pos),
 			address.lo_address.value);
+		set_native_irq_info(irq, cpu_mask);
 		break;
 	}
 	case PCI_CAP_ID_MSIX:
@@ -125,22 +127,13 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 			MSI_TARGET_CPU_SHIFT);
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		writel(address.lo_address.value, entry->mask_base + offset);
+		set_native_irq_info(irq, cpu_mask);
 		break;
 	}
 	default:
 		break;
 	}
 }
-
-#ifdef CONFIG_IRQBALANCE
-static inline void move_msi(int vector)
-{
-	if (!cpus_empty(pending_irq_balance_cpumask[vector])) {
-		set_msi_affinity(vector, pending_irq_balance_cpumask[vector]);
-		cpus_clear(pending_irq_balance_cpumask[vector]);
-	}
-}
-#endif /* CONFIG_IRQBALANCE */
 #endif /* CONFIG_SMP */
 
 static void mask_MSI_irq(unsigned int vector)
@@ -191,13 +184,13 @@ static void shutdown_msi_irq(unsigned int vector)
 
 static void end_msi_irq_wo_maskbit(unsigned int vector)
 {
-	move_msi(vector);
+	move_native_irq(vector);
 	ack_APIC_irq();
 }
 
 static void end_msi_irq_w_maskbit(unsigned int vector)
 {
-	move_msi(vector);
+	move_native_irq(vector);
 	unmask_MSI_irq(vector);
 	ack_APIC_irq();
 }
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 390f1851c0f1..402136a5c9e4 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -19,7 +19,6 @@
 #define NR_HP_RESERVED_VECTORS 	20
 
 extern int vector_irq[NR_VECTORS];
-extern cpumask_t pending_irq_balance_cpumask[NR_IRQS];
 extern void (*interrupt[NR_IRQS])(void);
 extern int pci_vector_resources(int last, int nr_released);
 
@@ -29,10 +28,6 @@ extern int pci_vector_resources(int last, int nr_released);
 #define set_msi_irq_affinity	NULL
 #endif
 
-#ifndef CONFIG_IRQBALANCE
-static inline void move_msi(int vector) {}
-#endif
-
 /*
  * MSI-X Address Register
  */
diff --git a/include/asm-ia64/hw_irq.h b/include/asm-ia64/hw_irq.h
index 041ab8c51a64..0cf119b42f7d 100644
--- a/include/asm-ia64/hw_irq.h
+++ b/include/asm-ia64/hw_irq.h
@@ -116,13 +116,6 @@ __ia64_local_vector_to_irq (ia64_vector vec)
  * and to obtain the irq descriptor for a given irq number.
  */
 
-/* Return a pointer to the irq descriptor for IRQ.  */
-static inline irq_desc_t *
-irq_descp (int irq)
-{
-	return irq_desc + irq;
-}
-
 /* Extract the IA-64 vector that corresponds to IRQ.  */
 static inline ia64_vector
 irq_to_vector (int irq)
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index bd07d11d9f37..5d930fdc0bea 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -30,12 +30,6 @@ extern void disable_irq_nosync (unsigned int);
 extern void enable_irq (unsigned int);
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
 
-#ifdef CONFIG_SMP
-extern void move_irq(int irq);
-#else
-#define move_irq(irq)
-#endif
-
 struct irqaction;
 struct pt_regs;
 int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 069d3b84d311..4a362b9ec966 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,16 +71,139 @@ typedef struct irq_desc {
 	unsigned int irq_count;		/* For detecting broken interrupts */
 	unsigned int irqs_unhandled;
 	spinlock_t lock;
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+	unsigned int move_irq;		/* Flag need to re-target intr dest*/
+#endif
 } ____cacheline_aligned irq_desc_t;
 
 extern irq_desc_t irq_desc [NR_IRQS];
 
+/* Return a pointer to the irq descriptor for IRQ.  */
+static inline irq_desc_t *
+irq_descp (int irq)
+{
+	return irq_desc + irq;
+}
+
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
 extern int setup_irq(unsigned int irq, struct irqaction * new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 extern cpumask_t irq_affinity[NR_IRQS];
+
+#ifdef CONFIG_SMP
+static inline void set_native_irq_info(int irq, cpumask_t mask)
+{
+	irq_affinity[irq] = mask;
+}
+#else
+static inline void set_native_irq_info(int irq, cpumask_t mask)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+extern cpumask_t pending_irq_cpumask[NR_IRQS];
+
+static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->move_irq = 1;
+	pending_irq_cpumask[irq] = mask;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+static inline void
+move_native_irq(int irq)
+{
+	cpumask_t tmp;
+	irq_desc_t *desc = irq_descp(irq);
+
+	if (likely (!desc->move_irq))
+		return;
+
+	desc->move_irq = 0;
+
+	if (likely(cpus_empty(pending_irq_cpumask[irq])))
+		return;
+
+	if (!desc->handler->set_affinity)
+		return;
+
+	/* note - we hold the desc->lock */
+	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+
+	/*
+	 * If there was a valid mask to work with, please
+	 * do the disable, re-program, enable sequence.
+	 * This is *not* particularly important for level triggered
+	 * but in an edge trigger case, we might be setting rte
+	 * when an active trigger is coming in. This could
+	 * cause some ioapics to malfunction.
+	 * Being paranoid I guess!
+	 */
+	if (unlikely(!cpus_empty(tmp))) {
+		desc->handler->disable(irq);
+		desc->handler->set_affinity(irq,tmp);
+		desc->handler->enable(irq);
+	}
+	cpus_clear(pending_irq_cpumask[irq]);
+}
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * Wonder why these are dummies?
+ * For example, set_ioapic_affinity_vector() calls its set_ioapic_affinity_irq()
+ * counterpart after translating the vector to irq info. We need to perform
+ * this operation on the real irq, when we don't use vector, i.e. when
+ * use_pci_vector() is false.
+ */
+static inline void move_irq(int irq)
+{
+}
+
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+}
+
+#else // CONFIG_PCI_MSI
+
+static inline void move_irq(int irq)
+{
+	move_native_irq(irq);
+}
+
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	set_native_irq_info(irq, mask);
+}
+#endif // CONFIG_PCI_MSI
+
+#else	// CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE
+
+#define move_irq(x)
+#define move_native_irq(x)
+#define set_pending_irq(x,y)
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	set_native_irq_info(irq, mask);
+}
+
+#endif // CONFIG_GENERIC_PENDING_IRQ
+
+#else // CONFIG_SMP
+
+#define move_irq(x)
+#define move_native_irq(x)
+
+#endif // CONFIG_SMP
+
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
 
 cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
 
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
+#endif
+
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
  */
 static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
 
-void __attribute__((weak))
-proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+{
+	/*
+	 * Save these away for later use. Re-program when the
+	 * interrupt is pending
+	 */
+	set_pending_irq(irq, mask_val);
+}
+#else
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
 	irq_affinity[irq] = mask_val;
 	irq_desc[irq].handler->set_affinity(irq, mask_val);
 }
+#endif
 
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
-- 
cgit v1.2.3-59-g8ed1b


From d7ae79c72d072e3208c18ff2dc402a69229b7b1b Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Tue, 6 Sep 2005 15:16:21 -0700
Subject: [PATCH] swsusp: update documentation

This updates documentation a bit (mostly removing obsolete stuff), and
marks swsusp as no longer experimental in config.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/swsusp.txt | 101 +++++++++++++++++++++++------------------
 Documentation/power/video.txt  |   1 +
 kernel/power/Kconfig           |   2 +-
 3 files changed, 58 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index ddf907fbcc05..b0d50840788e 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -1,22 +1,20 @@
-From kernel/suspend.c:
+Some warnings, first.
 
  * BIG FAT WARNING *********************************************************
  *
- * If you have unsupported (*) devices using DMA...
- *				...say goodbye to your data.
- *
  * If you touch anything on disk between suspend and resume...
  *				...kiss your data goodbye.
  *
- * If your disk driver does not support suspend... (IDE does)
- *				...you'd better find out how to get along
- *				   without your data.
- *
- * If you change kernel command line between suspend and resume...
- *			        ...prepare for nasty fsck or worse.
+ * If you do resume from initrd after your filesystems are mounted...
+ *				...bye bye root partition.
+ *			[this is actually the same case as above]
  *
- * If you change your hardware while system is suspended...
- *			        ...well, it was not good idea.
+ * If you have unsupported (*) devices using DMA, you may have some
+ * problems. If your disk driver does not support suspend... (IDE does),
+ * it may cause some problems, too. If you change the kernel command line
+ * between suspend and resume, it may do something wrong. If you change
+ * your hardware while the system is suspended... well, it was not a good
+ * idea; but it will probably only crash.
  *
  * (*) suspend/resume support is needed to make it safe.
 
@@ -30,6 +28,13 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state
 echo platform > /sys/power/disk; echo disk > /sys/power/state
 
 
+Encrypted suspend image:
+------------------------
+If you want to store your suspend image encrypted with a temporary
+key to prevent data gathering after resume you must compile
+crypto and the aes algorithm into the kernel - modules won't work
+as they cannot be loaded at resume time.
+
 
 Article about goals and implementation of Software Suspend for Linux
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -85,11 +90,6 @@ resume.
 You have your server on UPS. Power died, and UPS is indicating 30
 seconds to failure. What do you do? Suspend to disk.
 
-Ethernet card in your server died. You want to replace it. Your
-server is not hotplug capable. What do you do? Suspend to disk,
-replace ethernet card, resume. If you are fast your users will not
-even see broken connections.
-
 
 Q: Maybe I'm missing something, but why don't the regular I/O paths work?
 
@@ -117,31 +117,6 @@ Q: Does linux support ACPI S4?
 
 A: Yes. That's what echo platform > /sys/power/disk does.
 
-Q: My machine doesn't work with ACPI. How can I use swsusp than ?
-
-A: Do a reboot() syscall with right parameters. Warning: glibc gets in
-its way, so check with strace:
-
-reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, 0xd000fce2)
-
-(Thanks to Peter Osterlund:)
-
-#include <unistd.h>
-#include <syscall.h>
-
-#define LINUX_REBOOT_MAGIC1     0xfee1dead
-#define LINUX_REBOOT_MAGIC2     672274793
-#define LINUX_REBOOT_CMD_SW_SUSPEND     0xD000FCE2
-
-int main()
-{
-    syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
-            LINUX_REBOOT_CMD_SW_SUSPEND, 0);
-    return 0;
-}
-
-Also /sys/ interface should be still present.
-
 Q: What is 'suspend2'?
 
 A: suspend2 is 'Software Suspend 2', a forked implementation of
@@ -312,9 +287,45 @@ system is shut down or suspended. Additionally use the encrypted
 suspend image to prevent sensitive data from being stolen after
 resume.
 
-Q: Why we cannot suspend to a swap file?
+Q: Why can't we suspend to a swap file?
 
 A: Because accessing swap file needs the filesystem mounted, and
 filesystem might do something wrong (like replaying the journal)
-during mount. [Probably could be solved by modifying every filesystem
-to support some kind of "really read-only!" option. Patches welcome.]
+during mount.
+
+There are a few ways to get that fixed:
+
+1) Probably could be solved by modifying every filesystem to support
+some kind of "really read-only!" option. Patches welcome.
+
+2) suspend2 gets around that by storing absolute positions in the on-disk
+image (and blocksize), with the resume parameter pointing directly to
+the suspend header.
+
+Q: Is there a maximum system RAM size that is supported by swsusp?
+
+A: It should work okay with highmem.
+
+Q: Does swsusp (to disk) use only one swap partition or can it use
+multiple swap partitions (aggregate them into one logical space)?
+
+A: Only one swap partition, sorry.
+
+Q: If my application(s) causes lots of memory & swap space to be used
+(over half of the total system RAM), is it correct that it is likely
+to be useless to try to suspend to disk while that app is running?
+
+A: No, it should work okay, as long as your app does not mlock()
+it. Just prepare a big enough swap partition.
+
+Q: What information is useful for debugging suspend-to-disk problems?
+
+A: Well, the last messages on the screen are always useful. If something
+is broken, it is usually some kernel driver, therefore trying with as
+few modules loaded as possible helps a lot. I also prefer people to
+suspend from console, preferably without X running. Booting with
+init=/bin/bash, then swapon and starting the suspend sequence manually
+usually does the trick. Then it is a good idea to try with the latest
+vanilla kernel.
+
+
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt
index 1a44e8acb54c..526d6dd267ea 100644
--- a/Documentation/power/video.txt
+++ b/Documentation/power/video.txt
@@ -120,6 +120,7 @@ IBM ThinkPad T42p (2373-GTG)	s3_bios (2)
 IBM TP X20			??? (*)
 IBM TP X30			s3_bios (2)
 IBM TP X31 / Type 2672-XXH      none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
+IBM TP X32			none (1), but backlight is on and video is trashed after long suspend
 IBM Thinkpad X40 Type 2371-7JG  s3_bios,s3_mode (4)
 Medion MD4220			??? (*)
 Samsung P35			vbetool needed (6)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 917066a5767c..c14cd9991181 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -28,7 +28,7 @@ config PM_DEBUG
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
-	depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
+	depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP))
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need APM.
-- 
cgit v1.2.3-59-g8ed1b


From 4732efbeb997189d9f9b04708dc26bf8613ed721 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Tue, 6 Sep 2005 15:16:25 -0700
Subject: [PATCH] FUTEX_WAKE_OP: pthread_cond_signal() speedup

ATM pthread_cond_signal is unnecessarily slow, because it wakes one waiter
(which at least on UP usually means an immediate context switch to one of
the waiter threads).  This waiter wakes up and after a few instructions it
attempts to acquire the cv internal lock, but that lock is still held by
the thread calling pthread_cond_signal.  So it goes to sleep and eventually
the signalling thread is scheduled in, unlocks the internal lock and wakes
the waiter again.

Now, before 2003-09-21 NPTL was using FUTEX_REQUEUE in pthread_cond_signal
to avoid this performance issue, but it was removed when locks were
redesigned to the 3 state scheme (unlocked, locked uncontended, locked
contended).

The following scenario shows why simply using FUTEX_REQUEUE in
pthread_cond_signal together with using lll_mutex_unlock_force in place of
lll_mutex_unlock is not enough, and probably why it was disabled at
that time:

The number is the value of cv->__data.__lock.
        thr1            thr2            thr3
0       pthread_cond_wait
1       lll_mutex_lock (cv->__data.__lock)
0       lll_mutex_unlock (cv->__data.__lock)
0       lll_futex_wait (&cv->__data.__futex, futexval)
0                       pthread_cond_signal
1                       lll_mutex_lock (cv->__data.__lock)
1                                       pthread_cond_signal
2                                       lll_mutex_lock (cv->__data.__lock)
2                                         lll_futex_wait (&cv->__data.__lock, 2)
2                       lll_futex_requeue (&cv->__data.__futex, 0, 1, &cv->__data.__lock)
                          # FUTEX_REQUEUE, not FUTEX_CMP_REQUEUE
2                       lll_mutex_unlock_force (cv->__data.__lock)
0                         cv->__data.__lock = 0
0                         lll_futex_wake (&cv->__data.__lock, 1)
1       lll_mutex_lock (cv->__data.__lock)
0       lll_mutex_unlock (cv->__data.__lock)
          # Here, lll_mutex_unlock doesn't know there are threads waiting
          # on the internal cv's lock

Now, I believe it is possible to use FUTEX_REQUEUE in pthread_cond_signal,
but it will cost us not one, but 2 extra syscalls and, what's worse, one of
these extra syscalls will be done for every single waiting loop in
pthread_cond_*wait.

We would need to use lll_mutex_unlock_force in pthread_cond_signal after
requeue and lll_mutex_cond_lock in pthread_cond_*wait after lll_futex_wait.

Another alternative is to do the unlocking pthread_cond_signal needs to do
(the lock can't be unlocked before lll_futex_wake, as that is racy) in the
kernel.

I have implemented both variants: futex-requeue-glibc.patch is the first
one and futex-wake_op{,-glibc}.patch is the unlocking inside of the kernel.
The kernel interface allows userland to specify exactly what an unlocking
operation should look like (some atomic arithmetic operation with an optional
constant argument and a comparison of the previous futex value with another
constant).

It has been implemented just for ppc*, x86_64 and i?86; for other
architectures I'm including just a stub header which can be used as a
starting point by maintainers to write support for their arches and which
ATM will just return -ENOSYS for FUTEX_WAKE_OP.  The requeue patch has been
(lightly) tested just on x86_64, the wake_op patch on a ppc64 kernel running
32-bit and 64-bit NPTL and an x86_64 kernel running 32-bit and 64-bit NPTL.

With the following benchmark on UP x86-64 I get:

for i in nptl-orig nptl-requeue nptl-wake_op; do echo time elf/ld.so --library-path .:$i /tmp/bench; \
for j in 1 2; do echo ( time elf/ld.so --library-path .:$i /tmp/bench ) 2>&1; done; done
time elf/ld.so --library-path .:nptl-orig /tmp/bench
real 0m0.655s user 0m0.253s sys 0m0.403s
real 0m0.657s user 0m0.269s sys 0m0.388s
time elf/ld.so --library-path .:nptl-requeue /tmp/bench
real 0m0.496s user 0m0.225s sys 0m0.271s
real 0m0.531s user 0m0.242s sys 0m0.288s
time elf/ld.so --library-path .:nptl-wake_op /tmp/bench
real 0m0.380s user 0m0.176s sys 0m0.204s
real 0m0.382s user 0m0.175s sys 0m0.207s

The benchmark is at:
http://sourceware.org/ml/libc-alpha/2005-03/txt00001.txt
Older futex-requeue-glibc.patch version is at:
http://sourceware.org/ml/libc-alpha/2005-03/txt00002.txt
Older futex-wake_op-glibc.patch version is at:
http://sourceware.org/ml/libc-alpha/2005-03/txt00003.txt
Will post a new version (just x86-64 fixes so that the patch
applies against pthread_cond_signal.S) to libc-hacker ml soon.

Attached is the kernel FUTEX_WAKE_OP patch as well as a simple-minded
testcase that will not test the atomicity of the operation, but at least
check if the threads that should have been woken up are woken up and
whether the arithmetic operation in the kernel gave the expected results.

Acked-by: Ingo Molnar <mingo@redhat.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Yoichi Yuasa <yuasa@hh.iij4u.or.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/futex.h     |  53 +++++++++++++++++++
 include/asm-arm/futex.h       |  53 +++++++++++++++++++
 include/asm-arm26/futex.h     |  53 +++++++++++++++++++
 include/asm-cris/futex.h      |  53 +++++++++++++++++++
 include/asm-frv/futex.h       |  53 +++++++++++++++++++
 include/asm-h8300/futex.h     |  53 +++++++++++++++++++
 include/asm-i386/futex.h      | 108 +++++++++++++++++++++++++++++++++++++++
 include/asm-ia64/futex.h      |  53 +++++++++++++++++++
 include/asm-m32r/futex.h      |  53 +++++++++++++++++++
 include/asm-m68k/futex.h      |  53 +++++++++++++++++++
 include/asm-m68knommu/futex.h |  53 +++++++++++++++++++
 include/asm-mips/futex.h      |  53 +++++++++++++++++++
 include/asm-parisc/futex.h    |  53 +++++++++++++++++++
 include/asm-ppc/futex.h       |  53 +++++++++++++++++++
 include/asm-ppc64/futex.h     |  83 ++++++++++++++++++++++++++++++
 include/asm-ppc64/memory.h    |   2 +
 include/asm-s390/futex.h      |  53 +++++++++++++++++++
 include/asm-sh/futex.h        |  53 +++++++++++++++++++
 include/asm-sh64/futex.h      |  53 +++++++++++++++++++
 include/asm-sparc/futex.h     |  53 +++++++++++++++++++
 include/asm-sparc64/futex.h   |  53 +++++++++++++++++++
 include/asm-um/futex.h        |  53 +++++++++++++++++++
 include/asm-v850/futex.h      |  53 +++++++++++++++++++
 include/asm-x86_64/futex.h    |  98 +++++++++++++++++++++++++++++++++++
 include/linux/futex.h         |  36 +++++++++++--
 kernel/futex.c                | 116 ++++++++++++++++++++++++++++++++++++++++++
 26 files changed, 1498 insertions(+), 5 deletions(-)
 create mode 100644 include/asm-alpha/futex.h
 create mode 100644 include/asm-arm/futex.h
 create mode 100644 include/asm-arm26/futex.h
 create mode 100644 include/asm-cris/futex.h
 create mode 100644 include/asm-frv/futex.h
 create mode 100644 include/asm-h8300/futex.h
 create mode 100644 include/asm-i386/futex.h
 create mode 100644 include/asm-ia64/futex.h
 create mode 100644 include/asm-m32r/futex.h
 create mode 100644 include/asm-m68k/futex.h
 create mode 100644 include/asm-m68knommu/futex.h
 create mode 100644 include/asm-mips/futex.h
 create mode 100644 include/asm-parisc/futex.h
 create mode 100644 include/asm-ppc/futex.h
 create mode 100644 include/asm-ppc64/futex.h
 create mode 100644 include/asm-s390/futex.h
 create mode 100644 include/asm-sh/futex.h
 create mode 100644 include/asm-sh64/futex.h
 create mode 100644 include/asm-sparc/futex.h
 create mode 100644 include/asm-sparc64/futex.h
 create mode 100644 include/asm-um/futex.h
 create mode 100644 include/asm-v850/futex.h
 create mode 100644 include/asm-x86_64/futex.h

(limited to 'kernel')

diff --git a/include/asm-alpha/futex.h b/include/asm-alpha/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-alpha/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-arm/futex.h b/include/asm-arm/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-arm/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-arm26/futex.h b/include/asm-arm26/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-arm26/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-cris/futex.h b/include/asm-cris/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-cris/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-frv/futex.h b/include/asm-frv/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-frv/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-h8300/futex.h b/include/asm-h8300/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-h8300/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-i386/futex.h b/include/asm-i386/futex.h
new file mode 100644
index 000000000000..44b9db806474
--- /dev/null
+++ b/include/asm-i386/futex.h
@@ -0,0 +1,108 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	" insn "\n"						\
+"2:	.section .fixup,\"ax\"\n\
+3:	mov	%3, %1\n\
+	jmp	2b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.long	1b,3b\n\
+	.previous"						\
+	: "=r" (oldval), "=r" (ret), "=m" (*uaddr)		\
+	: "i" (-EFAULT), "m" (*uaddr), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	movl	%2, %0\n\
+	movl	%0, %3\n"					\
+	insn "\n"						\
+"2:	" LOCK_PREFIX "cmpxchgl %3, %2\n\
+	jnz	1b\n\
+3:	.section .fixup,\"ax\"\n\
+4:	mov	%5, %1\n\
+	jmp	3b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.long	1b,4b,2b,4b\n\
+	.previous"						\
+	: "=&a" (oldval), "=&r" (ret), "=m" (*uaddr),		\
+	  "=&r" (tem)						\
+	: "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	if (op == FUTEX_OP_SET)
+		__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+	else {
+#ifndef CONFIG_X86_BSWAP
+		if (boot_cpu_data.x86 == 3)
+			ret = -ENOSYS;
+		else
+#endif
+		switch (op) {
+		case FUTEX_OP_ADD:
+			__futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
+					   oldval, uaddr, oparg);
+			break;
+		case FUTEX_OP_OR:
+			__futex_atomic_op2("orl %4, %3", ret, oldval, uaddr,
+					   oparg);
+			break;
+		case FUTEX_OP_ANDN:
+			__futex_atomic_op2("andl %4, %3", ret, oldval, uaddr,
+					   ~oparg);
+			break;
+		case FUTEX_OP_XOR:
+			__futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr,
+					   oparg);
+			break;
+		default:
+			ret = -ENOSYS;
+		}
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-ia64/futex.h b/include/asm-ia64/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-ia64/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-m32r/futex.h b/include/asm-m32r/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-m32r/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-m68k/futex.h b/include/asm-m68k/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-m68k/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-m68knommu/futex.h b/include/asm-m68knommu/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-m68knommu/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-mips/futex.h b/include/asm-mips/futex.h
new file mode 100644
index 000000000000..9feff4ce1424
--- /dev/null
+++ b/include/asm-mips/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-parisc/futex.h b/include/asm-parisc/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-parisc/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-ppc/futex.h b/include/asm-ppc/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-ppc/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-ppc64/futex.h b/include/asm-ppc64/futex.h
new file mode 100644
index 000000000000..cb2640b3a408
--- /dev/null
+++ b/include/asm-ppc64/futex.h
@@ -0,0 +1,104 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/memory.h>
+#include <asm/uaccess.h>
+
+/*
+ * Atomically replace the user word at uaddr with the value "insn" leaves in
+ * %1, using a lwarx/stwcx. retry loop.  %0 receives the old value and %4
+ * holds oparg.  oparg must stay in its own input register: the loop restarts
+ * at 1: when the reservation is lost, and %1 has already been overwritten by
+ * "insn" at that point.  On success ret is 0; a fault on the load or the
+ * store lands in the fixup, which sets ret to -EFAULT.
+ */
+#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (SYNC_ON_SMP				\
+"1:	lwarx	%0,0,%2\n"					\
+	insn							\
+"2:	stwcx.	%1,0,%2\n\
+	bne-	1b\n\
+	li	%1,0\n\
+3:	.section .fixup,\"ax\"\n\
+4:	li	%1,%3\n\
+	b	3b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align 3\n\
+	.llong	1b,4b,2b,4b\n\
+	.previous"						\
+	: "=&r" (oldval), "=&r" (ret)				\
+	: "b" (uaddr), "i" (-EFAULT), "r" (oparg)		\
+	: "cr0", "memory")
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro), atomically
+ * applies the operation to the user word at @uaddr and compares the old
+ * value of that word with the encoded comparison argument.
+ *
+ * Returns 1 if the comparison holds, 0 if it does not, -EFAULT if the
+ * word cannot be accessed and -ENOSYS for an unknown operation or
+ * comparison code.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		__futex_atomic_op("mr %1,%4\n", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ADD:
+		__futex_atomic_op("add %1,%0,%4\n", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_OR:
+		__futex_atomic_op("or %1,%0,%4\n", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ANDN:
+		__futex_atomic_op("andc %1,%0,%4\n", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_XOR:
+		__futex_atomic_op("xor %1,%0,%4\n", ret, oldval, uaddr, oparg);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-ppc64/memory.h b/include/asm-ppc64/memory.h
index 56e09face9a8..af53ffb55726 100644
--- a/include/asm-ppc64/memory.h
+++ b/include/asm-ppc64/memory.h
@@ -18,9 +18,11 @@
 #ifdef CONFIG_SMP
 #define EIEIO_ON_SMP	"eieio\n"
 #define ISYNC_ON_SMP	"\n\tisync"
+#define SYNC_ON_SMP	"lwsync\n\t"	/* lwsync prefix for an inline asm sequence */
 #else
 #define EIEIO_ON_SMP
 #define ISYNC_ON_SMP
+#define SYNC_ON_SMP	/* expands to nothing without CONFIG_SMP */
 #endif
 
 static inline void eieio(void)
diff --git a/include/asm-s390/futex.h b/include/asm-s390/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-s390/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-sh/futex.h b/include/asm-sh/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-sh/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-sh64/futex.h b/include/asm-sh64/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-sh64/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-sparc/futex.h b/include/asm-sparc/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-sparc/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-sparc64/futex.h b/include/asm-sparc64/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-sparc64/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-um/futex.h b/include/asm-um/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-um/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-v850/futex.h b/include/asm-v850/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-v850/futex.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro) into an
+ * operation, a comparison and their two sign-extended 12-bit arguments.
+ *
+ * This architecture provides no atomic user-space operation yet, so every
+ * operation yields -ENOSYS and the comparison stage is never reached.
+ * Returns -EFAULT if access_ok() rejects @uaddr, -ENOSYS otherwise.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-x86_64/futex.h b/include/asm-x86_64/futex.h
new file mode 100644
index 000000000000..8602c09bf89e
--- /dev/null
+++ b/include/asm-x86_64/futex.h
@@ -0,0 +1,122 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+/*
+ * Single-instruction variant: "insn" (xchgl, or xaddl under LOCK_PREFIX)
+ * exchanges/adds oparg with the user word and leaves the old value in %0.
+ * ret starts out as 0; the fixup sets it to -EFAULT if the access faults.
+ */
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	" insn "\n"						\
+"2:	.section .fixup,\"ax\"\n\
+3:	mov	%3, %1\n\
+	jmp	2b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.quad	1b,3b\n\
+	.previous"						\
+	: "=r" (oldval), "=r" (ret), "=m" (*uaddr)		\
+	: "i" (-EFAULT), "m" (*uaddr), "0" (oparg), "1" (0))
+
+/*
+ * Read-modify-write variant: loads the old value into %0 (%eax), lets
+ * "insn" combine a copy of it in %3 with oparg in %4, and retries the
+ * cmpxchgl until the user word did not change in between.  %3 is bound to
+ * a variable named "tem" that the calling function has to provide.
+ */
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	movl	%2, %0\n\
+	movl	%0, %3\n"					\
+	insn "\n"						\
+"2:	" LOCK_PREFIX "cmpxchgl %3, %2\n\
+	jnz	1b\n\
+3:	.section .fixup,\"ax\"\n\
+4:	mov	%5, %1\n\
+	jmp	3b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.quad	1b,4b,2b,4b\n\
+	.previous"						\
+	: "=&a" (oldval), "=&r" (ret), "=m" (*uaddr),		\
+	  "=&r" (tem)						\
+	: "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
+
+/*
+ * futex_atomic_op_inuser - architecture hook used by FUTEX_WAKE_OP.
+ *
+ * Decodes @encoded_op (laid out as by the FUTEX_OP() macro), atomically
+ * applies the operation to the user word at @uaddr and compares the old
+ * value of that word with the encoded comparison argument.
+ *
+ * Returns 1 if the comparison holds, 0 if it does not, -EFAULT if the
+ * word cannot be accessed and -ENOSYS for an unknown operation or
+ * comparison code.
+ */
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;	/* sign-extended bits 12-23 */
+	int cmparg = (encoded_op << 20) >> 20;	/* sign-extended bits 0-11 */
+	int oldval = 0, ret, tem;
+
+	/* oparg is taken straight from encoded_op: keep the shift within 0..31 */
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1U << (oparg & 31);
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ADD:
+		__futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
+				   uaddr, oparg);
+		break;
+	case FUTEX_OP_OR:
+		__futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ANDN:
+		__futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
+		break;
+	case FUTEX_OP_XOR:
+		__futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 65d6cfdb6d39..10f96c31971e 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -4,14 +4,43 @@
 /* Second argument to futex syscall */
 
 
-#define FUTEX_WAIT (0)
-#define FUTEX_WAKE (1)
-#define FUTEX_FD (2)
-#define FUTEX_REQUEUE (3)
-#define FUTEX_CMP_REQUEUE (4)
+#define FUTEX_WAIT		0
+#define FUTEX_WAKE		1
+#define FUTEX_FD		2
+#define FUTEX_REQUEUE		3
+#define FUTEX_CMP_REQUEUE	4
+#define FUTEX_WAKE_OP		5
 
 long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
+#define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
+#define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
+#define FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
+#define FUTEX_OP_ANDN		3	/* *(int *)UADDR2 &= ~OPARG; */
+#define FUTEX_OP_XOR		4	/* *(int *)UADDR2 ^= OPARG; */
+
+#define FUTEX_OP_OPARG_SHIFT	8	/* Use (1 << OPARG) instead of OPARG.  */
+
+#define FUTEX_OP_CMP_EQ		0	/* if (oldval == CMPARG) wake */
+#define FUTEX_OP_CMP_NE		1	/* if (oldval != CMPARG) wake */
+#define FUTEX_OP_CMP_LT		2	/* if (oldval < CMPARG) wake */
+#define FUTEX_OP_CMP_LE		3	/* if (oldval <= CMPARG) wake */
+#define FUTEX_OP_CMP_GT		4	/* if (oldval > CMPARG) wake */
+#define FUTEX_OP_CMP_GE		5	/* if (oldval >= CMPARG) wake */
+
+/* FUTEX_WAKE_OP wakes UADDR and will also perform atomically
+   int oldval = *(int *)UADDR2;
+   *(int *)UADDR2 = oldval OP OPARG;
+   if (oldval CMP CMPARG)
+     wake UADDR2;
+   OPARG and CMPARG are 12-bit fields, sign-extended on decode.  */
+
+/* Pack an operation, a comparison and their arguments into the encoded
+   op word that FUTEX_WAKE_OP takes as VAL3.  */
+#define FUTEX_OP(op, oparg, cmp, cmparg) \
+  ((((op) & 0xf) << 28) | (((cmp) & 0xf) << 24)		\
+   | (((oparg) & 0xfff) << 12) | ((cmparg) & 0xfff))
+
 #endif
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..07ba87de9658 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
+#include <asm/futex.h>
 
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
@@ -326,6 +327,131 @@ out:
 	return ret;
 }
 
+/*
+ * Apply the operation encoded in op to the user word at uaddr2, then wake
+ * up to nr_wake waiters on uaddr1 and, if the encoded comparison against
+ * the old value of that word holds, up to nr_wake2 waiters on uaddr2.
+ *
+ * Returns the total number of waiters woken, or a negative error code.
+ */
+static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+{
+	union futex_key key1, key2;
+	struct futex_hash_bucket *bh1, *bh2;
+	struct list_head *head;
+	struct futex_q *this, *next;
+	int ret, op_ret, attempt = 0;
+
+retryfull:
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr1, &key1);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	bh1 = hash_futex(&key1);
+	bh2 = hash_futex(&key2);
+
+retry:
+	if (bh1 < bh2)
+		spin_lock(&bh1->lock);
+	spin_lock(&bh2->lock);
+	if (bh1 > bh2)
+		spin_lock(&bh1->lock);
+
+	op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+	if (unlikely(op_ret < 0)) {
+		int dummy;
+
+		spin_unlock(&bh1->lock);
+		if (bh1 != bh2)
+			spin_unlock(&bh2->lock);
+
+		/* Only a fault is worth retrying; anything else (such
+		 * as -ENOSYS from an architecture without an atomic
+		 * op) is final and goes back to the caller as is.  */
+		if (unlikely(op_ret != -EFAULT)) {
+			ret = op_ret;
+			goto out;
+		}
+
+		/* futex_atomic_op_inuser needs to both read and write
+		 * *(int __user *)uaddr2, but we can't modify it
+		 * non-atomically.  Therefore, if get_user below is not
+		 * enough, we need to handle the fault ourselves, while
+		 * still holding the mmap_sem.  */
+		if (attempt++) {
+			struct vm_area_struct * vma;
+			struct mm_struct *mm = current->mm;
+
+			/* attempt is 2 on the first pass through here:
+			 * resolve the fault once, give up after that.  */
+			ret = -EFAULT;
+			if (attempt > 2 ||
+			    !(vma = find_vma(mm, uaddr2)) ||
+			    vma->vm_start > uaddr2 ||
+			    !(vma->vm_flags & VM_WRITE))
+				goto out;
+
+			switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
+			case VM_FAULT_MINOR:
+				current->min_flt++;
+				break;
+			case VM_FAULT_MAJOR:
+				current->maj_flt++;
+				break;
+			default:
+				goto out;
+			}
+			goto retry;
+		}
+
+		/* If we would have faulted, release mmap_sem,
+		 * fault it in and start all over again.  */
+		up_read(&current->mm->mmap_sem);
+
+		ret = get_user(dummy, (int __user *)uaddr2);
+		if (ret)
+			return ret;
+
+		goto retryfull;
+	}
+
+	head = &bh1->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &key1)) {
+			wake_futex(this);
+			if (++ret >= nr_wake)
+				break;
+		}
+	}
+
+	if (op_ret > 0) {
+		head = &bh2->chain;
+
+		op_ret = 0;
+		list_for_each_entry_safe(this, next, head, list) {
+			if (match_futex (&this->key, &key2)) {
+				wake_futex(this);
+				if (++op_ret >= nr_wake2)
+					break;
+			}
+		}
+		ret += op_ret;
+	}
+
+	spin_unlock(&bh1->lock);
+	if (bh1 != bh2)
+		spin_unlock(&bh2->lock);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
 /*
  * Requeue all waiters hashed on one physical page to another
  * physical page.
@@ -740,6 +866,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 	case FUTEX_CMP_REQUEUE:
 		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 		break;
+	case FUTEX_WAKE_OP:
+		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 8446f1d391f3d27e6bf9c43d4cbcdac0ca720417 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 6 Sep 2005 15:16:27 -0700
Subject: [PATCH] detect soft lockups

This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP.

When enabled, per-CPU watchdog threads are started, which try to run
once per second.  If they get delayed for more than 10 seconds then a
callback from the timer interrupt detects this condition and prints out a
warning message and a stack dump (once per lockup incident).  The feature
is otherwise non-intrusive: it doesn't try to unlock the box in any way, it
only gets the debug info out, automatically, and on all CPUs affected by
the lockup.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-Off-By: Matthias Urlichs <smurf@smurf.noris.de>
Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/nmi.c       |   5 ++
 arch/i386/kernel/time.c      |   1 +
 arch/x86_64/kernel/nmi.c     |   2 +
 arch/x86_64/kernel/time.c    |   1 +
 drivers/mtd/nand/nand_base.c |   1 +
 include/linux/sched.h        |  17 +++++
 init/main.c                  |   1 +
 kernel/Makefile              |   1 +
 kernel/power/swsusp.c        |   1 +
 kernel/softlockup.c          | 151 +++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c               |   1 +
 lib/Kconfig.debug            |  19 ++++++
 12 files changed, 201 insertions(+)
 create mode 100644 kernel/softlockup.c

(limited to 'kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 8bbdbda07a2d..0178457db721 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -478,6 +478,11 @@ void touch_nmi_watchdog (void)
 	 */
 	for (i = 0; i < NR_CPUS; i++)
 		alert_counter[i] = 0;
+
+	/*
+	 * Tickle the softlockup detector too:
+	 */
+	touch_softlockup_watchdog();
 }
 
 extern void die_nmi(struct pt_regs *, const char *msg);
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 6f794a78ee1e..b0c5ee2b3446 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -422,6 +422,7 @@ static int timer_resume(struct sys_device *dev)
 		last_timer->resume();
 	cur_timer = last_timer;
 	last_timer = NULL;
+	touch_softlockup_watchdog();
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 64a8e05d5811..84cae81fff8b 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -463,6 +463,8 @@ void touch_nmi_watchdog (void)
 	 */
 	for (i = 0; i < NR_CPUS; i++)
 		per_cpu(nmi_touch, i) = 1;
+
+ 	touch_softlockup_watchdog();
 }
 
 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 66bf6ddeb0c3..2b5d9da912a2 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1041,6 +1041,7 @@ static int timer_resume(struct sys_device *dev)
 	write_sequnlock_irqrestore(&xtime_lock,flags);
 	jiffies += sleep_length;
 	wall_jiffies += sleep_length;
+	touch_softlockup_watchdog();
 	return 0;
 }
 
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index eee5115658c8..04e54318bc6a 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -526,6 +526,7 @@ static void nand_wait_ready(struct mtd_info *mtd)
 	do {
 		if (this->dev_ready(mtd))
 			return;
+		touch_softlockup_watchdog();
 	} while (time_before(jiffies, timeo));	
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dec5827c7742..5fb31bede103 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,6 +176,23 @@ extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+extern void softlockup_tick(struct pt_regs *regs);
+extern void spawn_softlockup_task(void);
+extern void touch_softlockup_watchdog(void);
+#else /* !CONFIG_DETECT_SOFTLOCKUP: the hooks become empty inlines */
+static inline void softlockup_tick(struct pt_regs *regs)
+{
+}
+static inline void spawn_softlockup_task(void)
+{
+}
+static inline void touch_softlockup_watchdog(void)
+{
+}
+#endif /* CONFIG_DETECT_SOFTLOCKUP */
+
+
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 /* Is this address in the __sched functions? */
diff --git a/init/main.c b/init/main.c
index ff410063e4e1..a29fb2ac7240 100644
--- a/init/main.c
+++ b/init/main.c
@@ -614,6 +614,7 @@ static void do_pre_smp_initcalls(void)
 	migration_init();
 #endif
 	spawn_ksoftirqd();
+	spawn_softlockup_task();
 }
 
 static void run_init_process(char *init_filename)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..8d57a2f1226b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index eaacd5cb5889..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1059,6 +1059,7 @@ int swsusp_resume(void)
 	BUG_ON(!error);
 	restore_processor_state();
 	restore_highmem();
+	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
 	return error;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
+/*
+ * Detect Soft Lockups
+ *
+ * started by Ingo Molnar, (C) 2005, Red Hat
+ *
+ * this code detects soft lockups: incidents where, on a CPU,
+ * the kernel does not reschedule for 10 seconds or more.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+
+static DEFINE_SPINLOCK(print_lock);
+
+static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
+
+static int did_panic = 0;
+static int softlock_panic(struct notifier_block *this, unsigned long event,
+				void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = softlock_panic,
+};
+
+void touch_softlockup_watchdog(void)
+{
+	per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+}
+EXPORT_SYMBOL(touch_softlockup_watchdog);
+
+/*
+ * This callback runs from the timer interrupt, and checks
+ * whether the watchdog thread has hung or not:
+ */
+void softlockup_tick(struct pt_regs *regs)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long timestamp = per_cpu(timestamp, this_cpu);
+
+	if (per_cpu(print_timestamp, this_cpu) == timestamp)
+		return;
+
+	/* Do not cause a second panic when there already was one */
+	if (did_panic)
+		return;
+
+	if (time_after(jiffies, timestamp + 10*HZ)) {
+		per_cpu(print_timestamp, this_cpu) = timestamp;
+
+		spin_lock(&print_lock);
+		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
+			this_cpu);
+		show_regs(regs);
+		spin_unlock(&print_lock);
+	}
+}
+
+/*
+ * The watchdog thread - runs every second and touches the timestamp.
+ */
+static int watchdog(void * __bind_cpu)
+{
+	struct sched_param param = { .sched_priority = 99 };
+	int this_cpu = (long) __bind_cpu;
+
+	printk("softlockup thread %d started up.\n", this_cpu);
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	current->flags |= PF_NOFREEZE;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/*
+	 * Run briefly once per second - if this gets delayed for
+	 * more than 10 seconds then the debug-printout triggers
+	 * in softlockup_tick():
+	 */
+	while (!kthread_should_stop()) {
+		msleep_interruptible(1000);
+		touch_softlockup_watchdog();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __devinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		BUG_ON(per_cpu(watchdog_task, hotcpu));
+		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("watchdog for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+  		per_cpu(watchdog_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+ 		break;
+	case CPU_ONLINE:
+
+		wake_up_process(per_cpu(watchdog_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+		p = per_cpu(watchdog_task, hotcpu);
+		per_cpu(watchdog_task, hotcpu) = NULL;
+		kthread_stop(p);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ 	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init void spawn_softlockup_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+
+	notifier_chain_register(&panic_notifier_list, &panic_block);
+}
+
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..1433d87f46b3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
 {
 	jiffies_64++;
 	update_times();
+	softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 299f7f3b5b08..3754c9a8f5c8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -46,6 +46,25 @@ config LOG_BUF_SHIFT
 		     13 =>  8 KB
 		     12 =>  4 KB
 
+config DETECT_SOFTLOCKUP
+	bool "Detect Soft Lockups"
+	depends on DEBUG_KERNEL
+	default y
+	help
+	  Say Y here to enable the kernel to detect "soft lockups",
+	  which are bugs that cause the kernel to loop in kernel
+	  mode for more than 10 seconds, without giving other tasks a
+	  chance to run.
+
+	  When a soft-lockup is detected, the kernel will print the
+	  current stack trace (which you should report), but the
+	  system will stay locked up. This feature has negligible
+	  overhead.
+
+	  (Note that "hard lockups" are a separate type of bug that
+	   can be detected via the NMI-watchdog, on platforms that
+	   support it.)
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
-- 
cgit v1.2.3-59-g8ed1b


From fe21773d655c2c64641ec2cef499289ea175c817 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 6 Sep 2005 15:16:34 -0700
Subject: [PATCH] Provide better printk() support for SMP machines

The attached patch prevents oopses interleaving with characters from
other printks on other CPUs by only breaking the lock if the oops is
happening on the machine holding the lock.

It might be better if the oops generator got the lock and then called an
inner vprintk routine that assumed the caller holds the lock, thus
making oops reports "atomic".

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..a967605bc2e3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...)
 	return r;
 }
 
+/* cpu currently holding logbuf_lock */
+static volatile unsigned int printk_cpu = UINT_MAX;
+
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
 	unsigned long flags;
@@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	static char printk_buf[1024];
 	static int log_level_unknown = 1;
 
-	if (unlikely(oops_in_progress))
+	preempt_disable();
+	if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
+		/* If a crash is occurring during printk() on this CPU,
+		 * make sure we can't deadlock */
 		zap_locks();
 
 	/* This stops the holder of console_sem just where we want him */
 	spin_lock_irqsave(&logbuf_lock, flags);
+	printk_cpu = smp_processor_id();
 
 	/* Emit the output into the temporary buffer */
 	printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		 * CPU until it is officially up.  We shouldn't be calling into
 		 * random console drivers on a CPU which doesn't exist yet..
 		 */
+		printk_cpu = UINT_MAX;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 		goto out;
 	}
@@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		 * We own the drivers.  We can drop the spinlock and let
 		 * release_console_sem() print the text
 		 */
+		printk_cpu = UINT_MAX;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 		console_may_schedule = 0;
 		release_console_sem();
@@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		 * allows the semaphore holder to proceed and to call the
 		 * console drivers with the output which we just produced.
 		 */
+		printk_cpu = UINT_MAX;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 	}
 out:
+	preempt_enable();
 	return printed_len;
 }
 EXPORT_SYMBOL(printk);
-- 
cgit v1.2.3-59-g8ed1b


From 486d46aefe7ded0d343e306be740edd972aff740 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@hp.com>
Date: Tue, 6 Sep 2005 15:17:04 -0700
Subject: [PATCH] optimize writer path in time_interpolator_get_counter()

      Christoph Lameter <clameter@engr.sgi.com>

When using a time interpolator that is susceptible to jitter there's
potentially contention over a cmpxchg used to prevent time from going
backwards.  This is unnecessary when the caller holds the xtime write
seqlock as all readers will be blocked from returning until the write is
complete.  We can therefore allow writers to insert a new value and exit
rather than fight with CPUs who only hold a reader lock.

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 1433d87f46b3..13e2b513be01 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1429,7 +1429,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
 	}
 }
 
-static inline u64 time_interpolator_get_counter(void)
+static inline u64 time_interpolator_get_counter(int writelock)
 {
 	unsigned int src = time_interpolator->source;
 
@@ -1443,6 +1443,15 @@ static inline u64 time_interpolator_get_counter(void)
 			now = time_interpolator_get_cycles(src);
 			if (lcycle && time_after(lcycle, now))
 				return lcycle;
+
+			/* When holding the xtime write lock, there's no need
+			 * to add the overhead of the cmpxchg.  Readers are
+			 * forced to retry until the write lock is released.
+			 */
+			if (writelock) {
+				time_interpolator->last_cycle = now;
+				return now;
+			}
 			/* Keep track of the last timer value returned. The use of cmpxchg here
 			 * will cause contention in an SMP environment.
 			 */
@@ -1456,7 +1465,7 @@ static inline u64 time_interpolator_get_counter(void)
 void time_interpolator_reset(void)
 {
 	time_interpolator->offset = 0;
-	time_interpolator->last_counter = time_interpolator_get_counter();
+	time_interpolator->last_counter = time_interpolator_get_counter(1);
 }
 
 #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
@@ -1468,7 +1477,7 @@ unsigned long time_interpolator_get_offset(void)
 		return 0;
 
 	return time_interpolator->offset +
-		GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
+		GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
 }
 
 #define INTERPOLATOR_ADJUST 65536
@@ -1491,7 +1500,7 @@ static void time_interpolator_update(long delta_nsec)
 	 * and the tuning logic insures that.
          */
 
-	counter = time_interpolator_get_counter();
+	counter = time_interpolator_get_counter(1);
 	offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
 
 	if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
-- 
cgit v1.2.3-59-g8ed1b


From 378bac820be6a0ec95df8151524de73ad2b2d2ac Mon Sep 17 00:00:00 2001
From: Thomas Koeller <thomas@koeller.dyndns.org>
Date: Tue, 6 Sep 2005 15:17:11 -0700
Subject: [PATCH] flush icache early when loading module

Change the sequence of operations performed during module loading to flush
the instruction cache before module parameters are processed.  If a module
has parameters of an unusual type that cannot be handled using the standard
accessor functions param_set_xxx and param_get_xxx, it has to provide a
set of accessor functions for this type.  This requires module code to be
executed during parameter processing, which is of course only possible
after the icache has been flushed.

Signed-off-by: Thomas Koeller <thomas@koeller.dyndns.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index c32995fbd8fd..4b39d3793c72 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1509,6 +1509,7 @@ static struct module *load_module(void __user *umod,
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
 	struct exception_table_entry *extable;
+	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
 	       umod, len, uargs);
@@ -1779,6 +1780,24 @@ static struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto cleanup;
 
+	/* flush the icache in correct context */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	/*
+	 * Flush the instruction cache, since we've played with text.
+	 * Do it before processing of module parameters, so the module
+	 * can provide parameter accessor functions of its own.
+	 */
+	if (mod->module_init)
+		flush_icache_range((unsigned long)mod->module_init,
+				   (unsigned long)mod->module_init
+				   + mod->init_size);
+	flush_icache_range((unsigned long)mod->module_core,
+			   (unsigned long)mod->module_core + mod->core_size);
+
+	set_fs(old_fs);
+
 	mod->args = args;
 	if (obsparmindex) {
 		err = obsolete_params(mod->name, mod->args,
@@ -1860,7 +1879,6 @@ sys_init_module(void __user *umod,
 		const char __user *uargs)
 {
 	struct module *mod;
-	mm_segment_t old_fs = get_fs();
 	int ret = 0;
 
 	/* Must have permission */
@@ -1878,19 +1896,6 @@ sys_init_module(void __user *umod,
 		return PTR_ERR(mod);
 	}
 
-	/* flush the icache in correct context */
-	set_fs(KERNEL_DS);
-
-	/* Flush the instruction cache, since we've played with text */
-	if (mod->module_init)
-		flush_icache_range((unsigned long)mod->module_init,
-				   (unsigned long)mod->module_init
-				   + mod->init_size);
-	flush_icache_range((unsigned long)mod->module_core,
-			   (unsigned long)mod->module_core + mod->core_size);
-
-	set_fs(old_fs);
-
 	/* Now sew it into the lists.  They won't access us, since
            strong_try_module_get() will fail. */
 	stop_machine_run(__link_module, mod, NR_CPUS);
-- 
cgit v1.2.3-59-g8ed1b


From 230649da7cb73914b8b2a1ffc802a2951e970454 Mon Sep 17 00:00:00 2001
From: Mika Kukkonen <mikukkon@gmail.com>
Date: Tue, 6 Sep 2005 15:17:17 -0700
Subject: [PATCH] create_workqueue_thread() signedness fix

With "-W -Wno-unused -Wno-sign-compare" I get the following compile warning:

  CC      kernel/workqueue.o
kernel/workqueue.c: In function `workqueue_cpu_callback':
kernel/workqueue.c:504: warning: ordered comparison of pointer with integer zero

On error create_workqueue_thread() returns NULL, not negative pointer, so
following trivial patch suggests itself.

Signed-off-by: Mika Kukkonen <mikukkon@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c7e36d4a70ca..a3de837a8ddd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -499,7 +499,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	case CPU_UP_PREPARE:
 		/* Create a new workqueue thread for it. */
 		list_for_each_entry(wq, &workqueues, list) {
-			if (create_workqueue_thread(wq, hotcpu) < 0) {
+			if (!create_workqueue_thread(wq, hotcpu)) {
 				printk("workqueue for %i failed\n", hotcpu);
 				return NOTIFY_BAD;
 			}
-- 
cgit v1.2.3-59-g8ed1b


From f26fdd59929e1144c6caf72adcaf4561d6e682a4 Mon Sep 17 00:00:00 2001
From: Karsten Wiese <annabellesgarden@yahoo.de>
Date: Tue, 6 Sep 2005 15:17:25 -0700
Subject: [PATCH] CHECK_IRQ_PER_CPU() to avoid dead code in __do_IRQ()

IRQ_PER_CPU is not used by all architectures.  This patch introduces the
macros ARCH_HAS_IRQ_PER_CPU and CHECK_IRQ_PER_CPU() to avoid the generation
of dead code in __do_IRQ().

ARCH_HAS_IRQ_PER_CPU is defined by architectures using IRQ_PER_CPU in their
include/asm_ARCH/irq.h file.

Through grepping the tree I found the following architectures currently use
IRQ_PER_CPU:

        cris, ia64, ppc, ppc64 and parisc.

Signed-off-by: Karsten Wiese <annabellesgarden@yahoo.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-cris/irq.h   | 5 +++++
 include/asm-ia64/irq.h   | 5 +++++
 include/asm-parisc/irq.h | 5 +++++
 include/asm-ppc/irq.h    | 5 +++++
 include/asm-ppc64/irq.h  | 5 +++++
 include/linux/irq.h      | 7 ++++++-
 kernel/irq/handle.c      | 2 +-
 7 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-cris/irq.h b/include/asm-cris/irq.h
index 8e787fdaedd4..4fab5c3b2e15 100644
--- a/include/asm-cris/irq.h
+++ b/include/asm-cris/irq.h
@@ -1,6 +1,11 @@
 #ifndef _ASM_IRQ_H
 #define _ASM_IRQ_H
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 #include <asm/arch/irq.h>
 
 extern __inline__ int irq_canonicalize(int irq)
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index 5d930fdc0bea..cd984d08fd15 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -14,6 +14,11 @@
 #define NR_IRQS		256
 #define NR_IRQ_VECTORS	NR_IRQS
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 static __inline__ int
 irq_canonicalize (int irq)
 {
diff --git a/include/asm-parisc/irq.h b/include/asm-parisc/irq.h
index 75654ba93353..f876bdf22056 100644
--- a/include/asm-parisc/irq.h
+++ b/include/asm-parisc/irq.h
@@ -26,6 +26,11 @@
 
 #define NR_IRQS		(CPU_IRQ_MAX + 1)
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 static __inline__ int irq_canonicalize(int irq)
 {
 	return (irq == 2) ? 9 : irq;
diff --git a/include/asm-ppc/irq.h b/include/asm-ppc/irq.h
index a244d93ca953..b4b270457edd 100644
--- a/include/asm-ppc/irq.h
+++ b/include/asm-ppc/irq.h
@@ -19,6 +19,11 @@
 #define IRQ_POLARITY_POSITIVE	0x2	/* high level or low->high edge */
 #define IRQ_POLARITY_NEGATIVE	0x0	/* low level or high->low edge */
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 #if defined(CONFIG_40x)
 #include <asm/ibm4xx.h>
 
diff --git a/include/asm-ppc64/irq.h b/include/asm-ppc64/irq.h
index 570678b1da95..99782afb4cde 100644
--- a/include/asm-ppc64/irq.h
+++ b/include/asm-ppc64/irq.h
@@ -33,6 +33,11 @@
 #define IRQ_POLARITY_POSITIVE	0x2	/* high level or low->high edge */
 #define IRQ_POLARITY_NEGATIVE	0x0	/* low level or high->low edge */
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 #define get_irq_desc(irq) (&irq_desc[(irq)])
 
 /* Define a way to iterate across irqs. */
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 4a362b9ec966..69681c3b1f05 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -32,7 +32,12 @@
 #define IRQ_WAITING	32	/* IRQ not yet seen - for autodetection */
 #define IRQ_LEVEL	64	/* IRQ level triggered */
 #define IRQ_MASKED	128	/* IRQ masked - shouldn't be seen again */
-#define IRQ_PER_CPU	256	/* IRQ is per CPU */
+#if defined(ARCH_HAS_IRQ_PER_CPU)
+# define IRQ_PER_CPU	256	/* IRQ is per CPU */
+# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
+#else
+# define CHECK_IRQ_PER_CPU(var) 0
+#endif
 
 /*
  * Interrupt controller descriptor. This is all we need
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 	unsigned int status;
 
 	kstat_this_cpu.irqs[irq]++;
-	if (desc->status & IRQ_PER_CPU) {
+	if (CHECK_IRQ_PER_CPU(desc->status)) {
 		irqreturn_t action_ret;
 
 		/*
-- 
cgit v1.2.3-59-g8ed1b


From bc505a478d3fffcfb269b72f64df4510305cfe81 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 6 Sep 2005 15:17:32 -0700
Subject: [PATCH] do_notify_parent_cldstop() cleanup

This patch simplifies the usage of do_notify_parent_cldstop(), it lessens
the source and .text size slightly, and makes the code (in my opinion) a
bit more readable.

I am sending this patch now because I'm afraid Paul will touch
do_notify_parent_cldstop() really soon; it's better to clean up first.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 61 ++++++++++++++++++++++++---------------------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index d282fea81138..56e33df2b67f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 
 /* forward decl */
 static void do_notify_parent_cldstop(struct task_struct *tsk,
-				     struct task_struct *parent,
+				     int to_self,
 				     int why);
 
 /*
@@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			p->signal->group_stop_count = 0;
 			p->signal->flags = SIGNAL_STOP_CONTINUED;
 			spin_unlock(&p->sighand->siglock);
-			if (p->ptrace & PT_PTRACED)
-				do_notify_parent_cldstop(p, p->parent,
-							 CLD_STOPPED);
-			else
-				do_notify_parent_cldstop(
-					p->group_leader,
-					p->group_leader->real_parent,
-							 CLD_STOPPED);
+			do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
 			spin_lock(&p->sighand->siglock);
 		}
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			p->signal->flags = SIGNAL_STOP_CONTINUED;
 			p->signal->group_exit_code = 0;
 			spin_unlock(&p->sighand->siglock);
-			if (p->ptrace & PT_PTRACED)
-				do_notify_parent_cldstop(p, p->parent,
-							 CLD_CONTINUED);
-			else
-				do_notify_parent_cldstop(
-					p->group_leader,
-					p->group_leader->real_parent,
-							 CLD_CONTINUED);
+			do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
 			spin_lock(&p->sighand->siglock);
 		} else {
 			/*
@@ -1542,14 +1528,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 	spin_unlock_irqrestore(&psig->siglock, flags);
 }
 
-static void
-do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
-			 int why)
+static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
 {
 	struct siginfo info;
 	unsigned long flags;
+	struct task_struct *parent;
 	struct sighand_struct *sighand;
 
+	if (to_self)
+		parent = tsk->parent;
+	else {
+		tsk = tsk->group_leader;
+		parent = tsk->real_parent;
+	}
+
 	info.si_signo = SIGCHLD;
 	info.si_errno = 0;
 	info.si_pid = tsk->pid;
@@ -1618,8 +1610,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 		   !(current->ptrace & PT_ATTACHED)) &&
 	    (likely(current->parent->signal != current->signal) ||
 	     !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
-		do_notify_parent_cldstop(current, current->parent,
-					 CLD_TRAPPED);
+		do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
 		schedule();
 	} else {
@@ -1668,25 +1659,25 @@ void ptrace_notify(int exit_code)
 static void
 finish_stop(int stop_count)
 {
+	int to_self;
+
 	/*
 	 * If there are no other threads in the group, or if there is
 	 * a group stop in progress and we are the last to stop,
 	 * report to the parent.  When ptraced, every thread reports itself.
 	 */
-	if (stop_count < 0 || (current->ptrace & PT_PTRACED)) {
-		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(current, current->parent,
-					 CLD_STOPPED);
-		read_unlock(&tasklist_lock);
-	}
-	else if (stop_count == 0) {
-		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(current->group_leader,
-					 current->group_leader->real_parent,
-					 CLD_STOPPED);
-		read_unlock(&tasklist_lock);
-	}
+	if (stop_count < 0 || (current->ptrace & PT_PTRACED))
+		to_self = 1;
+	else if (stop_count == 0)
+		to_self = 0;
+	else
+		goto out;
 
+	read_lock(&tasklist_lock);
+	do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
+	read_unlock(&tasklist_lock);
+
+out:
 	schedule();
 	/*
 	 * Now we don't run again until continued.
-- 
cgit v1.2.3-59-g8ed1b


From 6c9c0b52b8c6b68b05bb06efd7079a8fc5e9ba60 Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Tue, 6 Sep 2005 15:17:35 -0700
Subject: [PATCH] largefile support for accounting

There is a problem in the accounting subsystem: the kernel can not
correctly handle files larger than 2GB.  The output file containing the
process accounting data can grow very large if the system is large enough
and active enough.  If the 2GB limit is reached, then the system simply
stops storing process accounting data.

Another annoying problem is that once the system reaches this 2GB limit,
then every process which exits will receive a signal, SIGXFSZ.  This signal
is generated because an attempt was made to write beyond the limit for the
file descriptor.  This signal makes it look like every process has exited
due to a signal, when in fact, they have not.

The solution is to add the O_LARGEFILE flag to the list of flags used to
open the accounting file.  The rest of the accounting support is already
largefile safe.

The changes were tested by constructing a large file (just short of 2GB),
enabling accounting, and then running enough commands to cause the
accounting data generated to increase the size of the file to 2GB.  Without
the changes, the file grows to 2GB and the last command run in the test
script appears to exit due to a signal when it has not.  With the changes,
things work as expected and quietly.

There are some user level changes required so that it can deal with
largefiles, but those are being handled separately.

Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..f70e6027cca9 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -220,7 +220,7 @@ asmlinkage long sys_acct(const char __user *name)
 			return (PTR_ERR(tmp));
 		}
 		/* Difference from BSD - they don't do O_APPEND */
-		file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+		file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 		putname(tmp);
 		if (IS_ERR(file)) {
 			return (PTR_ERR(file));
-- 
cgit v1.2.3-59-g8ed1b


From 0730ded5be28653675ed314fdd878b8db5f88aa4 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Tue, 6 Sep 2005 15:17:37 -0700
Subject: [PATCH] remove a redundant variable in sys_prctl()

The patch removes a redundant variable `sig' from sys_prctl().

For some reason, when sys_prctl is called with option == PR_SET_PDEATHSIG
then the value of arg2 is assigned to an int variable named sig.  Then sig
is tested with valid_signal() and later used to set the value of
current->pdeath_signal .

There is no reason to use this intermediate variable since valid_signal()
takes an unsigned long argument, so it can handle being passed arg2
directly, and if the call to valid_signal is OK, then we know the value of
arg2 is in the range zero to _NSIG and thus it'll easily fit in a plain int
and thus there's no problem assigning it later to current->pdeath_signal
(which is an int).

The patch gets rid of the pointless variable `sig'.
This reduces the size of kernel/sys.o in 2.6.13-rc6-mm1 by 32 bytes on my
system.

Patch has been compile tested, boot tested, and just to make damn sure I
didn't break anything I wrote a quick test app that calls
prctl(PR_SET_PDEATHSIG ...) with the entire range of values for a
unsigned long, and it behaves as expected with and without the patch.

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 0bcaed6560ac..c80412be2302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1711,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5)
 {
 	long error;
-	int sig;
 
 	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
 	if (error)
@@ -1719,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 
 	switch (option) {
 		case PR_SET_PDEATHSIG:
-			sig = arg2;
-			if (!valid_signal(sig)) {
+			if (!valid_signal(arg2)) {
 				error = -EINVAL;
 				break;
 			}
-			current->pdeath_signal = sig;
+			current->pdeath_signal = arg2;
 			break;
 		case PR_GET_PDEATHSIG:
 			error = put_user(current->pdeath_signal, (int __user *)arg2);
-- 
cgit v1.2.3-59-g8ed1b


From e752dd6cc66a3e6a11396928998baf390cc00420 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 6 Sep 2005 15:17:42 -0700
Subject: [PATCH] fix send_sigqueue() vs thread exit race

posix_timer_event() first checks that the thread (SIGEV_THREAD_ID case)
does not have PF_EXITING flag, then it calls send_sigqueue() which locks
task list.  But if the thread exits in between the kernel will oops
(->sighand == NULL after __exit_sighand).

This patch moves the PF_EXITING check into the send_sigqueue(), it must be
done atomically under tasklist_lock.  When send_sigqueue() detects exiting
thread it returns -1.  In that case posix_timer_event will send the signal
to thread group.

Also, this patch fixes task_struct use-after-free in posix_timer_event.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-timers.c | 28 +++++++++++++++-------------
 kernel/signal.c       | 22 ++++++++++++----------
 2 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 38798a2ff994..b7b532acd9fc 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
 	timr->sigq->info.si_code = SI_TIMER;
 	timr->sigq->info.si_tid = timr->it_id;
 	timr->sigq->info.si_value = timr->it_sigev_value;
+
 	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
-		if (unlikely(timr->it_process->flags & PF_EXITING)) {
-			timr->it_sigev_notify = SIGEV_SIGNAL;
-			put_task_struct(timr->it_process);
-			timr->it_process = timr->it_process->group_leader;
-			goto group;
-		}
-		return send_sigqueue(timr->it_sigev_signo, timr->sigq,
-			timr->it_process);
-	}
-	else {
-	group:
-		return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
-			timr->it_process);
+		struct task_struct *leader;
+		int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
+					timr->it_process);
+
+		if (likely(ret >= 0))
+			return ret;
+
+		timr->it_sigev_notify = SIGEV_SIGNAL;
+		leader = timr->it_process->group_leader;
+		put_task_struct(timr->it_process);
+		timr->it_process = leader;
 	}
+
+	return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
+				   timr->it_process);
 }
 EXPORT_SYMBOL_GPL(posix_timer_event);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 56e33df2b67f..4980a073237f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1366,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	unsigned long flags;
 	int ret = 0;
 
-	/*
-	 * We need the tasklist lock even for the specific
-	 * thread case (when we don't need to follow the group
-	 * lists) in order to avoid races with "p->sighand"
-	 * going away or changing from under us.
-	 */
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);  
+	read_lock(&tasklist_lock);
+
+	if (unlikely(p->flags & PF_EXITING)) {
+		ret = -1;
+		goto out_err;
+	}
+
 	spin_lock_irqsave(&p->sighand->siglock, flags);
-	
+
 	if (unlikely(!list_empty(&q->list))) {
 		/*
 		 * If an SI_TIMER entry is already queue just increment
@@ -1385,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 			BUG();
 		q->info.si_overrun++;
 		goto out;
-	} 
+	}
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(p, sig)) {
 		ret = 1;
@@ -1400,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 
 out:
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+out_err:
 	read_unlock(&tasklist_lock);
-	return(ret);
+
+	return ret;
 }
 
 int
-- 
cgit v1.2.3-59-g8ed1b


From 39ed3fdeec1290dd246dcf1da6b278566987a084 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Tue, 6 Sep 2005 15:17:44 -0700
Subject: [PATCH] futex: remove duplicate code

This patch cleans up the error path of futex_fd() by removing duplicate
code.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 07ba87de9658..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -786,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal)
 	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
 
 	if (signal) {
-		int err;
 		err = f_setown(filp, current->pid, 1);
 		if (err < 0) {
-			put_unused_fd(ret);
-			put_filp(filp);
-			ret = err;
-			goto out;
+			goto error;
 		}
 		filp->f_owner.signum = signal;
 	}
 
 	q = kmalloc(sizeof(*q), GFP_KERNEL);
 	if (!q) {
-		put_unused_fd(ret);
-		put_filp(filp);
-		ret = -ENOMEM;
-		goto out;
+		err = -ENOMEM;
+		goto error;
 	}
 
 	down_read(&current->mm->mmap_sem);
@@ -810,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal)
 
 	if (unlikely(err != 0)) {
 		up_read(&current->mm->mmap_sem);
-		put_unused_fd(ret);
-		put_filp(filp);
 		kfree(q);
-		return err;
+		goto error;
 	}
 
 	/*
@@ -829,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal)
 	fd_install(ret, filp);
 out:
 	return ret;
+error:
+	put_unused_fd(ret);
+	put_filp(filp);
+	ret = err;
+	goto out;
 }
 
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-- 
cgit v1.2.3-59-g8ed1b


From 9bf2229f8817677127a60c177aefce1badd22d7b Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Tue, 6 Sep 2005 15:18:12 -0700
Subject: [PATCH] cpusets: formalize intermediate GFP_KERNEL containment

This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution.  With this patch, there are now the following four layers of
memory placement available:

 1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
 2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
 3) The current task's cpuset (GFP_USER allocations constrained to here), and
 4) Specific node placement, using mbind and set_mempolicy.

These nest - each layer is a subset (same or within) of the previous.

Layer (2) above is new, with this patch.  The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed.  The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.

GFP_ATOMIC and GFP_KERNEL allocations will stay within the current task's
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.

The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such.  Swapper and oom_kill activity is also constrained to Layer (2).  A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset.  Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.

This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.

Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt | 12 +++++++
 include/linux/cpuset.h    |  5 +--
 kernel/cpuset.c           | 80 ++++++++++++++++++++++++++++++++++++++++++-----
 mm/page_alloc.c           | 16 ++++++----
 mm/vmscan.c               |  8 ++---
 5 files changed, 101 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index ad944c060312..47f4114fbf54 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -60,6 +60,18 @@ all of the cpus in the system. This removes any overhead due to
 load balancing code trying to pull tasks outside of the cpu exclusive
 cpuset only to be prevented by the tasks' cpus_allowed mask.
 
+A cpuset that is mem_exclusive restricts kernel allocations for
+page, buffer and other data commonly shared by the kernel across
+multiple users.  All cpusets, whether mem_exclusive or not, restrict
+allocations of memory for user space.  This enables configuring a
+system so that several independent jobs can share common kernel
+data, such as file system pages, while isolating each job's user
+allocation in its own cpuset.  To do this, construct a large
+mem_exclusive cpuset to hold all the jobs, and construct child,
+non-mem_exclusive cpusets for each individual job.  Only a small
+amount of typical kernel memory, such as requests from interrupt
+handlers, is allowed to be taken outside even a mem_exclusive cpuset.
+
 User level code may create and destroy cpusets by name in the cpuset
 virtual file system, manage the attributes and permissions of these
 cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3438233305a3..1fe1c3ebad30 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -23,7 +23,7 @@ void cpuset_init_current_mems_allowed(void);
 void cpuset_update_current_mems_allowed(void);
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
-int cpuset_zone_allowed(struct zone *z);
+extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -48,7 +48,8 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 1;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z)
+static inline int cpuset_zone_allowed(struct zone *z,
+					unsigned int __nocast gfp_mask)
 {
 	return 1;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..214806deca99 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1611,17 +1611,81 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 0;
 }
 
+/*
+ * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
+ * ancestor to the specified cpuset.  Call while holding cpuset_sem.
+ * If no ancestor is mem_exclusive (an unusual configuration), then
+ * returns the root cpuset.
+ */
+static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+{
+	while (!is_mem_exclusive(cs) && cs->parent)
+		cs = cs->parent;
+	return cs;
+}
+
 /**
- * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
- * @z: zone in question
+ * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
  *
- * Is zone z allowed in current->mems_allowed, or is
- * the CPU in interrupt context? (zone is always allowed in this case)
- */
-int cpuset_zone_allowed(struct zone *z)
+ * If we're in interrupt, yes, we can always allocate.  If zone
+ * z's node is in our task's mems_allowed, yes.  If it's not a
+ * __GFP_HARDWALL request and this zone's node is in the nearest
+ * mem_exclusive cpuset ancestor to this task's cpuset, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current task's cpuset.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest mem_exclusive ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires cpuset_sem.  The __alloc_pages()
+ * routine only calls here with __GFP_HARDWALL bit _not_ set if
+ * it's a GFP_KERNEL allocation, and all nodes in the current tasks
+ * mems_allowed came up empty on the first pass over the zonelist.
+ * So only GFP_KERNEL allocations, if all nodes in the cpuset are
+ * short of memory, might require taking the cpuset_sem semaphore.
+ *
+ * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
+ * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
+ * hardwall cpusets - no allocation on a node outside the cpuset is
+ * allowed (unless in interrupt, of course).
+ *
+ * The second loop doesn't even call here for GFP_ATOMIC requests
+ * (the __alloc_pages() local variable 'wait' is not set).  That check
+ * and the checks below have the combined effect in the second loop of
+ * the __alloc_pages() routine that:
+ *	in_interrupt - any node ok (current task context irrelevant)
+ *	GFP_ATOMIC   - any node ok
+ *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ *	GFP_USER     - only nodes in current tasks mems allowed ok.
+ **/
+
+int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
 {
-	return in_interrupt() ||
-		node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+	int node;			/* node that zone z is on */
+	const struct cpuset *cs;	/* current cpuset ancestors */
+	int allowed = 1;		/* is allocation in zone z allowed? */
+
+	if (in_interrupt())
+		return 1;
+	node = z->zone_pgdat->node_id;
+	if (node_isset(node, current->mems_allowed))
+		return 1;
+	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
+		return 0;
+
+	/* Not hardwall and node outside mems_allowed: scan up cpusets */
+	down(&cpuset_sem);
+	cs = current->cpuset;
+	if (!cs)
+		goto done;		/* current task exiting */
+	cs = nearest_exclusive_ancestor(cs);
+	allowed = node_isset(node, cs->mems_allowed);
+done:
+	up(&cpuset_sem);
+	return allowed;
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d7032c1d12..3974fd81d27c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 	classzone_idx = zone_idx(zones[0]);
 
 restart:
-	/* Go through the zonelist once, looking for a zone with enough free */
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		/*
@@ -845,6 +848,7 @@ zone_reclaim_retry:
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
@@ -852,7 +856,7 @@ zone_reclaim_retry:
 				       gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (wait && !cpuset_zone_allowed(z))
+		if (wait && !cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -867,7 +871,7 @@ zone_reclaim_retry:
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			/* go through the zonelist yet again, ignoring mins */
 			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z))
+				if (!cpuset_zone_allowed(z, gfp_mask))
 					continue;
 				page = buffered_rmqueue(z, order, gfp_mask);
 				if (page)
@@ -903,7 +907,7 @@ rebalance:
 					       gfp_mask & __GFP_HIGH))
 				continue;
 
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, gfp_mask))
 				continue;
 
 			page = buffered_rmqueue(z, order, gfp_mask);
@@ -922,7 +926,7 @@ rebalance:
 					       classzone_idx, 0, 0))
 				continue;
 
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 				continue;
 
 			page = buffered_rmqueue(z, order, gfp_mask);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0095533cdde9..a740778f688d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (zone->present_pages == 0)
 			continue;
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = sc->priority;
@@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = DEF_PRIORITY;
@@ -986,7 +986,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->prev_priority = zone->temp_priority;
@@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone))
+	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 		return;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
 		return;
-- 
cgit v1.2.3-59-g8ed1b


From ef08e3b4981aebf2ba9bd7025ef7210e8eec07ce Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Tue, 6 Sep 2005 15:18:13 -0700
Subject: [PATCH] cpusets: confine oom_killer to mem_exclusive cpuset

Now the real motivation for this cpuset mem_exclusive patch series seems
trivial.

This patch keeps a task in or under one mem_exclusive cpuset from provoking an
oom kill of a task under a non-overlapping mem_exclusive cpuset.  Since only
interrupt and GFP_ATOMIC allocations are allowed to escape mem_exclusive
containment, there is little to gain from oom killing a task under a
non-overlapping mem_exclusive cpuset, as almost all kernel and user memory
allocation must come from disjoint memory nodes.

This patch enables configuring a system so that a runaway job under one
mem_exclusive cpuset cannot cause the killing of a job in another such cpuset
that might be using very high compute and memory resources for a prolonged
time.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  6 ++++++
 kernel/cpuset.c        | 33 +++++++++++++++++++++++++++++++++
 mm/oom_kill.c          |  5 +++++
 3 files changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 1fe1c3ebad30..24062a1dbf61 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -24,6 +24,7 @@ void cpuset_update_current_mems_allowed(void);
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
+extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -54,6 +55,11 @@ static inline int cpuset_zone_allowed(struct zone *z,
 	return 1;
 }
 
+static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
+{
+	return 1;
+}
+
 static inline char *cpuset_task_status_allowed(struct task_struct *task,
 							char *buffer)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 214806deca99..40c6d801dd66 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1688,6 +1688,39 @@ done:
 	return allowed;
 }
 
+/**
+ * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
+ * @p: pointer to task_struct of some other task.
+ *
+ * Description: Return true if the nearest mem_exclusive ancestor
+ * cpusets of tasks @p and current overlap.  Used by oom killer to
+ * determine if task @p's memory usage might impact the memory
+ * available to the current task.
+ *
+ * Acquires cpuset_sem - not suitable for calling from a fast path.
+ **/
+
+int cpuset_excl_nodes_overlap(const struct task_struct *p)
+{
+	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
+	int overlap = 0;		/* do cpusets overlap? */
+
+	down(&cpuset_sem);
+	cs1 = current->cpuset;
+	if (!cs1)
+		goto done;		/* current task exiting */
+	cs2 = p->cpuset;
+	if (!cs2)
+		goto done;		/* task p is exiting */
+	cs1 = nearest_exclusive_ancestor(cs1);
+	cs2 = nearest_exclusive_ancestor(cs2);
+	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+done:
+	up(&cpuset_sem);
+
+	return overlap;
+}
+
 /*
  * proc_cpuset_show()
  *  - Print tasks cpuset path into seq_file.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a1d46502938..5ec8da12cfd9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -20,6 +20,7 @@
 #include <linux/swap.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
+#include <linux/cpuset.h>
 
 /* #define DEBUG */
 
@@ -152,6 +153,10 @@ static struct task_struct * select_bad_process(void)
 			continue;
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
+		/* If p's nodes don't overlap ours, it won't help to kill p. */
+		if (!cpuset_excl_nodes_overlap(p))
+			continue;
+
 		/*
 		 * This is in the process of releasing memory so for wait it
 		 * to finish before killing some other task by mistake.
-- 
cgit v1.2.3-59-g8ed1b


From 9c1cfda20a508b181bdda8c0045f7c0c333880a5 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Tue, 6 Sep 2005 15:18:14 -0700
Subject: [PATCH] cpusets: Move the ia64 domain setup code to the generic code

Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/Makefile    |   2 +-
 arch/ia64/kernel/domain.c    | 444 -------------------------------------------
 include/asm-ia64/processor.h |   3 -
 include/asm-ia64/topology.h  |  23 ---
 include/linux/sched.h        |   7 -
 include/linux/topology.h     |  23 +++
 kernel/sched.c               | 290 ++++++++++++++++++++++------
 7 files changed, 260 insertions(+), 532 deletions(-)
 delete mode 100644 arch/ia64/kernel/domain.c

(limited to 'kernel')

diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index b242594be55b..307514f7a282 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
 obj-$(CONFIG_IA64_PALINFO)	+= palinfo.o
 obj-$(CONFIG_IOSAPIC)		+= iosapic.o
 obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_SMP)		+= smp.o smpboot.o domain.o
+obj-$(CONFIG_SMP)		+= smp.o smpboot.o
 obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_PERFMON)		+= perfmon_default_smpl.o
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
deleted file mode 100644
index e907109983f1..000000000000
--- a/arch/ia64/kernel/domain.c
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * arch/ia64/kernel/domain.c
- * Architecture specific sched-domains builder.
- *
- * Copyright (C) 2004 Jesse Barnes
- * Copyright (C) 2004 Silicon Graphics, Inc.
- */
-
-#include <linux/sched.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-#define SD_NODES_PER_DOMAIN 16
-
-#ifdef CONFIG_NUMA
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain.  Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, unsigned long *used_nodes)
-{
-	int i, n, val, min_val, best_node = 0;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (test_bit(n, used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	set_bit(best_node, used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static cpumask_t sched_domain_node_span(int node)
-{
-	int i;
-	cpumask_t span, nodemask;
-	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
-	cpus_clear(span);
-	bitmap_zero(used_nodes, MAX_NUMNODES);
-
-	nodemask = node_to_cpumask(node);
-	cpus_or(span, span, nodemask);
-	set_bit(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, used_nodes);
-		nodemask = node_to_cpumask(next_node);
-		cpus_or(span, span, nodemask);
-	}
-
-	return span;
-}
-#endif
-
-/*
- * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
- * can switch it on easily if needed.
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-static int cpu_to_cpu_group(int cpu)
-{
-	return cpu;
-}
-#endif
-
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-static int cpu_to_phys_group(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
-	return first_cpu(cpu_sibling_map[cpu]);
-#else
-	return cpu;
-#endif
-}
-
-#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
-
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
-
-static int cpu_to_allnodes_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-void build_sched_domains(const cpumask_t *cpu_map)
-{
-	int i;
-#ifdef CONFIG_NUMA
-	struct sched_group **sched_group_nodes = NULL;
-	struct sched_group *sched_group_allnodes = NULL;
-
-	/*
-	 * Allocate the per-node list of sched groups
-	 */
-	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
-					   GFP_ATOMIC);
-	if (!sched_group_nodes) {
-		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		return;
-	}
-	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
-#endif
-
-	/*
-	 * Set up domains for cpus specified by the cpu_map.
-	 */
-	for_each_cpu_mask(i, *cpu_map) {
-		int group;
-		struct sched_domain *sd = NULL, *p;
-		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-
-#ifdef CONFIG_NUMA
-		if (cpus_weight(*cpu_map)
-				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
-			if (!sched_group_allnodes) {
-				sched_group_allnodes
-					= kmalloc(sizeof(struct sched_group)
-							* MAX_NUMNODES,
-						  GFP_KERNEL);
-				if (!sched_group_allnodes) {
-					printk(KERN_WARNING
-					"Can not alloc allnodes sched group\n");
-					break;
-				}
-				sched_group_allnodes_bycpu[i]
-						= sched_group_allnodes;
-			}
-			sd = &per_cpu(allnodes_domains, i);
-			*sd = SD_ALLNODES_INIT;
-			sd->span = *cpu_map;
-			group = cpu_to_allnodes_group(i);
-			sd->groups = &sched_group_allnodes[group];
-			p = sd;
-		} else
-			p = NULL;
-
-		sd = &per_cpu(node_domains, i);
-		*sd = SD_NODE_INIT;
-		sd->span = sched_domain_node_span(cpu_to_node(i));
-		sd->parent = p;
-		cpus_and(sd->span, sd->span, *cpu_map);
-#endif
-
-		p = sd;
-		sd = &per_cpu(phys_domains, i);
-		group = cpu_to_phys_group(i);
-		*sd = SD_CPU_INIT;
-		sd->span = nodemask;
-		sd->parent = p;
-		sd->groups = &sched_group_phys[group];
-
-#ifdef CONFIG_SCHED_SMT
-		p = sd;
-		sd = &per_cpu(cpu_domains, i);
-		group = cpu_to_cpu_group(i);
-		*sd = SD_SIBLING_INIT;
-		sd->span = cpu_sibling_map[i];
-		cpus_and(sd->span, sd->span, *cpu_map);
-		sd->parent = p;
-		sd->groups = &sched_group_cpus[group];
-#endif
-	}
-
-#ifdef CONFIG_SCHED_SMT
-	/* Set up CPU (sibling) groups */
-	for_each_cpu_mask(i, *cpu_map) {
-		cpumask_t this_sibling_map = cpu_sibling_map[i];
-		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
-		if (i != first_cpu(this_sibling_map))
-			continue;
-
-		init_sched_build_groups(sched_group_cpus, this_sibling_map,
-						&cpu_to_cpu_group);
-	}
-#endif
-
-	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t nodemask = node_to_cpumask(i);
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
-			continue;
-
-		init_sched_build_groups(sched_group_phys, nodemask,
-						&cpu_to_phys_group);
-	}
-
-#ifdef CONFIG_NUMA
-	if (sched_group_allnodes)
-		init_sched_build_groups(sched_group_allnodes, *cpu_map,
-					&cpu_to_allnodes_group);
-
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		/* Set up node groups */
-		struct sched_group *sg, *prev;
-		cpumask_t nodemask = node_to_cpumask(i);
-		cpumask_t domainspan;
-		cpumask_t covered = CPU_MASK_NONE;
-		int j;
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask)) {
-			sched_group_nodes[i] = NULL;
-			continue;
-		}
-
-		domainspan = sched_domain_node_span(i);
-		cpus_and(domainspan, domainspan, *cpu_map);
-
-		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
-		sched_group_nodes[i] = sg;
-		for_each_cpu_mask(j, nodemask) {
-			struct sched_domain *sd;
-			sd = &per_cpu(node_domains, j);
-			sd->groups = sg;
-			if (sd->groups == NULL) {
-				/* Turn off balancing if we have no groups */
-				sd->flags = 0;
-			}
-		}
-		if (!sg) {
-			printk(KERN_WARNING
-			"Can not alloc domain group for node %d\n", i);
-			continue;
-		}
-		sg->cpu_power = 0;
-		sg->cpumask = nodemask;
-		cpus_or(covered, covered, nodemask);
-		prev = sg;
-
-		for (j = 0; j < MAX_NUMNODES; j++) {
-			cpumask_t tmp, notcovered;
-			int n = (i + j) % MAX_NUMNODES;
-
-			cpus_complement(notcovered, covered);
-			cpus_and(tmp, notcovered, *cpu_map);
-			cpus_and(tmp, tmp, domainspan);
-			if (cpus_empty(tmp))
-				break;
-
-			nodemask = node_to_cpumask(n);
-			cpus_and(tmp, tmp, nodemask);
-			if (cpus_empty(tmp))
-				continue;
-
-			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
-			if (!sg) {
-				printk(KERN_WARNING
-				"Can not alloc domain group for node %d\n", j);
-				break;
-			}
-			sg->cpu_power = 0;
-			sg->cpumask = tmp;
-			cpus_or(covered, covered, tmp);
-			prev->next = sg;
-			prev = sg;
-		}
-		prev->next = sched_group_nodes[i];
-	}
-#endif
-
-	/* Calculate CPU power for physical packages and nodes */
-	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
-		power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
-
-		sd = &per_cpu(phys_domains, i);
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-		sd->groups->cpu_power = power;
-
-#ifdef CONFIG_NUMA
-		sd = &per_cpu(allnodes_domains, i);
-		if (sd->groups) {
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-			sd->groups->cpu_power = power;
-		}
-#endif
-	}
-
-#ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		struct sched_group *sg = sched_group_nodes[i];
-		int j;
-
-		if (sg == NULL)
-			continue;
-next_sg:
-		for_each_cpu_mask(j, sg->cpumask) {
-			struct sched_domain *sd;
-			int power;
-
-			sd = &per_cpu(phys_domains, j);
-			if (j != first_cpu(sd->groups->cpumask)) {
-				/*
-				 * Only add "power" once for each
-				 * physical package.
-				 */
-				continue;
-			}
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-
-			sg->cpu_power += power;
-		}
-		sg = sg->next;
-		if (sg != sched_group_nodes[i])
-			goto next_sg;
-	}
-#endif
-
-	/* Attach the domains */
-	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
-#else
-		sd = &per_cpu(phys_domains, i);
-#endif
-		cpu_attach_domain(sd, i);
-	}
-}
-/*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
- */
-void arch_init_sched_domains(const cpumask_t *cpu_map)
-{
-	cpumask_t cpu_default_map;
-
-	/*
-	 * Setup mask for cpus without special case scheduling requirements.
-	 * For now this just excludes isolated cpus, but could be used to
-	 * exclude other special cases in the future.
-	 */
-	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-
-	build_sched_domains(&cpu_default_map);
-}
-
-void arch_destroy_sched_domains(const cpumask_t *cpu_map)
-{
-#ifdef CONFIG_NUMA
-	int i;
-	int cpu;
-
-	for_each_cpu_mask(cpu, *cpu_map) {
-		struct sched_group *sched_group_allnodes
-			= sched_group_allnodes_bycpu[cpu];
-		struct sched_group **sched_group_nodes
-			= sched_group_nodes_bycpu[cpu];
-
-		if (sched_group_allnodes) {
-			kfree(sched_group_allnodes);
-			sched_group_allnodes_bycpu[cpu] = NULL;
-		}
-
-		if (!sched_group_nodes)
-			continue;
-
-		for (i = 0; i < MAX_NUMNODES; i++) {
-			cpumask_t nodemask = node_to_cpumask(i);
-			struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-			cpus_and(nodemask, nodemask, *cpu_map);
-			if (cpus_empty(nodemask))
-				continue;
-
-			if (sg == NULL)
-				continue;
-			sg = sg->next;
-next_sg:
-			oldsg = sg;
-			sg = sg->next;
-			kfree(oldsg);
-			if (oldsg != sched_group_nodes[i])
-				goto next_sg;
-		}
-		kfree(sched_group_nodes);
-		sched_group_nodes_bycpu[cpu] = NULL;
-	}
-#endif
-}
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 91bbd1f22461..94e07e727395 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -20,9 +20,6 @@
 #include <asm/ptrace.h>
 #include <asm/ustack.h>
 
-/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */
-#define ARCH_HAS_SCHED_DOMAIN
-
 #define IA64_NUM_DBG_REGS	8
 /*
  * Limits for PMC and PMD are set to less than maximum architected values
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 399bc29729fd..a9f738bf18a7 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -98,29 +98,6 @@ void build_cpu_to_node_map(void);
 	.nr_balance_failed	= 0,			\
 }
 
-/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) {	\
-	.span			= CPU_MASK_NONE,	\
-	.parent			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 64,			\
-	.max_interval		= 64*num_online_cpus(),	\
-	.busy_factor		= 128,			\
-	.imbalance_pct		= 133,			\
-	.cache_hot_time		= (10*1000000),		\
-	.cache_nice_tries	= 1,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 3,			\
-	.newidle_idx		= 0, /* unused */	\
-	.wake_idx		= 0, /* unused */	\
-	.forkexec_idx		= 0, /* unused */	\
-	.per_cpu_gain		= 100,			\
-	.flags			= SD_LOAD_BALANCE,	\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 64,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #endif /* CONFIG_NUMA */
 
 #include <asm-generic/topology.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b5a22ea80045..ea1b5f32ec5c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -564,13 +564,6 @@ struct sched_domain {
 
 extern void partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);
-#ifdef ARCH_HAS_SCHED_DOMAIN
-/* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
-extern cpumask_t cpu_isolated_map;
-extern void init_sched_build_groups(struct sched_group groups[],
-	                        cpumask_t span, int (*group_fn)(int cpu));
-extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
-#endif /* ARCH_HAS_SCHED_DOMAIN */
 #endif /* CONFIG_SMP */
 
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0320225e96da..3df1d474e5c5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -135,6 +135,29 @@
 }
 #endif
 
+/* sched_domains SD_ALLNODES_INIT for NUMA machines */
+#define SD_ALLNODES_INIT (struct sched_domain) {	\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 64,			\
+	.max_interval		= 64*num_online_cpus(),	\
+	.busy_factor		= 128,			\
+	.imbalance_pct		= 133,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 3,			\
+	.newidle_idx		= 0, /* unused */	\
+	.wake_idx		= 0, /* unused */	\
+	.forkexec_idx		= 0, /* unused */	\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_LOAD_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 64,			\
+	.nr_balance_failed	= 0,			\
+}
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..50860ad5b624 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void init_sched_build_groups(struct sched_group groups[],
-			cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+				    int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[],
 	last->next = first;
 }
 
+#define SD_NODES_PER_DOMAIN 16
 
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Start at @node */
+		n = (node + i) % MAX_NUMNODES;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ *        (the span covers at most SD_NODES_PER_DOMAIN nodes, nearest first)
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
 /*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
  */
-static void check_sibling_maps(void)
-{
-	int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group *sched_group_nodes[MAX_NUMNODES];
 
-	for_each_online_cpu(i) {
-		for_each_cpu_mask(j, cpu_sibling_map[i]) {
-			if (cpu_to_node(i) != cpu_to_node(j)) {
-				printk(KERN_INFO "warning: CPU %d siblings map "
-					"to different node - isolating "
-					"them.\n", i);
-				cpu_sibling_map[i] = cpumask_of_cpu(i);
-				break;
-			}
-		}
-	}
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
 }
 #endif
 
@@ -4928,7 +4985,7 @@ static void check_sibling_maps(void)
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
 
@@ -4943,11 +5000,22 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
+		if (num_online_cpus()
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
 		sd = &per_cpu(node_domains, i);
-		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = *cpu_map;
-		sd->groups = &sched_group_nodes[group];
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
 		p = sd;
@@ -4972,7 +5040,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
 		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5065,74 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, *cpu_map,
-					&cpu_to_node_group);
+	init_sched_build_groups(sched_group_allnodes, *cpu_map,
+				&cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", j);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		prev->next = sched_group_nodes[i];
+	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5151,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		sd->groups->cpu_power = power;
 
 #ifdef CONFIG_NUMA
-		if (i == first_cpu(sd->groups->cpumask)) {
-			/* Only add "power" once for each physical package. */
-			sd = &per_cpu(node_domains, i);
-			sd->groups->cpu_power += power;
+		sd = &per_cpu(allnodes_domains, i);
+		if (sd->groups) {
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+			sd->groups->cpu_power = power;
 		}
 #endif
 	}
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *sg = sched_group_nodes[i];
+		int j;
+
+		if (sg == NULL)
+			continue;
+next_sg:
+		for_each_cpu_mask(j, sg->cpumask) {
+			struct sched_domain *sd;
+			int power;
+
+			sd = &per_cpu(phys_domains, j);
+			if (j != first_cpu(sd->groups->cpumask)) {
+				/*
+				 * Only add "power" once for each
+				 * physical package.
+				 */
+				continue;
+			}
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+
+			sg->cpu_power += power;
+		}
+		sg = sg->next;
+		if (sg != sched_group_nodes[i])
+			goto next_sg;
+	}
+#endif
+
 	/* Attach the domains */
 	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
@@ -5039,13 +5205,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
 	 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5221,29 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-	/* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+	int i;
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		cpumask_t nodemask = node_to_cpumask(i);
+		struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		if (sg == NULL)
+			continue;
+		sg = sg->next;
+next_sg:
+		oldsg = sg;
+		sg = sg->next;
+		kfree(oldsg);
+		if (oldsg != sched_group_nodes[i])
+			goto next_sg;
+		sched_group_nodes[i] = NULL;
+	}
+#endif
+}
 
 /*
  * Detach sched domains from a group of cpus specified in cpu_map
-- 
cgit v1.2.3-59-g8ed1b


From d1b551386a5f3f50a5003b691f819b07f8e6f034 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Tue, 6 Sep 2005 15:18:14 -0700
Subject: [PATCH] cpusets: fix the "dynamic sched domains" bug

For a NUMA system with multiple CPUs per node, declaring a cpu-exclusive
cpuset that includes only some, but not all, of the CPUs in a node will mangle
the sched domain structures.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 89 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 50860ad5b624..9508527845df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4970,10 +4970,10 @@ static int cpu_to_phys_group(int cpu)
  * gets dynamically allocated.
  */
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group *sched_group_nodes[MAX_NUMNODES];
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
 
 static int cpu_to_allnodes_group(int cpu)
 {
@@ -4988,6 +4988,21 @@ static int cpu_to_allnodes_group(int cpu)
 void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	struct sched_group *sched_group_allnodes = NULL;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_ATOMIC);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return;
+	}
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
@@ -5000,8 +5015,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
-		if (num_online_cpus()
+		if (cpus_weight(*cpu_map)
 				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			if (!sched_group_allnodes) {
+				sched_group_allnodes
+					= kmalloc(sizeof(struct sched_group)
+							* MAX_NUMNODES,
+						  GFP_KERNEL);
+				if (!sched_group_allnodes) {
+					printk(KERN_WARNING
+					"Can not alloc allnodes sched group\n");
+					break;
+				}
+				sched_group_allnodes_bycpu[i]
+						= sched_group_allnodes;
+			}
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
 			sd->span = *cpu_map;
@@ -5065,8 +5093,9 @@ void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_allnodes, *cpu_map,
-				&cpu_to_allnodes_group);
+	if (sched_group_allnodes)
+		init_sched_build_groups(sched_group_allnodes, *cpu_map,
+					&cpu_to_allnodes_group);
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		/* Set up node groups */
@@ -5077,8 +5106,10 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		int j;
 
 		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
 			continue;
+		}
 
 		domainspan = sched_domain_node_span(i);
 		cpus_and(domainspan, domainspan, *cpu_map);
@@ -5223,24 +5254,42 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
 #ifdef CONFIG_NUMA
 	int i;
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t nodemask = node_to_cpumask(i);
-		struct sched_group *oldsg, *sg = sched_group_nodes[i];
+	int cpu;
 
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
-			continue;
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
 
-		if (sg == NULL)
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
 			continue;
-		sg = sg->next;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
 next_sg:
-		oldsg = sg;
-		sg = sg->next;
-		kfree(oldsg);
-		if (oldsg != sched_group_nodes[i])
-			goto next_sg;
-		sched_group_nodes[i] = NULL;
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
 	}
 #endif
 }
-- 
cgit v1.2.3-59-g8ed1b


From 0811bab24ff1eecab38110eda7ea7847db95c64e Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Tue, 6 Sep 2005 15:18:15 -0700
Subject: [PATCH] cpusets: re-enable "dynamic sched domains"

Revert the hack introduced last week.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 40c6d801dd66..1f06e7690106 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -628,13 +628,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
  */
 
-/*
- * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
- * Disable letting 'cpu_exclusive' cpusets define dynamic sched
- * domains, until the sched domain can handle partial nodes.
- * Remove this #if hackery when sched domains fixed.
- */
-#if 0
 static void update_cpu_domains(struct cpuset *cur)
 {
 	struct cpuset *c, *par = cur->parent;
@@ -675,11 +668,6 @@ static void update_cpu_domains(struct cpuset *cur)
 	partition_sched_domains(&pspan, &cspan);
 	unlock_cpu_hotplug();
 }
-#else
-static void update_cpu_domains(struct cpuset *cur)
-{
-}
-#endif
 
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
-- 
cgit v1.2.3-59-g8ed1b


From ab8d11beb46f0bd0617e04205c01f5c1fe845b61 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 6 Sep 2005 15:18:24 -0700
Subject: [PATCH] remove duplicated code from proc and ptrace

Extract common code used by ptrace_attach() and may_ptrace_attach()
into a separate function.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c         | 35 ++++-------------------------------
 include/linux/ptrace.h |  1 +
 kernel/ptrace.c        | 41 ++++++++++++++++++++++++++++-------------
 3 files changed, 33 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 24eed139e54e..84751f3f52d5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -346,33 +346,6 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
 	 (task->state == TASK_STOPPED || task->state == TASK_TRACED) && \
 	 security_ptrace(current,task) == 0))
 
-static int may_ptrace_attach(struct task_struct *task)
-{
-	int retval = 0;
-
-	task_lock(task);
-
-	if (!task->mm)
-		goto out;
-	if (((current->uid != task->euid) ||
-	     (current->uid != task->suid) ||
-	     (current->uid != task->uid) ||
-	     (current->gid != task->egid) ||
-	     (current->gid != task->sgid) ||
-	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-		goto out;
-	rmb();
-	if (task->mm->dumpable != 1 && !capable(CAP_SYS_PTRACE))
-		goto out;
-	if (security_ptrace(current, task))
-		goto out;
-
-	retval = 1;
-out:
-	task_unlock(task);
-	return retval;
-}
-
 static int proc_pid_environ(struct task_struct *task, char * buffer)
 {
 	int res = 0;
@@ -382,7 +355,7 @@ static int proc_pid_environ(struct task_struct *task, char * buffer)
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
 		res = access_process_vm(task, mm->env_start, buffer, len, 0);
-		if (!may_ptrace_attach(task))
+		if (!ptrace_may_attach(task))
 			res = -ESRCH;
 		mmput(mm);
 	}
@@ -685,7 +658,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 	int ret = -ESRCH;
 	struct mm_struct *mm;
 
-	if (!MAY_PTRACE(task) || !may_ptrace_attach(task))
+	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		goto out;
 
 	ret = -ENOMEM;
@@ -711,7 +684,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 
 		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 		retval = access_process_vm(task, src, page, this_len, 0);
-		if (!retval || !MAY_PTRACE(task) || !may_ptrace_attach(task)) {
+		if (!retval || !MAY_PTRACE(task) || !ptrace_may_attach(task)) {
 			if (!ret)
 				ret = -EIO;
 			break;
@@ -749,7 +722,7 @@ static ssize_t mem_write(struct file * file, const char * buf,
 	struct task_struct *task = proc_task(file->f_dentry->d_inode);
 	unsigned long dst = *ppos;
 
-	if (!MAY_PTRACE(task) || !may_ptrace_attach(task))
+	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		return -ESRCH;
 
 	page = (char *)__get_free_page(GFP_USER);
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 2afdafb62123..dc6f3647bfbc 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -90,6 +90,7 @@ extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
 extern void ptrace_untrace(struct task_struct *child);
+extern int ptrace_may_attach(struct task_struct *task);
 
 static inline void ptrace_link(struct task_struct *child,
 			       struct task_struct *new_parent)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	return ret;
 }
 
+static int may_attach(struct task_struct *task)
+{
+	if (!task->mm)
+		return -EPERM;
+	if (((current->uid != task->euid) ||
+	     (current->uid != task->suid) ||
+	     (current->uid != task->uid) ||
+	     (current->gid != task->egid) ||
+	     (current->gid != task->sgid) ||
+	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+		return -EPERM;
+	smp_rmb();
+	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+		return -EPERM;
+
+	return security_ptrace(current, task);
+}
+
+int ptrace_may_attach(struct task_struct *task)
+{
+	int err;
+	task_lock(task);
+	err = may_attach(task);
+	task_unlock(task);
+	return !err;
+}
+
 int ptrace_attach(struct task_struct *task)
 {
 	int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
 		goto bad;
 	if (task == current)
 		goto bad;
-	if (!task->mm)
-		goto bad;
-	if(((current->uid != task->euid) ||
-	    (current->uid != task->suid) ||
-	    (current->uid != task->uid) ||
- 	    (current->gid != task->egid) ||
- 	    (current->gid != task->sgid) ||
- 	    (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-		goto bad;
-	smp_rmb();
-	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
-		goto bad;
 	/* the same process cannot be attached many times */
 	if (task->ptrace & PT_PTRACED)
 		goto bad;
-	retval = security_ptrace(current, task);
+	retval = may_attach(task);
 	if (retval)
 		goto bad;
 
-- 
cgit v1.2.3-59-g8ed1b


From dd3927105b6f65afb7dac17682172cdfb86d3f00 Mon Sep 17 00:00:00 2001
From: Pekka J Enberg <penberg@cs.Helsinki.FI>
Date: Tue, 6 Sep 2005 15:18:31 -0700
Subject: [PATCH] introduce and use kzalloc

This patch introduces a kzalloc wrapper and converts kernel/ to use it.  It
saves a little program text.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h | 16 +++++++++++++++-
 kernel/intermodule.c |  3 +--
 kernel/params.c      |  4 ++--
 kernel/power/pm.c    |  3 +--
 kernel/resource.c    |  3 +--
 kernel/workqueue.c   |  3 +--
 mm/slab.c            | 18 ++++++------------
 7 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 80b2dfde2e80..42a6bea58af3 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -99,7 +99,21 @@ found:
 	return __kmalloc(size, flags);
 }
 
-extern void *kcalloc(size_t, size_t, unsigned int __nocast);
+extern void *kzalloc(size_t, unsigned int __nocast);
+
+/**
+ * kcalloc - allocate memory for an array. The memory is set to zero.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate.
+ */
+static inline void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
+{
+	if (n != 0 && size > INT_MAX / n)
+		return NULL;
+	return kzalloc(n * size, flags);
+}
+
 extern void kfree(const void *);
 extern unsigned int ksize(const void *);
 
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
 	struct list_head *tmp;
 	struct inter_module_entry *ime, *ime_new;
 
-	if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
+	if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
 		/* Overloaded kernel, not fatal */
 		printk(KERN_ERR
 			"Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
 		kmalloc_failed = 1;
 		return;
 	}
-	memset(ime_new, 0, sizeof(*ime_new));
 	ime_new->im_name = im_name;
 	ime_new->owner = owner;
 	ime_new->userdata = userdata;
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..fbf173215fd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
 {
 	struct module_kobject *mk;
 
-	mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
-	memset(mk, 0, sizeof(struct module_kobject));
+	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+	BUG_ON(!mk);
 
 	mk->mod = THIS_MODULE;
 	kobj_set_kset_s(mk, module_subsys);
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
 			   unsigned long id,
 			   pm_callback callback)
 {
-	struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
+	struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
 	if (dev) {
-		memset(dev, 0, sizeof(*dev));
 		dev->type = type;
 		dev->id = id;
 		dev->callback = callback;
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
  */
 struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
 {
-	struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
+	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
 
 	if (res) {
-		memset(res, 0, sizeof(*res));
 		res->name = name;
 		res->start = start;
 		res->end = start + n - 1;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a3de837a8ddd..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	struct workqueue_struct *wq;
 	struct task_struct *p;
 
-	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
 		return NULL;
-	memset(wq, 0, sizeof(*wq));
 
 	wq->name = name;
 	/* We don't need the distraction of CPUs appearing and vanishing. */
diff --git a/mm/slab.c b/mm/slab.c
index a9ff4f7f9860..d7c4443991fe 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2558,24 +2558,18 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
 EXPORT_SYMBOL(kmem_cache_free);
 
 /**
- * kcalloc - allocate memory for an array. The memory is set to zero.
- * @n: number of elements.
- * @size: element size.
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  */
-void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
+void *kzalloc(size_t size, unsigned int __nocast flags)
 {
-	void *ret = NULL;
-
-	if (n != 0 && size > INT_MAX / n)
-		return ret;
-
-	ret = kmalloc(n * size, flags);
+	void *ret = kmalloc(size, flags);
 	if (ret)
-		memset(ret, 0, n * size);
+		memset(ret, 0, size);
 	return ret;
 }
-EXPORT_SYMBOL(kcalloc);
+EXPORT_SYMBOL(kzalloc);
 
 /**
  * kfree - free previously allocated memory
-- 
cgit v1.2.3-59-g8ed1b


From d0aaff9796c3310326d10da44fc0faed352a1d29 Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Tue, 6 Sep 2005 15:19:26 -0700
Subject: [PATCH] Kprobes: prevent possible race conditions generic

There are possible race conditions if probes are placed on routines within the
kprobes files or on routines used by kprobes.  For example, if you put a probe
on the get_kprobe() routine, the system can hang while inserting probes on any
routine such as do_fork().  This is because, while inserting a probe on
do_fork(), the register_kprobes() routine grabs the kprobes spin lock and
executes get_kprobe(); to handle the probe on get_kprobe(), kprobes_handler()
gets executed, tries to grab the kprobes spin lock, and spins forever.  This
patch avoids such possible race conditions by preventing probes on routines
within the kprobes file and on routines used by kprobes.

I have modified the patches as per Andi Kleen's suggestion to move kprobes
routines and other routines used by kprobes to a separate section
.kprobes.text.

Also moved page fault and exception handlers, general protection fault to
.kprobes.text section.

These patches have been tested on i386, x86_64 and ppc64 architectures, also
compiled on ia64 and sparc64 architectures.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/sections.h    |  1 +
 include/asm-generic/vmlinux.lds.h |  6 ++++
 include/linux/kprobes.h           |  3 ++
 include/linux/linkage.h           |  7 ++++
 kernel/kprobes.c                  | 72 +++++++++++++++++++++++----------------
 5 files changed, 60 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 450eae22c39a..886dbd116899 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -12,5 +12,6 @@ extern char _sextratext[] __attribute__((weak));
 extern char _eextratext[] __attribute__((weak));
 extern char _end[];
 extern char __per_cpu_start[], __per_cpu_end[];
+extern char __kprobes_text_start[], __kprobes_text_end[];
 
 #endif /* _ASM_GENERIC_SECTIONS_H_ */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 3fa94288aa93..6f857be2b644 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -97,3 +97,9 @@
 		VMLINUX_SYMBOL(__lock_text_start) = .;			\
 		*(.spinlock.text)					\
 		VMLINUX_SYMBOL(__lock_text_end) = .;
+
+#define KPROBES_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__kprobes_text_start) = .;		\
+		*(.kprobes.text)					\
+		VMLINUX_SYMBOL(__kprobes_text_end) = .;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e050fc2d4c26..e30afdca7917 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -42,6 +42,9 @@
 #define KPROBE_REENTER		0x00000004
 #define KPROBE_HIT_SSDONE	0x00000008
 
+/* Attach to any function on which probes must not be inserted */
+#define __kprobes	__attribute__((__section__(".kprobes.text")))
+
 struct kprobe;
 struct pt_regs;
 struct kretprobe;
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 338f7795d8a0..147eb01e0d4b 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -33,6 +33,13 @@
   ALIGN; \
   name:
 
+#define KPROBE_ENTRY(name) \
+  .section .kprobes.text, "ax"; \
+  .globl name; \
+  ALIGN; \
+  name:
+
+
 #endif
 
 #define NORET_TYPE    /**/
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..3b7653f2e7ae 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
 #include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
  * get_insn_slot() - Find a slot on an executable page for an instruction.
  * We allocate an executable page if there's no room on existing ones.
  */
-kprobe_opcode_t *get_insn_slot(void)
+kprobe_opcode_t __kprobes *get_insn_slot(void)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
 	return kip->insns;
 }
 
-void free_insn_slot(kprobe_opcode_t *slot)
+void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -152,20 +153,20 @@ void free_insn_slot(kprobe_opcode_t *slot)
 }
 
 /* Locks kprobe: irqs must be disabled */
-void lock_kprobes(void)
+void __kprobes lock_kprobes(void)
 {
 	spin_lock(&kprobe_lock);
 	kprobe_cpu = smp_processor_id();
 }
 
-void unlock_kprobes(void)
+void __kprobes unlock_kprobes(void)
 {
 	kprobe_cpu = NR_CPUS;
 	spin_unlock(&kprobe_lock);
 }
 
 /* You have to be holding the kprobe_lock */
-struct kprobe *get_kprobe(void *addr)
+struct kprobe __kprobes *get_kprobe(void *addr)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -183,7 +184,7 @@ struct kprobe *get_kprobe(void *addr)
  * Aggregate handlers for multiple kprobes support - these handlers
  * take care of invoking the individual kprobe handlers on p->list
  */
-static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp;
 
@@ -198,8 +199,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
-			      unsigned long flags)
+static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+					unsigned long flags)
 {
 	struct kprobe *kp;
 
@@ -213,8 +214,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 	return;
 }
 
-static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
-			      int trapnr)
+static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+					int trapnr)
 {
 	/*
 	 * if we faulted "during" the execution of a user specified
@@ -227,7 +228,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
 	return 0;
 }
 
-static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp = curr_kprobe;
 	if (curr_kprobe && kp->break_handler) {
@@ -240,7 +241,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
+struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
 {
 	struct hlist_node *node;
 	struct kretprobe_instance *ri;
@@ -249,7 +250,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
 	return NULL;
 }
 
-static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
+static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
+							      *rp)
 {
 	struct hlist_node *node;
 	struct kretprobe_instance *ri;
@@ -258,7 +260,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
 	return NULL;
 }
 
-void add_rp_inst(struct kretprobe_instance *ri)
+void __kprobes add_rp_inst(struct kretprobe_instance *ri)
 {
 	/*
 	 * Remove rp inst off the free list -
@@ -276,7 +278,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
 	hlist_add_head(&ri->uflist, &ri->rp->used_instances);
 }
 
-void recycle_rp_inst(struct kretprobe_instance *ri)
+void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
 {
 	/* remove rp inst off the rprobe_inst_table */
 	hlist_del(&ri->hlist);
@@ -291,7 +293,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
 		kfree(ri);
 }
 
-struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
+struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
 {
 	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
 }
@@ -302,7 +304,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
  * instances associated with this task. These left over instances represent
  * probed functions that have been called but will never return.
  */
-void kprobe_flush_task(struct task_struct *tk)
+void __kprobes kprobe_flush_task(struct task_struct *tk)
 {
         struct kretprobe_instance *ri;
         struct hlist_head *head;
@@ -322,7 +324,8 @@ void kprobe_flush_task(struct task_struct *tk)
  * This kprobe pre_handler is registered with every kretprobe. When probe
  * hits it will set up the return probe.
  */
-static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+					   struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
 
@@ -353,7 +356,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
 * Add the new probe to old_p->list. Fail if this is the
 * second jprobe at the address - two jprobes can't coexist
 */
-static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
 {
         struct kprobe *kp;
 
@@ -395,7 +398,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
  * the intricacies
  * TODO: Move kcalloc outside the spinlock
  */
-static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+					  struct kprobe *p)
 {
 	int ret = 0;
 	struct kprobe *ap;
@@ -434,15 +438,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
 		spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
-int register_kprobe(struct kprobe *p)
+static int __kprobes in_kprobes_functions(unsigned long addr)
+{
+	if (addr >= (unsigned long)__kprobes_text_start
+		&& addr < (unsigned long)__kprobes_text_end)
+		return -EINVAL;
+	return 0;
+}
+
+int __kprobes register_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	unsigned long flags = 0;
 	struct kprobe *old_p;
 
-	if ((ret = arch_prepare_kprobe(p)) != 0) {
+	if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
+		return ret;
+	if ((ret = arch_prepare_kprobe(p)) != 0)
 		goto rm_kprobe;
-	}
+
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
 	p->nmissed = 0;
@@ -466,7 +480,7 @@ rm_kprobe:
 	return ret;
 }
 
-void unregister_kprobe(struct kprobe *p)
+void __kprobes unregister_kprobe(struct kprobe *p)
 {
 	unsigned long flags;
 	struct kprobe *old_p;
@@ -487,7 +501,7 @@ static struct notifier_block kprobe_exceptions_nb = {
 	.priority = 0x7fffffff /* we need to notified first */
 };
 
-int register_jprobe(struct jprobe *jp)
+int __kprobes register_jprobe(struct jprobe *jp)
 {
 	/* Todo: Verify probepoint is a function entry point */
 	jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +510,14 @@ int register_jprobe(struct jprobe *jp)
 	return register_kprobe(&jp->kp);
 }
 
-void unregister_jprobe(struct jprobe *jp)
+void __kprobes unregister_jprobe(struct jprobe *jp)
 {
 	unregister_kprobe(&jp->kp);
 }
 
 #ifdef ARCH_SUPPORTS_KRETPROBES
 
-int register_kretprobe(struct kretprobe *rp)
+int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	int ret = 0;
 	struct kretprobe_instance *inst;
@@ -540,14 +554,14 @@ int register_kretprobe(struct kretprobe *rp)
 
 #else /* ARCH_SUPPORTS_KRETPROBES */
 
-int register_kretprobe(struct kretprobe *rp)
+int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	return -ENOSYS;
 }
 
 #endif /* ARCH_SUPPORTS_KRETPROBES */
 
-void unregister_kretprobe(struct kretprobe *rp)
+void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 	unsigned long flags;
 	struct kretprobe_instance *ri;
-- 
cgit v1.2.3-59-g8ed1b


From deac66ae454cacf942c051b86d9232af546fb187 Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Date: Tue, 6 Sep 2005 15:19:35 -0700
Subject: [PATCH] kprobes: fix bug when probed on task and isr functions

This patch fixes a race condition wherein the system used to hang or sometimes
crash within minutes when kprobes are inserted on an ISR routine and a task
routine.

The fix has been stress tested on i386, ia64, ppc64 and x86_64.  To
reproduce the problem, insert kprobes on the schedule() and do_IRQ() functions
and you should see a hang or system crash.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c   |  3 ++-
 arch/ia64/kernel/kprobes.c   | 22 +++++++++++++++++++---
 arch/ppc64/kernel/kprobes.c  | 11 ++++++-----
 arch/x86_64/kernel/kprobes.c |  3 ++-
 include/asm-ia64/kprobes.h   |  1 +
 include/asm-ppc64/kprobes.h  |  3 +++
 kernel/kprobes.c             | 22 ++++++++++++++++++++++
 7 files changed, 55 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index e5cec32018a5..6345b430b105 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -177,7 +177,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if (kprobe_status == KPROBE_HIT_SS &&
+				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
 				regs->eflags &= ~TF_MASK;
 				regs->eflags |= kprobe_saved_eflags;
 				unlock_kprobes();
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 4b1bd539ec47..471086b808a4 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -95,6 +95,17 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
 	p->ainsn.inst_flag = 0;
 	p->ainsn.target_br_reg = 0;
 
+	/* Check for Break instruction
+ 	 * Bits 37:40 Major opcode to be zero
+	 * Bits 27:32 X6 to be zero
+	 * Bits 33:35 X3 to be zero
+	 */
+	if ((!major_opcode) && (!((kprobe_inst >> 27) & 0x1FF)) ) {
+		/* is a break instruction */
+	 	p->ainsn.inst_flag |= INST_FLAG_BREAK_INST;
+		return;
+	}
+
 	if (bundle_encoding[template][slot] == B) {
 		switch (major_opcode) {
 		  case INDIRECT_CALL_OPCODE:
@@ -542,8 +553,11 @@ static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
 	unsigned long bundle_addr = (unsigned long) &p->opcode.bundle;
 	unsigned long slot = (unsigned long)p->addr & 0xf;
 
-	/* Update instruction pointer (IIP) and slot number (IPSR.ri) */
-	regs->cr_iip = bundle_addr & ~0xFULL;
+	/* single step inline if break instruction */
+	if (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)
+		regs->cr_iip = (unsigned long)p->addr & ~0xFULL;
+	else
+		regs->cr_iip = bundle_addr & ~0xFULL;
 
 	if (slot > 2)
 		slot = 0;
@@ -599,7 +613,9 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 	if (kprobe_running()) {
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if ( (kprobe_status == KPROBE_HIT_SS) &&
+	 		     (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) {
+  				ia64_psr(regs)->ss = 0;
 				unlock_kprobes();
 				goto no_kprobe;
 			}
diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index 591e4b67b5a5..7e80d49c589a 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -102,7 +102,7 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 	regs->msr |= MSR_SE;
 
 	/* single step inline if it is a trap variant */
-	if (IS_TW(insn) || IS_TD(insn) || IS_TWI(insn) || IS_TDI(insn))
+	if (is_trap(insn))
 		regs->nip = (unsigned long)p->addr;
 	else
 		regs->nip = (unsigned long)p->ainsn.insn;
@@ -152,7 +152,9 @@ static inline int kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			kprobe_opcode_t insn = *p->ainsn.insn;
+			if (kprobe_status == KPROBE_HIT_SS &&
+					is_trap(insn)) {
 				regs->msr &= ~MSR_SE;
 				regs->msr |= kprobe_saved_msr;
 				unlock_kprobes();
@@ -192,8 +194,7 @@ static inline int kprobe_handler(struct pt_regs *regs)
 			 * trap variant, it could belong to someone else
 			 */
 			kprobe_opcode_t cur_insn = *addr;
-			if (IS_TW(cur_insn) || IS_TD(cur_insn) ||
-					IS_TWI(cur_insn) || IS_TDI(cur_insn))
+			if (is_trap(cur_insn))
 		       		goto no_kprobe;
 			/*
 			 * The breakpoint instruction was removed right
@@ -403,7 +404,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	default:
 		break;
 	}
-	preempt_enable();
+	preempt_enable_no_resched();
 	return ret;
 }
 
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 2d7658fbbb28..df08c43276a0 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -311,7 +311,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if (kprobe_status == KPROBE_HIT_SS &&
+				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
 				regs->eflags &= ~TF_MASK;
 				regs->eflags |= kprobe_saved_rflags;
 				unlock_kprobes();
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index bf36a32e37e4..573a3574a24f 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -92,6 +92,7 @@ struct arch_specific_insn {
 	kprobe_opcode_t insn;
  #define INST_FLAG_FIX_RELATIVE_IP_ADDR		1
  #define INST_FLAG_FIX_BRANCH_REG		2
+ #define INST_FLAG_BREAK_INST			4
  	unsigned long inst_flag;
  	unsigned short target_br_reg;
 };
diff --git a/include/asm-ppc64/kprobes.h b/include/asm-ppc64/kprobes.h
index 0802919c3235..d9129d2b038e 100644
--- a/include/asm-ppc64/kprobes.h
+++ b/include/asm-ppc64/kprobes.h
@@ -42,6 +42,9 @@ typedef unsigned int kprobe_opcode_t;
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)((func_descr_t *)pentry)
 
+#define is_trap(instr)	(IS_TW(instr) || IS_TD(instr) || \
+			IS_TWI(instr) || IS_TDI(instr))
+
 #define ARCH_SUPPORTS_KRETPROBES
 void kretprobe_trampoline(void);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3b7653f2e7ae..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -155,14 +155,36 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 /* Locks kprobe: irqs must be disabled */
 void __kprobes lock_kprobes(void)
 {
+	unsigned long flags = 0;
+
+	/* Avoiding local interrupts to happen right after we take the kprobe_lock
+	 * and before we get a chance to update kprobe_cpu, this to prevent
+	 * deadlock when we have a kprobe on ISR routine and a kprobe on task
+	 * routine
+	 */
+	local_irq_save(flags);
+
 	spin_lock(&kprobe_lock);
 	kprobe_cpu = smp_processor_id();
+
+ 	local_irq_restore(flags);
 }
 
 void __kprobes unlock_kprobes(void)
 {
+	unsigned long flags = 0;
+
+	/* Avoiding local interrupts to happen right after we update
+	 * kprobe_cpu and before we get a chance to release kprobe_lock,
+	 * this to prevent deadlock when we have a kprobe on ISR routine and
+	 * a kprobe on task routine
+	 */
+	local_irq_save(flags);
+
 	kprobe_cpu = NR_CPUS;
 	spin_unlock(&kprobe_lock);
+
+ 	local_irq_restore(flags);
 }
 
 /* You have to be holding the kprobe_lock */
-- 
cgit v1.2.3-59-g8ed1b