From 85b6bce3658a823aa169586fe71ffba0f12ccc71 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Fri, 31 Mar 2006 02:30:06 -0800
Subject: [PATCH] Fix suspend with traced tasks

strace /bin/bash misbehaves after resume; this fixes it.

(akpm: it's scary calling refrigerator() in state TASK_TRACED, but it seems to
do the right thing).

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/process.c | 3 +--
 kernel/signal.c        | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ac7c35fad77..b2a5f671d6cd 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -26,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
 	    (p->flags & PF_NOFREEZE) ||
 	    (p->exit_state == EXIT_ZOMBIE) ||
 	    (p->exit_state == EXIT_DEAD) ||
-	    (p->state == TASK_STOPPED) ||
-	    (p->state == TASK_TRACED))
+	    (p->state == TASK_STOPPED))
 		return 0;
 	return 1;
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 4922928d91f6..92025b108791 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1560,6 +1560,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 	/* Let the debugger run.  */
 	set_current_state(TASK_TRACED);
 	spin_unlock_irq(&current->sighand->siglock);
+	try_to_freeze();
 	read_lock(&tasklist_lock);
 	if (likely(current->ptrace & PT_PTRACED) &&
 	    likely(current->parent != current->real_parent ||
-- 
cgit v1.3-6-gb490


From 3691c5199e8a4be1c7a91b5ab925db5feb866e19 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 31 Mar 2006 02:30:30 -0800
Subject: [PATCH] kill __init_timer_base in favor of boot_tvec_bases

Commit a4a6198b80cf82eb8160603c98da218d1bd5e104:
	[PATCH] tvec_bases too large for per-cpu data

introduced "struct tvec_t_base_s boot_tvec_bases" which is visible at
compile time.  This means we can kill __init_timer_base and move
timer_base_s's content into tvec_t_base_s.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timer.h |  8 ++---
 kernel/timer.c        | 84 +++++++++++++++++++++------------------------------
 2 files changed, 39 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index b5caabca553c..0a485beba9f5 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -6,7 +6,7 @@
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 
-struct timer_base_s;
+struct tvec_t_base_s;
 
 struct timer_list {
 	struct list_head entry;
@@ -15,16 +15,16 @@ struct timer_list {
 	void (*function)(unsigned long);
 	unsigned long data;
 
-	struct timer_base_s *base;
+	struct tvec_t_base_s *base;
 };
 
-extern struct timer_base_s __init_timer_base;
+extern struct tvec_t_base_s boot_tvec_bases;
 
 #define TIMER_INITIALIZER(_function, _expires, _data) {		\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = &__init_timer_base,			\
+		.base = &boot_tvec_bases,			\
 	}
 
 #define DEFINE_TIMER(_name, _function, _expires, _data)		\
diff --git a/kernel/timer.c b/kernel/timer.c
index ab189dd187cb..b04dc03b5934 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
 /*
  * per-CPU timer vector definitions:
  */
-
 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
 #define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
 #define TVN_MASK (TVN_SIZE - 1)
 #define TVR_MASK (TVR_SIZE - 1)
 
-struct timer_base_s {
-	spinlock_t lock;
-	struct timer_list *running_timer;
-};
-
 typedef struct tvec_s {
 	struct list_head vec[TVN_SIZE];
 } tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
 } tvec_root_t;
 
 struct tvec_t_base_s {
-	struct timer_base_s t_base;
+	spinlock_t lock;
+	struct timer_list *running_timer;
 	unsigned long timer_jiffies;
 	tvec_root_t tv1;
 	tvec_t tv2;
@@ -87,13 +82,14 @@ struct tvec_t_base_s {
 
 typedef struct tvec_t_base_s tvec_base_t;
 static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
-static tvec_base_t boot_tvec_bases;
+tvec_base_t boot_tvec_bases;
+EXPORT_SYMBOL(boot_tvec_bases);
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
 {
 #ifdef CONFIG_SMP
-	base->t_base.running_timer = timer;
+	base->running_timer = timer;
 #endif
 }
 
@@ -139,15 +135,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 	list_add_tail(&timer->entry, vec);
 }
 
-typedef struct timer_base_s timer_base_t;
-/*
- * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
- * at compile time, and we need timer->base to lock the timer.
- */
-timer_base_t __init_timer_base
-	____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
-EXPORT_SYMBOL(__init_timer_base);
-
 /***
  * init_timer - initialize a timer.
  * @timer: the timer to be initialized
@@ -158,7 +145,7 @@ EXPORT_SYMBOL(__init_timer_base);
 void fastcall init_timer(struct timer_list *timer)
 {
 	timer->entry.next = NULL;
-	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base;
+	timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
 }
 EXPORT_SYMBOL(init_timer);
 
@@ -174,7 +161,7 @@ static inline void detach_timer(struct timer_list *timer,
 }
 
 /*
- * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
+ * We are using hashed locking: holding per_cpu(tvec_bases).lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
  *
@@ -185,10 +172,10 @@ static inline void detach_timer(struct timer_list *timer,
  * possible to set timer->base = NULL and drop the lock: the timer remains
  * locked.
  */
-static timer_base_t *lock_timer_base(struct timer_list *timer,
+static tvec_base_t *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
 {
-	timer_base_t *base;
+	tvec_base_t *base;
 
 	for (;;) {
 		base = timer->base;
@@ -205,8 +192,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
 
 int __mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	timer_base_t *base;
-	tvec_base_t *new_base;
+	tvec_base_t *base, *new_base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -221,7 +207,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 
 	new_base = __get_cpu_var(tvec_bases);
 
-	if (base != &new_base->t_base) {
+	if (base != new_base) {
 		/*
 		 * We are trying to schedule the timer on the local CPU.
 		 * However we can't change timer's base while it is running,
@@ -231,19 +217,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 		 */
 		if (unlikely(base->running_timer == timer)) {
 			/* The timer remains on a former base */
-			new_base = container_of(base, tvec_base_t, t_base);
+			new_base = base;
 		} else {
 			/* See the comment in lock_timer_base() */
 			timer->base = NULL;
 			spin_unlock(&base->lock);
-			spin_lock(&new_base->t_base.lock);
-			timer->base = &new_base->t_base;
+			spin_lock(&new_base->lock);
+			timer->base = new_base;
 		}
 	}
 
 	timer->expires = expires;
 	internal_add_timer(new_base, timer);
-	spin_unlock_irqrestore(&new_base->t_base.lock, flags);
+	spin_unlock_irqrestore(&new_base->lock, flags);
 
 	return ret;
 }
@@ -263,10 +249,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
   	unsigned long flags;
 
   	BUG_ON(timer_pending(timer) || !timer->function);
-	spin_lock_irqsave(&base->t_base.lock, flags);
-	timer->base = &base->t_base;
+	spin_lock_irqsave(&base->lock, flags);
+	timer->base = base;
 	internal_add_timer(base, timer);
-	spin_unlock_irqrestore(&base->t_base.lock, flags);
+	spin_unlock_irqrestore(&base->lock, flags);
 }
 
 
@@ -319,7 +305,7 @@ EXPORT_SYMBOL(mod_timer);
  */
 int del_timer(struct timer_list *timer)
 {
-	timer_base_t *base;
+	tvec_base_t *base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -346,7 +332,7 @@ EXPORT_SYMBOL(del_timer);
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
-	timer_base_t *base;
+	tvec_base_t *base;
 	unsigned long flags;
 	int ret = -1;
 
@@ -410,7 +396,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
 		struct timer_list *tmp;
 
 		tmp = list_entry(curr, struct timer_list, entry);
-		BUG_ON(tmp->base != &base->t_base);
+		BUG_ON(tmp->base != base);
 		curr = curr->next;
 		internal_add_timer(base, tmp);
 	}
@@ -432,7 +418,7 @@ static inline void __run_timers(tvec_base_t *base)
 {
 	struct timer_list *timer;
 
-	spin_lock_irq(&base->t_base.lock);
+	spin_lock_irq(&base->lock);
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
 		struct list_head work_list = LIST_HEAD_INIT(work_list);
 		struct list_head *head = &work_list;
@@ -458,7 +444,7 @@ static inline void __run_timers(tvec_base_t *base)
 
 			set_running_timer(base, timer);
 			detach_timer(timer, 1);
-			spin_unlock_irq(&base->t_base.lock);
+			spin_unlock_irq(&base->lock);
 			{
 				int preempt_count = preempt_count();
 				fn(data);
@@ -471,11 +457,11 @@ static inline void __run_timers(tvec_base_t *base)
 					BUG();
 				}
 			}
-			spin_lock_irq(&base->t_base.lock);
+			spin_lock_irq(&base->lock);
 		}
 	}
 	set_running_timer(base, NULL);
-	spin_unlock_irq(&base->t_base.lock);
+	spin_unlock_irq(&base->lock);
 }
 
 #ifdef CONFIG_NO_IDLE_HZ
@@ -506,7 +492,7 @@ unsigned long next_timer_interrupt(void)
 	hr_expires += jiffies;
 
 	base = __get_cpu_var(tvec_bases);
-	spin_lock(&base->t_base.lock);
+	spin_lock(&base->lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
 	list = NULL;
 
@@ -554,7 +540,7 @@ found:
 				expires = nte->expires;
 		}
 	}
-	spin_unlock(&base->t_base.lock);
+	spin_unlock(&base->lock);
 
 	if (time_before(hr_expires, expires))
 		return hr_expires;
@@ -1262,7 +1248,7 @@ static int __devinit init_timers_cpu(int cpu)
 		}
 		per_cpu(tvec_bases, cpu) = base;
 	}
-	spin_lock_init(&base->t_base.lock);
+	spin_lock_init(&base->lock);
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
 		INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1284,7 +1270,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
 	while (!list_empty(head)) {
 		timer = list_entry(head->next, struct timer_list, entry);
 		detach_timer(timer, 0);
-		timer->base = &new_base->t_base;
+		timer->base = new_base;
 		internal_add_timer(new_base, timer);
 	}
 }
@@ -1300,11 +1286,11 @@ static void __devinit migrate_timers(int cpu)
 	new_base = get_cpu_var(tvec_bases);
 
 	local_irq_disable();
-	spin_lock(&new_base->t_base.lock);
-	spin_lock(&old_base->t_base.lock);
+	spin_lock(&new_base->lock);
+	spin_lock(&old_base->lock);
+
+	BUG_ON(old_base->running_timer);
 
-	if (old_base->t_base.running_timer)
-		BUG();
 	for (i = 0; i < TVR_SIZE; i++)
 		migrate_timer_list(new_base, old_base->tv1.vec + i);
 	for (i = 0; i < TVN_SIZE; i++) {
@@ -1314,8 +1300,8 @@ static void __devinit migrate_timers(int cpu)
 		migrate_timer_list(new_base, old_base->tv5.vec + i);
 	}
 
-	spin_unlock(&old_base->t_base.lock);
-	spin_unlock(&new_base->t_base.lock);
+	spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
 	local_irq_enable();
 	put_cpu_var(tvec_bases);
 }
-- 
cgit v1.3-6-gb490


From a2c348fe0117adced11e374329a5ea3f7c43cb41 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 31 Mar 2006 02:30:31 -0800
Subject: [PATCH] __mod_timer: simplify ->base changing

Since base and new_base are of the same type now, we can save one 'if'
branch and simplify the code a bit.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index b04dc03b5934..9062a82ee8ec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -215,21 +215,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 		 * handler yet has not finished. This also guarantees that
 		 * the timer is serialized wrt itself.
 		 */
-		if (unlikely(base->running_timer == timer)) {
-			/* The timer remains on a former base */
-			new_base = base;
-		} else {
+		if (likely(base->running_timer != timer)) {
 			/* See the comment in lock_timer_base() */
 			timer->base = NULL;
 			spin_unlock(&base->lock);
-			spin_lock(&new_base->lock);
-			timer->base = new_base;
+			base = new_base;
+			spin_lock(&base->lock);
+			timer->base = base;
 		}
 	}
 
 	timer->expires = expires;
-	internal_add_timer(new_base, timer);
-	spin_unlock_irqrestore(&new_base->lock, flags);
+	internal_add_timer(base, timer);
+	spin_unlock_irqrestore(&base->lock, flags);
 
 	return ret;
 }
-- 
cgit v1.3-6-gb490


From 9b41046cd0ee0a57f849d6e1363f7933e363cca9 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 31 Mar 2006 02:30:33 -0800
Subject: [PATCH] Don't pass boot parameters to argv_init[]

The boot cmdline is parsed in parse_early_param() and
parse_args(,unknown_bootoption).

And __setup() is used in obsolete_checksetup().

	start_kernel()
		-> parse_args()
			-> unknown_bootoption()
				-> obsolete_checksetup()

If __setup()'s callback (->setup_func()) returns 1 in
obsolete_checksetup(), obsolete_checksetup() thinks a parameter was
handled.

If ->setup_func() returns 0, obsolete_checksetup() tries other
->setup_func().  If all ->setup_func() that matched a parameter returns 0,
a parameter is seted to argv_init[].

Then, when runing /sbin/init or init=app, argv_init[] is passed to the app.
If the app doesn't ignore those arguments, it will warning and exit.

This patch fixes a wrong usage of it, however fixes obvious one only.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/core_marvel.c   |  2 +-
 arch/i386/kernel/apic.c           |  2 +-
 arch/i386/kernel/cpu/mcheck/mce.c |  4 ++--
 arch/i386/kernel/io_apic.c        |  2 +-
 arch/i386/kernel/traps.c          |  2 +-
 arch/powerpc/kernel/crash_dump.c  |  4 ++--
 arch/sh/kernel/cpu/init.c         |  2 +-
 arch/x86_64/kernel/apic.c         | 14 +++++++-------
 arch/x86_64/kernel/early_printk.c |  2 +-
 arch/x86_64/kernel/mce.c          |  4 ++--
 arch/x86_64/kernel/pmtimer.c      |  2 +-
 arch/x86_64/kernel/setup.c        |  2 +-
 arch/x86_64/kernel/setup64.c      |  4 ++--
 arch/x86_64/kernel/smpboot.c      |  2 +-
 arch/x86_64/kernel/time.c         |  4 ++--
 arch/x86_64/kernel/traps.c        |  4 ++--
 arch/x86_64/mm/fault.c            |  2 +-
 block/elevator.c                  |  2 +-
 drivers/acpi/ec.c                 |  4 ++--
 drivers/block/amiflop.c           |  1 +
 drivers/media/video/cpia_pp.c     |  2 +-
 drivers/net/netconsole.c          |  2 +-
 drivers/net/pcmcia/xirc2ps_cs.c   |  2 +-
 drivers/pcmcia/vrc4171_card.c     | 12 ++++++------
 drivers/pcmcia/vrc4173_cardu.c    |  8 ++++----
 drivers/scsi/ibmmca.c             |  2 +-
 drivers/video/console/fbcon.c     | 10 +++++-----
 drivers/video/console/sticore.c   |  4 ++--
 drivers/video/fbmem.c             |  2 +-
 drivers/video/stifb.c             |  4 ++--
 kernel/audit.c                    |  2 +-
 mm/memory.c                       |  2 +-
 32 files changed, 59 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c
index 44866cb26a80..7f6a98455e74 100644
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -435,7 +435,7 @@ marvel_specify_io7(char *str)
 		str = pchar;
 	} while(*str);
 
-	return 0;
+	return 1;
 }
 __setup("io7=", marvel_specify_io7);
 
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 3fff3c62d57a..6273bf74c203 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -752,7 +752,7 @@ static int __init apic_set_verbosity(char *str)
 		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
 				" use apic=verbose or apic=debug\n", str);
 
-	return 0;
+	return 1;
 }
 
 __setup("apic=", apic_set_verbosity);
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index 6170af3c271a..afa0888f9a1e 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -64,13 +64,13 @@ void mcheck_init(struct cpuinfo_x86 *c)
 static int __init mcheck_disable(char *str)
 {
 	mce_disabled = 1;
-	return 0;
+	return 1;
 }
 
 static int __init mcheck_enable(char *str)
 {
 	mce_disabled = -1;
-	return 0;
+	return 1;
 }
 
 __setup("nomce", mcheck_disable);
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 3b329af4afc5..f8f132aa5472 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -644,7 +644,7 @@ failed:
 int __init irqbalance_disable(char *str)
 {
 	irqbalance_disabled = 1;
-	return 0;
+	return 1;
 }
 
 __setup("noirqbalance", irqbalance_disable);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 6b63a5aa1e46..e38527994590 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1193,6 +1193,6 @@ void __init trap_init(void)
 static int __init kstack_setup(char *s)
 {
 	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-	return 0;
+	return 1;
 }
 __setup("kstack=", kstack_setup);
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index 211d72653ea6..764d07329716 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -61,7 +61,7 @@ static int __init parse_elfcorehdr(char *p)
 	if (p)
 		elfcorehdr_addr = memparse(p, &p);
 
-	return 0;
+	return 1;
 }
 __setup("elfcorehdr=", parse_elfcorehdr);
 #endif
@@ -71,7 +71,7 @@ static int __init parse_savemaxmem(char *p)
 	if (p)
 		saved_max_pfn = (memparse(p, &p) >> PAGE_SHIFT) - 1;
 
-	return 0;
+	return 1;
 }
 __setup("savemaxmem=", parse_savemaxmem);
 
diff --git a/arch/sh/kernel/cpu/init.c b/arch/sh/kernel/cpu/init.c
index cf94e8ef17c5..868e68b28880 100644
--- a/arch/sh/kernel/cpu/init.c
+++ b/arch/sh/kernel/cpu/init.c
@@ -30,7 +30,7 @@ static int x##_disabled __initdata = 0;		\
 static int __init x##_setup(char *opts)		\
 {						\
 	x##_disabled = 1;			\
-	return 0;				\
+	return 1;				\
 }						\
 __setup("no" __stringify(x), x##_setup);
 
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index d54620147e8e..100a30c40044 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -615,7 +615,7 @@ static int __init apic_set_verbosity(char *str)
 		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
 				" use apic=verbose or apic=debug", str);
 
-	return 0;
+	return 1;
 }
 
 __setup("apic=", apic_set_verbosity);
@@ -1137,35 +1137,35 @@ int __init APIC_init_uniprocessor (void)
 static __init int setup_disableapic(char *str) 
 { 
 	disable_apic = 1;
-	return 0;
+	return 1;
 } 
 
 static __init int setup_nolapic(char *str) 
 { 
 	disable_apic = 1;
-	return 0;
+	return 1;
 } 
 
 static __init int setup_noapictimer(char *str) 
 { 
 	if (str[0] != ' ' && str[0] != 0)
-		return -1;
+		return 0;
 	disable_apic_timer = 1;
-	return 0;
+	return 1;
 } 
 
 static __init int setup_apicmaintimer(char *str)
 {
 	apic_runs_main_timer = 1;
 	nohpet = 1;
-	return 0;
+	return 1;
 }
 __setup("apicmaintimer", setup_apicmaintimer);
 
 static __init int setup_noapicmaintimer(char *str)
 {
 	apic_runs_main_timer = -1;
-	return 0;
+	return 1;
 }
 __setup("noapicmaintimer", setup_noapicmaintimer);
 
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 13af920b6594..b93ef5b51980 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -221,7 +221,7 @@ int __init setup_early_printk(char *opt)
 	char buf[256];
 
 	if (early_console_initialized)
-		return -1;
+		return 1;
 
 	strlcpy(buf,opt,sizeof(buf));
 	space = strchr(buf, ' ');
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 04282ef9fbd4..10b3e348fc99 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -501,7 +501,7 @@ static struct miscdevice mce_log_device = {
 static int __init mcheck_disable(char *str)
 {
 	mce_dont_init = 1;
-	return 0;
+	return 1;
 }
 
 /* mce=off disables machine check. Note you can reenable it later
@@ -521,7 +521,7 @@ static int __init mcheck_enable(char *str)
 		get_option(&str, &tolerant);
 	else
 		printk("mce= argument %s ignored. Please use /sys", str); 
-	return 0;
+	return 1;
 }
 
 __setup("nomce", mcheck_disable);
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
index ee5ee4891f3d..b0444a415bd6 100644
--- a/arch/x86_64/kernel/pmtimer.c
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -121,7 +121,7 @@ unsigned int do_gettimeoffset_pm(void)
 static int __init nopmtimer_setup(char *s)
 {
 	pmtmr_ioport = 0;
-	return 0;
+	return 1;
 }
 
 __setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index d1f3e9272c05..0856ad444f90 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -540,7 +540,7 @@ void __init alternative_instructions(void)
 static int __init noreplacement_setup(char *s)
 { 
      no_replacement = 1; 
-     return 0; 
+     return 1;
 } 
 
 __setup("noreplacement", noreplacement_setup); 
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index eabdb63fec31..8a691fa6d393 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -55,7 +55,7 @@ int __init nonx_setup(char *str)
 		do_not_nx = 1;
 		__supported_pte_mask &= ~_PAGE_NX;
         }
-	return 0;
+	return 1;
 } 
 __setup("noexec=", nonx_setup);	/* parsed early actually */
 
@@ -74,7 +74,7 @@ static int __init nonx32_setup(char *str)
 		force_personality32 &= ~READ_IMPLIES_EXEC;
 	else if (!strcmp(str, "off"))
 		force_personality32 |= READ_IMPLIES_EXEC;
-	return 0;
+	return 1;
 }
 __setup("noexec32=", nonx32_setup);
 
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index ea48fa638070..71a7222cf9ce 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -353,7 +353,7 @@ static void __cpuinit tsc_sync_wait(void)
 static __init int notscsync_setup(char *s)
 {
 	notscsync = 1;
-	return 0;
+	return 1;
 }
 __setup("notscsync", notscsync_setup);
 
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 473b514b66e4..ef8bc46dc140 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1306,7 +1306,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 static int __init nohpet_setup(char *s) 
 { 
 	nohpet = 1;
-	return 0;
+	return 1;
 } 
 
 __setup("nohpet", nohpet_setup);
@@ -1314,7 +1314,7 @@ __setup("nohpet", nohpet_setup);
 int __init notsc_setup(char *s)
 {
 	notsc = 1;
-	return 0;
+	return 1;
 }
 
 __setup("notsc", notsc_setup);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index edaa9fe654dc..6bda322d3caf 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -973,14 +973,14 @@ void __init trap_init(void)
 static int __init oops_dummy(char *s)
 { 
 	panic_on_oops = 1;
-	return -1; 
+	return 1;
 } 
 __setup("oops=", oops_dummy); 
 
 static int __init kstack_setup(char *s)
 {
 	kstack_depth_to_print = simple_strtoul(s,NULL,0);
-	return 0;
+	return 1;
 }
 __setup("kstack=", kstack_setup);
 
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 316c53de47bd..55250593d8c9 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -623,6 +623,6 @@ void vmalloc_sync_all(void)
 static int __init enable_pagefaulttrace(char *str)
 {
 	page_fault_trace = 1;
-	return 0;
+	return 1;
 }
 __setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/block/elevator.c b/block/elevator.c
index 56c2ed06a9e2..0d6be03d929e 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -145,7 +145,7 @@ static int __init elevator_setup(char *str)
 		strcpy(chosen_elevator, "anticipatory");
 	else
 		strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
-	return 0;
+	return 1;
 }
 
 __setup("elevator=", elevator_setup);
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 79b09d76c180..eee0864ba300 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1572,7 +1572,7 @@ static void __exit acpi_ec_exit(void)
 static int __init acpi_fake_ecdt_setup(char *str)
 {
 	acpi_fake_ecdt_enabled = 1;
-	return 0;
+	return 1;
 }
 
 __setup("acpi_fake_ecdt", acpi_fake_ecdt_setup);
@@ -1591,7 +1591,7 @@ static int __init acpi_ec_set_intr_mode(char *str)
 		acpi_ec_driver.ops.add = acpi_ec_poll_add;
 	}
 	printk(KERN_INFO PREFIX "EC %s mode.\n", intr ? "interrupt" : "polling");
-	return 0;
+	return 1;
 }
 
 __setup("ec_intr=", acpi_ec_set_intr_mode);
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index b6e290956214..2a8af685926f 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1850,6 +1850,7 @@ static int __init amiga_floppy_setup (char *str)
 		return 0;
 	printk (KERN_INFO "amiflop: Setting default df0 to %x\n", n);
 	fd_def_df0 = n;
+	return 1;
 }
 
 __setup("floppy=", amiga_floppy_setup);
diff --git a/drivers/media/video/cpia_pp.c b/drivers/media/video/cpia_pp.c
index 3021f21aae36..0b00e6027dfb 100644
--- a/drivers/media/video/cpia_pp.c
+++ b/drivers/media/video/cpia_pp.c
@@ -873,7 +873,7 @@ static int __init cpia_pp_setup(char *str)
 		parport_nr[parport_ptr++] = PPCPIA_PARPORT_NONE;
 	}
 
-	return 0;
+	return 1;
 }
 
 __setup("cpia_pp=", cpia_pp_setup);
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index edd1b5306b16..75b35ad760de 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -94,7 +94,7 @@ static struct console netconsole = {
 static int option_setup(char *opt)
 {
 	configured = !netpoll_parse_options(&np, opt);
-	return 0;
+	return 1;
 }
 
 __setup("netconsole=", option_setup);
diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c
index eed496803fe4..e8f849e12976 100644
--- a/drivers/net/pcmcia/xirc2ps_cs.c
+++ b/drivers/net/pcmcia/xirc2ps_cs.c
@@ -1973,7 +1973,7 @@ static int __init setup_xirc2ps_cs(char *str)
 	MAYBE_SET(lockup_hack, 6);
 #undef  MAYBE_SET
 
-	return 0;
+	return 1;
 }
 
 __setup("xirc2ps_cs=", setup_xirc2ps_cs);
diff --git a/drivers/pcmcia/vrc4171_card.c b/drivers/pcmcia/vrc4171_card.c
index 0574efd7828a..459e6e1946fd 100644
--- a/drivers/pcmcia/vrc4171_card.c
+++ b/drivers/pcmcia/vrc4171_card.c
@@ -634,7 +634,7 @@ static void vrc4171_remove_sockets(void)
 static int __devinit vrc4171_card_setup(char *options)
 {
 	if (options == NULL || *options == '\0')
-		return 0;
+		return 1;
 
 	if (strncmp(options, "irq:", 4) == 0) {
 		int irq;
@@ -644,7 +644,7 @@ static int __devinit vrc4171_card_setup(char *options)
 			vrc4171_irq = irq;
 
 		if (*options != ',')
-			return 0;
+			return 1;
 		options++;
 	}
 
@@ -663,10 +663,10 @@ static int __devinit vrc4171_card_setup(char *options)
 			}
 
 			if (*options != ',')
-				return 0;
+				return 1;
 			options++;
 		} else
-			return 0;
+			return 1;
 
 	}
 
@@ -688,7 +688,7 @@ static int __devinit vrc4171_card_setup(char *options)
 			}
 
 			if (*options != ',')
-				return 0;
+				return 1;
 			options++;
 
 			if (strncmp(options, "memnoprobe", 10) == 0)
@@ -700,7 +700,7 @@ static int __devinit vrc4171_card_setup(char *options)
 		}
 	}
 
-	return 0;
+	return 1;
 }
 
 __setup("vrc4171_card=", vrc4171_card_setup);
diff --git a/drivers/pcmcia/vrc4173_cardu.c b/drivers/pcmcia/vrc4173_cardu.c
index 57f38dba0a48..6004196f7cc1 100644
--- a/drivers/pcmcia/vrc4173_cardu.c
+++ b/drivers/pcmcia/vrc4173_cardu.c
@@ -516,7 +516,7 @@ static int __devinit vrc4173_cardu_probe(struct pci_dev *dev,
 static int __devinit vrc4173_cardu_setup(char *options)
 {
 	if (options == NULL || *options == '\0')
-		return 0;
+		return 1;
 
 	if (strncmp(options, "cardu1:", 7) == 0) {
 		options += 7;
@@ -527,9 +527,9 @@ static int __devinit vrc4173_cardu_setup(char *options)
 			}
 
 			if (*options != ',')
-				return 0;
+				return 1;
 		} else
-			return 0;
+			return 1;
 	}
 
 	if (strncmp(options, "cardu2:", 7) == 0) {
@@ -538,7 +538,7 @@ static int __devinit vrc4173_cardu_setup(char *options)
 			cardu_sockets[CARDU2].noprobe = 1;
 	}
 
-	return 0;
+	return 1;
 }
 
 __setup("vrc4173_cardu=", vrc4173_cardu_setup);
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index 3a8462e8d063..24eb59e143a9 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -2488,7 +2488,7 @@ static int option_setup(char *str)
 	}
 	ints[0] = i - 1;
 	internal_ibmmca_scsi_setup(cur, ints);
-	return 0;
+	return 1;
 }
 
 __setup("ibmmcascsi=", option_setup);
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 041d06987861..d7351560f581 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -466,7 +466,7 @@ static int __init fb_console_setup(char *this_opt)
 	int i, j;
 
 	if (!this_opt || !*this_opt)
-		return 0;
+		return 1;
 
 	while ((options = strsep(&this_opt, ",")) != NULL) {
 		if (!strncmp(options, "font:", 5))
@@ -481,10 +481,10 @@ static int __init fb_console_setup(char *this_opt)
 					options++;
 				}
 				if (*options != ',')
-					return 0;
+					return 1;
 				options++;
 			} else
-				return 0;
+				return 1;
 		}
 		
 		if (!strncmp(options, "map:", 4)) {
@@ -496,7 +496,7 @@ static int __init fb_console_setup(char *this_opt)
 					con2fb_map_boot[i] =
 						(options[j++]-'0') % FB_MAX;
 				}
-			return 0;
+			return 1;
 		}
 
 		if (!strncmp(options, "vc:", 3)) {
@@ -518,7 +518,7 @@ static int __init fb_console_setup(char *this_opt)
 				rotate = 0;
 		}
 	}
-	return 0;
+	return 1;
 }
 
 __setup("fbcon=", fb_console_setup);
diff --git a/drivers/video/console/sticore.c b/drivers/video/console/sticore.c
index d6041e781aca..74ac2acaf72c 100644
--- a/drivers/video/console/sticore.c
+++ b/drivers/video/console/sticore.c
@@ -275,7 +275,7 @@ static int __init sti_setup(char *str)
 	if (str)
 		strlcpy (default_sti_path, str, sizeof (default_sti_path));
 	
-	return 0;
+	return 1;
 }
 
 /*	Assuming the machine has multiple STI consoles (=graphic cards) which
@@ -321,7 +321,7 @@ static int __init sti_font_setup(char *str)
 		i++;
 	}
 
-	return 0;
+	return 1;
 }
 
 /*	The optional linux kernel parameter "sti_font" defines which font
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index b1a8dca76430..944855b3e4af 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1588,7 +1588,7 @@ static int __init video_setup(char *options)
 		}
 	}
 
-	return 0;
+	return 1;
 }
 __setup("video=", video_setup);
 #endif
diff --git a/drivers/video/stifb.c b/drivers/video/stifb.c
index 8d5f35676f9a..4a292aae6eb2 100644
--- a/drivers/video/stifb.c
+++ b/drivers/video/stifb.c
@@ -1378,7 +1378,7 @@ stifb_setup(char *options)
 	int i;
 	
 	if (!options || !*options)
-		return 0;
+		return 1;
 	
 	if (strncmp(options, "off", 3) == 0) {
 		stifb_disabled = 1;
@@ -1393,7 +1393,7 @@ stifb_setup(char *options)
 			stifb_bpp_pref[i] = simple_strtoul(options, &options, 10);
 		}
 	}
-	return 0;
+	return 1;
 }
 
 __setup("stifb=", stifb_setup);
diff --git a/kernel/audit.c b/kernel/audit.c
index 04fe2e301b61..c8ccbd09048f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -578,7 +578,7 @@ static int __init audit_enable(char *str)
 	       audit_initialized ? "" : " (after initialization)");
 	if (audit_initialized)
 		audit_enabled = audit_default;
-	return 0;
+	return 1;
 }
 
 __setup("audit=", audit_enable);
diff --git a/mm/memory.c b/mm/memory.c
index 8d8f52569f32..0ec7bc644271 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -87,7 +87,7 @@ int randomize_va_space __read_mostly = 1;
 static int __init disable_randmaps(char *s)
 {
 	randomize_va_space = 0;
-	return 0;
+	return 1;
 }
 __setup("norandmaps", disable_randmaps);
 
-- 
cgit v1.3-6-gb490


From bb231fe3a53b2d34c1aef119613816fcb18864b1 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Fri, 31 Mar 2006 02:30:45 -0800
Subject: [PATCH] Fix pacct bug in multithreading case.

I noticed a bug on the process accounting facility.  In multi-threading
process, some data would be recorded incorrectly when the group_leader dies
earlier than one or more threads.  The attached patch fixes this problem.

See below.  'bugacct' is a test program that create a worker thread after 4
seconds sleeping, then the group_leader dies soon.  The worker thread
consume CPU/Memory for 6 seconds, then exit.  We can estimate 10 seconds as
etime and 6 seconds as stime + utime.  This is a sample program which the
group_leader dies earlier than other threads.

The results of same binary execution on different kernel are below.
-- accounted records --------------------
         |   btime  | utime | stime | etime | minflt | majflt |   comm  |
original | 13:16:40 |  0.00 |  0.00 |  6.10 |    171 |      0 | bugacct |
 patched | 13:20:21 |  5.83 |  0.18 | 10.03 |  32776 |      0 | bugacct |
(*) bugacct allocates 128MB memory, thus 128MB / 4KB = 32768 of minflt is
    appropriate.

-- Test results in original kernel ------
$ date; time -p ./bugacct
Tue Mar 28 13:16:36 JST 2006  <- But pacct said btime is 13:16:40
real 10.11                    <- But pacct said etime is 6.10
user 5.96                     <- But pacct said utime is 0.00
sys 0.14                      <- But pacct said stime is 0.00
$
-- Test results in patched kernel -------
$ date; time -p ./bugacct
Tue Mar 28 13:20:21 JST 2006
real 10.04
user 5.83
sys 0.19
$

In the original 2.6.16 kernel, pacct records btime, utime, stime, etime and
minflt incorrectly.  In my opinion, this problem is caused by an assumption
that group_leader dies last.

The following section calculates process running time for etime and btime.
But it means running time of the thread that dies last, not process.  The
start_time of the first thread in the process (group_leader) should be
reduced from uptime to calculate etime and btime correctly.

   ---- do_acct_process() in kernel/acct.c:
   /* calculate run_time in nsec*/
   do_posix_clock_monotonic_gettime(&uptime);
   run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
   run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
                                   + current->start_time.tv_nsec;
   ----

The following section calculates stime and utime of the process.
But it might count the utime and stime of the group_leader duplicatly
and ignore the utime and stime of the thread dies last, when one or
more threads remain after group_leader dead.
The ac_utime should be calculated as the sum of the signal->utime
and utime of the thread dies last. The ac_stime should be done also.

   ---- do_acct_process() in kernel/acct.c:
   jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
                                            current->signal->utime));
   ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
   jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
                                            current->signal->stime));
   ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
   ----

The part of the minflt/majflt calculation has same problem.
This patch solves those problems, I think.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51ef..b327f4d20104 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
 	/* calculate run_time in nsec*/
 	do_posix_clock_monotonic_gettime(&uptime);
 	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
-	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
-					+ current->start_time.tv_nsec;
+	run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
+		       + current->group_leader->start_time.tv_nsec;
 	/* convert nsec -> AHZ */
 	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+	jiffies = cputime_to_jiffies(cputime_add(current->utime,
 						 current->signal->utime));
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+	jiffies = cputime_to_jiffies(cputime_add(current->stime,
 						 current->signal->stime));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->group_leader->min_flt);
+				     current->min_flt);
 	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->group_leader->maj_flt);
+				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
 	ac.ac_exitcode = exitcode;
 
-- 
cgit v1.3-6-gb490


From 4a01c8d5be628ac20cfd432c21808d76be5813e6 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 31 Mar 2006 02:30:50 -0800
Subject: [PATCH] cpuset: task_lock comment fix

Fix cpuset comment involving case of a tasks cpuset pointer being NULL.
Thanks to "the_top_cpuset_hack", this code no longer sees NULL task->cpuset
pointers.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 18aea1bd1284..2523a4b6a8c6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -616,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * current->cpuset if a task has its memory placement changed.
  * Do not call this routine if in_interrupt().
  *
- * Call without callback_mutex or task_lock() held.  May be called
- * with or without manage_mutex held.  Doesn't need task_lock to guard
- * against another task changing a non-NULL cpuset pointer to NULL,
- * as that is only done by a task on itself, and if the current task
- * is here, it is not simultaneously in the exit code NULL'ing its
- * cpuset pointer.  This routine also might acquire callback_mutex and
+ * Call without callback_mutex or task_lock() held.  May be
+ * called with or without manage_mutex held.  Thanks in part to
+ * 'the_top_cpuset_hack', the tasks cpuset pointer will never
+ * be NULL.  This routine also might acquire callback_mutex and
  * current->mm->mmap_sem during call.
  *
  * Reading current->cpuset->mems_generation doesn't need task_lock
-- 
cgit v1.3-6-gb490


From 2741a559a01e1ba9bf87285569dc1a104d134ecf Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 31 Mar 2006 02:30:51 -0800
Subject: [PATCH] cpuset: unsafe mm reference fix

Fix unsafe reference to a tasks mm struct, by moving the reference inside of a
convenient nearby properly guarded code block.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2523a4b6a8c6..bf42381a4195 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1183,11 +1183,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	mm = get_task_mm(tsk);
 	if (mm) {
 		mpol_rebind_mm(mm, &to);
+		if (is_memory_migrate(cs))
+			do_migrate_pages(mm, &from, &to, MPOL_MF_MOVE_ALL);
 		mmput(mm);
 	}
 
-	if (is_memory_migrate(cs))
-		do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
 	put_task_struct(tsk);
 	synchronize_rcu();
 	if (atomic_dec_and_test(&oldcs->count))
-- 
cgit v1.3-6-gb490


From e4e364e865b382f9d99c7fc230ec2ce7df21257a Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 31 Mar 2006 02:30:52 -0800
Subject: [PATCH] cpuset: memory migration interaction fix

Fix memory migration so that it works regardless of what cpuset the invoking
task is in.

If a task invoked a memory migration, by doing one of:

       1) writing a different nodemask to a cpuset 'mems' file, or

       2) writing a tasks pid to a different cpuset's 'tasks' file,
          where the cpuset had its 'memory_migrate' option turned on, then the
          allocation of the new pages for the migrated task(s) was constrained
          by the invoking tasks cpuset.

If this task wasn't in a cpuset that allowed the requested memory nodes, the
memory migration would happen to some other nodes that were in that invoking
tasks cpuset.  This was usually surprising and puzzling behaviour: Why didn't
the pages move?  Why did the pages move -there-?

To fix this, temporarilly change the invoking tasks 'mems_allowed' task_struct
field to the nodes the migrating tasks is moving to, so that new pages can be
allocated there.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index bf42381a4195..72248d1b9e3f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -833,6 +833,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	return 0;
 }
 
+/*
+ * cpuset_migrate_mm
+ *
+ *    Migrate memory region from one set of nodes to another.
+ *
+ *    Temporarilly set tasks mems_allowed to target nodes of migration,
+ *    so that the migration code can allocate pages on these nodes.
+ *
+ *    Call holding manage_mutex, so our current->cpuset won't change
+ *    during this call, as manage_mutex holds off any attach_task()
+ *    calls.  Therefore we don't need to take task_lock around the
+ *    call to guarantee_online_mems(), as we know no one is changing
+ *    our tasks cpuset.
+ *
+ *    Hold callback_mutex around the two modifications of our tasks
+ *    mems_allowed to synchronize with cpuset_mems_allowed().
+ *
+ *    While the mm_struct we are migrating is typically from some
+ *    other task, the task_struct mems_allowed that we are hacking
+ *    is for our current task, which must allocate new pages for that
+ *    migrating memory region.
+ *
+ *    We call cpuset_update_task_memory_state() before hacking
+ *    our tasks mems_allowed, so that we are assured of being in
+ *    sync with our tasks cpuset, and in particular, callbacks to
+ *    cpuset_update_task_memory_state() from nested page allocations
+ *    won't see any mismatch of our cpuset and task mems_generation
+ *    values, so won't overwrite our hacked tasks mems_allowed
+ *    nodemask.
+ */
+
+static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+							const nodemask_t *to)
+{
+	struct task_struct *tsk = current;
+
+	cpuset_update_task_memory_state();
+
+	mutex_lock(&callback_mutex);
+	tsk->mems_allowed = *to;
+	mutex_unlock(&callback_mutex);
+
+	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+
+	mutex_lock(&callback_mutex);
+	guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
+	mutex_unlock(&callback_mutex);
+}
+
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset.  Needs to validate the request, update the
@@ -945,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 		struct mm_struct *mm = mmarray[i];
 
 		mpol_rebind_mm(mm, &cs->mems_allowed);
-		if (migrate) {
-			do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
-							MPOL_MF_MOVE_ALL);
-		}
+		if (migrate)
+			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
 		mmput(mm);
 	}
 
@@ -1184,7 +1231,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	if (mm) {
 		mpol_rebind_mm(mm, &to);
 		if (is_memory_migrate(cs))
-			do_migrate_pages(mm, &from, &to, MPOL_MF_MOVE_ALL);
+			cpuset_migrate_mm(mm, &from, &to);
 		mmput(mm);
 	}
 
-- 
cgit v1.3-6-gb490


From 7529c301165079d0f149d0e54724829e602f8fc0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 31 Mar 2006 02:30:59 -0800
Subject: [PATCH] modules: permit Dual-MIT/GPL licenses

One of the LEDs driver files wants to use this.

Probably drivers/mtd/maps/ipaq-flash.c wants to convert as well - right now
it'll be tainting the kernel.

Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Bowler <jbowler@acm.org>
Cc: "'Richard Purdie'" <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index bd088a7c1499..d24deb0dbbc9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1254,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
 		|| strcmp(license, "GPL v2") == 0
 		|| strcmp(license, "GPL and additional rights") == 0
 		|| strcmp(license, "Dual BSD/GPL") == 0
+		|| strcmp(license, "Dual MIT/GPL") == 0
 		|| strcmp(license, "Dual MPL/GPL") == 0);
 }
 
-- 
cgit v1.3-6-gb490


From 00362e33f65f1cb5d15e62ea5509520ce2770360 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 31 Mar 2006 02:31:17 -0800
Subject: [PATCH] hrtimer: create generic sleeper

The removal of the data field in the hrtimer structure enforces the
embedding of the timer into another data structure.  nanosleep now uses a
private implementation of the most common used timer callback function
(simple task wakeup).

In order to avoid the reimplentation of such functionality all over the
place a generic hrtimer_sleeper functionality is created.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h | 16 ++++++++++++++++
 kernel/hrtimer.c        | 19 +++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 93830158348e..b20939287613 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -57,6 +57,19 @@ struct hrtimer {
 	struct hrtimer_base	*base;
 };
 
+/**
+ * struct hrtimer_sleeper - simple sleeper structure
+ *
+ * @timer:	embedded timer structure
+ * @task:	task to wake up
+ *
+ * task is set to NULL, when the timer expires.
+ */
+struct hrtimer_sleeper {
+	struct hrtimer timer;
+	struct task_struct *task;
+};
+
 /**
  * struct hrtimer_base - the timer base for a specific clock
  *
@@ -127,6 +140,9 @@ extern long hrtimer_nanosleep(struct timespec *rqtp,
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
 
+extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
+				 struct task_struct *tsk);
+
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0237a556eb1f..877cdf9678bf 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -656,6 +656,25 @@ void hrtimer_run_queues(void)
  * Sleep related functions:
  */
 
+static int hrtimer_wakeup(struct hrtimer *timer)
+{
+	struct hrtimer_sleeper *t =
+		container_of(timer, struct hrtimer_sleeper, timer);
+	struct task_struct *task = t->task;
+
+	t->task = NULL;
+	if (task)
+		wake_up_process(task);
+
+	return HRTIMER_NORESTART;
+}
+
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
+{
+	sl->timer.function = hrtimer_wakeup;
+	sl->task = task;
+}
+
 struct sleep_hrtimer {
 	struct hrtimer timer;
 	struct task_struct *task;
-- 
cgit v1.3-6-gb490


From 669d7868ae414cdb7b7e5df375dc8e4b47f26f6d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 31 Mar 2006 02:31:19 -0800
Subject: [PATCH] hrtimer: use generic sleeper for nanosleep

Replace the nanosleep private sleeper functionality by the generic hrtimer
sleeper.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/hrtimer.c | 39 +++++++++------------------------------
 1 file changed, 9 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 877cdf9678bf..49cbf7cffebd 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -655,7 +655,6 @@ void hrtimer_run_queues(void)
 /*
  * Sleep related functions:
  */
-
 static int hrtimer_wakeup(struct hrtimer *timer)
 {
 	struct hrtimer_sleeper *t =
@@ -675,28 +674,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
 	sl->task = task;
 }
 
-struct sleep_hrtimer {
-	struct hrtimer timer;
-	struct task_struct *task;
-	int expired;
-};
-
-static int nanosleep_wakeup(struct hrtimer *timer)
+static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
 {
-	struct sleep_hrtimer *t =
-		container_of(timer, struct sleep_hrtimer, timer);
-
-	t->expired = 1;
-	wake_up_process(t->task);
-
-	return HRTIMER_NORESTART;
-}
-
-static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
-{
-	t->timer.function = nanosleep_wakeup;
-	t->task = current;
-	t->expired = 0;
+	hrtimer_init_sleeper(t, current);
 
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -704,18 +684,17 @@ static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
 
 		schedule();
 
-		if (unlikely(!t->expired)) {
-			hrtimer_cancel(&t->timer);
-			mode = HRTIMER_ABS;
-		}
-	} while (!t->expired && !signal_pending(current));
+		hrtimer_cancel(&t->timer);
+		mode = HRTIMER_ABS;
+
+	} while (t->task && !signal_pending(current));
 
-	return t->expired;
+	return t->task == NULL;
 }
 
 static long __sched nanosleep_restart(struct restart_block *restart)
 {
-	struct sleep_hrtimer t;
+	struct hrtimer_sleeper t;
 	struct timespec __user *rmtp;
 	struct timespec tu;
 	ktime_t time;
@@ -748,7 +727,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
 	struct restart_block *restart;
-	struct sleep_hrtimer t;
+	struct hrtimer_sleeper t;
 	struct timespec tu;
 	ktime_t rem;
 
-- 
cgit v1.3-6-gb490


From 3055addadbe9bfb2365006a1c13fd342a8d30d52 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Fri, 31 Mar 2006 02:31:20 -0800
Subject: [PATCH] hrtimer: call get_softirq_time() only when necessary in
 run_hrtimer_queue()

It seems that run_hrtimer_queue() is calling get_softirq_time() more
often than it needs to.

With this patch, it only calls get_softirq_time() if there's a
pending timer.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/hrtimer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49cbf7cffebd..f181ff4dd32e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -606,6 +606,9 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 {
 	struct rb_node *node;
 
+	if (!base->first)
+		return;
+
 	if (base->get_softirq_time)
 		base->softirq_time = base->get_softirq_time();
 
-- 
cgit v1.3-6-gb490


From db1b1fefc2cecbff2e4214062fa8c680cb6e7b7d Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Fri, 31 Mar 2006 02:31:21 -0800
Subject: [PATCH] sched: reduce overhead of calc_load

Currently, count_active_tasks() calls both nr_running() &
nr_interruptible().  Each of these functions does a "for_each_cpu" & reads
values from the runqueue of each cpu.  Although this is not a lot of
instructions, each runqueue may be located on different node.  Depending on
the architecture, a unique TLB entry may be required to access each
runqueue.

Since there may be more runqueues than cpu TLB entries, a scan of all
runqueues can trash the TLB.  Each memory reference incurs a TLB miss &
refill.

In addition, the runqueue cacheline that contains nr_running &
nr_uninterruptible may be evicted from the cache between the two passes.
This causes unnecessary cache misses.

Combining nr_running() & nr_interruptible() into a single function
substantially reduces the TLB & cache misses on large systems.  This should
have no measureable effect on smaller systems.

On a 128p IA64 system running a memory stress workload, the new function
reduced the overhead of calc_load() from 605 usec/call to 324 usec/call.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  1 +
 kernel/sched.c        | 15 +++++++++++++++
 kernel/timer.c        |  2 +-
 3 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d04186d8cc68..ab84adf5bb9a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -100,6 +100,7 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
+extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
 
 #include <linux/time.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index a9ecac398bb9..6e52e0adff80 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1658,6 +1658,21 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
+unsigned long nr_active(void)
+{
+	unsigned long i, running = 0, uninterruptible = 0;
+
+	for_each_online_cpu(i) {
+		running += cpu_rq(i)->nr_running;
+		uninterruptible += cpu_rq(i)->nr_uninterruptible;
+	}
+
+	if (unlikely((long)uninterruptible < 0))
+		uninterruptible = 0;
+
+	return running + uninterruptible;
+}
+
 #ifdef CONFIG_SMP
 
 /*
diff --git a/kernel/timer.c b/kernel/timer.c
index 9062a82ee8ec..6b812c04737b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -825,7 +825,7 @@ void update_process_times(int user_tick)
  */
 static unsigned long count_active_tasks(void)
 {
-	return (nr_running() + nr_uninterruptible()) * FIXED_1;
+	return nr_active() * FIXED_1;
 }
 
 /*
-- 
cgit v1.3-6-gb490


From 3dee386e14045484a6c41c8f03a263f9d79de740 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 31 Mar 2006 02:31:23 -0800
Subject: [PATCH] sched: cleanup task_activated()

The activated flag in task_struct is used to track different sleep types and
its usage is somewhat obfuscated.  Convert the variable to an enum with more
descriptive names without altering the function.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  9 ++++++++-
 kernel/sched.c        | 24 +++++++++++++++---------
 2 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ab84adf5bb9a..c4fd3fcd3feb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -684,6 +684,13 @@ static inline void prefetch_stack(struct task_struct *t) { }
 struct audit_context;		/* See audit.c */
 struct mempolicy;
 
+enum sleep_type {
+	SLEEP_NORMAL,
+	SLEEP_NONINTERACTIVE,
+	SLEEP_INTERACTIVE,
+	SLEEP_INTERRUPTED,
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -706,7 +713,7 @@ struct task_struct {
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
-	int activated;
+	enum sleep_type sleep_type;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6e52e0adff80..f55ce5adac55 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -704,7 +704,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 		 * prevent them suddenly becoming cpu hogs and starving
 		 * other processes.
 		 */
-		if (p->mm && p->activated != -1 &&
+		if (p->mm && p->sleep_type != SLEEP_NONINTERACTIVE &&
 			sleep_time > INTERACTIVE_SLEEP(p)) {
 				p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
 						DEF_TIMESLICE);
@@ -714,7 +714,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 			 * limited in their sleep_avg rise as they
 			 * are likely to be waiting on I/O
 			 */
-			if (p->activated == -1 && p->mm) {
+			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
 				if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
 					sleep_time = 0;
 				else if (p->sleep_avg + sleep_time >=
@@ -769,7 +769,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 	 * This checks to make sure it's not an uninterruptible task
 	 * that is now waking up.
 	 */
-	if (!p->activated) {
+	if (p->sleep_type == SLEEP_NORMAL) {
 		/*
 		 * Tasks which were woken up by interrupts (ie. hw events)
 		 * are most likely of interactive nature. So we give them
@@ -778,13 +778,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 		 * on a CPU, first time around:
 		 */
 		if (in_interrupt())
-			p->activated = 2;
+			p->sleep_type = SLEEP_INTERRUPTED;
 		else {
 			/*
 			 * Normal first-time wakeups get a credit too for
 			 * on-runqueue time, but it will be weighted down:
 			 */
-			p->activated = 1;
+			p->sleep_type = SLEEP_INTERACTIVE;
 		}
 	}
 	p->timestamp = now;
@@ -1272,7 +1272,7 @@ out_activate:
 		 * Tasks on involuntary sleep don't earn
 		 * sleep_avg beyond just interactive state.
 		 */
-		p->activated = -1;
+		p->sleep_type = SLEEP_NONINTERACTIVE;
 	}
 
 	/*
@@ -2875,6 +2875,12 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
+static inline int interactive_sleep(enum sleep_type sleep_type)
+{
+	return (sleep_type == SLEEP_INTERACTIVE ||
+		sleep_type == SLEEP_INTERRUPTED);
+}
+
 /*
  * schedule() is the main scheduler function.
  */
@@ -2998,12 +3004,12 @@ go_idle:
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
-	if (!rt_task(next) && next->activated > 0) {
+	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
 		unsigned long long delta = now - next->timestamp;
 		if (unlikely((long long)(now - next->timestamp) < 0))
 			delta = 0;
 
-		if (next->activated == 1)
+		if (next->sleep_type == SLEEP_INTERACTIVE)
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
@@ -3016,7 +3022,7 @@ go_idle:
 		} else
 			requeue_task(next, array);
 	}
-	next->activated = 0;
+	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
-- 
cgit v1.3-6-gb490


From e7c38cb49c6cc05bc11f70d9e9889da1c4a0d37f Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 31 Mar 2006 02:31:25 -0800
Subject: [PATCH] sched: make task_noninteractive use sleep_type

Alterations to the pipe code in the kernel made it possible for relative
starvation to occur with tasks that slept waiting on a pipe getting unfair
priority bonuses even if they were otherwise fully cpu bound so the
TASK_NONINTERACTIVE flag was introduced which prevented any change to
sleep_avg while sleeping waiting on a pipe.  This change also leads to the
converse though, preventing any priority boost from occurring in truly
interactive tasks that wait on pipes.

Convert the TASK_NONINTERACTIVE flag to set sleep_type to SLEEP_NONINTERACTIVE
which will allow a linear bonus to priority based on sleep time thus allowing
interactive tasks to get high priority if they sleep enough.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f55ce5adac55..589e55a42214 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1273,18 +1273,18 @@ out_activate:
 		 * sleep_avg beyond just interactive state.
 		 */
 		p->sleep_type = SLEEP_NONINTERACTIVE;
-	}
+	} else
 
 	/*
 	 * Tasks that have marked their sleep as noninteractive get
-	 * woken up without updating their sleep average. (i.e. their
-	 * sleep is handled in a priority-neutral manner, no priority
-	 * boost and no penalty.)
+	 * woken up with their sleep average not weighted in an
+	 * interactive way.
 	 */
-	if (old_state & TASK_NONINTERACTIVE)
-		__activate_task(p, rq);
-	else
-		activate_task(p, rq, cpu == this_cpu);
+		if (old_state & TASK_NONINTERACTIVE)
+			p->sleep_type = SLEEP_NONINTERACTIVE;
+
+
+	activate_task(p, rq, cpu == this_cpu);
 	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
-- 
cgit v1.3-6-gb490


From e72ff0bb2c163eb13014ba113701bd42dab382fe Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 31 Mar 2006 02:31:26 -0800
Subject: [PATCH] sched: dont decrease idle sleep avg

We watch for tasks that sleep extended periods and don't allow one single
prolonged sleep period from elevating priority to maximum bonus to prevent cpu
bound tasks from getting high priority with single long sleeps.  There is a
bug in the current code that also penalises tasks that already have high
priority.  Correct that bug.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 589e55a42214..7b371931114f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -700,14 +700,19 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 	if (likely(sleep_time > 0)) {
 		/*
 		 * User tasks that sleep a long time are categorised as
-		 * idle and will get just interactive status to stay active &
-		 * prevent them suddenly becoming cpu hogs and starving
-		 * other processes.
+		 * idle. They will only have their sleep_avg increased to a
+		 * level that makes them just interactive priority to stay
+		 * active yet prevent them suddenly becoming cpu hogs and
+		 * starving other processes.
 		 */
 		if (p->mm && p->sleep_type != SLEEP_NONINTERACTIVE &&
 			sleep_time > INTERACTIVE_SLEEP(p)) {
-				p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
-						DEF_TIMESLICE);
+				unsigned long ceiling;
+
+				ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
+					DEF_TIMESLICE);
+				if (p->sleep_avg < ceiling)
+					p->sleep_avg = ceiling;
 		} else {
 			/*
 			 * Tasks waking from uninterruptible sleep are
-- 
cgit v1.3-6-gb490


From 5138930e6a69f1c7851a82d7cedaa01fad029fcf Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 31 Mar 2006 02:31:27 -0800
Subject: [PATCH] sched: include noninteractive sleep in idle detect

Tasks waiting in SLEEP_NONINTERACTIVE state can now get to best priority so
they need to be included in the idle detection code.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7b371931114f..3055fe806ff7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -705,8 +705,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 		 * active yet prevent them suddenly becoming cpu hogs and
 		 * starving other processes.
 		 */
-		if (p->mm && p->sleep_type != SLEEP_NONINTERACTIVE &&
-			sleep_time > INTERACTIVE_SLEEP(p)) {
+		if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
 				unsigned long ceiling;
 
 				ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
-- 
cgit v1.3-6-gb490


From 7c4bb1f9b3788309e1159961c606ba0bdf7ed382 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 31 Mar 2006 02:31:29 -0800
Subject: [PATCH] sched: remove on runqueue requeueing

On runqueue time is used to elevate priority in schedule().

In the code it currently requeues tasks even if their priority is not
elevated, which would end up placing them at the end of their runqueue
array effectively delaying them instead of improving their priority.

Bug spotted by Mike Galbraith <efault@gmx.de>

This patch removes this requeueing.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3055fe806ff7..73bb4d9ef989 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3023,8 +3023,7 @@ go_idle:
 			dequeue_task(next, array);
 			next->prio = new_prio;
 			enqueue_task(next, array);
-		} else
-			requeue_task(next, array);
+		}
 	}
 	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
-- 
cgit v1.3-6-gb490


From d425b274ba83ba4e7746a40446ec0ba3267de51f Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 31 Mar 2006 02:31:29 -0800
Subject: [PATCH] sched: activate SCHED BATCH expired

To increase the strength of SCHED_BATCH as a scheduling hint we can
activate batch tasks on the expired array since by definition they are
latency insensitive tasks.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  1 +
 kernel/sched.c        | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c4fd3fcd3feb..78c40dd2e19a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -484,6 +484,7 @@ struct signal_struct {
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 
 #define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
 
 /*
  * Some day this will be a full-fledged user tracking system..
diff --git a/kernel/sched.c b/kernel/sched.c
index 73bb4d9ef989..dd153d6f8a04 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -667,9 +667,13 @@ static int effective_prio(task_t *p)
 /*
  * __activate_task - move a task to the runqueue.
  */
-static inline void __activate_task(task_t *p, runqueue_t *rq)
+static void __activate_task(task_t *p, runqueue_t *rq)
 {
-	enqueue_task(p, rq->active);
+	prio_array_t *target = rq->active;
+
+	if (batch_task(p))
+		target = rq->expired;
+	enqueue_task(p, target);
 	rq->nr_running++;
 }
 
@@ -688,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 	unsigned long long __sleep_time = now - p->timestamp;
 	unsigned long sleep_time;
 
-	if (unlikely(p->policy == SCHED_BATCH))
+	if (batch_task(p))
 		sleep_time = 0;
 	else {
 		if (__sleep_time > NS_MAX_SLEEP_AVG)
-- 
cgit v1.3-6-gb490


From 9741ef964dc8bfeb6520825df9fed8f538c3336e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 31 Mar 2006 02:31:32 -0800
Subject: [PATCH] futex: check and validate timevals

The futex timeval is not checked for correctness.  The change does not
break existing applications as the timeval is supplied by glibc (and glibc
always passes a correct value), but the glibc-internal tests for this
functionality fail.

Signed-off-by: Thomas Gleixner <tglx@tglx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c        | 4 +++-
 kernel/futex_compat.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 9c9b2b6b22dd..5699c512057b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1039,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	int val2 = 0;
 
-	if ((op == FUTEX_WAIT) && utime) {
+	if (utime && (op == FUTEX_WAIT)) {
 		if (copy_from_user(&t, utime, sizeof(t)) != 0)
 			return -EFAULT;
+		if (!timespec_valid(&t))
+			return -EINVAL;
 		timeout = timespec_to_jiffies(&t) + 1;
 	}
 	/*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 54274fc85321..1ab6a0ea3d14 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,9 +129,11 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	int val2 = 0;
 
-	if ((op == FUTEX_WAIT) && utime) {
+	if (utime && (op == FUTEX_WAIT)) {
 		if (get_compat_timespec(&t, utime))
 			return -EFAULT;
+		if (!timespec_valid(&t))
+			return -EINVAL;
 		timeout = timespec_to_jiffies(&t) + 1;
 	}
 	if (op >= FUTEX_REQUEUE)
-- 
cgit v1.3-6-gb490


From 390e2ff07712468ce6600a43aa91e897b056ce12 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 31 Mar 2006 02:31:33 -0800
Subject: [PATCH] Make setsid() more robust

The core problem: setsid fails if it is called by init.  The effect in 2.6.16
and the earlier kernels that have this problem is that if you do a "ps -j 1 or
ps -ej 1" you will see that init and several of it's children have process
group and session == 0.  Instead of process group == session == 1.  Despite
init calling setsid.

The reason it fails is that daemonize calls set_special_pids(1,1) on kernel
threads that are launched before /sbin/init is called.

The only remaining effect in that current->signal->leader == 0 for init
instead of 1.  And the setsid call fails.  No one has noticed because
/sbin/init does not check the return value of setsid.

In 2.4 where we don't have the pidhash table, and daemonize doesn't exist
setsid actually works for init.

I care a lot about pid == 1 not being a special case that we leave broken,
because of the container/jail work that I am doing.

- Carefully allow init (pid == 1) to call setsid despite the kernel using
  its session.

- Use find_task_by_pid instead of find_pid because find_pid taking a
  pidtype is going away.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 7ef7f6054c28..0b6ec0e7936f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1372,18 +1372,29 @@ asmlinkage long sys_getsid(pid_t pid)
 asmlinkage long sys_setsid(void)
 {
 	struct task_struct *group_leader = current->group_leader;
-	struct pid *pid;
+	pid_t session;
 	int err = -EPERM;
 
 	mutex_lock(&tty_mutex);
 	write_lock_irq(&tasklist_lock);
 
-	pid = find_pid(PIDTYPE_PGID, group_leader->pid);
-	if (pid)
+	/* Fail if I am already a session leader */
+	if (group_leader->signal->leader)
+		goto out;
+
+	session = group_leader->pid;
+	/* Fail if a process group id already exists that equals the
+	 * proposed session id.
+	 *
+	 * Don't check if session id == 1 because kernel threads use this
+	 * session id and so the check will always fail and make it so
+	 * init cannot successfully call setsid.
+	 */
+	if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
 		goto out;
 
 	group_leader->signal->leader = 1;
-	__set_special_pids(group_leader->pid, group_leader->pid);
+	__set_special_pids(session, session);
 	group_leader->signal->tty = NULL;
 	group_leader->signal->tty_old_pgrp = 0;
 	err = process_group(group_leader);
-- 
cgit v1.3-6-gb490


From 158d9ebd19280582da172626ad3edda1a626dace Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 31 Mar 2006 02:31:34 -0800
Subject: [PATCH] resurrect __put_task_struct

This just got nuked in mainline.  Bring it back because Eric's patches use it.

Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  1 +
 kernel/fork.c         | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c40dd2e19a..95f248ba36c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -906,6 +906,7 @@ extern void free_task(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
 extern void __put_task_struct_cb(struct rcu_head *rhp);
+extern void __put_task_struct(struct task_struct *t);
 
 static inline void put_task_struct(struct task_struct *t)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index b3f7a1bb5e55..b1341205be27 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -108,10 +108,8 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
-void __put_task_struct_cb(struct rcu_head *rhp)
+void __put_task_struct(struct task_struct *tsk)
 {
-	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
-
 	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
@@ -126,6 +124,12 @@ void __put_task_struct_cb(struct rcu_head *rhp)
 		free_task(tsk);
 }
 
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+	__put_task_struct(tsk);
+}
+
 void __init fork_init(unsigned long mempages)
 {
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-- 
cgit v1.3-6-gb490


From 8c7904a00b06d2ee51149794b619e07369fcf9d4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 31 Mar 2006 02:31:37 -0800
Subject: [PATCH] task: RCU protect task->usage

A big problem with rcu protected data structures that are also reference
counted is that you must jump through several hoops to increase the reference
count.  I think someone finally implemented atomic_inc_not_zero(&count) to
automate the common case.  Unfortunately this means you must special case the
rcu access case.

When data structures are only visible via rcu in a manner that is not
determined by the reference count on the object (i.e.  tasks are visible until
their zombies are reaped) there is a much simpler technique we can employ.
Simply delaying the decrement of the reference count until the rcu interval is
over.

What that means is that the proc code that looks up a task and later
wants to sleep can now do:

rcu_read_lock();
task = find_task_by_pid(some_pid);
if (task) {
	get_task_struct(task);
}
rcu_read_unlock();

The effect on the rest of the kernel is that put_task_struct becomes cheaper
and immediate, and in the case where the task has been reaped it frees the
task immediate instead of unnecessarily waiting an until the rcu interval is
over.

Cleanup of task_struct does not happen when its reference count drops to
zero, instead cleanup happens when release_task is called.  Tasks can only
be looked up via rcu before release_task is called.  All rcu protected
members of task_struct are freed by release_task.

Therefore we can move call_rcu from put_task_struct into release_task.  And
we can modify release_task to not immediately release the reference count
but instead have it call put_task_struct from the function it gives to
call_rcu.

The end result:

- get_task_struct is safe in an rcu context where we have just looked
  up the task.

- put_task_struct() simplifies into its old pre rcu self.

This reorganization also makes put_task_struct uncallable from modules as
it is not exported but it does not appear to be called from any modules so
this should not be an issue, and is trivially fixed.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 2 +-
 kernel/exit.c         | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 95f248ba36c9..7e0ff5dba986 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,7 +911,7 @@ extern void __put_task_struct(struct task_struct *t);
 static inline void put_task_struct(struct task_struct *t)
 {
 	if (atomic_dec_and_test(&t->usage))
-		call_rcu(&t->rcu, __put_task_struct_cb);
+		__put_task_struct(t);
 }
 
 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index bc0ec674d3f4..6c2eeb8f6390 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -127,6 +127,11 @@ static void __exit_signal(struct task_struct *tsk)
 	}
 }
 
+static void delayed_put_task_struct(struct rcu_head *rhp)
+{
+	put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
 void release_task(struct task_struct * p)
 {
 	int zap_leader;
@@ -168,7 +173,7 @@ repeat:
 	spin_unlock(&p->proc_lock);
 	proc_pid_flush(proc_dentry);
 	release_thread(p);
-	put_task_struct(p);
+	call_rcu(&p->rcu, delayed_put_task_struct);
 
 	p = leader;
 	if (unlikely(zap_leader))
-- 
cgit v1.3-6-gb490


From 92476d7fc0326a409ab1d3864a04093a6be9aca7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 31 Mar 2006 02:31:42 -0800
Subject: [PATCH] pidhash: Refactor the pid hash table

Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.

In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work.  With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed.  Instead of
playing with the hash lists when ever a process would attach or detach to a
process.

For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win.  The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory.  If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.

If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller.  In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.

This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct.  struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.

The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.

Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pid.h   |  96 +++++++++++++++++++----
 include/linux/sched.h |   4 +-
 kernel/fork.c         |  16 ++--
 kernel/pid.c          | 212 ++++++++++++++++++++++++++++++++++----------------
 4 files changed, 238 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 5b9082cc600f..29960b03bef7 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_PID_H
 #define _LINUX_PID_H
 
+#include <linux/rcupdate.h>
+
 enum pid_type
 {
 	PIDTYPE_PID,
@@ -9,45 +11,109 @@ enum pid_type
 	PIDTYPE_MAX
 };
 
+/*
+ * What is struct pid?
+ *
+ * A struct pid is the kernel's internal notion of a process identifier.
+ * It refers to individual tasks, process groups, and sessions.  While
+ * there are processes attached to it the struct pid lives in a hash
+ * table, so it and then the processes that it refers to can be found
+ * quickly from the numeric pid value.  The attached processes may be
+ * quickly accessed by following pointers from struct pid.
+ *
+ * Storing pid_t values in the kernel and refering to them later has a
+ * problem.  The process originally with that pid may have exited and the
+ * pid allocator wrapped, and another process could have come along
+ * and been assigned that pid.
+ *
+ * Referring to user space processes by holding a reference to struct
+ * task_struct has a problem.  When the user space process exits
+ * the now useless task_struct is still kept.  A task_struct plus a
+ * stack consumes around 10K of low kernel memory.  More precisely
+ * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
+ * a struct pid is about 64 bytes.
+ *
+ * Holding a reference to struct pid solves both of these problems.
+ * It is small so holding a reference does not consume a lot of
+ * resources, and since a new struct pid is allocated when the numeric
+ * pid value is reused we don't mistakenly refer to new processes.
+ */
+
 struct pid
 {
+	atomic_t count;
 	/* Try to keep pid_chain in the same cacheline as nr for find_pid */
 	int nr;
 	struct hlist_node pid_chain;
-	/* list of pids with the same nr, only one of them is in the hash */
-	struct list_head pid_list;
+	/* lists of tasks that use this pid */
+	struct hlist_head tasks[PIDTYPE_MAX];
+	struct rcu_head rcu;
 };
 
-#define pid_task(elem, type) \
-	list_entry(elem, struct task_struct, pids[type].pid_list)
+struct pid_link
+{
+	struct hlist_node node;
+	struct pid *pid;
+};
+
+static inline struct pid *get_pid(struct pid *pid)
+{
+	if (pid)
+		atomic_inc(&pid->count);
+	return pid;
+}
+
+extern void FASTCALL(put_pid(struct pid *pid));
+extern struct task_struct *FASTCALL(pid_task(struct pid *pid, enum pid_type));
+extern struct task_struct *FASTCALL(get_pid_task(struct pid *pid,
+						enum pid_type));
 
 /*
  * attach_pid() and detach_pid() must be called with the tasklist_lock
  * write-held.
  */
-extern int FASTCALL(attach_pid(struct task_struct *task, enum pid_type type, int nr));
+extern int FASTCALL(attach_pid(struct task_struct *task,
+				enum pid_type type, int nr));
 
 extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type));
 
 /*
  * look up a PID in the hash table. Must be called with the tasklist_lock
- * held.
+ * or rcu_read_lock() held.
+ */
+extern struct pid *FASTCALL(find_pid(int nr));
+
+/*
+ * Lookup a PID in the hash table, and return with it's count elevated.
  */
-extern struct pid *FASTCALL(find_pid(enum pid_type, int));
+extern struct pid *find_get_pid(int nr);
 
-extern int alloc_pidmap(void);
-extern void FASTCALL(free_pidmap(int));
+extern struct pid *alloc_pid(void);
+extern void FASTCALL(free_pid(struct pid *pid));
 
+#define pid_next(task, type)					\
+	((task)->pids[(type)].node.next)
+
+#define pid_next_task(task, type) 				\
+	hlist_entry(pid_next(task, type), struct task_struct,	\
+			pids[(type)].node)
+
+
+/* We could use hlist_for_each_entry_rcu here but it takes more arguments
+ * than the do_each_task_pid/while_each_task_pid.  So we roll our own
+ * to preserve the existing interface.
+ */
 #define do_each_task_pid(who, type, task)				\
 	if ((task = find_task_by_pid_type(type, who))) {		\
-		prefetch((task)->pids[type].pid_list.next);		\
+		prefetch(pid_next(task, type));				\
 		do {
 
 #define while_each_task_pid(who, type, task)				\
-		} while (task = pid_task((task)->pids[type].pid_list.next,\
-						type),			\
-			prefetch((task)->pids[type].pid_list.next),	\
-			hlist_unhashed(&(task)->pids[type].pid_chain));	\
-	}								\
+		} while (pid_next(task, type) &&  ({			\
+				task = pid_next_task(task, type);	\
+				rcu_dereference(task);			\
+				prefetch(pid_next(task, type));		\
+				1; }) );				\
+	}
 
 #endif /* _LINUX_PID_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7e0ff5dba986..541f4828f5e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -760,7 +760,7 @@ struct task_struct {
 	struct task_struct *group_leader;	/* threadgroup leader */
 
 	/* PID/PID hash table linkage. */
-	struct pid pids[PIDTYPE_MAX];
+	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
 
 	struct completion *vfork_done;		/* for vfork() */
@@ -899,7 +899,7 @@ static inline pid_t process_group(struct task_struct *tsk)
  */
 static inline int pid_alive(struct task_struct *p)
 {
-	return p->pids[PIDTYPE_PID].nr != 0;
+	return p->pids[PIDTYPE_PID].pid != NULL;
 }
 
 extern void free_task(struct task_struct *tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b1341205be27..03975d0467f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1315,17 +1315,19 @@ long do_fork(unsigned long clone_flags,
 {
 	struct task_struct *p;
 	int trace = 0;
-	long pid = alloc_pidmap();
+	struct pid *pid = alloc_pid();
+	long nr;
 
-	if (pid < 0)
+	if (!pid)
 		return -EAGAIN;
+	nr = pid->nr;
 	if (unlikely(current->ptrace)) {
 		trace = fork_traceflag (clone_flags);
 		if (trace)
 			clone_flags |= CLONE_PTRACE;
 	}
 
-	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
+	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1352,7 +1354,7 @@ long do_fork(unsigned long clone_flags,
 			p->state = TASK_STOPPED;
 
 		if (unlikely (trace)) {
-			current->ptrace_message = pid;
+			current->ptrace_message = nr;
 			ptrace_notify ((trace << 8) | SIGTRAP);
 		}
 
@@ -1362,10 +1364,10 @@ long do_fork(unsigned long clone_flags,
 				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
 		}
 	} else {
-		free_pidmap(pid);
-		pid = PTR_ERR(p);
+		free_pid(pid);
+		nr = PTR_ERR(p);
 	}
-	return pid;
+	return nr;
 }
 
 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
diff --git a/kernel/pid.c b/kernel/pid.c
index a9f2dfd006d2..eeb836b65ca4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
 #include <linux/hash.h>
 
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
-static struct hlist_head *pid_hash[PIDTYPE_MAX];
+static struct hlist_head *pid_hash;
 static int pidhash_shift;
+static kmem_cache_t *pid_cachep;
 
 int pid_max = PID_MAX_DEFAULT;
 int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
 static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
 	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
 
+/*
+ * Note: disable interrupts while the pidmap_lock is held as an
+ * interrupt might come in and do read_lock(&tasklist_lock).
+ *
+ * If we don't disable interrupts there is a nasty deadlock between
+ * detach_pid()->free_pid() and another cpu that does
+ * spin_lock(&pidmap_lock) followed by an interrupt routine that does
+ * read_lock(&tasklist_lock);
+ *
+ * After we clean up the tasklist_lock and know there are no
+ * irq handlers that take it we can leave the interrupts enabled.
+ * For now it is easier to be safe than to prove it can't happen.
+ */
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-fastcall void free_pidmap(int pid)
+static fastcall void free_pidmap(int pid)
 {
 	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
 	int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
 	atomic_inc(&map->nr_free);
 }
 
-int alloc_pidmap(void)
+static int alloc_pidmap(void)
 {
 	int i, offset, max_scan, pid, last = last_pid;
 	pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
 			 * Free the page if someone raced with us
 			 * installing it:
 			 */
-			spin_lock(&pidmap_lock);
+			spin_lock_irq(&pidmap_lock);
 			if (map->page)
 				free_page(page);
 			else
 				map->page = (void *)page;
-			spin_unlock(&pidmap_lock);
+			spin_unlock_irq(&pidmap_lock);
 			if (unlikely(!map->page))
 				break;
 		}
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
 	return -1;
 }
 
-struct pid * fastcall find_pid(enum pid_type type, int nr)
+fastcall void put_pid(struct pid *pid)
+{
+	if (!pid)
+		return;
+	if ((atomic_read(&pid->count) == 1) ||
+	     atomic_dec_and_test(&pid->count))
+		kmem_cache_free(pid_cachep, pid);
+}
+
+static void delayed_put_pid(struct rcu_head *rhp)
+{
+	struct pid *pid = container_of(rhp, struct pid, rcu);
+	put_pid(pid);
+}
+
+fastcall void free_pid(struct pid *pid)
+{
+	/* We can be called with write_lock_irq(&tasklist_lock) held */
+	unsigned long flags;
+
+	spin_lock_irqsave(&pidmap_lock, flags);
+	hlist_del_rcu(&pid->pid_chain);
+	spin_unlock_irqrestore(&pidmap_lock, flags);
+
+	free_pidmap(pid->nr);
+	call_rcu(&pid->rcu, delayed_put_pid);
+}
+
+struct pid *alloc_pid(void)
+{
+	struct pid *pid;
+	enum pid_type type;
+	int nr = -1;
+
+	pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
+	if (!pid)
+		goto out;
+
+	nr = alloc_pidmap();
+	if (nr < 0)
+		goto out_free;
+
+	atomic_set(&pid->count, 1);
+	pid->nr = nr;
+	for (type = 0; type < PIDTYPE_MAX; ++type)
+		INIT_HLIST_HEAD(&pid->tasks[type]);
+
+	spin_lock_irq(&pidmap_lock);
+	hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
+	spin_unlock_irq(&pidmap_lock);
+
+out:
+	return pid;
+
+out_free:
+	kmem_cache_free(pid_cachep, pid);
+	pid = NULL;
+	goto out;
+}
+
+struct pid * fastcall find_pid(int nr)
 {
 	struct hlist_node *elem;
 	struct pid *pid;
 
 	hlist_for_each_entry_rcu(pid, elem,
-			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
+			&pid_hash[pid_hashfn(nr)], pid_chain) {
 		if (pid->nr == nr)
 			return pid;
 	}
@@ -146,77 +220,82 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
 
 int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
 {
-	struct pid *pid, *task_pid;
-
-	task_pid = &task->pids[type];
-	pid = find_pid(type, nr);
-	task_pid->nr = nr;
-	if (pid == NULL) {
-		INIT_LIST_HEAD(&task_pid->pid_list);
-		hlist_add_head_rcu(&task_pid->pid_chain,
-				   &pid_hash[type][pid_hashfn(nr)]);
-	} else {
-		INIT_HLIST_NODE(&task_pid->pid_chain);
-		list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
-	}
+	struct pid_link *link;
+	struct pid *pid;
+
+	WARN_ON(!task->pid); /* to be removed soon */
+	WARN_ON(!nr); /* to be removed soon */
+
+	link = &task->pids[type];
+	link->pid = pid = find_pid(nr);
+	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
 
 	return 0;
 }
 
-static fastcall int __detach_pid(task_t *task, enum pid_type type)
+void fastcall detach_pid(task_t *task, enum pid_type type)
 {
-	struct pid *pid, *pid_next;
-	int nr = 0;
+	struct pid_link *link;
+	struct pid *pid;
+	int tmp;
 
-	pid = &task->pids[type];
-	if (!hlist_unhashed(&pid->pid_chain)) {
+	link = &task->pids[type];
+	pid = link->pid;
 
-		if (list_empty(&pid->pid_list)) {
-			nr = pid->nr;
-			hlist_del_rcu(&pid->pid_chain);
-		} else {
-			pid_next = list_entry(pid->pid_list.next,
-						struct pid, pid_list);
-			/* insert next pid from pid_list to hash */
-			hlist_replace_rcu(&pid->pid_chain,
-					  &pid_next->pid_chain);
-		}
-	}
+	hlist_del_rcu(&link->node);
+	link->pid = NULL;
 
-	list_del_rcu(&pid->pid_list);
-	pid->nr = 0;
+	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+		if (!hlist_empty(&pid->tasks[tmp]))
+			return;
 
-	return nr;
+	free_pid(pid);
 }
 
-void fastcall detach_pid(task_t *task, enum pid_type type)
+struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
 {
-	int tmp, nr;
+	struct task_struct *result = NULL;
+	if (pid) {
+		struct hlist_node *first;
+		first = rcu_dereference(pid->tasks[type].first);
+		if (first)
+			result = hlist_entry(first, struct task_struct, pids[(type)].node);
+	}
+	return result;
+}
 
-	nr = __detach_pid(task, type);
-	if (!nr)
-		return;
+/*
+ * Must be called under rcu_read_lock() or with tasklist_lock read-held.
+ */
+task_t *find_task_by_pid_type(int type, int nr)
+{
+	return pid_task(find_pid(nr), type);
+}
 
-	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
-		if (tmp != type && find_pid(tmp, nr))
-			return;
+EXPORT_SYMBOL(find_task_by_pid_type);
 
-	free_pidmap(nr);
+struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
+{
+	struct task_struct *result;
+	rcu_read_lock();
+	result = pid_task(pid, type);
+	if (result)
+		get_task_struct(result);
+	rcu_read_unlock();
+	return result;
 }
 
-task_t *find_task_by_pid_type(int type, int nr)
+struct pid *find_get_pid(pid_t nr)
 {
 	struct pid *pid;
 
-	pid = find_pid(type, nr);
-	if (!pid)
-		return NULL;
+	rcu_read_lock();
+	pid = get_pid(find_pid(nr));
+	rcu_read_unlock();
 
-	return pid_task(&pid->pid_list, type);
+	return pid;
 }
 
-EXPORT_SYMBOL(find_task_by_pid_type);
-
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -224,7 +303,7 @@ EXPORT_SYMBOL(find_task_by_pid_type);
  */
 void __init pidhash_init(void)
 {
-	int i, j, pidhash_size;
+	int i, pidhash_size;
 	unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
 
 	pidhash_shift = max(4, fls(megabytes * 4));
@@ -233,16 +312,13 @@ void __init pidhash_init(void)
 
 	printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
 		pidhash_size, pidhash_shift,
-		PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head));
-
-	for (i = 0; i < PIDTYPE_MAX; i++) {
-		pid_hash[i] = alloc_bootmem(pidhash_size *
-					sizeof(*(pid_hash[i])));
-		if (!pid_hash[i])
-			panic("Could not alloc pidhash!\n");
-		for (j = 0; j < pidhash_size; j++)
-			INIT_HLIST_HEAD(&pid_hash[i][j]);
-	}
+		pidhash_size * sizeof(struct hlist_head));
+
+	pid_hash = alloc_bootmem(pidhash_size *	sizeof(*(pid_hash)));
+	if (!pid_hash)
+		panic("Could not alloc pidhash!\n");
+	for (i = 0; i < pidhash_size; i++)
+		INIT_HLIST_HEAD(&pid_hash[i]);
 }
 
 void __init pidmap_init(void)
@@ -251,4 +327,8 @@ void __init pidmap_init(void)
 	/* Reserve PID 0. We never call free_pidmap(0) */
 	set_bit(0, pidmap_array->page);
 	atomic_dec(&pidmap_array->nr_free);
+
+	pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
+					__alignof__(struct pid),
+					SLAB_PANIC, NULL, NULL);
 }
-- 
cgit v1.3-6-gb490


From 428622986858aebddc32d022af65e88b9d2ea8bb Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Fri, 31 Mar 2006 17:58:46 +0400
Subject: [PATCH] wrong error path in dup_fd() leading to oopses in RCU

Wrong error path in dup_fd() - it should return NULL on error,
not an address of already freed memory :/

Triggered by OpenVZ stress test suite.

What is interesting is that it was causing different oopses in RCU like
below:
Call Trace:
   [<c013492c>] rcu_do_batch+0x2c/0x80
   [<c0134bdd>] rcu_process_callbacks+0x3d/0x70
   [<c0126cf3>] tasklet_action+0x73/0xe0
   [<c01269aa>] __do_softirq+0x10a/0x130
   [<c01058ff>] do_softirq+0x4f/0x60
   =======================
   [<c0113817>] smp_apic_timer_interrupt+0x77/0x110
   [<c0103b54>] apic_timer_interrupt+0x1c/0x24
  Code:  Bad EIP value.
   <0>Kernel panic - not syncing: Fatal exception in interrupt

Signed-Off-By: Pavel Emelianov <xemul@sw.ru>
Signed-Off-By: Dmitry Mishin <dim@openvz.org>
Signed-Off-By: Kirill Korotaev <dev@openvz.org>
Signed-Off-By: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 03975d0467f9..3384eb89cb1c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -725,7 +725,7 @@ out_release:
 	free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
 	free_fd_array(new_fdt->fd, new_fdt->max_fds);
 	kmem_cache_free(files_cachep, newf);
-	goto out;
+	return NULL;
 }
 
 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
-- 
cgit v1.3-6-gb490