From 8feb8e896d77439146d2e2ab3d0ab55bb5baf5fc Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 29 May 2012 15:16:05 +0800 Subject: smp: Remove ipi_call_lock[_irq]()/ipi_call_unlock[_irq]() There is no user of those APIs anymore, just remove them. Signed-off-by: Yong Zhang Cc: ralf@linux-mips.org Cc: sshtylyov@mvista.com Cc: david.daney@cavium.com Cc: nikunj@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: axboe@kernel.dk Cc: Andrew Morton Link: http://lkml.kernel.org/r/1338275765-3217-11-git-send-email-yong.zhang0@gmail.com Acked-by: Srivatsa S. Bhat Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/smp.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index d0ae5b24875e..29dd40a9f2f4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); - -void ipi_call_lock(void) -{ - raw_spin_lock(&call_function.lock); -} - -void ipi_call_unlock(void) -{ - raw_spin_unlock(&call_function.lock); -} - -void ipi_call_lock_irq(void) -{ - raw_spin_lock_irq(&call_function.lock); -} - -void ipi_call_unlock_irq(void) -{ - raw_spin_unlock_irq(&call_function.lock); -} #endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ -- cgit v1.2.3-59-g8ed1b From ec44bc7acc3687ba6ae8154b4b5a845b70279237 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:57 +0000 Subject: timers: Create detach_if_pending() and use it Most callers of detach_timer() have the same pattern around them: check whether the timer is pending and, if so, update base->next_timer. Create detach_if_pending() and replace the duplicated code. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.131246037@linutronix.de --- kernel/timer.c | 56 +++++++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6ec7e7e0db43..0f70deb20151 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -654,8 +654,7 @@ void init_timer_deferrable_key(struct timer_list *timer, } EXPORT_SYMBOL(init_timer_deferrable_key); -static inline void detach_timer(struct timer_list *timer, - int clear_pending) +static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct list_head *entry = &timer->entry; @@ -667,6 +666,19 @@ static inline void detach_timer(struct timer_list *timer, entry->prev = LIST_POISON2; } +static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, + bool clear_pending) +{ + if (!timer_pending(timer)) + return 0; + + detach_timer(timer, clear_pending); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; + return 1; +} + /* * We are using hashed locking: holding per_cpu(tvec_bases).lock * means that all timers which are tied to this base via timer->base are @@ -712,16 +724,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 0); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } else { - if (pending_only) - goto out_unlock; - } + ret = detach_if_pending(timer, base, false); + if (!ret && pending_only) + goto out_unlock; debug_activate(timer, expires); @@ -959,13 +964,7 @@ int del_timer(struct timer_list *timer) timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } + ret = detach_if_pending(timer, base, true); spin_unlock_irqrestore(&base->lock, flags); } @@ -990,19 +989,10 @@ int try_to_del_timer_sync(struct timer_list *timer) base = lock_timer_base(timer, &flags); - if (base->running_timer == timer) - goto out; - - timer_stats_timer_clear_start_info(timer); - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; + if (base->running_timer != timer) { + timer_stats_timer_clear_start_info(timer); + ret = detach_if_pending(timer, base, true); } -out: spin_unlock_irqrestore(&base->lock, flags); return ret; @@ -1178,7 +1168,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); base->running_timer = timer; - detach_timer(timer, 1); + detach_timer(timer, true); spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); @@ -1714,7 +1704,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea while (!list_empty(head)) { timer = list_first_entry(head, struct timer_list, entry); - detach_timer(timer, 0); + detach_timer(timer, false); timer_set_base(timer, new_base); if (time_before(timer->expires, new_base->next_timer) && !tbase_get_deferrable(timer->base)) -- cgit v1.2.3-59-g8ed1b From 
facbb4a7efbd658046bf615f03cd97a1504785d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:57 +0000 Subject: timers: Consolidate base->next_timer update Another bunch of mindlessly copied code. All callers of internal_add_timer() except the recascading code updates base->next_timer. Move this into internal_add_timer() and let the cascading code call __internal_add_timer(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.189946224@linutronix.de --- kernel/timer.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0f70deb20151..7207690b5353 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -330,7 +330,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; @@ -372,6 +373,17 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + /* + * Update base->next_timer if this is the earliest one. + */ + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; +} + #ifdef CONFIG_TIMER_STATS void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) { @@ -757,9 +769,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -925,9 +934,6 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); debug_activate(timer, timer->expires); - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -1079,7 +1085,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { BUG_ON(tbase_get_base(timer->base) != base); - internal_add_timer(base, timer); + /* No accounting, while moving them */ + __internal_add_timer(base, timer); } return index; @@ -1706,9 +1713,6 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, false); timer_set_base(timer, new_base); - if (time_before(timer->expires, new_base->next_timer) && - !tbase_get_deferrable(timer->base)) - new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } -- cgit v1.2.3-59-g8ed1b From 99d5f3aac674fe081ffddd2dbb8946ccbc14c410 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:58 +0000 Subject: timers: Add accounting of non deferrable timers The code in get_next_timer_interrupt() is suboptimal as it has to run through the cascade to find the next expiring timer. 
On a completely idle core we should only do that when there is an active timer enqueued and base->next_timer does not give us a fast answer. Add accounting of the active timers to the now consolidated attach/detach code. I deliberately avoided sanity checks because the code is fully symetric and any fiddling with timers w/o using the API functions will lead to cute explosions anyway. ulong is big enough even on 32bit and if we really run into the situation to have more than 1<<32 timers enqueued there, then we are definitely not in a state to go idle and run through that code. This allows us to fix another shortcoming of get_next_timer_interrupt(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Gilad Ben-Yossef Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.236377028@linutronix.de --- kernel/timer.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 7207690b5353..7fada698bd1a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -77,6 +77,7 @@ struct tvec_base { struct timer_list *running_timer; unsigned long timer_jiffies; unsigned long next_timer; + unsigned long active_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -377,11 +378,13 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { __internal_add_timer(base, timer); /* - * Update base->next_timer if this is the earliest one. + * Update base->active_timers and base->next_timer */ - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; + if (!tbase_get_deferrable(timer->base)) { + if (time_before(timer->expires, base->next_timer)) + base->next_timer = timer->expires; + base->active_timers++; + } } #ifdef CONFIG_TIMER_STATS @@ -678,6 +681,14 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) entry->prev = LIST_POISON2; } +static inline void +detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +{ + detach_timer(timer, true); + if (!tbase_get_deferrable(timer->base)) + timer->base->active_timers--; +} + static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, bool clear_pending) { @@ -685,9 +696,11 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, return 0; detach_timer(timer, clear_pending); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; + if (!tbase_get_deferrable(timer->base)) { + timer->base->active_timers--; + if (timer->expires == base->next_timer) + base->next_timer = base->timer_jiffies; + } return 1; } @@ -1175,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); base->running_timer = timer; - detach_timer(timer, true); + detach_expired_timer(timer, base); spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); @@ -1701,6 +1714,7 @@ static int __cpuinit init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; + base->active_timers = 0; return 0; } @@ -1711,6 +1725,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea while (!list_empty(head)) { timer = list_first_entry(head, struct timer_list, entry); + /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); timer_set_base(timer, new_base); internal_add_timer(new_base, timer); -- cgit v1.2.3-59-g8ed1b 
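To make the accounting concrete, here is a minimal, self-contained userspace sketch of the idea, not kernel code: the names (model_base, model_next_event and so on) are invented for illustration and the timer wheel is reduced to a flat array. Only non-deferrable timers touch the counter, so the "what is the next event?" question can be answered without any scan when the counter is zero, which is what the next patch exploits in get_next_timer_interrupt().

/* gcc -std=c99 -Wall model.c && ./a.out */
#include <stdbool.h>
#include <stdio.h>

#define MAX_TIMERS            16
#define NEXT_TIMER_MAX_DELTA  (1UL << 30)

struct model_timer {
    unsigned long expires;
    bool deferrable;
};

struct model_base {
    unsigned long timer_jiffies;   /* last jiffy processed by the wheel */
    unsigned long next_timer;      /* hint: earliest non-deferrable expiry */
    unsigned long active_timers;   /* number of pending non-deferrable timers */
    struct model_timer *slots[MAX_TIMERS];
};

/* Mirrors the accounting added to internal_add_timer() above. */
static void model_add_timer(struct model_base *base, struct model_timer *t)
{
    for (int i = 0; i < MAX_TIMERS; i++) {
        if (!base->slots[i]) {
            base->slots[i] = t;
            break;
        }
    }
    if (!t->deferrable) {
        if (t->expires < base->next_timer)
            base->next_timer = t->expires;
        base->active_timers++;
    }
}

/* Mirrors detach_if_pending()/detach_expired_timer(): drop the count again. */
static void model_del_timer(struct model_base *base, struct model_timer *t)
{
    for (int i = 0; i < MAX_TIMERS; i++) {
        if (base->slots[i] == t) {
            base->slots[i] = NULL;
            break;
        }
    }
    if (!t->deferrable)
        base->active_timers--;
}

/* The payoff: no scan at all when nothing non-deferrable is queued. */
static unsigned long model_next_event(struct model_base *base, unsigned long now)
{
    if (!base->active_timers)
        return now + NEXT_TIMER_MAX_DELTA;   /* "no future timer event" */
    return base->next_timer;
}

int main(void)
{
    struct model_base base = { .timer_jiffies = 100,
                               .next_timer = 100 + NEXT_TIMER_MAX_DELTA };
    struct model_timer lazy   = { .expires = 150, .deferrable = true };
    struct model_timer strict = { .expires = 180, .deferrable = false };

    model_add_timer(&base, &lazy);
    printf("deferrable only: active=%lu next=%lu\n",
           base.active_timers, model_next_event(&base, 100));
    model_add_timer(&base, &strict);
    printf("plus strict    : active=%lu next=%lu\n",
           base.active_timers, model_next_event(&base, 100));
    model_del_timer(&base, &strict);
    printf("strict removed : active=%lu next=%lu\n",
           base.active_timers, model_next_event(&base, 100));
    return 0;
}

Note that next_timer remains only a hint that can go stale; the kernel resets it to timer_jiffies when the matching timer is detached and recomputes it via __next_timer_interrupt() when needed. The counter is what makes the "nothing to do" case cheap and exact.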
From e40468a54882ef7411fb178dbf2e465ec2349af7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 May 2012 22:08:59 +0000 Subject: timers: Improve get_next_timer_interrupt() Gilad reported at http://lkml.kernel.org/r/1336056962-10465-2-git-send-email-gilad@benyossef.com "Current timer code fails to correctly return a value meaning that there is no future timer event, with the result that the timer keeps getting re-armed in HZ one shot mode even when we could turn it off, generating unneeded interrupts. What is happening is that when __next_timer_interrupt() wishes to return a value that signifies "there is no future timer event", it returns (base->timer_jiffies + NEXT_TIMER_MAX_DELTA). However, the code in tick_nohz_stop_sched_tick(), which called __next_timer_interrupt() via get_next_timer_interrupt(), compares the return value to (last_jiffies + NEXT_TIMER_MAX_DELTA) to see if the timer needs to be re-armed. base->timer_jiffies != last_jiffies and so tick_nohz_stop_sched_tick() interprets the return value as indication that there is a distant future event 12 days from now and programs the timer to fire next after KTIME_MAX nsecs instead of avoiding to arm it. This ends up causing a needless interrupt once every KTIME_MAX nsecs." Fix this by using the new active timer accounting. This completely avoids the cascade scan when no active timer is enqueued, so we don't have to rely on base->next_timer and base->timer_jiffies anymore. Reported-by: Gilad Ben-Yossef Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120525214819.317535385@linutronix.de --- kernel/timer.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 7fada698bd1a..a61c09374eba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1326,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, unsigned long get_next_timer_interrupt(unsigned long now) { struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires; + unsigned long expires = now + NEXT_TIMER_MAX_DELTA; /* * Pretend that there is no timer pending if the cpu is offline. * Possible pending timers will be migrated later to an active cpu. */ if (cpu_is_offline(smp_processor_id())) - return now + NEXT_TIMER_MAX_DELTA; + return expires; + spin_lock(&base->lock); - if (time_before_eq(base->next_timer, base->timer_jiffies)) - base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + if (base->active_timers) { + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; + } spin_unlock(&base->lock); if (time_before_eq(expires, now)) -- cgit v1.2.3-59-g8ed1b From c00b275043adc14d668f36266b890f0c53d46640 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:27:44 +0200 Subject: uprobes: Optimize is_swbp_at_addr() for current->mm Change is_swbp_at_addr() to try to avoid the costly read_opcode() if mm == current->mm; __copy_from_user_inatomic() should succeed in the likely case. Currently this optimization is not important, but we are going to add more is_swbp_at_addr(current->mm) callers. 
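The shape of this optimization can be sketched in userspace, with the caveat that this is only an analogue of the kernel code: reading our own address space is a plain load (the kernel additionally has to disable page faults around the inatomic copy, which a userspace sketch cannot show), while reading a foreign address space needs an expensive cross-address-space call, here process_vm_readv() standing in for read_opcode()/get_user_pages(). The helper name is_bp_at_addr() and the demo are invented for illustration; 0xcc is the x86 breakpoint (int3) opcode. In the demo only the fast path runs, because we only inspect our own process.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#define BP_INSN 0xcc    /* x86 int3 */

static int is_bp_at_addr(pid_t pid, const void *addr)
{
    uint8_t opcode;

    if (pid == getpid()) {
        /* Fast path: same address space, a plain load is enough. */
        memcpy(&opcode, addr, sizeof(opcode));
    } else {
        /* Slow path: cross-address-space read of one byte. */
        struct iovec local  = { &opcode, sizeof(opcode) };
        struct iovec remote = { (void *)addr, sizeof(opcode) };

        if (process_vm_readv(pid, &local, 1, &remote, 1, 0) != 1)
            return -1;
    }
    return opcode == BP_INSN;
}

int main(void)
{
    static const uint8_t bytes[] = { 0x90, BP_INSN, 0x90 };  /* nop, int3, nop */

    printf("offset 0: %d\n", is_bp_at_addr(getpid(), &bytes[0]));
    printf("offset 1: %d\n", is_bp_at_addr(getpid(), &bytes[1]));
    return 0;
}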
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192744.GA8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 985be4d80fe8..d0f5ec0dcdea 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -333,10 +333,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (current->mm == mm) { + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + } + result = read_opcode(mm, vaddr, &opcode); if (result) return result; - +out: if (is_swbp_insn(&opcode)) return 1; -- cgit v1.2.3-59-g8ed1b From a3d7bb47937b3a40b9f0c75655e97b3bb6407cbe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:27:59 +0200 Subject: uprobes: Change read_opcode() to use FOLL_FORCE set_orig_insn()->read_opcode() should not fail if the probed task did mprotect() after uprobe_register(), change it to use FOLL_FORCE. Without FOLL_WRITE this doesn't have any "side" effect but allows to read the !VM_READ memory. There is another reason for this change, we are going to use is_swbp_at_addr() from handle_swbp() which can race with another thread doing mprotect(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192759.GB8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d0f5ec0dcdea..a0dbc87a2ec6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -312,7 +312,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ void *vaddr_new; int ret; - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (ret <= 0) return ret; -- cgit v1.2.3-59-g8ed1b From 3a9ea0520f38def4a3915b91f82455b749f07d88 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:28:57 +0200 Subject: uprobes: Introduce find_active_uprobe() helper No functional changes. Move the "find uprobe" code from handle_swbp() to the new helper, find_active_uprobe(). Note: with or without this change, the find-active-uprobe logic is not exactly right. We can race with another thread which unmaps the memory with the valid uprobe before we take mm->mmap_sem. We can't find this uprobe simply because find_vma() fails. In this case we wrongly assume that this trap was not caused by uprobe and send the erroneous SIGTRAP. See the next changes. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192857.GC8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a0dbc87a2ec6..eaf4d55fd424 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1489,38 +1489,47 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } -/* - * Run handler and ask thread to singlestep. - * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. - */ -static void handle_swbp(struct pt_regs *regs) +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) { + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; struct vm_area_struct *vma; - struct uprobe_task *utask; - struct uprobe *uprobe; - struct mm_struct *mm; - unsigned long bp_vaddr; - uprobe = NULL; - bp_vaddr = uprobe_get_swbp_addr(regs); - mm = current->mm; down_read(&mm->mmap_sem); vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { - struct inode *inode; - loff_t offset; + if (vma && vma->vm_start <= bp_vaddr) { + if (valid_vma(vma, false)) { + struct inode *inode; + loff_t offset; - inode = vma->vm_file->f_mapping->host; - offset = bp_vaddr - vma->vm_start; - offset += (vma->vm_pgoff << PAGE_SHIFT); - uprobe = find_uprobe(inode, offset); + inode = vma->vm_file->f_mapping->host; + offset = bp_vaddr - vma->vm_start; + offset += (vma->vm_pgoff << PAGE_SHIFT); + uprobe = find_uprobe(inode, offset); + } } srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); current->uprobe_srcu_id = -1; up_read(&mm->mmap_sem); + return uprobe; +} + +/* + * Run handler and ask thread to singlestep. + * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. + */ +static void handle_swbp(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct uprobe *uprobe; + unsigned long bp_vaddr; + + bp_vaddr = uprobe_get_swbp_addr(regs); + uprobe = find_active_uprobe(bp_vaddr); + if (!uprobe) { /* No matching uprobe; signal SIGTRAP. */ send_sig(SIGTRAP, current, 0); -- cgit v1.2.3-59-g8ed1b From d790d34653ab20c74034902f5f0889bba807949a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:14 +0200 Subject: uprobes: Teach find_active_uprobe() to provide the "is_swbp" info A separate patch to simplify the review, and for the documentation. The patch adds another "int *is_swbp" argument to find_active_uprobe(), so far its only caller doesn't use this info. With this patch find_active_uprobe() additionally does: - if find_vma() + ->vm_start check fails, *is_swbp = -EFAULT - otherwise, if valid_vma() + find_uprobe() fails, it holds the result of is_swbp_at_addr(), can be negative too. The latter is only possible if we raced with another thread which did munmap/etc after we hit this bp. IOW. If find_active_uprobe(&is_swbp) returns NULL, the caller can look at is_swbp to figure out whether the current insn is bp or not, or detect the race with another thread if it is negative. Note: I think that performance-wise this change is fine. This adds is_swbp_at_addr(), but only if we raced with uprobe_unregister() or if we hit the "normal" int3 but this mm has uprobes as well. 
And even in this case the slow read_opcode() path is very unlikely, this insn recently triggered do_int3(), __copy_from_user_inatomic() shouldn't fail in the likely case. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192914.GD8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eaf4d55fd424..ee3df704e78a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1489,7 +1489,7 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; @@ -1497,7 +1497,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) down_read(&mm->mmap_sem); vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { struct inode *inode; @@ -1508,6 +1507,11 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) offset += (vma->vm_pgoff << PAGE_SHIFT); uprobe = find_uprobe(inode, offset); } + + if (!uprobe) + *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + } else { + *is_swbp = -EFAULT; } srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); @@ -1526,9 +1530,10 @@ static void handle_swbp(struct pt_regs *regs) struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr); + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { /* No matching uprobe; signal SIGTRAP. */ -- cgit v1.2.3-59-g8ed1b From 77fc4af1b59d12ab3b1467adf0a5204806853123 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:28 +0200 Subject: uprobes: Change register_for_each_vma() to take mm->mmap_sem for writing Change register_for_each_vma() to take mm->mmap_sem for writing. This is a bit unfortunate but hopefully not too bad, this is the slow path anyway. This is needed to ensure that find_active_uprobe() can not race with uprobe_register() which adds the new bp at the same bp_vaddr, after find_uprobe() fails and before is_swbp_at_addr_fast() checks the memory. IOW, this is needed to ensure that if find_active_uprobe() returns NULL but is_swbp == true, we can safely assume that it was the "normal" int3 and we should send SIGTRAP. There is another reason for this change. We are going to replace uprobes_state->count with MMF_ flags set by register/unregister and cleared by find_active_uprobe(), and set/clear shouldn't race with each other. 
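The property being bought here can be modelled with an ordinary reader-writer lock. The sketch below is a pthread illustration of the design choice, not the kernel code; mmap_sem, bp_installed and uprobe_registered are stand-in names. Because registration writes both pieces of state under the write lock, a reader that takes the lock (as the breakpoint handler takes mmap_sem for reading) observes either the fully pre-registration or the fully post-registration state, never a half-installed one.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool bp_installed;       /* "int3 written at bp_vaddr"      */
static bool uprobe_registered;  /* "uprobe visible in the rb-tree" */

static void *register_path(void *arg)
{
    (void)arg;
    pthread_rwlock_wrlock(&mmap_sem);   /* was taken for reading before the patch */
    bp_installed = true;
    uprobe_registered = true;
    pthread_rwlock_unlock(&mmap_sem);
    return NULL;
}

static void *trap_path(void *arg)
{
    (void)arg;
    pthread_rwlock_rdlock(&mmap_sem);
    /* Consistent snapshot: either both false or both true. */
    printf("bp=%d registered=%d\n", bp_installed, uprobe_registered);
    pthread_rwlock_unlock(&mmap_sem);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, register_path, NULL);
    pthread_create(&b, NULL, trap_path, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}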
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192928.GE8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ee3df704e78a..a2ed82b4808c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -853,12 +853,12 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) } mm = vi->mm; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, (unsigned long)vi->vaddr); if (!vma || !valid_vma(vma, is_register)) { list_del(&vi->probe_list); kfree(vi); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); continue; } @@ -867,7 +867,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr != vi->vaddr) { list_del(&vi->probe_list); kfree(vi); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); continue; } @@ -877,7 +877,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) else remove_breakpoint(uprobe, mm, vi->vaddr); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); if (is_register) { if (ret && ret == -EEXIST) -- cgit v1.2.3-59-g8ed1b From 56bb4cf6475d702d2fb00fc641aa6441097c0330 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:47 +0200 Subject: uprobes: Teach handle_swbp() to rely on "is_swbp" rather than uprobes_srcu Currently handle_swbp() assumes that it can't race with unregister, so it roughly does: if (find_uprobe(vaddr)) process_uprobe(); else send_sig(SIGTRAP); This relies on the not-really-working uprobes_srcu code we are going to remove, see the next patch. With this patch we rely on the result of is_swbp_at_addr(bp_vaddr) if find_uprobe() fails. If is_swbp == 1, then we hit the normal int3, we should send SIGTRAP. If is_swbp == 0, we raced with uprobe_unregister(), we simply restart this insn again. The "difficult" case is is_swbp == -EFAULT, when we can't read this memory. In this case I think we should restart too, and this is more correct compared to the current code which sends SIGTRAP. Ignoring ENOMEM/etc from get_user_pages(), this can only happen if another thread unmaps this memory before find_active_uprobe() takes mmap_sem. It would be better to pretend it was unmapped before this insn was executed, restart, and get SIGSEGV. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192947.GF8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a2ed82b4808c..1f02e3bbfc1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1530,14 +1530,26 @@ static void handle_swbp(struct pt_regs *regs) struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp; + int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { - /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + if (is_swbp > 0) { + /* No matching uprobe; signal SIGTRAP. 
*/ + send_sig(SIGTRAP, current, 0); + } else { + /* + * Either we raced with uprobe_unregister() or we can't + * access this memory. The latter is only possible if + * another thread plays with our ->mm. In both cases + * we can simply restart. If this vma was unmapped we + * can pretend this insn was not executed yet and get + * the (correct) SIGSEGV after restart. + */ + instruction_pointer_set(regs, bp_vaddr); + } return; } -- cgit v1.2.3-59-g8ed1b From 778b032d96909690c19d84f8d17c13be65ed6f8e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:30:08 +0200 Subject: uprobes: Kill uprobes_srcu/uprobe_srcu_id Kill the no longer needed uprobes_srcu/uprobe_srcu_id code. It doesn't really work anyway. synchronize_srcu() can only synchronize with the code "inside" the srcu_read_lock/srcu_read_unlock section, while uprobe_pre_sstep_notifier() does srcu_read_lock() _after_ we already hit the breakpoint. I guess this probably works "in practice". synchronize_srcu() is slow and it implies synchronize_sched(), and the probed task enters the non- preemptible section at the start of exception handler. Still this is not right at least in theory, and task->uprobe_srcu_id blows task_struct. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529193008.GG8057@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/events/uprobes.c | 22 +++------------------- 2 files changed, 3 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..6bd19655c1a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1569,7 +1569,6 @@ struct task_struct { #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; - int uprobe_srcu_id; #endif }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1f02e3bbfc1d..8c5e043cd309 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -38,7 +38,6 @@ #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE -static struct srcu_struct uprobes_srcu; static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ @@ -738,20 +737,14 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) } /* - * There could be threads that have hit the breakpoint and are entering the - * notifier code and trying to acquire the uprobes_treelock. The thread - * calling delete_uprobe() that is removing the uprobe from the rb_tree can - * race with these threads and might acquire the uprobes_treelock compared - * to some of the breakpoint hit threads. In such a case, the breakpoint - * hit threads will not find the uprobe. The current unregistering thread - * waits till all other threads have hit a breakpoint, to acquire the - * uprobes_treelock before the uprobe is removed from the rbtree. + * There could be threads that have already hit the breakpoint. They + * will recheck the current insn and restart if find_uprobe() fails. + * See find_active_uprobe(). 
*/ static void delete_uprobe(struct uprobe *uprobe) { unsigned long flags; - synchronize_srcu(&uprobes_srcu); spin_lock_irqsave(&uprobes_treelock, flags); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock_irqrestore(&uprobes_treelock, flags); @@ -1388,9 +1381,6 @@ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - if (t->uprobe_srcu_id != -1) - srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); - if (!utask) return; @@ -1408,7 +1398,6 @@ void uprobe_free_utask(struct task_struct *t) void uprobe_copy_process(struct task_struct *t) { t->utask = NULL; - t->uprobe_srcu_id = -1; } /* @@ -1513,9 +1502,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } else { *is_swbp = -EFAULT; } - - srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); - current->uprobe_srcu_id = -1; up_read(&mm->mmap_sem); return uprobe; @@ -1656,7 +1642,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) utask->state = UTASK_BP_HIT; set_thread_flag(TIF_UPROBE); - current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); return 1; } @@ -1691,7 +1676,6 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mutex[i]); mutex_init(&uprobes_mmap_mutex[i]); } - init_srcu_struct(&uprobes_srcu); return register_die_notifier(&uprobe_exception_nb); } -- cgit v1.2.3-59-g8ed1b From 6be96a5c905178637ec06a5d456a76b2b304fca3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 Jun 2012 19:12:30 -0700 Subject: cgroup: remove hierarchy_mutex It was introduced for memcg to iterate cgroup hierarchy without holding cgroup_mutex, but soon after that it was replaced with a lockless way in memcg. No one used hierarchy_mutex since that, so remove it. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- Documentation/cgroups/cgroups.txt | 2 +- include/linux/cgroup.h | 17 ++------------- kernel/cgroup.c | 45 --------------------------------------- 3 files changed, 3 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 8e74980ab385..e86faaea7d66 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -656,7 +656,7 @@ example in cpusets, no task may attach before 'cpus' and 'mems' are set up. void bind(struct cgroup *root) -(cgroup_mutex and ss->hierarchy_mutex held by caller) +(cgroup_mutex held by caller) Called when a cgroup subsystem is rebound to a different hierarchy and root cgroup. Currently this will only involve movement between diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d3f5fba2c159..c90eaa803440 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -499,22 +499,9 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - /* - * Protects sibling/children links of cgroups in this - * hierarchy, plus protects which hierarchy (or none) the - * subsystem is a part of (i.e. root/sibling). To avoid - * potential deadlocks, the following operations should not be - * undertaken while holding any hierarchy_mutex: - * - * - allocating memory - * - initiating hotplug events - */ - struct mutex hierarchy_mutex; - struct lock_class_key subsys_key; - /* * Link to parent, and list entry in parent's children. 
- * Protected by this->hierarchy_mutex and cgroup_lock() + * Protected by cgroup_lock() */ struct cgroupfs_root *root; struct list_head sibling; @@ -602,7 +589,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); * the lifetime of cgroup_subsys_state is subsys's matter. * * Looking up and scanning function should be called under rcu_read_lock(). - * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. + * Taking cgroup_mutex is not necessary for following calls. * But the css returned by this routine can be "not populated yet" or "being * destroyed". The caller should check css and cgroup's status. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ceeafe874b3f..dec62f5936ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1073,28 +1073,24 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgrp); - mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); - mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); - mutex_unlock(&ss->hierarchy_mutex); /* subsystem is now free - drop reference on module */ module_put(ss->module); } else if (bit & final_bits) { @@ -3917,37 +3913,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } -static void cgroup_lock_hierarchy(struct cgroupfs_root *root) -{ - /* We need to take each hierarchy_mutex in a consistent order */ - int i; - - /* - * No worry about a race with rebind_subsystems that might mess up the - * locking order, since both parties are under cgroup_mutex. 
- */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_lock(&ss->hierarchy_mutex); - } -} - -static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) -{ - int i; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_unlock(&ss->hierarchy_mutex); - } -} - /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4008,9 +3973,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, ss->post_clone(cgrp); } - cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); - cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -4037,9 +4000,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: - cgroup_lock_hierarchy(root); list_del(&cgrp->sibling); - cgroup_unlock_hierarchy(root); root->number_of_cgroups--; err_destroy: @@ -4247,10 +4208,8 @@ again: list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ list_del_init(&cgrp->sibling); - cgroup_unlock_hierarchy(cgrp->root); list_del_init(&cgrp->allcg_node); @@ -4324,8 +4283,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* this function shouldn't be used with modular subsystems, since they @@ -4452,8 +4409,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* success! */ -- cgit v1.2.3-59-g8ed1b From 7eb9ba5ed312ec6ed9d22259c5da1acb7cf4bd29 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 8 Jun 2012 15:02:57 +0530 Subject: uprobes: Pass probed vaddr to arch_uprobe_analyze_insn() On RISC architectures like powerpc, instructions are fixed size. Instruction analysis on such platforms is just a matter of (insn % 4). Pass the vaddr at which the uprobe is to be inserted so that arch_uprobe_analyze_insn() can flag misaligned registration requests. 
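A hypothetical illustration of what the extra argument enables on a fixed-width ISA (a standalone sketch with an invented function name, not the actual powerpc implementation): with 4-byte instructions the arch code can reject a misaligned vaddr before doing any instruction decoding.

#include <errno.h>
#include <stdio.h>

static int analyze_insn_fixed_width(unsigned long vaddr)
{
    if (vaddr % 4)      /* instructions are 4-byte aligned on this ISA */
        return -EINVAL; /* flag the misaligned registration request */
    /* real code would go on to validate/decode the instruction itself */
    return 0;
}

int main(void)
{
    printf("vaddr 0x10000000 -> %d\n", analyze_insn_fixed_width(0x10000000UL));
    printf("vaddr 0x10000002 -> %d\n", analyze_insn_fixed_width(0x10000002UL));
    return 0;
}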
Signed-off-by: Ananth N Mavinakaynahalli Cc: michael@ellerman.id.au Cc: antonb@thinktux.localdomain Cc: Paul Mackerras Cc: benh@kernel.crashing.org Cc: peterz@infradead.org Cc: Srikar Dronamraju Cc: Jim Keniston Cc: oleg@redhat.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20120608093257.GG13409@in.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 2 +- arch/x86/kernel/uprobes.c | 3 ++- kernel/events/uprobes.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1e9bed14f7ae..f3971bbcd1de 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -48,7 +48,7 @@ struct arch_uprobe_task { #endif }; -extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index dc4e910a7d96..36fd42091fa7 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { int ret; struct insn insn; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c5e043cd309..b52376d02332 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -706,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -EEXIST; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From b871a42b6091b720e82ddff237659534c525c25b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Jun 2012 15:07:08 +0200 Subject: smpboot: Remove leftover declaration Signed-off-by: Thomas Gleixner --- kernel/smpboot.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 80c0acfb8472..6ef9433e1c70 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -3,8 +3,6 @@ struct task_struct; -int smpboot_prepare(unsigned int cpu); - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD struct task_struct *idle_thread_get(unsigned int cpu); void idle_thread_set_boot_cpu(void); -- cgit v1.2.3-59-g8ed1b From 19f5f7364a1cc770b14692f609bb9b802fc334d5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 27 Jul 2011 17:29:28 +0200 Subject: nohz: Separate idle sleeping time accounting from nohz logic As we plan to be able to stop the tick outside the idle task, we need to prepare for separating nohz logic from idle. As a start, this pulls the idle sleeping time accounting out of the tick stop/restart API to the callers on idle entry/exit. 
Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 77 ++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da70c6db496c..81409bba2425 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,10 +271,10 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, now; + ktime_t last_update, expires; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; @@ -282,8 +282,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); - now = tick_nohz_start_idle(cpu, ts); - /* * If this cpu is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by @@ -444,6 +442,14 @@ out: ts->sleep_length = ktime_sub(dev->next_event, now); } +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ + ktime_t now; + + now = tick_nohz_start_idle(smp_processor_id(), ts); + tick_nohz_stop_sched_tick(ts, now); +} + /** * tick_nohz_idle_enter - stop the idle tick from the idle task * @@ -479,7 +485,7 @@ void tick_nohz_idle_enter(void) * update of the idle time accounting in tick_nohz_start_idle(). */ ts->inidle = 1; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); local_irq_enable(); } @@ -499,7 +505,7 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); } /** @@ -540,39 +546,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } } -/** - * tick_nohz_idle_exit - restart the idle tick from the idle task - * - * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. - */ -void tick_nohz_idle_exit(void) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); #ifndef CONFIG_VIRT_CPU_ACCOUNTING unsigned long ticks; #endif - ktime_t now; - - local_irq_disable(); - - WARN_ON_ONCE(!ts->inidle); - - ts->inidle = 0; - - if (ts->idle_active || ts->tick_stopped) - now = ktime_get(); - - if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); - - if (!ts->tick_stopped) { - local_irq_enable(); - return; - } - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); @@ -600,6 +578,35 @@ void tick_nohz_idle_exit(void) ts->idle_exittime = now; tick_nohz_restart(ts, now); +} + +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. 
+ */ +void tick_nohz_idle_exit(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; + + local_irq_disable(); + + WARN_ON_ONCE(!ts->inidle); + + ts->inidle = 0; + + if (ts->idle_active || ts->tick_stopped) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + + if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, now); local_irq_enable(); } -- cgit v1.2.3-59-g8ed1b From 2ac0d98fd624ae50f5e6ae9c800977a9dbbfcde6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jul 2011 04:00:47 +0200 Subject: nohz: Make nohz API agnostic against idle ticks cputime accounting When the timer tick fires, it accounts the new jiffy as either part of system, user or idle time. This is how we record the cputime statistics. But when the tick is stopped from the idle task, we still need to record the number of jiffies spent tickless until we restart the tick and fall back to traditional tick-based cputime accounting. To do this, we take a snapshot of jiffies when the tick is stopped and compute the difference against the new value of jiffies when the tick is restarted. Then we account this whole difference to the idle cputime. However we are preparing to be able to stop the tick from other places than idle. So this idle time accounting needs to be performed from the callers of nohz APIs, not from the nohz APIs themselves because we now want them to be agnostic against places that stop/restart tick. Therefore, we pull the tickless idle time accounting out of generic nohz helpers up to idle entry/exit callers. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81409bba2425..911834b33b8a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -402,7 +402,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; } ts->idle_sleeps++; @@ -445,9 +444,13 @@ out: static void __tick_nohz_idle_enter(struct tick_sched *ts) { ktime_t now; + int was_stopped = ts->tick_stopped; now = tick_nohz_start_idle(smp_processor_id(), ts); tick_nohz_stop_sched_tick(ts, now); + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; } /** @@ -548,15 +551,25 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - unsigned long ticks; -#endif /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); update_cpu_load_nohz(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ #ifndef CONFIG_VIRT_CPU_ACCOUNTING + unsigned long ticks; /* * We stopped the tick in idle. 
Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -569,15 +582,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); #endif - - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); } /** @@ -605,8 +609,10 @@ void tick_nohz_idle_exit(void) if (ts->idle_active) tick_nohz_stop_idle(cpu, now); - if (ts->tick_stopped) + if (ts->tick_stopped) { tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); + } local_irq_enable(); } @@ -811,7 +817,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - ts->idle_jiffies++; + if (idle_cpu(cpu)) + ts->idle_jiffies++; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); -- cgit v1.2.3-59-g8ed1b From f5d411c91ede162240f34e05a233f2759412988e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 31 Jul 2011 17:44:12 +0200 Subject: nohz: Rename ts->idle_tick to ts->last_tick Now that idle and nohz logics are going to be independant each others, ts->idle_tick becomes too much a biased name to describe the field that saves the last scheduled tick on top of which we re-calculate the next tick to schedule when the timer is restarted. We want to reuse this even to stop the tick outside idle cases. So let's rename it to some more generic name: ts->last_tick. This changes a bit the timer list stat export so we need to increase its version. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- include/linux/tick.h | 8 ++++---- kernel/time/tick-sched.c | 4 ++-- kernel/time/timer_list.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index ab8be90b5cc9..f37fceb69b73 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -31,10 +31,10 @@ enum tick_nohz_mode { * struct tick_sched - sched tick emulation and no idle tick control/stats * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode - * @idle_tick: Store the last idle tick expiry time when the tick - * timer is modified for idle sleeps. This is necessary + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline - * when the CPU returns from idle + * when the CPU returns from nohz sleep. 
* @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -51,7 +51,7 @@ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; - ktime_t idle_tick; + ktime_t last_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 911834b33b8a..73cc4901336d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -400,7 +400,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) if (!ts->tick_stopped) { select_nohz_load_balancer(1); - ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; } @@ -526,7 +526,7 @@ ktime_t tick_nohz_get_sleep_length(void) static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); while (1) { /* Forward the time to expire in the future */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3258455549f4..af5a7e9f164b 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); - P_ns(idle_tick); + P_ns(last_tick); P(tick_stopped); P(idle_jiffies); P(idle_calls); @@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.6\n"); + SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.2.3-59-g8ed1b From 5b39939a40801f0c17e31adaf643d6e974227856 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Aug 2011 00:06:10 +0200 Subject: nohz: Move ts->idle_calls incrementation into strict idle logic Since we want to prepare for making the nohz API to work further the idle case, we need to pull ts->idle_calls incrementation up to the callers in idle. To perform this, we split tick_nohz_stop_sched_tick() in two parts: a first one that checks if we can really stop the tick for idle, and another that actually stops it. Then from the callers in idle, we check if we can stop the tick and only then we increment idle_calls and finally relay to the nohz API that won't care about these details anymore. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 86 ++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 73cc4901336d..430e1b6901cc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,47 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; - int cpu; - - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - } - - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - return; - - if (need_resched()) - return; - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } - return; - } - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&xtime_lock); @@ -441,16 +409,56 @@ out: ts->sleep_length = ktime_sub(dev->next_event, now); } +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. 
+ */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + return false; + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + return true; +} + static void __tick_nohz_idle_enter(struct tick_sched *ts) { ktime_t now; - int was_stopped = ts->tick_stopped; + int cpu = smp_processor_id(); - now = tick_nohz_start_idle(smp_processor_id(), ts); - tick_nohz_stop_sched_tick(ts, now); + now = tick_nohz_start_idle(cpu, ts); - if (!was_stopped && ts->tick_stopped) - ts->idle_jiffies = ts->last_jiffies; + if (can_stop_idle_tick(cpu, ts)) { + int was_stopped = ts->tick_stopped; + + ts->idle_calls++; + tick_nohz_stop_sched_tick(ts, now, cpu); + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; + } } /** -- cgit v1.2.3-59-g8ed1b From 84bf1bccc60cc64376125ea2eae05e4ba12f795b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Aug 2011 01:25:38 +0200 Subject: nohz: Move next idle expiry time record into idle logic area The next idle expiry time record and idle sleeps tracking are statistics that only concern idle. Since we want the nohz APIs to become usable further idle context, let's pull up the handling of these statistics to the callers in idle. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- kernel/time/tick-sched.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 430e1b6901cc..60c9c60e9108 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -271,11 +271,11 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires; + ktime_t last_update, expires, ret = { .tv64 = 0 }; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -358,6 +358,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; + ret = expires; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -372,11 +374,6 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->tick_stopped = 1; } - ts->idle_sleeps++; - - /* Mark expires */ - ts->idle_expires = expires; - /* * If the expiration time == KTIME_MAX, then * in this case we simply stop the tick timer. 
@@ -407,6 +404,8 @@ out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); + + return ret; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) @@ -445,7 +444,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) static void __tick_nohz_idle_enter(struct tick_sched *ts) { - ktime_t now; + ktime_t now, expires; int cpu = smp_processor_id(); now = tick_nohz_start_idle(cpu, ts); @@ -454,7 +453,12 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) int was_stopped = ts->tick_stopped; ts->idle_calls++; - tick_nohz_stop_sched_tick(ts, now, cpu); + + expires = tick_nohz_stop_sched_tick(ts, now, cpu); + if (expires.tv64 > 0LL) { + ts->idle_sleeps++; + ts->idle_expires = expires; + } if (!was_stopped && ts->tick_stopped) ts->idle_jiffies = ts->last_jiffies; -- cgit v1.2.3-59-g8ed1b From 82ec90eac304e81b1389175b4dded7abecc678ef Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 May 2012 18:51:11 -0700 Subject: resources: allow adjust_resource() for resources with no parent If a resource has no parent, allow its start/end to be set arbitrarily as long as any children are still contained within the new range. [bhelgaas: changelog] Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e1d2b8ee76d5..dc8b47764443 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -722,14 +722,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t write_lock(&resource_lock); + if (!parent) + goto skip; + if ((start < parent->start) || (end > parent->end)) goto out; - for (tmp = res->child; tmp; tmp = tmp->sibling) { - if ((tmp->start < start) || (tmp->end > end)) - goto out; - } - if (res->sibling && (res->sibling->start <= end)) goto out; @@ -741,6 +739,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t goto out; } +skip: + for (tmp = res->child; tmp; tmp = tmp->sibling) + if ((tmp->start < start) || (tmp->end > end)) + goto out; + res->start = start; res->end = end; result = 0; -- cgit v1.2.3-59-g8ed1b From 8d240dd88cca33b704adf3fe281aa64b5aac2dd8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 29 Mar 2012 19:11:40 +0200 Subject: ftrace: Remove a superfluous check register_ftrace_function() checks ftrace_disabled and calls __register_ftrace_function which does it again. Drop the first check and add the unlikely hint to the second one. Also, drop the label as John correctly notices. No functional change. 
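A minimal standalone sketch of the resulting pattern, with illustrative names only (not the real ftrace internals): a single ftrace_disabled-style check kept in the innermost helper and annotated with unlikely(), instead of being duplicated in the caller. unlikely() is defined locally so the snippet builds outside the kernel.

#include <errno.h>
#include <stdio.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)

static int subsystem_disabled;			/* stands in for ftrace_disabled */

static int __register_thing(int id)
{
	if (unlikely(subsystem_disabled))	/* the single remaining check */
		return -ENODEV;
	printf("registered %d\n", id);
	return 0;
}

static int register_thing(int id)
{
	/* no duplicate "disabled" test here any more; just delegate */
	return __register_thing(id);
}

int main(void)
{
	return register_thing(1) ? 1 : 0;
}

The hint only steers code layout and branch prediction; behaviour is unchanged, which is the point of the patch.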
Link: http://lkml.kernel.org/r/20120329171140.GE6409@aftab Cc: Borislav Petkov Cc: John Kacur Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663d86c8..b4f20fba09fc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (ftrace_disabled) + if (unlikely(ftrace_disabled)) return -ENODEV; if (FTRACE_WARN_ON(ops == &global_ops)) @@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) - goto out_unlock; - ret = __register_ftrace_function(ops); if (!ret) ret = ftrace_startup(ops, 0); - - out_unlock: mutex_unlock(&ftrace_lock); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); -- cgit v1.2.3-59-g8ed1b From 7374e82771c6d5a9af2080be46f64a5826c7efb1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 31 May 2012 21:40:05 -0400 Subject: tracing: Register the ftrace internal events during early boot All trace events including ftrace internel events (like trace_printk and function tracing), register functions that describe how to print their output. The events may be recorded as soon as the ring buffer is allocated, but they are just raw binary in the buffer. The mapping of event ids to how to print them are held within a structure that is registered on system boot. If a crash happens in boot up before these functions are registered then their output (via ftrace_dump_on_oops) will be useless: Dumping ftrace buffer: --------------------------------- <...>-1 0.... 319705us : Unknown type 6 --------------------------------- This can be quite frustrating for a kernel developer trying to see what is going wrong. There's no reason to register them so late in the boot up process. They can be registered by early_initcall(). Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df611a0e76c5..123b189c732c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1325,4 +1325,4 @@ __init static int init_events(void) return 0; } -device_initcall(init_events); +early_initcall(init_events); -- cgit v1.2.3-59-g8ed1b From ea131377148cdfe90641b42ae9aa5a6b3a4fa327 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:22 +0200 Subject: uprobes: Valid_vma() should reject VM_HUGETLB __replace_page() obviously can't work with the hugetlbfs mappings, uprobe_register() will likely crash the kernel. Change valid_vma() to check VM_HUGETLB as well. As for PageTransHuge() no need to worry, vma->vm_file != NULL. 
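The flag test works because the mask includes bits that must be clear as well as bits that must be set. A hedged standalone illustration with made-up flag values (the real ones live in <linux/mm.h>):

#include <stdbool.h>
#include <stdio.h>

#define VM_READ		0x01
#define VM_WRITE	0x02
#define VM_EXEC		0x04
#define VM_SHARED	0x08
#define VM_HUGETLB	0x10

static bool valid_for_register(unsigned long vm_flags)
{
	/* only a private, read+exec, non-hugetlb mapping passes */
	return (vm_flags & (VM_HUGETLB | VM_READ | VM_WRITE | VM_EXEC | VM_SHARED))
		== (VM_READ | VM_EXEC);
}

int main(void)
{
	printf("r-x           -> %d\n", valid_for_register(VM_READ | VM_EXEC));              /* 1 */
	printf("r-x + hugetlb -> %d\n", valid_for_register(VM_READ | VM_EXEC | VM_HUGETLB)); /* 0 */
	return 0;
}

Any vma with VM_WRITE, VM_SHARED or (after this patch) VM_HUGETLB set fails the equality even though it may also be readable and executable.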
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154322.GA9561@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b52376d02332..f0d04530af63 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -99,7 +99,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) if (!is_register) return true; - if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) + if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) + == (VM_READ|VM_EXEC)) return true; return false; -- cgit v1.2.3-59-g8ed1b From cc359d180fa9c25a4c1819f17e07a422d788353d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:25 +0200 Subject: uprobes: __copy_insn() should ensure a_ops->readpage != NULL __copy_insn() blindly calls read_mapping_page(), this will crash the kernel if ->readpage == NULL, add the necessary check. For example, hugetlbfs_aops->readpage is NULL. Perhaps we should change read_mapping_page() instead. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154325.GA9568@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f0d04530af63..604930bf9c92 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -610,6 +610,9 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins if (!filp) return -EINVAL; + if (!mapping->a_ops->readpage) + return -EIO; + idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); off1 = offset &= ~PAGE_MASK; -- cgit v1.2.3-59-g8ed1b From 5323ce71e4b4e1f188ebbc0cc7776885ea6c75fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:28 +0200 Subject: uprobes: Write_opcode()->__replace_page() can race with try_to_unmap() write_opcode() gets old_page via get_user_pages() and then calls __replace_page() which assumes that this old_page is still mapped after pte_offset_map_lock(). This is not true if this old_page was already try_to_unmap()'ed, and in this case everything __replace_page() does with old_page is wrong. Just for example, put_page() is not balanced. I think it is possible to teach __replace_page() to handle this unlikely case correctly, but this patch simply changes it to use page_check_address() and return -EAGAIN if it fails. The caller should notice this error code and retry. Note: write_opcode() asks for the cleanups, I'll try to do this in a separate patch. 
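A small userspace sketch of the retry contract described above, with illustrative names only (this is not the uprobes code): the inner helper reports -EAGAIN when its snapshot of the page state went stale, and the caller simply restarts from the top.

#include <errno.h>
#include <stdio.h>

static int attempts;

static int replace_page_once(void)
{
	/* pretend the mapping was unmapped under us on the first try */
	if (attempts++ == 0)
		return -EAGAIN;
	return 0;
}

static int write_opcode_like(void)
{
	int ret;
retry:
	ret = replace_page_once();
	if (ret == -EAGAIN)
		goto retry;	/* the real code re-does get_user_pages() etc. */
	return ret;
}

int main(void)
{
	printf("result=%d after %d attempts\n", write_opcode_like(), attempts);
	return 0;
}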
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154328.GA9571@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 604930bf9c92..3ccdb29ee8d6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -129,33 +129,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; unsigned long addr; - int err = -EFAULT; + spinlock_t *ptl; + pte_t *ptep; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) - goto out; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; + return -EFAULT; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; - - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + return -EAGAIN; get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -174,10 +158,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); - err = 0; -out: - return err; + return 0; } /** @@ -222,9 +204,10 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; + unsigned long pgoff; loff_t addr; int ret; - +retry: /* Read the page with vaddr into memory */ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) @@ -269,9 +252,9 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, memcpy(vaddr_new, vaddr_old, PAGE_SIZE); /* poke the new insn in, ASSUMES we don't cross page boundary */ - vaddr &= ~PAGE_MASK; - BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + pgoff = (vaddr & ~PAGE_MASK); + BUG_ON(pgoff + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + memcpy(vaddr_new + pgoff, &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -291,6 +274,8 @@ unlock_out: put_out: put_page(old_page); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } -- cgit v1.2.3-59-g8ed1b From c1914a0936f79ed0236f670122e06e36e4d332ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:31 +0200 Subject: uprobes: Install_breakpoint() should fail if is_swbp_insn() == T install_breakpoint() returns -EEXIST if is_swbp_insn(orig_insn) == T, the caller treats this code as success. This is doubly wrong. The successful return should set UPROBE_COPY_INSN, but the real problem is that it shouldn't succeed. If the probed insn is int3 the application should get SIGTRAP, this won't happen with uprobe. Probably we can fix this, we can add the UPROBE_SHARED_BP flag and teach handle_swbp/set_orig_insn to handle this case correctly. But this needs some complications and we have other insns which can't be probed, lets make a simple fix for now. I think this needs a cleanup. UPROBE_COPY_INSN should die, copy_insn() should be called by alloc_uprobe(). 
arch_uprobe_analyze_insn() depends on ->mm (ia32_compat) but it is called only once. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154331.GA9578@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3ccdb29ee8d6..ec78152e32e9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -693,7 +693,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -EEXIST; + return -ENOTSUPP; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) -- cgit v1.2.3-59-g8ed1b From 268720903f87e0b84b161626c4447b81671b5d18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:33 +0200 Subject: uprobes: Rework register_for_each_vma() to make it O(n) Currently register_for_each_vma() is O(n ** 2) + O(n ** 3), every time find_next_vma_info() "restarts" the vma_prio_tree_foreach() loop and each iteration rechecks the whole try_list. This also means that try_list can grow "indefinitely" if register/unregister races with munmap/mmap activity even if the number of mapping is bounded at any time. With this patch register_for_each_vma() builds the list of mm/vaddr structures only once and does install_breakpoint() for each entry. We do not care about the new mappings which can be created after build_map_info() drops mapping->i_mmap_mutex, uprobe_mmap() should do its work. Note that we do not allocate map_info under i_mmap_mutex, this can deadlock with page reclaim (but see the next patch). So we use 2 lists, "curr" which we are going to return, and "prev" which holds the already allocated memory. The main loop deques the entry from "prev" (initially it is empty), and if "prev" becomes empty again it counts the number of entries we need to pre-allocate outside of i_mmap_mutex. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120615154333.GA9581@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 199 +++++++++++++++++++++--------------------------- 1 file changed, 86 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ec78152e32e9..4e0db3496d70 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,17 +60,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); -/* - * Maintain a temporary per vma info that can be used to search if a vma - * has already been handled. This structure is introduced since extending - * vm_area_struct wasnt recommended. 
- */ -struct vma_info { - struct list_head probe_list; - struct mm_struct *mm; - loff_t vaddr; -}; - struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; @@ -742,139 +731,123 @@ static void delete_uprobe(struct uprobe *uprobe) atomic_dec(&uprobe_events); } -static struct vma_info * -__find_next_vma_info(struct address_space *mapping, struct list_head *head, - struct vma_info *vi, loff_t offset, bool is_register) +struct map_info { + struct map_info *next; + struct mm_struct *mm; + loff_t vaddr; +}; + +static inline struct map_info *free_map_info(struct map_info *info) { + struct map_info *next = info->next; + kfree(info); + return next; +} + +static struct map_info * +build_map_info(struct address_space *mapping, loff_t offset, bool is_register) +{ + unsigned long pgoff = offset >> PAGE_SHIFT; struct prio_tree_iter iter; struct vm_area_struct *vma; - struct vma_info *tmpvi; - unsigned long pgoff; - int existing_vma; - loff_t vaddr; - - pgoff = offset >> PAGE_SHIFT; + struct map_info *curr = NULL; + struct map_info *prev = NULL; + struct map_info *info; + int more = 0; + again: + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; - existing_vma = 0; - vaddr = vma_address(vma, offset); - - list_for_each_entry(tmpvi, head, probe_list) { - if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { - existing_vma = 1; - break; - } - } - - /* - * Another vma needs a probe to be installed. However skip - * installing the probe if the vma is about to be unlinked. - */ - if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { - vi->mm = vma->vm_mm; - vi->vaddr = vaddr; - list_add(&vi->probe_list, head); - - return vi; + if (!prev) { + more++; + continue; } - } - - return NULL; -} -/* - * Iterate in the rmap prio tree and find a vma where a probe has not - * yet been inserted. 
- */ -static struct vma_info * -find_next_vma_info(struct address_space *mapping, struct list_head *head, - loff_t offset, bool is_register) -{ - struct vma_info *vi, *retvi; + if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + continue; - vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); - if (!vi) - return ERR_PTR(-ENOMEM); + info = prev; + prev = prev->next; + info->next = curr; + curr = info; - mutex_lock(&mapping->i_mmap_mutex); - retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); + info->mm = vma->vm_mm; + info->vaddr = vma_address(vma, offset); + } mutex_unlock(&mapping->i_mmap_mutex); - if (!retvi) - kfree(vi); + if (!more) + goto out; + + prev = curr; + while (curr) { + mmput(curr->mm); + curr = curr->next; + } - return retvi; + do { + info = kmalloc(sizeof(struct map_info), GFP_KERNEL); + if (!info) { + curr = ERR_PTR(-ENOMEM); + goto out; + } + info->next = prev; + prev = info; + } while (--more); + + goto again; + out: + while (prev) + prev = free_map_info(prev); + return curr; } static int register_for_each_vma(struct uprobe *uprobe, bool is_register) { - struct list_head try_list; - struct vm_area_struct *vma; - struct address_space *mapping; - struct vma_info *vi, *tmpvi; - struct mm_struct *mm; - loff_t vaddr; - int ret; + struct map_info *info; + int err = 0; - mapping = uprobe->inode->i_mapping; - INIT_LIST_HEAD(&try_list); - - ret = 0; + info = build_map_info(uprobe->inode->i_mapping, + uprobe->offset, is_register); + if (IS_ERR(info)) + return PTR_ERR(info); - for (;;) { - vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); - if (!vi) - break; + while (info) { + struct mm_struct *mm = info->mm; + struct vm_area_struct *vma; + loff_t vaddr; - if (IS_ERR(vi)) { - ret = PTR_ERR(vi); - break; - } + if (err) + goto free; - mm = vi->mm; down_write(&mm->mmap_sem); - vma = find_vma(mm, (unsigned long)vi->vaddr); - if (!vma || !valid_vma(vma, is_register)) { - list_del(&vi->probe_list); - kfree(vi); - up_write(&mm->mmap_sem); - mmput(mm); - continue; - } + vma = find_vma(mm, (unsigned long)info->vaddr); + if (!vma || !valid_vma(vma, is_register)) + goto unlock; + vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != vi->vaddr) { - list_del(&vi->probe_list); - kfree(vi); - up_write(&mm->mmap_sem); - mmput(mm); - continue; - } + vaddr != info->vaddr) + goto unlock; - if (is_register) - ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); - else - remove_breakpoint(uprobe, mm, vi->vaddr); - - up_write(&mm->mmap_sem); - mmput(mm); if (is_register) { - if (ret && ret == -EEXIST) - ret = 0; - if (ret) - break; + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + if (err == -EEXIST) + err = 0; + } else { + remove_breakpoint(uprobe, mm, info->vaddr); } + unlock: + up_write(&mm->mmap_sem); + free: + mmput(mm); + info = free_map_info(info); } - list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { - list_del(&vi->probe_list); - kfree(vi); - } - - return ret; + return err; } static int __uprobe_register(struct uprobe *uprobe) -- cgit v1.2.3-59-g8ed1b From 7a5bfb66b07f22d2429db776da7bb8b57bfb5cff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:36 +0200 Subject: uprobes: Change build_map_info() to try kmalloc(GFP_NOWAIT) first build_map_info() doesn't allocate the memory under i_mmap_mutex to avoid the deadlock with page reclaim. But it can try GFP_NOWAIT first, it should work in the likely case and thus we almost never need the pre-alloc-and-retry path. 
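The overall shape of that allocation scheme, as a simplified userspace sketch (not the uprobes code): inside the section that stands for the locked walk we only take nodes from a preallocated free list; a shortfall is merely counted, then made up outside the "lock" before the walk is redone.

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int payload; };

static struct node *pop(struct node **list)
{
	struct node *n = *list;
	if (n)
		*list = n->next;
	return n;
}

static struct node *build_list(int want)
{
	struct node *curr = NULL, *prev = NULL, *n;
	int more;
again:
	more = 0;
	/* ---- would run under the mutex: no allocation allowed ---- */
	for (int i = 0; i < want; i++) {
		n = pop(&prev);
		if (!n) {		/* nothing preallocated left: just count */
			more++;
			continue;
		}
		n->payload = i;
		n->next = curr;
		curr = n;
	}
	/* ---- "lock" dropped here ---- */
	if (more) {
		/* recycle what we already used, top up, then redo the walk */
		while (curr) {
			n = curr->next;
			curr->next = prev;
			prev = curr;
			curr = n;
		}
		while (more--) {
			n = malloc(sizeof(*n));
			if (!n)
				abort();	/* keep the sketch short */
			n->next = prev;
			prev = n;
		}
		goto again;
	}
	while (prev) {			/* free preallocated leftovers */
		n = prev->next;
		free(prev);
		prev = n;
	}
	return curr;
}

int main(void)
{
	struct node *list = build_list(5), *n;
	int count = 0;

	while (list) {
		count++;
		n = list->next;
		free(list);
		list = n;
	}
	printf("built and freed %d nodes\n", count);
	return 0;
}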
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120615154336.GA9588@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e0db3496d70..897417dbca8e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -761,6 +761,16 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) if (!valid_vma(vma, is_register)) continue; + if (!prev && !more) { + /* + * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * reclaim. This is optimistic, no harm done if it fails. + */ + prev = kmalloc(sizeof(struct map_info), + GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (prev) + prev->next = NULL; + } if (!prev) { more++; continue; -- cgit v1.2.3-59-g8ed1b From c5784de2b351fe871bb57487878f7fc7ec5b075c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jun 2012 17:43:39 +0200 Subject: uprobes: Document uprobe_register() vs uprobe_mmap() race Because the mind is treacherous and makes us forget we need to write stuff down. Signed-off-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154339.GA9591@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 897417dbca8e..2671d9ad49be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -44,6 +44,23 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 +/* + * We need separate register/unregister and mmap/munmap lock hashes because + * of mmap_sem nesting. + * + * uprobe_register() needs to install probes on (potentially) all processes + * and thus needs to acquire multiple mmap_sems (consequtively, not + * concurrently), whereas uprobe_mmap() is called while holding mmap_sem + * for the particular process doing the mmap. + * + * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem + * because of lock order against i_mmap_mutex. This means there's a hole in + * the register vma iteration where a mmap() can happen. + * + * Thus uprobe_register() can race with uprobe_mmap() and we can try and + * install a probe where one is already installed. + */ + /* serialize (un)register */ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; @@ -339,7 +356,9 @@ out: int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { int result; - + /* + * See the comment near uprobes_hash(). + */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) return -EEXIST; @@ -845,6 +864,10 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { err = install_breakpoint(uprobe, mm, vma, info->vaddr); + /* + * We can race against uprobe_mmap(), see the + * comment near uprobe_hash(). + */ if (err == -EEXIST) err = 0; } else { @@ -1054,8 +1077,10 @@ int uprobe_mmap(struct vm_area_struct *vma) } ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - - /* Ignore double add: */ + /* + * We can race against uprobe_register(), see the + * comment near uprobe_hash(). 
+ */ if (ret == -EEXIST) { ret = 0; -- cgit v1.2.3-59-g8ed1b From d436615e60c386095dac4a9bf72b08868d2a7564 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:42 +0200 Subject: uprobes: Copy_insn() shouldn't depend on mm/vma/vaddr 1. copy_insn() doesn't need "addr", it can use uprobe->offset. Remove this argument. 2. Change copy_insn/__copy_insn to accept "struct file*" instead of vma. copy_insn() is called only once and mm/vma/vaddr are random, it shouldn't depend on them. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154342.GA9598@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2671d9ad49be..08ef566da763 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -591,10 +591,9 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) } static int -__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, +__copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, unsigned long offset) { - struct file *filp = vma->vm_file; struct page *page; void *vaddr; unsigned long off1; @@ -625,15 +624,13 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins return 0; } -static int -copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping; unsigned long nbytes; int bytes; - addr &= ~PAGE_MASK; - nbytes = PAGE_SIZE - addr; + nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); mapping = uprobe->inode->i_mapping; /* Instruction at end of binary; copy only available bytes */ @@ -644,13 +641,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, + if (__copy_insn(mapping, filp, uprobe->arch.insn + nbytes, bytes - nbytes, uprobe->offset + nbytes)) return -ENOMEM; bytes = nbytes; } - return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); + return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } /* @@ -696,7 +693,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, addr = (unsigned long)vaddr; if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma, addr); + ret = copy_insn(uprobe, vma->vm_file); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From fc36f59565861af2e897225bc3782479a26c5d5a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:44 +0200 Subject: uprobes: Copy_insn() should not return -ENOMEM if __copy_insn() fails copy_insn() returns -ENOMEM if the first __copy_insn() fails, it should return the correct error code. 
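The fix is the usual error-propagation pattern; a tiny sketch with made-up names:

#include <errno.h>
#include <stdio.h>

static int helper_that_fails(void)
{
	return -EIO;		/* e.g. a mapping with no ->readpage */
}

static int caller_old(void)
{
	if (helper_that_fails())
		return -ENOMEM;	/* real reason lost */
	return 0;
}

static int caller_new(void)
{
	int err = helper_that_fails();
	if (err)
		return err;	/* propagate the exact error */
	return 0;
}

int main(void)
{
	printf("old=%d new=%d\n", caller_old(), caller_new());
	return 0;
}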
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154344.GA9601@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 08ef566da763..2db1d94d7dfc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -641,10 +641,10 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, filp, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes)) - return -ENOMEM; - + int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes); + if (err) + return err; bytes = nbytes; } return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); -- cgit v1.2.3-59-g8ed1b From eb2bf57bee42c7565032f93adaa211e2c9fcc52c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:47 +0200 Subject: uprobes: No need to re-check vma_address() in write_opcode() write_opcode() is called by register_for_each_vma() and uprobe_mmap() paths. In both cases the caller has already verified this vaddr under mmap_sem, no need to re-check. Note also that this check is wrong anyway, we should not truncate loff_t returned by vma_address() if we do not trust this mapping. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154347.GA9604@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2db1d94d7dfc..14c71a2aadad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -211,7 +211,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; struct uprobe *uprobe; unsigned long pgoff; - loff_t addr; int ret; retry: /* Read the page with vaddr into memory */ @@ -235,10 +234,6 @@ retry: if (mapping != vma->vm_file->f_mapping) goto put_out; - addr = vma_address(vma, uprobe->offset); - if (vaddr != (unsigned long)addr) - goto put_out; - ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) -- cgit v1.2.3-59-g8ed1b From d9c4a30e82614d43b55893a73f31e7284007ce82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:50 +0200 Subject: uprobes: Move BUG_ON(UPROBE_SWBP_INSN_SIZE) from write_opcode() to install_breakpoint() write_opcode() ensures that UPROBE_SWBP_INSN doesn't cross the page boundary. This looks a bit confusing, the check does not depend on vaddr and it is enough to do it only once right after install_breakpoint()->arch_uprobe_analyze_insn(). 
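The centralized check is plain page arithmetic. A standalone sketch assuming 4 KiB pages; the instruction size is a parameter here so a crossing case can be shown (the real UPROBE_SWBP_INSN_SIZE is a single byte on x86):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static int crosses_page(unsigned long offset, unsigned long insn_size)
{
	/* offset within the page plus the insn must still fit in the page */
	return (offset & ~PAGE_MASK) + insn_size > PAGE_SIZE;
}

int main(void)
{
	printf("off 0xfff, 1-byte insn: %d\n", crosses_page(0xfff, 1));	/* 0 */
	printf("off 0xfff, 4-byte insn: %d\n", crosses_page(0xfff, 4));	/* 1 */
	return 0;
}

Since the offset is fixed per uprobe, checking it once at install_breakpoint() time is equivalent to checking it on every write_opcode() call.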
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154350.GA9611@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 14c71a2aadad..b9c61bda9029 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -210,7 +210,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; - unsigned long pgoff; int ret; retry: /* Read the page with vaddr into memory */ @@ -251,11 +250,7 @@ retry: vaddr_new = kmap_atomic(new_page); memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - - /* poke the new insn in, ASSUMES we don't cross page boundary */ - pgoff = (vaddr & ~PAGE_MASK); - BUG_ON(pgoff + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + pgoff, &opcode, UPROBE_SWBP_INSN_SIZE); + memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -699,6 +694,10 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (ret) return ret; + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + uprobe->flags |= UPROBE_COPY_INSN; } -- cgit v1.2.3-59-g8ed1b From 449d0d7c9fb87277175db34c009c70cb348004a8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:53 +0200 Subject: uprobes: Simplify the usage of uprobe->pending_list uprobe->pending_list is only used to create the temporary list, it has no meaning after we drop uprobes_mmap_hash(inode). No need to initialize this node or remove it from tmp_list, and we can use list_for_each_entry(). 
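The distinction being exploited, shown with a minimal userspace list rather than the kernel's list.h API: the "_safe" form caches the next pointer and is only required when entries are unlinked or freed during the walk; a read-only walk can use the plain form.

#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int v; };

int main(void)
{
	struct item *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct item *it = malloc(sizeof(*it));
		it->v = i;
		it->next = head;
		head = it;
	}

	/* plain walk: fine, nothing is freed or unlinked here */
	for (struct item *it = head; it; it = it->next)
		printf("%d ", it->v);
	printf("\n");

	/* "safe" walk: next is saved because we free as we go */
	for (struct item *it = head, *n; it; it = n) {
		n = it->next;
		free(it);
	}
	return 0;
}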
Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154353.GA9614@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b9c61bda9029..7d5c78f063ae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -513,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); - INIT_LIST_HEAD(&uprobe->pending_list); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -1037,7 +1036,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; int ret, count; @@ -1055,10 +1054,9 @@ int uprobe_mmap(struct vm_area_struct *vma) ret = 0; count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + list_for_each_entry(uprobe, &tmp_list, pending_list) { loff_t vaddr; - list_del(&uprobe->pending_list); if (!ret) { vaddr = vma_address(vma, uprobe->offset); @@ -1106,7 +1104,7 @@ int uprobe_mmap(struct vm_area_struct *vma) void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) @@ -1123,12 +1121,10 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, &tmp_list); - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + list_for_each_entry(uprobe, &tmp_list, pending_list) { loff_t vaddr; - list_del(&uprobe->pending_list); vaddr = vma_address(vma, uprobe->offset); - if (vaddr >= start && vaddr < end) { /* * An unregister could have removed the probe before -- cgit v1.2.3-59-g8ed1b From 816c03fbabe64fa09f66fbb64e932081af381415 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:55 +0200 Subject: uprobes: Don't use loff_t for the valid virtual address loff_t looks confusing when it is used for the virtual address. Change map_info and install_breakpoint/remove_breakpoint paths to use "unsigned long". The patch doesn't change vma_address(), it can't return "long" because it is used to verify the mapping. But probably this needs some cleanups too. 
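A small reminder of the type distinction behind the change (build with -m32 to see the difference): file offsets are 64-bit even on 32-bit hosts, while a user virtual address always fits in unsigned long, so carrying a vaddr around as loff_t is both oversized and misleading. Purely a sketch.

#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <sys/types.h>

int main(void)
{
	printf("sizeof(off_t)         = %zu\n", sizeof(off_t));		/* 8 everywhere */
	printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));	/* 4 on 32-bit */
	return 0;
}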
Signed-off-by: Oleg Nesterov Signed-off-by: Anton Arapov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154355.GA9622@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7d5c78f063ae..4df84b76dd48 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -664,9 +664,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) */ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, loff_t vaddr) + struct vm_area_struct *vma, unsigned long vaddr) { - unsigned long addr; int ret; /* @@ -679,8 +678,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (!uprobe->consumers) return -EEXIST; - addr = (unsigned long)vaddr; - if (!(uprobe->flags & UPROBE_COPY_INSN)) { ret = copy_insn(uprobe, vma->vm_file); if (ret) @@ -689,7 +686,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -ENOTSUPP; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) return ret; @@ -709,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence increment before and decrement on failure. */ atomic_inc(&mm->uprobes_state.count); - ret = set_swbp(&uprobe->arch, mm, addr); + ret = set_swbp(&uprobe->arch, mm, vaddr); if (ret) atomic_dec(&mm->uprobes_state.count); @@ -717,9 +714,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, } static void -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) + if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) atomic_dec(&mm->uprobes_state.count); } @@ -743,7 +740,7 @@ static void delete_uprobe(struct uprobe *uprobe) struct map_info { struct map_info *next; struct mm_struct *mm; - loff_t vaddr; + unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) @@ -837,7 +834,6 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; - loff_t vaddr; if (err) goto free; @@ -847,9 +843,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (!vma || !valid_vma(vma, is_register)) goto unlock; - vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != info->vaddr) + vma_address(vma, uprobe->offset) != info->vaddr) goto unlock; if (is_register) { @@ -1055,10 +1050,8 @@ int uprobe_mmap(struct vm_area_struct *vma) count = 0; list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr; - if (!ret) { - vaddr = vma_address(vma, uprobe->offset); + loff_t vaddr = vma_address(vma, uprobe->offset); if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { put_uprobe(uprobe); @@ -1122,9 +1115,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon build_probe_list(inode, &tmp_list); list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr; + loff_t vaddr = vma_address(vma, uprobe->offset); - vaddr = vma_address(vma, uprobe->offset); if (vaddr >= 
start && vaddr < end) { /* * An unregister could have removed the probe before -- cgit v1.2.3-59-g8ed1b From 593609a59600c8377f311b300f14deacb155b9a4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:59 +0200 Subject: uprobes: __copy_insn() needs "loff_t offset" 1. __copy_insn() needs "loff_t offset", not "unsigned long", to read the file. 2. use pgoff_t for "idx" and remove the unnecessary typecast. 3. fix the typo, "&=" is not what we want 4. can't resist, rename off1 to off. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154359.GA9625@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4df84b76dd48..d1b2eeb80837 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -581,12 +581,12 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) static int __copy_insn(struct address_space *mapping, struct file *filp, char *insn, - unsigned long nbytes, unsigned long offset) + unsigned long nbytes, loff_t offset) { struct page *page; void *vaddr; - unsigned long off1; - unsigned long idx; + unsigned long off; + pgoff_t idx; if (!filp) return -EINVAL; @@ -594,8 +594,8 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, if (!mapping->a_ops->readpage) return -EIO; - idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); - off1 = offset &= ~PAGE_MASK; + idx = offset >> PAGE_CACHE_SHIFT; + off = offset & ~PAGE_MASK; /* * Ensure that the page that has the original instruction is @@ -606,7 +606,7 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, return PTR_ERR(page); vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off1, nbytes); + memcpy(insn, vaddr + off, nbytes); kunmap_atomic(vaddr); page_cache_release(page); -- cgit v1.2.3-59-g8ed1b From e227051b13956b8f71c0abecc41ad351e64671c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:44:01 +0200 Subject: uprobes: Remove the unnecessary initialization in add_utask() Trivial cleanup. No need to nullify ->active_uprobe after kzalloc(). Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154401.GA9633@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1b2eeb80837..f93532748bca 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1392,7 +1392,6 @@ static struct uprobe_task *add_utask(void) if (unlikely(!utask)) return NULL; - utask->active_uprobe = NULL; current->utask = utask; return utask; } -- cgit v1.2.3-59-g8ed1b From fbfc623f8231c8d8c78aab5841e9c6e5811ab638 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:31 +0800 Subject: perf: Avoid race between cpu hotplug and installing event perf_event_open() requires the cpu on which to install event is online, but the cpu can go offline after perf_event_open checks that. Add a get_online_cpus()/put_online_cpus() pair to avoid the race. 
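A userspace analogue of the fix, with a pthread rwlock standing in for get_online_cpus()/put_online_cpus() (everything here is illustrative): the state that was validated, "the cpu is online", is kept stable until the install completes instead of being allowed to change between the check and the use. Build with -pthread.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool cpu_online = true;

static int install_event(void)
{
	int err = 0;

	pthread_rwlock_rdlock(&hotplug_lock);	/* ~ get_online_cpus() */
	if (!cpu_online)
		err = -1;			/* would be an errno in the real syscall */
	else
		printf("event installed while the cpu is guaranteed online\n");
	pthread_rwlock_unlock(&hotplug_lock);	/* ~ put_online_cpus() */
	return err;
}

int main(void)
{
	return install_event();
}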
Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-3-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d7d71d6ec972..31d182e01549 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6252,6 +6252,8 @@ SYSCALL_DEFINE5(perf_event_open, } } + get_online_cpus(); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { @@ -6391,6 +6393,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); + put_online_cpus(); + event->owner = current; mutex_lock(¤t->perf_event_mutex); @@ -6419,6 +6423,7 @@ err_context: err_alloc: free_event(event); err_task: + put_online_cpus(); if (task) put_task_struct(task); err_group_fd: -- cgit v1.2.3-59-g8ed1b From e2d37cd213dcc0aeb3db4b37b9bd1710fe36fbf7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:32 +0800 Subject: perf: Allow the PMU driver to choose the CPU on which to install events Allow the pmu->event_init callback to change event->cpu, so the PMU driver can choose the CPU on which to install events. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-4-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 31d182e01549..fa36a39e8bb7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6306,7 +6306,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, cpu); + ctx = find_get_context(pmu, task, event->cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -6379,16 +6379,16 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); + perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, cpu); + perf_install_in_context(ctx, sibling, event->cpu); get_ctx(ctx); } } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); -- cgit v1.2.3-59-g8ed1b From 0cda4c023132aa93f2dd94811061f812e88daf4c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:33 +0800 Subject: perf: Introduce perf_pmu_migrate_context() Originally from Peter Zijlstra. The helper migrates perf events from one cpu to another cpu. 
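A hypothetical caller sketch, not part of this patch: an uncore-style PMU driver could invoke the new helper from its hotplug path to move per-socket events off a CPU that is going down. Only perf_pmu_migrate_context() comes from the patch; the driver names and the target-CPU policy below are assumptions, and the fragment is kernel-side code, not buildable on its own.

#include <linux/perf_event.h>
#include <linux/cpumask.h>

static struct pmu my_uncore_pmu;	/* assumed to exist in the driver */

static void my_uncore_cpu_offline(int cpu)
{
	unsigned int target;

	/* pick any other online CPU as the new owner of the context */
	target = cpumask_any_but(cpu_online_mask, cpu);
	if (target >= nr_cpu_ids)
		return;			/* last CPU: nothing left to migrate to */

	perf_pmu_migrate_context(&my_uncore_pmu, cpu, target);
}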
Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-5-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1ce887abcc5c..76c5c8b724a7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1107,6 +1107,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, struct task_struct *task, perf_overflow_handler_t callback, void *context); +extern void perf_pmu_migrate_context(struct pmu *pmu, + int src_cpu, int dst_cpu); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); diff --git a/kernel/events/core.c b/kernel/events/core.c index fa36a39e8bb7..f1cf0edeb39a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); event->ctx = ctx; + if (event->cpu != -1) + event->cpu = cpu; if (!task) { /* @@ -6379,6 +6381,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { + synchronize_rcu(); perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -6484,6 +6487,39 @@ err: } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx; + struct perf_event_context *dst_ctx; + struct perf_event *event, *tmp; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + + mutex_lock(&src_ctx->mutex); + list_for_each_entry_safe(event, tmp, &src_ctx->event_list, + event_entry) { + perf_remove_from_context(event); + put_ctx(src_ctx); + list_add(&event->event_entry, &events); + } + mutex_unlock(&src_ctx->mutex); + + synchronize_rcu(); + + mutex_lock(&dst_ctx->mutex); + list_for_each_entry_safe(event, tmp, &events, event_entry) { + list_del(&event->event_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + mutex_unlock(&dst_ctx->mutex); +} +EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); + static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { -- cgit v1.2.3-59-g8ed1b From 6648bd7e0e62c0c8c03b15e00c9e7015e232feff Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 21 Jun 2012 13:58:31 +0000 Subject: ipv4: Add sysctl knob to control early socket demux This change is meant to add a control for disabling early socket demux. The main motivation behind this patch is to provide an option to disable the feature as it adds an additional cost to routing that reduces overall throughput by up to 5%. For example one of my systems went from 12.1Mpps to 11.6 after the early socket demux was added. It looks like the reason for the regression is that we are now having to perform two lookups, first the one for an established socket, and then the one for the routing table. By adding this patch and toggling the value for ip_early_demux to 0 I am able to get back to the 12.1Mpps I was previously seeing. [ Move local variables in ip_rcv_finish() down into the basic block in which they are actually used. 
-DaveM ] Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/sysctl.h | 1 + include/net/ip.h | 3 +++ kernel/sysctl_binary.c | 2 ++ net/ipv4/ip_input.c | 22 +++++++++++++--------- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 5 files changed, 26 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index c34b4c82b0dc..20825e5f433f 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -425,6 +425,7 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, + NET_IPV4_EARLY_DEMUX=126, }; enum { diff --git a/include/net/ip.h b/include/net/ip.h index 83e0619f59d0..50841bd6f10e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -210,6 +210,9 @@ extern int inet_peer_threshold; extern int inet_peer_minttl; extern int inet_peer_maxttl; +/* From ip_input.c */ +extern int sysctl_ip_early_demux; + /* From ip_output.c */ extern int sysctl_ip_dynaddr; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694883a1..6a3cf8253aae 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -415,6 +415,8 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ + { CTL_INT, NET_IPV4_EARLY_DEMUX, "ip_early_demux" }, + { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, /* NET_TCP_DEFAULT_WIN_SCALE unused */ diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 93b092c9a394..bca25179cdb9 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -313,6 +313,8 @@ drop: return true; } +int sysctl_ip_early_demux __read_mostly = 1; + static int ip_rcv_finish(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); @@ -323,16 +325,18 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (skb_dst(skb) == NULL) { - const struct net_protocol *ipprot; - int protocol = iph->protocol; - int err; + int err = -ENOENT; - rcu_read_lock(); - ipprot = rcu_dereference(inet_protos[protocol]); - err = -ENOENT; - if (ipprot && ipprot->early_demux) - err = ipprot->early_demux(skb); - rcu_read_unlock(); + if (sysctl_ip_early_demux) { + const struct net_protocol *ipprot; + int protocol = iph->protocol; + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && ipprot->early_demux) + err = ipprot->early_demux(skb); + rcu_read_unlock(); + } if (err) { err = ip_route_input_noref(skb, iph->daddr, iph->saddr, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef32956ed655..12aa0c5867c4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -300,6 +300,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ip_early_demux", + .data = &sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "ip_dynaddr", .data = &sysctl_ip_dynaddr, -- cgit v1.2.3-59-g8ed1b From dfbce08c19cba2ba4faaf8c0dd6d7678f46c78dd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 22 Jun 2012 23:02:22 -0700 Subject: ipv4: Don't add deprecated new binary sysctl value. Reported-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/sysctl.h | 1 - kernel/sysctl_binary.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 20825e5f433f..c34b4c82b0dc 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -425,7 +425,6 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, - NET_IPV4_EARLY_DEMUX=126, }; enum { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6a3cf8253aae..a650694883a1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -415,8 +415,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ - { CTL_INT, NET_IPV4_EARLY_DEMUX, "ip_early_demux" }, - { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, /* NET_TCP_DEFAULT_WIN_SCALE unused */ -- cgit v1.2.3-59-g8ed1b From c64e66c67b574f25a048886807c2007d17d50d0a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 26 Jun 2012 21:45:21 -0700 Subject: audit: netlink: Move away from NLMSG_NEW(). And use nlmsg_data() while we're here too. Signed-off-by: David S. Miller --- kernel/audit.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 1c7f2c61416b..30b252a1fb61 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb) static void audit_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - char *data = NLMSG_DATA(nlh); + char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE) { if (printk_ratelimit()) @@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, if (!skb) return NULL; - nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); - data = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, pid, seq, t, size, flags); + if (!nlh) + goto out_kfree_skb; + data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; -nlmsg_failure: /* Used by NLMSG_NEW */ - if (skb) - kfree_skb(skb); +out_kfree_skb: + kfree_skb(skb); return NULL; } @@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; - data = NLMSG_DATA(nlh); + data = nlmsg_data(nlh); switch (msg_type) { case AUDIT_GET: @@ -1060,13 +1061,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) - goto nlmsg_failure; + goto err; - nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); + nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); + if (!nlh) + goto out_kfree_skb; return ab; -nlmsg_failure: /* Used by NLMSG_NEW */ +out_kfree_skb: kfree_skb(ab->skb); ab->skb = NULL; err: -- cgit v1.2.3-59-g8ed1b From 0be61ebc18b919dddbdbcd1c4f42513c310ecf59 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 18 Jun 2012 09:28:16 -0400 Subject: tracing/selftest: Add a WARN_ON() if a tracer test fails Add a WARN_ON() output on test failures so that they are easier to detect in automated tests. Although, the WARN_ON() will not print if the test causes the system to crash, obviously. 
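A userspace sketch of why the WARN_ON() helps automation: it emits a loud, greppable line with file and line number at the point of failure while execution continues, unlike a bare "FAILED" print. The macro below only mimics the idea; the kernel's WARN_ON() also dumps a backtrace and returns the condition's value.

#include <stdio.h>

#define WARN_ON(cond) do { \
	if (cond) \
		fprintf(stderr, "WARNING: %s:%d: %s\n", __FILE__, __LINE__, #cond); \
} while (0)

static int run_selftest(void)
{
	return -1;	/* pretend the tracer selftest failed */
}

int main(void)
{
	int ret = run_selftest();

	if (ret) {
		fprintf(stderr, "FAILED!\n");
		WARN_ON(1);	/* easy for a test rig to grep for */
	}
	return 0;
}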
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49249c28690d..748f6401edf6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -830,6 +830,8 @@ int register_tracer(struct tracer *type) current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ -- cgit v1.2.3-59-g8ed1b From 6d158a813efcd09661c23f16ddf7e2ff834cb20c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Jun 2012 20:46:14 -0400 Subject: tracing: Remove NR_CPUS array from trace_iterator Replace the NR_CPUS array of buffer_iter from the trace_iterator with an allocated array. This will just create an array of possible CPUS instead of the max number specified. The use of NR_CPUS in that array caused allocation failures for machines that were tight on memory. This did not cause any failures to the system itself (no crashes), but caused unnecessary failures for reading the trace files. Added a helper function called 'trace_buffer_iter()' that returns the buffer_iter item or NULL if it is not defined or the array was not allocated. Some routines do not require the array (tracing_open_pipe() for one). Reported-by: Dave Jones Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.c | 27 ++++++++++++++++++--------- kernel/trace/trace.h | 8 ++++++++ kernel/trace/trace_functions_graph.c | 2 +- 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 1aff18346c71..af961d6f7ab1 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -65,7 +65,7 @@ struct trace_iterator { void *private; int cpu_file; struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; + struct ring_buffer_iter **buffer_iter; unsigned long iter_flags; /* trace_seq for __print_flags() and __print_symbolic() etc. 
*/ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 748f6401edf6..b2af14e94c28 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1710,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); + iter->idx++; - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + if (buf_iter) + ring_buffer_read(buf_iter, NULL); } static struct trace_entry * @@ -1720,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); @@ -1858,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) tr->data[cpu]->skipped_entries = 0; - if (!iter->buffer_iter[cpu]) + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) return; - buf_iter = iter->buffer_iter[cpu]; ring_buffer_iter_reset(buf_iter); /* @@ -2207,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) int trace_empty(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { cpu = iter->cpu_file; - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2223,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter) } for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2383,6 +2388,8 @@ __tracing_open(struct inode *inode, struct file *file) if (!iter) return ERR_PTR(-ENOMEM); + iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + GFP_KERNEL); /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. 
@@ -2443,6 +2450,7 @@ __tracing_open(struct inode *inode, struct file *file) fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } @@ -2483,6 +2491,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5aec220d2de0..55e1f7f0db12 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -317,6 +317,14 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + if (iter->buffer_iter && iter->buffer_iter[cpu]) + return iter->buffer_iter[cpu]; + return NULL; +} + int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c653d8..ce27c8ba8d31 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter, next = &data->ret; } else { - ring_iter = iter->buffer_iter[iter->cpu]; + ring_iter = trace_buffer_iter(iter, iter->cpu); /* First peek to compare current entry and the next one */ if (ring_iter) -- cgit v1.2.3-59-g8ed1b From a5fb833172eca69136e9ee1ada778e404086ab8a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Jun 2012 13:35:04 -0400 Subject: ring-buffer: Fix uninitialized read_stamp The ring buffer reader page is used to swap a page from the writable ring buffer. If the writer happens to be on that page, it ends up on the reader page, but will simply move off of it, back into the writable ring buffer as writes are added. The time stamp passed back to the readers is stored in the cpu_buffer per CPU descriptor. This stamp is updated when a swap of the reader page takes place, and it reads the current stamp from the page taken from the writable ring buffer. Everytime a writer goes to a new page, it updates the time stamp of that page. The problem happens if a reader reads a page from an empty per CPU ring buffer. If the buffer is empty, the swap still takes place, placing the writer at the start of the reader page. If at a later time, a write happens, it updates the page's time stamp and continues. But the problem is that the read_stamp does not get updated, because the page was already swapped. The solution to this was to not swap the page if the ring buffer happens to be empty. This also removes the side effect that the writes on the reader page will not get updated because the writer never gets back on the reader page without a swap. That is, if a read happens on an empty buffer, but then no reads happen for a while. If a swap took place, and the writer were to start writing a lot of data (function tracer), it will start overflowing the ring buffer and overwrite the older data. But because the writer never goes back onto the reader page, the data left on the reader page never gets overwritten. This causes the reader to see really old data, followed by a jump to newer data. 
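To make the sequence above easier to follow, here is a toy user-space model. It is not the real ring-buffer code and uses hypothetical names (page_model, reader_model, try_swap); it only shows why a stamp that is copied exclusively at swap time goes stale when the swap is taken on an empty buffer, and how refusing the empty swap, as the patch below does, avoids that.

/* Toy model of the stale read_stamp problem: the reader's stamp is
 * refreshed only when a page is swapped, so a swap taken while the
 * buffer is empty never reflects the timestamp of later writes.
 */
#include <stdio.h>
#include <stdbool.h>

struct page_model {
	unsigned long long stamp;	/* writer updates this when it starts the page */
	int entries;
};

struct reader_model {
	unsigned long long read_stamp;	/* copied from the page at swap time only */
};

static bool try_swap(struct reader_model *r, struct page_model *p, bool skip_if_empty)
{
	if (skip_if_empty && p->entries == 0)
		return false;		/* the fix: don't swap an empty page */
	r->read_stamp = p->stamp;	/* the only place read_stamp is refreshed */
	return true;
}

int main(void)
{
	struct page_model page = { .stamp = 100, .entries = 0 };
	struct reader_model old_reader = { 0 }, fixed_reader = { 0 };

	try_swap(&old_reader, &page, false);	/* old behaviour: swap even though empty */
	try_swap(&fixed_reader, &page, true);	/* fixed behaviour: swap refused */

	/* A write lands later: the page's stamp moves on. */
	page.stamp = 200;
	page.entries = 1;

	try_swap(&fixed_reader, &page, true);	/* fixed reader swaps now and sees 200 */

	printf("old reader stamp: %llu (stale)\n", old_reader.read_stamp);
	printf("fixed reader stamp: %llu\n", fixed_reader.read_stamp);
	return 0;
}

The point of the model is only the ordering: once the empty swap has happened, nothing ever copies the newer stamp to the reader, which matches the jump from very old to new data described above.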
Link: http://lkml.kernel.org/r/1340060577-9112-1-git-send-email-dhsharp@google.com Google-Bug-Id: 6410455 Reported-by: David Sharp tested-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..82a3e0c56b1d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + /* * Reset the reader page to size zero. */ -- cgit v1.2.3-59-g8ed1b From 44b99462d9d776522e174d6c531ce5ccef309e26 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 22 Jun 2012 11:50:05 -0700 Subject: ring-buffer: Fix crash due to uninitialized new_pages list head The new_pages list head in the cpu_buffer is not initialized. When adding pages to the ring buffer, if the memory allocation fails in ring_buffer_resize, the clean up handler tries to free up the allocated pages from all the cpu buffers. The panic is caused by referencing the uninitialized new_pages list head. Initializing the new_pages list head in rb_allocate_cpu_buffer fixes this. Link: http://lkml.kernel.org/r/1340391005-10880-1-git-send-email-vnagarnaik@google.com Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..ba39cbabdc9f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) -- cgit v1.2.3-59-g8ed1b From 48fdc72f23ad9a9956e524a47843135d0bbc3317 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 29 Jun 2012 12:31:41 -0700 Subject: ring-buffer: Fix accounting of entries when removing pages When removing pages from the ring buffer, its state is not reset. This means that the counters need to be correctly updated to account for the pages removed. Update the overrun counter to reflect the removed events from the pages. Link: http://lkml.kernel.org/r/1340998301-1715-1-git-send-email-vnagarnaik@google.com Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ba39cbabdc9f..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1347,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. - * No need to update overruns, since this page is - * deleted from ring buffer and its entries are - * already accounted for. + * Increment overrun to account for the lost events. 
*/ + local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); } -- cgit v1.2.3-59-g8ed1b From a31f2d17b331db970259e875b7223d3aba7e3821 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 Jun 2012 06:15:21 +0000 Subject: netlink: add netlink_kernel_cfg parameter to netlink_kernel_create This patch adds the following structure: struct netlink_kernel_cfg { unsigned int groups; void (*input)(struct sk_buff *skb); struct mutex *cb_mutex; }; That can be passed to netlink_kernel_create to set optional configurations for netlink kernel sockets. I've populated this structure by looking for NULL and zero parameters at the existing code. The remaining parameters that always need to be set are still left in the original interface. That includes optional parameters for the netlink socket creation. This allows easy extensibility of this interface in the future. This patch also adapts all callers to use this new interface. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- crypto/crypto_user.c | 7 +++++-- drivers/connector/connector.c | 13 +++++++++---- drivers/infiniband/core/netlink.c | 7 +++++-- drivers/scsi/scsi_netlink.c | 7 +++++-- drivers/scsi/scsi_transport_iscsi.c | 9 ++++++--- drivers/staging/gdm72xx/netlink_k.c | 6 ++++-- include/linux/netlink.h | 15 ++++++++++----- kernel/audit.c | 7 +++++-- lib/kobject_uevent.c | 5 ++++- net/bridge/netfilter/ebt_ulog.c | 6 ++++-- net/core/rtnetlink.c | 9 +++++++-- net/core/sock_diag.c | 8 ++++++-- net/decnet/netfilter/dn_rtmsg.c | 8 +++++--- net/ipv4/fib_frontend.c | 7 +++++-- net/ipv4/netfilter/ipt_ULOG.c | 8 +++++--- net/netfilter/nfnetlink.c | 7 +++++-- net/netlink/af_netlink.c | 16 ++++++++++------ net/netlink/genetlink.c | 10 +++++++--- net/xfrm/xfrm_user.c | 7 +++++-- security/selinux/netlink.c | 6 +++++- 20 files changed, 117 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c index 5a37eadb4e56..ba2c611154af 100644 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@ -496,9 +496,12 @@ static void crypto_netlink_rcv(struct sk_buff *skb) static int __init crypto_user_init(void) { + struct netlink_kernel_cfg cfg = { + .input = crypto_netlink_rcv, + }; + crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO, - 0, crypto_netlink_rcv, - NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!crypto_nlsk) return -ENOMEM; diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 34e0e9e4d913..116cf8d02834 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -251,15 +251,20 @@ static const struct file_operations cn_file_ops = { .release = single_release }; +static struct cn_dev cdev = { + .input = cn_rx_skb, +}; + static int __devinit cn_init(void) { struct cn_dev *dev = &cdev; - - dev->input = cn_rx_skb; + struct netlink_kernel_cfg cfg = { + .groups = CN_NETLINK_USERS + 0xf, + .input = dev->input, + }; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, - CN_NETLINK_USERS + 0xf, - dev->input, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!dev->nls) return -EIO; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 1e691dca1820..3ae2bfd31015 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -173,8 +173,11 @@ static void ibnl_rcv(struct sk_buff *skb) int __init ibnl_init(void) { - nls = netlink_kernel_create(&init_net, NETLINK_RDMA, 0, ibnl_rcv, - NULL, THIS_MODULE); + struct netlink_kernel_cfg 
cfg = { + .input = ibnl_rcv, + }; + + nls = netlink_kernel_create(&init_net, NETLINK_RDMA, THIS_MODULE, &cfg); if (!nls) { pr_warn("Failed to create netlink socket\n"); return -ENOMEM; diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index c77628afbf9f..8818dd681c19 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -486,6 +486,10 @@ void scsi_netlink_init(void) { int error; + struct netlink_kernel_cfg cfg = { + .input = scsi_nl_rcv_msg, + .groups = SCSI_NL_GRP_CNT, + }; INIT_LIST_HEAD(&scsi_nl_drivers); @@ -497,8 +501,7 @@ scsi_netlink_init(void) } scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT, - SCSI_NL_GRP_CNT, scsi_nl_rcv_msg, NULL, - THIS_MODULE); + THIS_MODULE, &cfg); if (!scsi_nl_sock) { printk(KERN_ERR "%s: register of receive handler failed\n", __func__); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 1cf640e575da..6042954d8f3b 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2936,7 +2936,10 @@ EXPORT_SYMBOL_GPL(iscsi_unregister_transport); static __init int iscsi_transport_init(void) { int err; - + struct netlink_kernel_cfg cfg = { + .groups = 1, + .input = iscsi_if_rx, + }; printk(KERN_INFO "Loading iSCSI transport class v%s.\n", ISCSI_TRANSPORT_VERSION); @@ -2966,8 +2969,8 @@ static __init int iscsi_transport_init(void) if (err) goto unregister_conn_class; - nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, 1, iscsi_if_rx, - NULL, THIS_MODULE); + nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, + THIS_MODULE, &cfg); if (!nls) { err = -ENOBUFS; goto unregister_session_class; diff --git a/drivers/staging/gdm72xx/netlink_k.c b/drivers/staging/gdm72xx/netlink_k.c index 2489bb5597ca..87c3a07ed80e 100644 --- a/drivers/staging/gdm72xx/netlink_k.c +++ b/drivers/staging/gdm72xx/netlink_k.c @@ -88,13 +88,15 @@ struct sock *netlink_init(int unit, void (*cb)(struct net_device *dev, u16 type, void *msg, int len)) { struct sock *sock; + struct netlink_kernel_cfg cfg = { + .input = netlink_rcv, + }; #if !defined(DEFINE_MUTEX) init_MUTEX(&netlink_mutex); #endif - sock = netlink_kernel_create(&init_net, unit, 0, netlink_rcv, NULL, - THIS_MODULE); + sock = netlink_kernel_create(&init_net, unit, THIS_MODULE, &cfg); if (sock) rcv_cb = cb; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index ed33f0901bc2..6085e4919cb3 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -174,11 +174,16 @@ struct netlink_skb_parms { extern void netlink_table_grab(void); extern void netlink_table_ungrab(void); -extern struct sock *netlink_kernel_create(struct net *net, - int unit,unsigned int groups, - void (*input)(struct sk_buff *skb), - struct mutex *cb_mutex, - struct module *module); +/* optional Netlink kernel configuration parameters */ +struct netlink_kernel_cfg { + unsigned int groups; + void (*input)(struct sk_buff *skb); + struct mutex *cb_mutex; +}; + +extern struct sock *netlink_kernel_create(struct net *net, int unit, + struct module *module, + struct netlink_kernel_cfg *cfg); extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/kernel/audit.c b/kernel/audit.c index 30b252a1fb61..4a3f28d2ca65 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -962,14 +962,17 @@ static void audit_receive(struct sk_buff *skb) static int __init audit_init(void) 
{ int i; + struct netlink_kernel_cfg cfg = { + .input = audit_receive, + }; if (audit_initialized == AUDIT_DISABLED) return 0; printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, + THIS_MODULE, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 1a91efa6d121..0401d2916d9f 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -373,13 +373,16 @@ EXPORT_SYMBOL_GPL(add_uevent_var); static int uevent_net_init(struct net *net) { struct uevent_sock *ue_sk; + struct netlink_kernel_cfg cfg = { + .groups = 1, + }; ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); if (!ue_sk) return -ENOMEM; ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, - 1, NULL, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!ue_sk->sk) { printk(KERN_ERR "kobject_uevent: unable to create netlink socket!\n"); diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 1bd173218f7b..374bdcd77039 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -282,6 +282,9 @@ static int __init ebt_ulog_init(void) { int ret; int i; + struct netlink_kernel_cfg cfg = { + .groups = EBT_ULOG_MAXNLGROUPS, + }; if (nlbufsiz >= 128*1024) { pr_warning("Netlink buffer has to be <= 128kB," @@ -296,8 +299,7 @@ static int __init ebt_ulog_init(void) } ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, - EBT_ULOG_MAXNLGROUPS, NULL, NULL, - THIS_MODULE); + THIS_MODULE, &cfg); if (!ebtulognl) ret = -ENOMEM; else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc8a1cdaac98..2b325c340b44 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2353,8 +2353,13 @@ static struct notifier_block rtnetlink_dev_notifier = { static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .groups = RTNLGRP_MAX, + .input = rtnetlink_rcv, + .cb_mutex = &rtnl_mutex, + }; + + sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index ff2967acbfae..07a29eb34a41 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -171,8 +171,12 @@ EXPORT_SYMBOL_GPL(sock_diag_nlsk); static int __init sock_diag_init(void) { - sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0, - sock_diag_rcv, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = sock_diag_rcv, + }; + + sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, + THIS_MODULE, &cfg); return sock_diag_nlsk == NULL ? 
-ENOMEM : 0; } diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index b8f7f5b8c350..11db0ecf342f 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -125,11 +125,13 @@ static struct nf_hook_ops dnrmg_ops __read_mostly = { static int __init dn_rtmsg_init(void) { int rv = 0; + struct netlink_kernel_cfg cfg = { + .groups = DNRNG_NLGRP_MAX, + .input = dnrmg_receive_user_skb, + }; dnrmg = netlink_kernel_create(&init_net, - NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, - dnrmg_receive_user_skb, - NULL, THIS_MODULE); + NETLINK_DNRTMSG, THIS_MODULE, &cfg); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ae528d1b293a..3e11ea225dad 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -976,8 +976,11 @@ static void nl_fib_input(struct sk_buff *skb) static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, - nl_fib_input, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = nl_fib_input, + }; + + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg); if (sk == NULL) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 99b3f53f16a7..1109f7f6c254 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -381,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { static int __init ulog_tg_init(void) { int ret, i; + struct netlink_kernel_cfg cfg = { + .groups = ULOG_MAXNLGROUPS, + }; pr_debug("init module\n"); @@ -393,9 +396,8 @@ static int __init ulog_tg_init(void) for (i = 0; i < ULOG_MAXNLGROUPS; i++) setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(&init_net, - NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, - NULL, THIS_MODULE); + nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, + THIS_MODULE, &cfg); if (!nflognl) return -ENOMEM; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 3e797d1fcb94..700e4616a098 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -203,9 +203,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) static int __net_init nfnetlink_net_init(struct net *net) { struct sock *nfnl; + struct netlink_kernel_cfg cfg = { + .groups = NFNLGRP_MAX, + .input = nfnetlink_rcv, + }; - nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, - nfnetlink_rcv, NULL, THIS_MODULE); + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, THIS_MODULE, &cfg); if (!nfnl) return -ENOMEM; net->nfnl_stash = nfnl; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b3025a603d56..43a124feaad8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1503,14 +1503,16 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(struct net *net, int unit, unsigned int groups, - void (*input)(struct sk_buff *skb), - struct mutex *cb_mutex, struct module *module) +netlink_kernel_create(struct net *net, int unit, + struct module *module, + struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; + struct mutex *cb_mutex = cfg ? 
cfg->cb_mutex : NULL; + unsigned int groups; BUG_ON(!nl_table); @@ -1532,16 +1534,18 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, sk = sock->sk; sk_change_net(sk, net); - if (groups < 32) + if (!cfg || cfg->groups < 32) groups = 32; + else + groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; - if (input) - nlk_sk(sk)->netlink_rcv = input; + if (cfg && cfg->input) + nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, net, 0)) goto out_sock_release; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2cc7c1ee7690..32761b53015e 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -915,10 +915,14 @@ static struct genl_multicast_group notify_grp = { static int __net_init genl_pernet_init(struct net *net) { + struct netlink_kernel_cfg cfg = { + .input = genl_rcv, + .cb_mutex = &genl_mutex, + }; + /* we'll bump the group number right afterwards */ - net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0, - genl_rcv, &genl_mutex, - THIS_MODULE); + net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, + THIS_MODULE, &cfg); if (!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 540762726aaf..e75d8e47f35c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2959,9 +2959,12 @@ static struct xfrm_mgr netlink_mgr = { static int __net_init xfrm_user_net_init(struct net *net) { struct sock *nlsk; + struct netlink_kernel_cfg cfg = { + .groups = XFRMNLGRP_MAX, + .input = xfrm_netlink_rcv, + }; - nlsk = netlink_kernel_create(net, NETLINK_XFRM, XFRMNLGRP_MAX, - xfrm_netlink_rcv, NULL, THIS_MODULE); + nlsk = netlink_kernel_create(net, NETLINK_XFRM, THIS_MODULE, &cfg); if (nlsk == NULL) return -ENOMEM; net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */ diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 8a23a35b9c5b..8a77725423e0 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -111,8 +111,12 @@ void selnl_notify_policyload(u32 seqno) static int __init selnl_init(void) { + struct netlink_kernel_cfg cfg = { + .groups = SELNLGRP_MAX, + }; + selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX, - SELNLGRP_MAX, NULL, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); -- cgit v1.2.3-59-g8ed1b From 62c552ccc3eda1198632a4f344aa32623d226bab Mon Sep 17 00:00:00 2001 From: Bojan Smojver Date: Sat, 16 Jun 2012 00:09:58 +0200 Subject: PM / Hibernate: Enable suspend to both for in-kernel hibernation. It is often useful to suspend to memory after hibernation image has been written to disk. If the battery runs out or power is otherwise lost, the computer will resume from the hibernated image. If not, it will resume from memory and hibernation image will be discarded. Signed-off-by: Bojan Smojver Signed-off-by: Rafael J. 
Wysocki --- Documentation/power/swsusp.txt | 5 +++++ kernel/power/hibernate.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/power/power.h | 3 +++ kernel/power/swap.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+) (limited to 'kernel') diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt index ac190cf1963e..92341b84250d 100644 --- a/Documentation/power/swsusp.txt +++ b/Documentation/power/swsusp.txt @@ -33,6 +33,11 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state echo platform > /sys/power/disk; echo disk > /sys/power/state +. If you would like to write hibernation image to swap and then suspend +to RAM (provided your platform supports it), you can try + +echo suspend > /sys/power/disk; echo disk > /sys/power/state + . If you have SATA disks, you'll need recent kernels with SATA suspend support. For suspend and resume to work, make sure your disk drivers are built into kernel -- not modules. [There's way to make diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8b53db38a279..21ad3fe3164f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -5,6 +5,7 @@ * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2004 Pavel Machek * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. + * Copyright (C) 2012 Bojan Smojver * * This file is released under the GPLv2. */ @@ -46,6 +47,9 @@ enum { HIBERNATION_PLATFORM, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, +#ifdef CONFIG_SUSPEND + HIBERNATION_SUSPEND, +#endif /* keep last */ __HIBERNATION_AFTER_LAST }; @@ -574,6 +578,10 @@ int hibernation_platform_enter(void) */ static void power_down(void) { +#ifdef CONFIG_SUSPEND + int error; +#endif + switch (hibernation_mode) { case HIBERNATION_REBOOT: kernel_restart(NULL); @@ -583,6 +591,25 @@ static void power_down(void) case HIBERNATION_SHUTDOWN: kernel_power_off(); break; +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + if (error) { + if (hibernation_ops) + hibernation_mode = HIBERNATION_PLATFORM; + else + hibernation_mode = HIBERNATION_SHUTDOWN; + power_down(); + } + /* + * Restore swap signature. + */ + error = swsusp_unmark(); + if (error) + printk(KERN_ERR "PM: Swap will be unusable! 
" + "Try swapon -a.\n"); + return; +#endif } kernel_halt(); /* @@ -827,6 +854,9 @@ static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", +#ifdef CONFIG_SUSPEND + [HIBERNATION_SUSPEND] = "suspend", +#endif }; /* @@ -867,6 +897,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -907,6 +940,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif hibernation_mode = mode; break; case HIBERNATION_PLATFORM: diff --git a/kernel/power/power.h b/kernel/power/power.h index b0bd4beaebfe..7d4b7ffb3c1d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -156,6 +156,9 @@ extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(fmode_t); +#ifdef CONFIG_SUSPEND +extern int swsusp_unmark(void); +#endif /* kernel/power/block_io.c */ extern struct block_device *hib_resume_bdev; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11e22c068e8b..83d505142b00 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode) blkdev_put(hib_resume_bdev, mode); } +/** + * swsusp_unmark - Unmark swsusp signature in the resume device + */ + +#ifdef CONFIG_SUSPEND +int swsusp_unmark(void) +{ + int error; + + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); + } else { + printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + error = -ENODEV; + } + + /* + * We just returned from suspend, we don't need the image any more. + */ + free_all_swap_pages(root_swap); + + return error; +} +#endif + static int swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); -- cgit v1.2.3-59-g8ed1b From 443772d408a25af62498793f6f805ce3c559309a Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 16 Jun 2012 15:30:45 +0200 Subject: ftrace: Disable function tracing during suspend/resume and hibernation, again If function tracing is enabled for some of the low-level suspend/resume functions, it leads to triple fault during resume from suspend, ultimately ending up in a reboot instead of a resume (or a total refusal to come out of suspended state, on some machines). This issue was explained in more detail in commit f42ac38c59e0a03d (ftrace: disable tracing for suspend to ram). However, the changes made by that commit got reverted by commit cbe2f5a6e84eebb (tracing: allow tracing of suspend/resume & hibernation code again). So, unfortunately since things are not yet robust enough to allow tracing of low-level suspend/resume functions, suspend/resume is still broken when ftrace is enabled. So fix this by disabling function tracing during suspend/resume & hibernation. Signed-off-by: Srivatsa S. Bhat Cc: stable@vger.kernel.org Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 6 ++++++ kernel/power/suspend.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 21ad3fe3164f..0d4b354bc1be 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -358,6 +358,7 @@ int hibernation_snapshot(int platform_mode) } suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -383,6 +384,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); + ftrace_start(); resume_console(); dpm_complete(msg); @@ -485,6 +487,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -492,6 +495,7 @@ int hibernation_restore(int platform_mode) dpm_resume_end(PMSG_RECOVER); } pm_restore_gfp_mask(); + ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -518,6 +522,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -561,6 +566,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + ftrace_start(); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 396d262b8fd0..c8b7446b27df 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "power.h" @@ -212,6 +213,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_stop(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -231,6 +233,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + ftrace_start(); resume_console(); Close: if (suspend_ops->end) -- cgit v1.2.3-59-g8ed1b From 4b7760ba0dd3319f66886ab2335a0fbecdbc808a Mon Sep 17 00:00:00 2001 From: Sameer Nanda Date: Tue, 19 Jun 2012 22:23:33 +0200 Subject: PM / Sleep: add knob for printing device resume times Added a new knob called /sys/power/pm_print_times. Setting it to 1 enables printing of time taken by devices to suspend and resume. Setting it to 0 disables this printing (unless overridden by initcall_debug kernel command line option). Signed-off-by: Sameer Nanda Acked-by: Greg KH Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 13 +++++++++++++ drivers/base/power/main.c | 4 ++-- drivers/base/power/power.h | 11 +++++++++++ kernel/power/main.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 31725ffeeb3a..217772615d02 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -231,3 +231,16 @@ Description: Reads from this file return a string consisting of the names of wakeup sources created with the help of /sys/power/wake_lock that are inactive at the moment, separated with spaces. + +What: /sys/power/pm_print_times +Date: May 2012 +Contact: Sameer Nanda +Description: + The /sys/power/pm_print_times file allows user space to + control whether the time taken by devices to suspend and + resume is printed. 
These prints are useful for hunting down + devices that take too long to suspend or resume. + + Writing a "1" enables this printing while writing a "0" + disables it. The default value is "0". Reading from this file + will display the current value. diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9cb845e49334..6e4db96958d1 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -166,7 +166,7 @@ static ktime_t initcall_debug_start(struct device *dev) { ktime_t calltime = ktime_set(0, 0); - if (initcall_debug) { + if (pm_print_times) { pr_info("calling %s+ @ %i, parent: %s\n", dev_name(dev), task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); @@ -181,7 +181,7 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime, { ktime_t delta, rettime; - if (initcall_debug) { + if (pm_print_times) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); pr_info("call %s+ returned %d after %Ld usecs\n", dev_name(dev), diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index eeb4bff9505c..12c77b7ff8e8 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -85,3 +85,14 @@ static inline int pm_qos_sysfs_add(struct device *dev) { return 0; } static inline void pm_qos_sysfs_remove(struct device *dev) {} #endif + +#ifdef CONFIG_PM_DEBUG + +extern int pm_print_times_enabled; +#define pm_print_times (initcall_debug || pm_print_times_enabled) + +#else /* CONFIG_PM_DEBUG */ + +#define pm_print_times initcall_debug + +#endif /* CONFIG_PM_DEBUG */ diff --git a/kernel/power/main.c b/kernel/power/main.c index 428f8a034e96..7beb3fb3670b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -132,6 +132,38 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, } power_attr(pm_test); + +/* + * pm_print_times: print time taken by devices to suspend and resume. + * + * show() returns whether printing of suspend and resume times is enabled. + * store() accepts 0 or 1. 0 disables printing and 1 enables it. + */ +int pm_print_times_enabled; + +static ssize_t pm_print_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_print_times_enabled); +} + +static ssize_t pm_print_times_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_print_times_enabled = val; + return n; +} + +power_attr(pm_print_times); #endif /* CONFIG_PM_DEBUG */ #ifdef CONFIG_DEBUG_FS @@ -530,6 +562,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, + &pm_print_times_attr.attr, #endif #endif NULL, -- cgit v1.2.3-59-g8ed1b From b2df1d4f8b95d9d1e3f064cef02fc5c5116b05cf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Jun 2012 00:19:33 +0200 Subject: PM / Sleep: Separate printing suspend times from initcall_debug Change the behavior of the newly introduced /sys/power/pm_print_times attribute so that its initial value depends on initcall_debug, but setting it to 0 will cause device suspend/resume times not to be printed, even if initcall_debug has been set. This way, the people who use initcall_debug for reasons other than PM debugging will be able to switch the suspend/resume times printing off, if need be. Signed-off-by: Rafael J. Wysocki Reviewed-by: Srivatsa S. 
Bhat Acked-by: Greg Kroah-Hartman --- drivers/base/power/main.c | 4 +-- drivers/base/power/power.h | 11 ------- include/linux/suspend.h | 6 ++++ kernel/power/Kconfig | 4 +-- kernel/power/main.c | 76 +++++++++++++++++++++++++++------------------- 5 files changed, 54 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 6e4db96958d1..df5f41d2ec95 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -166,7 +166,7 @@ static ktime_t initcall_debug_start(struct device *dev) { ktime_t calltime = ktime_set(0, 0); - if (pm_print_times) { + if (pm_print_times_enabled) { pr_info("calling %s+ @ %i, parent: %s\n", dev_name(dev), task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); @@ -181,7 +181,7 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime, { ktime_t delta, rettime; - if (pm_print_times) { + if (pm_print_times_enabled) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); pr_info("call %s+ returned %d after %Ld usecs\n", dev_name(dev), diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 12c77b7ff8e8..eeb4bff9505c 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -85,14 +85,3 @@ static inline int pm_qos_sysfs_add(struct device *dev) { return 0; } static inline void pm_qos_sysfs_remove(struct device *dev) {} #endif - -#ifdef CONFIG_PM_DEBUG - -extern int pm_print_times_enabled; -#define pm_print_times (initcall_debug || pm_print_times_enabled) - -#else /* CONFIG_PM_DEBUG */ - -#define pm_print_times initcall_debug - -#endif /* CONFIG_PM_DEBUG */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index cd83059fb592..0c808d7fa579 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -408,6 +408,12 @@ static inline void unlock_system_sleep(void) {} #endif /* !CONFIG_PM_SLEEP */ +#ifdef CONFIG_PM_SLEEP_DEBUG +extern bool pm_print_times_enabled; +#else +#define pm_print_times_enabled (false) +#endif + #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 8f9b4eb974e0..a70518c9d82f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -175,7 +175,7 @@ config PM_TEST_SUSPEND You probably want to have your system's RTC driver statically linked, ensuring that it's available when this test runs. -config CAN_PM_TRACE +config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP @@ -196,7 +196,7 @@ config PM_TRACE config PM_TRACE_RTC bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE + depends on PM_SLEEP_DEBUG depends on X86 select PM_TRACE ---help--- diff --git a/kernel/power/main.c b/kernel/power/main.c index 7beb3fb3670b..f458238109cc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -132,38 +132,6 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, } power_attr(pm_test); - -/* - * pm_print_times: print time taken by devices to suspend and resume. - * - * show() returns whether printing of suspend and resume times is enabled. - * store() accepts 0 or 1. 0 disables printing and 1 enables it. 
- */ -int pm_print_times_enabled; - -static ssize_t pm_print_times_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", pm_print_times_enabled); -} - -static ssize_t pm_print_times_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - unsigned long val; - - if (kstrtoul(buf, 10, &val)) - return -EINVAL; - - if (val > 1) - return -EINVAL; - - pm_print_times_enabled = val; - return n; -} - -power_attr(pm_print_times); #endif /* CONFIG_PM_DEBUG */ #ifdef CONFIG_DEBUG_FS @@ -267,6 +235,47 @@ late_initcall(pm_debugfs_init); #endif /* CONFIG_PM_SLEEP */ +#ifdef CONFIG_PM_SLEEP_DEBUG +/* + * pm_print_times: print time taken by devices to suspend and resume. + * + * show() returns whether printing of suspend and resume times is enabled. + * store() accepts 0 or 1. 0 disables printing and 1 enables it. + */ +bool pm_print_times_enabled; + +static ssize_t pm_print_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_print_times_enabled); +} + +static ssize_t pm_print_times_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_print_times_enabled = !!val; + return n; +} + +power_attr(pm_print_times); + +static inline void pm_print_times_init(void) +{ + pm_print_times_enabled = !!initcall_debug; +} +#else /* !CONFIG_PP_SLEEP_DEBUG */ +static inline void pm_print_times_init(void) {} +#endif /* CONFIG_PM_SLEEP_DEBUG */ + struct kobject *power_kobj; /** @@ -562,6 +571,8 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, +#endif +#ifdef CONFIG_PM_SLEEP_DEBUG &pm_print_times_attr.attr, #endif #endif @@ -599,6 +610,7 @@ static int __init pm_init(void) error = sysfs_create_group(power_kobj, &attr_group); if (error) return error; + pm_print_times_init(); return pm_autosleep_init(); } -- cgit v1.2.3-59-g8ed1b From d8150d350408de6fb2b9ee7b7625ae8e2bb7aa4a Mon Sep 17 00:00:00 2001 From: Bojan Smojver Date: Thu, 21 Jun 2012 22:27:24 +0200 Subject: PM / Hibernate: Print hibernation/thaw progress indicator one line at a time. With the introduction of suspend to both into in-kernel hibernation code, dmesg was getting polluted with backspace characters printed as part of image saving progress indicator. This patch introduces printing of progress indicator on image save/load every 10% and one line at a time. As an additional benefit, all other messages emitted by the kernel during hibernation/thaw should now print cleanly as well. Signed-off-by: Bojan Smojver Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 54 ++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 83d505142b00..3c9d764eb0d8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle, struct timeval start; struct timeval stop; - printk(KERN_INFO "PM: Saving image data pages (%u pages) ... 
", + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; @@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); @@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) - printk(KERN_CONT "\b\b\b\bdone\n"); - else - printk(KERN_CONT "\n"); + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); return ret; } @@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle, printk(KERN_INFO "PM: Using %u thread(s) for compression.\n" - "PM: Compressing and saving image data (%u pages) ... ", + "PM: Compressing and saving image data (%u pages)...\n", nr_threads, nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; @@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle, data_of(*snapshot), PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", - nr_pages / m); + printk(KERN_INFO + "PM: Image saving progress: " + "%3d%%\n", + nr_pages / m * 10); nr_pages++; } if (!off) @@ -761,11 +762,8 @@ out_finish: do_gettimeofday(&stop); if (!ret) ret = err2; - if (!ret) { - printk(KERN_CONT "\b\b\b\bdone\n"); - } else { - printk(KERN_CONT "\n"); - } + if (!ret) + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); out_clean: if (crc) { @@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle, int err2; unsigned nr_pages; - printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; @@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); @@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) { - printk("\b\b\b\bdone\n"); + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; - } else - printk("\n"); + } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return ret; } @@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle, printk(KERN_INFO "PM: Using %u thread(s) for decompression.\n" - "PM: Loading and decompressing image data (%u pages) ... 
", + "PM: Loading and decompressing image data (%u pages)...\n", nr_threads, nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; @@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].unc + off, PAGE_SIZE); if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO + "PM: Image loading progress: " + "%3d%%\n", + nr_pages / m * 10); nr_pages++; ret = snapshot_write_next(snapshot); @@ -1344,7 +1345,7 @@ out_finish: } do_gettimeofday(&stop); if (!ret) { - printk("\b\b\b\bdone\n"); + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; @@ -1357,8 +1358,7 @@ out_finish: } } } - } else - printk("\n"); + } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); out_clean: for (i = 0; i < ring_size; i++) -- cgit v1.2.3-59-g8ed1b From cba6d0d64ee53772b285d0c0c288deefbeaf7775 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jul 2012 07:08:42 -0700 Subject: Revert "rcu: Move PREEMPT_RCU preemption to switch_to() invocation" This reverts commit 616c310e83b872024271c915c1b9ab505b9efad9. (Move PREEMPT_RCU preemption to switch_to() invocation). Testing by Sasha Levin showed that this can result in deadlock due to invoking the scheduler when one of the runqueue locks is held. Because this commit was simply a performance optimization, revert it. Reported-by: Sasha Levin Signed-off-by: Paul E. McKenney Tested-by: Sasha Levin --- arch/um/drivers/mconsole_kern.c | 1 - include/linux/rcupdate.h | 1 - include/linux/rcutiny.h | 6 ++++++ include/linux/sched.h | 10 ---------- kernel/rcutree.c | 1 + kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 14 +++++++++++--- kernel/sched/core.c | 1 - 8 files changed, 19 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 88e466b159dc..43b39d61b538 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -705,7 +705,6 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; - rcu_switch_from(from); switch_to(from, to, from); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 26d1a47591f1..9cac722b169c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,7 +184,6 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); -extern void rcu_preempt_note_context_switch(void); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 854dc4c5c271..4e56a9c69a35 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,6 +87,10 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU +static inline void rcu_preempt_note_context_switch(void) +{ +} + static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; @@ -95,6 +99,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) #else /* #ifdef CONFIG_TINY_RCU */ +void rcu_preempt_note_context_switch(void); int rcu_preempt_needs_cpu(void); static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) @@ -108,6 +113,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) static inline void 
rcu_note_context_switch(int cpu) { rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(); } /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4059c0f33f07..06a4c5f4f55c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1871,22 +1871,12 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_node_entry); } -static inline void rcu_switch_from(struct task_struct *prev) -{ - if (prev->rcu_read_lock_nesting != 0) - rcu_preempt_note_context_switch(); -} - #else static inline void rcu_copy_process(struct task_struct *p) { } -static inline void rcu_switch_from(struct task_struct *prev) -{ -} - #endif #ifdef CONFIG_SMP diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 38ecdda3f55f..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea056495783e..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5271a020887e..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = __this_cpu_ptr(rcu_preempt_state.rda); + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -1001,6 +1001,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +/* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + /* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..eaead2df6aa8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ - rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); -- cgit v1.2.3-59-g8ed1b From f885b7f2b2de70be266d2cecc476f773a1e2ca5d Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 23 Apr 2012 15:52:53 -0700 Subject: rcu: Control RCU_FANOUT_LEAF from boot-time parameter Although making RCU_FANOUT_LEAF a kernel configuration parameter rather than a fixed constant makes it easier for people to decrease cache-miss overhead for large systems, it is of little help for people who must run a single pre-built kernel binary. This commit therefore allows the value of RCU_FANOUT_LEAF to be increased (but not decreased!) via a boot-time parameter named rcutree.rcu_fanout_leaf. Reported-by: Mike Galbraith Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 5 ++ kernel/rcutree.c | 97 +++++++++++++++++++++++++++++++------ kernel/rcutree.h | 23 +++++---- kernel/rcutree_plugin.h | 2 + kernel/rcutree_trace.c | 2 +- 5 files changed, 104 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a92c5ebf373e..12783fa833c3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2367,6 +2367,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Set maximum number of finished RCU callbacks to process in one batch. + rcutree.fanout_leaf= [KNL,BOOT] + Increase the number of CPUs assigned to each + leaf rcu_node structure. Useful for very large + systems. + rcutree.qhimark= [KNL,BOOT] Set threshold of queued RCU callbacks over which batch limiting is disabled. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..a4c592b66e10 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -60,17 +60,10 @@ /* Data structures. */ -static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; +static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; #define RCU_STATE_INITIALIZER(structname) { \ .level = { &structname##_state.node[0] }, \ - .levelcnt = { \ - NUM_RCU_LVL_0, /* root of hierarchy. */ \ - NUM_RCU_LVL_1, \ - NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, \ - NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ - }, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ @@ -91,6 +84,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +module_param(rcu_fanout_leaf, int, 0); +int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; +static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ + NUM_RCU_LVL_0, + NUM_RCU_LVL_1, + NUM_RCU_LVL_2, + NUM_RCU_LVL_3, + NUM_RCU_LVL_4, +}; +int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ + /* * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. 
So when this variable is zero, RCU @@ -2574,9 +2580,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i > 0; i--) + for (i = rcu_num_lvls - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; + rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -2586,7 +2592,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int i; cprv = NR_CPUS; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; @@ -2613,13 +2619,15 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ - for (i = 1; i < NUM_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) + rsp->levelcnt[i] = num_rcu_lvl[i]; + for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); /* Initialize the elements themselves, starting from the leaves. */ - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { @@ -2649,7 +2657,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - rnp = rsp->level[NUM_RCU_LVLS - 1]; + rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; @@ -2658,11 +2666,72 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } +/* + * Compute the rcu_node tree geometry from kernel parameters. This cannot + * replace the definitions in rcutree.h because those are needed to size + * the ->node array in the rcu_state structure. + */ +static void __init rcu_init_geometry(void) +{ + int i; + int j; + int n = NR_CPUS; + int rcu_capacity[MAX_RCU_LVLS + 1]; + + /* If the compile-time values are accurate, just leave. */ + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + return; + + /* + * Compute number of nodes that can be handled an rcu_node tree + * with the given number of levels. Setting rcu_capacity[0] makes + * some of the arithmetic easier. + */ + rcu_capacity[0] = 1; + rcu_capacity[1] = rcu_fanout_leaf; + for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + + /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Finally, the tree must be able to accommodate + * the configured number of CPUs. Complain and fall back to the + * compile-time values if these limits are exceeded. + */ + if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8 || + n > rcu_capacity[MAX_RCU_LVLS]) { + WARN_ON(1); + return; + } + + /* Calculate the number of rcu_nodes at each level of the tree. */ + for (i = 1; i <= MAX_RCU_LVLS; i++) + if (n <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) + num_rcu_lvl[j] = + DIV_ROUND_UP(n, rcu_capacity[i - j]); + rcu_num_lvls = i; + for (j = i + 1; j <= MAX_RCU_LVLS; j++) + num_rcu_lvl[j] = 0; + break; + } + + /* Calculate the total number of rcu_node structures. 
*/ + rcu_num_nodes = 0; + for (i = 0; i <= MAX_RCU_LVLS; i++) + rcu_num_nodes += num_rcu_lvl[i]; + rcu_num_nodes -= n; +} + void __init rcu_init(void) { int cpu; rcu_bootup_announce(); + rcu_init_geometry(); rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 19b61ac1079f..780a0195d35a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -42,28 +42,28 @@ #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT_1 -# define NUM_RCU_LVLS 1 +# define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_2 -# define NUM_RCU_LVLS 2 +# define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_3 -# define NUM_RCU_LVLS 3 +# define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_4 -# define NUM_RCU_LVLS 4 +# define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) @@ -76,6 +76,9 @@ #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) +extern int rcu_num_lvls; +extern int rcu_num_nodes; + /* * Dynticks per-CPU state. */ @@ -206,7 +209,7 @@ struct rcu_node { */ #define rcu_for_each_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* * Do a breadth-first scan of the non-leaf rcu_node structures for the @@ -215,7 +218,7 @@ struct rcu_node { */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -224,8 +227,8 @@ struct rcu_node { * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ @@ -357,9 +360,9 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. 
*/ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..fb92b6dd9980 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -70,6 +70,8 @@ static void __init rcu_bootup_announce_oddness(void) #if NUM_RCU_LVL_4 != 0 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); #endif + if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) + printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); } #ifdef CONFIG_TREE_PREEMPT_RCU diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16ddd1d4..a3556a2d842c 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -278,7 +278,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); level = rnp->level; -- cgit v1.2.3-59-g8ed1b From cc5df65b0370fc6aa2bfe3bb19e0451d5cafb99f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Jun 2012 18:16:00 -0700 Subject: rcu: Four-level hierarchy is no longer experimental Time to make the four-level-hierarchy setting less scary, so this commit removes "Experimental" from the boot-time message. Leave the message in order to get a heads-up on any possible need to expand to a five-level hierarchy. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index fb92b6dd9980..70e0fd256cc6 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,7 +68,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); + printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); -- cgit v1.2.3-59-g8ed1b From cca6f3931920a7547d02e68adc2ca635bea5600c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2012 21:00:28 -0700 Subject: rcu: Size rcu_node tree from nr_cpu_ids rather than NR_CPUS The rcu_node tree array is sized based on compile-time constants, including NR_CPUS. Although this approach has worked well in the past, the recent trend by many distros to define NR_CPUS=4096 results in excessive grace-period-initialization latencies. This commit therefore substitutes the run-time computed nr_cpu_ids for the compile-time NR_CPUS when building the tree. This can result in much of the compile-time-allocated rcu_node array being unused. If this is a major problem, you are in a specialized situation anyway, so you can manually adjust the NR_CPUS, RCU_FANOUT, and RCU_FANOUT_LEAF kernel config parameters. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree_plugin.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a4c592b66e10..0fdbc5e07302 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2675,7 +2675,7 @@ static void __init rcu_init_geometry(void) { int i; int j; - int n = NR_CPUS; + int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; /* If the compile-time values are accurate, just leave. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 70e0fd256cc6..ef2b5231afa4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -72,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + if (nr_cpu_ids != NR_CPUS) + printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); } #ifdef CONFIG_TREE_PREEMPT_RCU -- cgit v1.2.3-59-g8ed1b From 6c90cc7bf077f28144013e949ee0c122012d194a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 19:41:41 -0700 Subject: rcu: Prevent excessive line length in RCU_STATE_INITIALIZER() Upcoming rcu_barrier() concurrency commits will result in line lengths greater than 80 characters in the RCU_STATE_INITIALIZER(), so this commit shortens the name of the macro's argument to prevent this. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0fdbc5e07302..dd7fd96c90c5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -62,18 +62,18 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname##_state.node[0] }, \ +#define RCU_STATE_INITIALIZER(sname) { \ + .level = { &sname##_state.node[0] }, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ - .orphan_nxttail = &structname##_state.orphan_nxtlist, \ - .orphan_donetail = &structname##_state.orphan_donelist, \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .orphan_nxttail = &sname##_state.orphan_nxtlist, \ + .orphan_donetail = &sname##_state.orphan_donelist, \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ - .name = #structname, \ + .name = #sname, \ } struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); -- cgit v1.2.3-59-g8ed1b From 037b64ed0bf2405a1a01542164d3418564b44fff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 23:26:01 -0700 Subject: rcu: Place pointer to call_rcu() in rcu_data structure This is a preparatory commit for increasing rcu_barrier()'s concurrency. It adds a pointer in the rcu_data structure to the corresponding call_rcu() function. This allows a pointer to the rcu_data structure to imply the function pointer, which allows _rcu_barrier() state to be placed in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 27 ++++++++++++--------------- kernel/rcutree.h | 2 ++ kernel/rcutree_plugin.h | 5 +++-- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd7fd96c90c5..00c518fa34bb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -62,8 +62,9 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname) { \ +#define RCU_STATE_INITIALIZER(sname, cr) { \ .level = { &sname##_state.node[0] }, \ + .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ @@ -76,10 +77,11 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .name = #sname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); +struct rcu_state rcu_sched_state = + RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -2282,21 +2284,17 @@ static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); + struct rcu_state *rsp = type; atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); + rsp->call(head, rcu_barrier_callback); } /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(struct rcu_state *rsp, - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) +static void _rcu_barrier(struct rcu_state *rsp) { int cpu; unsigned long flags; @@ -2348,8 +2346,7 @@ static void _rcu_barrier(struct rcu_state *rsp, while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { - smp_call_function_single(cpu, rcu_barrier_func, - (void *)call_rcu_func, 1); + smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { preempt_enable(); @@ -2370,7 +2367,7 @@ static void _rcu_barrier(struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rsp->onofflock, flags); atomic_inc(&rcu_barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - call_rcu_func(&rh, rcu_barrier_callback); + rsp->call(&rh, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each @@ -2393,7 +2390,7 @@ static void _rcu_barrier(struct rcu_state *rsp, */ void rcu_barrier_bh(void) { - _rcu_barrier(&rcu_bh_state, call_rcu_bh); + _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -2402,7 +2399,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(&rcu_sched_state, call_rcu_sched); + _rcu_barrier(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 780a0195d35a..049896a835d9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -364,6 +364,8 @@ struct rcu_state { u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ + void (*call)(struct rcu_head *head, /* call_rcu() flavor. 
*/ + void (*func)(struct rcu_head *head)); /* The following fields are guarded by the root rcu_node's lock. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ef2b5231afa4..9cb3a68819fa 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -78,7 +78,8 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); +struct rcu_state rcu_preempt_state = + RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -944,7 +945,7 @@ static int rcu_preempt_cpu_has_callbacks(int cpu) */ void rcu_barrier(void) { - _rcu_barrier(&rcu_preempt_state, call_rcu); + _rcu_barrier(&rcu_preempt_state); } EXPORT_SYMBOL_GPL(rcu_barrier); -- cgit v1.2.3-59-g8ed1b From 06668efa9180f4824fe846a8ff96338c18646bc7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 23:57:46 -0700 Subject: rcu: Move _rcu_barrier()'s rcu_head structures to rcu_data structures In order for multiple flavors of RCU to each concurrently run one rcu_barrier(), each flavor needs its own per-CPU set of rcu_head structures. This commit therefore moves _rcu_barrier()'s set of per-CPU rcu_head structures from per-CPU variables to the existing per-CPU and per-RCU-flavor rcu_data structures. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 6 ++---- kernel/rcutree.h | 3 +++ 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 00c518fa34bb..1e552598b55d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -157,7 +157,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; @@ -2282,12 +2281,11 @@ static void rcu_barrier_callback(struct rcu_head *notused) */ static void rcu_barrier_func(void *type) { - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); struct rcu_state *rsp = type; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); atomic_inc(&rcu_barrier_cpu_count); - rsp->call(head, rcu_barrier_callback); + rsp->call(&rdp->barrier_head, rcu_barrier_callback); } /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 049896a835d9..586d93c978f2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -314,6 +314,9 @@ struct rcu_data { unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; + /* 6) _rcu_barrier() callback. */ + struct rcu_head barrier_head; + int cpu; struct rcu_state *rsp; }; -- cgit v1.2.3-59-g8ed1b From 24ebbca8ecdd5129d7f829a7cb5146aaeb531f77 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 00:34:56 -0700 Subject: rcu: Move rcu_barrier_cpu_count to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_cpu_count global variable to a new ->barrier_cpu_count field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 25 ++++++++++++++----------- kernel/rcutree.h | 1 + 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1e552598b55d..5929b021666d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -157,7 +157,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ -static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; @@ -2270,9 +2269,12 @@ static int rcu_cpu_has_callbacks(int cpu) * RCU callback function for _rcu_barrier(). If we are last, wake * up the task executing _rcu_barrier(). */ -static void rcu_barrier_callback(struct rcu_head *notused) +static void rcu_barrier_callback(struct rcu_head *rhp) { - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); + struct rcu_state *rsp = rdp->rsp; + + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rcu_barrier_completion); } @@ -2284,7 +2286,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -2297,9 +2299,9 @@ static void _rcu_barrier(struct rcu_state *rsp) int cpu; unsigned long flags; struct rcu_data *rdp; - struct rcu_head rh; + struct rcu_data rd; - init_rcu_head_on_stack(&rh); + init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rcu_barrier_mutex); @@ -2324,7 +2326,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * us -- but before CPU 1's orphaned callbacks are invoked!!! */ init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 1); + atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); @@ -2363,15 +2365,16 @@ static void _rcu_barrier(struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp); rsp->rcu_barrier_in_progress = NULL; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - rsp->call(&rh, rcu_barrier_callback); + rd.rsp = rsp; + rsp->call(&rd.barrier_head, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rcu_barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ @@ -2380,7 +2383,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_barrier_mutex); - destroy_rcu_head_on_stack(&rh); + destroy_rcu_head_on_stack(&rd.barrier_head); } /** diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 586d93c978f2..c57ef0b7f097 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -400,6 +400,7 @@ struct rcu_state { struct task_struct *rcu_barrier_in_progress; /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ + atomic_t barrier_cpu_count; /* # CPUs waiting on. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. 
*/ unsigned long jiffies_force_qs; /* Time at which to invoke */ -- cgit v1.2.3-59-g8ed1b From 7db74df88b52844f4e966901e2972bba725e6766 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 03:03:37 -0700 Subject: rcu: Move rcu_barrier_completion to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_completion global variable to a new ->barrier_completion field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 9 ++++----- kernel/rcutree.h | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5929b021666d..ca7d1678ac79 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -158,7 +158,6 @@ unsigned long rcutorture_vernum; /* State information for rcu_barrier() and friends. */ static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s @@ -2275,7 +2274,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rcu_barrier_completion); + complete(&rsp->barrier_completion); } /* @@ -2325,7 +2324,7 @@ static void _rcu_barrier(struct rcu_state *rsp) * 6. Both rcu_barrier_callback() callbacks are invoked, awakening * us -- but before CPU 1's orphaned callbacks are invoked!!! */ - init_completion(&rcu_barrier_completion); + init_completion(&rsp->barrier_completion); atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; @@ -2375,10 +2374,10 @@ static void _rcu_barrier(struct rcu_state *rsp) * CPU, and thus each counted, remove the initial count. */ if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rcu_barrier_completion); + complete(&rsp->barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ - wait_for_completion(&rcu_barrier_completion); + wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_barrier_mutex); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c57ef0b7f097..d1ca4424122b 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -401,6 +401,7 @@ struct rcu_state { /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ + struct completion barrier_completion; /* Wake at barrier end. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ -- cgit v1.2.3-59-g8ed1b From 7be7f0be907224445acc62b3884c892f38b7ff40 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 05:18:53 -0700 Subject: rcu: Move rcu_barrier_mutex to rcu_state structure In order to allow each RCU flavor to concurrently execute its rcu_barrier() function, it is necessary to move the relevant state to the rcu_state structure. This commit therefore moves the rcu_barrier_mutex global variable to a new ->barrier_mutex field in the rcu_state structure. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 9 +++------ kernel/rcutree.h | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ca7d1678ac79..ff992ac24e73 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -71,6 +71,7 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ + .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -155,10 +156,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; -/* State information for rcu_barrier() and friends. */ - -static DEFINE_MUTEX(rcu_barrier_mutex); - /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -2303,7 +2300,7 @@ static void _rcu_barrier(struct rcu_state *rsp) init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rcu_barrier_mutex); + mutex_lock(&rsp->barrier_mutex); smp_mb(); /* Prevent any prior operations from leaking in. */ @@ -2380,7 +2377,7 @@ static void _rcu_barrier(struct rcu_state *rsp) wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rcu_barrier_mutex); + mutex_unlock(&rsp->barrier_mutex); destroy_rcu_head_on_stack(&rd.barrier_head); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d1ca4424122b..7641aec3e59c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -400,6 +400,7 @@ struct rcu_state { struct task_struct *rcu_barrier_in_progress; /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ raw_spinlock_t fqslock; /* Only one task forcing */ -- cgit v1.2.3-59-g8ed1b From cfed0a85dad921c683e9c0d25b072bcc5745ede0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Jun 2012 18:22:05 -0700 Subject: rcu: Remove needless initialization For global variables, C defaults all fields to zero. The initialization of the rcu_state structure's ->n_force_qs and ->n_force_qs_ngp fields is therefore redundant, so this commit removes these initializations. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ff992ac24e73..44a8fda9be86 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -73,8 +73,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ - .n_force_qs = 0, \ - .n_force_qs_ngp = 0, \ .name = #sname, \ } -- cgit v1.2.3-59-g8ed1b From cf3a9c4842b1e097dbe0854933c471d43dd24f69 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 29 May 2012 14:56:46 -0700 Subject: rcu: Increase rcu_barrier() concurrency The traditional rcu_barrier() implementation has serialized all requests, regardless of RCU flavor, and also does not coalesce concurrent requests. In the past, this has been good and sufficient. However, systems are getting larger and use of rcu_barrier() has been increasing. This commit therefore introduces a counter-based scheme that allows _rcu_barrier() calls for the same flavor of RCU to take advantage of each others' work. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 36 +++++++++++++++++++++++++++++++++++- kernel/rcutree.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 44a8fda9be86..6bb5d562253f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2294,13 +2294,41 @@ static void _rcu_barrier(struct rcu_state *rsp) unsigned long flags; struct rcu_data *rdp; struct rcu_data rd; + unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); + unsigned long snap_done; init_rcu_head_on_stack(&rd.barrier_head); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); - smp_mb(); /* Prevent any prior operations from leaking in. */ + /* + * Ensure that all prior references, including to ->n_barrier_done, + * are ordered before the _rcu_barrier() machinery. + */ + smp_mb(); /* See above block comment. */ + + /* + * Recheck ->n_barrier_done to see if others did our work for us. + * This means checking ->n_barrier_done for an even-to-odd-to-even + * transition. The "if" expression below therefore rounds the old + * value up to the next even number and adds two before comparing. + */ + snap_done = ACCESS_ONCE(rsp->n_barrier_done); + if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + smp_mb(); /* caller's subsequent code after above check. */ + mutex_unlock(&rsp->barrier_mutex); + return; + } + + /* + * Increment ->n_barrier_done to avoid duplicate work. Use + * ACCESS_ONCE() to prevent the compiler from speculating + * the increment to precede the early-exit check. + */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* * Initialize the count to one rather than to zero in order to @@ -2371,6 +2399,12 @@ static void _rcu_barrier(struct rcu_state *rsp) if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rsp->barrier_completion); + /* Increment ->n_barrier_done to prevent duplicate work. */ + smp_mb(); /* Keep increment after above mechanism. */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + smp_mb(); /* Keep increment before caller's subsequent code. */ + /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rsp->barrier_completion); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7641aec3e59c..be10286ad380 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -403,6 +403,8 @@ struct rcu_state { struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ + unsigned long n_barrier_done; /* ++ at start and end of */ + /* _rcu_barrier(). */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. 
*/ unsigned long jiffies_force_qs; /* Time at which to invoke */ -- cgit v1.2.3-59-g8ed1b From a83eff0a82a7f3f14fea477fd41e6c082e7fc96a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 May 2012 18:47:05 -0700 Subject: rcu: Add tracing for _rcu_barrier() This commit adds event tracing for _rcu_barrier() execution. This is defined only if RCU_TRACE=y. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d274734b2aa4..5bde94d8585b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -541,6 +541,50 @@ TRACE_EVENT(rcu_torture_read, __entry->rcutorturename, __entry->rhp) ); +/* + * Tracepoint for _rcu_barrier() execution. The string "s" describes + * the _rcu_barrier phase: + * "Begin": rcu_barrier_callback() started. + * "Check": rcu_barrier_callback() checking for piggybacking. + * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. + * "Inc1": rcu_barrier_callback() piggyback check counter incremented. + * "Offline": rcu_barrier_callback() found offline CPU + * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. + * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. + * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. + * "CB": An rcu_barrier_callback() invoked a callback, not the last. + * "LastCB": An rcu_barrier_callback() invoked the last callback. + * "Inc2": rcu_barrier_callback() piggyback check counter incremented. + * The "cpu" argument is the CPU or -1 if meaningless, the "cnt" argument + * is the count of remaining callbacks, and "done" is the piggybacking count. + */ +TRACE_EVENT(rcu_barrier, + + TP_PROTO(char *rcuname, char *s, int cpu, int cnt, unsigned long done), + + TP_ARGS(rcuname, s, cpu, cnt, done), + + TP_STRUCT__entry( + __field(char *, rcuname) + __field(char *, s) + __field(int, cpu) + __field(int, cnt) + __field(unsigned long, done) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->s = s; + __entry->cpu = cpu; + __entry->cnt = cnt; + __entry->done = done; + ), + + TP_printk("%s %s cpu %d remaining %d # %lu", + __entry->rcuname, __entry->s, __entry->cpu, __entry->cnt, + __entry->done) +); + #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) @@ -564,6 +608,7 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \ do { } while (0) #define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#define trace_rcu_barrier(name, s, cpu, cnt, done) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6bb5d562253f..dda43d826504 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2259,6 +2259,17 @@ static int rcu_cpu_has_callbacks(int cpu) rcu_preempt_cpu_has_callbacks(cpu); } +/* + * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * the compiler is expected to optimize this away. + */ +static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, + int cpu, unsigned long done) +{ + trace_rcu_barrier(rsp->name, s, cpu, + atomic_read(&rsp->barrier_cpu_count), done); +} + /* * RCU callback function for _rcu_barrier(). 
If we are last, wake * up the task executing _rcu_barrier(). @@ -2268,8 +2279,12 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); struct rcu_state *rsp = rdp->rsp; - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); complete(&rsp->barrier_completion); + } else { + _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + } } /* @@ -2280,6 +2295,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -2298,6 +2314,7 @@ static void _rcu_barrier(struct rcu_state *rsp) unsigned long snap_done; init_rcu_head_on_stack(&rd.barrier_head); + _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); @@ -2315,7 +2332,9 @@ static void _rcu_barrier(struct rcu_state *rsp) * value up to the next even number and adds two before comparing. */ snap_done = ACCESS_ONCE(rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "Check", -1, snap_done); if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; @@ -2328,6 +2347,7 @@ static void _rcu_barrier(struct rcu_state *rsp) */ ACCESS_ONCE(rsp->n_barrier_done)++; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* @@ -2364,13 +2384,19 @@ static void _rcu_barrier(struct rcu_state *rsp) preempt_disable(); rdp = per_cpu_ptr(rsp->rda, cpu); if (cpu_is_offline(cpu)) { + _rcu_barrier_trace(rsp, "Offline", cpu, + rsp->n_barrier_done); preempt_enable(); while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { + _rcu_barrier_trace(rsp, "OnlineQ", cpu, + rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { + _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + rsp->n_barrier_done); preempt_enable(); } } @@ -2403,6 +2429,7 @@ static void _rcu_barrier(struct rcu_state *rsp) smp_mb(); /* Keep increment after above mechanism. */ ACCESS_ONCE(rsp->n_barrier_done)++; WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); smp_mb(); /* Keep increment before caller's subsequent code. */ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ -- cgit v1.2.3-59-g8ed1b From d7e187c8e9f30543f9cadfed094896ff414acb8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 May 2012 11:49:15 -0700 Subject: rcu: Add rcu_barrier() statistics to debugfs tracing This commit adds an rcubarrier file to RCU's debugfs statistical tracing directory, providing diagnostic information on rcu_barrier(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_trace.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index a3556a2d842c..057408be6c3b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,40 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +static void print_rcubarrier(struct seq_file *m, struct rcu_state *rsp) +{ + seq_printf(m, "%c bcc: %d nbd: %lu\n", + rsp->rcu_barrier_in_progress ? 'B' : '.', + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); +} + +static int show_rcubarrier(struct seq_file *m, void *unused) +{ +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt: "); + print_rcubarrier(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched: "); + print_rcubarrier(m, &rcu_sched_state); + seq_puts(m, "rcu_bh: "); + print_rcubarrier(m, &rcu_bh_state); + return 0; +} + +static int rcubarrier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcubarrier, NULL); +} + +static const struct file_operations rcubarrier_fops = { + .owner = THIS_MODULE, + .open = rcubarrier_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + #ifdef CONFIG_RCU_BOOST static char convert_kthread_status(unsigned int kthread_status) @@ -453,6 +487,11 @@ static int __init rcutree_trace_init(void) if (!rcudir) goto free_out; + retval = debugfs_create_file("rcubarrier", 0444, rcudir, + NULL, &rcubarrier_fops); + if (!retval) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, NULL, &rcudata_fops); if (!retval) -- cgit v1.2.3-59-g8ed1b From 1bca8cf1a2c3c6683b12ad28a3e826ca7a834978 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 09:40:38 -0700 Subject: rcu: Remove unneeded __rcu_process_callbacks() argument With the advent of __this_cpu_ptr(), it is no longer necessary to pass both the rcu_state and rcu_data structures into __rcu_process_callbacks(). This commit therefore computes the rcu_data pointer from the rcu_state pointer within __rcu_process_callbacks() so that callers can pass in only the pointer to the rcu_state structure. This paves the way for linking the rcu_state structures together and iterating over them. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 ++++---- kernel/rcutree_plugin.h | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dda43d826504..5376a156be8a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1788,9 +1788,10 @@ unlock_fqs_ret: * whom the rdp belongs. 
*/ static void -__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +__rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -1827,9 +1828,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_process_callbacks(struct softirq_action *unused) { trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state, - &__get_cpu_var(rcu_sched_data)); - __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + __rcu_process_callbacks(&rcu_sched_state); + __rcu_process_callbacks(&rcu_bh_state); rcu_preempt_process_callbacks(); trace_rcu_utilization("End RCU core"); } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 9cb3a68819fa..5a80cdd9a0a3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -687,8 +687,7 @@ static void rcu_preempt_check_callbacks(int cpu) */ static void rcu_preempt_process_callbacks(void) { - __rcu_process_callbacks(&rcu_preempt_state, - &__get_cpu_var(rcu_preempt_data)); + __rcu_process_callbacks(&rcu_preempt_state); } #ifdef CONFIG_RCU_BOOST -- cgit v1.2.3-59-g8ed1b From 6ce75a2326e6f8b3bdfb60e1de7934b89858e87b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 11:01:13 -0700 Subject: rcu: Introduce for_each_rcu_flavor() and use it The arrival of TREE_PREEMPT_RCU some years back included some ugly code involving either #ifdef or #ifdef'ed wrapper functions to iterate over all non-SRCU flavors of RCU. This commit therefore introduces a for_each_rcu_flavor() iterator over the rcu_state structures for each flavor of RCU to clean up a bit of the ugliness. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 53 +++++++++++++--------- kernel/rcutree.h | 12 +++-- kernel/rcutree_plugin.h | 116 ------------------------------------------------ 3 files changed, 37 insertions(+), 144 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5376a156be8a..b61c3ffc80e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -84,6 +84,7 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. 
*/ static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; @@ -860,9 +861,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) */ void rcu_cpu_stall_reset(void) { - rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_preempt_stall_reset(); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } static struct notifier_block rcu_panic_block = { @@ -1827,10 +1829,11 @@ __rcu_process_callbacks(struct rcu_state *rsp) */ static void rcu_process_callbacks(struct softirq_action *unused) { + struct rcu_state *rsp; + trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state); - __rcu_process_callbacks(&rcu_bh_state); - rcu_preempt_process_callbacks(); + for_each_rcu_flavor(rsp) + __rcu_process_callbacks(rsp); trace_rcu_utilization("End RCU core"); } @@ -2241,9 +2244,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_pending(int cpu) { - return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_pending(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + return 1; + return 0; } /* @@ -2253,10 +2259,13 @@ static int rcu_pending(int cpu) */ static int rcu_cpu_has_callbacks(int cpu) { + struct rcu_state *rsp; + /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_sched_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_cpu_has_callbacks(cpu); + for_each_rcu_flavor(rsp) + if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) + return 1; + return 0; } /* @@ -2551,9 +2560,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) static void __cpuinit rcu_prepare_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_sched_state, 0); - rcu_init_percpu_data(cpu, &rcu_bh_state, 0); - rcu_preempt_init_percpu_data(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_init_percpu_data(cpu, rsp, + strcmp(rsp->name, "rcu_preempt") == 0); } /* @@ -2565,6 +2576,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + struct rcu_state *rsp; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2589,18 +2601,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. 
*/ - rcu_cleanup_dying_cpu(&rcu_bh_state); - rcu_cleanup_dying_cpu(&rcu_sched_state); - rcu_preempt_cleanup_dying_cpu(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); - rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); - rcu_preempt_cleanup_dead_cpu(cpu); + for_each_rcu_flavor(rsp) + rcu_cleanup_dead_cpu(cpu, rsp); break; default: break; @@ -2717,6 +2727,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } + list_add(&rsp->flavors, &rcu_struct_flavors); } /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index be10286ad380..b92c4550a6e6 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -422,8 +422,13 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. */ + struct list_head flavors; /* List of RCU flavors. */ }; +extern struct list_head rcu_struct_flavors; +#define for_each_rcu_flavor(rsp) \ + list_for_each_entry((rsp), &rcu_struct_flavors, flavors) + /* Return values for rcu_preempt_offline_tasks(). */ #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ @@ -466,25 +471,18 @@ static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); -static void rcu_preempt_stall_reset(void); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_cleanup_dead_cpu(int cpu); static void rcu_preempt_check_callbacks(int cpu); -static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ -static int rcu_preempt_pending(int cpu); -static int rcu_preempt_cpu_has_callbacks(int cpu); -static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5a80cdd9a0a3..d18b4d383afe 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -544,16 +544,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -/* - * Suppress preemptible RCU's CPU stall warnings by pushing the - * time of the next stall-warning message comfortably far into the - * future. - */ -static void rcu_preempt_stall_reset(void) -{ - rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; -} - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace @@ -654,14 +644,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Do CPU-offline processing for preemptible RCU. 
- */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ - rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); -} - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, @@ -682,14 +664,6 @@ static void rcu_preempt_check_callbacks(int cpu) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_state); -} - #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void) @@ -921,24 +895,6 @@ mb_ret: } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -/* - * Check to see if there is any immediate preemptible-RCU-related work - * to be done. - */ -static int rcu_preempt_pending(int cpu) -{ - return __rcu_pending(&rcu_preempt_state, - &per_cpu(rcu_preempt_data, cpu)); -} - -/* - * Does preemptible RCU have callbacks on this CPU? - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return !!per_cpu(rcu_preempt_data, cpu).nxtlist; -} - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. */ @@ -948,23 +904,6 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Initialize preemptible RCU's per-CPU data. - */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ - rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); -} - -/* - * Move preemptible RCU's callbacks from dying CPU to other online CPU - * and record a quiescent state. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ - rcu_cleanup_dying_cpu(&rcu_preempt_state); -} - /* * Initialize preemptible RCU's state structures. */ @@ -1049,14 +988,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return 0; } -/* - * Because preemptible RCU does not exist, there is no need to suppress - * its CPU stall warnings. - */ -static void rcu_preempt_stall_reset(void) -{ -} - /* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for @@ -1084,14 +1015,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Because preemptible RCU does not exist, it never needs CPU-offline - * processing. - */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ -} - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1100,14 +1023,6 @@ static void rcu_preempt_check_callbacks(int cpu) { } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - /* * Queue an RCU callback for lazy invocation after a grace period. * This will likely be later named something like "call_rcu_lazy()", @@ -1148,22 +1063,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -/* - * Because preemptible RCU does not exist, it never has any work to do. - */ -static int rcu_preempt_pending(int cpu) -{ - return 0; -} - -/* - * Because preemptible RCU does not exist, it never has callbacks - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return 0; -} - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). @@ -1174,21 +1073,6 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Because preemptible RCU does not exist, there is no per-CPU - * data to initialize. 
- */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ -} - -/* - * Because there is no preemptible RCU, there is no cleanup to do. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ -} - /* * Because preemptible RCU does not exist, it need not be initialized. */ -- cgit v1.2.3-59-g8ed1b From c0cc962da3e7770feb3665f087ea3e23d8c15479 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 12:25:39 -0700 Subject: rcu: Use for_each_rcu_flavor() in TREE_RCU tracing This commit applies the new for_each_rcu_flavor() macro to the kernel/rcutree_trace.c file. Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 116 ++++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 057408be6c3b..a16ddbd6fdc4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,24 +46,15 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -static void print_rcubarrier(struct seq_file *m, struct rcu_state *rsp) -{ - seq_printf(m, "%c bcc: %d nbd: %lu\n", - rsp->rcu_barrier_in_progress ? 'B' : '.', - atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); -} - static int show_rcubarrier(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt: "); - print_rcubarrier(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched: "); - print_rcubarrier(m, &rcu_sched_state); - seq_puts(m, "rcu_bh: "); - print_rcubarrier(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", + rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); return 0; } @@ -129,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -#define PRINT_RCU_DATA(name, func, m) \ - do { \ - int _p_r_d_i; \ - \ - for_each_possible_cpu(_p_r_d_i) \ - func(m, &per_cpu(name, _p_r_d_i)); \ - } while (0) - static int show_rcudata(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); - seq_puts(m, "rcu_bh:\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); + int cpu; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -200,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { + int cpu; + struct rcu_state *rsp; + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); @@ -207,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "\"rcu_preempt:\"\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "\"rcu_sched:\"\n"); - 
PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); - seq_puts(m, "\"rcu_bh:\"\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); + for_each_rcu_flavor(rsp) { + seq_printf(m, "\"%s:\"\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -304,9 +287,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " + seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->completed, gpnum, rsp->fqs_state, + rsp->name, rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, @@ -329,14 +312,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) static int show_rcuhier(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_one_rcu_state(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_one_rcu_state(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_one_rcu_state(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + print_one_rcu_state(m, rsp); return 0; } @@ -377,11 +356,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) static int show_rcugp(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - show_one_rcugp(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - show_one_rcugp(m, &rcu_sched_state); - show_one_rcugp(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + show_one_rcugp(m, rsp); return 0; } @@ -416,28 +394,20 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_need_nothing); } -static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +static int show_rcu_pending(struct seq_file *m, void *unused) { int cpu; struct rcu_data *rdp; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } } -} - -static int show_rcu_pending(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_rcu_pendings(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_rcu_pendings(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_rcu_pendings(m, &rcu_bh_state); return 0; } -- cgit v1.2.3-59-g8ed1b From ff015030c939f0bec68fa9b8898da3aaa7fe55ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jun 2012 13:16:58 -0700 Subject: rcu: RCU_SAVE_DYNTICK code no longer ever dead Before RCU had unified idle, the RCU_SAVE_DYNTICK leg of the switch statement in force_quiescent_state() was dead code for CONFIG_NO_HZ=n kernel builds. With unified idle, the code is never dead. This commit therefore removes the "if" statement designed to make gcc aware of when the code was and was not dead. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b61c3ffc80e9..967b4bed2cf3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1747,8 +1747,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) - break; /* So gcc recognizes the dead code. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ -- cgit v1.2.3-59-g8ed1b From 751a68b2e46fe11a42450cb55a16e8065eddec7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 May 2012 13:44:44 -0700 Subject: rcu: Rationalize ordering of torture_ops list Move the raw SRCU interfaces out of the middle of the normal SRCU interfaces. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e66b34ab7555..9850479f319d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1908,8 +1908,8 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, - &srcu_raw_sync_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, + &srcu_raw_ops, &srcu_raw_sync_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); -- cgit v1.2.3-59-g8ed1b From e3f8d3788ed7cf55946030dc9b76e73edb111602 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 May 2012 10:21:50 -0700 Subject: rcu: Test srcu_barrier() from rcutorture test suite SRCU now has a call_srcu() and an srcu_barrier(), but rcutorture does not test them. This commit adds the machinery to allow rcutorture's existing tests for call_rcu() and rcu_barrier() to apply to the SRCU equivalents. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9850479f319d..7b6935e0cee3 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -635,6 +635,17 @@ static void srcu_torture_synchronize(void) synchronize_srcu(&srcu_ctl); } +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + static int srcu_torture_stats(char *page) { int cnt = 0; @@ -661,8 +672,8 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .name = "srcu" }; -- cgit v1.2.3-59-g8ed1b From c6ebcbb60c8c68a88160fe54302e851700d1362c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 May 2012 19:21:41 -0700 Subject: rcu: Fix bug in rcu_barrier() torture test The child threads in the rcu_torture_barrier_cbs() are improperly synchronized, which can cause the rcu_barrier() tests to hang. The failure mode is as follows: 1. CPU 0 running in rcu_torture_barrier() sets barrier_cbs_count to n_barrier_cbs. 2. 
CPU 1 running in rcu_torture_barrier_cbs() wakes up, posts its RCU callback, and atomically decrements barrier_cbs_count. Because barrier_cbs_count is not zero, it does not do the wake_up(). 3. CPU 2 running in rcu_torture_barrier_cbs() wakes up, but finds that barrier_cbs_count is not equal to n_barrier_cbs, and so returns to sleep. 4. The value of barrier_cbs_count therefore never reaches zero, which causes the test to hang. This commit therefore uses a phase variable to coordinate the test, preventing this scenario from occurring. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7b6935e0cee3..f7fe73e59c9f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -206,6 +206,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); @@ -1642,6 +1643,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; + bool lastphase = 0; struct rcu_head rcu; init_rcu_head_on_stack(&rcu); @@ -1649,9 +1651,11 @@ static int rcu_torture_barrier_cbs(void *arg) set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], - atomic_read(&barrier_cbs_count) == n_barrier_cbs || + barrier_phase != lastphase || kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP); + lastphase = barrier_phase; + smp_mb(); /* ensure barrier_phase load before ->call(). */ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); @@ -1676,7 +1680,8 @@ static int rcu_torture_barrier(void *arg) do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); - /* wake_up() path contains the required barriers. */ + smp_mb(); /* Ensure barrier_phase after prior assignments. */ + barrier_phase = !barrier_phase; for (i = 0; i < n_barrier_cbs; i++) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, -- cgit v1.2.3-59-g8ed1b From 143aa672f4fc643420c8325ad09c379ed33a27cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 May 2012 18:17:48 -0700 Subject: rcu: Fix diagnostic-printk typo in rcutorture The rcu_torture_barrier() function has a copy-and-paste typo in the string passed to rcutorture_shutdown_absorb(), which this commit fixes. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index f7fe73e59c9f..045a3dc233ea 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1700,7 +1700,7 @@ static int rcu_torture_barrier(void *arg) schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + rcutorture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); return 0; -- cgit v1.2.3-59-g8ed1b From 72472a02a9c4507ef54d03d71bb253c26015f52c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 May 2012 17:50:51 -0700 Subject: rcu: Make rcutorture fakewriters invoke rcu_barrier() The current rcutorture rcu_barrier() testing never intentionally runs more than one instance of rcu_barrier() at a given time. This fails to test the the shiny new concurrency features of rcu_barrier(). This commit therefore modifies the rcutorture fakewriter kthread to randomly invoke rcu_barrier() rather than the usual synchronize_rcu(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 045a3dc233ea..c279ee920947 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1025,7 +1025,11 @@ rcu_torture_fakewriter(void *arg) do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); - cur_ops->sync(); + if (cur_ops->cb_barrier != NULL && + rcu_random(&rand) % (nfakewriters * 8) == 0) + cur_ops->cb_barrier(); + else + cur_ops->sync(); rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); -- cgit v1.2.3-59-g8ed1b From 285fe29481d865ae381ad3924c80894e6968c2d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 08:45:12 -0700 Subject: rcu: Fix detection of abruptly-ending stall The code that attempts to identify stalls that end just as we detect them is broken by both flavors of initialization failure. This commit therefore properly initializes and computes the count of the number of reasons why the RCU grace period is stalled. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..dc8c5284fe06 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -733,7 +733,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) int cpu; long delta; unsigned long flags; - int ndetected; + int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); /* Only let one CPU complain about others per time interval. */ @@ -774,7 +774,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) */ rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected = rcu_print_task_stall(rnp); + ndetected += rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); -- cgit v1.2.3-59-g8ed1b From 3f5d3ea64f1783f0d4ea0d35890ae3297f045a8b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 9 May 2012 15:39:56 -0700 Subject: rcu: Consolidate duplicate callback-list initialization There are a couple of open-coded initializations of the rcu_data structure's RCU callback list. This commit therefore consolidates them into a new init_callback_list() function. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dc8c5284fe06..81e0394e46af 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) return ret; } +/* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + int i; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; +} + /* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp @@ -1328,8 +1340,6 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; - /* * Orphan the callbacks. First adjust the counts. This is safe * because ->onofflock excludes _rcu_barrier()'s adoption of @@ -1369,9 +1379,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, } /* Finally, initialize the rcu_data structure's list to empty. */ - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); } /* @@ -2407,16 +2415,13 @@ static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - int i; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -- cgit v1.2.3-59-g8ed1b From 1d1fb395f6dbc07b36285bbedcf01a73b57f7cb5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 15:44:42 -0700 Subject: rcu: Add ACCESS_ONCE() to ->qlen accesses The _rcu_barrier() function accesses other CPUs' rcu_data structure's ->qlen field without benefit of locking. This commit therefore adds the required ACCESS_ONCE() wrappers around accesses and updates that need it. ACCESS_ONCE() is not needed when a CPU accesses its own ->qlen, or in code that cannot run while _rcu_barrier() is sampling ->qlen fields. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 81e0394e46af..89addada3e3a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1350,7 +1350,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->qlen += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; } /* @@ -1600,7 +1600,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). 
*/ rdp->qlen_lazy -= count_lazy; - rdp->qlen -= count; + ACCESS_ONCE(rdp->qlen) -= count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -1889,7 +1889,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - rdp->qlen++; + ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; else @@ -2423,7 +2423,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); init_callback_list(rdp); rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -- cgit v1.2.3-59-g8ed1b From 2a3fa843b555d202e682bf08c65ee1a4a93c79cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 May 2012 11:58:36 -0700 Subject: rcu: Consolidate tree/tiny __rcu_read_{,un}lock() implementations The CONFIG_TREE_PREEMPT_RCU and CONFIG_TINY_PREEMPT_RCU versions of __rcu_read_lock() and __rcu_read_unlock() are identical, so this commit consolidates them into kernel/rcupdate.h. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + kernel/rcupdate.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutiny_plugin.h | 47 +---------------------------------------------- kernel/rcutree_plugin.h | 47 +---------------------------------------------- 4 files changed, 47 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index fb8e6db511d7..698555ebf49b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -147,6 +147,7 @@ extern void synchronize_sched(void); extern void __rcu_read_lock(void); extern void __rcu_read_unlock(void); +extern void rcu_read_unlock_special(struct task_struct *t); void synchronize_rcu(void); /* diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 95cba41ce1e9..4e6a61b15e86 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -53,6 +53,50 @@ #ifdef CONFIG_PREEMPT_RCU +/* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. 
*/ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + /* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..a269b0da0eb6 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { RCU_TRACE(.rcb.name = "rcu_preempt") }; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(void); static void rcu_report_exp_done(void); @@ -526,24 +525,12 @@ void rcu_preempt_note_context_switch(void) local_irq_restore(flags); } -/* - * Tiny-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -626,38 +613,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) local_irq_restore(flags); } -/* - * Tiny-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - /* * Check for a quiescent state from the current CPU. 
When a task blocks, * the task is recorded in the rcu_preempt_ctrlblk structure, which is diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..4b6b17cdf66b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -78,7 +78,6 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -232,18 +231,6 @@ static void rcu_preempt_note_context_switch(int cpu) local_irq_restore(flags); } -/* - * Tree-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - /* * Check for preempted RCU readers blocking the current grace period * for the specified rcu_node structure. If the caller needs a reliable @@ -310,7 +297,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -418,38 +405,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) } } -/* - * Tree-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* -- cgit v1.2.3-59-g8ed1b From 62fde6edf12b60fddb13a3f0a779c8be0bb7447e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 May 2012 22:10:24 -0700 Subject: rcu: Make __call_rcu() handle invocation from idle Although __call_rcu() is handled correctly when called from a momentary non-idle period, if it is called on a CPU that RCU believes to be idle on RCU_FAST_NO_HZ kernels, the callback might be indefinitely postponed. This commit therefore ensures that RCU is aware of the new callback and has a chance to force the CPU out of dyntick-idle mode when a new callback is posted. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 13 ++++--------- kernel/rcutree.c | 15 +++++++++------ 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 31568c734525..26f6417f0264 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -256,6 +256,10 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SMP) +extern int rcu_is_cpu_idle(void); +#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SMP) */ + #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ @@ -267,15 +271,6 @@ static inline bool rcu_lockdep_current_cpu_online(void) #ifdef CONFIG_DEBUG_LOCK_ALLOC -#ifdef CONFIG_PROVE_RCU -extern int rcu_is_cpu_idle(void); -#else /* !CONFIG_PROVE_RCU */ -static inline int rcu_is_cpu_idle(void) -{ - return 0; -} -#endif /* else !CONFIG_PROVE_RCU */ - static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 89addada3e3a..a4a9c916ad36 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -585,8 +585,6 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } -#ifdef CONFIG_PROVE_RCU - /** * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * @@ -604,7 +602,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* * Is the current CPU online? Disable preemption to avoid false positives @@ -645,9 +643,7 @@ bool rcu_lockdep_current_cpu_online(void) } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ /** * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle @@ -1904,6 +1900,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), else trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + /* + * If called from an extended quiescent state, invoke the RCU + * core in order to force a re-evaluation of RCU's idleness. + */ + if (rcu_is_cpu_idle()) + invoke_rcu_core(); + /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { local_irq_restore(flags); -- cgit v1.2.3-59-g8ed1b From a16b7a693430406dc229ab0c6b154f669a2031c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 26 May 2012 08:56:01 -0700 Subject: rcu: Prevent __call_rcu() from invoking RCU core on offline CPUs The __call_rcu() function will invoke the RCU core, for example, if it detects that the current CPU has too many callbacks. However, this can happen on an offline CPU that is on its way to the idle loop, in which case it is an error to invoke the RCU core, and the excess callbacks will be adopted in any case. This commit therefore adds checks to __call_rcu() for running on an offline CPU, refraining from invoking the RCU core in this case. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a4a9c916ad36..ceaa95923a87 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1904,11 +1904,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (rcu_is_cpu_idle()) + if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) invoke_rcu_core(); - /* If interrupts were disabled, don't dive into RCU core. */ - if (irqs_disabled_flags(flags)) { + /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) { local_irq_restore(flags); return; } -- cgit v1.2.3-59-g8ed1b From 29154c57e35a191c83b19c61b1935c9f21957662 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 May 2012 03:21:48 -0700 Subject: rcu: Split RCU core processing out of __call_rcu() The __call_rcu() function is a bit overweight, so this commit splits it into actual enqueuing of and accounting for the callback (__call_rcu()) and associated RCU-core processing (__call_rcu_core()). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 90 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ceaa95923a87..70c4da7d2a97 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1861,45 +1861,12 @@ static void invoke_rcu_core(void) raise_softirq(RCU_SOFTIRQ); } -static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp, bool lazy) +/* + * Handle any core-RCU processing required by a call_rcu() invocation. + */ +static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, + struct rcu_head *head, unsigned long flags) { - unsigned long flags; - struct rcu_data *rdp; - - WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - smp_mb(); /* Ensure RCU update seen before callback registry. */ - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ - local_irq_save(flags); - rdp = this_cpu_ptr(rsp->rda); - - /* Add the callback to our list. */ - ACCESS_ONCE(rdp->qlen)++; - if (lazy) - rdp->qlen_lazy++; - else - rcu_idle_count_callbacks_posted(); - smp_mb(); /* Count before adding callback for rcu_barrier(). */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; - - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen_lazy, rdp->qlen); - else - trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); - /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -1908,10 +1875,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. 
*/ - if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) { - local_irq_restore(flags); + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) return; - } /* * Force the grace period if too many callbacks or too long waiting. @@ -1944,6 +1909,49 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), } } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); +} + +static void +__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), + struct rcu_state *rsp, bool lazy) +{ + unsigned long flags; + struct rcu_data *rdp; + + WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + smp_mb(); /* Ensure RCU update seen before callback registry. */ + + /* + * Opportunistically note grace-period endings and beginnings. + * Note that we might see a beginning right after we see an + * end, but never vice versa, since this CPU has to pass through + * a quiescent state betweentimes. + */ + local_irq_save(flags); + rdp = this_cpu_ptr(rsp->rda); + + /* Add the callback to our list. */ + ACCESS_ONCE(rdp->qlen)++; + if (lazy) + rdp->qlen_lazy++; + else + rcu_idle_count_callbacks_posted(); + smp_mb(); /* Count before adding callback for rcu_barrier(). */ + *rdp->nxttail[RCU_NEXT_TAIL] = head; + rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + + if (__is_kfree_rcu_offset((unsigned long)func)) + trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, + rdp->qlen_lazy, rdp->qlen); + else + trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + + /* Go handle any RCU core processing required. */ + __call_rcu_core(rsp, rdp, head, flags); local_irq_restore(flags); } -- cgit v1.2.3-59-g8ed1b From d7118175cce7e76b3292b60832813ef1f28b6523 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Jun 2012 07:12:43 -0700 Subject: rcu: Fix rcu_is_cpu_idle() #ifdef in TINY_RCU The rcu_is_cpu_idle() function is used if CONFIG_DEBUG_LOCK_ALLOC, but TINY_RCU defines it only when CONFIG_PROVE_RCU. This causes build failures when CONFIG_DEBUG_LOCK_ALLOC=y but CONFIG_PROVE_RCU=n. This commit therefore adjusts the #ifdefs for rcu_is_cpu_idle() so that it is defined when CONFIG_DEBUG_LOCK_ALLOC=y. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 37a5444204d2..547b1fe5b052 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -172,7 +172,7 @@ void rcu_irq_enter(void) local_irq_restore(flags); } -#ifdef CONFIG_PROVE_RCU +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Test whether RCU thinks that the current CPU is idle. @@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* * Test whether the current CPU was interrupted from idle. Nested -- cgit v1.2.3-59-g8ed1b From 1c17e4d4437d8045a596d9f06c1558dc09e2b372 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Tue, 19 Jun 2012 10:43:16 +0200 Subject: rcu: Prevent uninitialized string in RCU CPU stall info An uninitialized string may be displayed at the end of the rcu_preempt detected stall info such as 0: (1 GPs behind) idle=075/140000000000000/0 =8?^D=8?^D ^^^^^^^^^^ if CONFIG_RCU_FAST_NO_HZ is not defined. This trivial patch clears the string in this case. 
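To make the failure mode concrete: the stall-info code hands a caller-allocated character buffer to print_cpu_stall_fast_no_hz(), and the CONFIG_RCU_FAST_NO_HZ=n stub of that helper used to return without writing anything into it, so whatever happened to be on the stack got printed. The sketch below is a hedged, user-space illustration of that pattern and of the one-byte fix; the stub's signature matches the patch, but the caller, format string, and buffer size are illustrative assumptions, not the kernel's actual code.

/* Illustration only: shows why the stub must terminate the buffer. */
#include <stdio.h>

/* Stub used when the optional reporting feature is compiled out. */
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
        (void)cpu;      /* nothing per-CPU to report in the stub */
        *cp = '\0';     /* the fix: without this, *cp is stack garbage */
}

/* Hypothetical caller standing in for the stall-warning printout. */
static void print_cpu_stall_line(int cpu)
{
        char fast_no_hz[72];    /* deliberately uninitialized, like the caller's buffer */

        print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
        printf("%d: (1 GPs behind) idle=075/1/0 %s\n", cpu, fast_no_hz);
}

int main(void)
{
        print_cpu_stall_line(0);
        return 0;
}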
Signed-off-by: Carsten Emde Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4b6b17cdf66b..395cdd1e0634 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2224,6 +2224,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { + *cp = '\0'; } #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.3-59-g8ed1b From 95f0c1de3e6ed4383cc4b5f52ce4ecfb21026b49 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Jun 2012 11:58:27 -0700 Subject: rcu: Disable preemption in rcu_blocking_is_gp() It is time to optimize CONFIG_TREE_PREEMPT_RCU's synchronize_rcu() for uniprocessor optimization, which means that rcu_blocking_is_gp() can no longer rely on RCU read-side critical sections having disabled preemption. This commit therefore disables preemption across rcu_blocking_is_gp()'s scan of the cpu_online_mask. (Updated from previous version to fix embarrassing bug spotted by Wu Fengguang.) Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 70c4da7d2a97..e000a623e635 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1981,28 +1981,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * occasionally incorrectly indicate that there are multiple CPUs online * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. - * - * Of course, sampling num_online_cpus() with preemption enabled can - * give erroneous results if there are concurrent CPU-hotplug operations. - * For example, given a demonic sequence of preemptions in num_online_cpus() - * and CPU-hotplug operations, there could be two or more CPUs online at - * all times, but num_online_cpus() might well return one (or even zero). - * - * However, all such demonic sequences require at least one CPU-offline - * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer - * is only a problem if there is an RCU read-side critical section executing - * throughout. But RCU-sched and RCU-bh read-side critical sections - * disable either preemption or bh, which prevents a CPU from going offline. - * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return - * that there is only one CPU when in fact there was more than one throughout - * is when there were no RCU readers in the system. If there are no - * RCU readers, the grace period by definition can be of zero length, - * regardless of the number of online CPUs. */ static inline int rcu_blocking_is_gp(void) { + int ret; + might_sleep(); /* Check for RCU read-side critical section. */ - return num_online_cpus() <= 1; + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; } /** -- cgit v1.2.3-59-g8ed1b From cf01537ecf192d9ff15c32a355db5d5af22bea4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 11:26:42 -0700 Subject: rcu: Add check for CPUs going offline with callbacks queued If a CPU goes offline with callbacks queued, those callbacks might be indefinitely postponed, which can result in a system hang. This commit therefore inserts warnings for this condition. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e000a623e635..95c7b61e77e4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1509,6 +1509,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp, true); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", + cpu, rdp->qlen, rdp->nxtlist); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3-59-g8ed1b From bf1304e9cd755be7814ac8365834b7a1d0f06b58 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 15:55:39 -0700 Subject: rcu: Dump only the current CPU's buffers for idle-entry/exit warnings Problems in RCU idle entry and exit are almost always confined to the offending CPU. This commit therefore switches ftrace_dump() from DUMP_ALL to DUMP_ORIG. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4b97bba7396e..6eb48f13eeeb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -358,7 +358,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -468,7 +468,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) trace_rcu_dyntick("Error on exit: not idle task", oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ -- cgit v1.2.3-59-g8ed1b From 28f8555364ef5b54b21251c5c8022109a70626e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 15:37:48 -0700 Subject: rcu: The rcu_needs_cpu() function is not a quiescent state The TINY_PREEMPT_RCU() function rcu_preempt_needs_cpu(), which is called from rcu_needs_cpu(), assumes that it is in a quiescent state with respect to the CPU. This is no longer the case. This commit therefore updates rcu_preempt_needs_cpu() to make it aware that it is not running in a quiescent state. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutiny_plugin.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..849ede51c12d 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -846,8 +846,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); */ int rcu_preempt_needs_cpu(void) { - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; } -- cgit v1.2.3-59-g8ed1b From e84c48ae3024ac2f14ed1c3671e5ea37c60fb838 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 4 Jun 2012 20:45:10 -0700 Subject: rcu: Round FAST_NO_HZ lazy timeout to nearest second Currently, if several CPUs in the same package have all lazy RCU callbacks, their wakeups will be uncorrelated. If all the CPUs are in the same power domain (as is often the case), this will result in unnecessary power-ups of the package. This commit therefore uses round_jiffies() to round the timeouts to a second boundary, increasing the odds that they can be coalesced with each other or with other timeouts. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e4899459f3d..c28d25522a81 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1968,7 +1968,7 @@ static void rcu_idle_count_callbacks_posted(void) */ #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ -#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ +#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ /* @@ -2047,10 +2047,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) return 1; } /* Set up for the possibility that RCU will post a timer. */ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) - *delta_jiffies = RCU_IDLE_GP_DELAY; - else - *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, + RCU_IDLE_GP_DELAY) - jiffies; + } else { + *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; + *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + } return 0; } @@ -2187,10 +2190,11 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); rdtp->idle_gp_timer_expires = - jiffies + RCU_IDLE_GP_DELAY; + round_up(jiffies + RCU_IDLE_GP_DELAY, + RCU_IDLE_GP_DELAY); } else { rdtp->idle_gp_timer_expires = - jiffies + RCU_IDLE_LAZY_GP_DELAY; + round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); trace_rcu_prep_idle("Dyntick with lazy callbacks"); } tp = &rdtp->idle_gp_timer; -- cgit v1.2.3-59-g8ed1b From 9d2ad24306f2fafc3612e5a216aab31f9e56e879 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 24 Jun 2012 10:15:02 -0700 Subject: rcu: Make RCU_FAST_NO_HZ respect nohz= boot parameter If the nohz= boot parameter disables nohz, then RCU_FAST_NO_HZ needs to also disable itself. This commit therefore checks for tick_nohz_enabled being zero, disabling rcu_prepare_for_idle() if so. This commit assumes that tick_nohz_enabled can change at runtime: If this is not the case, then a simpler approach suffices. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 15 +++++++++++++++ kernel/time/tick-sched.c | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 19b61ac1079f..e978845c6800 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -97,6 +97,7 @@ struct rcu_dynticks { /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + int tick_nohz_enabled_snap; /* Previously seen value from sysfs. 
*/ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c28d25522a81..3508000e077d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1971,6 +1971,8 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ +extern int tick_nohz_enabled; + /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2112,6 +2114,7 @@ static void rcu_cleanup_after_idle(int cpu) del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); + rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); } /* @@ -2137,6 +2140,18 @@ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + int tne; + + /* Handle nohz enablement switches conservatively. */ + tne = ACCESS_ONCE(tick_nohz_enabled); + if (tne != rdtp->tick_nohz_enabled_snap) { + if (rcu_cpu_has_callbacks(cpu)) + invoke_rcu_core(); /* force nohz to see update. */ + rdtp->tick_nohz_enabled_snap = tne; + return; + } + if (!tne) + return; /* * If this is an idle re-entry, for example, due to use of diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..66ff07f6184c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void) /* * NO HZ enabled ? */ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; /* * Enable / Disable tickless mode -- cgit v1.2.3-59-g8ed1b From 164c33c6adee609b8b9062cce4c10f764d0dce13 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Mon, 25 Jun 2012 18:18:15 -0700 Subject: sched: Fix fork() error path to not crash In dup_task_struct(), if arch_dup_task_struct() fails, the clean up code fails to clean up correctly. That's because the clean up code depends on unininitalized ti->task pointer. We fix this by making sure that the task and thread_info know about each other before we attempt to take the error path. Signed-off-by: Salman Qazi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120626011815.11323.5533.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..f00e319d8376 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } err = arch_dup_task_struct(tsk, orig); - if (err) - goto out; + /* + * We defer looking at err, because we will need this setup + * for the clean up path to work correctly. + */ tsk->stack = ti; - setup_thread_stack(tsk, orig); + + if (err) + goto out; + clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); -- cgit v1.2.3-59-g8ed1b From 5167e8d5417bf5c322a703d2927daec727ea40dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 15:52:09 +0200 Subject: sched/nohz: Rewrite and fix load-avg computation -- again Thanks to Charles Wang for spotting the defects in the current code: - If we go idle during the sample window -- after sampling, we get a negative bias because we can negate our own sample. 
- If we wake up during the sample window we get a positive bias because we push the sample to a known active period. So rewrite the entire nohz load-avg muck once again, now adding copious documentation to the code. Reported-and-tested-by: Doug Smythies Reported-and-tested-by: Charles Wang Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins [ minor edits ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++ kernel/sched/core.c | 275 ++++++++++++++++++++++++++++++++++------------- kernel/sched/idle_task.c | 1 - kernel/sched/sched.h | 2 - kernel/time/tick-sched.c | 2 + 5 files changed, 213 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4059c0f33f07..20cb7497c59c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1909,6 +1909,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, } #endif +#ifdef CONFIG_NO_HZ +void calc_load_enter_idle(void); +void calc_load_exit_idle(void); +#else +static inline void calc_load_enter_idle(void) { } +static inline void calc_load_exit_idle(void) { } +#endif /* CONFIG_NO_HZ */ + #ifndef CONFIG_CPUMASK_OFFSTACK static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..bb840405335d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2161,11 +2161,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 
+ */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -2182,6 +2244,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -2193,30 +2258,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; -void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_write_idx(void) { + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. 
+ */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void) { - long delta = 0; + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; /* - * Its got a race, we don't care... + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -2302,66 +2455,39 @@ static void calc_global_nohz(void) { long delta, active, n; - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * It could be the one fold was all it took, we done! - */ - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} + calc_load_update += n * LOAD_FREQ; + } -static inline long calc_load_fold_idle(void) -{ - return 0; + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. 
+ */ + smp_wmb(); + calc_load_idx++; } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2369,11 +2495,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -2384,12 +2517,7 @@ void calc_global_load(unsigned long ticks) calc_load_update += LOAD_FREQ; /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. */ calc_global_nohz(); } @@ -2406,13 +2534,16 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); this_rq->calc_load_update += LOAD_FREQ; } +/* + * End of global load-average stuff + */ + /* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void) account_idle_ticks(ticks); #endif + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick -- cgit 
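The arithmetic behind avenrun[] is easy to lose among the fixed-point constants, so here is a standalone userspace sketch of the a1 = a0 * e + a * (1 - e) recurrence the new comments document. It is an illustration only, not kernel code; the FSHIFT/FIXED_1/EXP_* values and the LOAD_INT/LOAD_FRAC macros are assumed to match the long-standing definitions in include/linux/sched.h and fs/proc/loadavg.c.

/* loadavg_demo.c - fixed-point load-average recurrence, illustration only */
#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define EXP_1    1884   /* 1/exp(5sec/1min) in fixed point */
#define EXP_5    2014   /* 1/exp(5sec/5min) */
#define EXP_15   2037   /* 1/exp(5sec/15min) */

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

/* One LOAD_FREQ step: a1 = a0 * e + active * (1 - e), all fixed point. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun[3] = { 0, 0, 0 };
        unsigned long active = 4 * FIXED_1;  /* pretend 4 tasks stay runnable */
        int i;

        for (i = 0; i < 24; i++) {           /* 24 samples = 2 minutes */
                avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                avenrun[2] = calc_load(avenrun[2], EXP_15, active);
        }
        printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
               LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
               LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
               LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
        return 0;
}

With a constant load of four runnable tasks the 1-minute figure approaches 4.00 much faster than the 15-minute one, which is exactly what the three EXP_* decay factors are tuned to do; the NO_HZ machinery in the patch exists purely so that idle CPUs can stop sampling without distorting this recurrence.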
v1.2.3-59-g8ed1b From 74a7f08448adea6cb47cd9b260c98ff168117e92 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 15 Jun 2012 11:50:25 -0600 Subject: devicetree: add helper inline for retrieving a node's full name The pattern (np ? np->full_name : "") is rather common in the kernel, but can also make for quite long lines. This patch adds a new inline function, of_node_full_name() so that the test for a valid node pointer doesn't need to be open coded at all call sites. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Signed-off-by: Rob Herring --- arch/microblaze/pci/pci-common.c | 6 ++---- arch/powerpc/kernel/pci-common.c | 6 ++---- arch/powerpc/kernel/vio.c | 5 ++--- arch/powerpc/platforms/cell/iommu.c | 3 +-- arch/powerpc/platforms/pseries/iommu.c | 2 +- arch/sparc/kernel/of_device_64.c | 2 +- drivers/of/base.c | 2 +- drivers/of/irq.c | 2 +- include/linux/of.h | 10 ++++++++++ kernel/irq/irqdomain.c | 8 ++++---- 10 files changed, 25 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index ed22bfc5db14..ca8f6e769960 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -249,8 +249,7 @@ int pci_read_irq_line(struct pci_dev *pci_dev) } else { pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n", oirq.size, oirq.specifier[0], oirq.specifier[1], - oirq.controller ? oirq.controller->full_name : - ""); + of_node_full_name(oirq.controller)); virq = irq_create_of_mapping(oirq.controller, oirq.specifier, oirq.size); @@ -1493,8 +1492,7 @@ static void __devinit pcibios_scan_phb(struct pci_controller *hose) struct pci_bus *bus; struct device_node *node = hose->dn; - pr_debug("PCI: Scanning PHB %s\n", - node ? node->full_name : ""); + pr_debug("PCI: Scanning PHB %s\n", of_node_full_name(node)); pcibios_setup_phb_resources(hose, &resources); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 8e78e93c8185..886c254fd565 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -248,8 +248,7 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) } else { pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n", oirq.size, oirq.specifier[0], oirq.specifier[1], - oirq.controller ? oirq.controller->full_name : - ""); + of_node_full_name(oirq.controller)); virq = irq_create_of_mapping(oirq.controller, oirq.specifier, oirq.size); @@ -1628,8 +1627,7 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose) struct device_node *node = hose->dn; int mode; - pr_debug("PCI: Scanning PHB %s\n", - node ? node->full_name : ""); + pr_debug("PCI: Scanning PHB %s\n", of_node_full_name(node)); /* Get some IO space for the new PHB */ pcibios_setup_phb_io_space(hose); diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index cb87301ccd55..63f72ede4341 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -1296,8 +1296,7 @@ static void __devinit vio_dev_release(struct device *dev) struct iommu_table *tbl = get_iommu_table_base(dev); if (tbl) - iommu_free_table(tbl, dev->of_node ? - dev->of_node->full_name : dev_name(dev)); + iommu_free_table(tbl, of_node_full_name(dev->of_node)); of_node_put(dev->of_node); kfree(to_vio_dev(dev)); } @@ -1509,7 +1508,7 @@ static ssize_t devspec_show(struct device *dev, { struct device_node *of_node = dev->of_node; - return sprintf(buf, "%s\n", of_node ? 
of_node->full_name : "none"); + return sprintf(buf, "%s\n", of_node_full_name(of_node)); } static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index b9f509a34c01..b6732004c882 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -552,8 +552,7 @@ static struct iommu_table *cell_get_iommu_table(struct device *dev) iommu = cell_iommu_for_node(dev_to_node(dev)); if (iommu == NULL || list_empty(&iommu->windows)) { printk(KERN_ERR "iommu: missing iommu for %s (node %d)\n", - dev->of_node ? dev->of_node->full_name : "?", - dev_to_node(dev)); + of_node_full_name(dev->of_node), dev_to_node(dev)); return NULL; } window = list_entry(iommu->windows.next, struct iommu_window, list); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 2d311c0caf8e..6b58a395dff6 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1051,7 +1051,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pdn || !PCI_DN(pdn)) { printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " "no DMA window found for pci dev=%s dn=%s\n", - pci_name(dev), dn? dn->full_name : ""); + pci_name(dev), of_node_full_name(dn)); return; } pr_debug(" parent is %s\n", pdn->full_name); diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c index 7a3be6f6737a..7bbdc26d9512 100644 --- a/arch/sparc/kernel/of_device_64.c +++ b/arch/sparc/kernel/of_device_64.c @@ -580,7 +580,7 @@ static unsigned int __init build_one_device_irq(struct platform_device *op, printk("%s: Apply [%s:%x] imap --> [%s:%x]\n", op->dev.of_node->full_name, pp->full_name, this_orig_irq, - (iret ? iret->full_name : "NULL"), irq); + of_node_full_name(iret), irq); if (!iret) break; diff --git a/drivers/of/base.c b/drivers/of/base.c index 85757952f12d..9ec0a2f1b028 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1173,7 +1173,7 @@ static void of_alias_add(struct alias_prop *ap, struct device_node *np, ap->stem[stem_len] = 0; list_add_tail(&ap->link, &aliases_lookup); pr_debug("adding DT alias:%s: stem=%s id=%i node=%s\n", - ap->alias, ap->stem, ap->id, np ? np->full_name : NULL); + ap->alias, ap->stem, ap->id, of_node_full_name(np)); } /** diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 9cf00602f566..ff8ab7b27373 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -255,7 +255,7 @@ int of_irq_map_raw(struct device_node *parent, const __be32 *intspec, skiplevel: /* Iterate again with new parent */ - pr_debug(" -> new parent: %s\n", newpar ? newpar->full_name : "<>"); + pr_debug(" -> new parent: %s\n", of_node_full_name(newpar)); of_node_put(ipar); ipar = newpar; newpar = NULL; diff --git a/include/linux/of.h b/include/linux/of.h index 2ec1083af7ff..1012377cae92 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -163,6 +163,11 @@ static inline int of_node_to_nid(struct device_node *np) { return -1; } #define of_node_to_nid of_node_to_nid #endif +static inline const char* of_node_full_name(struct device_node *np) +{ + return np ? 
np->full_name : ""; +} + extern struct device_node *of_find_node_by_name(struct device_node *from, const char *name); #define for_each_node_by_name(dn, name) \ @@ -303,6 +308,11 @@ const char *of_prop_next_string(struct property *prop, const char *cur); #else /* CONFIG_OF */ +static inline const char* of_node_full_name(struct device_node *np) +{ + return ""; +} + static inline bool of_have_populated_dt(void) { return false; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 41c1564103f1..38c5eb839c92 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -448,7 +448,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); + hwirq, of_node_full_name(domain->of_node), virq); return virq; } @@ -477,7 +477,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return intspec[0]; #endif pr_warning("no irq domain found for %s !\n", - controller->full_name); + of_node_full_name(controller)); return 0; } @@ -725,8 +725,8 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain && desc->irq_data.domain->of_node) - p = desc->irq_data.domain->of_node->full_name; + if (desc->irq_data.domain) + p = of_node_full_name(desc->irq_data.domain->of_node); else p = none; seq_printf(m, "%s\n", p); -- cgit v1.2.3-59-g8ed1b From cfca927972e31a5b3da49bf641c525732ff3c357 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Jun 2012 12:54:17 -0700 Subject: rcu: Introduce check for callback list/count mismatch The recent bug that introduced the RCU callback list/count mismatch showed the need for a diagnostic to check for this, which this commit adds. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 95c7b61e77e4..4154c9567a6d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1612,6 +1612,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = rdp->qlen; + WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); local_irq_restore(flags); -- cgit v1.2.3-59-g8ed1b From c701d5d9b384ff03ceb232ef21236364d784a411 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jun 2012 08:08:25 -0700 Subject: rcu: Fix code-style issues involving "else" The Linux kernel coding style says that single-statement blocks should omit curly braces unless the other leg of the "if" statement has multiple statements, in which case the curly braces should be included. This commit fixes RCU's violations of this rule. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 7 ++++--- kernel/rcutorture.c | 3 ++- kernel/rcutree.c | 7 ++++--- kernel/rcutree_plugin.h | 14 ++++++++------ 4 files changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 116725b5edfb..918fd1e8509c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -350,8 +350,9 @@ static int rcu_initiate_boost(void) rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_callbacks(); - } else + } else { RCU_TRACE(rcu_initiate_boost_trace()); + } return 1; } @@ -778,9 +779,9 @@ void synchronize_rcu_expedited(void) rpcp->exp_tasks = NULL; /* Wait for tail of ->blkd_tasks list to drain. */ - if (!rcu_preempted_readers_exp()) + if (!rcu_preempted_readers_exp()) { local_irq_restore(flags); - else { + } else { rcu_initiate_boost(); local_irq_restore(flags); wait_event(sync_rcu_preempt_exp_wq, diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c279ee920947..155fb129b641 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -408,8 +408,9 @@ rcu_torture_cb(struct rcu_head *p) if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; rcu_torture_free(rp); - } else + } else { cur_ops->deferred_free(rp); + } } static int rcu_no_completed(void) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 117218a43724..f280e542e3e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -892,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct if (rnp->qsmask & rdp->grpmask) { rdp->qs_pending = 1; rdp->passed_quiesce = 0; - } else + } else { rdp->qs_pending = 0; + } zero_cpu_stall_ticks(rdp); } } @@ -2130,9 +2131,9 @@ void synchronize_sched_expedited(void) put_online_cpus(); /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_sched(); return; } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a9194d5606c4..7f3244c0df01 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -390,8 +390,9 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grphi, !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rnp, flags); - } else + } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); + } #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ @@ -757,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); - if (list_empty(&rnp->blkd_tasks)) + if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - else { + } else { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ must_wait = 1; @@ -803,9 +804,9 @@ void synchronize_rcu_expedited(void) * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_rcu(); return; } @@ -2093,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_callbacks(cpu)) { trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else + } else { trace_rcu_prep_idle("Callbacks drained"); + } } /* -- cgit v1.2.3-59-g8ed1b From 5cf05ad758c30d17ff23c2be346b5de982bc2121 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 17 May 2012 15:12:45 -0700 Subject: rcu: Fix broken strings in RCU's source code. Although the C language allows you to break strings across lines, doing this makes it hard for people to find the Linux kernel code corresponding to a given console message. This commit therefore fixes broken strings throughout RCU's source code. Suggested-by: Josh Triplett Suggested-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 3 +-- kernel/rcutorture.c | 33 ++++++++++++++------------------- kernel/rcutree_trace.c | 25 ++++++++++++------------- lib/list_debug.c | 6 ++---- 4 files changed, 29 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c2c0d86dd3ac..115ead2b5155 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -427,8 +427,7 @@ extern int rcu_my_thread_group_empty(void); static inline void rcu_preempt_sleep_check(void) { rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal context switch in RCU read-side " - "critical section"); + "Illegal context switch in RCU read-side critical section"); } #else /* #ifdef CONFIG_PROVE_RCU */ static inline void rcu_preempt_sleep_check(void) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 155fb129b641..25b15033c61f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -49,8 +49,7 @@ #include MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and " - "Josh Triplett "); +MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -1200,27 +1199,27 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld " - "barrier: %ld/%ld:%ld", + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror, + n_rcu_torture_boost_rterror); + cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers, + n_rcu_torture_timers); + cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", n_online_successes, n_online_attempts, n_offline_successes, - n_offline_attempts, + n_offline_attempts); + cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); @@ -1462,8 +1461,7 @@ rcu_torture_shutdown(void *arg) delta = shutdown_time - jiffies_snap; if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu " - "jiffies remaining\n", + "rcu_torture_shutdown task: %lu jiffies remaining\n", torture_type, delta); schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); @@ -1515,8 +1513,7 @@ rcu_torture_onoff(void *arg) if (cpu_down(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "offlined %d\n", + "rcu_torture_onoff task: offlined %d\n", torture_type, cpu); n_offline_successes++; } @@ 
-1529,8 +1526,7 @@ rcu_torture_onoff(void *arg) if (cpu_up(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "onlined %d\n", + "rcu_torture_onoff task: onlined %d\n", torture_type, cpu); n_online_successes++; } @@ -1952,8 +1948,7 @@ rcu_torture_init(void) return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " - "fqs_duration, fqs disabled.\n"); + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index a16ddbd6fdc4..abffb486e94e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -218,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) { - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " - "j=%04x bt=%04x\n", + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", rnp->grplo, rnp->grphi, "T."[list_empty(&rnp->blkd_tasks)], "N."[!rnp->gp_tasks], @@ -227,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) "B."[!rnp->boost_tasks], convert_kthread_status(rnp->boost_kthread_status), rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts, + rnp->n_normal_boosts); + seq_printf(m, "j=%04x bt=%04x\n", (int)(jiffies & 0xffff), (int)(rnp->boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - " balk", + seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", rnp->n_balk_blkd_tasks, rnp->n_balk_exp_gp_tasks, rnp->n_balk_boost_tasks, @@ -287,11 +286,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", + seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", rsp->name, rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff), + (int)(jiffies & 0xffff)); + seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); @@ -378,16 +377,16 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { - seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld rpq=%ld cbr=%ld cng=%ld " - "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + seq_printf(m, "%3d%cnp=%ld ", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->n_rcu_pending, + rdp->n_rcu_pending); + seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", rdp->n_rp_qs_pending, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp, + rdp->n_rp_cpu_needs_gp); + seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, rdp->n_rp_need_fqs, diff --git a/lib/list_debug.c b/lib/list_debug.c index 23a5e031cd8b..c24c2f7e296f 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -87,12 +87,10 @@ void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { WARN(next->prev != prev, - "list_add_rcu corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", + "list_add_rcu corruption. next->prev should be prev (%p), but was %p. 
(next=%p).\n", prev, next->prev, next); WARN(prev->next != next, - "list_add_rcu corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", + "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", next, prev->next, prev); new->next = next; new->prev = prev; -- cgit v1.2.3-59-g8ed1b From 5c53d819c71c63fdc91f30a59164583f68e2d63a Mon Sep 17 00:00:00 2001 From: liu chuansheng Date: Fri, 6 Jul 2012 09:50:08 -0700 Subject: printk: replacing the raw_spin_lock/unlock with raw_spin_lock/unlock_irq In function devkmsg_read/writev/llseek/poll/open()..., the function raw_spin_lock/unlock is used, there is potential deadlock case happening. CPU1: thread1 doing the cat /dev/kmsg: raw_spin_lock(&logbuf_lock); while (user->seq == log_next_seq) { when thread1 run here, at this time one interrupt is coming on CPU1 and running based on this thread,if the interrupt handle called the printk which need the logbuf_lock spin also, it will cause deadlock. So we should use raw_spin_lock/unlock_irq here. Acked-by: Kay Sievers Signed-off-by: liu chuansheng Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index dba18211685e..12886cd19cd9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -430,20 +430,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } if (user->seq < log_first_seq) { @@ -451,7 +451,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } @@ -501,7 +501,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (len > count) { ret = -EINVAL; @@ -528,7 +528,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); switch (whence) { case SEEK_SET: /* the first record */ @@ -552,7 +552,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -566,14 +566,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; ret = POLLIN|POLLRDNORM; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -597,10 +597,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock(&logbuf_lock); + 
raw_spin_lock_irq(&logbuf_lock); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); file->private_data = user; return 0; -- cgit v1.2.3-59-g8ed1b From e3f5a5f27153228569f3396049838e9727dae86e Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: escape the backslash character while exporting data Non-printable characters in the log data are hex-escaped to ensure safe post processing. We need to escape a backslash we find in the data, to be able to distinguish it from a backslash we add for the escaping. Also escape the non-printable character 127. Thanks to Miloslav Trmac for the heads up. Reported-by: Michael Neuling Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 12886cd19cd9..505863aa3a7f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -465,7 +465,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, for (i = 0; i < msg->text_len; i++) { unsigned char c = log_text(msg)[i]; - if (c < ' ' || c >= 128) + if (c < ' ' || c >= 127 || c == '\\') len += sprintf(user->buf + len, "\\x%02x", c); else user->buf[len++] = c; @@ -489,7 +489,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, continue; } - if (c < ' ' || c >= 128) { + if (c < ' ' || c >= 127 || c == '\\') { len += sprintf(user->buf + len, "\\x%02x", c); continue; } -- cgit v1.2.3-59-g8ed1b From 43a73a50b352cd3df25b3ced72033942a6a0f919 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: add the facility number to the syslog prefix After the recent split of facility and level into separate variables, we miss the facility value (always 0 for kernel-originated messages) in the syslog prefix. On Tue, Jul 3, 2012 at 12:45 PM, Dan Carpenter wrote: > Static checkers complain about the impossible condition here. > > In 084681d14e ('printk: flush continuation lines immediately to > console'), we changed msg->level from being a u16 to being an unsigned > 3 bit bitfield. Cc: Dan Carpenter Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 505863aa3a7f..37cde752cb8a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -818,15 +818,18 @@ static size_t print_time(u64 ts, char *buf) static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; + unsigned int prefix = (msg->facility << 3) | msg->level; if (syslog) { if (buf) { - len += sprintf(buf, "<%u>", msg->level); + len += sprintf(buf, "<%u>", prefix); } else { len += 3; - if (msg->level > 9) - len++; - if (msg->level > 99) + if (prefix > 999) + len += 3; + else if (prefix > 99) + len += 2; + else if (prefix > 9) len++; } } -- cgit v1.2.3-59-g8ed1b From cb424ffe9f45ad80267f2a98fbd9bf21caa0ce22 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: properly handle concurrent non-blocking read() from /proc/kmsg The /proc/kmsg read() interface is internally simply wired up to a sequence of syslog() syscalls, which might are racy between their checks and actions, regarding concurrency. 
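One practical consequence of the facility-prefix change above: the number inside the angle brackets is now (facility << 3) | level rather than the bare level, the same packing classic syslog uses, so a consumer has to shift and mask to get the pieces back. A tiny userspace sketch (the sample values are illustrative; kernel records keep facility 0, so they still read "<6>" and the like):

#include <stdio.h>

int main(void)
{
        /* e.g. "<6>" from the kernel, "<30>" from a typical daemon record */
        unsigned int samples[] = { 6, 30 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("<%u> -> facility %u, level %u\n",
                       samples[i], samples[i] >> 3, samples[i] & 7);
        return 0;
}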
In the (very uncommon) case of concurrent readers of /dev/kmsg, relying on usual O_NONBLOCK behavior, the recently introduced mutex might block an O_NONBLOCK reader in read(), when poll() returns for it, but another process has already read the data in the meantime. We've seen that while running artificial test setups and tools that "fight" about /proc/kmsg data. This restores the original /proc/kmsg behavior, where in case of concurrent read()s, poll() might wake up but the read() syscall will just return 0 to the caller, while another process has "stolen" the data. This is in the general case not the expected behavior, but it is the exact same one, that can easily be triggered with a 3.4 kernel, and some tools might just rely on it. The mutex is not needed, the original integrity issue which introduced it, is in the meantime covered by: "fill buffer with more than a single message for SYSLOG_ACTION_READ" 116e90b23f74d303e8d607c7a7d54f60f14ab9f2 Cc: Yuanhan Liu Acked-by: Jan Beulich Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 37cde752cb8a..be9a82b2f0b3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1021,7 +1021,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; static int saved_console_loglevel = -1; - static DEFINE_MUTEX(syslog_mutex); int error; error = check_syslog_permissions(type, from_file); @@ -1048,17 +1047,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } - error = mutex_lock_interruptible(&syslog_mutex); - if (error) - goto out; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); - if (error) { - mutex_unlock(&syslog_mutex); + if (error) goto out; - } error = syslog_print(buf, len); - mutex_unlock(&syslog_mutex); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: -- cgit v1.2.3-59-g8ed1b From 68b6507dc554ba015b5ed5e13b1ed4993cdf4024 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: make sure all messages reach a newly registered boot console We suppress printing kmsg records to the console, which are already printed immediately while we have received their fragments. Newly registered boot consoles print the entire kmsg buffer during registration. Clear the console-suppress flag after we skipped the record during its first storage, so any later print will see these records as usual. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index be9a82b2f0b3..f02f1f5ddc30 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1953,6 +1953,12 @@ skip: */ console_idx = log_next(console_idx); console_seq++; + /* + * We will get here again when we register a new + * CON_PRINTBUFFER console. Clear the flag so we + * will properly dump everything later. + */ + msg->flags &= ~LOG_NOCONS; goto skip; } -- cgit v1.2.3-59-g8ed1b From 7db5b3ca0ecdb2e8fad52a4770e4e320e61c77a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Jul 2012 15:55:47 -0700 Subject: Revert "cgroup: superblock can't be released with active dentries" This reverts commit fa980ca87d15bb8a1317853f257a505990f3ffde. 
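Returning to the /proc/kmsg semantics described just above: with O_NONBLOCK and more than one reader, poll() can report readable data and the subsequent read() can still come back empty because another reader consumed it, so a robust consumer treats a zero-byte read as "try again", not as end-of-file. A minimal userspace sketch of such a loop (illustrative only; error handling is trimmed, the buffer size is arbitrary, and reading /proc/kmsg needs the usual syslog privileges):

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[8192];
        int fd = open("/proc/kmsg", O_RDONLY | O_NONBLOCK);

        if (fd < 0) {
                perror("open /proc/kmsg");
                return 1;
        }

        for (;;) {
                struct pollfd pfd = { .fd = fd, .events = POLLIN };
                ssize_t n;

                if (poll(&pfd, 1, -1) < 0)
                        break;

                n = read(fd, buf, sizeof(buf));
                if (n > 0) {
                        fwrite(buf, 1, n, stdout);
                        fflush(stdout);
                        continue;
                }
                /* 0 or EAGAIN: another reader won the race, poll again */
                if (n == 0 || errno == EAGAIN)
                        continue;
                perror("read");
                break;
        }
        close(fd);
        return 0;
}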
The commit was an attempt to fix a race condition where a cgroup hierarchy may be unmounted with positive dentry reference on root cgroup. While the commit made the race condition slightly more difficult to trigger, the race was still there and could be reliably triggered using a different test case. Revert the incorrect fix. The next commit will describe the race and fix it correctly. Signed-off-by: Tejun Heo LKML-Reference: <4FEEA5CB.8070809@huawei.com> Reported-by: shyju pv Cc: Sasha Levin Acked-by: Li Zefan --- kernel/cgroup.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2097684cf194..5f134a0e0e3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -901,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * We want to drop the active superblock reference from the - * cgroup creation after all the dentry refs are gone - - * kill_sb gets mighty unhappy otherwise. Mark - * dentry->d_fsdata with cgroup_diput() to tell - * cgroup_d_release() to call deactivate_super(). + * Drop the active superblock reference that we took when we + * created the cgroup */ - dentry->d_fsdata = cgroup_diput; + deactivate_super(cgrp->root->sb); /* * if we're getting rid of the cgroup, refcount should ensure @@ -933,13 +930,6 @@ static int cgroup_delete(const struct dentry *d) return 1; } -static void cgroup_d_release(struct dentry *dentry) -{ - /* did cgroup_diput() tell me to deactivate super? */ - if (dentry->d_fsdata == cgroup_diput) - deactivate_super(dentry->d_sb); -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1547,7 +1537,6 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, - .d_release = cgroup_d_release, }; struct inode *inode = -- cgit v1.2.3-59-g8ed1b From 5db9a4d99b0157a513944e9a44d29c9cec2e91dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Jul 2012 16:08:18 -0700 Subject: cgroup: fix cgroup hierarchy umount race 48ddbe1946 "cgroup: make css->refcnt clearing on cgroup removal optional" allowed a css to linger after the associated cgroup is removed. As a css holds a reference on the cgroup's dentry, it means that cgroup dentries may linger for a while. Destroying a superblock which has dentries with positive refcnts is a critical bug and triggers BUG() in vfs code. As each cgroup dentry holds an s_active reference, any lingering cgroup has both its dentry and the superblock pinned and thus preventing premature release of superblock. Unfortunately, after 48ddbe1946, there's a small window while releasing a cgroup which is directly under the root of the hierarchy. When a cgroup directory is released, vfs layer first deletes the corresponding dentry and then invokes dput() on the parent, which may recurse further, so when a cgroup directly below root cgroup is released, the cgroup is first destroyed - which releases the s_active it was holding - and then the dentry for the root cgroup is dput(). This creates a window where the root dentry's refcnt isn't zero but superblock's s_active is. If umount happens before or during this window, vfs will see the root dentry with non-zero refcnt and trigger BUG(). 
Before 48ddbe1946, this problem didn't exist because the last dentry reference was guaranteed to be put synchronously from rmdir(2) invocation which holds s_active around the whole process. Fix it by holding an extra superblock->s_active reference across dput() from css release, which is the dput() path added by 48ddbe1946 and the only one which doesn't hold an extra s_active ref across the final cgroup dput(). Signed-off-by: Tejun Heo LKML-Reference: <4FEEA5CB.8070809@huawei.com> Reported-by: shyju pv Tested-by: shyju pv Cc: Sasha Levin Acked-by: Li Zefan --- kernel/cgroup.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5f134a0e0e3f..b303dfc7dce0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3883,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, dput_work); + struct dentry *dentry = css->cgroup->dentry; + struct super_block *sb = dentry->d_sb; - dput(css->cgroup->dentry); + atomic_inc(&sb->s_active); + dput(dentry); + deactivate_super(sb); } static void init_cgroup_css(struct cgroup_subsys_state *css, -- cgit v1.2.3-59-g8ed1b From eb02dac93708f581c99858a19162af8ca2b6bfcb Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 9 Jul 2012 10:05:10 -0700 Subject: kmsg: /proc/kmsg - support reading of partial log records Restore support for partial reads of any size on /proc/kmsg, in case the supplied read buffer is smaller than the record size. Some people seem to think is is ia good idea to run: $ dd if=/proc/kmsg bs=1 of=... as a klog bridge. Resolves-bug: https://bugzilla.kernel.org/show_bug.cgi?id=44211 Reported-by: Jukka Ollila Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f02f1f5ddc30..50c33411442d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -217,6 +217,7 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; +static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ static u64 log_first_seq; @@ -890,22 +891,33 @@ static int syslog_print(char __user *buf, int size) while (size > 0) { size_t n; + size_t skip; raw_spin_lock_irq(&logbuf_lock); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_partial = 0; } if (syslog_seq == log_next_seq) { raw_spin_unlock_irq(&logbuf_lock); break; } + + skip = syslog_partial; msg = log_from_idx(syslog_idx); n = msg_print_text(msg, true, text, LOG_LINE_MAX); - if (n <= size) { + if (n - syslog_partial <= size) { + /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; + n -= syslog_partial; + syslog_partial = 0; + } else if (!len){ + /* partial read(), remember position */ + n = size; + syslog_partial += n; } else n = 0; raw_spin_unlock_irq(&logbuf_lock); @@ -913,17 +925,15 @@ static int syslog_print(char __user *buf, int size) if (!n) break; - len += n; - size -= n; - buf += n; - n = copy_to_user(buf - n, text, n); - - if (n) { - len -= n; + if (copy_to_user(buf, text + skip, n)) { if (!len) len = -EFAULT; break; } + + len += n; + size -= n; + buf += n; } kfree(text); @@ -1107,6 +1117,7 @@ int 
do_syslog(int type, char __user *buf, int len, bool from_file) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_partial = 0; } if (from_file) { /* @@ -1129,6 +1140,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) idx = log_next(idx); seq++; } + error -= syslog_partial; } raw_spin_unlock_irq(&logbuf_lock); break; -- cgit v1.2.3-59-g8ed1b From ce27e317ba22b359bde02216afab934dac3af095 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Jul 2012 10:38:06 -0700 Subject: cgroup: cgroup_rm_files() was calling simple_unlink() with the wrong inode While refactoring cgroup file removal path, 05ef1d7c4a "cgroup: introduce struct cfent" incorrectly changed the @dir argument of simple_unlink() to the inode of the file being deleted instead of that of the containing directory. The effect of this bug is minor - ctime and mtime of the parent weren't properly updated on file deletion. Fix it by using @cgrp->dentry->d_inode instead. Signed-off-by: Tejun Heo Reported-by: Al Viro Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dec62f5936ef..a56805aa0f1b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -959,7 +959,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) dget(d); d_delete(d); - simple_unlink(d->d_inode, d); + simple_unlink(cgrp->dentry->d_inode, d); list_del_init(&cfe->node); dput(d); -- cgit v1.2.3-59-g8ed1b From 5becfb1df5ac8e491338e64b1029685ccad4b39c Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 9 Jul 2012 12:15:42 -0700 Subject: kmsg: merge continuation records while printing In (the unlikely) case our continuation merge buffer is busy, we unfortunately can not merge further continuation printk()s into a single record and have to store them separately, which leads to split-up output of these lines when they are printed. Add some flags about newlines and prefix existence to these records and try to reconstruct the full line again, when the separated records are printed. 
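The heart of the change is a little flag bookkeeping: every record now remembers whether it carried its own prefix, whether it ended in a newline and whether it is a continuation fragment, and msg_print_text() consults both the record's flags and the previous record's flags to decide whether to emit a syslog prefix and a trailing newline. The decision is small enough to lift out; the standalone sketch below reuses the enum values and the two tests exactly as they appear in the patch that follows, while the driver loop is purely illustrative:

#include <stdbool.h>
#include <stdio.h>

enum log_flags {
        LOG_NOCONS  = 1,  /* already flushed, do not print to console */
        LOG_NEWLINE = 2,  /* text ended with a newline */
        LOG_PREFIX  = 4,  /* text started with a prefix */
        LOG_CONT    = 8,  /* text is a fragment of a continuation line */
};

/* Same prefix/newline decision as msg_print_text() after the patch. */
static void decide(unsigned int prev, unsigned int flags,
                   bool *prefix, bool *newline)
{
        *prefix = true;
        *newline = true;

        if ((prev & LOG_CONT) && !(flags & LOG_PREFIX))
                *prefix = false;

        if (flags & LOG_CONT) {
                if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
                        *prefix = false;
                if (!(flags & LOG_NEWLINE))
                        *newline = false;
        }
}

int main(void)
{
        /* three fragments of one logical line, stored as separate records */
        unsigned int records[] = { LOG_CONT, LOG_CONT, LOG_CONT | LOG_NEWLINE };
        unsigned int prev = 0;
        unsigned int i;

        for (i = 0; i < sizeof(records) / sizeof(records[0]); i++) {
                bool prefix, newline;

                decide(prev, records[i], &prefix, &newline);
                printf("record %u: prefix=%d newline=%d\n", i, prefix, newline);
                prev = records[i];
        }
        return 0;
}

Run over three LOG_CONT fragments this prints the prefix only for the first record and the newline only for the last, which is precisely how the separated records come out as one merged line again.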
Reported-By: Michael Neuling Cc: Dave Jones Cc: Linus Torvalds Tested-By: Michael Neuling Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 120 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 50c33411442d..177fa49357a5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -194,8 +194,10 @@ static int console_may_schedule; */ enum log_flags { - LOG_DEFAULT = 0, - LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NEWLINE = 2, /* text ended with a newline */ + LOG_PREFIX = 4, /* text started with a prefix */ + LOG_CONT = 8, /* text is a fragment of a continuation line */ }; struct log { @@ -217,6 +219,7 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; +static enum log_flags syslog_prev; static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ @@ -839,13 +842,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; + bool prefix = true; + bool newline = true; size_t len = 0; + if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) + prefix = false; + + if (msg->flags & LOG_CONT) { + if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) + prefix = false; + + if (!(msg->flags & LOG_NEWLINE)) + newline = false; + } + do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -863,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog, text_len + 1>= size - len) break; - len += print_prefix(msg, syslog, buf + len); + if (prefix) + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - buf[len++] = '\n'; + if (next || newline) + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - len += print_prefix(msg, syslog, NULL); - len += text_len + 1; + if (prefix) + len += print_prefix(msg, syslog, NULL); + len += text_len; + if (next || newline) + len++; } + prefix = true; text = next; } while (text); @@ -898,6 +920,7 @@ static int syslog_print(char __user *buf, int size) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_prev = 0; syslog_partial = 0; } if (syslog_seq == log_next_seq) { @@ -907,11 +930,12 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, text, LOG_LINE_MAX); + n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; + syslog_prev = msg->flags; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -954,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; + enum log_flags prev; if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ @@ -967,10 +992,11 @@ static int syslog_print_all(char __user *buf, int size, 
bool clear) */ seq = clear_seq; idx = clear_idx; + prev = 0; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len += msg_print_text(msg, true, NULL, 0); + len += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; } @@ -978,10 +1004,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; + prev = 0; while (len > size && seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); + len -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; } @@ -990,17 +1017,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear) next_seq = log_next_seq; len = 0; + prev = 0; while (len >= 0 && seq < next_seq) { struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); if (textlen < 0) { len = textlen; break; } idx = log_next(idx); seq++; + prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -1013,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; + prev = 0; } } } @@ -1117,6 +1147,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_prev = 0; syslog_partial = 0; } if (from_file) { @@ -1127,18 +1158,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) */ error = log_next_idx - syslog_idx; } else { - u64 seq; - u32 idx; + u64 seq = syslog_seq; + u32 idx = syslog_idx; + enum log_flags prev = syslog_prev; error = 0; - seq = syslog_seq; - idx = syslog_idx; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - error += msg_print_text(msg, true, NULL, 0); + error += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } error -= syslog_partial; } @@ -1408,10 +1439,9 @@ asmlinkage int vprintk_emit(int facility, int level, static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; + enum log_flags lflags = 0; unsigned long flags; int this_cpu; - bool newline = false; - bool prefix = false; int printed_len = 0; boot_delay_msec(); @@ -1450,7 +1480,7 @@ asmlinkage int vprintk_emit(int facility, int level, recursion_bug = 0; printed_len += strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, LOG_DEFAULT, 0, + log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, printed_len); } @@ -1463,7 +1493,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { text_len--; - newline = true; + lflags |= LOG_NEWLINE; } /* strip syslog prefix and extract log level or control flags */ @@ -1473,7 +1503,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == -1) level = text[1] - '0'; case 'd': /* KERN_DEFAULT */ - prefix = true; + lflags |= LOG_PREFIX; case 'c': /* KERN_CONT */ text += 3; text_len -= 3; @@ -1483,22 +1513,20 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == -1) level = default_message_loglevel; - if (dict) { - prefix = true; - newline = true; - } + if (dict) + lflags |= LOG_PREFIX|LOG_NEWLINE; - if (!newline) { + if (!(lflags & LOG_NEWLINE)) { /* * Flush the 
conflicting buffer. An earlier newline was missing, * or another task also prints continuation lines. */ - if (cont.len && (prefix || cont.owner != current)) + if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) cont_flush(); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, LOG_DEFAULT, 0, + log_store(facility, level, lflags | LOG_CONT, 0, dict, dictlen, text, text_len); } else { bool stored = false; @@ -1510,13 +1538,13 @@ asmlinkage int vprintk_emit(int facility, int level, * flush it out and store this line separately. */ if (cont.len && cont.owner == current) { - if (!prefix) + if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); cont_flush(); } if (!stored) - log_store(facility, level, LOG_DEFAULT, 0, + log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); } printed_len += text_len; @@ -1615,8 +1643,8 @@ static struct cont { static struct log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) { return 0; } +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ @@ -1892,6 +1920,7 @@ void wake_up_klogd(void) /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; +static enum log_flags console_prev; /** * console_unlock - unlock the console system @@ -1952,6 +1981,7 @@ again: /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; + console_prev = 0; } skip: if (console_seq == log_next_seq) @@ -1975,10 +2005,11 @@ skip: } level = msg->level; - len = msg_print_text(msg, false, text, sizeof(text)); - + len = msg_print_text(msg, console_prev, false, + text, sizeof(text)); console_idx = log_next(console_idx); console_seq++; + console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2241,6 +2272,7 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; + console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. 
Only do this to the @@ -2534,8 +2566,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, } msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, - line, size); + l = msg_print_text(msg, 0, syslog, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -2575,6 +2606,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 idx; u64 next_seq; u32 next_idx; + enum log_flags prev; size_t l = 0; bool ret = false; @@ -2597,23 +2629,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; + prev = 0; while (seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l += msg_print_text(msg, true, NULL, 0); + l += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; + prev = 0; while (l > size && seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l -= msg_print_text(msg, true, NULL, 0); + l -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } /* last message in next interation */ @@ -2621,14 +2657,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; + prev = 0; while (seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l += msg_print_text(msg, syslog, - buf + l, size - l); - + l += msg_print_text(msg, prev, syslog, buf + l, size - l); idx = log_next(idx); seq++; + prev = msg->flags; } dumper->next_seq = next_seq; -- cgit v1.2.3-59-g8ed1b From f55a6faa384304c89cfef162768e88374d3312cb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:19 -0400 Subject: hrtimer: Provide clock_was_set_delayed() clock_was_set() cannot be called from hard interrupt context because it calls on_each_cpu(). For fixing the widely reported leap seconds issue it is necessary to call it from hard interrupt context, i.e. the timer tick code, which does the timekeeping updates. Provide a new function which denotes it in the hrtimer cpu base structure of the cpu on which it is called and raise the hrtimer softirq. We then execute the clock_was_set() notificiation from softirq context in run_hrtimer_softirq(). The hrtimer softirq is rarely used, so polling the flag there is not a performance issue. [ tglx: Made it depend on CONFIG_HIGH_RES_TIMERS. We really should get rid of all this ifdeffery ASAP ] Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-2-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 9 ++++++++- kernel/hrtimer.c | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0dc30c9f15..c9ec9400ee5b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -165,6 +165,7 @@ enum hrtimer_base_type { * @lock: lock protecting the base and associated clock bases * and timers * @active_bases: Bitfield to mark bases with active timers + * @clock_was_set: Indicates that clock was set from irq context. 
* @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode @@ -177,7 +178,8 @@ enum hrtimer_base_type { */ struct hrtimer_cpu_base { raw_spinlock_t lock; - unsigned long active_bases; + unsigned int active_bases; + unsigned int clock_was_set; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; @@ -286,6 +288,8 @@ extern void hrtimer_peek_ahead_timers(void); # define MONOTONIC_RES_NSEC HIGH_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_HIGH_RES +extern void clock_was_set_delayed(void); + #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC @@ -306,6 +310,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return 0; } + +static inline void clock_was_set_delayed(void) { } + #endif extern void clock_was_set(void); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf51682b..3c24fb2c25c8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -717,6 +717,19 @@ static int hrtimer_switch_to_hres(void) return 1; } +/* + * Called from timekeeping code to reprogramm the hrtimer interrupt + * device. If called from the timer interrupt context we defer it to + * softirq context. + */ +void clock_was_set_delayed(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -1395,6 +1408,13 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + if (cpu_base->clock_was_set) { + cpu_base->clock_was_set = 0; + clock_was_set(); + } + hrtimer_peek_ahead_timers(); } -- cgit v1.2.3-59-g8ed1b From 4873fa070ae84a4115f0b3c9dfabc224f1bc7c51 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:20 -0400 Subject: timekeeping: Fix leapsecond triggered load spike issue The timekeeping code misses an update of the hrtimer subsystem after a leap second happened. Due to that timers based on CLOCK_REALTIME are either expiring a second early or late depending on whether a leap second has been inserted or deleted until an operation is initiated which causes that update. Unless the update happens by some other means this discrepancy between the timekeeping and the hrtimer data stays forever and timers are expired either early or late. The reported immediate workaround - $ data -s "`date`" - is causing a call to clock_was_set() which updates the hrtimer data structures. See: http://www.sheeri.com/content/mysql-and-leap-second-high-cpu-and-fix Add the missing clock_was_set() call to update_wall_time() in case of a leap second event. The actual update is deferred to softirq context as the necessary smp function call cannot be invoked from hard interrupt context. 
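The fix just described leans on the deferral pattern introduced by clock_was_set_delayed() above: the context that notices the event only records it, and a softer context performs the expensive notification. The following stand-alone C sketch illustrates that pattern only by analogy; a POSIX signal handler plays the role of the timer interrupt and the main loop plays the role of the softirq, and none of these names or calls come from the kernel patches themselves.

/*
 * Illustration of the defer-the-heavy-work pattern, not kernel code.
 * The "interrupt" (signal handler) merely sets a flag; the deferred
 * context notices the flag and does the costly work.
 */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t clock_update_pending;	/* like cpu_base->clock_was_set */

static void tick_handler(int sig)	/* "hard interrupt": do almost nothing */
{
	(void)sig;
	clock_update_pending = 1;	/* just note that an update is needed */
}

static void notify_clock_was_set(void)	/* "softirq": the expensive part */
{
	printf("re-evaluating timers after the clock changed\n");
}

int main(void)
{
	signal(SIGALRM, tick_handler);
	alarm(1);			/* stand-in for the next timer tick */

	for (;;) {
		pause();		/* wait until the "interrupt" fires */
		if (clock_update_pending) {
			clock_update_pending = 0;
			notify_clock_was_set();
			break;
		}
	}
	return 0;
}

The point carried over from the patches is that the handler never calls anything heavyweight; everything that may take locks or cross CPUs runs later, driven by the polled flag.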
Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-3-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00a1e8a..a413e5940e06 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -963,6 +963,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } /* Accumulate raw time */ @@ -1079,6 +1081,8 @@ static void update_wall_time(void) leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } timekeeping_update(false); -- cgit v1.2.3-59-g8ed1b From 5b9fe759a678e05be4937ddf03d50e950207c1c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:21 -0400 Subject: timekeeping: Maintain ktime_t based offsets for hrtimers We need to update the hrtimer clock offsets from the hrtimer interrupt context. To avoid conversions from timespec to ktime_t maintain a ktime_t based representation of those offsets in the timekeeper. This puts the conversion overhead into the code which updates the underlying offsets and provides fast accessible values in the hrtimer interrupt. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-4-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a413e5940e06..1c038dac71a2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -70,6 +70,12 @@ struct timekeeper { /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
*/ struct timespec raw_time; + /* Offset clock monotonic -> clock realtime */ + ktime_t offs_real; + + /* Offset clock monotonic -> clock boottime */ + ktime_t offs_boot; + /* Seqlock for all timekeeper values */ seqlock_t lock; }; @@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } +static void update_rt_offset(void) +{ + struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + + set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); + timekeeper.offs_real = timespec_to_ktime(tmp); +} + /* must hold write on timekeeper.lock */ static void timekeeping_update(bool clearntp) { @@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) timekeeper.ntp_error = 0; ntp_clear(); } + update_rt_offset(); update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -604,6 +619,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_rt_offset(); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -612,6 +628,12 @@ void __init timekeeping_init(void) /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; +static void update_sleep_time(struct timespec t) +{ + timekeeper.total_sleep_time = t; + timekeeper.offs_boot = timespec_to_ktime(t); +} + /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *delta); - timekeeper.total_sleep_time = timespec_add( - timekeeper.total_sleep_time, *delta); + update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); } -- cgit v1.2.3-59-g8ed1b From 196951e91262fccda81147d2bcf7fdab08668b40 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:23 -0400 Subject: hrtimers: Move lock held region in hrtimer_interrupt() We need to update the base offsets from this code and we need to do that under base->lock. Move the lock held region around the ktime_get() calls. The ktime_get() calls are going to be replaced with a function which gets the time and the offsets atomically. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-6-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 3c24fb2c25c8..8f320af837b5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1263,11 +1263,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; + raw_spin_lock(&cpu_base->lock); entry_time = now = ktime_get(); retry: expires_next.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1344,6 +1343,7 @@ retry: * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. 
*/ + raw_spin_lock(&cpu_base->lock); now = ktime_get(); cpu_base->nr_retries++; if (++retries < 3) @@ -1356,6 +1356,7 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; + raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); if (delta.tv64 > cpu_base->max_hang_time.tv64) cpu_base->max_hang_time = delta; -- cgit v1.2.3-59-g8ed1b From f6c06abfb3972ad4914cef57d8348fcb2932bc3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:24 -0400 Subject: timekeeping: Provide hrtimer update function To finally fix the infamous leap second issue and other race windows caused by functions which change the offsets between the various time bases (CLOCK_MONOTONIC, CLOCK_REALTIME and CLOCK_BOOTTIME) we need a function which atomically gets the current monotonic time and updates the offsets of CLOCK_REALTIME and CLOCK_BOOTTIME with minimalistic overhead. The previous patch which provides ktime_t offsets allows us to make this function almost as cheap as ktime_get() which is going to be replaced in hrtimer_interrupt(). Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-7-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 + kernel/time/timekeeping.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c9ec9400ee5b..cc07d2777bbe 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -327,6 +327,7 @@ extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); extern ktime_t ktime_get_boottime(void); extern ktime_t ktime_get_monotonic_offset(void); +extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1c038dac71a2..269b1fe5f2ae 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1271,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } while (read_seqretry(&timekeeper.lock, seq)); } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +{ + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqbegin(&timekeeper.lock); + + secs = timekeeper.xtime.tv_sec; + nsecs = timekeeper.xtime.tv_nsec; + nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + + *offs_real = timekeeper.offs_real; + *offs_boot = timekeeper.offs_boot; + } while (read_seqretry(&timekeeper.lock, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *offs_real); + return now; +} +#endif + /** * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format */ -- cgit v1.2.3-59-g8ed1b From 5baefd6d84163443215f4a99f6a20f054ef11236 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:25 -0400 Subject: hrtimer: Update hrtimer base offsets each hrtimer_interrupt The 
update of the hrtimer base offsets on all cpus cannot be made atomically from the timekeeper.lock held and interrupt disabled region as smp function calls are not allowed there. clock_was_set(), which enforces the update on all cpus, is called either from preemptible process context in case of do_settimeofday() or from the softirq context when the offset modification happened in the timer interrupt itself due to a leap second. In both cases there is a race window for an hrtimer interrupt between dropping timekeeper lock, enabling interrupts and clock_was_set() issuing the updates. Any interrupt which arrives in that window will see the new time but operate on stale offsets. So we need to make sure that an hrtimer interrupt always sees a consistent state of time and offsets. ktime_get_update_offsets() allows us to get the current monotonic time and update the per cpu hrtimer base offsets from hrtimer_interrupt() to capture a consistent state of monotonic time and the offsets. The function replaces the existing ktime_get() calls in hrtimer_interrupt(). The overhead of the new function vs. ktime_get() is minimal as it just adds two store operations. This ensures that any changes to realtime or boottime offsets are noticed and stored into the per-cpu hrtimer base structures, prior to any hrtimer expiration and guarantees that timers are not expired early. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-8-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8f320af837b5..6db7a5ed52b5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + + return ktime_get_update_offsets(offs_real, offs_boot); +} + /* * Retrigger next event is called after clock was set * @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); - struct timespec realtime_offset, xtim, wtm, sleep; if (!hrtimer_hres_active()) return; - /* Optimized out for !HIGH_RES */ - get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - + hrtimer_update_base(base); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); } @@ -710,7 +708,6 @@ static int hrtimer_switch_to_hres(void) base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); - /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); @@ -1264,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; raw_spin_lock(&cpu_base->lock); - entry_time = now = ktime_get(); + entry_time = 
now = hrtimer_update_base(cpu_base); retry: expires_next.tv64 = KTIME_MAX; /* @@ -1342,9 +1339,12 @@ retry: * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. */ raw_spin_lock(&cpu_base->lock); - now = ktime_get(); + now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; -- cgit v1.2.3-59-g8ed1b From 4229fb1dc6843c49a14bb098719f8a696cdc44f8 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 11 Jul 2012 14:02:11 -0700 Subject: c/r: prctl: less paranoid prctl_set_mm_exe_file() The "no other files mapped" requirement from my previous patch (c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal) is too paranoid; it forbids the operation even if only one shared-anon vma is mapped. Let's check whether the current mm->exe_file is already unmapped; in that case the exe_file symlink is already outdated and changing it is reasonable. Plus, this patch fixes the exit code when the operation succeeds. Signed-off-by: Konstantin Khlebnikov Reported-by: Cyrill Gorcunov Tested-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index e0c8ffc50d7f..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask) #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; @@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) down_write(&mm->mmap_sem); /* - * Forbid mm->exe_file change if there are mapped other files. + * Forbid mm->exe_file change if old file still mapped. */ err = -EBUSY; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_file && !path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_unlock; + if (mm->exe_file) { + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_file && + path_equal(&vma->vm_file->f_path, + &mm->exe_file->f_path)) + goto exit_unlock; } /* @@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) goto exit_unlock; + err = 0; set_mm_exe_file(mm, exe_file); exit_unlock: up_write(&mm->mmap_sem); -- cgit v1.2.3-59-g8ed1b From 93574fcc5b50cc7b8834698acb2ce947e5b6a5dc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Jul 2012 09:35:08 +0300 Subject: tracing: Check for allocation failure in __tracing_open() Clean up and return -ENOMEM if the kzalloc() fails. This also prevents a potential crash, as the pointer that failed to allocate would be later used.
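The cleanup described above is the common allocate, check, and unwind-on-failure style: every error path releases exactly what was set up before the failing step and hands the caller an unambiguous -ENOMEM. As a rough, self-contained userspace illustration of that style (plain C; the iter/buffer_iter names and the release label merely echo the patch and are not the tracing code):

/* Illustration only: goto-based unwinding when an allocation fails. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct iter {
	int *buffer_iter;
	char *trace;
};

static struct iter *iter_open(size_t ncpus)
{
	struct iter *iter;

	iter = calloc(1, sizeof(*iter));
	if (!iter)
		return NULL;

	iter->buffer_iter = calloc(ncpus, sizeof(*iter->buffer_iter));
	if (!iter->buffer_iter)
		goto release;			/* undo only what exists so far */

	iter->trace = malloc(64);
	if (!iter->trace)
		goto free_buffer;

	return iter;

free_buffer:
	free(iter->buffer_iter);
release:
	free(iter);
	errno = ENOMEM;				/* caller sees a clear failure */
	return NULL;
}

int main(void)
{
	struct iter *it = iter_open(4);

	if (!it) {
		perror("iter_open");
		return 1;
	}
	puts("opened");
	free(it->trace);
	free(it->buffer_iter);
	free(it);
	return 0;
}

The label order matters: later failures jump to earlier labels, so the frees run in reverse order of the allocations, and an early failure skips the frees for things that were never set up, which is what the added release label arranges in __tracing_open() as well.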
Link: http://lkml.kernel.org/r/20120711063507.GF11812@elgon.mountain Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 814ff306ae74..a120f98c4112 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2390,6 +2390,9 @@ __tracing_open(struct inode *inode, struct file *file) iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), GFP_KERNEL); + if (!iter->buffer_iter) + goto release; + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. @@ -2451,6 +2454,7 @@ __tracing_open(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); kfree(iter->trace); kfree(iter->buffer_iter); +release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3-59-g8ed1b From 974271c485a4d8bb801decc616748f90aafb07ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Jul 2012 14:46:37 -0700 Subject: workqueue: don't use WQ_HIGHPRI for unbound workqueues Unbound wqs aren't concurrency-managed and try to execute work items as soon as possible. This is currently achieved by implicitly setting %WQ_HIGHPRI on all unbound workqueues; however, WQ_HIGHPRI implementation is about to be restructured and this usage won't be valid anymore. Add an explicit chain-wakeup path for unbound workqueues in process_one_work() instead of piggy backing on %WQ_HIGHPRI. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a3128dc67df..27637c284cb9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -580,6 +580,10 @@ static bool __need_more_worker(struct global_cwq *gcwq) /* * Need to wake up a worker? Called from anything but currently * running workers. + * + * Note that, because unbound workers never contribute to nr_running, this + * function will always return %true for unbound gcwq as long as the + * worklist isn't empty. */ static bool need_more_worker(struct global_cwq *gcwq) { @@ -1867,6 +1871,13 @@ __acquires(&gcwq->lock) if (unlikely(cpu_intensive)) worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + /* + * Unbound gcwq isn't concurrency managed and work items should be + * executed ASAP. Wake up another worker if necessary. + */ + if ((worker->flags & WORKER_UNBOUND) && need_more_worker(gcwq)) + wake_up_worker(gcwq); + spin_unlock_irq(&gcwq->lock); work_clear_pending(work); @@ -2984,13 +2995,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) flags |= WQ_RESCUER; - /* - * Unbound workqueues aren't concurrency managed and should be - * dispatched to workers immediately. - */ - if (flags & WQ_UNBOUND) - flags |= WQ_HIGHPRI; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); -- cgit v1.2.3-59-g8ed1b From bd7bdd43dcb81bb08240b9401b36a104f77dc135 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Jul 2012 14:46:37 -0700 Subject: workqueue: factor out worker_pool from global_cwq Move worklist and all worker management fields from global_cwq into the new struct worker_pool. worker_pool points back to the containing gcwq. worker and cpu_workqueue_struct are updated to point to worker_pool instead of gcwq too. 
This change is mechanical and doesn't introduce any functional difference other than rearranging of fields and an added level of indirection in some places. This is to prepare for multiple pools per gcwq. v2: Comment typo fixes as suggested by Namhyung. Signed-off-by: Tejun Heo Cc: Namhyung Kim --- include/trace/events/workqueue.h | 2 +- kernel/workqueue.c | 216 +++++++++++++++++++++------------------ 2 files changed, 118 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 4018f5058f27..f28d1b65f178 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -54,7 +54,7 @@ TRACE_EVENT(workqueue_queue_work, __entry->function = work->func; __entry->workqueue = cwq->wq; __entry->req_cpu = req_cpu; - __entry->cpu = cwq->gcwq->cpu; + __entry->cpu = cwq->pool->gcwq->cpu; ), TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u", diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 27637c284cb9..61f154467026 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -115,6 +115,7 @@ enum { */ struct global_cwq; +struct worker_pool; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -131,7 +132,7 @@ struct worker { struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ @@ -139,6 +140,21 @@ struct worker { struct work_struct rebind_work; /* L: rebind worker to cpu */ }; +struct worker_pool { + struct global_cwq *gcwq; /* I: the owning gcwq */ + + struct list_head worklist; /* L: list of pending works */ + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + struct list_head idle_list; /* X: list of idle workers */ + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for workers */ + + struct ida worker_ida; /* L: for worker IDs */ + struct worker *first_idle; /* L: first idle worker */ +}; + /* * Global per-cpu workqueue. 
There's one and only one for each cpu * and all works are queued and processed here regardless of their @@ -146,27 +162,18 @@ struct worker { */ struct global_cwq { spinlock_t lock; /* the gcwq lock */ - struct list_head worklist; /* L: list of pending works */ unsigned int cpu; /* I: the associated cpu */ unsigned int flags; /* L: GCWQ_* flags */ - int nr_workers; /* L: total number of workers */ - int nr_idle; /* L: currently idle ones */ - - /* workers are chained either in the idle_list or busy_hash */ - struct list_head idle_list; /* X: list of idle workers */ + /* workers are chained either in busy_hash or pool idle_list */ struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct timer_list idle_timer; /* L: worker idle timeout */ - struct timer_list mayday_timer; /* L: SOS timer for dworkers */ - - struct ida worker_ida; /* L: for worker IDs */ + struct worker_pool pool; /* the worker pools */ struct task_struct *trustee; /* L: for gcwq shutdown */ unsigned int trustee_state; /* L: trustee state */ wait_queue_head_t trustee_wait; /* trustee wait */ - struct worker *first_idle; /* L: first idle worker */ } ____cacheline_aligned_in_smp; /* @@ -175,7 +182,7 @@ struct global_cwq { * aligned at two's power of the number of flag bits. */ struct cpu_workqueue_struct { - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ @@ -555,7 +562,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) if (data & WORK_STRUCT_CWQ) return ((struct cpu_workqueue_struct *) - (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; cpu = data >> WORK_STRUCT_FLAG_BITS; if (cpu == WORK_CPU_NONE) @@ -587,13 +594,13 @@ static bool __need_more_worker(struct global_cwq *gcwq) */ static bool need_more_worker(struct global_cwq *gcwq) { - return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); + return !list_empty(&gcwq->pool.worklist) && __need_more_worker(gcwq); } /* Can I start working? Called from busy but !running workers. */ static bool may_start_working(struct global_cwq *gcwq) { - return gcwq->nr_idle; + return gcwq->pool.nr_idle; } /* Do I need to keep working? Called from currently running workers. */ @@ -601,7 +608,7 @@ static bool keep_working(struct global_cwq *gcwq) { atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); - return !list_empty(&gcwq->worklist) && + return !list_empty(&gcwq->pool.worklist) && (atomic_read(nr_running) <= 1 || gcwq->flags & GCWQ_HIGHPRI_PENDING); } @@ -622,8 +629,8 @@ static bool need_to_manage_workers(struct global_cwq *gcwq) static bool too_many_workers(struct global_cwq *gcwq) { bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; - int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ - int nr_busy = gcwq->nr_workers - nr_idle; + int nr_idle = gcwq->pool.nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->pool.nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -635,10 +642,10 @@ static bool too_many_workers(struct global_cwq *gcwq) /* Return the first worker. 
Safe with preemption disabled */ static struct worker *first_worker(struct global_cwq *gcwq) { - if (unlikely(list_empty(&gcwq->idle_list))) + if (unlikely(list_empty(&gcwq->pool.idle_list))) return NULL; - return list_first_entry(&gcwq->idle_list, struct worker, entry); + return list_first_entry(&gcwq->pool.idle_list, struct worker, entry); } /** @@ -696,7 +703,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; atomic_t *nr_running = get_gcwq_nr_running(cpu); if (worker->flags & WORKER_NOT_RUNNING) @@ -716,7 +724,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * could be manipulating idle_list, so dereferencing idle_list * without gcwq lock is safe. */ - if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) to_wakeup = first_worker(gcwq); return to_wakeup ? to_wakeup->task : NULL; } @@ -737,7 +745,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; WARN_ON_ONCE(worker->task != current); @@ -752,7 +761,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, if (wakeup) { if (atomic_dec_and_test(nr_running) && - !list_empty(&gcwq->worklist)) + !list_empty(&pool->worklist)) wake_up_worker(gcwq); } else atomic_dec(nr_running); @@ -773,7 +782,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; unsigned int oflags = worker->flags; WARN_ON_ONCE(worker->task != current); @@ -894,9 +903,9 @@ static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, struct work_struct *twork; if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &gcwq->worklist; + return &gcwq->pool.worklist; - list_for_each_entry(twork, &gcwq->worklist, entry) { + list_for_each_entry(twork, &gcwq->pool.worklist, entry) { struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); if (!(tcwq->wq->flags & WQ_HIGHPRI)) @@ -924,7 +933,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct global_cwq *gcwq = cwq->gcwq; + struct global_cwq *gcwq = cwq->pool->gcwq; /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); @@ -1196,7 +1205,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); */ static void worker_enter_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; BUG_ON(worker->flags & WORKER_IDLE); BUG_ON(!list_empty(&worker->entry) && @@ -1204,15 +1214,15 @@ static void worker_enter_idle(struct worker *worker) /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; - gcwq->nr_idle++; + pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ - list_add(&worker->entry, &gcwq->idle_list); + list_add(&worker->entry, &pool->idle_list); if (likely(!(worker->flags 
& WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) - mod_timer(&gcwq->idle_timer, + if (too_many_workers(gcwq) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); } else wake_up_all(&gcwq->trustee_wait); @@ -1223,7 +1233,7 @@ static void worker_enter_idle(struct worker *worker) * warning may trigger spuriously. Check iff trustee is idle. */ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && - gcwq->nr_workers == gcwq->nr_idle && + pool->nr_workers == pool->nr_idle && atomic_read(get_gcwq_nr_running(gcwq->cpu))); } @@ -1238,11 +1248,11 @@ static void worker_enter_idle(struct worker *worker) */ static void worker_leave_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; BUG_ON(!(worker->flags & WORKER_IDLE)); worker_clr_flags(worker, WORKER_IDLE); - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); } @@ -1279,7 +1289,7 @@ static void worker_leave_idle(struct worker *worker) static bool worker_maybe_bind_and_lock(struct worker *worker) __acquires(&gcwq->lock) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; struct task_struct *task = worker->task; while (true) { @@ -1321,7 +1331,7 @@ __acquires(&gcwq->lock) static void worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_REBIND); @@ -1362,13 +1372,14 @@ static struct worker *alloc_worker(void) static struct worker *create_worker(struct global_cwq *gcwq, bool bind) { bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct worker_pool *pool = &gcwq->pool; struct worker *worker = NULL; int id = -1; spin_lock_irq(&gcwq->lock); - while (ida_get_new(&gcwq->worker_ida, &id)) { + while (ida_get_new(&pool->worker_ida, &id)) { spin_unlock_irq(&gcwq->lock); - if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) goto fail; spin_lock_irq(&gcwq->lock); } @@ -1378,7 +1389,7 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) if (!worker) goto fail; - worker->gcwq = gcwq; + worker->pool = pool; worker->id = id; if (!on_unbound_cpu) @@ -1409,7 +1420,7 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) fail: if (id >= 0) { spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); spin_unlock_irq(&gcwq->lock); } kfree(worker); @@ -1428,7 +1439,7 @@ fail: static void start_worker(struct worker *worker) { worker->flags |= WORKER_STARTED; - worker->gcwq->nr_workers++; + worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); } @@ -1444,7 +1455,8 @@ static void start_worker(struct worker *worker) */ static void destroy_worker(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; int id = worker->id; /* sanity check frenzy */ @@ -1452,9 +1464,9 @@ static void destroy_worker(struct worker *worker) BUG_ON(!list_empty(&worker->scheduled)); if (worker->flags & WORKER_STARTED) - gcwq->nr_workers--; + pool->nr_workers--; if (worker->flags & WORKER_IDLE) - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1465,7 +1477,7 @@ static 
void destroy_worker(struct worker *worker) kfree(worker); spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); } static void idle_worker_timeout(unsigned long __gcwq) @@ -1479,11 +1491,12 @@ static void idle_worker_timeout(unsigned long __gcwq) unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(gcwq->pool.idle_list.prev, struct worker, + entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&gcwq->pool.idle_timer, expires); else { /* it's been idle for too long, wake up manager */ gcwq->flags |= GCWQ_MANAGE_WORKERS; @@ -1504,7 +1517,7 @@ static bool send_mayday(struct work_struct *work) return false; /* mayday mayday mayday */ - cpu = cwq->gcwq->cpu; + cpu = cwq->pool->gcwq->cpu; /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ if (cpu == WORK_CPU_UNBOUND) cpu = 0; @@ -1527,13 +1540,13 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * allocation deadlock. Send distress signals to * rescuers. */ - list_for_each_entry(work, &gcwq->worklist, entry) + list_for_each_entry(work, &gcwq->pool.worklist, entry) send_mayday(work); } spin_unlock_irq(&gcwq->lock); - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); + mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INTERVAL); } /** @@ -1568,14 +1581,14 @@ restart: spin_unlock_irq(&gcwq->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { struct worker *worker; worker = create_worker(gcwq, true); if (worker) { - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&gcwq->pool.mayday_timer); spin_lock_irq(&gcwq->lock); start_worker(worker); BUG_ON(need_to_create_worker(gcwq)); @@ -1592,7 +1605,7 @@ restart: break; } - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&gcwq->pool.mayday_timer); spin_lock_irq(&gcwq->lock); if (need_to_create_worker(gcwq)) goto restart; @@ -1622,11 +1635,12 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) struct worker *worker; unsigned long expires; - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(gcwq->pool.idle_list.prev, struct worker, + entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&gcwq->pool.idle_timer, expires); break; } @@ -1659,7 +1673,7 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) */ static bool manage_workers(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; bool ret = false; if (gcwq->flags & GCWQ_MANAGING_WORKERS) @@ -1732,7 +1746,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + struct list_head *pos = gcwq_determine_ins_pos(cwq->pool->gcwq, cwq); trace_workqueue_activate_work(work); move_linked_works(work, pos, NULL); @@ -1808,7 +1822,8 @@ __releases(&gcwq->lock) __acquires(&gcwq->lock) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool 
*pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; struct hlist_head *bwh = busy_worker_head(gcwq, work); bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; work_func_t f = work->func; @@ -1854,10 +1869,10 @@ __acquires(&gcwq->lock) * wake up another worker; otherwise, clear HIGHPRI_PENDING. */ if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { - struct work_struct *nwork = list_first_entry(&gcwq->worklist, - struct work_struct, entry); + struct work_struct *nwork = list_first_entry(&pool->worklist, + struct work_struct, entry); - if (!list_empty(&gcwq->worklist) && + if (!list_empty(&pool->worklist) && get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) wake_up_worker(gcwq); else @@ -1950,7 +1965,8 @@ static void process_scheduled_works(struct worker *worker) static int worker_thread(void *__worker) { struct worker *worker = __worker; - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; /* tell the scheduler that this is a workqueue worker */ worker->task->flags |= PF_WQ_WORKER; @@ -1990,7 +2006,7 @@ recheck: do { struct work_struct *work = - list_first_entry(&gcwq->worklist, + list_first_entry(&pool->worklist, struct work_struct, entry); if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { @@ -2064,14 +2080,15 @@ repeat: for_each_mayday_cpu(cpu, wq->mayday_mask) { unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = cwq->pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); mayday_clear_cpu(cpu, wq->mayday_mask); /* migrate to the target cpu if possible */ - rescuer->gcwq = gcwq; + rescuer->pool = pool; worker_maybe_bind_and_lock(rescuer); /* @@ -2079,7 +2096,7 @@ repeat: * process'em. 
*/ BUG_ON(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_cwq(work) == cwq) move_linked_works(work, scheduled, &n); @@ -2216,7 +2233,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct global_cwq *gcwq = cwq->pool->gcwq; spin_lock_irq(&gcwq->lock); @@ -2432,9 +2449,9 @@ reflush: struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); bool drained; - spin_lock_irq(&cwq->gcwq->lock); + spin_lock_irq(&cwq->pool->gcwq->lock); drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->gcwq->lock); + spin_unlock_irq(&cwq->pool->gcwq->lock); if (drained) continue; @@ -2474,7 +2491,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, */ smp_rmb(); cwq = get_work_cwq(work); - if (unlikely(!cwq || gcwq != cwq->gcwq)) + if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; } else if (wait_executing) { worker = find_worker_executing_work(gcwq, work); @@ -3017,7 +3034,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct global_cwq *gcwq = get_gcwq(cpu); BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->gcwq = gcwq; + cwq->pool = &gcwq->pool; cwq->wq = wq; cwq->flush_color = -1; cwq->max_active = max_active; @@ -3344,7 +3361,7 @@ static int __cpuinit trustee_thread(void *__gcwq) gcwq->flags |= GCWQ_MANAGING_WORKERS; - list_for_each_entry(worker, &gcwq->idle_list, entry) + list_for_each_entry(worker, &gcwq->pool.idle_list, entry) worker->flags |= WORKER_ROGUE; for_each_busy_worker(worker, i, pos, gcwq) @@ -3369,7 +3386,7 @@ static int __cpuinit trustee_thread(void *__gcwq) atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->idle_timer); + del_timer_sync(&gcwq->pool.idle_timer); spin_lock_irq(&gcwq->lock); /* @@ -3391,17 +3408,17 @@ static int __cpuinit trustee_thread(void *__gcwq) * may be frozen works in freezable cwqs. Don't declare * completion while frozen. */ - while (gcwq->nr_workers != gcwq->nr_idle || + while (gcwq->pool.nr_workers != gcwq->pool.nr_idle || gcwq->flags & GCWQ_FREEZING || gcwq->trustee_state == TRUSTEE_IN_CHARGE) { int nr_works = 0; - list_for_each_entry(work, &gcwq->worklist, entry) { + list_for_each_entry(work, &gcwq->pool.worklist, entry) { send_mayday(work); nr_works++; } - list_for_each_entry(worker, &gcwq->idle_list, entry) { + list_for_each_entry(worker, &gcwq->pool.idle_list, entry) { if (!nr_works--) break; wake_up_process(worker->task); @@ -3428,11 +3445,11 @@ static int __cpuinit trustee_thread(void *__gcwq) * all workers till we're canceled. */ do { - rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); - while (!list_empty(&gcwq->idle_list)) - destroy_worker(list_first_entry(&gcwq->idle_list, + rc = trustee_wait_event(!list_empty(&gcwq->pool.idle_list)); + while (!list_empty(&gcwq->pool.idle_list)) + destroy_worker(list_first_entry(&gcwq->pool.idle_list, struct worker, entry)); - } while (gcwq->nr_workers && rc >= 0); + } while (gcwq->pool.nr_workers && rc >= 0); /* * At this point, either draining has completed and no worker @@ -3441,7 +3458,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * Tell the remaining busy ones to rebind once it finishes the * currently scheduled works by scheduling the rebind_work. 
*/ - WARN_ON(!list_empty(&gcwq->idle_list)); + WARN_ON(!list_empty(&gcwq->pool.idle_list)); for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; @@ -3522,7 +3539,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, kthread_bind(new_trustee, cpu); /* fall through */ case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); + BUG_ON(gcwq->pool.first_idle); new_worker = create_worker(gcwq, false); if (!new_worker) { if (new_trustee) @@ -3544,8 +3561,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); /* fall through */ case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - gcwq->first_idle = new_worker; + BUG_ON(gcwq->pool.first_idle); + gcwq->pool.first_idle = new_worker; break; case CPU_DYING: @@ -3562,8 +3579,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, gcwq->trustee_state = TRUSTEE_BUTCHER; /* fall through */ case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + destroy_worker(gcwq->pool.first_idle); + gcwq->pool.first_idle = NULL; break; case CPU_DOWN_FAILED: @@ -3581,11 +3598,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, * take a look. */ spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); + kthread_bind(gcwq->pool.first_idle->task, cpu); spin_lock_irq(&gcwq->lock); gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + start_worker(gcwq->pool.first_idle); + gcwq->pool.first_idle = NULL; break; } @@ -3794,22 +3811,23 @@ static int __init init_workqueues(void) struct global_cwq *gcwq = get_gcwq(cpu); spin_lock_init(&gcwq->lock); - INIT_LIST_HEAD(&gcwq->worklist); + gcwq->pool.gcwq = gcwq; + INIT_LIST_HEAD(&gcwq->pool.worklist); gcwq->cpu = cpu; gcwq->flags |= GCWQ_DISASSOCIATED; - INIT_LIST_HEAD(&gcwq->idle_list); + INIT_LIST_HEAD(&gcwq->pool.idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - init_timer_deferrable(&gcwq->idle_timer); - gcwq->idle_timer.function = idle_worker_timeout; - gcwq->idle_timer.data = (unsigned long)gcwq; + init_timer_deferrable(&gcwq->pool.idle_timer); + gcwq->pool.idle_timer.function = idle_worker_timeout; + gcwq->pool.idle_timer.data = (unsigned long)gcwq; - setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout, (unsigned long)gcwq); - ida_init(&gcwq->worker_ida); + ida_init(&gcwq->pool.worker_ida); gcwq->trustee_state = TRUSTEE_DONE; init_waitqueue_head(&gcwq->trustee_wait); -- cgit v1.2.3-59-g8ed1b From 63d95a9150ee3bbd4117fcd609dee40313b454d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Jul 2012 14:46:37 -0700 Subject: workqueue: use @pool instead of @gcwq or @cpu where applicable Modify all functions which deal with per-pool properties to pass around @pool instead of @gcwq or @cpu. The changes in this patch are mechanical and don't caues any functional difference. This is to prepare for multiple pools per gcwq. 
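To see the shape these two workqueue patches leave behind, here is a stripped-down, self-contained C sketch, an approximation rather than a copy of the kernel code: the per-pool counters live in their own struct, the pool is embedded in the owning gcwq and keeps a back pointer to it, and helpers take @pool instead of @gcwq or a cpu number. The idle ratio of 4 stands in for MAX_IDLE_WORKERS_RATIO and the managing adjustment is left out.

/*
 * Illustration only: per-pool state in its own struct, embedded in the
 * owning gcwq, with helpers operating on the pool pointer.
 */
#include <stdio.h>

struct global_cwq;

struct worker_pool {
	struct global_cwq *gcwq;	/* back pointer to the owning gcwq */
	int nr_workers;
	int nr_idle;
};

struct global_cwq {
	unsigned int cpu;
	unsigned int flags;
	struct worker_pool pool;	/* embedded; one pool per gcwq for now */
};

/* Helpers touch only per-pool state... */
static int too_many_workers(const struct worker_pool *pool)
{
	int nr_busy = pool->nr_workers - pool->nr_idle;

	return pool->nr_idle > 2 && (pool->nr_idle - 2) * 4 >= nr_busy;
}

/* ...and reach the owner through the back pointer when they must. */
static void report(const struct worker_pool *pool)
{
	printf("cpu%u: %d workers, %d idle, too many: %s\n",
	       pool->gcwq->cpu, pool->nr_workers, pool->nr_idle,
	       too_many_workers(pool) ? "yes" : "no");
}

int main(void)
{
	struct global_cwq gcwq = { .cpu = 0, .flags = 0 };

	gcwq.pool.gcwq = &gcwq;		/* wire up the back pointer */
	gcwq.pool.nr_workers = 8;
	gcwq.pool.nr_idle = 6;

	report(&gcwq.pool);		/* callers pass @pool, not @gcwq */
	return 0;
}

Because the helpers never look at anything but the pool they are handed, a gcwq can later carry more than one pool without touching them, which is the follow-up these patches prepare for.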
Signed-off-by: Tejun Heo --- kernel/workqueue.c | 218 +++++++++++++++++++++++++++-------------------------- 1 file changed, 111 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 61f154467026..2d82f7b193a0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -471,8 +471,10 @@ static struct global_cwq *get_gcwq(unsigned int cpu) return &unbound_global_cwq; } -static atomic_t *get_gcwq_nr_running(unsigned int cpu) +static atomic_t *get_pool_nr_running(struct worker_pool *pool) { + int cpu = pool->gcwq->cpu; + if (cpu != WORK_CPU_UNBOUND) return &per_cpu(gcwq_nr_running, cpu); else @@ -578,10 +580,10 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) * assume that they're being called with gcwq->lock held. */ -static bool __need_more_worker(struct global_cwq *gcwq) +static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || - gcwq->flags & GCWQ_HIGHPRI_PENDING; + return !atomic_read(get_pool_nr_running(pool)) || + pool->gcwq->flags & GCWQ_HIGHPRI_PENDING; } /* @@ -592,45 +594,46 @@ static bool __need_more_worker(struct global_cwq *gcwq) * function will always return %true for unbound gcwq as long as the * worklist isn't empty. */ -static bool need_more_worker(struct global_cwq *gcwq) +static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&gcwq->pool.worklist) && __need_more_worker(gcwq); + return !list_empty(&pool->worklist) && __need_more_worker(pool); } /* Can I start working? Called from busy but !running workers. */ -static bool may_start_working(struct global_cwq *gcwq) +static bool may_start_working(struct worker_pool *pool) { - return gcwq->pool.nr_idle; + return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ -static bool keep_working(struct global_cwq *gcwq) +static bool keep_working(struct worker_pool *pool) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); - return !list_empty(&gcwq->pool.worklist) && + return !list_empty(&pool->worklist) && (atomic_read(nr_running) <= 1 || - gcwq->flags & GCWQ_HIGHPRI_PENDING); + pool->gcwq->flags & GCWQ_HIGHPRI_PENDING); } /* Do we need a new worker? Called from manager. */ -static bool need_to_create_worker(struct global_cwq *gcwq) +static bool need_to_create_worker(struct worker_pool *pool) { - return need_more_worker(gcwq) && !may_start_working(gcwq); + return need_more_worker(pool) && !may_start_working(pool); } /* Do I need to be the manager? */ -static bool need_to_manage_workers(struct global_cwq *gcwq) +static bool need_to_manage_workers(struct worker_pool *pool) { - return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; + return need_to_create_worker(pool) || + pool->gcwq->flags & GCWQ_MANAGE_WORKERS; } /* Do we have too many workers and should some go away? 
*/ -static bool too_many_workers(struct global_cwq *gcwq) +static bool too_many_workers(struct worker_pool *pool) { - bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; - int nr_idle = gcwq->pool.nr_idle + managing; /* manager is considered idle */ - int nr_busy = gcwq->pool.nr_workers - nr_idle; + bool managing = pool->gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ + int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -640,26 +643,26 @@ static bool too_many_workers(struct global_cwq *gcwq) */ /* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct global_cwq *gcwq) +static struct worker *first_worker(struct worker_pool *pool) { - if (unlikely(list_empty(&gcwq->pool.idle_list))) + if (unlikely(list_empty(&pool->idle_list))) return NULL; - return list_first_entry(&gcwq->pool.idle_list, struct worker, entry); + return list_first_entry(&pool->idle_list, struct worker, entry); } /** * wake_up_worker - wake up an idle worker - * @gcwq: gcwq to wake worker for + * @pool: worker pool to wake worker from * - * Wake up the first idle worker of @gcwq. + * Wake up the first idle worker of @pool. * * CONTEXT: * spin_lock_irq(gcwq->lock). */ -static void wake_up_worker(struct global_cwq *gcwq) +static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(gcwq); + struct worker *worker = first_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -681,7 +684,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) struct worker *worker = kthread_data(task); if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(cpu)); + atomic_inc(get_pool_nr_running(worker->pool)); } /** @@ -704,8 +707,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; - atomic_t *nr_running = get_gcwq_nr_running(cpu); + atomic_t *nr_running = get_pool_nr_running(pool); if (worker->flags & WORKER_NOT_RUNNING) return NULL; @@ -725,7 +727,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * without gcwq lock is safe. */ if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) - to_wakeup = first_worker(gcwq); + to_wakeup = first_worker(pool); return to_wakeup ? 
to_wakeup->task : NULL; } @@ -746,7 +748,6 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; WARN_ON_ONCE(worker->task != current); @@ -757,12 +758,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); if (wakeup) { if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) - wake_up_worker(gcwq); + wake_up_worker(pool); } else atomic_dec(nr_running); } @@ -782,7 +783,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { - struct global_cwq *gcwq = worker->pool->gcwq; + struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; WARN_ON_ONCE(worker->task != current); @@ -796,7 +797,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(gcwq->cpu)); + atomic_inc(get_pool_nr_running(pool)); } /** @@ -880,15 +881,15 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, } /** - * gcwq_determine_ins_pos - find insertion position - * @gcwq: gcwq of interest + * pool_determine_ins_pos - find insertion position + * @pool: pool of interest * @cwq: cwq a work is being queued for * - * A work for @cwq is about to be queued on @gcwq, determine insertion + * A work for @cwq is about to be queued on @pool, determine insertion * position for the work. If @cwq is for HIGHPRI wq, the work is * queued at the head of the queue but in FIFO order with respect to * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @pool that * there are HIGHPRI works pending. * * CONTEXT: @@ -897,22 +898,22 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, * RETURNS: * Pointer to inserstion position. 
*/ -static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, +static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool, struct cpu_workqueue_struct *cwq) { struct work_struct *twork; if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &gcwq->pool.worklist; + return &pool->worklist; - list_for_each_entry(twork, &gcwq->pool.worklist, entry) { + list_for_each_entry(twork, &pool->worklist, entry) { struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); if (!(tcwq->wq->flags & WQ_HIGHPRI)) break; } - gcwq->flags |= GCWQ_HIGHPRI_PENDING; + pool->gcwq->flags |= GCWQ_HIGHPRI_PENDING; return &twork->entry; } @@ -933,7 +934,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct global_cwq *gcwq = cwq->pool->gcwq; + struct worker_pool *pool = cwq->pool; /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); @@ -953,8 +954,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq, */ smp_mb(); - if (__need_more_worker(gcwq)) - wake_up_worker(gcwq); + if (__need_more_worker(pool)) + wake_up_worker(pool); } /* @@ -1056,7 +1057,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (likely(cwq->nr_active < cwq->max_active)) { trace_workqueue_activate_work(work); cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); + worklist = pool_determine_ins_pos(cwq->pool, cwq); } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; @@ -1221,7 +1222,7 @@ static void worker_enter_idle(struct worker *worker) list_add(&worker->entry, &pool->idle_list); if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&pool->idle_timer)) + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); } else @@ -1234,7 +1235,7 @@ static void worker_enter_idle(struct worker *worker) */ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && pool->nr_workers == pool->nr_idle && - atomic_read(get_gcwq_nr_running(gcwq->cpu))); + atomic_read(get_pool_nr_running(pool))); } /** @@ -1356,10 +1357,10 @@ static struct worker *alloc_worker(void) /** * create_worker - create a new workqueue worker - * @gcwq: gcwq the new worker will belong to + * @pool: pool the new worker will belong to * @bind: whether to set affinity to @cpu or not * - * Create a new worker which is bound to @gcwq. The returned worker + * Create a new worker which is bound to @pool. The returned worker * can be started by calling start_worker() or destroyed using * destroy_worker(). * @@ -1369,10 +1370,10 @@ static struct worker *alloc_worker(void) * RETURNS: * Pointer to the newly created worker. 
*/ -static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +static struct worker *create_worker(struct worker_pool *pool, bool bind) { + struct global_cwq *gcwq = pool->gcwq; bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; - struct worker_pool *pool = &gcwq->pool; struct worker *worker = NULL; int id = -1; @@ -1480,27 +1481,27 @@ static void destroy_worker(struct worker *worker) ida_remove(&pool->worker_ida, id); } -static void idle_worker_timeout(unsigned long __gcwq) +static void idle_worker_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; spin_lock_irq(&gcwq->lock); - if (too_many_workers(gcwq)) { + if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(gcwq->pool.idle_list.prev, struct worker, - entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) - mod_timer(&gcwq->pool.idle_timer, expires); + mod_timer(&pool->idle_timer, expires); else { /* it's been idle for too long, wake up manager */ gcwq->flags |= GCWQ_MANAGE_WORKERS; - wake_up_worker(gcwq); + wake_up_worker(pool); } } @@ -1526,37 +1527,38 @@ static bool send_mayday(struct work_struct *work) return true; } -static void gcwq_mayday_timeout(unsigned long __gcwq) +static void gcwq_mayday_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work; spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) { + if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. */ - list_for_each_entry(work, &gcwq->pool.worklist, entry) + list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } spin_unlock_irq(&gcwq->lock); - mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INTERVAL); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary - * @gcwq: gcwq to create a new worker for + * @pool: pool to create a new worker for * - * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is - * sent to all rescuers with works scheduled on @gcwq to resolve + * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be false and @@ -1571,52 +1573,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * false if no action was taken and gcwq->lock stayed locked, true * otherwise. 
*/ -static bool maybe_create_worker(struct global_cwq *gcwq) +static bool maybe_create_worker(struct worker_pool *pool) __releases(&gcwq->lock) __acquires(&gcwq->lock) { - if (!need_to_create_worker(gcwq)) + struct global_cwq *gcwq = pool->gcwq; + + if (!need_to_create_worker(pool)) return false; restart: spin_unlock_irq(&gcwq->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ - mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { struct worker *worker; - worker = create_worker(gcwq, true); + worker = create_worker(pool, true); if (worker) { - del_timer_sync(&gcwq->pool.mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); start_worker(worker); - BUG_ON(need_to_create_worker(gcwq)); + BUG_ON(need_to_create_worker(pool)); return true; } - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; __set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(CREATE_COOLDOWN); - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; } - del_timer_sync(&gcwq->pool.mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) + if (need_to_create_worker(pool)) goto restart; return true; } /** * maybe_destroy_worker - destroy workers which have been idle for a while - * @gcwq: gcwq to destroy workers for + * @pool: pool to destroy workers for * - * Destroy @gcwq workers which have been idle for longer than + * Destroy @pool workers which have been idle for longer than * IDLE_WORKER_TIMEOUT. * * LOCKING: @@ -1627,20 +1631,19 @@ restart: * false if no action was taken and gcwq->lock stayed locked, true * otherwise. */ -static bool maybe_destroy_workers(struct global_cwq *gcwq) +static bool maybe_destroy_workers(struct worker_pool *pool) { bool ret = false; - while (too_many_workers(gcwq)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; - worker = list_entry(gcwq->pool.idle_list.prev, struct worker, - entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { - mod_timer(&gcwq->pool.idle_timer, expires); + mod_timer(&pool->idle_timer, expires); break; } @@ -1673,7 +1676,8 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) */ static bool manage_workers(struct worker *worker) { - struct global_cwq *gcwq = worker->pool->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; bool ret = false; if (gcwq->flags & GCWQ_MANAGING_WORKERS) @@ -1686,8 +1690,8 @@ static bool manage_workers(struct worker *worker) * Destroy and then create so that may_start_working() is true * on return. 
*/ - ret |= maybe_destroy_workers(gcwq); - ret |= maybe_create_worker(gcwq); + ret |= maybe_destroy_workers(pool); + ret |= maybe_create_worker(pool); gcwq->flags &= ~GCWQ_MANAGING_WORKERS; @@ -1746,7 +1750,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = gcwq_determine_ins_pos(cwq->pool->gcwq, cwq); + struct list_head *pos = pool_determine_ins_pos(cwq->pool, cwq); trace_workqueue_activate_work(work); move_linked_works(work, pos, NULL); @@ -1874,7 +1878,7 @@ __acquires(&gcwq->lock) if (!list_empty(&pool->worklist) && get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) - wake_up_worker(gcwq); + wake_up_worker(pool); else gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; } @@ -1890,8 +1894,8 @@ __acquires(&gcwq->lock) * Unbound gcwq isn't concurrency managed and work items should be * executed ASAP. Wake up another worker if necessary. */ - if ((worker->flags & WORKER_UNBOUND) && need_more_worker(gcwq)) - wake_up_worker(gcwq); + if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); @@ -1983,11 +1987,11 @@ woke_up: worker_leave_idle(worker); recheck: /* no more worker necessary? */ - if (!need_more_worker(gcwq)) + if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ - if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* @@ -2018,11 +2022,11 @@ recheck: move_linked_works(work, &worker->scheduled, NULL); process_scheduled_works(worker); } - } while (keep_working(gcwq)); + } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) goto recheck; /* @@ -2107,8 +2111,8 @@ repeat: * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ - if (keep_working(gcwq)) - wake_up_worker(gcwq); + if (keep_working(pool)) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -3383,7 +3387,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * keep_working() are always true as long as the worklist is * not empty. 
*/ - atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + atomic_set(get_pool_nr_running(&gcwq->pool), 0); spin_unlock_irq(&gcwq->lock); del_timer_sync(&gcwq->pool.idle_timer); @@ -3424,9 +3428,9 @@ static int __cpuinit trustee_thread(void *__gcwq) wake_up_process(worker->task); } - if (need_to_create_worker(gcwq)) { + if (need_to_create_worker(&gcwq->pool)) { spin_unlock_irq(&gcwq->lock); - worker = create_worker(gcwq, false); + worker = create_worker(&gcwq->pool, false); spin_lock_irq(&gcwq->lock); if (worker) { worker->flags |= WORKER_ROGUE; @@ -3540,7 +3544,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, /* fall through */ case CPU_UP_PREPARE: BUG_ON(gcwq->pool.first_idle); - new_worker = create_worker(gcwq, false); + new_worker = create_worker(&gcwq->pool, false); if (!new_worker) { if (new_trustee) kthread_stop(new_trustee); @@ -3788,7 +3792,7 @@ void thaw_workqueues(void) cwq_activate_first_delayed(cwq); } - wake_up_worker(gcwq); + wake_up_worker(&gcwq->pool); spin_unlock_irq(&gcwq->lock); } @@ -3822,10 +3826,10 @@ static int __init init_workqueues(void) init_timer_deferrable(&gcwq->pool.idle_timer); gcwq->pool.idle_timer.function = idle_worker_timeout; - gcwq->pool.idle_timer.data = (unsigned long)gcwq; + gcwq->pool.idle_timer.data = (unsigned long)&gcwq->pool; setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout, - (unsigned long)gcwq); + (unsigned long)&gcwq->pool); ida_init(&gcwq->pool.worker_ida); @@ -3840,7 +3844,7 @@ static int __init init_workqueues(void) if (cpu != WORK_CPU_UNBOUND) gcwq->flags &= ~GCWQ_DISASSOCIATED; - worker = create_worker(gcwq, true); + worker = create_worker(&gcwq->pool, true); BUG_ON(!worker); spin_lock_irq(&gcwq->lock); start_worker(worker); -- cgit v1.2.3-59-g8ed1b From 11ebea50dbc1ade5994b2c838a096078d4c02399 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 Jul 2012 14:46:37 -0700 Subject: workqueue: separate out worker_pool flags GCWQ_MANAGE_WORKERS, GCWQ_MANAGING_WORKERS and GCWQ_HIGHPRI_PENDING are per-pool properties. Add worker_pool->flags and make the above three flags per-pool flags. The changes in this patch are mechanical and don't caues any functional difference. This is to prepare for multiple pools per gcwq. 
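Not part of the patch, only an editor's illustration: the refactoring pattern here is moving per-member state out of a shared container's flag word and into the member itself. A minimal, self-contained userspace C sketch of that pattern, reusing the POOL_* names the patch introduces but with otherwise hypothetical stand-in structs:

#include <stdio.h>

/* per-pool flags, as introduced by this patch */
enum {
        POOL_MANAGE_WORKERS   = 1 << 0,  /* need to manage workers */
        POOL_MANAGING_WORKERS = 1 << 1,  /* managing workers */
        POOL_HIGHPRI_PENDING  = 1 << 2,  /* highpri works on queue */
};

struct pool {                   /* stand-in for struct worker_pool */
        unsigned int flags;     /* per-pool, no longer kept in the gcwq */
};

struct gcwq {                   /* stand-in for struct global_cwq */
        unsigned int flags;     /* keeps only gcwq-wide bits (DISASSOCIATED, FREEZING) */
        struct pool pools[2];   /* the multiple pools later patches enable */
};

static int pool_needs_manager(const struct pool *pool)
{
        /* before this patch the test was gcwq->flags & GCWQ_MANAGE_WORKERS */
        return !!(pool->flags & POOL_MANAGE_WORKERS);
}

int main(void)
{
        struct gcwq g = { 0 };
        int i;

        g.pools[1].flags |= POOL_MANAGE_WORKERS;   /* only one pool needs a manager */

        for (i = 0; i < 2; i++)
                printf("pool %d needs manager: %d\n", i, pool_needs_manager(&g.pools[i]));
        return 0;
}

Once a gcwq hosts more than one pool, each pool can be managed against its own flag word instead of a single gcwq-wide one, which is what the later multi-pool patches in this series rely on.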
Signed-off-by: Tejun Heo --- kernel/workqueue.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2d82f7b193a0..7a98bae635fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -46,11 +46,13 @@ enum { /* global_cwq flags */ - GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ - GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - GCWQ_FREEZING = 1 << 3, /* freeze in progress */ - GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 1, /* freeze in progress */ + + /* pool flags */ + POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ + POOL_HIGHPRI_PENDING = 1 << 2, /* highpri works on queue */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -142,6 +144,7 @@ struct worker { struct worker_pool { struct global_cwq *gcwq; /* I: the owning gcwq */ + unsigned int flags; /* X: flags */ struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -583,7 +586,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) static bool __need_more_worker(struct worker_pool *pool) { return !atomic_read(get_pool_nr_running(pool)) || - pool->gcwq->flags & GCWQ_HIGHPRI_PENDING; + (pool->flags & POOL_HIGHPRI_PENDING); } /* @@ -612,7 +615,7 @@ static bool keep_working(struct worker_pool *pool) return !list_empty(&pool->worklist) && (atomic_read(nr_running) <= 1 || - pool->gcwq->flags & GCWQ_HIGHPRI_PENDING); + (pool->flags & POOL_HIGHPRI_PENDING)); } /* Do we need a new worker? Called from manager. */ @@ -625,13 +628,13 @@ static bool need_to_create_worker(struct worker_pool *pool) static bool need_to_manage_workers(struct worker_pool *pool) { return need_to_create_worker(pool) || - pool->gcwq->flags & GCWQ_MANAGE_WORKERS; + (pool->flags & POOL_MANAGE_WORKERS); } /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = pool->gcwq->flags & GCWQ_MANAGING_WORKERS; + bool managing = pool->flags & POOL_MANAGING_WORKERS; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -889,7 +892,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, * position for the work. If @cwq is for HIGHPRI wq, the work is * queued at the head of the queue but in FIFO order with respect to * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets GCWQ_HIGHPRI_PENDING flag to hint @pool that + * function also sets POOL_HIGHPRI_PENDING flag to hint @pool that * there are HIGHPRI works pending. 
* * CONTEXT: @@ -913,7 +916,7 @@ static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool, break; } - pool->gcwq->flags |= GCWQ_HIGHPRI_PENDING; + pool->flags |= POOL_HIGHPRI_PENDING; return &twork->entry; } @@ -1500,7 +1503,7 @@ static void idle_worker_timeout(unsigned long __pool) mod_timer(&pool->idle_timer, expires); else { /* it's been idle for too long, wake up manager */ - gcwq->flags |= GCWQ_MANAGE_WORKERS; + pool->flags |= POOL_MANAGE_WORKERS; wake_up_worker(pool); } } @@ -1680,11 +1683,11 @@ static bool manage_workers(struct worker *worker) struct global_cwq *gcwq = pool->gcwq; bool ret = false; - if (gcwq->flags & GCWQ_MANAGING_WORKERS) + if (pool->flags & POOL_MANAGING_WORKERS) return ret; - gcwq->flags &= ~GCWQ_MANAGE_WORKERS; - gcwq->flags |= GCWQ_MANAGING_WORKERS; + pool->flags &= ~POOL_MANAGE_WORKERS; + pool->flags |= POOL_MANAGING_WORKERS; /* * Destroy and then create so that may_start_working() is true @@ -1693,7 +1696,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + pool->flags &= ~POOL_MANAGING_WORKERS; /* * The trustee might be waiting to take over the manager @@ -1872,7 +1875,7 @@ __acquires(&gcwq->lock) * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, * wake up another worker; otherwise, clear HIGHPRI_PENDING. */ - if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { + if (unlikely(pool->flags & POOL_HIGHPRI_PENDING)) { struct work_struct *nwork = list_first_entry(&pool->worklist, struct work_struct, entry); @@ -1880,7 +1883,7 @@ __acquires(&gcwq->lock) get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) wake_up_worker(pool); else - gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; + pool->flags &= ~POOL_HIGHPRI_PENDING; } /* @@ -3360,10 +3363,10 @@ static int __cpuinit trustee_thread(void *__gcwq) * cancelled. */ BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + rc = trustee_wait_event(!(gcwq->pool.flags & POOL_MANAGING_WORKERS)); BUG_ON(rc < 0); - gcwq->flags |= GCWQ_MANAGING_WORKERS; + gcwq->pool.flags |= POOL_MANAGING_WORKERS; list_for_each_entry(worker, &gcwq->pool.idle_list, entry) worker->flags |= WORKER_ROGUE; @@ -3487,7 +3490,7 @@ static int __cpuinit trustee_thread(void *__gcwq) } /* relinquish manager role */ - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + gcwq->pool.flags &= ~POOL_MANAGING_WORKERS; /* notify completion */ gcwq->trustee = NULL; @@ -3604,7 +3607,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, spin_unlock_irq(&gcwq->lock); kthread_bind(gcwq->pool.first_idle->task, cpu); spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; + gcwq->pool.flags |= POOL_MANAGE_WORKERS; start_worker(gcwq->pool.first_idle); gcwq->pool.first_idle = NULL; break; -- cgit v1.2.3-59-g8ed1b From 4ce62e9e30cacc26885cab133ad1de358dd79f21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jul 2012 22:16:44 -0700 Subject: workqueue: introduce NR_WORKER_POOLS and for_each_worker_pool() Introduce NR_WORKER_POOLS and for_each_worker_pool() and convert code paths which need to manipulate all pools in a gcwq to use them. NR_WORKER_POOLS is currently one and for_each_worker_pool() iterates over only @gcwq->pool. Note that nr_running is per-pool property and converted to an array with NR_WORKER_POOLS elements and renamed to pool_nr_running. Note that get_pool_nr_running() currently assumes 0 index. The next patch will make use of non-zero index. 
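Not part of the patch, only an editor's illustration: the two mechanisms the series leans on are an iterator over the pools embedded in a gcwq and deriving a pool's index by pointer subtraction, which is how the follow-up highpri patch selects the matching pool_nr_running slot. A self-contained userspace C analogue, with hypothetical stand-in structs and the array form of the macro that the later patch switches to:

#include <stdio.h>

#define NR_WORKER_POOLS 2       /* 1 in this patch, bumped to 2 by the highpri patch */

struct pool {                   /* stand-in for struct worker_pool */
        int nr_running;         /* stand-in for the per-cpu pool_nr_running slot */
};

struct gcwq {                   /* stand-in for struct global_cwq */
        struct pool pools[NR_WORKER_POOLS];
};

/* array form of for_each_worker_pool(): walk every pool of a gcwq */
#define for_each_worker_pool(pool, gcwq)                                \
        for ((pool) = &(gcwq)->pools[0];                                \
             (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)

/* a pool's index inside its gcwq falls out of pointer subtraction */
static int pool_idx(const struct gcwq *gcwq, const struct pool *pool)
{
        return pool - gcwq->pools;      /* 0 = normal, 1 = highpri */
}

int main(void)
{
        struct gcwq g = { .pools = { { .nr_running = 3 }, { .nr_running = 1 } } };
        struct pool *pool;

        for_each_worker_pool(pool, &g)
                printf("pool %d: nr_running = %d\n", pool_idx(&g, pool), pool->nr_running);
        return 0;
}

In this patch itself NR_WORKER_POOLS is still 1 and for_each_worker_pool() walks the lone gcwq->pool via a NULL-terminated loop; the array form above is what it becomes once pools[] grows a second element.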
The changes in this patch are mechanical and don't caues any functional difference. This is to prepare for multiple pools per gcwq. v2: nr_running indexing bug in get_pool_nr_running() fixed. v3: Pointer to array is stupid. Don't use it in get_pool_nr_running() as suggested by Linus. Signed-off-by: Tejun Heo Cc: Tony Luck Cc: Fengguang Wu Cc: Linus Torvalds --- kernel/workqueue.c | 223 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 153 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7a98bae635fa..b0daaea44eaa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -74,6 +74,8 @@ enum { TRUSTEE_RELEASE = 3, /* release workers */ TRUSTEE_DONE = 4, /* trustee is done */ + NR_WORKER_POOLS = 1, /* # worker pools per gcwq */ + BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, @@ -274,6 +276,9 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include +#define for_each_worker_pool(pool, gcwq) \ + for ((pool) = &(gcwq)->pool; (pool); (pool) = NULL) + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) @@ -454,7 +459,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ * try_to_wake_up(). Put it in a separate cacheline. */ static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); /* * Global cpu workqueue and nr_running counter for unbound gcwq. The @@ -462,7 +467,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); * workers have WORKER_UNBOUND set. */ static struct global_cwq unbound_global_cwq; -static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ +static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { + [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ +}; static int worker_thread(void *__worker); @@ -477,11 +484,12 @@ static struct global_cwq *get_gcwq(unsigned int cpu) static atomic_t *get_pool_nr_running(struct worker_pool *pool) { int cpu = pool->gcwq->cpu; + int idx = 0; if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(gcwq_nr_running, cpu); + return &per_cpu(pool_nr_running, cpu)[idx]; else - return &unbound_gcwq_nr_running; + return &unbound_pool_nr_running[idx]; } static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, @@ -3345,9 +3353,30 @@ EXPORT_SYMBOL_GPL(work_busy); __ret1 < 0 ? -1 : 0; \ }) +static bool gcwq_is_managing_workers(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + + for_each_worker_pool(pool, gcwq) + if (pool->flags & POOL_MANAGING_WORKERS) + return true; + return false; +} + +static bool gcwq_has_idle_workers(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + + for_each_worker_pool(pool, gcwq) + if (!list_empty(&pool->idle_list)) + return true; + return false; +} + static int __cpuinit trustee_thread(void *__gcwq) { struct global_cwq *gcwq = __gcwq; + struct worker_pool *pool; struct worker *worker; struct work_struct *work; struct hlist_node *pos; @@ -3363,13 +3392,15 @@ static int __cpuinit trustee_thread(void *__gcwq) * cancelled. 
*/ BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->pool.flags & POOL_MANAGING_WORKERS)); + rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq)); BUG_ON(rc < 0); - gcwq->pool.flags |= POOL_MANAGING_WORKERS; + for_each_worker_pool(pool, gcwq) { + pool->flags |= POOL_MANAGING_WORKERS; - list_for_each_entry(worker, &gcwq->pool.idle_list, entry) - worker->flags |= WORKER_ROGUE; + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags |= WORKER_ROGUE; + } for_each_busy_worker(worker, i, pos, gcwq) worker->flags |= WORKER_ROGUE; @@ -3390,10 +3421,12 @@ static int __cpuinit trustee_thread(void *__gcwq) * keep_working() are always true as long as the worklist is * not empty. */ - atomic_set(get_pool_nr_running(&gcwq->pool), 0); + for_each_worker_pool(pool, gcwq) + atomic_set(get_pool_nr_running(pool), 0); spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->pool.idle_timer); + for_each_worker_pool(pool, gcwq) + del_timer_sync(&pool->idle_timer); spin_lock_irq(&gcwq->lock); /* @@ -3415,29 +3448,38 @@ static int __cpuinit trustee_thread(void *__gcwq) * may be frozen works in freezable cwqs. Don't declare * completion while frozen. */ - while (gcwq->pool.nr_workers != gcwq->pool.nr_idle || - gcwq->flags & GCWQ_FREEZING || - gcwq->trustee_state == TRUSTEE_IN_CHARGE) { - int nr_works = 0; + while (true) { + bool busy = false; - list_for_each_entry(work, &gcwq->pool.worklist, entry) { - send_mayday(work); - nr_works++; - } + for_each_worker_pool(pool, gcwq) + busy |= pool->nr_workers != pool->nr_idle; - list_for_each_entry(worker, &gcwq->pool.idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } + if (!busy && !(gcwq->flags & GCWQ_FREEZING) && + gcwq->trustee_state != TRUSTEE_IN_CHARGE) + break; - if (need_to_create_worker(&gcwq->pool)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(&gcwq->pool, false); - spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_ROGUE; - start_worker(worker); + for_each_worker_pool(pool, gcwq) { + int nr_works = 0; + + list_for_each_entry(work, &pool->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &pool->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(pool)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(pool, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; + start_worker(worker); + } } } @@ -3452,11 +3494,18 @@ static int __cpuinit trustee_thread(void *__gcwq) * all workers till we're canceled. */ do { - rc = trustee_wait_event(!list_empty(&gcwq->pool.idle_list)); - while (!list_empty(&gcwq->pool.idle_list)) - destroy_worker(list_first_entry(&gcwq->pool.idle_list, - struct worker, entry)); - } while (gcwq->pool.nr_workers && rc >= 0); + rc = trustee_wait_event(gcwq_has_idle_workers(gcwq)); + + i = 0; + for_each_worker_pool(pool, gcwq) { + while (!list_empty(&pool->idle_list)) { + worker = list_first_entry(&pool->idle_list, + struct worker, entry); + destroy_worker(worker); + } + i |= pool->nr_workers; + } + } while (i && rc >= 0); /* * At this point, either draining has completed and no worker @@ -3465,7 +3514,8 @@ static int __cpuinit trustee_thread(void *__gcwq) * Tell the remaining busy ones to rebind once it finishes the * currently scheduled works by scheduling the rebind_work. 
*/ - WARN_ON(!list_empty(&gcwq->pool.idle_list)); + for_each_worker_pool(pool, gcwq) + WARN_ON(!list_empty(&pool->idle_list)); for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; @@ -3490,7 +3540,8 @@ static int __cpuinit trustee_thread(void *__gcwq) } /* relinquish manager role */ - gcwq->pool.flags &= ~POOL_MANAGING_WORKERS; + for_each_worker_pool(pool, gcwq) + pool->flags &= ~POOL_MANAGING_WORKERS; /* notify completion */ gcwq->trustee = NULL; @@ -3532,8 +3583,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); struct task_struct *new_trustee = NULL; - struct worker *uninitialized_var(new_worker); + struct worker *new_workers[NR_WORKER_POOLS] = { }; + struct worker_pool *pool; unsigned long flags; + int i; action &= ~CPU_TASKS_FROZEN; @@ -3546,12 +3599,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, kthread_bind(new_trustee, cpu); /* fall through */ case CPU_UP_PREPARE: - BUG_ON(gcwq->pool.first_idle); - new_worker = create_worker(&gcwq->pool, false); - if (!new_worker) { - if (new_trustee) - kthread_stop(new_trustee); - return NOTIFY_BAD; + i = 0; + for_each_worker_pool(pool, gcwq) { + BUG_ON(pool->first_idle); + new_workers[i] = create_worker(pool, false); + if (!new_workers[i++]) + goto err_destroy; } } @@ -3568,8 +3621,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); /* fall through */ case CPU_UP_PREPARE: - BUG_ON(gcwq->pool.first_idle); - gcwq->pool.first_idle = new_worker; + i = 0; + for_each_worker_pool(pool, gcwq) { + BUG_ON(pool->first_idle); + pool->first_idle = new_workers[i++]; + } break; case CPU_DYING: @@ -3586,8 +3642,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, gcwq->trustee_state = TRUSTEE_BUTCHER; /* fall through */ case CPU_UP_CANCELED: - destroy_worker(gcwq->pool.first_idle); - gcwq->pool.first_idle = NULL; + for_each_worker_pool(pool, gcwq) { + destroy_worker(pool->first_idle); + pool->first_idle = NULL; + } break; case CPU_DOWN_FAILED: @@ -3604,18 +3662,32 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, * Put the first_idle in and request a real manager to * take a look. 
*/ - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->pool.first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->pool.flags |= POOL_MANAGE_WORKERS; - start_worker(gcwq->pool.first_idle); - gcwq->pool.first_idle = NULL; + for_each_worker_pool(pool, gcwq) { + spin_unlock_irq(&gcwq->lock); + kthread_bind(pool->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + pool->flags |= POOL_MANAGE_WORKERS; + start_worker(pool->first_idle); + pool->first_idle = NULL; + } break; } spin_unlock_irqrestore(&gcwq->lock, flags); return notifier_from_errno(0); + +err_destroy: + if (new_trustee) + kthread_stop(new_trustee); + + spin_lock_irqsave(&gcwq->lock, flags); + for (i = 0; i < NR_WORKER_POOLS; i++) + if (new_workers[i]) + destroy_worker(new_workers[i]); + spin_unlock_irqrestore(&gcwq->lock, flags); + + return NOTIFY_BAD; } #ifdef CONFIG_SMP @@ -3774,6 +3846,7 @@ void thaw_workqueues(void) for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; struct workqueue_struct *wq; spin_lock_irq(&gcwq->lock); @@ -3795,7 +3868,8 @@ void thaw_workqueues(void) cwq_activate_first_delayed(cwq); } - wake_up_worker(&gcwq->pool); + for_each_worker_pool(pool, gcwq) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -3816,25 +3890,29 @@ static int __init init_workqueues(void) /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; spin_lock_init(&gcwq->lock); - gcwq->pool.gcwq = gcwq; - INIT_LIST_HEAD(&gcwq->pool.worklist); gcwq->cpu = cpu; gcwq->flags |= GCWQ_DISASSOCIATED; - INIT_LIST_HEAD(&gcwq->pool.idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - init_timer_deferrable(&gcwq->pool.idle_timer); - gcwq->pool.idle_timer.function = idle_worker_timeout; - gcwq->pool.idle_timer.data = (unsigned long)&gcwq->pool; + for_each_worker_pool(pool, gcwq) { + pool->gcwq = gcwq; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); - setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout, - (unsigned long)&gcwq->pool); + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; - ida_init(&gcwq->pool.worker_ida); + setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, + (unsigned long)pool); + + ida_init(&pool->worker_ida); + } gcwq->trustee_state = TRUSTEE_DONE; init_waitqueue_head(&gcwq->trustee_wait); @@ -3843,15 +3921,20 @@ static int __init init_workqueues(void) /* create the initial worker */ for_each_online_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; + struct worker_pool *pool; if (cpu != WORK_CPU_UNBOUND) gcwq->flags &= ~GCWQ_DISASSOCIATED; - worker = create_worker(&gcwq->pool, true); - BUG_ON(!worker); - spin_lock_irq(&gcwq->lock); - start_worker(worker); - spin_unlock_irq(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) { + struct worker *worker; + + worker = create_worker(pool, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } } system_wq = alloc_workqueue("events", 0, 0); -- cgit v1.2.3-59-g8ed1b From 3270476a6c0ce322354df8679652f060d66526dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jul 2012 22:16:45 -0700 Subject: workqueue: reimplement WQ_HIGHPRI using a separate worker_pool WQ_HIGHPRI was implemented by queueing highpri work items at the head of the global worklist. 
Other than queueing at the head, they weren't handled differently; unfortunately, this could lead to execution latency of a few seconds on heavily loaded systems. Now that workqueue code has been updated to deal with multiple worker_pools per global_cwq, this patch reimplements WQ_HIGHPRI using a separate worker_pool. NR_WORKER_POOLS is bumped to two and gcwq->pools[0] is used for normal pri work items and ->pools[1] for highpri. Highpri workers get -20 nice level and has 'H' suffix in their names. Note that this change increases the number of kworkers per cpu. POOL_HIGHPRI_PENDING, pool_determine_ins_pos() and highpri chain wakeup code in process_one_work() are no longer used and removed. This allows proper prioritization of highpri work items and removes high execution latency of highpri work items. v2: nr_running indexing bug in get_pool_nr_running() fixed. v3: Refreshed for the get_pool_nr_running() update in the previous patch. Signed-off-by: Tejun Heo Reported-by: Josh Hunt LKML-Reference: Cc: Tony Luck Cc: Fengguang Wu --- Documentation/workqueue.txt | 103 ++++++++++++++++---------------------------- kernel/workqueue.c | 100 ++++++++++++------------------------------ 2 files changed, 65 insertions(+), 138 deletions(-) (limited to 'kernel') diff --git a/Documentation/workqueue.txt b/Documentation/workqueue.txt index a0b577de918f..a6ab4b62d926 100644 --- a/Documentation/workqueue.txt +++ b/Documentation/workqueue.txt @@ -89,25 +89,28 @@ called thread-pools. The cmwq design differentiates between the user-facing workqueues that subsystems and drivers queue work items on and the backend mechanism -which manages thread-pool and processes the queued work items. +which manages thread-pools and processes the queued work items. The backend is called gcwq. There is one gcwq for each possible CPU -and one gcwq to serve work items queued on unbound workqueues. +and one gcwq to serve work items queued on unbound workqueues. Each +gcwq has two thread-pools - one for normal work items and the other +for high priority ones. Subsystems and drivers can create and queue work items through special workqueue API functions as they see fit. They can influence some aspects of the way the work items are executed by setting flags on the workqueue they are putting the work item on. These flags include -things like CPU locality, reentrancy, concurrency limits and more. To -get a detailed overview refer to the API description of +things like CPU locality, reentrancy, concurrency limits, priority and +more. To get a detailed overview refer to the API description of alloc_workqueue() below. -When a work item is queued to a workqueue, the target gcwq is -determined according to the queue parameters and workqueue attributes -and appended on the shared worklist of the gcwq. For example, unless -specifically overridden, a work item of a bound workqueue will be -queued on the worklist of exactly that gcwq that is associated to the -CPU the issuer is running on. +When a work item is queued to a workqueue, the target gcwq and +thread-pool is determined according to the queue parameters and +workqueue attributes and appended on the shared worklist of the +thread-pool. For example, unless specifically overridden, a work item +of a bound workqueue will be queued on the worklist of either normal +or highpri thread-pool of the gcwq that is associated to the CPU the +issuer is running on. For any worker pool implementation, managing the concurrency level (how many execution contexts are active) is an important issue. 
cmwq @@ -115,26 +118,26 @@ tries to keep the concurrency at a minimal but sufficient level. Minimal to save resources and sufficient in that the system is used at its full capacity. -Each gcwq bound to an actual CPU implements concurrency management by -hooking into the scheduler. The gcwq is notified whenever an active -worker wakes up or sleeps and keeps track of the number of the -currently runnable workers. Generally, work items are not expected to -hog a CPU and consume many cycles. That means maintaining just enough -concurrency to prevent work processing from stalling should be -optimal. As long as there are one or more runnable workers on the -CPU, the gcwq doesn't start execution of a new work, but, when the -last running worker goes to sleep, it immediately schedules a new -worker so that the CPU doesn't sit idle while there are pending work -items. This allows using a minimal number of workers without losing -execution bandwidth. +Each thread-pool bound to an actual CPU implements concurrency +management by hooking into the scheduler. The thread-pool is notified +whenever an active worker wakes up or sleeps and keeps track of the +number of the currently runnable workers. Generally, work items are +not expected to hog a CPU and consume many cycles. That means +maintaining just enough concurrency to prevent work processing from +stalling should be optimal. As long as there are one or more runnable +workers on the CPU, the thread-pool doesn't start execution of a new +work, but, when the last running worker goes to sleep, it immediately +schedules a new worker so that the CPU doesn't sit idle while there +are pending work items. This allows using a minimal number of workers +without losing execution bandwidth. Keeping idle workers around doesn't cost other than the memory space for kthreads, so cmwq holds onto idle ones for a while before killing them. For an unbound wq, the above concurrency management doesn't apply and -the gcwq for the pseudo unbound CPU tries to start executing all work -items as soon as possible. The responsibility of regulating +the thread-pools for the pseudo unbound CPU try to start executing all +work items as soon as possible. The responsibility of regulating concurrency level is on the users. There is also a flag to mark a bound wq to ignore the concurrency management. Please refer to the API section for details. @@ -205,31 +208,22 @@ resources, scheduled and executed. WQ_HIGHPRI - Work items of a highpri wq are queued at the head of the - worklist of the target gcwq and start execution regardless of - the current concurrency level. In other words, highpri work - items will always start execution as soon as execution - resource is available. + Work items of a highpri wq are queued to the highpri + thread-pool of the target gcwq. Highpri thread-pools are + served by worker threads with elevated nice level. - Ordering among highpri work items is preserved - a highpri - work item queued after another highpri work item will start - execution after the earlier highpri work item starts. - - Although highpri work items are not held back by other - runnable work items, they still contribute to the concurrency - level. Highpri work items in runnable state will prevent - non-highpri work items from starting execution. - - This flag is meaningless for unbound wq. + Note that normal and highpri thread-pools don't interact with + each other. Each maintain its separate pool of workers and + implements concurrency management among its workers. 
WQ_CPU_INTENSIVE Work items of a CPU intensive wq do not contribute to the concurrency level. In other words, runnable CPU intensive - work items will not prevent other work items from starting - execution. This is useful for bound work items which are - expected to hog CPU cycles so that their execution is - regulated by the system scheduler. + work items will not prevent other work items in the same + thread-pool from starting execution. This is useful for bound + work items which are expected to hog CPU cycles so that their + execution is regulated by the system scheduler. Although CPU intensive work items don't contribute to the concurrency level, start of their executions is still @@ -239,14 +233,6 @@ resources, scheduled and executed. This flag is meaningless for unbound wq. - WQ_HIGHPRI | WQ_CPU_INTENSIVE - - This combination makes the wq avoid interaction with - concurrency management completely and behave as a simple - per-CPU execution context provider. Work items queued on a - highpri CPU-intensive wq start execution as soon as resources - are available and don't affect execution of other work items. - @max_active: @max_active determines the maximum number of execution contexts per @@ -328,20 +314,7 @@ If @max_active == 2, 35 w2 wakes up and finishes Now, let's assume w1 and w2 are queued to a different wq q1 which has -WQ_HIGHPRI set, - - TIME IN MSECS EVENT - 0 w1 and w2 start and burn CPU - 5 w1 sleeps - 10 w2 sleeps - 10 w0 starts and burns CPU - 15 w0 sleeps - 15 w1 wakes up and finishes - 20 w2 wakes up and finishes - 25 w0 wakes up and burns CPU - 30 w0 finishes - -If q1 has WQ_CPU_INTENSIVE set, +WQ_CPU_INTENSIVE set, TIME IN MSECS EVENT 0 w0 starts and burns CPU diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b0daaea44eaa..4fa9e3552f1e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -52,7 +52,6 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ - POOL_HIGHPRI_PENDING = 1 << 2, /* highpri works on queue */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -74,7 +73,7 @@ enum { TRUSTEE_RELEASE = 3, /* release workers */ TRUSTEE_DONE = 4, /* trustee is done */ - NR_WORKER_POOLS = 1, /* # worker pools per gcwq */ + NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, @@ -95,6 +94,7 @@ enum { * all cpus. Give -20. 
*/ RESCUER_NICE_LEVEL = -20, + HIGHPRI_NICE_LEVEL = -20, }; /* @@ -174,7 +174,7 @@ struct global_cwq { struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct worker_pool pool; /* the worker pools */ + struct worker_pool pools[2]; /* normal and highpri pools */ struct task_struct *trustee; /* L: for gcwq shutdown */ unsigned int trustee_state; /* L: trustee state */ @@ -277,7 +277,8 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #include #define for_each_worker_pool(pool, gcwq) \ - for ((pool) = &(gcwq)->pool; (pool); (pool) = NULL) + for ((pool) = &(gcwq)->pools[0]; \ + (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ @@ -473,6 +474,11 @@ static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { static int worker_thread(void *__worker); +static int worker_pool_pri(struct worker_pool *pool) +{ + return pool - pool->gcwq->pools; +} + static struct global_cwq *get_gcwq(unsigned int cpu) { if (cpu != WORK_CPU_UNBOUND) @@ -484,7 +490,7 @@ static struct global_cwq *get_gcwq(unsigned int cpu) static atomic_t *get_pool_nr_running(struct worker_pool *pool) { int cpu = pool->gcwq->cpu; - int idx = 0; + int idx = worker_pool_pri(pool); if (cpu != WORK_CPU_UNBOUND) return &per_cpu(pool_nr_running, cpu)[idx]; @@ -586,15 +592,14 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) } /* - * Policy functions. These define the policies on how the global - * worker pool is managed. Unless noted otherwise, these functions - * assume that they're being called with gcwq->lock held. + * Policy functions. These define the policies on how the global worker + * pools are managed. Unless noted otherwise, these functions assume that + * they're being called with gcwq->lock held. */ static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_pool_nr_running(pool)) || - (pool->flags & POOL_HIGHPRI_PENDING); + return !atomic_read(get_pool_nr_running(pool)); } /* @@ -621,9 +626,7 @@ static bool keep_working(struct worker_pool *pool) { atomic_t *nr_running = get_pool_nr_running(pool); - return !list_empty(&pool->worklist) && - (atomic_read(nr_running) <= 1 || - (pool->flags & POOL_HIGHPRI_PENDING)); + return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; } /* Do we need a new worker? Called from manager. */ @@ -891,43 +894,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, work); } -/** - * pool_determine_ins_pos - find insertion position - * @pool: pool of interest - * @cwq: cwq a work is being queued for - * - * A work for @cwq is about to be queued on @pool, determine insertion - * position for the work. If @cwq is for HIGHPRI wq, the work is - * queued at the head of the queue but in FIFO order with respect to - * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets POOL_HIGHPRI_PENDING flag to hint @pool that - * there are HIGHPRI works pending. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to inserstion position. 
- */ -static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool, - struct cpu_workqueue_struct *cwq) -{ - struct work_struct *twork; - - if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &pool->worklist; - - list_for_each_entry(twork, &pool->worklist, entry) { - struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); - - if (!(tcwq->wq->flags & WQ_HIGHPRI)) - break; - } - - pool->flags |= POOL_HIGHPRI_PENDING; - return &twork->entry; -} - /** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to @@ -1068,7 +1034,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (likely(cwq->nr_active < cwq->max_active)) { trace_workqueue_activate_work(work); cwq->nr_active++; - worklist = pool_determine_ins_pos(cwq->pool, cwq); + worklist = &cwq->pool->worklist; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; @@ -1385,6 +1351,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) { struct global_cwq *gcwq = pool->gcwq; bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + const char *pri = worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1406,15 +1373,17 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) if (!on_unbound_cpu) worker->task = kthread_create_on_node(worker_thread, - worker, - cpu_to_node(gcwq->cpu), - "kworker/%u:%d", gcwq->cpu, id); + worker, cpu_to_node(gcwq->cpu), + "kworker/%u:%d%s", gcwq->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d", id); + "kworker/u:%d%s", id, pri); if (IS_ERR(worker->task)) goto fail; + if (worker_pool_pri(pool)) + set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + /* * A rogue worker will become a regular one if CPU comes * online later on. Make sure every worker has @@ -1761,10 +1730,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = pool_determine_ins_pos(cwq->pool, cwq); trace_workqueue_activate_work(work); - move_linked_works(work, pos, NULL); + move_linked_works(work, &cwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; } @@ -1879,21 +1847,6 @@ __acquires(&gcwq->lock) set_work_cpu(work, gcwq->cpu); list_del_init(&work->entry); - /* - * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, - * wake up another worker; otherwise, clear HIGHPRI_PENDING. - */ - if (unlikely(pool->flags & POOL_HIGHPRI_PENDING)) { - struct work_struct *nwork = list_first_entry(&pool->worklist, - struct work_struct, entry); - - if (!list_empty(&pool->worklist) && - get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) - wake_up_worker(pool); - else - pool->flags &= ~POOL_HIGHPRI_PENDING; - } - /* * CPU intensive works don't participate in concurrency * management. They're the scheduler's responsibility. 
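/*
 * Editor's illustration, not part of this diff: callers opt into the highpri
 * pool the same way as before, by passing WQ_HIGHPRI to alloc_workqueue().
 * The hunk below is what routes such a workqueue to gcwq->pools[1], whose
 * workers carry the "H" name suffix and run at nice -20.  Hypothetical
 * driver sketch (the mydrv_* names are made up):
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_wq;

static void mydrv_work_fn(struct work_struct *work)
{
        /* after this patch, runs in a kworker/N:MH thread */
}
static DECLARE_WORK(mydrv_work, mydrv_work_fn);

static int __init mydrv_init(void)
{
        mydrv_wq = alloc_workqueue("mydrv_high", WQ_HIGHPRI, 0);
        if (!mydrv_wq)
                return -ENOMEM;
        queue_work(mydrv_wq, &mydrv_work);
        return 0;
}
module_init(mydrv_init);

static void __exit mydrv_exit(void)
{
        destroy_workqueue(mydrv_wq);
}
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");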
@@ -3047,9 +3000,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = get_gcwq(cpu); + int pool_idx = (bool)(flags & WQ_HIGHPRI); BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->pool = &gcwq->pool; + cwq->pool = &gcwq->pools[pool_idx]; cwq->wq = wq; cwq->flush_color = -1; cwq->max_active = max_active; -- cgit v1.2.3-59-g8ed1b From 00cd8dd3bf95f2cc8435b4cac01d9995635c6d0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Jun 2012 17:13:09 -0400 Subject: stop passing nameidata to ->lookup() Just the flags; only NFS cares even about that, but there are legitimate uses for such argument. And getting rid of that completely would require splitting ->lookup() into a couple of methods (at least), so let's leave that alone for now... Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 3 +-- Documentation/filesystems/porting | 4 ++-- Documentation/filesystems/vfs.txt | 2 +- fs/9p/v9fs.h | 2 +- fs/9p/vfs_inode.c | 8 ++++---- fs/9p/vfs_inode_dotl.c | 2 +- fs/adfs/dir.c | 2 +- fs/affs/affs.h | 2 +- fs/affs/namei.c | 2 +- fs/afs/dir.c | 4 ++-- fs/afs/mntpt.c | 4 ++-- fs/autofs4/root.c | 4 ++-- fs/bad_inode.c | 2 +- fs/befs/linuxvfs.c | 4 ++-- fs/bfs/dir.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/dir.c | 6 +++--- fs/cifs/cifsfs.h | 2 +- fs/cifs/dir.c | 4 ++-- fs/coda/dir.c | 4 ++-- fs/configfs/dir.c | 2 +- fs/cramfs/inode.c | 2 +- fs/ecryptfs/inode.c | 2 +- fs/efs/efs.h | 2 +- fs/efs/namei.c | 3 ++- fs/exofs/namei.c | 2 +- fs/ext2/namei.c | 2 +- fs/ext3/namei.c | 2 +- fs/ext4/namei.c | 2 +- fs/fat/namei_msdos.c | 2 +- fs/fat/namei_vfat.c | 2 +- fs/freevxfs/vxfs_lookup.c | 4 ++-- fs/fuse/dir.c | 4 ++-- fs/gfs2/inode.c | 2 +- fs/hfs/dir.c | 2 +- fs/hfs/inode.c | 2 +- fs/hfsplus/dir.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/dir.c | 2 +- fs/hpfs/hpfs_fn.h | 2 +- fs/hppfs/hppfs.c | 2 +- fs/isofs/isofs.h | 2 +- fs/isofs/namei.c | 2 +- fs/jffs2/dir.c | 4 ++-- fs/jfs/namei.c | 2 +- fs/libfs.c | 2 +- fs/logfs/dir.c | 2 +- fs/minix/namei.c | 2 +- fs/namei.c | 2 +- fs/ncpfs/dir.c | 4 ++-- fs/nfs/dir.c | 8 ++++---- fs/nilfs2/namei.c | 2 +- fs/ntfs/namei.c | 2 +- fs/ocfs2/namei.c | 2 +- fs/omfs/dir.c | 2 +- fs/openpromfs/inode.c | 4 ++-- fs/proc/base.c | 18 ++++++++++-------- fs/proc/generic.c | 2 +- fs/proc/internal.h | 4 ++-- fs/proc/namespaces.c | 2 +- fs/proc/proc_net.c | 2 +- fs/proc/proc_sysctl.c | 2 +- fs/proc/root.c | 7 +++---- fs/qnx4/namei.c | 2 +- fs/qnx4/qnx4.h | 2 +- fs/qnx6/namei.c | 2 +- fs/qnx6/qnx6.h | 2 +- fs/reiserfs/namei.c | 2 +- fs/romfs/super.c | 2 +- fs/squashfs/namei.c | 2 +- fs/sysfs/dir.c | 2 +- fs/sysv/namei.c | 2 +- fs/ubifs/dir.c | 2 +- fs/udf/namei.c | 2 +- fs/ufs/namei.c | 2 +- fs/xfs/xfs_iops.c | 4 ++-- include/linux/fs.h | 4 ++-- kernel/cgroup.c | 4 ++-- 79 files changed, 115 insertions(+), 114 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 52a057367f6f..33f2c8f1db81 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -38,8 +38,7 @@ d_manage: no no yes (ref-walk) maybe --------------------------- inode_operations --------------------------- prototypes: int (*create) (struct inode *,struct dentry *,umode_t, struct nameidata *); - struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameid -ata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) 
(struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 56750b714d1e..690f573928b9 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -434,5 +434,5 @@ d_make_root() drops the reference to inode if dentry allocation fails. -- [mandatory] - The witch is dead! Well, 1/3 of it, anyway. ->d_revalidate() does *not* -take struct nameidata anymore; just the flags. + The witch is dead! Well, 2/3 of it, anyway. ->d_revalidate() and +->lookup() do *not* take struct nameidata anymore; just the flags. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b9a406b2ed0f..ee786354946c 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -342,7 +342,7 @@ filesystem. As of kernel 2.6.22, the following members are defined: struct inode_operations { int (*create) (struct inode *,struct dentry *, umode_t, struct nameidata *); - struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index e78956cbd702..34c59f14a1c9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -144,7 +144,7 @@ extern void v9fs_session_close(struct v9fs_session_info *v9ses); extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nameidata); + unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index eae476fb401c..bb0d7627f95b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -785,7 +785,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode */ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nameidata) + unsigned int flags) { struct dentry *res; struct super_block *sb; @@ -795,8 +795,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, char *name; int result = 0; - p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_name.name, dentry, nameidata); + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n", + dir, dentry->d_name.name, dentry, flags); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -869,7 +869,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, struct dentry *res = NULL; if (d_unhashed(dentry)) { - res = v9fs_vfs_lookup(dir, dentry, NULL); + res = v9fs_vfs_lookup(dir, dentry, 0); if (IS_ERR(res)) return PTR_ERR(res); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 1ee10c89df97..b97619fed196 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -259,7 +259,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, struct dentry *res = NULL; if (d_unhashed(dentry)) { - res = v9fs_vfs_lookup(dir, dentry, 
NULL); + res = v9fs_vfs_lookup(dir, dentry, 0); if (IS_ERR(res)) return PTR_ERR(res); diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 3d83075aaa2e..b3be2e7c5643 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -266,7 +266,7 @@ const struct dentry_operations adfs_dentry_operations = { }; static struct dentry * -adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +adfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct object_info obj; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 3a130e27eb15..49e4e3457bfd 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -153,7 +153,7 @@ extern void affs_free_bitmap(struct super_block *sb); /* namei.c */ extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); -extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); +extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *); extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 47806940aac0..7f9721be709f 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -211,7 +211,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) } struct dentry * -affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 65c54ab04733..ffb33e36ea72 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -20,7 +20,7 @@ #include "internal.h" static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); @@ -516,7 +516,7 @@ out: * look up an entry in a directory */ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct afs_vnode *vnode; struct afs_fid fid; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 298cf8919ec7..9682c33d5daf 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -22,7 +22,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); static int afs_mntpt_open(struct inode *inode, struct file *file); static void afs_mntpt_expiry_timed_out(struct work_struct *work); @@ -104,7 +104,7 @@ out: */ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { _enter("%p,%p{%p{%s},%s}", dir, diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 75e5f1c8e028..e7396cfdb109 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -32,7 +32,7 @@ static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); #endif static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); +static struct dentry *autofs4_lookup(struct inode *,struct 
dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); static int autofs4_d_manage(struct dentry *, bool); static void autofs4_dentry_release(struct dentry *); @@ -458,7 +458,7 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 1b35d6bd06b0..d27e73c69ba4 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -179,7 +179,7 @@ static int bad_inode_create (struct inode *dir, struct dentry *dentry, } static struct dentry *bad_inode_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index e18da23d42b5..cf7f3c67c8b7 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -34,7 +34,7 @@ static int befs_readdir(struct file *, void *, filldir_t); static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); static int befs_readpage(struct file *file, struct page *page); static sector_t befs_bmap(struct address_space *mapping, sector_t block); -static struct dentry *befs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int); static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); @@ -159,7 +159,7 @@ befs_get_block(struct inode *inode, sector_t block, } static struct dentry * -befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct super_block *sb = dir->i_sb; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index d12c7966db27..3f1cd3b71681 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -133,7 +133,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; struct buffer_head *bh; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a101572f1cea..e5f1f81b2d65 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4247,7 +4247,7 @@ static void btrfs_dentry_release(struct dentry *dentry) } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8898eef8bca9..74b2f3c54fe7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -576,7 +576,7 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) * the MDS so that it gets our 'caps wanted' value in a single op. 
*/ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -653,7 +653,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } if (d_unhashed(dentry)) { - res = ceph_lookup(dir, dentry, NULL); + res = ceph_lookup(dir, dentry, 0); if (IS_ERR(res)) return PTR_ERR(res); @@ -678,7 +678,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, */ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) { - struct dentry *result = ceph_lookup(dir, dentry, NULL); + struct dentry *result = ceph_lookup(dir, dentry, 0); if (result && !IS_ERR(result)) { /* diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 48bb474ce294..1abd31fd5bf0 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -50,7 +50,7 @@ extern int cifs_atomic_open(struct inode *, struct dentry *, struct file *, unsigned, umode_t, int *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, - struct nameidata *); + unsigned int); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index b97ff48b7df6..2d732b9276ee 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -401,7 +401,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, * in network traffic in the other paths. */ if (!(oflags & O_CREAT)) { - struct dentry *res = cifs_lookup(inode, direntry, NULL); + struct dentry *res = cifs_lookup(inode, direntry, 0); if (IS_ERR(res)) return PTR_ERR(res); @@ -621,7 +621,7 @@ mknod_out: struct dentry * cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, - struct nameidata *nd) + unsigned int flags) { int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 7f8f1a7c6d87..da35e965861d 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -31,7 +31,7 @@ /* dir inode-ops */ static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, struct nameidata *nd); -static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd); +static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags); static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, struct dentry *entry); static int coda_unlink(struct inode *dir_inode, struct dentry *entry); @@ -94,7 +94,7 @@ const struct file_operations coda_dir_operations = { /* inode operations for directories */ /* access routines: lookup, readlink, permission */ -static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) +static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { struct super_block *sb = dir->i_sb; const char *name = entry->d_name.name; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7e6c52d8a207..7414ae24a79b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -442,7 +442,7 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den static struct dentry * configfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; struct configfs_dirent * sd; diff --git 
a/fs/cramfs/inode.c b/fs/cramfs/inode.c index d013c46402ed..28cca01ca9c9 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -417,7 +417,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* * Lookup and fill in the inode data.. */ -static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned int offset = 0; struct inode *inode = NULL; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a07441a0a878..4ab50c3f5ab2 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -374,7 +374,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, */ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, struct dentry *ecryptfs_dentry, - struct nameidata *ecryptfs_nd) + unsigned int flags) { char *encrypted_and_encoded_name = NULL; size_t encrypted_and_encoded_name_size; diff --git a/fs/efs/efs.h b/fs/efs/efs.h index d8305b582ab0..5528926ac7f6 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -129,7 +129,7 @@ extern struct inode *efs_iget(struct super_block *, unsigned long); extern efs_block_t efs_map_block(struct inode *, efs_block_t); extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct dentry *efs_lookup(struct inode *, struct dentry *, unsigned int); extern struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 832b10ded82f..96f66d213a19 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -58,7 +58,8 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) return(0); } -struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { +struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ efs_ino_t inodenum; struct inode *inode = NULL; diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index fc7161d6bf6b..909ed6ea4cf6 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -46,7 +46,7 @@ static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) } static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode; ino_t ino; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index f663a67d7bf0..b3e6778cd1e7 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -55,7 +55,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) * Methods themselves. 
*/ -static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode; ino_t ino; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index eeb63dfc5d20..86d25f3f6043 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1011,7 +1011,7 @@ errout: return NULL; } -static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode; struct ext3_dir_entry_2 * de; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5845cd97bf8b..4fba3cd42e2b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1312,7 +1312,7 @@ errout: return NULL; } -static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct ext4_dir_entry_2 *de; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index c5938c9084b9..47c608b05294 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -201,7 +201,7 @@ static const struct dentry_operations msdos_dentry_operations = { /***** Get inode using directory and name */ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 0bbdf3990060..44152571524e 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -714,7 +714,7 @@ static int vfat_d_anon_disconn(struct dentry *dentry) } static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 3360f1e678ad..bd447e88f208 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -48,7 +48,7 @@ #define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize)) -static struct dentry * vxfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); static int vxfs_readdir(struct file *, void *, filldir_t); const struct inode_operations vxfs_dir_inode_ops = { @@ -203,7 +203,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * in the return pointer. 
*/ static struct dentry * -vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd) +vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) { struct inode *ip = NULL; ino_t ino; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index eba30bd9ba2b..385235ac137d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -316,7 +316,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, } static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, - struct nameidata *nd) + unsigned int flags) { int err; struct fuse_entry_out outarg; @@ -478,7 +478,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct dentry *res = NULL; if (d_unhashed(entry)) { - res = fuse_lookup(dir, entry, NULL); + res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) return PTR_ERR(res); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a9ba2444e077..19e443b73354 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -775,7 +775,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry, */ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0); if (inode && !IS_ERR(inode)) { diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 62fc14ea4b73..617b1ed71f52 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -18,7 +18,7 @@ * hfs_lookup() */ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { hfs_cat_rec rec; struct hfs_find_data fd; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 761ec06354b4..451c97281b83 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -489,7 +489,7 @@ out: } static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; hfs_cat_rec rec; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 26b53fb09f68..90c2f78b2c79 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -25,7 +25,7 @@ static inline void hfsplus_instantiate(struct dentry *dentry, /* Find the entry inside dir named dentry->d_name */ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; struct hfs_find_data fd; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 82b69ee4dacc..7009265b746f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -168,7 +168,7 @@ const struct dentry_operations hfsplus_dentry_operations = { }; static struct dentry *hfsplus_file_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct hfs_find_data fd; struct super_block *sb = dir->i_sb; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2afa5bbccf9b..0ea005228e1b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -595,7 +595,7 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode; char *name; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index b8472f803f4e..78e12b2e0ea2 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -189,7 +189,7 @@ out: * to tell read_inode to read fnode or not. 
*/ -struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index c07ef1f1ced6..ac1ead194db5 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -220,7 +220,7 @@ extern const struct dentry_operations hpfs_dentry_operations; /* dir.c */ -struct dentry *hpfs_lookup(struct inode *, struct dentry *, struct nameidata *); +struct dentry *hpfs_lookup(struct inode *, struct dentry *, unsigned int); extern const struct file_operations hpfs_dir_ops; /* dnode.c */ diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index d4f93b52cec5..e5c06531dcc4 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -138,7 +138,7 @@ static int file_removed(struct dentry *dentry, const char *file) } static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *proc_dentry, *parent; struct qstr *name = &dentry->d_name; diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 0e73f63d9274..3620ad1ea9bc 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -114,7 +114,7 @@ extern int isofs_name_translate(struct iso_directory_record *, char *, struct in int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *); int get_acorn_filename(struct iso_directory_record *, char *, struct inode *); -extern struct dentry *isofs_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int flags); extern struct buffer_head *isofs_bread(struct inode *, sector_t); extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 1e2946f2a69e..c167028844ed 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -163,7 +163,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, return 0; } -struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int found; unsigned long uninitialized_var(block); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index b56018896d5e..6a601673f89f 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -27,7 +27,7 @@ static int jffs2_readdir (struct file *, void *, filldir_t); static int jffs2_create (struct inode *,struct dentry *,umode_t, struct nameidata *); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, - struct nameidata *); + unsigned int); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct inode *,struct dentry *,const char *); @@ -74,7 +74,7 @@ const struct inode_operations jffs2_dir_inode_operations = nice and simple */ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, - struct nameidata *nd) + unsigned int flags) { struct jffs2_inode_info *dir_f; struct jffs2_full_dirent *fd = NULL, *fd_list; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index f37977fb0871..34fe85555caf 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1436,7 +1436,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, return rc; } -static struct dentry *jfs_lookup(struct inode *dip, struct dentry 
*dentry, struct nameidata *nd) +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { struct btstack btstack; ino_t inum; diff --git a/fs/libfs.c b/fs/libfs.c index f86ec27a4230..ebd03f6910d5 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -53,7 +53,7 @@ static int simple_delete_dentry(const struct dentry *dentry) * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. Set d_op to delete negative dentries. */ -struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { static const struct dentry_operations simple_dentry_operations = { .d_delete = simple_delete_dentry, diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index bea5d1b9954b..8a3dcc615b39 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -349,7 +349,7 @@ static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) } static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct page *page; struct logfs_disk_dentry *dd; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 2d0ee1786305..1f245240ea08 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -18,7 +18,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; diff --git a/fs/namei.c b/fs/namei.c index 2e943ab04f32..175e81b8f261 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1090,7 +1090,7 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } - old = dir->i_op->lookup(dir, dentry, nd); + old = dir->i_op->lookup(dir, dentry, nd ? 
nd->flags : 0); if (unlikely(old)) { dput(dentry); dentry = old; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 32607f749588..a0cff22bfc9b 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -31,7 +31,7 @@ static void ncp_do_readdir(struct file *, void *, filldir_t, static int ncp_readdir(struct file *, void *, filldir_t); static int ncp_create(struct inode *, struct dentry *, umode_t, struct nameidata *); -static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); static int ncp_unlink(struct inode *, struct dentry *); static int ncp_mkdir(struct inode *, struct dentry *, umode_t); static int ncp_rmdir(struct inode *, struct dentry *); @@ -836,7 +836,7 @@ out: return result; } -static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ncp_server *server = NCP_SERVER(dir); struct inode *inode = NULL; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 656f52e9aa2e..8f21205c5896 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -46,7 +46,7 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); -static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); static int nfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); static int nfs_mkdir(struct inode *, struct dentry *, umode_t); static int nfs_rmdir(struct inode *, struct dentry *); @@ -1270,7 +1270,7 @@ const struct dentry_operations nfs_dentry_operations = { .d_release = nfs_d_release, }; -static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; struct dentry *parent; @@ -1291,7 +1291,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. */ - if (nd && nfs_is_exclusive_create(dir, nd->flags)) { + if (nfs_is_exclusive_create(dir, flags)) { d_instantiate(dentry, NULL); res = NULL; goto out; @@ -1482,7 +1482,7 @@ out: return err; no_open: - res = nfs_lookup(dir, dentry, NULL); + res = nfs_lookup(dir, dentry, 0); err = PTR_ERR(res); if (IS_ERR(res)) goto out; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index b72847988b78..5e5f779db76f 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -63,7 +63,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) */ static struct dentry * -nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; ino_t ino; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 358273e59ade..436f36037e09 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -101,7 +101,7 @@ * Locking: Caller must hold i_mutex on the directory. 
*/ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, - struct nameidata *nd) + unsigned int flags) { ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); struct inode *dent_inode; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 9f39c640cddf..fd71f6e5841f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -98,7 +98,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { int status; u64 blkno; diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index f00576ec320f..3d254872e641 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -291,7 +291,7 @@ static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct buffer_head *bh; struct inode *inode = NULL; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index bc49c975d501..4a3477949bca 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -170,13 +170,13 @@ static const struct file_operations openprom_operations = { .llseek = generic_file_llseek, }; -static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, unsigned int); static const struct inode_operations openprom_inode_operations = { .lookup = openpromfs_lookup, }; -static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct op_inode_info *ent_oi, *oi = OP_I(dir); struct device_node *dp, *child; diff --git a/fs/proc/base.c b/fs/proc/base.c index bf749cca4cc6..8eaa5ea1c613 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1956,7 +1956,7 @@ out_no_task: } static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); } @@ -2145,7 +2145,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, } static struct dentry *proc_map_files_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; @@ -2380,7 +2380,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, static struct dentry *proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } @@ -2630,7 +2630,7 @@ static const struct file_operations proc_attr_dir_operations = { }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); @@ -3114,7 +3114,8 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = default_llseek, }; -static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ return proc_pident_lookup(dir, dentry, tgid_base_stuff, 
ARRAY_SIZE(tgid_base_stuff)); } @@ -3243,7 +3244,7 @@ out: return error; } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *result; struct task_struct *task; @@ -3470,7 +3471,8 @@ static int proc_tid_base_readdir(struct file * filp, tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); } -static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ return proc_pident_lookup(dir, dentry, tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } @@ -3514,7 +3516,7 @@ out: return error; } -static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2edf34f2eb61..b3647fe6a608 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -446,7 +446,7 @@ out_unlock: } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookup_de(PDE(dir), dir, dentry); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index e0c2a48dab73..e1167a1c9126 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -106,7 +106,7 @@ void pde_users_dec(struct proc_dir_entry *pde); extern spinlock_t proc_subdir_lock; -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); unsigned long task_vsize(struct mm_struct *); unsigned long task_statm(struct mm_struct *, @@ -132,7 +132,7 @@ int proc_remount(struct super_block *sb, int *flags, char *data); * of the /proc/ subdirectories. 
*/ int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); +struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 40ceb40f9853..b178ed733c36 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -140,7 +140,7 @@ const struct file_operations proc_ns_dir_operations = { }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct dentry *error; struct task_struct *task = get_proc_task(dir); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 06e1cc17caf6..fe72cd073dea 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -119,7 +119,7 @@ static struct net *get_proc_task_net(struct inode *dir) } static struct dentry *proc_tgid_net_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct dentry *de; struct net *net; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fda69fa39099..dfafeb2b05a0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -433,7 +433,7 @@ static struct ctl_table_header *grab_header(struct inode *inode) } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index 7c30fce037c0..568b20290c75 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -200,13 +200,12 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct return 0; } -static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_lookup(dir, dentry, nd)) { + if (!proc_lookup(dir, dentry, flags)) return NULL; - } - return proc_pid_lookup(dir, dentry, nd); + return proc_pid_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file * filp, diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index a512c0b30e8e..d024505ba007 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -95,7 +95,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir, return NULL; } -struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int ino; struct qnx4_inode_entry *de; diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 244d4620189b..34e2d329c97e 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -23,7 +23,7 @@ struct qnx4_inode_info { }; extern struct inode *qnx4_iget(struct super_block *, unsigned long); -extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern unsigned long qnx4_count_free_blocks(struct super_block *sb); extern unsigned long qnx4_block_map(struct inode *inode, long iblock); diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index 8a97289e04ad..0561326a94f5 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -13,7 +13,7 @@ #include "qnx6.h" struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { 
unsigned ino; struct page *page; diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index 6c5e02a0b6a8..b00fcc960d37 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -45,7 +45,7 @@ struct qnx6_inode_info { extern struct inode *qnx6_iget(struct super_block *sb, unsigned ino); extern struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd); + unsigned int flags); #ifdef CONFIG_QNX6FS_DEBUG extern void qnx6_superblock_debug(struct qnx6_super_block *, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 84e8a69cee9d..1d9cf248c471 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -322,7 +322,7 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, } static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { int retval; int lock_depth; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e64f6b5f7ae5..77c5f2173983 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -210,7 +210,7 @@ out: * look up an entry in a directory */ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { unsigned long offset, maxoff; struct inode *inode; diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index abcc58f3c152..7834a517f7f4 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -134,7 +134,7 @@ out: static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { const unsigned char *name = dentry->d_name.name; int len = dentry->d_name.len; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 038e74b3af87..efd373e3e0aa 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -764,7 +764,7 @@ int sysfs_create_dir(struct kobject * kobj) } static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct dentry *ret = NULL; struct dentry *parent = dentry->d_parent; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index d7466e293614..a8c4359cd0e1 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -43,7 +43,7 @@ const struct dentry_operations sysv_dentry_operations = { .d_hash = sysv_hash, }; -static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index a6d42efc76d2..845b2df08317 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -184,7 +184,7 @@ static int dbg_check_name(const struct ubifs_info *c, } static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { int err; union ubifs_key key; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 18024178ac4c..929cc205985a 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -251,7 +251,7 @@ out_ok: } static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode = NULL; struct fileIdentDesc cfi; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index a2281cadefa1..bc77fa170b9d 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -46,7 +46,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry 
*ufs_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1a25fd802798..b41cfba14faf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -197,7 +197,7 @@ STATIC struct dentry * xfs_vn_lookup( struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct xfs_inode *cip; struct xfs_name name; @@ -222,7 +222,7 @@ STATIC struct dentry * xfs_vn_ci_lookup( struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct xfs_inode *ip; struct xfs_name xname; diff --git a/include/linux/fs.h b/include/linux/fs.h index 17ee20dba86c..7a71709b7fa7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1666,7 +1666,7 @@ struct file_operations { }; struct inode_operations { - struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); void * (*follow_link) (struct dentry *, struct nameidata *); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); @@ -2571,7 +2571,7 @@ extern int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); +extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b303dfc7dce0..0cd1314acdaf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -2570,7 +2570,7 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); -- cgit v1.2.3-59-g8ed1b From 79714f72d3b964611997de512cb29198c9f2dbbb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Jun 2012 03:01:42 +0400 Subject: get rid of kern_path_parent() all callers want the same thing, actually - a kinda-sorta analog of kern_path_create(). I.e. they want parent vfsmount/dentry (with ->i_mutex held, to make sure the child dentry is still their child) + the child dentry. 
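For orientation, a minimal caller sketch of the new helper (hypothetical function and parameter names, modelled on the devtmpfs conversion below, not code from this patch): on success kern_path_locked() hands back the child dentry with the parent directory's i_mutex already held, so the caller owns the dput/unlock/path_put cleanup.

	static int example_remove_node(const char *pathname)
	{
		struct path parent;
		struct dentry *dentry;
		int err;

		/* child dentry returned; parent's i_mutex is held on success */
		dentry = kern_path_locked(pathname, &parent);
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);

		if (dentry->d_inode)
			err = vfs_unlink(parent.dentry->d_inode, dentry);
		else
			err = -ENOENT;

		dput(dentry);
		mutex_unlock(&parent.dentry->d_inode->i_mutex);
		path_put(&parent);
		return err;
	}
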
Signed-off-by Al Viro --- drivers/base/devtmpfs.c | 98 +++++++++++++++++++++---------------------------- fs/namei.c | 22 ++++++++++- include/linux/namei.h | 2 +- kernel/audit_watch.c | 25 ++----------- 4 files changed, 65 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 765c3a28077a..d91a3a0b2325 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -227,33 +227,24 @@ static int handle_create(const char *nodename, umode_t mode, struct device *dev) static int dev_rmdir(const char *name) { - struct nameidata nd; + struct path parent; struct dentry *dentry; int err; - err = kern_path_parent(name, &nd); - if (err) - return err; - - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); - if (!IS_ERR(dentry)) { - if (dentry->d_inode) { - if (dentry->d_inode->i_private == &thread) - err = vfs_rmdir(nd.path.dentry->d_inode, - dentry); - else - err = -EPERM; - } else { - err = -ENOENT; - } - dput(dentry); + dentry = kern_path_locked(name, &parent); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + if (dentry->d_inode) { + if (dentry->d_inode->i_private == &thread) + err = vfs_rmdir(parent.dentry->d_inode, dentry); + else + err = -EPERM; } else { - err = PTR_ERR(dentry); + err = -ENOENT; } - - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); + dput(dentry); + mutex_unlock(&parent.dentry->d_inode->i_mutex); + path_put(&parent); return err; } @@ -305,50 +296,43 @@ static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *sta static int handle_remove(const char *nodename, struct device *dev) { - struct nameidata nd; + struct path parent; struct dentry *dentry; - struct kstat stat; int deleted = 1; int err; - err = kern_path_parent(nodename, &nd); - if (err) - return err; + dentry = kern_path_locked(nodename, &parent); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); - if (!IS_ERR(dentry)) { - if (dentry->d_inode) { - err = vfs_getattr(nd.path.mnt, dentry, &stat); - if (!err && dev_mynode(dev, dentry->d_inode, &stat)) { - struct iattr newattrs; - /* - * before unlinking this node, reset permissions - * of possible references like hardlinks - */ - newattrs.ia_uid = 0; - newattrs.ia_gid = 0; - newattrs.ia_mode = stat.mode & ~0777; - newattrs.ia_valid = - ATTR_UID|ATTR_GID|ATTR_MODE; - mutex_lock(&dentry->d_inode->i_mutex); - notify_change(dentry, &newattrs); - mutex_unlock(&dentry->d_inode->i_mutex); - err = vfs_unlink(nd.path.dentry->d_inode, - dentry); - if (!err || err == -ENOENT) - deleted = 1; - } - } else { - err = -ENOENT; + if (dentry->d_inode) { + struct kstat stat; + err = vfs_getattr(parent.mnt, dentry, &stat); + if (!err && dev_mynode(dev, dentry->d_inode, &stat)) { + struct iattr newattrs; + /* + * before unlinking this node, reset permissions + * of possible references like hardlinks + */ + newattrs.ia_uid = 0; + newattrs.ia_gid = 0; + newattrs.ia_mode = stat.mode & ~0777; + newattrs.ia_valid = + ATTR_UID|ATTR_GID|ATTR_MODE; + mutex_lock(&dentry->d_inode->i_mutex); + notify_change(dentry, &newattrs); + mutex_unlock(&dentry->d_inode->i_mutex); + err = vfs_unlink(parent.dentry->d_inode, dentry); + if (!err || err == -ENOENT) + deleted = 1; } - dput(dentry); } else { - err = PTR_ERR(dentry); + err = -ENOENT; } - 
mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + dput(dentry); + mutex_unlock(&parent.dentry->d_inode->i_mutex); - path_put(&nd.path); + path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; diff --git a/fs/namei.c b/fs/namei.c index 5abab9176903..6b29a51bef5d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1814,9 +1814,27 @@ static int do_path_lookup(int dfd, const char *name, return retval; } -int kern_path_parent(const char *name, struct nameidata *nd) +/* does lookup, returns the object with parent locked */ +struct dentry *kern_path_locked(const char *name, struct path *path) { - return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); + struct nameidata nd; + struct dentry *d; + int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); + if (err) + return ERR_PTR(err); + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return ERR_PTR(-EINVAL); + } + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return d; + } + *path = nd.path; + return d; } int kern_path(const char *name, unsigned int flags, struct path *path) diff --git a/include/linux/namei.h b/include/linux/namei.h index 23d859879210..f5931489e150 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -67,7 +67,7 @@ extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, int); extern struct dentry *user_path_create(int, const char __user *, struct path *, int); -extern int kern_path_parent(const char *, struct nameidata *); +extern struct dentry *kern_path_locked(const char *, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e683869365d9..3823281401b5 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent) /* Get path information necessary for adding watches. */ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata nd; - struct dentry *d; - int err; - - err = kern_path_parent(watch->path, &nd); - if (err) - return err; - - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); - return -EINVAL; - } - - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); - if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); + struct dentry *d = kern_path_locked(watch->path, parent); + if (IS_ERR(d)) return PTR_ERR(d); - } + mutex_unlock(&parent->dentry->d_inode->i_mutex); if (d->d_inode) { /* update watch filter fields */ watch->dev = d->d_inode->i_sb->s_dev; watch->ino = d->d_inode->i_ino; } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - - *parent = nd.path; dput(d); return 0; } -- cgit v1.2.3-59-g8ed1b From be34d1a3bc4b6f357a49acb55ae870c81337e4f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Jun 2012 12:55:18 +0100 Subject: VFS: Make clone_mnt()/copy_tree()/collect_mounts() return errors copy_tree() can theoretically fail in a case other than ENOMEM, but always returns NULL which is interpreted by callers as -ENOMEM. Change it to return an explicit error. 
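In caller terms the conversion looks roughly like this (a sketch mirroring the pnode.c and audit hunks further down, not code added here): the NULL test with a hard-coded -ENOMEM becomes an IS_ERR()/PTR_ERR() pair so the real error is propagated.

	/* before: NULL was the only failure mode, assumed to mean -ENOMEM */
	child = copy_tree(source, source->mnt.mnt_root, type);
	if (!child) {
		ret = -ENOMEM;
		goto out;
	}

	/* after: propagate whatever error copy_tree()/clone_mnt() reported */
	child = copy_tree(source, source->mnt.mnt_root, type);
	if (IS_ERR(child)) {
		ret = PTR_ERR(child);
		goto out;
	}
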
Also change clone_mnt() for consistency and because union mounts will add new error cases. Thanks to Andreas Gruenbacher for a bug fix. [AV: folded braino fix by Dan Carpenter] Original-author: Valerie Aurora Signed-off-by: David Howells Cc: Valerie Aurora Cc: Andreas Gruenbacher Signed-off-by: Al Viro --- fs/namespace.c | 120 ++++++++++++++++++++++++++++------------------------ fs/pnode.c | 5 ++- kernel/audit_tree.c | 10 ++--- 3 files changed, 73 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 8f412abcb67f..be1b07a774f1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -708,56 +708,60 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { struct super_block *sb = old->mnt.mnt_sb; - struct mount *mnt = alloc_vfsmnt(old->mnt_devname); + struct mount *mnt; + int err; - if (mnt) { - if (flag & (CL_SLAVE | CL_PRIVATE)) - mnt->mnt_group_id = 0; /* not a peer of original */ - else - mnt->mnt_group_id = old->mnt_group_id; - - if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { - int err = mnt_alloc_group_id(mnt); - if (err) - goto out_free; - } + mnt = alloc_vfsmnt(old->mnt_devname); + if (!mnt) + return ERR_PTR(-ENOMEM); - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; - atomic_inc(&sb->s_active); - mnt->mnt.mnt_sb = sb; - mnt->mnt.mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; - br_write_lock(&vfsmount_lock); - list_add_tail(&mnt->mnt_instance, &sb->s_mounts); - br_write_unlock(&vfsmount_lock); + if (flag & (CL_SLAVE | CL_PRIVATE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; - if (flag & CL_SLAVE) { - list_add(&mnt->mnt_slave, &old->mnt_slave_list); - mnt->mnt_master = old; - CLEAR_MNT_SHARED(mnt); - } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) - list_add(&mnt->mnt_share, &old->mnt_share); - if (IS_MNT_SLAVE(old)) - list_add(&mnt->mnt_slave, &old->mnt_slave); - mnt->mnt_master = old->mnt_master; - } - if (flag & CL_MAKE_SHARED) - set_mnt_shared(mnt); - - /* stick the duplicate mount on the same expiry list - * as the original if that was on one */ - if (flag & CL_EXPIRE) { - if (!list_empty(&old->mnt_expire)) - list_add(&mnt->mnt_expire, &old->mnt_expire); - } + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + atomic_inc(&sb->s_active); + mnt->mnt.mnt_sb = sb; + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + br_write_lock(&vfsmount_lock); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + br_write_unlock(&vfsmount_lock); + + if (flag & CL_SLAVE) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); + mnt->mnt_master = old; + CLEAR_MNT_SHARED(mnt); + } else if (!(flag & CL_PRIVATE)) { + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) + list_add(&mnt->mnt_share, &old->mnt_share); + if (IS_MNT_SLAVE(old)) + list_add(&mnt->mnt_slave, &old->mnt_slave); + mnt->mnt_master = old->mnt_master; + } + if (flag & CL_MAKE_SHARED) + set_mnt_shared(mnt); + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + if (flag & CL_EXPIRE) { + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); } + return mnt; out_free: free_vfsmnt(mnt); - return NULL; + return ERR_PTR(err); } static inline void mntfree(struct 
mount *mnt) @@ -1242,11 +1246,12 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, struct path path; if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) - return NULL; + return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); - if (!q) - goto Enomem; + if (IS_ERR(q)) + return q; + q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; @@ -1268,8 +1273,8 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, path.mnt = &q->mnt; path.dentry = p->mnt_mountpoint; q = clone_mnt(p, p->mnt.mnt_root, flag); - if (!q) - goto Enomem; + if (IS_ERR(q)) + goto out; br_write_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, &path); @@ -1277,7 +1282,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, } } return res; -Enomem: +out: if (res) { LIST_HEAD(umount_list); br_write_lock(&vfsmount_lock); @@ -1285,9 +1290,11 @@ Enomem: br_write_unlock(&vfsmount_lock); release_mounts(&umount_list); } - return NULL; + return q; } +/* Caller should check returned pointer for errors */ + struct vfsmount *collect_mounts(struct path *path) { struct mount *tree; @@ -1295,7 +1302,9 @@ struct vfsmount *collect_mounts(struct path *path) tree = copy_tree(real_mount(path->mnt), path->dentry, CL_COPY_ALL | CL_PRIVATE); up_write(&namespace_sem); - return tree ? &tree->mnt : NULL; + if (IS_ERR(tree)) + return NULL; + return &tree->mnt; } void drop_collected_mounts(struct vfsmount *mnt) @@ -1590,14 +1599,15 @@ static int do_loopback(struct path *path, char *old_name, if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old)) goto out2; - err = -ENOMEM; if (recurse) mnt = copy_tree(old, old_path.dentry, 0); else mnt = clone_mnt(old, old_path.dentry, 0); - if (!mnt) - goto out2; + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out; + } err = graft_tree(mnt, path); if (err) { @@ -2211,10 +2221,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, down_write(&namespace_sem); /* First pass: copy the tree topology */ new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); - if (!new) { + if (IS_ERR(new)) { up_write(&namespace_sem); kfree(new_ns); - return ERR_PTR(-ENOMEM); + return ERR_CAST(new); } new_ns->root = new; br_write_lock(&vfsmount_lock); diff --git a/fs/pnode.c b/fs/pnode.c index bed378db0758..3e000a51ac0d 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -237,8 +237,9 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - if (!(child = copy_tree(source, source->mnt.mnt_root, type))) { - ret = -ENOMEM; + child = copy_tree(source, source->mnt.mnt_root, type); + if (IS_ERR(child)) { + ret = PTR_ERR(child); list_splice(tree_list, tmp_list.prev); goto out; } diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 5bf0790497e7..3a5ca582ba1e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -595,7 +595,7 @@ void audit_trim_trees(void) root_mnt = collect_mounts(&path); path_put(&path); - if (!root_mnt) + if (IS_ERR(root_mnt)) goto skip_it; spin_lock(&hash_lock); @@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule) goto Err; mnt = collect_mounts(&path); path_put(&path); - if (!mnt) { - err = -ENOMEM; + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto Err; } @@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new) return err; tagged = collect_mounts(&path2); path_put(&path2); - if (!tagged) - return -ENOMEM; + if (IS_ERR(tagged)) + return PTR_ERR(tagged); err = kern_path(old, 0, &path1); if (err) { -- cgit 
v1.2.3-59-g8ed1b From 9249e17fe094d853d1ef7475dd559a2cc7e23d42 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Jun 2012 12:55:37 +0100 Subject: VFS: Pass mount flags to sget() Pass mount flags to sget() so that it can use them in initialising a new superblock before the set function is called. They could also be passed to the compare function. Signed-off-by: David Howells Signed-off-by: Al Viro --- drivers/mtd/mtdsuper.c | 4 +--- fs/9p/vfs_super.c | 4 ++-- fs/afs/super.c | 3 +-- fs/btrfs/super.c | 4 ++-- fs/ceph/super.c | 2 +- fs/cifs/cifsfs.c | 9 ++++----- fs/devpts/inode.c | 6 +++--- fs/ecryptfs/main.c | 3 +-- fs/gfs2/ops_fstype.c | 5 ++--- fs/libfs.c | 4 ++-- fs/logfs/super.c | 3 +-- fs/nfs/super.c | 2 +- fs/nilfs2/super.c | 4 ++-- fs/proc/root.c | 3 +-- fs/reiserfs/procfs.c | 2 +- fs/super.c | 22 +++++++++++----------- fs/sysfs/mount.c | 3 +-- fs/ubifs/super.c | 3 +-- include/linux/fs.h | 2 +- kernel/cgroup.c | 2 +- 20 files changed, 40 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index a90bfe79916d..334da5f583c0 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -63,7 +63,7 @@ static struct dentry *mount_mtd_aux(struct file_system_type *fs_type, int flags, struct super_block *sb; int ret; - sb = sget(fs_type, get_sb_mtd_compare, get_sb_mtd_set, mtd); + sb = sget(fs_type, get_sb_mtd_compare, get_sb_mtd_set, flags, mtd); if (IS_ERR(sb)) goto out_error; @@ -74,8 +74,6 @@ static struct dentry *mount_mtd_aux(struct file_system_type *fs_type, int flags, pr_debug("MTDSB: New superblock for device %d (\"%s\")\n", mtd->index, mtd->name); - sb->s_flags = flags; - ret = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (ret < 0) { deactivate_locked_super(sb); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 8c92a9ba8330..137d50396898 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -89,7 +89,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; - sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; if (!v9ses->cache) sb->s_flags |= MS_SYNCHRONOUS; @@ -137,7 +137,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto close_session; } - sb = sget(fs_type, NULL, v9fs_set_super, v9ses); + sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); goto clunk_fid; diff --git a/fs/afs/super.c b/fs/afs/super.c index f02b31e7e648..df8c6047c2a1 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -395,7 +395,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, as->volume = vol; /* allocate a deviceless superblock */ - sb = sget(fs_type, afs_test_super, afs_set_super, as); + sb = sget(fs_type, afs_test_super, afs_set_super, flags, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); afs_put_volume(vol); @@ -406,7 +406,6 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); - sb->s_flags = flags; ret = afs_fill_super(sb, ¶ms); if (ret < 0) { deactivate_locked_super(sb); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e23991574fdf..b19d75567728 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1068,7 +1068,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, 
btrfs_test_super, btrfs_set_super, fs_info); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -1082,7 +1083,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1e67dd7305a4..7076109f014d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -871,7 +871,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, fsc); + sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); if (IS_ERR(sb)) { res = ERR_CAST(sb); goto out; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c0c2751a7573..a7610cfedf0a 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -637,7 +637,10 @@ cifs_do_mount(struct file_system_type *fs_type, mnt_data.cifs_sb = cifs_sb; mnt_data.flags = flags; - sb = sget(fs_type, cifs_match_super, cifs_set_super, &mnt_data); + /* BB should we make this contingent on mount parm? */ + flags |= MS_NODIRATIME | MS_NOATIME; + + sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); if (IS_ERR(sb)) { root = ERR_CAST(sb); cifs_umount(cifs_sb); @@ -648,10 +651,6 @@ cifs_do_mount(struct file_system_type *fs_type, cFYI(1, "Use existing superblock"); cifs_umount(cifs_sb); } else { - sb->s_flags = flags; - /* BB should we make this contingent on mount parm? */ - sb->s_flags |= MS_NODIRATIME | MS_NOATIME; - rc = cifs_read_super(sb); if (rc) { root = ERR_PTR(rc); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 979c1e309c73..14afbabe6546 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -439,15 +439,15 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, return ERR_PTR(error); if (opts.newinstance) - s = sget(fs_type, NULL, set_anon_super, NULL); + s = sget(fs_type, NULL, set_anon_super, flags, NULL); else - s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); + s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, + NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { - s->s_flags = flags; error = devpts_fill_super(s, data, flags & MS_SILENT ? 
1 : 0); if (error) goto out_undo_sget; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 68954937a071..7edeb3d893c1 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -499,13 +499,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - s = sget(fs_type, NULL, set_anon_super, NULL); + s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { rc = PTR_ERR(s); goto out; } - s->s_flags = flags; rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); if (rc) goto out1; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b8c250fc4922..6c906078f657 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1286,7 +1286,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, error = -EBUSY; goto error_bdev; } - s = sget(fs_type, test_gfs2_super, set_gfs2_super, bdev); + s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); error = PTR_ERR(s); if (IS_ERR(s)) @@ -1316,7 +1316,6 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); @@ -1360,7 +1359,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, dev_name, error); return ERR_PTR(error); } - s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, + s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { diff --git a/fs/libfs.c b/fs/libfs.c index ebd03f6910d5..a74cb1725ac6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -222,15 +222,15 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, const struct super_operations *ops, const struct dentry_operations *dops, unsigned long magic) { - struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct super_block *s; struct dentry *dentry; struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); + s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); if (IS_ERR(s)) return ERR_CAST(s); - s->s_flags = MS_NOUSER; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 97bca623d893..345c24b8a6f8 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -519,7 +519,7 @@ static struct dentry *logfs_get_sb_device(struct logfs_super *super, log_super("LogFS: Start mount %x\n", mount_count++); err = -EINVAL; - sb = sget(type, logfs_sb_test, logfs_sb_set, super); + sb = sget(type, logfs_sb_test, logfs_sb_set, flags | MS_NOATIME, super); if (IS_ERR(sb)) { super->s_devops->put_device(super); kfree(super); @@ -542,7 +542,6 @@ static struct dentry *logfs_get_sb_device(struct logfs_super *super, sb->s_maxbytes = (1ull << 43) - 1; sb->s_max_links = LOGFS_LINK_MAX; sb->s_op = &logfs_super_operations; - sb->s_flags = flags | MS_NOATIME; err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); if (err) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 06228192f64e..8b2a2977b720 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2419,7 +2419,7 @@ static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type, sb_mntdata.mntflags |= MS_SYNCHRONOUS; /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); + s = 
sget(fs_type, compare_super, nfs_set_super, flags, &sb_mntdata); if (IS_ERR(s)) { mntroot = ERR_CAST(s); goto out_err_nosb; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1099a76cee59..d57c42f974ea 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1288,7 +1288,8 @@ nilfs_mount(struct file_system_type *fs_type, int flags, err = -EBUSY; goto failed; } - s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, sd.bdev); + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, + sd.bdev); mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) { err = PTR_ERR(s); @@ -1301,7 +1302,6 @@ nilfs_mount(struct file_system_type *fs_type, int flags, s_new = true; /* New superblock instance created */ - s->s_flags = flags; s->s_mode = mode; strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(sd.bdev)); diff --git a/fs/proc/root.c b/fs/proc/root.c index 568b20290c75..9a2d9fd7cadd 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -111,7 +111,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, options = data; } - sb = sget(fs_type, proc_test_super, proc_set_super, ns); + sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); if (IS_ERR(sb)) return ERR_CAST(sb); @@ -121,7 +121,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } if (!sb->s_root) { - sb->s_flags = flags; err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 2c1ade692cc8..e60e87035bb3 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -403,7 +403,7 @@ static void *r_start(struct seq_file *m, loff_t * pos) if (l) return NULL; - if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, s))) + if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, s))) return NULL; up_write(&s->s_umount); diff --git a/fs/super.c b/fs/super.c index cf001775617f..c743fb3be4b8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -105,11 +105,12 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to + * @flags: the mount flags * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. 
*/ -static struct super_block *alloc_super(struct file_system_type *type) +static struct super_block *alloc_super(struct file_system_type *type, int flags) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; @@ -136,6 +137,7 @@ static struct super_block *alloc_super(struct file_system_type *type) #else INIT_LIST_HEAD(&s->s_files); #endif + s->s_flags = flags; s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); @@ -415,11 +417,13 @@ EXPORT_SYMBOL(generic_shutdown_super); * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback + * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), + int flags, void *data) { struct super_block *s = NULL; @@ -450,7 +454,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type); + s = alloc_super(type, flags); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -925,13 +929,12 @@ struct dentry *mount_ns(struct file_system_type *fs_type, int flags, { struct super_block *sb; - sb = sget(fs_type, ns_test_super, ns_set_super, data); + sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { int err; - sb->s_flags = flags; err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) { deactivate_locked_super(sb); @@ -992,7 +995,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error = -EBUSY; goto error_bdev; } - s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); + s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC, + bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; @@ -1017,7 +1021,6 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags | MS_NOSEC; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); @@ -1062,13 +1065,11 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, int (*fill_super)(struct super_block *, void *, int)) { int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); - s->s_flags = flags; - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); @@ -1091,11 +1092,10 @@ struct dentry *mount_single(struct file_system_type *fs_type, struct super_block *s; int error; - s = sget(fs_type, compare_single, set_anon_super, NULL); + s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { - s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 
1 : 0); if (error) { deactivate_locked_super(s); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index c15a7a3572e9..71eb7e253927 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -118,13 +118,12 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) info->ns[type] = kobj_ns_grab_current(type); - sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); + sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); if (IS_ERR(sb) || sb->s_fs_info != info) free_sysfs_super_info(info); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { - sb->s_flags = flags; error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(sb); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5862dd9d2784..1c766c39c038 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2136,7 +2136,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, sb_test, sb_set, c); + sb = sget(fs_type, sb_test, sb_set, flags, c); if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); @@ -2153,7 +2153,6 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, goto out_deact; } } else { - sb->s_flags = flags; err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) goto out_deact; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2f857e9eeb3a..48548bdd7722 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1914,7 +1914,7 @@ void free_anon_bdev(dev_t); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - void *data); + int flags, void *data); extern struct dentry *mount_pseudo(struct file_system_type *, char *, const struct super_operations *ops, const struct dentry_operations *dops, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0cd1314acdaf..af2b5641fc8b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1587,7 +1587,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, opts.new_root = new_root; /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_drop_root(opts.new_root); -- cgit v1.2.3-59-g8ed1b From 6b1859dba01c7d512b72d77e3fd7da8354235189 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:50 -0400 Subject: ntp: Fix STA_INS/DEL clearing bug In commit 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d, I introduced a bug that kept the STA_INS or STA_DEL bit from being cleared from time_status via adjtimex() without forcing STA_PLL first. Usually once the STA_INS is set, it isn't cleared until the leap second is applied, so its unlikely this affected anyone. However during testing I noticed it took some effort to cancel a leap second once STA_INS was set. 
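For reference, cancelling a pending leap second from user space comes down to clearing the leap bits through adjtimex(). A minimal sketch of such a test, assuming the standard <sys/timex.h> interface and CAP_SYS_TIME (not part of this patch, illustration only):

	#include <stdio.h>
	#include <sys/timex.h>

	int main(void)
	{
		struct timex tx = { 0 };

		adjtimex(&tx);			/* modes == 0: just read the current state */
		printf("status before: 0x%x\n", tx.status);

		tx.modes = ADJ_STATUS;			/* write the status word back... */
		tx.status &= ~(STA_INS | STA_DEL);	/* ...with the leap bits cleared */
		adjtimex(&tx);

		return 0;
	}

Before this fix, the TIME_INS/TIME_DEL state reached in second_overflow() (see the hunk below) was never re-checked against time_status, so a call like the one above did not cancel the leap second unless STA_PLL was toggled as well; with the fix the pending leap second is simply dropped.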
Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava CC: stable@vger.kernel.org # 3.4 Link: http://lkml.kernel.org/r/1342156917-25092-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33abcc7bb..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) time_state = TIME_DEL; break; case TIME_INS: - if (secs % 86400 == 0) { + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; time_tai++; @@ -418,7 +420,9 @@ } break; case TIME_DEL: - if ((secs + 1) % 86400 == 0) { + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { leap = 1; time_tai--; time_state = TIME_WAIT; -- cgit v1.2.3-59-g8ed1b From 42e71e81f5bb5125ca7c194b5ccf1c93511ff8fb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:51 -0400 Subject: time: Whitespace cleanups per Ingo's requests Ingo noted a number of places where there is inconsistent use of whitespace. This patch tries to address the main culprits. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 269b1fe5f2ae..c2f12aa87fce 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -24,32 +24,31 @@ /* Structure holding internal timekeeping values. */ struct timekeeper { /* Current clocksource used for timekeeping. */ - struct clocksource *clock; + struct clocksource *clock; /* NTP adjusted clock multiplier */ - u32 mult; + u32 mult; /* The shift value of the current clocksource. */ - int shift; - + int shift; /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; + cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; + u64 xtime_interval; /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; + s64 xtime_remainder; /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; + u32 raw_interval; /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ - u64 xtime_nsec; + u64 xtime_nsec; /* Difference between accumulated time and NTP time in ntp * shifted nano seconds. */ - s64 ntp_error; + s64 ntp_error; /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ - int ntp_error_shift; + int ntp_error_shift; /* The current time */ - struct timespec xtime; + struct timespec xtime; /* * wall_to_monotonic is what we need to add to xtime (or xtime corrected * for sub jiffie times) to get to monotonic time. Monotonic is pegged @@ -64,20 +63,17 @@ struct timekeeper { * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. 
*/ - struct timespec wall_to_monotonic; + struct timespec wall_to_monotonic; /* time spent in suspend */ - struct timespec total_sleep_time; + struct timespec total_sleep_time; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; - + struct timespec raw_time; /* Offset clock monotonic -> clock realtime */ - ktime_t offs_real; - + ktime_t offs_real; /* Offset clock monotonic -> clock boottime */ - ktime_t offs_boot; - + ktime_t offs_boot; /* Seqlock for all timekeeper values */ - seqlock_t lock; + seqlock_t lock; }; static struct timekeeper timekeeper; @@ -547,6 +543,7 @@ u64 timekeeping_max_deferment(void) { unsigned long seq; u64 ret; + do { seq = read_seqbegin(&timekeeper.lock); -- cgit v1.2.3-59-g8ed1b From fee84c43e6afc42295ae8058cbbef9ea5633926c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:52 -0400 Subject: time: Explicitly use u32 instead of int for shift values Ingo noted that using a u32 instead of int for shift values would be better to make sure the compiler doesn't unnecessarily use complex signed arithmetic. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-4-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c2f12aa87fce..4fd83df0b14d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -28,7 +28,7 @@ struct timekeeper { /* NTP adjusted clock multiplier */ u32 mult; /* The shift value of the current clocksource. */ - int shift; + u32 shift; /* Number of clock cycles in one NTP interval. */ cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ @@ -45,7 +45,7 @@ struct timekeeper { s64 ntp_error; /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ - int ntp_error_shift; + u32 ntp_error_shift; /* The current time */ struct timespec xtime; @@ -960,7 +960,7 @@ static void timekeeping_adjust(s64 offset) * * Returns the unconsumed cycles. */ -static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) { u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; -- cgit v1.2.3-59-g8ed1b From 1e75fa8be9fb61e1af46b5b3b176347a4c958ca1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:53 -0400 Subject: time: Condense timekeeper.xtime into xtime_sec The timekeeper struct has a xtime_nsec, which keeps the sub-nanosecond remainder. This ends up being somewhat duplicative of the timekeeper.xtime.tv_nsec value, and we have to do extra work to keep them apart, copying the full nsec portion out and back in over and over. This patch simplifies some of the logic by taking the timekeeper xtime value and splitting it into timekeeper.xtime_sec and reuses the timekeeper.xtime_nsec for the sub-second portion (stored in higher res shifted nanoseconds). This simplifies some of the accumulation logic. And will allow for more accurate timekeeping once the vsyscall code is updated to use the shifted nanosecond remainder. 
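The tk_xtime()/tk_normalize_xtime() helpers added below implement this split. As a rough standalone illustration of the shifted-nanosecond bookkeeping (plain C with made-up values, not kernel code; 'shift' stands in for the clocksource shift):

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL

	int main(void)
	{
		uint32_t shift = 10;				/* assumed clocksource shift */
		uint64_t xtime_sec = 100;
		uint64_t xtime_nsec = 999999999ULL << shift;	/* remainder kept as shifted ns */

		xtime_nsec += 3ULL << shift;			/* accumulate a few more shifted ns */

		/* normalize: carry whole seconds out of the shifted remainder */
		while (xtime_nsec >= (NSEC_PER_SEC << shift)) {
			xtime_nsec -= NSEC_PER_SEC << shift;
			xtime_sec++;
		}

		/* drop the extra resolution only when reporting the time: prints 101.000000002 */
		printf("%llu.%09llu\n", (unsigned long long)xtime_sec,
		       (unsigned long long)(xtime_nsec >> shift));
		return 0;
	}

Keeping the remainder in shifted form is what lets the accumulation path avoid converting to and from tv_nsec on every update.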
Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-5-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 181 ++++++++++++++++++++++++++++------------------ 1 file changed, 110 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4fd83df0b14d..b98d9bd73e5e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -38,8 +38,11 @@ struct timekeeper { /* Raw nano seconds accumulated per NTP interval. */ u32 raw_interval; - /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ + /* Current CLOCK_REALTIME time in seconds */ + u64 xtime_sec; + /* Clock shifted nano seconds */ u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp * shifted nano seconds. */ s64 ntp_error; @@ -47,8 +50,6 @@ struct timekeeper { * ntp shifted nano seconds. */ u32 ntp_error_shift; - /* The current time */ - struct timespec xtime; /* * wall_to_monotonic is what we need to add to xtime (or xtime corrected * for sub jiffie times) to get to monotonic time. Monotonic is pegged @@ -84,11 +85,37 @@ static struct timekeeper timekeeper; */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ + while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { + tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + tk->xtime_sec++; + } +} + +static struct timespec tk_xtime(struct timekeeper *tk) +{ + struct timespec ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); + return ts; +} +static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec = ts->tv_sec; + tk->xtime_nsec = ts->tv_nsec << tk->shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec += ts->tv_sec; + tk->xtime_nsec += ts->tv_nsec << tk->shift; +} /** * timekeeper_setup_internals - Set up internals to use clocksource clock. @@ -104,7 +131,9 @@ static void timekeeper_setup_internals(struct clocksource *clock) { cycle_t interval; u64 tmp, ntpinterval; + struct clocksource *old_clock; + old_clock = timekeeper.clock; timekeeper.clock = clock; clock->cycle_last = clock->read(clock); @@ -126,7 +155,14 @@ static void timekeeper_setup_internals(struct clocksource *clock) timekeeper.raw_interval = ((u64) interval * clock->mult) >> clock->shift; - timekeeper.xtime_nsec = 0; + /* if changing clocks, convert xtime_nsec shift units */ + if (old_clock) { + int shift_change = clock->shift - old_clock->shift; + if (shift_change < 0) + timekeeper.xtime_nsec >>= -shift_change; + else + timekeeper.xtime_nsec <<= shift_change; + } timekeeper.shift = clock->shift; timekeeper.ntp_error = 0; @@ -145,6 +181,7 @@ static inline s64 timekeeping_get_ns(void) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ clock = timekeeper.clock; @@ -153,9 +190,8 @@ static inline s64 timekeeping_get_ns(void) /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds using ntp adjusted mult. 
*/ - return clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + nsec = cycle_delta * timekeeper.mult + timekeeper.xtime_nsec; + return nsec >> timekeeper.shift; } static inline s64 timekeeping_get_ns_raw(void) @@ -185,12 +221,15 @@ static void update_rt_offset(void) /* must hold write on timekeeper.lock */ static void timekeeping_update(bool clearntp) { + struct timespec xt; + if (clearntp) { timekeeper.ntp_error = 0; ntp_clear(); } update_rt_offset(); - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + xt = tk_xtime(&timekeeper); + update_vsyscall(&xt, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -213,13 +252,12 @@ static void timekeeping_forward_now(void) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + timekeeper.xtime_nsec += cycle_delta * timekeeper.mult; /* If arch requires, add in gettimeoffset() */ - nsec += arch_gettimeoffset(); + timekeeper.xtime_nsec += arch_gettimeoffset() << timekeeper.shift; - timespec_add_ns(&timekeeper.xtime, nsec); + tk_normalize_xtime(&timekeeper); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); timespec_add_ns(&timekeeper.raw_time, nsec); @@ -234,15 +272,15 @@ static void timekeeping_forward_now(void) void getnstimeofday(struct timespec *ts) { unsigned long seq; - s64 nsecs; + s64 nsecs = 0; WARN_ON(timekeeping_suspended); do { seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; - nsecs = timekeeping_get_ns(); + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -262,11 +300,10 @@ ktime_t ktime_get(void) do { seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime.tv_sec + + secs = timekeeper.xtime_sec + timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeper.xtime.tv_nsec + + nsecs = timekeeping_get_ns() + timekeeper.wall_to_monotonic.tv_nsec; - nsecs += timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -291,22 +328,21 @@ void ktime_get_ts(struct timespec *ts) { struct timespec tomono; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(); tomono = timekeeper.wall_to_monotonic; - nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); + ts->tv_nsec += arch_gettimeoffset(); } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec + nsecs); + ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -334,7 +370,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) seq = read_seqbegin(&timekeeper.lock); *ts_raw = timekeeper.raw_time; - *ts_real = timekeeper.xtime; + ts_real->tv_sec = timekeeper.xtime_sec; + ts_real->tv_nsec = 0; nsecs_raw = timekeeping_get_ns_raw(); nsecs_real = timekeeping_get_ns(); @@ -377,7 +414,7 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { - struct timespec ts_delta; + struct timespec ts_delta, xt; unsigned long flags; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) @@ -387,12 +424,15 @@ int do_settimeofday(const struct timespec *tv) timekeeping_forward_now(); - ts_delta.tv_sec = tv->tv_sec - 
timekeeper.xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; + xt = tk_xtime(&timekeeper); + ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; + timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, ts_delta); - timekeeper.xtime = *tv; + tk_set_xtime(&timekeeper, tv); + timekeeping_update(true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -422,7 +462,8 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(); - timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); + + tk_xtime_add(&timekeeper, ts); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *ts); @@ -606,14 +647,12 @@ void __init timekeeping_init(void) clock->enable(clock); timekeeper_setup_internals(clock); - timekeeper.xtime.tv_sec = now.tv_sec; - timekeeper.xtime.tv_nsec = now.tv_nsec; + tk_set_xtime(&timekeeper, &now); timekeeper.raw_time.tv_sec = 0; timekeeper.raw_time.tv_nsec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) { - boot.tv_sec = timekeeper.xtime.tv_sec; - boot.tv_nsec = timekeeper.xtime.tv_nsec; - } + if (boot.tv_sec == 0 && boot.tv_nsec == 0) + boot = tk_xtime(&timekeeper); + set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); update_rt_offset(); @@ -646,7 +685,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) return; } - timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); + tk_xtime_add(&timekeeper, delta); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *delta); update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); @@ -742,7 +781,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); + delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -977,9 +1016,9 @@ static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) while (timekeeper.xtime_nsec >= nsecps) { int leap; timekeeper.xtime_nsec -= nsecps; - timekeeper.xtime.tv_sec++; - leap = second_overflow(timekeeper.xtime.tv_sec); - timekeeper.xtime.tv_sec += leap; + timekeeper.xtime_sec++; + leap = second_overflow(timekeeper.xtime_sec); + timekeeper.xtime_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; if (leap) clock_was_set_delayed(); @@ -1015,6 +1054,7 @@ static void update_wall_time(void) cycle_t offset; int shift = 0, maxshift; unsigned long flags; + s64 remainder; write_seqlock_irqsave(&timekeeper.lock, flags); @@ -1029,8 +1069,6 @@ static void update_wall_time(void) #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif - timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << - timekeeper.shift; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -1076,28 +1114,31 @@ static void update_wall_time(void) timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } - /* - * Store full nanoseconds into xtime after rounding it up and - * add the remainder to the error difference. 
- */ - timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> - timekeeper.shift) + 1; - timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << - timekeeper.shift; - timekeeper.ntp_error += timekeeper.xtime_nsec << - timekeeper.ntp_error_shift; + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), this can be killed. + */ + remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); + timekeeper.xtime_nsec -= remainder; + timekeeper.xtime_nsec += 1 << timekeeper.shift; + timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; /* * Finally, make sure that after the rounding - * xtime.tv_nsec isn't larger than NSEC_PER_SEC + * xtime_nsec isn't larger than NSEC_PER_SEC */ - if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { + if (unlikely(timekeeper.xtime_nsec >= + ((u64)NSEC_PER_SEC << timekeeper.shift))) { int leap; - timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; - timekeeper.xtime.tv_sec++; - leap = second_overflow(timekeeper.xtime.tv_sec); - timekeeper.xtime.tv_sec += leap; + timekeeper.xtime_nsec -= (u64)NSEC_PER_SEC << timekeeper.shift; + timekeeper.xtime_sec++; + leap = second_overflow(timekeeper.xtime_sec); + timekeeper.xtime_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; if (leap) clock_was_set_delayed(); @@ -1148,21 +1189,20 @@ void get_monotonic_boottime(struct timespec *ts) { struct timespec tomono, sleep; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(); tomono = timekeeper.wall_to_monotonic; sleep = timekeeper.total_sleep_time; - nsecs = timekeeping_get_ns(); } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); @@ -1195,13 +1235,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return timekeeper.xtime.tv_sec; + return timekeeper.xtime_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return timekeeper.xtime; + return tk_xtime(&timekeeper); } struct timespec current_kernel_time(void) @@ -1212,7 +1252,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&timekeeper.lock); - now = timekeeper.xtime; + now = tk_xtime(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); return now; @@ -1227,7 +1267,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&timekeeper.lock); - now = timekeeper.xtime; + now = tk_xtime(&timekeeper); mono = timekeeper.wall_to_monotonic; } while (read_seqretry(&timekeeper.lock, seq)); @@ -1262,7 +1302,7 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, do { seq = read_seqbegin(&timekeeper.lock); - *xtim = timekeeper.xtime; + *xtim = tk_xtime(&timekeeper); *wtom = timekeeper.wall_to_monotonic; *sleep = timekeeper.total_sleep_time; } while (read_seqretry(&timekeeper.lock, seq)); @@ -1286,9 +1326,8 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) do { 
seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime.tv_sec; - nsecs = timekeeper.xtime.tv_nsec; - nsecs += timekeeping_get_ns(); + secs = timekeeper.xtime_sec; + nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); -- cgit v1.2.3-59-g8ed1b From 1f4f948706bcec1b51bf6492bf04057d2e21e273 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:54 -0400 Subject: time: Refactor accumulation of nsecs to secs We do the exact same logic moving nsecs to secs in the timekeeper in multiple places, so condense this into a single function. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-6-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 54 ++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b98d9bd73e5e..cb4a433bab97 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -990,6 +990,35 @@ static void timekeeping_adjust(s64 offset) } +/** + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates a the nsecs greater then a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + * + */ +static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) +{ + u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + + while (tk->xtime_nsec >= nsecps) { + int leap; + + tk->xtime_nsec -= nsecps; + tk->xtime_sec++; + + /* Figure out if its a leap sec and apply if needed */ + leap = second_overflow(tk->xtime_sec); + tk->xtime_sec += leap; + tk->wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); + + } +} + + /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -1001,7 +1030,6 @@ static void timekeeping_adjust(s64 offset) */ static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; /* If the offset is smaller than a shifted interval, do nothing */ @@ -1013,16 +1041,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; - while (timekeeper.xtime_nsec >= nsecps) { - int leap; - timekeeper.xtime_nsec -= nsecps; - timekeeper.xtime_sec++; - leap = second_overflow(timekeeper.xtime_sec); - timekeeper.xtime_sec += leap; - timekeeper.wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); - } + + accumulate_nsecs_to_secs(&timekeeper); /* Accumulate raw time */ raw_nsecs = timekeeper.raw_interval << shift; @@ -1132,17 +1152,7 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - if (unlikely(timekeeper.xtime_nsec >= - ((u64)NSEC_PER_SEC << timekeeper.shift))) { - int leap; - timekeeper.xtime_nsec -= (u64)NSEC_PER_SEC << timekeeper.shift; - timekeeper.xtime_sec++; - leap = second_overflow(timekeeper.xtime_sec); - timekeeper.xtime_sec += leap; - timekeeper.wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); - } + accumulate_nsecs_to_secs(&timekeeper); timekeeping_update(false); -- cgit v1.2.3-59-g8ed1b From f2a5a0854efc62abe7f69e9947842cb135837f9a 
Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:55 -0400 Subject: time: Move arch_gettimeoffset() usage into timekeeping_get_ns() Since we call arch_gettimeoffset() in all the accessor functions, move arch_gettimeoffset() calls into timekeeping_get_ns() and timekeeping_get_ns_raw() to simplify the code. This also makes the code easier to maintain as we don't have to worry about forgetting the arch_gettimeoffset() as has happened in the past. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-7-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cb4a433bab97..e43289df28c2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -191,13 +191,17 @@ static inline s64 timekeeping_get_ns(void) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; nsec = cycle_delta * timekeeper.mult + timekeeper.xtime_nsec; - return nsec >> timekeeper.shift; + nsec >>= timekeeper.shift; + + /* If arch requires, add in gettimeoffset() */ + return nsec + arch_gettimeoffset(); } static inline s64 timekeeping_get_ns_raw(void) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ clock = timekeeper.clock; @@ -206,8 +210,11 @@ static inline s64 timekeeping_get_ns_raw(void) /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds. */ - return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + /* convert delta to nanoseconds. 
*/ + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + + /* If arch requires, add in gettimeoffset() */ + return nsec + arch_gettimeoffset(); } static void update_rt_offset(void) @@ -282,9 +289,6 @@ void getnstimeofday(struct timespec *ts) ts->tv_sec = timekeeper.xtime_sec; ts->tv_nsec = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); - } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts, nsecs); @@ -304,8 +308,6 @@ ktime_t ktime_get(void) timekeeper.wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns() + timekeeper.wall_to_monotonic.tv_nsec; - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); } while (read_seqretry(&timekeeper.lock, seq)); /* @@ -336,8 +338,6 @@ void ktime_get_ts(struct timespec *ts) ts->tv_sec = timekeeper.xtime_sec; ts->tv_nsec = timekeeping_get_ns(); tomono = timekeeper.wall_to_monotonic; - /* If arch requires, add in gettimeoffset() */ - ts->tv_nsec += arch_gettimeoffset(); } while (read_seqretry(&timekeeper.lock, seq)); @@ -365,8 +365,6 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - u32 arch_offset; - seq = read_seqbegin(&timekeeper.lock); *ts_raw = timekeeper.raw_time; @@ -376,11 +374,6 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(); nsecs_real = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - arch_offset = arch_gettimeoffset(); - nsecs_raw += arch_offset; - nsecs_real += arch_offset; - } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); @@ -1338,8 +1331,6 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) secs = timekeeper.xtime_sec; nsecs = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); *offs_real = timekeeper.offs_real; *offs_boot = timekeeper.offs_boot; -- cgit v1.2.3-59-g8ed1b From 2a8c0883c3cfffcc148ea606e2a4e7453cd75e73 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:56 -0400 Subject: time: Move xtime_nsec adjustment underflow handling timekeeping_adjust When we make adjustments speeding up the clock, its possible for xtime_nsec to underflow. We already handle this properly, but we do so from update_wall_time() instead of the more logical timekeeping_adjust(), where the possible underflow actually occurs. Thus, move the correction logic to the timekeeping_adjust, which is the function that causes the issue. Making update_wall_time() more readable. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-8-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e43289df28c2..aeeaab8cba6e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -980,6 +980,27 @@ static void timekeeping_adjust(s64 offset) timekeeper.xtime_nsec -= offset; timekeeper.ntp_error -= (interval - offset) << timekeeper.ntp_error_shift; + + /* + * It may be possible that when we entered this function, xtime_nsec + * was very small. 
Further, if we're slightly speeding the clocksource + * in the code above, its possible the required corrective factor to + * xtime_nsec could cause it to underflow. + * + * Now, since we already accumulated the second, cannot simply roll + * the accumulated second back, since the NTP subsystem has been + * notified via second_overflow. So instead we push xtime_nsec forward + * by the amount we underflowed, and add that amount into the error. + * + * We'll correct this error next time through this function, when + * xtime_nsec is not as small. + */ + if (unlikely((s64)timekeeper.xtime_nsec < 0)) { + s64 neg = -(s64)timekeeper.xtime_nsec; + timekeeper.xtime_nsec = 0; + timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; + } + } @@ -1105,27 +1126,6 @@ static void update_wall_time(void) /* correct the clock when NTP error is too big */ timekeeping_adjust(offset); - /* - * Since in the loop above, we accumulate any amount of time - * in xtime_nsec over a second into xtime.tv_sec, its possible for - * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in timekeeping_adjust(), - * its possible the required corrective factor to xtime_nsec could - * cause it to underflow. - * - * Now, we cannot simply roll the accumulated second back, since - * the NTP subsystem has been notified via second_overflow. So - * instead we push xtime_nsec forward by the amount we underflowed, - * and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. - */ - if (unlikely((s64)timekeeper.xtime_nsec < 0)) { - s64 neg = -(s64)timekeeper.xtime_nsec; - timekeeper.xtime_nsec = 0; - timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; - } /* * Store only full nanoseconds into xtime_nsec after rounding -- cgit v1.2.3-59-g8ed1b From f726a697d06102e7a1fc0a87308cb30a84580205 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:57 -0400 Subject: time: Rework timekeeping functions to take timekeeper ptr as argument As part of cleaning up the timekeeping code, this patch converts a number of internal functions to takei a timekeeper ptr as an argument, so that the internal functions don't access the global timekeeper structure directly. This allows for further optimizations to reduce lock hold time later. This patch has been updated to include more consistent usage of the timekeeper value, by making sure it is always passed as a argument to non top-level functions. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-9-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 208 +++++++++++++++++++++++----------------------- 1 file changed, 103 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index aeeaab8cba6e..5980e902978c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -127,14 +127,14 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) * * Unless you're the timekeeping code, you should not be using this! 
*/ -static void timekeeper_setup_internals(struct clocksource *clock) +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { cycle_t interval; u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = timekeeper.clock; - timekeeper.clock = clock; + old_clock = tk->clock; + tk->clock = clock; clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ @@ -147,64 +147,64 @@ static void timekeeper_setup_internals(struct clocksource *clock) tmp = 1; interval = (cycle_t) tmp; - timekeeper.cycle_interval = interval; + tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - timekeeper.xtime_interval = (u64) interval * clock->mult; - timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; - timekeeper.raw_interval = + tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_remainder = ntpinterval - tk->xtime_interval; + tk->raw_interval = ((u64) interval * clock->mult) >> clock->shift; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - timekeeper.xtime_nsec >>= -shift_change; + tk->xtime_nsec >>= -shift_change; else - timekeeper.xtime_nsec <<= shift_change; + tk->xtime_nsec <<= shift_change; } - timekeeper.shift = clock->shift; + tk->shift = clock->shift; - timekeeper.ntp_error = 0; - timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_error = 0; + tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - timekeeper.mult = clock->mult; + tk->mult = clock->mult; } /* Timekeeper helper functions. 
*/ -static inline s64 timekeeping_get_ns(void) +static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - nsec = cycle_delta * timekeeper.mult + timekeeper.xtime_nsec; - nsec >>= timekeeper.shift; + nsec = cycle_delta * tk->mult + tk->xtime_nsec; + nsec >>= tk->shift; /* If arch requires, add in gettimeoffset() */ return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns_raw(void) +static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ @@ -217,27 +217,26 @@ static inline s64 timekeeping_get_ns_raw(void) return nsec + arch_gettimeoffset(); } -static void update_rt_offset(void) +static void update_rt_offset(struct timekeeper *tk) { - struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + struct timespec tmp, *wtm = &tk->wall_to_monotonic; set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); - timekeeper.offs_real = timespec_to_ktime(tmp); + tk->offs_real = timespec_to_ktime(tmp); } /* must hold write on timekeeper.lock */ -static void timekeeping_update(bool clearntp) +static void timekeeping_update(struct timekeeper *tk, bool clearntp) { struct timespec xt; if (clearntp) { - timekeeper.ntp_error = 0; + tk->ntp_error = 0; ntp_clear(); } - update_rt_offset(); - xt = tk_xtime(&timekeeper); - update_vsyscall(&xt, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); + update_rt_offset(tk); + xt = tk_xtime(tk); + update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } @@ -248,26 +247,26 @@ static void timekeeping_update(bool clearntp) * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. 
*/ -static void timekeeping_forward_now(void) +static void timekeeping_forward_now(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - timekeeper.xtime_nsec += cycle_delta * timekeeper.mult; + tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - timekeeper.xtime_nsec += arch_gettimeoffset() << timekeeper.shift; + tk->xtime_nsec += arch_gettimeoffset() << tk->shift; - tk_normalize_xtime(&timekeeper); + tk_normalize_xtime(tk); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&timekeeper.raw_time, nsec); + timespec_add_ns(&tk->raw_time, nsec); } /** @@ -287,7 +286,7 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&timekeeper.lock); ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(); + ts->tv_nsec = timekeeping_get_ns(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); @@ -306,7 +305,7 @@ ktime_t ktime_get(void) seq = read_seqbegin(&timekeeper.lock); secs = timekeeper.xtime_sec + timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns() + + nsecs = timekeeping_get_ns(&timekeeper) + timekeeper.wall_to_monotonic.tv_nsec; } while (read_seqretry(&timekeeper.lock, seq)); @@ -336,7 +335,7 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqbegin(&timekeeper.lock); ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(); + ts->tv_nsec = timekeeping_get_ns(&timekeeper); tomono = timekeeper.wall_to_monotonic; } while (read_seqretry(&timekeeper.lock, seq)); @@ -371,8 +370,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) ts_real->tv_sec = timekeeper.xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(); - nsecs_real = timekeeping_get_ns(); + nsecs_raw = timekeeping_get_ns_raw(&timekeeper); + nsecs_real = timekeeping_get_ns(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); @@ -415,7 +414,7 @@ int do_settimeofday(const struct timespec *tv) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); xt = tk_xtime(&timekeeper); ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; @@ -426,7 +425,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(&timekeeper, tv); - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -453,14 +452,14 @@ int timekeeping_inject_offset(struct timespec *ts) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); tk_xtime_add(&timekeeper, ts); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *ts); - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -485,14 +484,14 @@ static int change_clocksource(void *data) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); if (!new->enable || new->enable(new) == 0) { old = timekeeper.clock; - timekeeper_setup_internals(new); + tk_setup_internals(&timekeeper, new); if (old->disable) old->disable(old); } - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -542,7 +541,7 @@ void getrawmonotonic(struct 
timespec *ts) do { seq = read_seqbegin(&timekeeper.lock); - nsecs = timekeeping_get_ns_raw(); + nsecs = timekeeping_get_ns_raw(&timekeeper); *ts = timekeeper.raw_time; } while (read_seqretry(&timekeeper.lock, seq)); @@ -638,7 +637,7 @@ void __init timekeeping_init(void) clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - timekeeper_setup_internals(clock); + tk_setup_internals(&timekeeper, clock); tk_set_xtime(&timekeeper, &now); timekeeper.raw_time.tv_sec = 0; @@ -648,7 +647,7 @@ void __init timekeeping_init(void) set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_rt_offset(); + update_rt_offset(&timekeeper); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -670,7 +669,8 @@ static void update_sleep_time(struct timespec t) * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ -static void __timekeeping_inject_sleeptime(struct timespec *delta) +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, + struct timespec *delta) { if (!timespec_valid(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " @@ -678,10 +678,9 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) return; } - tk_xtime_add(&timekeeper, delta); - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, *delta); - update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); + tk_xtime_add(tk, delta); + tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); + update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); } @@ -707,11 +706,11 @@ void timekeeping_inject_sleeptime(struct timespec *delta) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); - __timekeeping_inject_sleeptime(delta); + __timekeeping_inject_sleeptime(&timekeeper, delta); - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -740,7 +739,7 @@ static void timekeeping_resume(void) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(&ts); + __timekeeping_inject_sleeptime(&timekeeper, &ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); @@ -765,7 +764,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); timekeeping_suspended = 1; /* @@ -813,7 +812,8 @@ device_initcall(timekeeping_init_ops); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, + s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -829,7 +829,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 
*/ - error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -838,8 +838,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); - tick_error -= timekeeper.xtime_interval >> 1; + tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); + tick_error -= tk->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -864,9 +864,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void timekeeping_adjust(s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - s64 error, interval = timekeeper.cycle_interval; + s64 error, interval = tk->cycle_interval; int adj; /* @@ -882,7 +882,7 @@ static void timekeeping_adjust(s64 offset) * * Note: It does not "save" on aggravation when reading the code. */ - error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); + error = tk->ntp_error >> (tk->ntp_error_shift - 1); if (error > interval) { /* * We now divide error by 4(via shift), which checks if @@ -904,7 +904,8 @@ static void timekeeping_adjust(s64 offset) if (likely(error <= interval)) adj = 1; else - adj = timekeeping_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(tk, error, &interval, + &offset); } else if (error < -interval) { /* See comment above, this is just switched for the negative */ error >>= 2; @@ -913,18 +914,17 @@ static void timekeeping_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else /* No adjustment needed */ + adj = timekeeping_bigadjust(tk, error, &interval, + &offset); + } else return; - if (unlikely(timekeeper.clock->maxadj && - (timekeeper.mult + adj > - timekeeper.clock->mult + timekeeper.clock->maxadj))) { + if (unlikely(tk->clock->maxadj && + (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", - timekeeper.clock->name, (long)timekeeper.mult + adj, - (long)timekeeper.clock->mult + - timekeeper.clock->maxadj); + tk->clock->name, (long)tk->mult + adj, + (long)tk->clock->mult + tk->clock->maxadj); } /* * So the following can be confusing. @@ -975,11 +975,10 @@ static void timekeeping_adjust(s64 offset) * * XXX - TODO: Doc ntp_error calculation. */ - timekeeper.mult += adj; - timekeeper.xtime_interval += interval; - timekeeper.xtime_nsec -= offset; - timekeeper.ntp_error -= (interval - offset) << - timekeeper.ntp_error_shift; + tk->mult += adj; + tk->xtime_interval += interval; + tk->xtime_nsec -= offset; + tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; /* * It may be possible that when we entered this function, xtime_nsec @@ -995,10 +994,10 @@ static void timekeeping_adjust(s64 offset) * We'll correct this error next time through this function, when * xtime_nsec is not as small. 
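The conversion running through these hunks is mechanical but worth spelling out: every helper that used to reach directly for the file-scope timekeeper now receives the instance to operate on as an explicit struct timekeeper *tk argument, so the same body can be pointed at whichever timekeeper a caller hands in. A minimal sketch of that transformation, with invented names (struct tk_state, tk_advance_global, tk_advance) standing in for the real structures:

        #include <linux/types.h>

        struct tk_state {
                u64     xtime_nsec;
                u32     shift;
        };

        static struct tk_state timekeeper_state;

        /* before: the helper can only ever touch the one global instance */
        static void tk_advance_global(u64 delta)
        {
                timekeeper_state.xtime_nsec += delta << timekeeper_state.shift;
        }

        /* after: the caller names the instance; the body is otherwise unchanged */
        static void tk_advance(struct tk_state *tk, u64 delta)
        {
                tk->xtime_nsec += delta << tk->shift;
        }

Call sites change from tk_advance_global(d) to tk_advance(&timekeeper_state, d), which is exactly the shape of the replacements in the diff above.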
*/ - if (unlikely((s64)timekeeper.xtime_nsec < 0)) { - s64 neg = -(s64)timekeeper.xtime_nsec; - timekeeper.xtime_nsec = 0; - timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; + if (unlikely((s64)tk->xtime_nsec < 0)) { + s64 neg = -(s64)tk->xtime_nsec; + tk->xtime_nsec = 0; + tk->ntp_error += neg << tk->ntp_error_shift; } } @@ -1042,37 +1041,36 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) * * Returns the unconsumed cycles. */ -static cycle_t logarithmic_accumulation(cycle_t offset, u32 shift) +static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, + u32 shift) { u64 raw_nsecs; - /* If the offset is smaller than a shifted interval, do nothing */ - if (offset < timekeeper.cycle_interval<cycle_interval<cycle_last += timekeeper.cycle_interval << shift; + offset -= tk->cycle_interval << shift; + tk->clock->cycle_last += tk->cycle_interval << shift; - timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; - - accumulate_nsecs_to_secs(&timekeeper); + tk->xtime_nsec += tk->xtime_interval << shift; + accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = timekeeper.raw_interval << shift; - raw_nsecs += timekeeper.raw_time.tv_nsec; + raw_nsecs = tk->raw_interval << shift; + raw_nsecs += tk->raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - timekeeper.raw_time.tv_sec += raw_secs; + tk->raw_time.tv_sec += raw_secs; } - timekeeper.raw_time.tv_nsec = raw_nsecs; + tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - timekeeper.ntp_error += ntp_tick_length() << shift; - timekeeper.ntp_error -= - (timekeeper.xtime_interval + timekeeper.xtime_remainder) << - (timekeeper.ntp_error_shift + shift); + tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << + (tk->ntp_error_shift + shift); return offset; } @@ -1118,13 +1116,13 @@ static void update_wall_time(void) maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(offset, shift); + offset = logarithmic_accumulation(&timekeeper, offset, shift); if(offset < timekeeper.cycle_interval<tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(); + ts->tv_nsec = timekeeping_get_ns(&timekeeper); tomono = timekeeper.wall_to_monotonic; sleep = timekeeper.total_sleep_time; @@ -1330,7 +1328,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) seq = read_seqbegin(&timekeeper.lock); secs = timekeeper.xtime_sec; - nsecs = timekeeping_get_ns(); + nsecs = timekeeping_get_ns(&timekeeper); *offs_real = timekeeper.offs_real; *offs_boot = timekeeper.offs_boot; -- cgit v1.2.3-59-g8ed1b From 3e997130bd2e8c6f5aaa49d6e3161d4d29b43ab0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 12:50:42 -0400 Subject: timekeeping: Add missing update call in timekeeping_resume() The leap second rework unearthed another issue of inconsistent data. On timekeeping_resume() the timekeeper data is updated, but nothing calls timekeeping_update(), so now the update code in the timer interrupt sees stale values. This has been the case before those changes, but then the timer interrupt was using stale data as well so this went unnoticed for quite some time. Add the missing update call, so all the data is consistent everywhere. Reported-by: Andreas Schwab Reported-and-tested-by: "Rafael J. 
Wysocki" Reported-and-tested-by: Martin Steigerwald Cc: LKML Cc: Linux PM list Cc: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra , Cc: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 269b1fe5f2ae..3447cfaf11e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -717,6 +717,7 @@ static void timekeeping_resume(void) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; + timekeeping_update(false); write_sequnlock_irqrestore(&timekeeper.lock, flags); touch_softlockup_watchdog(); -- cgit v1.2.3-59-g8ed1b From 6575820221f7a4dd6eadecf7bf83cdd154335eda Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:26 -0700 Subject: workqueue: perform cpu down operations from low priority cpu_notifier() Currently, all workqueue cpu hotplug operations run off CPU_PRI_WORKQUEUE which is higher than normal notifiers. This is to ensure that workqueue is up and running while bringing up a CPU before other notifiers try to use workqueue on the CPU. Per-cpu workqueues are supposed to remain working and bound to the CPU for normal CPU_DOWN_PREPARE notifiers. This holds mostly true even with workqueue offlining running with higher priority because workqueue CPU_DOWN_PREPARE only creates a bound trustee thread which runs the per-cpu workqueue without concurrency management without explicitly detaching the existing workers. However, if the trustee needs to create new workers, it creates unbound workers which may wander off to other CPUs while CPU_DOWN_PREPARE notifiers are in progress. Furthermore, if the CPU down is cancelled, the per-CPU workqueue may end up with workers which aren't bound to the CPU. While reliably reproducible with a convoluted artificial test-case involving scheduling and flushing CPU burning work items from CPU down notifiers, this isn't very likely to happen in the wild, and, even when it happens, the effects are likely to be hidden by the following successful CPU down. Fix it by using different priorities for up and down notifiers - high priority for up operations and low priority for down operations. Workqueue cpu hotplug operations will soon go through further cleanup. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Acked-by: "Rafael J. Wysocki" --- include/linux/cpu.h | 5 +++-- kernel/workqueue.c | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 2e9b9ebbeb78..ce7a074f2519 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -73,8 +73,9 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - /* prepare workqueues for other notifiers */ - CPU_PRI_WORKQUEUE = 5, + /* bring up workqueues before normal notifiers and down after */ + CPU_PRI_WORKQUEUE_UP = 5, + CPU_PRI_WORKQUEUE_DOWN = -5, }; #define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4fa9e3552f1e..f59b7fd26e26 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3644,6 +3644,41 @@ err_destroy: return NOTIFY_BAD; } +/* + * Workqueues should be brought up before normal priority CPU notifiers. 
+ * This will be registered high priority CPU notifier. + */ +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + case CPU_DYING: + case CPU_POST_DEAD: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + #ifdef CONFIG_SMP struct work_for_cpu { @@ -3839,7 +3874,8 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { -- cgit v1.2.3-59-g8ed1b From f2d5a0ee06c1813f985bb9386f3ccc0d0315720f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:26 -0700 Subject: workqueue: drop CPU_DYING notifier operation Workqueue used CPU_DYING notification to mark GCWQ_DISASSOCIATED. This was necessary because workqueue's CPU_DOWN_PREPARE happened before other DOWN_PREPARE notifiers and workqueue needed to stay associated across the rest of DOWN_PREPARE. After the previous patch, workqueue's DOWN_PREPARE happens after others and can set GCWQ_DISASSOCIATED directly. Drop CPU_DYING and let the trustee set GCWQ_DISASSOCIATED after disabling concurrency management. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f59b7fd26e26..1405fb98c0b1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1250,11 +1250,11 @@ static void worker_leave_idle(struct worker *worker) * verbatim as it's best effort and blocking and gcwq may be * [dis]associated in the meantime. * - * This function tries set_cpus_allowed() and locks gcwq and verifies - * the binding against GCWQ_DISASSOCIATED which is set during - * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters - * idle state or fetches works without dropping lock, it can guarantee - * the scheduling requirement described in the first paragraph. + * This function tries set_cpus_allowed() and locks gcwq and verifies the + * binding against %GCWQ_DISASSOCIATED which is set during + * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker + * enters idle state or fetches works without dropping lock, it can + * guarantee the scheduling requirement described in the first paragraph. * * CONTEXT: * Might sleep. Called without any lock but returns with gcwq->lock @@ -3349,6 +3349,12 @@ static int __cpuinit trustee_thread(void *__gcwq) rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq)); BUG_ON(rc < 0); + /* + * We've claimed all manager positions. Make all workers unbound + * and set DISASSOCIATED. Before this, all workers except for the + * ones which are still executing works from before the last CPU + * down must be on the cpu. After this, they may become diasporas. 
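The effect of the +5/-5 priorities used for the split callbacks above can be pictured with a toy dispatcher: a notifier chain invokes callbacks in descending priority order, so an up callback at +5 acts before ordinary priority-0 subscribers on CPU_UP_PREPARE, while a down callback at -5 acts only after them on CPU_DOWN_PREPARE. A small compilable userspace sketch; only the +5/-5/0 values mirror the patch, everything else (toy_notifier, run_chain) is invented for illustration:

        #include <stdio.h>
        #include <stdbool.h>

        enum toy_event { EV_UP_PREPARE, EV_DOWN_PREPARE };

        struct toy_notifier {
                int priority;                   /* higher value runs earlier */
                const char *name;
                bool acts_on_up, acts_on_down;
        };

        /* kept sorted by descending priority, as a notifier chain would be */
        static struct toy_notifier chain[] = {
                {  5, "workqueue up callback",   true,  false },
                {  0, "ordinary notifier",       true,  true  },
                { -5, "workqueue down callback", false, true  },
        };

        static void run_chain(enum toy_event ev, const char *label)
        {
                for (unsigned int i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
                        const struct toy_notifier *n = &chain[i];

                        if ((ev == EV_UP_PREPARE && n->acts_on_up) ||
                            (ev == EV_DOWN_PREPARE && n->acts_on_down))
                                printf("%s: %s\n", label, n->name);
                }
        }

        int main(void)
        {
                run_chain(EV_UP_PREPARE, "CPU_UP_PREPARE");     /* workqueue first */
                run_chain(EV_DOWN_PREPARE, "CPU_DOWN_PREPARE"); /* workqueue last */
                return 0;
        }

Running it prints the workqueue entry first for the up event and last for the down event, which is the ordering guarantee the commit message relies on.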
+ */ for_each_worker_pool(pool, gcwq) { pool->flags |= POOL_MANAGING_WORKERS; @@ -3359,6 +3365,8 @@ static int __cpuinit trustee_thread(void *__gcwq) for_each_busy_worker(worker, i, pos, gcwq) worker->flags |= WORKER_ROGUE; + gcwq->flags |= GCWQ_DISASSOCIATED; + /* * Call schedule() so that we cross rq->lock and thus can * guarantee sched callbacks see the rogue flag. This is @@ -3582,16 +3590,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, } break; - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they'll all be diasporas. - */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; - case CPU_POST_DEAD: gcwq->trustee_state = TRUSTEE_BUTCHER; /* fall through */ @@ -3672,7 +3670,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - case CPU_DYING: case CPU_POST_DEAD: return workqueue_cpu_callback(nfb, action, hcpu); } -- cgit v1.2.3-59-g8ed1b From 403c821d452c03be4ced571ac91339a9d3631b17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: ROGUE workers are UNBOUND workers Currently, WORKER_UNBOUND is used to mark workers for the unbound global_cwq and WORKER_ROGUE is used to mark workers for disassociated per-cpu global_cwqs. Both are used to make the marked worker skip concurrency management and the only place they make any difference is in worker_enter_idle() where WORKER_ROGUE is used to skip scheduling idle timer, which can easily be replaced with trustee state testing. This patch replaces WORKER_ROGUE with WORKER_UNBOUND and drops WORKER_ROGUE. This is to prepare for removing trustee and handling disassociated global_cwqs as unbound. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1405fb98c0b1..af512927c607 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -58,13 +58,12 @@ enum { WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | + WORKER_CPU_INTENSIVE, /* gcwq->trustee_state */ TRUSTEE_START = 0, /* start */ @@ -1198,7 +1197,7 @@ static void worker_enter_idle(struct worker *worker) /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); - if (likely(!(worker->flags & WORKER_ROGUE))) { + if (likely(gcwq->trustee_state != TRUSTEE_DONE)) { if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); @@ -1207,7 +1206,7 @@ static void worker_enter_idle(struct worker *worker) /* * Sanity check nr_running. Because trustee releases gcwq->lock - * between setting %WORKER_ROGUE and zapping nr_running, the + * between setting %WORKER_UNBOUND and zapping nr_running, the * warning may trigger spuriously. Check iff trustee is idle. 
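Why the ROGUE-to-UNBOUND rename is safe to build on becomes clearer when the flag arithmetic is isolated: a worker drops out of concurrency management as soon as any bit in WORKER_NOT_RUNNING is set, so marking a disassociated CPU's workers UNBOUND is enough to take them out of the nr_running accounting. A small self-contained sketch; the flag values are copied from the patch, while toy_worker and the helper are invented:

        #include <stdbool.h>

        /* worker flag values as they stand after WORKER_ROGUE is dropped */
        enum {
                WORKER_STARTED          = 1 << 0,
                WORKER_DIE              = 1 << 1,
                WORKER_IDLE             = 1 << 2,
                WORKER_PREP             = 1 << 3,
                WORKER_REBIND           = 1 << 5,
                WORKER_CPU_INTENSIVE    = 1 << 6,
                WORKER_UNBOUND          = 1 << 7,

                WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_REBIND |
                                          WORKER_UNBOUND | WORKER_CPU_INTENSIVE,
        };

        struct toy_worker {
                unsigned int flags;
        };

        /*
         * Illustrative helper: only workers with none of the NOT_RUNNING
         * bits set count toward nr_running, so setting WORKER_UNBOUND has
         * the same accounting effect WORKER_ROGUE used to have.
         */
        static bool toy_worker_counts_as_running(const struct toy_worker *w)
        {
                return !(w->flags & WORKER_NOT_RUNNING);
        }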
*/ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && @@ -1301,10 +1300,10 @@ __acquires(&gcwq->lock) } /* - * Function for worker->rebind_work used to rebind rogue busy workers - * to the associated cpu which is coming back online. This is - * scheduled by cpu up but can race with other cpu hotplug operations - * and may be executed twice without intervening cpu down. + * Function for worker->rebind_work used to rebind unbound busy workers to + * the associated cpu which is coming back online. This is scheduled by + * cpu up but can race with other cpu hotplug operations and may be + * executed twice without intervening cpu down. */ static void worker_rebind_fn(struct work_struct *work) { @@ -1385,9 +1384,8 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); /* - * A rogue worker will become a regular one if CPU comes - * online later on. Make sure every worker has - * PF_THREAD_BOUND set. + * An unbound worker will become a regular one if CPU comes online + * later on. Make sure every worker has PF_THREAD_BOUND set. */ if (bind && !on_unbound_cpu) kthread_bind(worker->task, gcwq->cpu); @@ -3215,11 +3213,10 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running - * it with unbound (rogue) workers and allowing it to be reattached - * later if the cpu comes back online. A separate thread is created - * to govern a gcwq in such state and is called the trustee of the - * gcwq. + * This is solved by allowing a gcwq to be detached from CPU, running it + * with unbound workers and allowing it to be reattached later if the cpu + * comes back online. A separate thread is created to govern a gcwq in + * such state and is called the trustee of the gcwq. * * Trustee states and their descriptions. * @@ -3359,19 +3356,18 @@ static int __cpuinit trustee_thread(void *__gcwq) pool->flags |= POOL_MANAGING_WORKERS; list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags |= WORKER_ROGUE; + worker->flags |= WORKER_UNBOUND; } for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_ROGUE; + worker->flags |= WORKER_UNBOUND; gcwq->flags |= GCWQ_DISASSOCIATED; /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the rogue flag. This is - * necessary as scheduler callbacks may be invoked from other - * cpus. + * Call schedule() so that we cross rq->lock and thus can guarantee + * sched callbacks see the unbound flag. This is necessary as + * scheduler callbacks may be invoked from other cpus. */ spin_unlock_irq(&gcwq->lock); schedule(); @@ -3439,7 +3435,7 @@ static int __cpuinit trustee_thread(void *__gcwq) worker = create_worker(pool, false); spin_lock_irq(&gcwq->lock); if (worker) { - worker->flags |= WORKER_ROGUE; + worker->flags |= WORKER_UNBOUND; start_worker(worker); } } @@ -3488,7 +3484,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * rebinding is scheduled. 
*/ worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; + worker->flags &= ~WORKER_UNBOUND; /* queue rebind_work, wq doesn't matter, use the default one */ if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, -- cgit v1.2.3-59-g8ed1b From 6037315269d62bf967286ae2670fdd6b6acedab9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: use mutex for global_cwq manager exclusion POOL_MANAGING_WORKERS is used to ensure that at most one worker takes the manager role at any given time on a given global_cwq. Trustee later hitched on it to assume manager adding blocking wait for the bit. As trustee already needed a custom wait mechanism, waiting for MANAGING_WORKERS was rolled into the same mechanism. Trustee is scheduled to be removed. This patch separates out MANAGING_WORKERS wait into per-pool mutex. Workers use mutex_trylock() to test for manager role and trustee uses mutex_lock() to claim manager roles. gcwq_claim/release_management() helpers are added to grab and release manager roles of all pools on a global_cwq. gcwq_claim_management() always grabs pool manager mutexes in ascending pool index order and uses pool index as lockdep subclass. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 65 ++++++++++++++++++++++-------------------------------- 1 file changed, 26 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index af512927c607..f7a00697d150 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -51,7 +51,6 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -155,6 +154,7 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ + struct mutex manager_mutex; /* mutex manager should hold */ struct ida worker_ida; /* L: for worker IDs */ struct worker *first_idle; /* L: first idle worker */ }; @@ -644,7 +644,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = pool->flags & POOL_MANAGING_WORKERS; + bool managing = mutex_is_locked(&pool->manager_mutex); int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1655,14 +1655,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool) static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; bool ret = false; - if (pool->flags & POOL_MANAGING_WORKERS) + if (!mutex_trylock(&pool->manager_mutex)) return ret; pool->flags &= ~POOL_MANAGE_WORKERS; - pool->flags |= POOL_MANAGING_WORKERS; /* * Destroy and then create so that may_start_working() is true @@ -1671,15 +1669,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - pool->flags &= ~POOL_MANAGING_WORKERS; - - /* - * The trustee might be waiting to take over the manager - * position, tell it we're done. 
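The locking rule introduced by the manager mutex is easy to restate: a worker takes only its own pool's manager_mutex, and does so with a trylock, while the hotplug path claims every pool's mutex in ascending pool index order (the index also serving as the lockdep subclass, which has no direct analogue below). A rough userspace sketch of the same discipline using pthreads; NR_POOLS, try_become_manager and the other names are invented:

        #include <pthread.h>

        #define NR_POOLS 2      /* normal and highpri, as on a gcwq */

        static pthread_mutex_t manager_mutex[NR_POOLS] = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        };

        /* a worker tests for the manager role without blocking, the way
         * manage_workers() uses mutex_trylock() */
        static int try_become_manager(int pool)
        {
                return pthread_mutex_trylock(&manager_mutex[pool]) == 0;
        }

        /* the hotplug path claims every pool's manager role, always
         * locking in ascending index order */
        static void claim_management(void)
        {
                for (int i = 0; i < NR_POOLS; i++)
                        pthread_mutex_lock(&manager_mutex[i]);
        }

        static void release_management(void)
        {
                for (int i = 0; i < NR_POOLS; i++)
                        pthread_mutex_unlock(&manager_mutex[i]);
        }

A trylock never blocks, and the only path that takes more than one mutex always takes them in the same order, so no lock-ordering deadlock is possible between a would-be manager and the hotplug path.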
- */ - if (unlikely(gcwq->trustee)) - wake_up_all(&gcwq->trustee_wait); - + mutex_unlock(&pool->manager_mutex); return ret; } @@ -3255,6 +3245,24 @@ EXPORT_SYMBOL_GPL(work_busy); * ----------------> RELEASE -------------- */ +/* claim manager positions of all pools */ +static void gcwq_claim_management(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + + for_each_worker_pool(pool, gcwq) + mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); +} + +/* release manager positions */ +static void gcwq_release_management(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + + for_each_worker_pool(pool, gcwq) + mutex_unlock(&pool->manager_mutex); +} + /** * trustee_wait_event_timeout - timed event wait for trustee * @cond: condition to wait for @@ -3304,16 +3312,6 @@ EXPORT_SYMBOL_GPL(work_busy); __ret1 < 0 ? -1 : 0; \ }) -static bool gcwq_is_managing_workers(struct global_cwq *gcwq) -{ - struct worker_pool *pool; - - for_each_worker_pool(pool, gcwq) - if (pool->flags & POOL_MANAGING_WORKERS) - return true; - return false; -} - static bool gcwq_has_idle_workers(struct global_cwq *gcwq) { struct worker_pool *pool; @@ -3336,15 +3334,8 @@ static int __cpuinit trustee_thread(void *__gcwq) BUG_ON(gcwq->cpu != smp_processor_id()); + gcwq_claim_management(gcwq); spin_lock_irq(&gcwq->lock); - /* - * Claim the manager position and make all workers rogue. - * Trustee must be bound to the target cpu and can't be - * cancelled. - */ - BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq)); - BUG_ON(rc < 0); /* * We've claimed all manager positions. Make all workers unbound @@ -3352,12 +3343,9 @@ static int __cpuinit trustee_thread(void *__gcwq) * ones which are still executing works from before the last CPU * down must be on the cpu. After this, they may become diasporas. */ - for_each_worker_pool(pool, gcwq) { - pool->flags |= POOL_MANAGING_WORKERS; - + for_each_worker_pool(pool, gcwq) list_for_each_entry(worker, &pool->idle_list, entry) worker->flags |= WORKER_UNBOUND; - } for_each_busy_worker(worker, i, pos, gcwq) worker->flags |= WORKER_UNBOUND; @@ -3497,9 +3485,7 @@ static int __cpuinit trustee_thread(void *__gcwq) work_color_to_flags(WORK_NO_COLOR)); } - /* relinquish manager role */ - for_each_worker_pool(pool, gcwq) - pool->flags &= ~POOL_MANAGING_WORKERS; + gcwq_release_management(gcwq); /* notify completion */ gcwq->trustee = NULL; @@ -3894,6 +3880,7 @@ static int __init init_workqueues(void) setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, (unsigned long)pool); + mutex_init(&pool->manager_mutex); ida_init(&pool->worker_ida); } -- cgit v1.2.3-59-g8ed1b From bc2ae0f5bb2f39e6db06a62f9d353e4601a332a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: drop @bind from create_worker() Currently, create_worker()'s callers are responsible for deciding whether the newly created worker should be bound to the associated CPU and create_worker() sets WORKER_UNBOUND only for the workers for the unbound global_cwq. Creation during normal operation is always via maybe_create_worker() and @bind is true. For workers created during hotplug, @bind is false. Normal operation path is planned to be used even while the CPU is going through hotplug operations or offline and this static decision won't work. Drop @bind from create_worker() and decide whether to bind by looking at GCWQ_DISASSOCIATED. create_worker() will also set WORKER_UNBOUND autmatically if disassociated. 
To avoid flipping GCWQ_DISASSOCIATED while create_worker() is in progress, the flag is now allowed to be changed only while holding all manager_mutexes on the global_cwq. This requires that GCWQ_DISASSOCIATED is not cleared behind trustee's back. CPU_ONLINE no longer clears DISASSOCIATED before flushing trustee, which clears DISASSOCIATED before rebinding remaining workers if asked to release. For cases where trustee isn't around, CPU_ONLINE clears DISASSOCIATED after flushing trustee. Also, now, first_idle has UNBOUND set on creation which is explicitly cleared by CPU_ONLINE while binding it. These convolutions will soon be removed by further simplification of CPU hotplug path. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 64 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f7a00697d150..e1d05e51a80a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -45,7 +45,22 @@ #include "workqueue_sched.h" enum { - /* global_cwq flags */ + /* + * global_cwq flags + * + * A bound gcwq is either associated or disassociated with its CPU. + * While associated (!DISASSOCIATED), all workers are bound to the + * CPU and none has %WORKER_UNBOUND set and concurrency management + * is in effect. + * + * While DISASSOCIATED, the cpu may be offline and all workers have + * %WORKER_UNBOUND set and concurrency management disabled, and may + * be executing on any CPU. The gcwq behaves as an unbound one. + * + * Note that DISASSOCIATED can be flipped only while holding + * managership of all pools on the gcwq to avoid changing binding + * state while create_worker() is in progress. + */ GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ GCWQ_FREEZING = 1 << 1, /* freeze in progress */ @@ -1334,7 +1349,6 @@ static struct worker *alloc_worker(void) /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to - * @bind: whether to set affinity to @cpu or not * * Create a new worker which is bound to @pool. The returned worker * can be started by calling start_worker() or destroyed using @@ -1346,10 +1360,9 @@ static struct worker *alloc_worker(void) * RETURNS: * Pointer to the newly created worker. */ -static struct worker *create_worker(struct worker_pool *pool, bool bind) +static struct worker *create_worker(struct worker_pool *pool) { struct global_cwq *gcwq = pool->gcwq; - bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; const char *pri = worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1370,7 +1383,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) worker->pool = pool; worker->id = id; - if (!on_unbound_cpu) + if (gcwq->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(gcwq->cpu), "kworker/%u:%d%s", gcwq->cpu, id, pri); @@ -1384,15 +1397,19 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); /* - * An unbound worker will become a regular one if CPU comes online - * later on. Make sure every worker has PF_THREAD_BOUND set. + * Determine CPU binding of the new worker depending on + * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the + * flag remains stable across this function. See the comments + * above the flag definition for details. 
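The stability rule in that comment has a compact general form: a flag that is only ever written while holding all of a set of mutexes can be read safely while holding any one of them, which is how a caller of create_worker() that holds a pool's manager_mutex can trust %GCWQ_DISASSOCIATED for the duration of the call. A rough pthread sketch of the pattern; set_disassociated, read_disassociated and the array are invented names:

        #include <pthread.h>
        #include <stdbool.h>

        #define NR_POOLS 2

        static pthread_mutex_t manager_mutex[NR_POOLS] = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        };
        static bool disassociated;      /* stands in for GCWQ_DISASSOCIATED */

        /* writers may flip the flag only while holding every manager_mutex */
        static void set_disassociated(bool val)
        {
                for (int i = 0; i < NR_POOLS; i++)
                        pthread_mutex_lock(&manager_mutex[i]);
                disassociated = val;
                for (int i = NR_POOLS - 1; i >= 0; i--)
                        pthread_mutex_unlock(&manager_mutex[i]);
        }

        /* a reader holding any single manager_mutex sees a stable value */
        static bool read_disassociated(int pool)
        {
                bool val;

                pthread_mutex_lock(&manager_mutex[pool]);
                val = disassociated;
                pthread_mutex_unlock(&manager_mutex[pool]);
                return val;
        }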
+ * + * As an unbound worker may later become a regular one if CPU comes + * online, make sure every worker has %PF_THREAD_BOUND set. */ - if (bind && !on_unbound_cpu) + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { kthread_bind(worker->task, gcwq->cpu); - else { + } else { worker->task->flags |= PF_THREAD_BOUND; - if (on_unbound_cpu) - worker->flags |= WORKER_UNBOUND; + worker->flags |= WORKER_UNBOUND; } return worker; @@ -1568,7 +1585,7 @@ restart: while (true) { struct worker *worker; - worker = create_worker(pool, true); + worker = create_worker(pool); if (worker) { del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); @@ -3420,12 +3437,10 @@ static int __cpuinit trustee_thread(void *__gcwq) if (need_to_create_worker(pool)) { spin_unlock_irq(&gcwq->lock); - worker = create_worker(pool, false); + worker = create_worker(pool); spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_UNBOUND; + if (worker) start_worker(worker); - } } } @@ -3463,6 +3478,10 @@ static int __cpuinit trustee_thread(void *__gcwq) for_each_worker_pool(pool, gcwq) WARN_ON(!list_empty(&pool->idle_list)); + /* if we're reassociating, clear DISASSOCIATED */ + if (gcwq->trustee_state == TRUSTEE_RELEASE) + gcwq->flags &= ~GCWQ_DISASSOCIATED; + for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; @@ -3546,7 +3565,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, i = 0; for_each_worker_pool(pool, gcwq) { BUG_ON(pool->first_idle); - new_workers[i] = create_worker(pool, false); + new_workers[i] = create_worker(pool); if (!new_workers[i++]) goto err_destroy; } @@ -3584,13 +3603,19 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - gcwq->flags &= ~GCWQ_DISASSOCIATED; if (gcwq->trustee_state != TRUSTEE_DONE) { gcwq->trustee_state = TRUSTEE_RELEASE; wake_up_process(gcwq->trustee); wait_trustee_state(gcwq, TRUSTEE_DONE); } + /* + * Either DISASSOCIATED is already cleared or no worker is + * left on the gcwq. Safe to clear DISASSOCIATED without + * claiming managers. + */ + gcwq->flags &= ~GCWQ_DISASSOCIATED; + /* * Trustee is done and there might be no worker left. * Put the first_idle in and request a real manager to @@ -3601,6 +3626,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, kthread_bind(pool->first_idle->task, cpu); spin_lock_irq(&gcwq->lock); pool->flags |= POOL_MANAGE_WORKERS; + pool->first_idle->flags &= ~WORKER_UNBOUND; start_worker(pool->first_idle); pool->first_idle = NULL; } @@ -3899,7 +3925,7 @@ static int __init init_workqueues(void) for_each_worker_pool(pool, gcwq) { struct worker *worker; - worker = create_worker(pool, true); + worker = create_worker(pool); BUG_ON(!worker); spin_lock_irq(&gcwq->lock); start_worker(worker); -- cgit v1.2.3-59-g8ed1b From 25511a477657884d2164f338341fa89652610507 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: reimplement CPU online rebinding to handle idle workers Currently, if there are left workers when a CPU is being brough back online, the trustee kills all idle workers and scheduled rebind_work so that they re-bind to the CPU after the currently executing work is finished. This works for busy workers because concurrency management doesn't try to wake up them from scheduler callbacks, which require the target task to be on the local run queue. 
The busy worker bumps concurrency counter appropriately as it clears WORKER_UNBOUND from the rebind work item and it's bound to the CPU before returning to the idle state. To reduce CPU on/offlining overhead (as many embedded systems use it for powersaving) and simplify the code path, workqueue is planned to be modified to retain idle workers across CPU on/offlining. This patch reimplements CPU online rebinding such that it can also handle idle workers. As noted earlier, due to the local wakeup requirement, rebinding idle workers is tricky. All idle workers must be re-bound before scheduler callbacks are enabled. This is achieved by interlocking idle re-binding. Idle workers are requested to re-bind and then hold until all idle re-binding is complete so that no bound worker starts executing work item. Only after all idle workers are re-bound and parked, CPU_ONLINE proceeds to release them and queue rebind work item to busy workers thus guaranteeing scheduler callbacks aren't invoked until all idle workers are ready. worker_rebind_fn() is renamed to busy_worker_rebind_fn() and idle_worker_rebind() for idle workers is added. Rebinding logic is moved to rebind_workers() and now called from CPU_ONLINE after flushing trustee. While at it, add CPU sanity check in worker_thread(). Note that now a worker may become idle or the manager between trustee release and rebinding during CPU_ONLINE. As the previous patch updated create_worker() so that it can be used by regular manager while unbound and this patch implements idle re-binding, this is safe. This prepares for removal of trustee and keeping idle workers across CPU hotplugs. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 215 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 166 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1d05e51a80a..6927fecae412 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -133,6 +133,7 @@ enum { struct global_cwq; struct worker_pool; +struct idle_rebind; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -154,7 +155,10 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - struct work_struct rebind_work; /* L: rebind worker to cpu */ + + /* for rebinding worker to CPU */ + struct idle_rebind *idle_rebind; /* L: for idle worker */ + struct work_struct rebind_work; /* L: for busy worker */ }; struct worker_pool { @@ -190,6 +194,8 @@ struct global_cwq { struct worker_pool pools[2]; /* normal and highpri pools */ + wait_queue_head_t rebind_hold; /* rebind hold wait */ + struct task_struct *trustee; /* L: for gcwq shutdown */ unsigned int trustee_state; /* L: trustee state */ wait_queue_head_t trustee_wait; /* trustee wait */ @@ -1314,13 +1320,37 @@ __acquires(&gcwq->lock) } } +struct idle_rebind { + int cnt; /* # workers to be rebound */ + struct completion done; /* all workers rebound */ +}; + +/* + * Rebind an idle @worker to its CPU. During CPU onlining, this has to + * happen synchronously for idle workers. worker_thread() will test + * %WORKER_REBIND before leaving idle and call this function. 
+ */ +static void idle_worker_rebind(struct worker *worker) +{ + struct global_cwq *gcwq = worker->pool->gcwq; + + /* CPU must be online at this point */ + WARN_ON(!worker_maybe_bind_and_lock(worker)); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); + + /* we did our part, wait for rebind_workers() to finish up */ + wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); +} + /* - * Function for worker->rebind_work used to rebind unbound busy workers to + * Function for @worker->rebind.work used to rebind unbound busy workers to * the associated cpu which is coming back online. This is scheduled by * cpu up but can race with other cpu hotplug operations and may be * executed twice without intervening cpu down. */ -static void worker_rebind_fn(struct work_struct *work) +static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; @@ -1331,6 +1361,112 @@ static void worker_rebind_fn(struct work_struct *work) spin_unlock_irq(&gcwq->lock); } +/** + * rebind_workers - rebind all workers of a gcwq to the associated CPU + * @gcwq: gcwq of interest + * + * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding + * is different for idle and busy ones. + * + * The idle ones should be rebound synchronously and idle rebinding should + * be complete before any worker starts executing work items with + * concurrency management enabled; otherwise, scheduler may oops trying to + * wake up non-local idle worker from wq_worker_sleeping(). + * + * This is achieved by repeatedly requesting rebinding until all idle + * workers are known to have been rebound under @gcwq->lock and holding all + * idle workers from becoming busy until idle rebinding is complete. + * + * Once idle workers are rebound, busy workers can be rebound as they + * finish executing their current work items. Queueing the rebind work at + * the head of their scheduled lists is enough. Note that nr_running will + * be properbly bumped as busy workers rebind. + * + * On return, all workers are guaranteed to either be bound or have rebind + * work item scheduled. + */ +static void rebind_workers(struct global_cwq *gcwq) + __releases(&gcwq->lock) __acquires(&gcwq->lock) +{ + struct idle_rebind idle_rebind; + struct worker_pool *pool; + struct worker *worker; + struct hlist_node *pos; + int i; + + lockdep_assert_held(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) + lockdep_assert_held(&pool->manager_mutex); + + /* + * Rebind idle workers. Interlocked both ways. We wait for + * workers to rebind via @idle_rebind.done. Workers will wait for + * us to finish up by watching %WORKER_REBIND. 
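The interlock described here boils down to a counted rendezvous: each idle worker reports in, the onlining path waits until the count says every idle worker has rebound, and only then are they released to go busy. The kernel version additionally retries (busy workers may become idle while it waits) and tracks the hold per worker via %WORKER_REBIND; the pthread sketch below shows only the core handshake, and every name in it (idle_worker, rebind_pending, hold) is invented:

        #include <pthread.h>
        #include <stdio.h>

        #define NR_IDLE 4

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
        static int rebind_pending = NR_IDLE;    /* analog of idle_rebind.cnt */
        static int hold = 1;                    /* analog of WORKER_REBIND being set */

        static void *idle_worker(void *arg)
        {
                long id = (long)arg;

                pthread_mutex_lock(&lock);
                /* "rebind": the kernel does worker_maybe_bind_and_lock() here */
                printf("worker %ld rebound\n", id);
                if (--rebind_pending == 0)
                        pthread_cond_broadcast(&cond);  /* analog of complete() */
                /* stay parked until all idle workers have rebound */
                while (hold)
                        pthread_cond_wait(&cond, &lock);
                pthread_mutex_unlock(&lock);
                return NULL;                            /* now free to go busy */
        }

        int main(void)
        {
                pthread_t tids[NR_IDLE];

                for (long i = 0; i < NR_IDLE; i++)
                        pthread_create(&tids[i], NULL, idle_worker, (void *)i);

                pthread_mutex_lock(&lock);
                while (rebind_pending)                  /* wait_for_completion() */
                        pthread_cond_wait(&cond, &lock);
                hold = 0;                               /* clear the hold */
                pthread_cond_broadcast(&cond);          /* wake_up_all(rebind_hold) */
                pthread_mutex_unlock(&lock);

                for (int i = 0; i < NR_IDLE; i++)
                        pthread_join(tids[i], NULL);
                return 0;
        }

No worker returns from the rendezvous until the release, which mirrors why the kernel must not let any bound worker start executing work items before idle rebinding is complete.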
+ */ + init_completion(&idle_rebind.done); +retry: + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + /* set REBIND and kick idle ones, we'll wait for these later */ + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { + if (worker->flags & WORKER_REBIND) + continue; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + idle_rebind.cnt++; + worker->idle_rebind = &idle_rebind; + + /* worker_thread() will call idle_worker_rebind() */ + wake_up_process(worker->task); + } + } + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + /* busy ones might have become idle while waiting, retry */ + goto retry; + } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); + + /* rebind busy workers */ + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + /* wq doesn't matter, use the default one */ + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } +} + static struct worker *alloc_worker(void) { struct worker *worker; @@ -1339,7 +1475,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, worker_rebind_fn); + INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1829,6 +1965,9 @@ __acquires(&gcwq->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif + WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + raw_smp_processor_id() != gcwq->cpu); + /* * A single work shouldn't be executed concurrently by * multiple workers on a single cpu. Check whether anyone is @@ -1946,11 +2085,20 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&gcwq->lock); - /* DIE can be set only while we're idle, checking here is enough */ - if (worker->flags & WORKER_DIE) { + /* + * DIE can be set only while idle and REBIND set while busy has + * @worker->rebind_work scheduled. Checking here is enough. + */ + if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { spin_unlock_irq(&gcwq->lock); - worker->task->flags &= ~PF_WQ_WORKER; - return 0; + + if (worker->flags & WORKER_DIE) { + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + idle_worker_rebind(worker); + goto woke_up; } worker_leave_idle(worker); @@ -3468,42 +3616,6 @@ static int __cpuinit trustee_thread(void *__gcwq) } } while (i && rc >= 0); - /* - * At this point, either draining has completed and no worker - * is left, or cpu down has been canceled or the cpu is being - * brought back up. There shouldn't be any idle one left. 
- * Tell the remaining busy ones to rebind once it finishes the - * currently scheduled works by scheduling the rebind_work. - */ - for_each_worker_pool(pool, gcwq) - WARN_ON(!list_empty(&pool->idle_list)); - - /* if we're reassociating, clear DISASSOCIATED */ - if (gcwq->trustee_state == TRUSTEE_RELEASE) - gcwq->flags &= ~GCWQ_DISASSOCIATED; - - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; - - /* - * Rebind_work may race with future cpu hotplug - * operations. Use a separate flag to mark that - * rebinding is scheduled. - */ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_UNBOUND; - - /* queue rebind_work, wq doesn't matter, use the default one */ - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - gcwq_release_management(gcwq); /* notify completion */ @@ -3609,13 +3721,16 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, wait_trustee_state(gcwq, TRUSTEE_DONE); } - /* - * Either DISASSOCIATED is already cleared or no worker is - * left on the gcwq. Safe to clear DISASSOCIATED without - * claiming managers. - */ + spin_unlock_irq(&gcwq->lock); + gcwq_claim_management(gcwq); + spin_lock_irq(&gcwq->lock); + gcwq->flags &= ~GCWQ_DISASSOCIATED; + rebind_workers(gcwq); + + gcwq_release_management(gcwq); + /* * Trustee is done and there might be no worker left. * Put the first_idle in and request a real manager to @@ -3910,6 +4025,8 @@ static int __init init_workqueues(void) ida_init(&pool->worker_ida); } + init_waitqueue_head(&gcwq->rebind_hold); + gcwq->trustee_state = TRUSTEE_DONE; init_waitqueue_head(&gcwq->trustee_wait); } -- cgit v1.2.3-59-g8ed1b From 3ce63377305b694f53e7dd0c72907591c5344224 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: don't butcher idle workers on an offline CPU Currently, during CPU offlining, after all pending work items are drained, the trustee butchers all workers. Also, on CPU onlining failure, workqueue_cpu_callback() ensures that the first idle worker is destroyed. Combined, these guarantee that an offline CPU doesn't have any worker for it once all the lingering work items are finished. This guarantee isn't really necessary and makes CPU on/offlining more expensive than needs to be, especially for platforms which use CPU hotplug for powersaving. This patch lets offline CPUs removes idle worker butchering from the trustee and let a CPU which failed onlining keep the created first worker. The first worker is created if the CPU doesn't have any during CPU_DOWN_PREPARE and started right away. If onlining succeeds, the rebind_workers() call in CPU_ONLINE will rebind it like any other workers. If onlining fails, the worker is left alone till the next try. This makes CPU hotplugs cheaper by allowing global_cwqs to keep workers across them and simplifies code. Note that trustee doesn't re-arm idle timer when it's done and thus the disassociated global_cwq will keep all workers until it comes back online. This will be improved by further patches. Signed-off-by: Tejun Heo Acked-by: "Rafael J. 
Wysocki" --- kernel/workqueue.c | 94 ++++++++---------------------------------------------- 1 file changed, 14 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6927fecae412..acfabb22e2c4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -175,7 +175,6 @@ struct worker_pool { struct mutex manager_mutex; /* mutex manager should hold */ struct ida worker_ida; /* L: for worker IDs */ - struct worker *first_idle; /* L: first idle worker */ }; /* @@ -3477,16 +3476,6 @@ static void gcwq_release_management(struct global_cwq *gcwq) __ret1 < 0 ? -1 : 0; \ }) -static bool gcwq_has_idle_workers(struct global_cwq *gcwq) -{ - struct worker_pool *pool; - - for_each_worker_pool(pool, gcwq) - if (!list_empty(&pool->idle_list)) - return true; - return false; -} - static int __cpuinit trustee_thread(void *__gcwq) { struct global_cwq *gcwq = __gcwq; @@ -3494,7 +3483,6 @@ static int __cpuinit trustee_thread(void *__gcwq) struct worker *worker; struct work_struct *work; struct hlist_node *pos; - long rc; int i; BUG_ON(gcwq->cpu != smp_processor_id()); @@ -3597,25 +3585,6 @@ static int __cpuinit trustee_thread(void *__gcwq) break; } - /* - * Either all works have been scheduled and cpu is down, or - * cpu down has already been canceled. Wait for and butcher - * all workers till we're canceled. - */ - do { - rc = trustee_wait_event(gcwq_has_idle_workers(gcwq)); - - i = 0; - for_each_worker_pool(pool, gcwq) { - while (!list_empty(&pool->idle_list)) { - worker = list_first_entry(&pool->idle_list, - struct worker, entry); - destroy_worker(worker); - } - i |= pool->nr_workers; - } - } while (i && rc >= 0); - gcwq_release_management(gcwq); /* notify completion */ @@ -3658,10 +3627,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); struct task_struct *new_trustee = NULL; - struct worker *new_workers[NR_WORKER_POOLS] = { }; struct worker_pool *pool; unsigned long flags; - int i; action &= ~CPU_TASKS_FROZEN; @@ -3672,14 +3639,22 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, if (IS_ERR(new_trustee)) return notifier_from_errno(PTR_ERR(new_trustee)); kthread_bind(new_trustee, cpu); - /* fall through */ + break; + case CPU_UP_PREPARE: - i = 0; for_each_worker_pool(pool, gcwq) { - BUG_ON(pool->first_idle); - new_workers[i] = create_worker(pool); - if (!new_workers[i++]) - goto err_destroy; + struct worker *worker; + + if (pool->nr_workers) + continue; + + worker = create_worker(pool); + if (!worker) + return NOTIFY_BAD; + + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); } } @@ -3694,23 +3669,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, gcwq->trustee_state = TRUSTEE_START; wake_up_process(gcwq->trustee); wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - /* fall through */ - case CPU_UP_PREPARE: - i = 0; - for_each_worker_pool(pool, gcwq) { - BUG_ON(pool->first_idle); - pool->first_idle = new_workers[i++]; - } break; case CPU_POST_DEAD: gcwq->trustee_state = TRUSTEE_BUTCHER; - /* fall through */ - case CPU_UP_CANCELED: - for_each_worker_pool(pool, gcwq) { - destroy_worker(pool->first_idle); - pool->first_idle = NULL; - } break; case CPU_DOWN_FAILED: @@ -3730,39 +3692,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, rebind_workers(gcwq); gcwq_release_management(gcwq); - - /* - * Trustee is done and there might be no worker left. 
- * Put the first_idle in and request a real manager to - * take a look. - */ - for_each_worker_pool(pool, gcwq) { - spin_unlock_irq(&gcwq->lock); - kthread_bind(pool->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - pool->flags |= POOL_MANAGE_WORKERS; - pool->first_idle->flags &= ~WORKER_UNBOUND; - start_worker(pool->first_idle); - pool->first_idle = NULL; - } break; } spin_unlock_irqrestore(&gcwq->lock, flags); return notifier_from_errno(0); - -err_destroy: - if (new_trustee) - kthread_stop(new_trustee); - - spin_lock_irqsave(&gcwq->lock, flags); - for (i = 0; i < NR_WORKER_POOLS; i++) - if (new_workers[i]) - destroy_worker(new_workers[i]); - spin_unlock_irqrestore(&gcwq->lock, flags); - - return NOTIFY_BAD; } /* @@ -3775,7 +3710,6 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_CANCELED: case CPU_DOWN_FAILED: case CPU_ONLINE: return workqueue_cpu_callback(nfb, action, hcpu); -- cgit v1.2.3-59-g8ed1b From 628c78e7ea19d5b70d2b6a59030362168cdbe1ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:27 -0700 Subject: workqueue: remove CPU offline trustee With the previous changes, a disassociated global_cwq now can run as an unbound one on its own - it can create workers as necessary to drain remaining works after the CPU has been brought down and manage the number of workers using the usual idle timer mechanism making trustee completely redundant except for the actual unbinding operation. This patch removes the trustee and let a disassociated global_cwq manage itself. Unbinding is moved to a work item (for CPU affinity) which is scheduled and flushed from CPU_DONW_PREPARE. This patch moves nr_running clearing outside gcwq and manager locks to simplify the code. As nr_running is unused at the point, this is safe. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" --- kernel/workqueue.c | 288 +++++++---------------------------------------------- 1 file changed, 36 insertions(+), 252 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index acfabb22e2c4..d1545daa74ad 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -79,13 +79,6 @@ enum { WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | WORKER_CPU_INTENSIVE, - /* gcwq->trustee_state */ - TRUSTEE_START = 0, /* start */ - TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ - TRUSTEE_BUTCHER = 2, /* butcher workers */ - TRUSTEE_RELEASE = 3, /* release workers */ - TRUSTEE_DONE = 4, /* trustee is done */ - NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ @@ -100,7 +93,6 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ - TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by @@ -194,10 +186,6 @@ struct global_cwq { struct worker_pool pools[2]; /* normal and highpri pools */ wait_queue_head_t rebind_hold; /* rebind hold wait */ - - struct task_struct *trustee; /* L: for gcwq shutdown */ - unsigned int trustee_state; /* L: trustee state */ - wait_queue_head_t trustee_wait; /* trustee wait */ } ____cacheline_aligned_in_smp; /* @@ -753,11 +741,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * worklist not empty test sequence is in insert_work(). * Please read comment there. * - * NOT_RUNNING is clear. 
This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. + * NOT_RUNNING is clear. This means that we're bound to and + * running on the local cpu w/ rq lock held and preemption + * disabled, which in turn means that none else could be + * manipulating idle_list, so dereferencing idle_list without gcwq + * lock is safe. */ if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) to_wakeup = first_worker(pool); @@ -1217,19 +1205,16 @@ static void worker_enter_idle(struct worker *worker) /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); - if (likely(gcwq->trustee_state != TRUSTEE_DONE)) { - if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) - mod_timer(&pool->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } else - wake_up_all(&gcwq->trustee_wait); + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because trustee releases gcwq->lock - * between setting %WORKER_UNBOUND and zapping nr_running, the - * warning may trigger spuriously. Check iff trustee is idle. + * Sanity check nr_running. Because gcwq_unbind_fn() releases + * gcwq->lock between setting %WORKER_UNBOUND and zapping + * nr_running, the warning may trigger spuriously. Check iff + * unbind is not in progress. */ - WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && + WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && pool->nr_workers == pool->nr_idle && atomic_read(get_pool_nr_running(pool))); } @@ -3367,46 +3352,9 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running it - * with unbound workers and allowing it to be reattached later if the cpu - * comes back online. A separate thread is created to govern a gcwq in - * such state and is called the trustee of the gcwq. - * - * Trustee states and their descriptions. - * - * START Command state used on startup. On CPU_DOWN_PREPARE, a - * new trustee is started with this state. - * - * IN_CHARGE Once started, trustee will enter this state after - * assuming the manager role and making all existing - * workers rogue. DOWN_PREPARE waits for trustee to - * enter this state. After reaching IN_CHARGE, trustee - * tries to execute the pending worklist until it's empty - * and the state is set to BUTCHER, or the state is set - * to RELEASE. - * - * BUTCHER Command state which is set by the cpu callback after - * the cpu has went down. Once this state is set trustee - * knows that there will be no new works on the worklist - * and once the worklist is empty it can proceed to - * killing idle workers. - * - * RELEASE Command state which is set by the cpu callback if the - * cpu down has been canceled or it has come online - * again. After recognizing this state, trustee stops - * trying to drain or butcher and clears ROGUE, rebinds - * all remaining workers back to the cpu and releases - * manager role. - * - * DONE Trustee will enter this state after BUTCHER or RELEASE - * is complete. 
- * - * trustee CPU draining - * took over down complete - * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE - * | | ^ - * | CPU is back online v return workers | - * ----------------> RELEASE -------------- + * This is solved by allowing a gcwq to be disassociated from the CPU + * running as an unbound one and allowing it to be reattached later if the + * cpu comes back online. */ /* claim manager positions of all pools */ @@ -3427,61 +3375,11 @@ static void gcwq_release_management(struct global_cwq *gcwq) mutex_unlock(&pool->manager_mutex); } -/** - * trustee_wait_event_timeout - timed event wait for trustee - * @cond: condition to wait for - * @timeout: timeout in jiffies - * - * wait_event_timeout() for trustee to use. Handles locking and - * checks for RELEASE request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * Positive indicating left time if @cond is satisfied, 0 if timed - * out, -1 if canceled. - */ -#define trustee_wait_event_timeout(cond, timeout) ({ \ - long __ret = (timeout); \ - while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ - __ret) { \ - spin_unlock_irq(&gcwq->lock); \ - __wait_event_timeout(gcwq->trustee_wait, (cond) || \ - (gcwq->trustee_state == TRUSTEE_RELEASE), \ - __ret); \ - spin_lock_irq(&gcwq->lock); \ - } \ - gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ -}) - -/** - * trustee_wait_event - event wait for trustee - * @cond: condition to wait for - * - * wait_event() for trustee to use. Automatically handles locking and - * checks for CANCEL request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * 0 if @cond is satisfied, -1 if canceled. - */ -#define trustee_wait_event(cond) ({ \ - long __ret1; \ - __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ - __ret1 < 0 ? -1 : 0; \ -}) - -static int __cpuinit trustee_thread(void *__gcwq) +static void gcwq_unbind_fn(struct work_struct *work) { - struct global_cwq *gcwq = __gcwq; + struct global_cwq *gcwq = get_gcwq(smp_processor_id()); struct worker_pool *pool; struct worker *worker; - struct work_struct *work; struct hlist_node *pos; int i; @@ -3505,119 +3403,29 @@ static int __cpuinit trustee_thread(void *__gcwq) gcwq->flags |= GCWQ_DISASSOCIATED; + spin_unlock_irq(&gcwq->lock); + gcwq_release_management(gcwq); + /* * Call schedule() so that we cross rq->lock and thus can guarantee - * sched callbacks see the unbound flag. This is necessary as - * scheduler callbacks may be invoked from other cpus. + * sched callbacks see the %WORKER_UNBOUND flag. This is necessary + * as scheduler callbacks may be invoked from other cpus. */ - spin_unlock_irq(&gcwq->lock); schedule(); - spin_lock_irq(&gcwq->lock); /* - * Sched callbacks are disabled now. Zap nr_running. After - * this, nr_running stays zero and need_more_worker() and - * keep_working() are always true as long as the worklist is - * not empty. + * Sched callbacks are disabled now. Zap nr_running. After this, + * nr_running stays zero and need_more_worker() and keep_working() + * are always true as long as the worklist is not empty. @gcwq now + * behaves as unbound (in terms of concurrency management) gcwq + * which is served by workers tied to the CPU. + * + * On return from this function, the current worker would trigger + * unbound chain execution of pending work items if other workers + * didn't already. 
*/ for_each_worker_pool(pool, gcwq) atomic_set(get_pool_nr_running(pool), 0); - - spin_unlock_irq(&gcwq->lock); - for_each_worker_pool(pool, gcwq) - del_timer_sync(&pool->idle_timer); - spin_lock_irq(&gcwq->lock); - - /* - * We're now in charge. Notify and proceed to drain. We need - * to keep the gcwq running during the whole CPU down - * procedure as other cpu hotunplug callbacks may need to - * flush currently running tasks. - */ - gcwq->trustee_state = TRUSTEE_IN_CHARGE; - wake_up_all(&gcwq->trustee_wait); - - /* - * The original cpu is in the process of dying and may go away - * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. We - * want to get it over with ASAP - spam rescuers, wake up as - * many idlers as necessary and create new ones till the - * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezable cwqs. Don't declare - * completion while frozen. - */ - while (true) { - bool busy = false; - - for_each_worker_pool(pool, gcwq) - busy |= pool->nr_workers != pool->nr_idle; - - if (!busy && !(gcwq->flags & GCWQ_FREEZING) && - gcwq->trustee_state != TRUSTEE_IN_CHARGE) - break; - - for_each_worker_pool(pool, gcwq) { - int nr_works = 0; - - list_for_each_entry(work, &pool->worklist, entry) { - send_mayday(work); - nr_works++; - } - - list_for_each_entry(worker, &pool->idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } - - if (need_to_create_worker(pool)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(pool); - spin_lock_irq(&gcwq->lock); - if (worker) - start_worker(worker); - } - } - - /* give a breather */ - if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) - break; - } - - gcwq_release_management(gcwq); - - /* notify completion */ - gcwq->trustee = NULL; - gcwq->trustee_state = TRUSTEE_DONE; - wake_up_all(&gcwq->trustee_wait); - spin_unlock_irq(&gcwq->lock); - return 0; -} - -/** - * wait_trustee_state - wait for trustee to enter the specified state - * @gcwq: gcwq the trustee of interest belongs to - * @state: target state to wait for - * - * Wait for the trustee to reach @state. DONE is already matched. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by cpu_callback. 
- */ -static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!(gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE)) { - spin_unlock_irq(&gcwq->lock); - __wait_event(gcwq->trustee_wait, - gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE); - spin_lock_irq(&gcwq->lock); - } } static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, @@ -3626,19 +3434,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); - struct task_struct *new_trustee = NULL; struct worker_pool *pool; + struct work_struct unbind_work; unsigned long flags; action &= ~CPU_TASKS_FROZEN; switch (action) { case CPU_DOWN_PREPARE: - new_trustee = kthread_create(trustee_thread, gcwq, - "workqueue_trustee/%d\n", cpu); - if (IS_ERR(new_trustee)) - return notifier_from_errno(PTR_ERR(new_trustee)); - kthread_bind(new_trustee, cpu); + /* unbinding should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + schedule_work_on(cpu, &unbind_work); + flush_work(&unbind_work); break; case CPU_UP_PREPARE: @@ -3662,27 +3469,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, spin_lock_irqsave(&gcwq->lock, flags); switch (action) { - case CPU_DOWN_PREPARE: - /* initialize trustee and tell it to acquire the gcwq */ - BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); - gcwq->trustee = new_trustee; - gcwq->trustee_state = TRUSTEE_START; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - break; - - case CPU_POST_DEAD: - gcwq->trustee_state = TRUSTEE_BUTCHER; - break; - case CPU_DOWN_FAILED: case CPU_ONLINE: - if (gcwq->trustee_state != TRUSTEE_DONE) { - gcwq->trustee_state = TRUSTEE_RELEASE; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_DONE); - } - spin_unlock_irq(&gcwq->lock); gcwq_claim_management(gcwq); spin_lock_irq(&gcwq->lock); @@ -3727,7 +3515,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, { switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - case CPU_POST_DEAD: return workqueue_cpu_callback(nfb, action, hcpu); } return NOTIFY_OK; @@ -3960,9 +3747,6 @@ static int __init init_workqueues(void) } init_waitqueue_head(&gcwq->rebind_hold); - - gcwq->trustee_state = TRUSTEE_DONE; - init_waitqueue_head(&gcwq->trustee_wait); } /* create the initial worker */ -- cgit v1.2.3-59-g8ed1b From 8db25e7891a47e03db6f04344a9c92be16e391bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:28 -0700 Subject: workqueue: simplify CPU hotplug code With trustee gone, CPU hotplug code can be simplified. * gcwq_claim/release_management() now grab and release gcwq lock too respectively and gained _and_lock and _and_unlock postfixes. * All CPU hotplug logic was implemented in workqueue_cpu_callback() which was called by workqueue_cpu_up/down_callback() for the correct priority. This was because up and down paths shared a lot of logic, which is no longer true. Remove workqueue_cpu_callback() and move all hotplug logic into the two actual callbacks. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Acked-by: "Rafael J. 
Wysocki" --- kernel/workqueue.c | 79 +++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d1545daa74ad..471996a81633 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3358,19 +3358,21 @@ EXPORT_SYMBOL_GPL(work_busy); */ /* claim manager positions of all pools */ -static void gcwq_claim_management(struct global_cwq *gcwq) +static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) { struct worker_pool *pool; for_each_worker_pool(pool, gcwq) mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); + spin_lock_irq(&gcwq->lock); } /* release manager positions */ -static void gcwq_release_management(struct global_cwq *gcwq) +static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) { struct worker_pool *pool; + spin_unlock_irq(&gcwq->lock); for_each_worker_pool(pool, gcwq) mutex_unlock(&pool->manager_mutex); } @@ -3385,8 +3387,7 @@ static void gcwq_unbind_fn(struct work_struct *work) BUG_ON(gcwq->cpu != smp_processor_id()); - gcwq_claim_management(gcwq); - spin_lock_irq(&gcwq->lock); + gcwq_claim_management_and_lock(gcwq); /* * We've claimed all manager positions. Make all workers unbound @@ -3403,8 +3404,7 @@ static void gcwq_unbind_fn(struct work_struct *work) gcwq->flags |= GCWQ_DISASSOCIATED; - spin_unlock_irq(&gcwq->lock); - gcwq_release_management(gcwq); + gcwq_release_management_and_unlock(gcwq); /* * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3428,26 +3428,19 @@ static void gcwq_unbind_fn(struct work_struct *work) atomic_set(get_pool_nr_running(pool), 0); } -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. + */ +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); struct worker_pool *pool; - struct work_struct unbind_work; - unsigned long flags; - - action &= ~CPU_TASKS_FROZEN; - - switch (action) { - case CPU_DOWN_PREPARE: - /* unbinding should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); - schedule_work_on(cpu, &unbind_work); - flush_work(&unbind_work); - break; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: for_each_worker_pool(pool, gcwq) { struct worker *worker; @@ -3463,45 +3456,16 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, start_worker(worker); spin_unlock_irq(&gcwq->lock); } - } - - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); + break; - switch (action) { case CPU_DOWN_FAILED: case CPU_ONLINE: - spin_unlock_irq(&gcwq->lock); - gcwq_claim_management(gcwq); - spin_lock_irq(&gcwq->lock); - + gcwq_claim_management_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; - rebind_workers(gcwq); - - gcwq_release_management(gcwq); + gcwq_release_management_and_unlock(gcwq); break; } - - spin_unlock_irqrestore(&gcwq->lock, flags); - - return notifier_from_errno(0); -} - -/* - * Workqueues should be brought up before normal priority CPU notifiers. - * This will be registered high priority CPU notifier. 
- */ -static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: - case CPU_ONLINE: - return workqueue_cpu_callback(nfb, action, hcpu); - } return NOTIFY_OK; } @@ -3513,9 +3477,16 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { + unsigned int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - return workqueue_cpu_callback(nfb, action, hcpu); + /* unbinding should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + schedule_work_on(cpu, &unbind_work); + flush_work(&unbind_work); + break; } return NOTIFY_OK; } -- cgit v1.2.3-59-g8ed1b From 11388c87d2abca1f01975ced28ce9eacea239104 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 19 Jul 2012 00:00:58 +0200 Subject: PM / Sleep: Require CAP_BLOCK_SUSPEND to use wake_lock/wake_unlock Require processes wanting to use the wake_lock/wake_unlock sysfs files to have the CAP_BLOCK_SUSPEND capability, which also is required for the eventpoll EPOLLWAKEUP flag to be effective, so that all interfaces related to blocking autosleep depend on the same capability. Signed-off-by: Rafael J. Wysocki Cc: stable@vger.kernel.org Acked-by: Michael Kerrisk --- kernel/power/wakelock.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index c8fba3380076..8f50de394d22 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -9,6 +9,7 @@ * manipulate wakelocks on Android. */ +#include #include #include #include @@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf) size_t len; int ret = 0; + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + while (*str && !isspace(*str)) str++; @@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf) size_t len; int ret = 0; + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + len = strlen(buf); if (!len) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From eea03c20ae38a55405c0865ed9adfccc400e4c8e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 Jul 2012 18:15:46 -0700 Subject: Make wait_for_device_probe() also do scsi_complete_async_scans() Commit a7a20d103994 ("sd: limit the scope of the async probe domain") make the SCSI device probing run device discovery in it's own async domain. However, as a result, the partition detection was no longer synchronized by async_synchronize_full() (which, despite the name, only synchronizes the global async space, not all of them). Which in turn meant that "wait_for_device_probe()" would not wait for the SCSI partitions to be parsed. And "wait_for_device_probe()" was what the boot time init code relied on for mounting the root filesystem. Now, most people never noticed this, because not only is it timing-dependent, but modern distributions all use initrd. So the root filesystem isn't actually on a disk at all. And then before they actually mount the final disk filesystem, they will have loaded the scsi-wait-scan module, which not only does the expected wait_for_device_probe(), but also does scsi_complete_async_scans(). 
[ Side note: scsi_complete_async_scans() had also been partially broken, but that was fixed in commit 43a8d39d0137 ("fix async probe regression"), so that same commit a7a20d103994 had actually broken setups even if you used scsi-wait-scan explicitly ] Solve this problem by just moving the scsi_complete_async_scans() call into wait_for_device_probe(). Everybody who wants to wait for device probing to finish really wants the SCSI probing to complete, so there's no reason not to do this. So now "wait_for_device_probe()" really does what the name implies, and properly waits for device probing to finish. This also removes the now unnecessary extra calls to scsi_complete_async_scans(). Reported-and-tested-by: Artem S. Tashkinov Cc: Dan Williams Cc: Alan Stern Cc: James Bottomley Cc: Borislav Petkov Cc: linux-scsi Signed-off-by: Linus Torvalds --- drivers/base/dd.c | 2 ++ drivers/scsi/scsi_wait_scan.c | 5 ----- include/linux/device.h | 2 -- kernel/power/hibernate.c | 8 -------- kernel/power/user.c | 2 -- 5 files changed, 2 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index dcb8a6e48692..4b01ab3d2c24 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "base.h" #include "power/power.h" @@ -332,6 +333,7 @@ void wait_for_device_probe(void) /* wait for the known devices to complete their probing */ wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); + scsi_complete_async_scans(); } EXPORT_SYMBOL_GPL(wait_for_device_probe); diff --git a/drivers/scsi/scsi_wait_scan.c b/drivers/scsi/scsi_wait_scan.c index ae7814874618..072734538876 100644 --- a/drivers/scsi/scsi_wait_scan.c +++ b/drivers/scsi/scsi_wait_scan.c @@ -22,11 +22,6 @@ static int __init wait_scan_init(void) * and might not yet have reached the scsi async scanning */ wait_for_device_probe(); - /* - * and then we wait for the actual asynchronous scsi scan - * to finish. - */ - scsi_complete_async_scans(); return 0; } diff --git a/include/linux/device.h b/include/linux/device.h index 161d96241b1b..6de94151ff6f 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -865,8 +865,6 @@ extern int (*platform_notify_remove)(struct device *dev); extern struct device *get_device(struct device *dev); extern void put_device(struct device *dev); -extern void wait_for_device_probe(void); - #ifdef CONFIG_DEVTMPFS extern int devtmpfs_create_node(struct device *dev); extern int devtmpfs_delete_node(struct device *dev); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8b53db38a279..238025f5472e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -27,7 +27,6 @@ #include #include #include -#include #include "power.h" @@ -748,13 +747,6 @@ static int software_resume(void) async_synchronize_full(); } - /* - * We can't depend on SCSI devices being available after loading - * one of their modules until scsi_complete_async_scans() is - * called and the resume device usually is a SCSI one. - */ - scsi_complete_async_scans(); - swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { error = -ENODEV; diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -24,7 +24,6 @@ #include #include #include -#include #include @@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) * appear. 
*/ wait_for_device_probe(); - scsi_complete_async_scans(); data->swap = -1; data->mode = O_WRONLY; -- cgit v1.2.3-59-g8ed1b From 2955b47d2c1983998a8c5915cb96884e67f7cb53 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 Jul 2012 19:33:25 -0700 Subject: [SCSI] async: introduce 'async_domain' type This is in preparation for teaching async_synchronize_full() to sync all pending async work, and not just on the async_running domain. This conversion is functionally equivalent, just embedding the existing list in a new async_domain type. The .registered attribute is used in a later patch to distinguish between domains that want to be flushed by async_synchronize_full() versus those that only expect async_synchronize_{full|cookie}_domain to be used for flushing. [jejb: add async.h to scsi_priv.h for struct async_domain] Signed-off-by: Dan Williams Acked-by: Arjan van de Ven Acked-by: Mark Brown Tested-by: Eldad Zack Signed-off-by: James Bottomley --- drivers/regulator/core.c | 2 +- drivers/scsi/libsas/sas_ata.c | 2 +- drivers/scsi/scsi.c | 3 ++- drivers/scsi/scsi_priv.h | 3 ++- include/linux/async.h | 35 +++++++++++++++++++++++++++++++---- kernel/async.c | 35 +++++++++++++++++------------------ sound/soc/soc-dapm.c | 2 +- 7 files changed, 55 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 8b4b3829d9e7..6c74546fc3cd 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2744,7 +2744,7 @@ static void regulator_bulk_enable_async(void *data, async_cookie_t cookie) int regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers) { - LIST_HEAD(async_domain); + ASYNC_DOMAIN_EXCLUSIVE(async_domain); int i; int ret = 0; diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index bec3bc8aab0c..a59fcdc8fd63 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -742,7 +742,7 @@ static void async_sas_ata_eh(void *data, async_cookie_t cookie) void sas_ata_strategy_handler(struct Scsi_Host *shost) { struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost); - LIST_HEAD(async); + ASYNC_DOMAIN_EXCLUSIVE(async); int i; /* it's ok to defer revalidation events during ata eh, these diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index bbbc9c918d4c..4cade886a50a 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -91,7 +92,7 @@ EXPORT_SYMBOL(scsi_logging_level); #endif /* sd, scsi core and power management need to coordinate flushing async actions */ -LIST_HEAD(scsi_sd_probe_domain); +ASYNC_DOMAIN(scsi_sd_probe_domain); EXPORT_SYMBOL(scsi_sd_probe_domain); /* NB: These are exposed through /proc/scsi/scsi and form part of the ABI. 
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h index 13d74da5dfab..8f9a0cadc296 100644 --- a/drivers/scsi/scsi_priv.h +++ b/drivers/scsi/scsi_priv.h @@ -2,6 +2,7 @@ #define _SCSI_PRIV_H #include +#include #include struct request_queue; @@ -163,7 +164,7 @@ static inline int scsi_autopm_get_host(struct Scsi_Host *h) { return 0; } static inline void scsi_autopm_put_host(struct Scsi_Host *h) {} #endif /* CONFIG_PM_RUNTIME */ -extern struct list_head scsi_sd_probe_domain; +extern struct async_domain scsi_sd_probe_domain; /* * internal scsi timeout functions: for use by mid-layer and transport diff --git a/include/linux/async.h b/include/linux/async.h index 68a9530196f2..364e7ff16c08 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -9,19 +9,46 @@ * as published by the Free Software Foundation; version 2 * of the License. */ +#ifndef __ASYNC_H__ +#define __ASYNC_H__ #include #include typedef u64 async_cookie_t; typedef void (async_func_ptr) (void *data, async_cookie_t cookie); +struct async_domain { + struct list_head node; + struct list_head domain; + int count; + unsigned registered:1; +}; + +/* + * domain participates in global async_synchronize_full + */ +#define ASYNC_DOMAIN(_name) \ + struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \ + .domain = LIST_HEAD_INIT(_name.domain), \ + .count = 0, \ + .registered = 1 } + +/* + * domain is free to go out of scope as soon as all pending work is + * complete, this domain does not participate in async_synchronize_full + */ +#define ASYNC_DOMAIN_EXCLUSIVE(_name) \ + struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \ + .domain = LIST_HEAD_INIT(_name.domain), \ + .count = 0, \ + .registered = 0 } extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *list); + struct async_domain *domain); extern void async_synchronize_full(void); -extern void async_synchronize_full_domain(struct list_head *list); +extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); extern void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *list); - + struct async_domain *domain); +#endif diff --git a/kernel/async.c b/kernel/async.c index bd0c168a3bbe..ba5491dfa991 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -62,7 +62,7 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 static LIST_HEAD(async_pending); -static LIST_HEAD(async_running); +static ASYNC_DOMAIN(async_running); static DEFINE_SPINLOCK(async_lock); struct async_entry { @@ -71,7 +71,7 @@ struct async_entry { async_cookie_t cookie; async_func_ptr *func; void *data; - struct list_head *running; + struct async_domain *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); @@ -82,13 +82,12 @@ static atomic_t entry_count; /* * MUST be called with the lock held! 
*/ -static async_cookie_t __lowest_in_progress(struct list_head *running) +static async_cookie_t __lowest_in_progress(struct async_domain *running) { struct async_entry *entry; - if (!list_empty(running)) { - entry = list_first_entry(running, - struct async_entry, list); + if (!list_empty(&running->domain)) { + entry = list_first_entry(&running->domain, typeof(*entry), list); return entry->cookie; } @@ -99,7 +98,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; /* "infinity" value */ } -static async_cookie_t lowest_in_progress(struct list_head *running) +static async_cookie_t lowest_in_progress(struct async_domain *running) { unsigned long flags; async_cookie_t ret; @@ -119,10 +118,11 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; + struct async_domain *running = entry->running; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, entry->running); + list_move_tail(&entry->list, &running->domain); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ @@ -156,7 +156,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) { struct async_entry *entry; unsigned long flags; @@ -223,7 +223,7 @@ EXPORT_SYMBOL_GPL(async_schedule); * Note: This function may be called from atomic or non-atomic contexts. */ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *running) + struct async_domain *running) { return __async_schedule(ptr, data, running); } @@ -238,20 +238,20 @@ void async_synchronize_full(void) { do { async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running) || !list_empty(&async_pending)); + } while (!list_empty(&async_running.domain) || !list_empty(&async_pending)); } EXPORT_SYMBOL_GPL(async_synchronize_full); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @list: running list to synchronize on + * @domain: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list have been done. + * synchronization domain specified by the running list @domain have been done. */ -void async_synchronize_full_domain(struct list_head *list) +void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); @@ -261,11 +261,10 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); * @running: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list submitted + * synchronization domain specified by running list @running submitted * prior to @cookie have been done. 
*/ -void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) { ktime_t uninitialized_var(starttime), delta, endtime; diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 89eae93445cf..fa1e31206892 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1545,7 +1545,7 @@ static int dapm_power_widgets(struct snd_soc_dapm_context *dapm, int event) struct snd_soc_dapm_context *d; LIST_HEAD(up_list); LIST_HEAD(down_list); - LIST_HEAD(async_domain); + ASYNC_DOMAIN_EXCLUSIVE(async_domain); enum snd_soc_bias_level bias; trace_snd_soc_dapm_start(card); -- cgit v1.2.3-59-g8ed1b From a4683487f90bfe3049686fc5c566bdc1ad03ace6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 Jul 2012 19:33:30 -0700 Subject: [SCSI] async: make async_synchronize_full() flush all work regardless of domain In response to an async related regression James noted: "My theory is that this is an init problem: The assumption in a lot of our code is that async_synchronize_full() waits for everything ... even the domain specific async schedules, which isn't true." ...so make this assumption true. Each domain, including the default one, registers itself on a global domain list when work is scheduled. Once all entries complete it exits that list. Waiting for the list to be empty syncs all in-flight work across all domains. Domains can opt-out of global syncing if they are declared as exclusive ASYNC_DOMAIN_EXCLUSIVE(). All stack-based domains have been declared exclusive since the domain may go out of scope as soon as the last work item completes. Statically declared domains are mostly ok, but async_unregister_domain() is there to close any theoretical races with pending async_synchronize_full waiters at module removal time. 
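[Editor's note] For readers unfamiliar with the interface these two async patches introduce, here is a minimal, hypothetical sketch of how a built-in driver could use a private async domain. The domain, struct and function names (my_probe_domain, struct my_dev, my_register(), my_teardown()) are invented for illustration; ASYNC_DOMAIN(), async_schedule_domain(), async_synchronize_full_domain() and async_unregister_domain() are the calls shown in the diffs above.

#include <linux/async.h>
#include <linux/printk.h>

/* Registered domain: also flushed by async_synchronize_full().
 * A stack-local domain would use ASYNC_DOMAIN_EXCLUSIVE() instead. */
static ASYNC_DOMAIN(my_probe_domain);

struct my_dev {
	int id;
};

static void my_async_probe(void *data, async_cookie_t cookie)
{
	struct my_dev *dev = data;

	/* slow probing work runs here, off the registration path */
	pr_info("probed device %d (cookie %llu)\n",
		dev->id, (unsigned long long)cookie);
}

static void my_register(struct my_dev *dev)
{
	async_schedule_domain(my_async_probe, dev, &my_probe_domain);
}

static void my_teardown(void)
{
	/* wait for work queued on this domain only ... */
	async_synchronize_full_domain(&my_probe_domain);
	/* ... then drop it from the global flush list on module unload */
	async_unregister_domain(&my_probe_domain);
}

With the second patch applied, a plain async_synchronize_full() would also wait for my_probe_domain, since ASYNC_DOMAIN() declares a registered (non-exclusive) domain.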
Signed-off-by: Dan Williams Acked-by: Arjan van de Ven Reported-by: Meelis Roos Reported-by: Eldad Zack Tested-by: Eldad Zack Signed-off-by: James Bottomley --- drivers/scsi/scsi.c | 1 + include/linux/async.h | 1 + kernel/async.c | 43 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 4cade886a50a..2936b447cae9 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -1355,6 +1355,7 @@ static void __exit exit_scsi(void) scsi_exit_devinfo(); scsi_exit_procfs(); scsi_exit_queue(); + async_unregister_domain(&scsi_sd_probe_domain); } subsys_initcall(init_scsi); diff --git a/include/linux/async.h b/include/linux/async.h index 364e7ff16c08..7a24fe9b44b4 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -46,6 +46,7 @@ struct async_domain { extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, struct async_domain *domain); +void async_unregister_domain(struct async_domain *domain); extern void async_synchronize_full(void); extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); diff --git a/kernel/async.c b/kernel/async.c index ba5491dfa991..9d3118384858 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -63,7 +63,9 @@ static async_cookie_t next_cookie = 1; static LIST_HEAD(async_pending); static ASYNC_DOMAIN(async_running); +static LIST_HEAD(async_domains); static DEFINE_SPINLOCK(async_lock); +static DEFINE_MUTEX(async_register_mutex); struct async_entry { struct list_head list; @@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); + if (running->registered && --running->count == 0) + list_del_init(&running->node); /* 4) free the entry */ kfree(entry); @@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a spin_lock_irqsave(&async_lock, flags); newcookie = entry->cookie = next_cookie++; list_add_tail(&entry->list, &async_pending); + if (running->registered && running->count++ == 0) + list_add_tail(&running->node, &async_domains); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -236,12 +242,42 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { + mutex_lock(&async_register_mutex); do { - async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running.domain) || !list_empty(&async_pending)); + struct async_domain *domain = NULL; + + spin_lock_irq(&async_lock); + if (!list_empty(&async_domains)) + domain = list_first_entry(&async_domains, typeof(*domain), node); + spin_unlock_irq(&async_lock); + + async_synchronize_cookie_domain(next_cookie, domain); + } while (!list_empty(&async_domains)); + mutex_unlock(&async_register_mutex); } EXPORT_SYMBOL_GPL(async_synchronize_full); +/** + * async_unregister_domain - ensure no more anonymous waiters on this domain + * @domain: idle domain to flush out of any async_synchronize_full instances + * + * async_synchronize_{cookie|full}_domain() are not flushed since callers + * of these routines should know the lifetime of @domain + * + * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing + */ +void async_unregister_domain(struct async_domain *domain) +{ + mutex_lock(&async_register_mutex); + 
spin_lock_irq(&async_lock); + WARN_ON(!domain->registered || !list_empty(&domain->node) || + !list_empty(&domain->domain)); + domain->registered = 0; + spin_unlock_irq(&async_lock); + mutex_unlock(&async_register_mutex); +} +EXPORT_SYMBOL_GPL(async_unregister_domain); + /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain * @domain: running list to synchronize on @@ -268,6 +304,9 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; + if (!running) + return; + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); -- cgit v1.2.3-59-g8ed1b From bc792e612e78a24ae0b30cc5b85f2368379ba4d4 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 20 Jul 2012 17:27:37 -0700 Subject: kdb: Revive dmesg command The kgdb dmesg command is broken after the printk rework. The old logic in kdb code makes no sense in terms of current printk/logging storage format, and KDB simply hangs forever. This patch revives the command by switching to kmsg_dumper iterator. The code is now much more simpler and shorter. Signed-off-by: Anton Vorontsov Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_main.c | 91 ++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 67b847dfa2bb..df17c935d3c6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv) */ static int kdb_dmesg(int argc, const char **argv) { - char *syslog_data[4], *start, *end, c = '\0', *p; - int diag, logging, logsize, lines = 0, adjust = 0, n; + int diag; + int logging; + int lines = 0; + int adjust = 0; + int n = 0; + int skip = 0; + struct kmsg_dumper dumper = { .active = 1 }; + size_t len; + char buf[201]; if (argc > 2) return KDB_ARGCOUNT; @@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] - * logical start, end+1. 
*/ - kdb_syslog_data(syslog_data); - if (syslog_data[2] == syslog_data[3]) - return 0; - logsize = syslog_data[1] - syslog_data[0]; - start = syslog_data[2]; - end = syslog_data[3]; -#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) - for (n = 0, p = start; p < end; ++p) { - c = *KDB_WRAP(p); - if (c == '\n') - ++n; - } - if (c != '\n') - ++n; + kmsg_dump_rewind(&dumper); + while (kmsg_dump_get_line(&dumper, 1, NULL, 0, NULL)) + n++; + if (lines < 0) { if (adjust >= n) kdb_printf("buffer only contains %d lines, nothing " @@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv) else if (adjust - lines >= n) kdb_printf("buffer only contains %d lines, last %d " "lines printed\n", n, n - adjust); - if (adjust) { - for (; start < end && adjust; ++start) { - if (*KDB_WRAP(start) == '\n') - --adjust; - } - if (start < end) - ++start; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - ++lines; - } - end = p; + skip = adjust; + lines = abs(lines); } else if (lines > 0) { - int skip = n - (adjust + lines); + skip = n - lines - adjust; + lines = abs(lines); if (adjust >= n) { kdb_printf("buffer only contains %d lines, " "nothing printed\n", n); @@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv) kdb_printf("buffer only contains %d lines, first " "%d lines printed\n", n, lines); } - for (; start < end && skip; ++start) { - if (*KDB_WRAP(start) == '\n') - --skip; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - --lines; - } - end = p; + } else { + lines = n; } - /* Do a line at a time (max 200 chars) to reduce protocol overhead */ - c = '\n'; - while (start != end) { - char buf[201]; - p = buf; - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - while (start < end && (c = *KDB_WRAP(start)) && - (p - buf) < sizeof(buf)-1) { - ++start; - *p++ = c; - if (c == '\n') - break; + + if (skip >= n || skip < 0) + return 0; + + kmsg_dump_rewind(&dumper); + while (kmsg_dump_get_line(&dumper, 1, buf, sizeof(buf), &len)) { + if (skip) { + skip--; + continue; } - *p = '\0'; - kdb_printf("%s", buf); + if (!lines--) + break; + + kdb_printf("%.*s\n", (int)len - 1, buf); } - if (c != '\n') - kdb_printf("\n"); return 0; } -- cgit v1.2.3-59-g8ed1b From 1b499d05eecbe04969516717a8e15afb6ad80689 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 20 Jul 2012 17:27:54 -0700 Subject: printk: Remove kdb_syslog_data The function is no longer needed, so remove it. 
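[Editor's note] As a rough illustration of the kmsg_dumper iterator that the reworked kdb dmesg command above is built on, a hedged sketch that counts the records in the log buffer the same way the kdb code does. count_kmsg_lines() is an invented helper, and initializing .active = 1 by hand mirrors what kdb does for a dumper that is never registered; passing a real buffer, size and length pointer instead of NULLs retrieves the record text, as the second kdb loop does.

#include <linux/kmsg_dump.h>

static int count_kmsg_lines(void)
{
	struct kmsg_dumper dumper = { .active = 1 };	/* not registered, used ad hoc */
	int n = 0;

	/* start at the oldest record and walk forward, one record per call */
	kmsg_dump_rewind(&dumper);
	while (kmsg_dump_get_line(&dumper, true, NULL, 0, NULL))
		n++;

	return n;
}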
Signed-off-by: Anton Vorontsov Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_private.h | 1 - kernel/printk.c | 15 --------------- 2 files changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 47c4e56e513b..392ec6a25844 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -205,7 +205,6 @@ extern char kdb_grep_string[]; extern int kdb_grep_leading; extern int kdb_grep_trailing; extern char *kdb_cmds[]; -extern void kdb_syslog_data(char *syslog_data[]); extern unsigned long kdb_task_state_string(const char *); extern char kdb_task_state_char (const struct task_struct *); extern unsigned long kdb_task_state(const struct task_struct *p, diff --git a/kernel/printk.c b/kernel/printk.c index 177fa49357a5..c8129678dfbf 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1192,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -#ifdef CONFIG_KGDB_KDB -/* kdb dmesg command needs access to the syslog buffer. do_syslog() - * uses locks so it cannot be used during debugging. Just tell kdb - * where the start and end of the physical and logical logs are. This - * is equivalent to do_syslog(3). - */ -void kdb_syslog_data(char *syslog_data[4]) -{ - syslog_data[0] = log_buf; - syslog_data[1] = log_buf + log_buf_len; - syslog_data[2] = log_buf + log_first_idx; - syslog_data[3] = log_buf + log_next_idx; -} -#endif /* CONFIG_KGDB_KDB */ - static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) -- cgit v1.2.3-59-g8ed1b From 533827c921c34310f63e859e1d6d0feec439657d Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 20 Jul 2012 17:28:07 -0700 Subject: printk: Implement some unlocked kmsg_dump functions If used from KDB, the locked variants are prone to deadlocks (suppose we got to the debugger w/ the logbuf lock held). So, we have to implement a few routines that grab no logbuf lock. Yet we don't need these functions in modules, so we don't export them. 
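[Editor's note] The split applied here follows a common kernel pattern: keep the real work in a *_nolock helper for callers that already own (or must not take) the lock, and make the exported function a thin locking wrapper. A generic, hypothetical sketch of that pattern; the stats_* names are invented and unrelated to printk.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(stats_lock);
static unsigned long stats_count;

/* Caller must already hold stats_lock (e.g. a debugger path that owns it). */
static unsigned long stats_read_nolock(void)
{
	return stats_count;
}

/* Convenience wrapper for ordinary callers. */
static unsigned long stats_read(void)
{
	unsigned long flags, val;

	spin_lock_irqsave(&stats_lock, flags);
	val = stats_read_nolock();
	spin_unlock_irqrestore(&stats_lock, flags);

	return val;
}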
Signed-off-by: Anton Vorontsov Signed-off-by: Linus Torvalds --- include/linux/kmsg_dump.h | 16 +++++++++++ kernel/printk.c | 68 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 71 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index d6bd50110ec2..2e7a1e032c71 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -55,12 +55,17 @@ struct kmsg_dumper { #ifdef CONFIG_PRINTK void kmsg_dump(enum kmsg_dump_reason reason); +bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len); + bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, char *line, size_t size, size_t *len); bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, char *buf, size_t size, size_t *len); +void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper); + void kmsg_dump_rewind(struct kmsg_dumper *dumper); int kmsg_dump_register(struct kmsg_dumper *dumper); @@ -71,6 +76,13 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) { } +static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, + bool syslog, const char *line, + size_t size, size_t *len) +{ + return false; +} + static inline bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, const char *line, size_t size, size_t *len) { @@ -83,6 +95,10 @@ static inline bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, return false; } +static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +{ +} + static inline void kmsg_dump_rewind(struct kmsg_dumper *dumper) { } diff --git a/kernel/printk.c b/kernel/printk.c index c8129678dfbf..ac4bc9e79465 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -2510,7 +2510,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) } /** - * kmsg_dump_get_line - retrieve one kmsg log line + * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) * @dumper: registered kmsg dumper * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to @@ -2525,11 +2525,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) * * A return value of FALSE indicates that there are no more records to * read. + * + * The function is similar to kmsg_dump_get_line(), but grabs no locks. 
*/ -bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) +bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) { - unsigned long flags; struct log *msg; size_t l = 0; bool ret = false; @@ -2537,7 +2538,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, if (!dumper->active) goto out; - raw_spin_lock_irqsave(&logbuf_lock, flags); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; @@ -2545,10 +2545,8 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, } /* last entry */ - if (dumper->cur_seq >= log_next_seq) { - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + if (dumper->cur_seq >= log_next_seq) goto out; - } msg = log_from_idx(dumper->cur_idx); l = msg_print_text(msg, 0, syslog, line, size); @@ -2556,12 +2554,41 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; ret = true; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); out: if (len) *len = l; return ret; } + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + return ret; +} EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** @@ -2663,6 +2690,24 @@ out: } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); +/** + * kmsg_dump_rewind_nolock - reset the interator (unlocked version) + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + * + * The function is similar to kmsg_dump_rewind(), but grabs no locks. 
+ */ +void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +{ + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; +} + /** * kmsg_dump_rewind - reset the interator * @dumper: registered kmsg dumper @@ -2676,10 +2721,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) unsigned long flags; raw_spin_lock_irqsave(&logbuf_lock, flags); - dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + kmsg_dump_rewind_nolock(dumper); raw_spin_unlock_irqrestore(&logbuf_lock, flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); -- cgit v1.2.3-59-g8ed1b From c064da47144b11be4697a4611f640086a663016a Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Fri, 20 Jul 2012 17:28:25 -0700 Subject: kdb: Switch to nolock variants of kmsg_dump functions The locked variants are prone to deadlocks (suppose we got to the debugger w/ the logbuf lock held), so let's switch to nolock variants. Signed-off-by: Anton Vorontsov Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index df17c935d3c6..1f91413edb87 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2072,8 +2072,8 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - kmsg_dump_rewind(&dumper); - while (kmsg_dump_get_line(&dumper, 1, NULL, 0, NULL)) + kmsg_dump_rewind_nolock(&dumper); + while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) n++; if (lines < 0) { @@ -2105,8 +2105,8 @@ static int kdb_dmesg(int argc, const char **argv) if (skip >= n || skip < 0) return 0; - kmsg_dump_rewind(&dumper); - while (kmsg_dump_get_line(&dumper, 1, buf, sizeof(buf), &len)) { + kmsg_dump_rewind_nolock(&dumper); + while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { if (skip) { skip--; continue; -- cgit v1.2.3-59-g8ed1b From 9a2e03d8ed518a61154f18d83d6466628e519f94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 19 Jul 2012 13:52:53 -0700 Subject: kthread_worker: reorganize to prepare for flush_kthread_work() reimplementation Make the following two non-functional changes. * Separate out insert_kthread_work() from queue_kthread_work(). * Relocate struct kthread_flush_work and kthread_flush_work_fn() definitions above flush_kthread_work(). v2: Added lockdep_assert_held() in insert_kthread_work() as suggested by Andy Walls. 
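[Editor's note] For context, a minimal, hypothetical sketch of the kthread_worker interface being reorganized here, using the pre-rename API names visible in the diffs (DEFINE_KTHREAD_WORKER(), KTHREAD_WORK_INIT(), queue_kthread_work(), flush_kthread_work(), flush_kthread_worker()); the my_* identifiers are invented.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/printk.h>

static DEFINE_KTHREAD_WORKER(my_worker);
static struct task_struct *my_worker_task;

static void my_work_fn(struct kthread_work *work)
{
	pr_info("running on the dedicated worker thread\n");
}

static struct kthread_work my_work = KTHREAD_WORK_INIT(my_work, my_work_fn);

static int my_start(void)
{
	/* the worker body is kthread_worker_fn(); we only give it a thread */
	my_worker_task = kthread_run(kthread_worker_fn, &my_worker, "my_worker");
	if (IS_ERR(my_worker_task))
		return PTR_ERR(my_worker_task);

	queue_kthread_work(&my_worker, &my_work);
	flush_kthread_work(&my_work);		/* wait for this one item */
	return 0;
}

static void my_stop(void)
{
	flush_kthread_worker(&my_worker);	/* drain everything queued so far */
	kthread_stop(my_worker_task);
}

The appeal over a regular workqueue is that the thread is dedicated, so its owner can adjust its scheduling class or priority.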
Signed-off-by: Tejun Heo Acked-by: Andy Walls --- kernel/kthread.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 3d3de633702e..4bfbff36d447 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -378,6 +378,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* insert @work before @pos in @worker */ +static void insert_kthread_work(struct kthread_worker *worker, + struct kthread_work *work, + struct list_head *pos) +{ + lockdep_assert_held(&worker->lock); + + list_add_tail(&work->node, pos); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); +} + /** * queue_kthread_work - queue a kthread_work * @worker: target kthread_worker @@ -395,10 +408,7 @@ bool queue_kthread_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); if (list_empty(&work->node)) { - list_add_tail(&work->node, &worker->work_list); - work->queue_seq++; - if (likely(worker->task)) - wake_up_process(worker->task); + insert_kthread_work(worker, work, &worker->work_list); ret = true; } spin_unlock_irqrestore(&worker->lock, flags); @@ -406,6 +416,18 @@ bool queue_kthread_work(struct kthread_worker *worker, } EXPORT_SYMBOL_GPL(queue_kthread_work); +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + /** * flush_kthread_work - flush a kthread_work * @work: work to flush @@ -436,18 +458,6 @@ void flush_kthread_work(struct kthread_work *work) } EXPORT_SYMBOL_GPL(flush_kthread_work); -struct kthread_flush_work { - struct kthread_work work; - struct completion done; -}; - -static void kthread_flush_work_fn(struct kthread_work *work) -{ - struct kthread_flush_work *fwork = - container_of(work, struct kthread_flush_work, work); - complete(&fwork->done); -} - /** * flush_kthread_worker - flush all current works on a kthread_worker * @worker: worker to flush -- cgit v1.2.3-59-g8ed1b From 46f3d976213452350f9d10b0c2780c2681f7075b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 19 Jul 2012 13:52:53 -0700 Subject: kthread_worker: reimplement flush_kthread_work() to allow freeing the work item being executed kthread_worker provides minimalistic workqueue-like interface for users which need a dedicated worker thread (e.g. for realtime priority). It has basic queue, flush_work, flush_worker operations which mostly match the workqueue counterparts; however, due to the way flush_work() is implemented, it has a noticeable difference of not allowing work items to be freed while being executed. While the current users of kthread_worker are okay with the current behavior, the restriction does impede some valid use cases. Also, removing this difference isn't difficult and actually makes the code easier to understand. This patch reimplements flush_kthread_work() such that it uses a flush_work item instead of queue/done sequence numbers. 
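[Editor's note] One thing the new scheme permits, which the sequence-number scheme could not, is a work item that frees itself from its own callback, since the worker no longer touches the item after func() returns. A hedged sketch under that assumption; struct my_msg and the my_* helpers are invented, and init_kthread_work() is the era's initializer for dynamically allocated items.

#include <linux/kthread.h>
#include <linux/printk.h>
#include <linux/slab.h>

struct my_msg {
	struct kthread_work work;
	int payload;
};

static void my_msg_fn(struct kthread_work *work)
{
	struct my_msg *msg = container_of(work, struct my_msg, work);

	pr_info("payload %d\n", msg->payload);
	kfree(msg);	/* now safe: the worker is done with *msg at this point */
}

static int my_msg_send(struct kthread_worker *worker, int payload)
{
	struct my_msg *msg = kmalloc(sizeof(*msg), GFP_KERNEL);

	if (!msg)
		return -ENOMEM;

	init_kthread_work(&msg->work, my_msg_fn);
	msg->payload = payload;
	queue_kthread_work(worker, &msg->work);

	return 0;
}

A caller that still wants to flush such an item must guarantee it stays allocated until flush_kthread_work() returns; self-freeing items are normally drained via flush_kthread_worker() instead.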
Signed-off-by: Tejun Heo --- include/linux/kthread.h | 8 ++------ kernel/kthread.c | 48 +++++++++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0714b24c0e45..22ccf9dee177 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -49,8 +49,6 @@ extern int tsk_fork_get_node(struct task_struct *tsk); * can be queued and flushed using queue/flush_kthread_work() * respectively. Queued kthread_works are processed by a kthread * running kthread_worker_fn(). - * - * A kthread_work can't be freed while it is executing. */ struct kthread_work; typedef void (*kthread_work_func_t)(struct kthread_work *work); @@ -59,15 +57,14 @@ struct kthread_worker { spinlock_t lock; struct list_head work_list; struct task_struct *task; + struct kthread_work *current_work; }; struct kthread_work { struct list_head node; kthread_work_func_t func; wait_queue_head_t done; - atomic_t flushing; - int queue_seq; - int done_seq; + struct kthread_worker *worker; }; #define KTHREAD_WORKER_INIT(worker) { \ @@ -79,7 +76,6 @@ struct kthread_work { .node = LIST_HEAD_INIT((work).node), \ .func = (fn), \ .done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done), \ - .flushing = ATOMIC_INIT(0), \ } #define DEFINE_KTHREAD_WORKER(worker) \ diff --git a/kernel/kthread.c b/kernel/kthread.c index 4bfbff36d447..b579af57ea10 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -360,16 +360,12 @@ repeat: struct kthread_work, node); list_del_init(&work->node); } + worker->current_work = work; spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); work->func(work); - smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ - work->done_seq = work->queue_seq; - smp_mb(); /* mb worker-b1 paired with flush-b0 */ - if (atomic_read(&work->flushing)) - wake_up_all(&work->done); } else if (!freezing(current)) schedule(); @@ -386,7 +382,7 @@ static void insert_kthread_work(struct kthread_worker *worker, lockdep_assert_held(&worker->lock); list_add_tail(&work->node, pos); - work->queue_seq++; + work->worker = worker; if (likely(worker->task)) wake_up_process(worker->task); } @@ -436,25 +432,35 @@ static void kthread_flush_work_fn(struct kthread_work *work) */ void flush_kthread_work(struct kthread_work *work) { - int seq = work->queue_seq; + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + struct kthread_worker *worker; + bool noop = false; + +retry: + worker = work->worker; + if (!worker) + return; - atomic_inc(&work->flushing); + spin_lock_irq(&worker->lock); + if (work->worker != worker) { + spin_unlock_irq(&worker->lock); + goto retry; + } - /* - * mb flush-b0 paired with worker-b1, to make sure either - * worker sees the above increment or we see done_seq update. - */ - smp_mb__after_atomic_inc(); + if (!list_empty(&work->node)) + insert_kthread_work(worker, &fwork.work, work->node.next); + else if (worker->current_work == work) + insert_kthread_work(worker, &fwork.work, worker->work_list.next); + else + noop = true; - /* A - B <= 0 tests whether B is in front of A regardless of overflow */ - wait_event(work->done, seq - work->done_seq <= 0); - atomic_dec(&work->flushing); + spin_unlock_irq(&worker->lock); - /* - * rmb flush-b1 paired with worker-b0, to make sure our caller - * sees every change made by work->func(). 
- */ - smp_mb__after_atomic_dec(); + if (!noop) + wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(flush_kthread_work); -- cgit v1.2.3-59-g8ed1b From 6fec10a1a5866dda3cd6a825a521fc7c2f226ba5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 22 Jul 2012 10:16:34 -0700 Subject: workqueue: fix spurious CPU locality WARN from process_one_work() 25511a4776 "workqueue: reimplement CPU online rebinding to handle idle workers" added CPU locality sanity check in process_one_work(). It triggers if a worker is executing on a different CPU without UNBOUND or REBIND set. This works for all normal workers but rescuers can trigger this spuriously when they're serving the unbound or a disassociated global_cwq - rescuers don't have either flag set and thus its gcwq->cpu can be a different value including %WORK_CPU_UNBOUND. Fix it by additionally testing %GCWQ_DISASSOCIATED. Signed-off-by: Tejun Heo Reported-by: "Paul E. McKenney" LKML-Refence: <20120721213656.GA7783@linux.vnet.ibm.com> --- kernel/workqueue.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 471996a81633..692d97628a10 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1949,7 +1949,13 @@ __acquires(&gcwq->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif + /* + * Ensure we're on the correct CPU. DISASSOCIATED test is + * necessary to avoid spurious warnings from rescuers servicing the + * unbound or a disassociated gcwq. + */ WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + !(gcwq->flags & GCWQ_DISASSOCIATED) && raw_smp_processor_id() != gcwq->cpu); /* -- cgit v1.2.3-59-g8ed1b From 7266702805f9d824f92ce5c4069eca65d0f21d28 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Jul 2012 14:10:52 +0400 Subject: signal: make sure we don't get stopped with pending task_work Signed-off-by: Al Viro --- kernel/signal.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 677102789cf2..be4f856d52f8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why) void ptrace_notify(int exit_code) { BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); + if (unlikely(current->task_works)) { + if (test_and_clear_ti_thread_flag(current_thread_info(), + TIF_NOTIFY_RESUME)) { + smp_mb__after_clear_bit(); + task_work_run(); + } + } spin_lock_irq(¤t->sighand->siglock); ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); @@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; + if (unlikely(current->task_works)) { + if (test_and_clear_ti_thread_flag(current_thread_info(), + TIF_NOTIFY_RESUME)) { + smp_mb__after_clear_bit(); + task_work_run(); + } + } + if (unlikely(uprobe_deny_signal())) return 0; -- cgit v1.2.3-59-g8ed1b From 41f9d29f09ca0b22c3631e8a39676e74cda9bcc0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jun 2012 22:10:04 +0400 Subject: trimming task_work: kill ->data get rid of the only user of ->data; this is _not_ the final variant - in the end we'll have task_work and rcu_head identical and just use cred->rcu, at which point the separate allocation will be gone completely. 
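A sketch of the pattern that replaces ->data (not from the patch itself): the caller embeds the task_work in its own structure and recovers the context with container_of() in the callback. my_payload, my_func and queue_to_current are illustrative names.

#include <linux/task_work.h>
#include <linux/slab.h>
#include <linux/sched.h>

struct my_payload {
	struct task_work twork;
	int value;
};

static void my_func(struct task_work *twork)
{
	struct my_payload *p = container_of(twork, struct my_payload, twork);

	pr_info("value=%d\n", p->value);
	kfree(p);
}

static int queue_to_current(int value)
{
	struct my_payload *p = kmalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	p->value = value;
	init_task_work(&p->twork, my_func);	/* no ->data argument anymore */
	return task_work_add(current, &p->twork, true);
}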
Signed-off-by: Al Viro --- include/linux/task_work.h | 4 +--- kernel/irq/manage.c | 2 +- security/keys/internal.h | 4 ++++ security/keys/keyctl.c | 14 ++++++++------ security/keys/process_keys.c | 5 +++-- 5 files changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 294d5d5e90b1..627421c0e108 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -10,14 +10,12 @@ typedef void (*task_work_func_t)(struct task_work *); struct task_work { struct hlist_node hlist; task_work_func_t func; - void *data; }; static inline void -init_task_work(struct task_work *twork, task_work_func_t func, void *data) +init_task_work(struct task_work *twork, task_work_func_t func) { twork->func = func; - twork->data = data; } int task_work_add(struct task_struct *task, struct task_work *twork, bool); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c548232ba39..d1dd54734ce7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -830,7 +830,7 @@ static int irq_thread(void *data) sched_setscheduler(current, SCHED_FIFO, ¶m); - init_task_work(&on_exit_work, irq_thread_dtor, NULL); + init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { diff --git a/security/keys/internal.h b/security/keys/internal.h index 3dcbf86b0d31..b510a316874a 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -148,6 +148,10 @@ extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, #define KEY_LOOKUP_PARTIAL 0x02 #define KEY_LOOKUP_FOR_UNLINK 0x04 +struct kludge { /* this will die off very soon */ + struct task_work twork; + struct cred *cred; +}; extern long join_session_keyring(const char *name); extern void key_change_session_keyring(struct task_work *twork); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 0f5b3f027299..26723caaad05 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1456,7 +1456,8 @@ long keyctl_session_to_parent(void) { struct task_struct *me, *parent; const struct cred *mycred, *pcred; - struct task_work *newwork, *oldwork; + struct kludge *newwork; + struct task_work *oldwork; key_ref_t keyring_r; struct cred *cred; int ret; @@ -1466,7 +1467,7 @@ long keyctl_session_to_parent(void) return PTR_ERR(keyring_r); ret = -ENOMEM; - newwork = kmalloc(sizeof(struct task_work), GFP_KERNEL); + newwork = kmalloc(sizeof(struct kludge), GFP_KERNEL); if (!newwork) goto error_keyring; @@ -1478,7 +1479,8 @@ long keyctl_session_to_parent(void) goto error_newwork; cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r); - init_task_work(newwork, key_change_session_keyring, cred); + init_task_work(&newwork->twork, key_change_session_keyring); + newwork->cred = cred; me = current; rcu_read_lock(); @@ -1527,18 +1529,18 @@ long keyctl_session_to_parent(void) /* the replacement session keyring is applied just prior to userspace * restarting */ - ret = task_work_add(parent, newwork, true); + ret = task_work_add(parent, &newwork->twork, true); if (!ret) newwork = NULL; unlock: write_unlock_irq(&tasklist_lock); rcu_read_unlock(); if (oldwork) { - put_cred(oldwork->data); + put_cred(container_of(oldwork, struct kludge, twork)->cred); kfree(oldwork); } if (newwork) { - put_cred(newwork->data); + put_cred(newwork->cred); kfree(newwork); } return ret; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 4ad54eea1ea4..c9b07c97d7f2 100644 --- 
a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -837,9 +837,10 @@ error: void key_change_session_keyring(struct task_work *twork) { const struct cred *old = current_cred(); - struct cred *new = twork->data; + struct kludge *p = container_of(twork, struct kludge, twork); + struct cred *new = p->cred; - kfree(twork); + kfree(p); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; -- cgit v1.2.3-59-g8ed1b From 158e1645e07f3e9f7e4962d7a0997f5c3b98311b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jun 2012 09:24:13 +0400 Subject: trim task_work: get rid of hlist layout based on Oleg's suggestion; single-linked list, task->task_works points to the last element, forward pointer from said last element points to head. I'd still prefer much more regular scheme with two pointers in task_work, but... Signed-off-by: Al Viro --- include/linux/sched.h | 2 +- include/linux/task_work.h | 4 +-- include/linux/tracehook.h | 2 +- kernel/fork.c | 2 +- kernel/task_work.c | 64 ++++++++++++++++++++++++----------------------- 5 files changed, 38 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4059c0f33f07..b9216ebc2789 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1405,7 +1405,7 @@ struct task_struct { int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; - struct hlist_head task_works; + void *task_works; struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 627421c0e108..3b3e2c8d037b 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -8,7 +8,7 @@ struct task_work; typedef void (*task_work_func_t)(struct task_work *); struct task_work { - struct hlist_node hlist; + struct task_work *next; task_work_func_t func; }; @@ -24,7 +24,7 @@ void task_work_run(void); static inline void exit_task_work(struct task_struct *task) { - if (unlikely(!hlist_empty(&task->task_works))) + if (unlikely(task->task_works)) task_work_run(); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6a4d82bedb03..1e98b5530425 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -192,7 +192,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) * hlist_add_head(task->task_works); */ smp_mb__after_clear_bit(); - if (unlikely(!hlist_empty(¤t->task_works))) + if (unlikely(current->task_works)) task_work_run(); } diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..bebabad59202 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1415,7 +1415,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_HLIST_HEAD(&p->task_works); + p->task_works = NULL; /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/task_work.c b/kernel/task_work.c index 82d1c794066d..9b8948dbdc60 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -19,7 +19,12 @@ task_work_add(struct task_struct *task, struct task_work *twork, bool notify) */ raw_spin_lock_irqsave(&task->pi_lock, flags); if (likely(!(task->flags & PF_EXITING))) { - hlist_add_head(&twork->hlist, &task->task_works); + struct task_work *last = task->task_works; + struct task_work *first = last ? 
last->next : twork; + twork->next = first; + if (last) + last->next = twork; + task->task_works = twork; err = 0; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -34,51 +39,48 @@ struct task_work * task_work_cancel(struct task_struct *task, task_work_func_t func) { unsigned long flags; - struct task_work *twork; - struct hlist_node *pos; + struct task_work *last, *res = NULL; raw_spin_lock_irqsave(&task->pi_lock, flags); - hlist_for_each_entry(twork, pos, &task->task_works, hlist) { - if (twork->func == func) { - hlist_del(&twork->hlist); - goto found; + last = task->task_works; + if (last) { + struct task_work *q = last, *p = q->next; + while (1) { + if (p->func == func) { + q->next = p->next; + if (p == last) + task->task_works = q == p ? NULL : q; + res = p; + break; + } + if (p == last) + break; + q = p; + p = q->next; } } - twork = NULL; - found: raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - return twork; + return res; } void task_work_run(void) { struct task_struct *task = current; - struct hlist_head task_works; - struct hlist_node *pos; + struct task_work *p, *q; raw_spin_lock_irq(&task->pi_lock); - hlist_move_list(&task->task_works, &task_works); + p = task->task_works; + task->task_works = NULL; raw_spin_unlock_irq(&task->pi_lock); - if (unlikely(hlist_empty(&task_works))) + if (unlikely(!p)) return; - /* - * We use hlist to save the space in task_struct, but we want fifo. - * Find the last entry, the list should be short, then process them - * in reverse order. - */ - for (pos = task_works.first; pos->next; pos = pos->next) - ; - - for (;;) { - struct hlist_node **pprev = pos->pprev; - struct task_work *twork = container_of(pos, struct task_work, - hlist); - twork->func(twork); - if (pprev == &task_works.first) - break; - pos = container_of(pprev, struct hlist_node, next); + q = p->next; /* head */ + p->next = NULL; /* cut it */ + while (q) { + p = q->next; + q->func(q); + q = p; } } -- cgit v1.2.3-59-g8ed1b From 67d1214551e800f9fe7dc7c47a346d2df0fafed5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jun 2012 11:07:19 +0400 Subject: merge task_work and rcu_head, get rid of separate allocation for keyring case task_work and rcu_head are identical now; merge them (calling the result struct callback_head, rcu_head #define'd to it), kill separate allocation in security/keys since we can just use cred->rcu now. 
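In effect, any object that already embeds an rcu_head can now be handed to task_work_add() without a second allocation, which is exactly what the keyring code does with cred->rcu. A hedged sketch of that pattern, with my_obj, my_release and defer_release as made-up names:

#include <linux/task_work.h>
#include <linux/slab.h>

struct my_obj {
	int data;
	struct rcu_head rcu;	/* rcu_head is now an alias for callback_head */
};

static void my_release(struct callback_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu);

	kfree(obj);
}

static int defer_release(struct task_struct *task, struct my_obj *obj)
{
	/* reuse the embedded rcu_head as the task_work anchor */
	init_task_work(&obj->rcu, my_release);
	return task_work_add(task, &obj->rcu, true);
}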
Signed-off-by: Al Viro --- include/linux/sched.h | 2 +- include/linux/task_work.h | 14 ++++---------- include/linux/types.h | 9 +++++---- kernel/irq/manage.c | 4 ++-- kernel/task_work.c | 14 +++++++------- security/keys/internal.h | 6 +----- security/keys/keyctl.c | 28 +++++++++------------------- security/keys/process_keys.c | 6 ++---- 8 files changed, 31 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b9216ebc2789..af3555cc760f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1405,7 +1405,7 @@ struct task_struct { int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; - void *task_works; + struct callback_head *task_works; struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 3b3e2c8d037b..fb46b03b1852 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -4,22 +4,16 @@ #include #include -struct task_work; -typedef void (*task_work_func_t)(struct task_work *); - -struct task_work { - struct task_work *next; - task_work_func_t func; -}; +typedef void (*task_work_func_t)(struct callback_head *); static inline void -init_task_work(struct task_work *twork, task_work_func_t func) +init_task_work(struct callback_head *twork, task_work_func_t func) { twork->func = func; } -int task_work_add(struct task_struct *task, struct task_work *twork, bool); -struct task_work *task_work_cancel(struct task_struct *, task_work_func_t); +int task_work_add(struct task_struct *task, struct callback_head *twork, bool); +struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) diff --git a/include/linux/types.h b/include/linux/types.h index 9c1bd539ea70..bf0dd7524b2a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -246,14 +246,15 @@ struct ustat { }; /** - * struct rcu_head - callback structure for use with RCU + * struct callback_head - callback structure for use with RCU and task_work * @next: next update requests in a list * @func: actual update function to call after the grace period. 
*/ -struct rcu_head { - struct rcu_head *next; - void (*func)(struct rcu_head *head); +struct callback_head { + struct callback_head *next; + void (*func)(struct callback_head *head); }; +#define rcu_head callback_head #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d1dd54734ce7..814c9ef6bba1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } -static void irq_thread_dtor(struct task_work *unused) +static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; @@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused) */ static int irq_thread(void *data) { - struct task_work on_exit_work; + struct callback_head on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; diff --git a/kernel/task_work.c b/kernel/task_work.c index 9b8948dbdc60..76266fb665dc 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -3,7 +3,7 @@ #include int -task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) { unsigned long flags; int err = -ESRCH; @@ -19,8 +19,8 @@ task_work_add(struct task_struct *task, struct task_work *twork, bool notify) */ raw_spin_lock_irqsave(&task->pi_lock, flags); if (likely(!(task->flags & PF_EXITING))) { - struct task_work *last = task->task_works; - struct task_work *first = last ? last->next : twork; + struct callback_head *last = task->task_works; + struct callback_head *first = last ? last->next : twork; twork->next = first; if (last) last->next = twork; @@ -35,16 +35,16 @@ task_work_add(struct task_struct *task, struct task_work *twork, bool notify) return err; } -struct task_work * +struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { unsigned long flags; - struct task_work *last, *res = NULL; + struct callback_head *last, *res = NULL; raw_spin_lock_irqsave(&task->pi_lock, flags); last = task->task_works; if (last) { - struct task_work *q = last, *p = q->next; + struct callback_head *q = last, *p = q->next; while (1) { if (p->func == func) { q->next = p->next; @@ -66,7 +66,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) void task_work_run(void) { struct task_struct *task = current; - struct task_work *p, *q; + struct callback_head *p, *q; raw_spin_lock_irq(&task->pi_lock); p = task->task_works; diff --git a/security/keys/internal.h b/security/keys/internal.h index b510a316874a..c246ba5d43ab 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -148,12 +148,8 @@ extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, #define KEY_LOOKUP_PARTIAL 0x02 #define KEY_LOOKUP_FOR_UNLINK 0x04 -struct kludge { /* this will die off very soon */ - struct task_work twork; - struct cred *cred; -}; extern long join_session_keyring(const char *name); -extern void key_change_session_keyring(struct task_work *twork); +extern void key_change_session_keyring(struct callback_head *twork); extern struct work_struct key_gc_work; extern unsigned key_gc_delay; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 26723caaad05..0291b3f9397c 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1456,8 +1456,7 @@ long keyctl_session_to_parent(void) { struct task_struct *me, *parent; const 
struct cred *mycred, *pcred; - struct kludge *newwork; - struct task_work *oldwork; + struct callback_head *newwork, *oldwork; key_ref_t keyring_r; struct cred *cred; int ret; @@ -1467,20 +1466,17 @@ long keyctl_session_to_parent(void) return PTR_ERR(keyring_r); ret = -ENOMEM; - newwork = kmalloc(sizeof(struct kludge), GFP_KERNEL); - if (!newwork) - goto error_keyring; /* our parent is going to need a new cred struct, a new tgcred struct * and new security data, so we allocate them here to prevent ENOMEM in * our parent */ cred = cred_alloc_blank(); if (!cred) - goto error_newwork; + goto error_keyring; + newwork = &cred->rcu; cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r); - init_task_work(&newwork->twork, key_change_session_keyring); - newwork->cred = cred; + init_task_work(newwork, key_change_session_keyring); me = current; rcu_read_lock(); @@ -1529,24 +1525,18 @@ long keyctl_session_to_parent(void) /* the replacement session keyring is applied just prior to userspace * restarting */ - ret = task_work_add(parent, &newwork->twork, true); + ret = task_work_add(parent, newwork, true); if (!ret) newwork = NULL; unlock: write_unlock_irq(&tasklist_lock); rcu_read_unlock(); - if (oldwork) { - put_cred(container_of(oldwork, struct kludge, twork)->cred); - kfree(oldwork); - } - if (newwork) { - put_cred(newwork->cred); - kfree(newwork); - } + if (oldwork) + put_cred(container_of(oldwork, struct cred, rcu)); + if (newwork) + put_cred(cred); return ret; -error_newwork: - kfree(newwork); error_keyring: key_ref_put(keyring_r); return ret; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index c9b07c97d7f2..54339cfd6734 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -834,13 +834,11 @@ error: * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ -void key_change_session_keyring(struct task_work *twork) +void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); - struct kludge *p = container_of(twork, struct kludge, twork); - struct cred *new = p->cred; + struct cred *new = container_of(twork, struct cred, rcu); - kfree(p); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; -- cgit v1.2.3-59-g8ed1b From ed3e694d78cc75fa79bf29698631b146fd27aa35 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jun 2012 11:31:24 +0400 Subject: move exit_task_work() past exit_files() et.al. ... and get rid of PF_EXITING check in task_work_add(). Signed-off-by: Al Viro --- kernel/exit.c | 6 ++---- kernel/task_work.c | 30 +++++++++++------------------- 2 files changed, 13 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2f59cc334516..d17f6c4ddfa9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -953,14 +953,11 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes, and in - * task_work_add() to avoid the race with exit_task_work(). + * an exiting task cleaning up the robust pi futexes. 
*/ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - exit_task_work(tsk); - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), @@ -995,6 +992,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_work(tsk); check_stack_usage(); exit_thread(); diff --git a/kernel/task_work.c b/kernel/task_work.c index 76266fb665dc..fb396089f66a 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -5,34 +5,26 @@ int task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) { + struct callback_head *last, *first; unsigned long flags; - int err = -ESRCH; -#ifndef TIF_NOTIFY_RESUME - if (notify) - return -ENOTSUPP; -#endif /* - * We must not insert the new work if the task has already passed - * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() - * and check PF_EXITING under pi_lock. + * Not inserting the new work if the task has already passed + * exit_task_work() is the responisbility of callers. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - if (likely(!(task->flags & PF_EXITING))) { - struct callback_head *last = task->task_works; - struct callback_head *first = last ? last->next : twork; - twork->next = first; - if (last) - last->next = twork; - task->task_works = twork; - err = 0; - } + last = task->task_works; + first = last ? last->next : twork; + twork->next = first; + if (last) + last->next = twork; + task->task_works = twork; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ - if (likely(!err) && notify) + if (notify) set_notify_resume(task); - return err; + return 0; } struct callback_head * -- cgit v1.2.3-59-g8ed1b From a2d4c71d1559426155e5da8db3265bfa0d8d398d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Jun 2012 11:33:29 +0400 Subject: deal with task_work callbacks adding more work It doesn't matter on normal return to userland path (we'll recheck the NOTIFY_RESUME flag anyway), but in case of exit_task_work() we'll need that as soon as we get callbacks capable of triggering more task_work_add(). Signed-off-by: Al Viro --- kernel/task_work.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index fb396089f66a..91d4e1742a0c 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -60,19 +60,21 @@ void task_work_run(void) struct task_struct *task = current; struct callback_head *p, *q; - raw_spin_lock_irq(&task->pi_lock); - p = task->task_works; - task->task_works = NULL; - raw_spin_unlock_irq(&task->pi_lock); + while (1) { + raw_spin_lock_irq(&task->pi_lock); + p = task->task_works; + task->task_works = NULL; + raw_spin_unlock_irq(&task->pi_lock); - if (unlikely(!p)) - return; + if (unlikely(!p)) + return; - q = p->next; /* head */ - p->next = NULL; /* cut it */ - while (q) { - p = q->next; - q->func(q); - q = p; + q = p->next; /* head */ + p->next = NULL; /* cut it */ + while (q) { + p = q->next; + q->func(q); + q = p; + } } } -- cgit v1.2.3-59-g8ed1b
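A rough illustration (not from the patch) of why the outer while (1) loop matters: a callback may itself call task_work_add() while running inside task_work_run(), and the re-check guarantees that work still runs before exit_task_work() returns. first_step and second_step are made-up names; the callback_head objects must outlive their execution.

#include <linux/task_work.h>
#include <linux/sched.h>
#include <linux/printk.h>

static struct callback_head second_step_work;

static void second_step(struct callback_head *head)
{
	pr_info("second step ran before task_work_run() returned\n");
}

static void first_step(struct callback_head *head)
{
	init_task_work(&second_step_work, second_step);
	/* notify=false: no TIF_NOTIFY_RESUME needed, the next pass of the
	 * while (1) loop in task_work_run() picks this item up. */
	task_work_add(current, &second_step_work, false);
}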