From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 5 Feb 2010 21:47:05 -0500
Subject: nmi_watchdog: Config option to enable new nmi_watchdog

These are the bits that enable the new nmi_watchdog and safely
isolate the old nmi_watchdog.  Only one or the other can run,
not both at the same time.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
Cc: peterz@infradead.org
LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/nmi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b752e807adde..a42ff0bef708 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -47,4 +47,8 @@ static inline bool trigger_all_cpu_backtrace(void)
 }
 #endif
 
+#ifdef CONFIG_NMI_WATCHDOG
+int hw_nmi_is_cpu_stuck(struct pt_regs *);
+#endif
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 12 Feb 2010 17:19:19 -0500
Subject: nmi_watchdog: Compile and portability fixes

The original patch was x86_64 centric.  Changed the code to make
it less so.

ested by building and running on a powerpc.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: peterz@infradead.org
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/nmi.h    |  2 ++
 arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++-----
 include/linux/nmi.h           |  9 ++++++++
 kernel/nmi_watchdog.c         | 52 ++++++++++++++++++++++++++++++++++---------
 kernel/sysctl.c               | 15 ++++++++++++-
 5 files changed, 82 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 93da9c3f3341..5b41b0feb6db 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern int check_nmi_watchdog(void);
+#if !defined(CONFIG_NMI_WATCHDOG)
 extern int nmi_watchdog_enabled;
+#endif
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
 extern void release_perfctr_nmi(unsigned int);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 8c0e6a410d05..312d772c5c35 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum);
  */
 static inline unsigned int get_timer_irqs(int cpu)
 {
-        return per_cpu(irq_stat, cpu).apic_timer_irqs +
-                per_cpu(irq_stat, cpu).irq0_irqs;
+	unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs;
+
+#if defined(CONFIG_X86_LOCAL_APIC)
+	irqs += per_cpu(irq_stat, cpu).apic_timer_irqs;
+#endif
+
+        return irqs;
 }
 
 static inline int mce_in_progress(void)
@@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 	}
 }
 
+u64 hw_nmi_get_sample_period(void)
+{
+        return cpu_khz * 1000;
+}
+
 void arch_trigger_all_cpu_backtrace(void)
 {
 	int i;
@@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void)
 }
 
 /* STUB calls to mimic old nmi_watchdog behaviour */
+#if defined(CONFIG_X86_LOCAL_APIC)
 unsigned int nmi_watchdog = NMI_NONE;
 EXPORT_SYMBOL(nmi_watchdog);
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+#endif
 atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
 EXPORT_SYMBOL(nmi_active);
-int nmi_watchdog_enabled;
 int unknown_nmi_panic;
 void cpu_nmi_set_wd_enabled(void) { return; }
-void acpi_nmi_enable(void) { return; }
-void acpi_nmi_disable(void) { return; }
 void stop_apic_nmi_watchdog(void *unused) { return; }
 void setup_apic_nmi_watchdog(void *unused) { return; }
 int __init check_nmi_watchdog(void) { return 0; }
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a42ff0bef708..794e7354c5bf 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -20,10 +20,14 @@ extern void touch_nmi_watchdog(void);
 extern void acpi_nmi_disable(void);
 extern void acpi_nmi_enable(void);
 #else
+#ifndef CONFIG_NMI_WATCHDOG
 static inline void touch_nmi_watchdog(void)
 {
 	touch_softlockup_watchdog();
 }
+#else
+extern void touch_nmi_watchdog(void);
+#endif
 static inline void acpi_nmi_disable(void) { }
 static inline void acpi_nmi_enable(void) { }
 #endif
@@ -49,6 +53,11 @@ static inline bool trigger_all_cpu_backtrace(void)
 
 #ifdef CONFIG_NMI_WATCHDOG
 int hw_nmi_is_cpu_stuck(struct pt_regs *);
+u64 hw_nmi_get_sample_period(void);
+extern int nmi_watchdog_enabled;
+struct ctl_table;
+extern int proc_nmi_enabled(struct ctl_table *, int ,
+                        void __user *, size_t *, loff_t *);
 #endif
 
 #endif
diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c
index 36817b214d69..73c1954a97bb 100644
--- a/kernel/nmi_watchdog.c
+++ b/kernel/nmi_watchdog.c
@@ -30,6 +30,8 @@ static DEFINE_PER_CPU(struct perf_event *, nmi_watchdog_ev);
 static DEFINE_PER_CPU(int, nmi_watchdog_touch);
 static DEFINE_PER_CPU(long, alert_counter);
 
+static int panic_on_timeout;
+
 void touch_nmi_watchdog(void)
 {
 	__raw_get_cpu_var(nmi_watchdog_touch) = 1;
@@ -46,19 +48,49 @@ void touch_all_nmi_watchdog(void)
 	touch_softlockup_watchdog();
 }
 
+static int __init setup_nmi_watchdog(char *str)
+{
+        if (!strncmp(str, "panic", 5)) {
+                panic_on_timeout = 1;
+                str = strchr(str, ',');
+                if (!str)
+                        return 1;
+                ++str;
+        }
+        return 1;
+}
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
 #ifdef CONFIG_SYSCTL
 /*
  * proc handler for /proc/sys/kernel/nmi_watchdog
  */
+int nmi_watchdog_enabled;
+
 int proc_nmi_enabled(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *length, loff_t *ppos)
 {
 	int cpu;
 
-	if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL)
+	if (!write) {
+		struct perf_event *event;
+		for_each_online_cpu(cpu) {
+			event = per_cpu(nmi_watchdog_ev, cpu);
+			if (event->state > PERF_EVENT_STATE_OFF) {
+				nmi_watchdog_enabled = 1;
+				break;
+			}
+		}
+		proc_dointvec(table, write, buffer, length, ppos);
+		return 0;
+	}
+
+	if (per_cpu(nmi_watchdog_ev, smp_processor_id()) == NULL) {
 		nmi_watchdog_enabled = 0;
-	else
-		nmi_watchdog_enabled = 1;
+		proc_dointvec(table, write, buffer, length, ppos);
+		printk("NMI watchdog failed configuration, can not be enabled\n");
+		return 0;
+	}
 
 	touch_all_nmi_watchdog();
 	proc_dointvec(table, write, buffer, length, ppos);
@@ -81,8 +113,6 @@ struct perf_event_attr wd_attr = {
 	.disabled = 1,
 };
 
-static int panic_on_timeout;
-
 void wd_overflow(struct perf_event *event, int nmi,
 		 struct perf_sample_data *data,
 		 struct pt_regs *regs)
@@ -103,11 +133,11 @@ void wd_overflow(struct perf_event *event, int nmi,
 		 */
 		per_cpu(alert_counter,cpu) += 1;
 		if (per_cpu(alert_counter,cpu) == 5) {
-			/*
-			 * die_nmi will return ONLY if NOTIFY_STOP happens..
-			 */
-			die_nmi("BUG: NMI Watchdog detected LOCKUP",
-				regs, panic_on_timeout);
+			if (panic_on_timeout) {
+				panic("NMI Watchdog detected LOCKUP on cpu %d", cpu);
+			} else {
+				WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu);
+			}
 		}
 	} else {
 		per_cpu(alert_counter,cpu) = 0;
@@ -133,7 +163,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		/* originally wanted the below chunk to be in CPU_UP_PREPARE, but caps is unpriv for non-CPU0 */
-		wd_attr.sample_period = cpu_khz * 1000;
+		wd_attr.sample_period = hw_nmi_get_sample_period();
 		event = perf_event_create_kernel_counter(&wd_attr, hotcpu, -1, wd_overflow);
 		if (IS_ERR(event)) {
 			printk(KERN_ERR "nmi watchdog failed to create perf event on %i: %p\n", hotcpu, event);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b2448468..ac72c9e6bd9b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -60,6 +60,10 @@
 #include <asm/io.h>
 #endif
 
+#ifdef CONFIG_NMI_WATCHDOG
+#include <linux/nmi.h>
+#endif
+
 
 #if defined(CONFIG_SYSCTL)
 
@@ -692,7 +696,16 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
 	},
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+#if defined(CONFIG_NMI_WATCHDOG)
+	{
+		.procname       = "nmi_watchdog",
+		.data           = &nmi_watchdog_enabled,
+		.maxlen         = sizeof (int),
+		.mode           = 0644,
+		.proc_handler   = proc_nmi_enabled,
+	},
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG)
 	{
 		.procname       = "unknown_nmi_panic",
 		.data           = &unknown_nmi_panic,
-- 
cgit v1.2.3-59-g8ed1b


From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Mon, 22 Feb 2010 18:09:03 -0500
Subject: nmi_watchdog: Clean up various small details

Mostly copy/paste whitespace damage with a couple of nitpicks by
the checkpatch script. Fix the struct definition as requested by Ingo too.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: peterz@infradead.org
Cc: gorcunov@gmail.com
Cc: aris@redhat.com
LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--
 arch/x86/kernel/apic/hw_nmi.c |   14 +++++------
 arch/x86/kernel/traps.c       |    6 ++--
 include/linux/nmi.h           |    2 -
 kernel/nmi_watchdog.c         |   51 ++++++++++++++++++++----------------------
 4 files changed, 36 insertions(+), 37 deletions(-)
---
 arch/x86/kernel/apic/hw_nmi.c | 14 ++++++------
 arch/x86/kernel/traps.c       |  6 ++---
 include/linux/nmi.h           |  2 +-
 kernel/nmi_watchdog.c         | 51 +++++++++++++++++++++----------------------
 4 files changed, 36 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 0b4d205a6b8e..e8b78a0be5de 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu)
 	irqs += per_cpu(irq_stat, cpu).apic_timer_irqs;
 #endif
 
-        return irqs;
+	return irqs;
 }
 
 static inline int mce_in_progress(void)
 {
 #if defined(CONFIG_X86_MCE)
-        return atomic_read(&mce_entry) > 0;
+	return atomic_read(&mce_entry) > 0;
 #endif
-        return 0;
+	return 0;
 }
 
 int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
@@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 	}
 
 	/* if we are doing an mce, just assume the cpu is not stuck */
-        /* Could check oops_in_progress here too, but it's safer not to */
-        if (mce_in_progress())
-                return 0;
+	/* Could check oops_in_progress here too, but it's safer not to */
+	if (mce_in_progress())
+		return 0;
 
 	/* We determine if the cpu is stuck by checking whether any
 	 * interrupts have happened since we last checked.  Of course
@@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 
 u64 hw_nmi_get_sample_period(void)
 {
-        return cpu_khz * 1000;
+	return cpu_khz * 1000;
 }
 
 #ifdef ARCH_HAS_NMI_WATCHDOG
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 973cbc4f044f..bdc7fab3ef3e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 			return;
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-	        			                == NOTIFY_STOP)
-	                return;
+		if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+							== NOTIFY_STOP)
+			return;
 
 #ifndef CONFIG_NMI_WATCHDOG
 		/*
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 794e7354c5bf..22cc7960b649 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -57,7 +57,7 @@ u64 hw_nmi_get_sample_period(void);
 extern int nmi_watchdog_enabled;
 struct ctl_table;
 extern int proc_nmi_enabled(struct ctl_table *, int ,
-                        void __user *, size_t *, loff_t *);
+			void __user *, size_t *, loff_t *);
 #endif
 
 #endif
diff --git a/kernel/nmi_watchdog.c b/kernel/nmi_watchdog.c
index 3c75cbf3acb8..0a6f57f537a7 100644
--- a/kernel/nmi_watchdog.c
+++ b/kernel/nmi_watchdog.c
@@ -50,31 +50,31 @@ void touch_all_nmi_watchdog(void)
 
 static int __init setup_nmi_watchdog(char *str)
 {
-        if (!strncmp(str, "panic", 5)) {
-                panic_on_timeout = 1;
-                str = strchr(str, ',');
-                if (!str)
-                        return 1;
-                ++str;
-        }
-        return 1;
+	if (!strncmp(str, "panic", 5)) {
+		panic_on_timeout = 1;
+		str = strchr(str, ',');
+		if (!str)
+			return 1;
+		++str;
+	}
+	return 1;
 }
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
 struct perf_event_attr wd_hw_attr = {
-	.type = PERF_TYPE_HARDWARE,
-	.config = PERF_COUNT_HW_CPU_CYCLES,
-	.size = sizeof(struct perf_event_attr),
-	.pinned = 1,
-	.disabled = 1,
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
 };
 
 struct perf_event_attr wd_sw_attr = {
-	.type = PERF_TYPE_SOFTWARE,
-	.config = PERF_COUNT_SW_CPU_CLOCK,
-	.size = sizeof(struct perf_event_attr),
-	.pinned = 1,
-	.disabled = 1,
+	.type		= PERF_TYPE_SOFTWARE,
+	.config		= PERF_COUNT_SW_CPU_CLOCK,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
 };
 
 void wd_overflow(struct perf_event *event, int nmi,
@@ -95,16 +95,15 @@ void wd_overflow(struct perf_event *event, int nmi,
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
-		per_cpu(alert_counter,cpu) += 1;
-		if (per_cpu(alert_counter,cpu) == 5) {
-			if (panic_on_timeout) {
+		per_cpu(alert_counter, cpu) += 1;
+		if (per_cpu(alert_counter, cpu) == 5) {
+			if (panic_on_timeout)
 				panic("NMI Watchdog detected LOCKUP on cpu %d", cpu);
-			} else {
+			else
 				WARN(1, "NMI Watchdog detected LOCKUP on cpu %d", cpu);
-			}
 		}
 	} else {
-		per_cpu(alert_counter,cpu) = 0;
+		per_cpu(alert_counter, cpu) = 0;
 	}
 
 	return;
@@ -126,7 +125,7 @@ static int enable_nmi_watchdog(int cpu)
 		event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow);
 		if (IS_ERR(event)) {
 			/* hardware doesn't exist or not supported, fallback to software events */
-			printk("nmi_watchdog: hardware not available, trying software events\n");
+			printk(KERN_INFO "nmi_watchdog: hardware not available, trying software events\n");
 			wd_attr = &wd_sw_attr;
 			wd_attr->sample_period = NSEC_PER_SEC;
 			event = perf_event_create_kernel_counter(wd_attr, cpu, -1, wd_overflow);
@@ -182,7 +181,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write,
 	if (nmi_watchdog_enabled) {
 		for_each_online_cpu(cpu)
 			if (enable_nmi_watchdog(cpu)) {
-				printk("NMI watchdog failed configuration, "
+				printk(KERN_ERR "NMI watchdog failed configuration, "
 					" can not be enabled\n");
 			}
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From 927a7c9c1793def3a55d60c926d3945528e6bf1b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@opensource.se>
Date: Fri, 19 Mar 2010 04:47:19 +0000
Subject: dmaengine: shdma: Enable on SH-Mobile ARM

Enable the shdma dmaengine driver on SH-Mobile ARM.

Signed-off-by: Magnus Damm <damm@opensource.se>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/dma/Kconfig        | 2 +-
 drivers/dma/shdma.c        | 8 ++++----
 include/linux/serial_sci.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index c27f80e5d531..9d8ca990dde6 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -128,7 +128,7 @@ config TXX9_DMAC
 
 config SH_DMAE
 	tristate "Renesas SuperH DMAC support"
-	depends on SUPERH && SH_DMA
+	depends on (SUPERH && SH_DMA) || (ARM && ARCH_SHMOBILE)
 	depends on !SH_DMA_API
 	select DMA_ENGINE
 	help
diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c
index 7a18b580f626..bbc3dfeed9b2 100644
--- a/drivers/dma/shdma.c
+++ b/drivers/dma/shdma.c
@@ -793,7 +793,7 @@ static irqreturn_t sh_dmae_interrupt(int irq, void *data)
 	return ret;
 }
 
-#if defined(CONFIG_CPU_SH4)
+#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE)
 static irqreturn_t sh_dmae_err(int irq, void *data)
 {
 	struct sh_dmae_device *shdev = (struct sh_dmae_device *)data;
@@ -1034,7 +1034,7 @@ static int __init sh_dmae_probe(struct platform_device *pdev)
 	/* Default transfer size of 32 bytes requires 32-byte alignment */
 	shdev->common.copy_align = LOG2_DEFAULT_XFER_SIZE;
 
-#if defined(CONFIG_CPU_SH4)
+#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE)
 	chanirq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 1);
 
 	if (!chanirq_res)
@@ -1059,7 +1059,7 @@ static int __init sh_dmae_probe(struct platform_device *pdev)
 
 #else
 	chanirq_res = errirq_res;
-#endif /* CONFIG_CPU_SH4 */
+#endif /* CONFIG_CPU_SH4 || CONFIG_ARCH_SHMOBILE */
 
 	if (chanirq_res->start == chanirq_res->end &&
 	    !platform_get_resource(pdev, IORESOURCE_IRQ, 1)) {
@@ -1106,7 +1106,7 @@ static int __init sh_dmae_probe(struct platform_device *pdev)
 chan_probe_err:
 	sh_dmae_chan_remove(shdev);
 eirqres:
-#if defined(CONFIG_CPU_SH4)
+#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE)
 	free_irq(errirq, shdev);
 eirq_err:
 #endif
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index f5364a1de68b..837efa4e63c2 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -3,7 +3,7 @@
 
 #include <linux/serial_core.h>
 #ifdef CONFIG_SERIAL_SH_SCI_DMA
-#include <asm/dmaengine.h>
+#include <linux/sh_dma.h>
 #endif
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 4e639fdf0d0d745648aa62228ab8a0d9c03a563f Mon Sep 17 00:00:00 2001
From: Peter Jones <pjones@redhat.com>
Date: Thu, 25 Feb 2010 15:37:17 -0500
Subject: ibft: Update iBFT handling for v1.03 of the spec.

- Use struct acpi_table_ibft instead of struct ibft_table_header
- Don't do reserve_ibft_region() on UEFI machines (section 1.4.3.1)
- If ibft_addr isn't initialized when ibft_init() is called, check for
  ACPI-based tables.
- Fix compiler error when CONFIG_ACPI is not defined.

Signed-off-by: Konrad Rzeszutek Wilk <konrad@kernel.org>
Signed-off-by: Peter Jones <pjones@redhat.com>
Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
---
 drivers/firmware/iscsi_ibft.c      | 30 ++++++++++++++++++------------
 drivers/firmware/iscsi_ibft_find.c | 35 ++++++++++++++++++++++++++++++-----
 include/linux/iscsi_ibft.h         | 12 ++----------
 3 files changed, 50 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c
index ed2801c378de..b3ab24f9d78f 100644
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2007 Red Hat, Inc.
+ *  Copyright 2007-2010 Red Hat, Inc.
  *  by Peter Jones <pjones@redhat.com>
  *  Copyright 2008 IBM, Inc.
  *  by Konrad Rzeszutek <konradr@linux.vnet.ibm.com>
@@ -19,6 +19,9 @@
  *
  * Changelog:
  *
+ *  06 Jan 2010 - Peter Jones <pjones@redhat.com>
+ *    New changelog entries are in the git log from now on.  Not here.
+ *
  *  14 Mar 2008 - Konrad Rzeszutek <ketuzsezr@darnok.org>
  *    Updated comments and copyrights. (v0.4.9)
  *
@@ -78,9 +81,10 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/acpi.h>
 
-#define IBFT_ISCSI_VERSION "0.4.9"
-#define IBFT_ISCSI_DATE "2008-Mar-14"
+#define IBFT_ISCSI_VERSION "0.5.0"
+#define IBFT_ISCSI_DATE "2010-Feb-25"
 
 MODULE_AUTHOR("Peter Jones <pjones@redhat.com> and \
 Konrad Rzeszutek <ketuzsezr@darnok.org>");
@@ -238,7 +242,7 @@ static const char *ibft_initiator_properties[] =
  */
 
 struct ibft_kobject {
-	struct ibft_table_header *header;
+	struct acpi_table_ibft *header;
 	union {
 		struct ibft_initiator *initiator;
 		struct ibft_nic *nic;
@@ -536,12 +540,13 @@ static int __init ibft_check_device(void)
 	u8 *pos;
 	u8 csum = 0;
 
-	len = ibft_addr->length;
+	len = ibft_addr->header.length;
 
 	/* Sanity checking of iBFT. */
-	if (ibft_addr->revision != 1) {
+	if (ibft_addr->header.revision != 1) {
 		printk(KERN_ERR "iBFT module supports only revision 1, " \
-				"while this is %d.\n", ibft_addr->revision);
+				"while this is %d.\n",
+				ibft_addr->header.revision);
 		return -ENOENT;
 	}
 	for (pos = (u8 *)ibft_addr; pos < (u8 *)ibft_addr + len; pos++)
@@ -558,7 +563,7 @@ static int __init ibft_check_device(void)
 /*
  * Helper function for ibft_register_kobjects.
  */
-static int __init ibft_create_kobject(struct ibft_table_header *header,
+static int __init ibft_create_kobject(struct acpi_table_ibft *header,
 				       struct ibft_hdr *hdr,
 				       struct list_head *list)
 {
@@ -596,7 +601,7 @@ static int __init ibft_create_kobject(struct ibft_table_header *header,
 	default:
 		printk(KERN_ERR "iBFT has unknown structure type (%d). " \
 				"Report this bug to %.6s!\n", hdr->id,
-				header->oem_id);
+				header->header.oem_id);
 		rc = 1;
 		break;
 	}
@@ -649,7 +654,7 @@ out_invalid_struct:
  * found add them on the passed-in list. We do not support the other
  * fields at this point, so they are skipped.
  */
-static int __init ibft_register_kobjects(struct ibft_table_header *header,
+static int __init ibft_register_kobjects(struct acpi_table_ibft *header,
 					  struct list_head *list)
 {
 	struct ibft_control *control = NULL;
@@ -660,7 +665,7 @@ static int __init ibft_register_kobjects(struct ibft_table_header *header,
 
 	control = (void *)header + sizeof(*header);
 	end = (void *)control + control->hdr.length;
-	eot_offset = (void *)header + header->length - (void *)control;
+	eot_offset = (void *)header + header->header.length - (void *)control;
 	rc = ibft_verify_hdr("control", (struct ibft_hdr *)control, id_control,
 			     sizeof(*control));
 
@@ -672,7 +677,8 @@ static int __init ibft_register_kobjects(struct ibft_table_header *header,
 	}
 	for (ptr = &control->initiator_off; ptr < end; ptr += sizeof(u16)) {
 		offset = *(u16 *)ptr;
-		if (offset && offset < header->length && offset < eot_offset) {
+		if (offset && offset < header->header.length &&
+						offset < eot_offset) {
 			rc = ibft_create_kobject(header,
 						 (void *)header + offset,
 						 list);
diff --git a/drivers/firmware/iscsi_ibft_find.c b/drivers/firmware/iscsi_ibft_find.c
index d6470ef36e4a..dd85555d3296 100644
--- a/drivers/firmware/iscsi_ibft_find.c
+++ b/drivers/firmware/iscsi_ibft_find.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2007 Red Hat, Inc.
+ *  Copyright 2007-2010 Red Hat, Inc.
  *  by Peter Jones <pjones@redhat.com>
  *  Copyright 2007 IBM, Inc.
  *  by Konrad Rzeszutek <konradr@linux.vnet.ibm.com>
@@ -22,6 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/ctype.h>
 #include <linux/device.h>
+#include <linux/efi.h>
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/limits.h>
@@ -30,13 +31,15 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/acpi.h>
+#include <linux/iscsi_ibft.h>
 
 #include <asm/mmzone.h>
 
 /*
  * Physical location of iSCSI Boot Format Table.
  */
-struct ibft_table_header *ibft_addr;
+struct acpi_table_ibft *ibft_addr;
 EXPORT_SYMBOL_GPL(ibft_addr);
 
 #define IBFT_SIGN "iBFT"
@@ -46,6 +49,13 @@ EXPORT_SYMBOL_GPL(ibft_addr);
 #define VGA_MEM 0xA0000 /* VGA buffer */
 #define VGA_SIZE 0x20000 /* 128kB */
 
+#ifdef CONFIG_ACPI
+static int __init acpi_find_ibft(struct acpi_table_header *header)
+{
+	ibft_addr = (struct acpi_table_ibft *)header;
+	return 0;
+}
+#endif /* CONFIG_ACPI */
 
 /*
  * Routine used to find the iSCSI Boot Format Table. The logical
@@ -59,6 +69,11 @@ unsigned long __init find_ibft_region(unsigned long *sizep)
 
 	ibft_addr = NULL;
 
+	/* iBFT 1.03 section 1.4.3.1 mandates that UEFI machines will
+	 * only use ACPI for this */
+	if (efi_enabled)
+		return 0;
+
 	for (pos = IBFT_START; pos < IBFT_END; pos += 16) {
 		/* The table can't be inside the VGA BIOS reserved space,
 		 * so skip that area */
@@ -72,14 +87,24 @@ unsigned long __init find_ibft_region(unsigned long *sizep)
 			/* if the length of the table extends past 1M,
 			 * the table cannot be valid. */
 			if (pos + len <= (IBFT_END-1)) {
-				ibft_addr = (struct ibft_table_header *)virt;
+				ibft_addr = (struct acpi_table_ibft *)virt;
 				break;
 			}
 		}
 	}
+#ifdef CONFIG_ACPI
+	/*
+	 * One spec says "IBFT", the other says "iBFT". We have to check
+	 * for both.
+	 */
+	if (!ibft_addr)
+		acpi_table_parse(ACPI_SIG_IBFT, acpi_find_ibft);
+	if (!ibft_addr)
+		acpi_table_parse("iBFT", acpi_find_ibft);
+#endif /* CONFIG_ACPI */
 	if (ibft_addr) {
-		*sizep = PAGE_ALIGN(len);
-		return pos;
+		*sizep = PAGE_ALIGN(ibft_addr->header.length);
+		return (u64)isa_virt_to_bus(ibft_addr);
 	}
 
 	*sizep = 0;
diff --git a/include/linux/iscsi_ibft.h b/include/linux/iscsi_ibft.h
index d2e4042f8f5e..8ba7e5b9d62c 100644
--- a/include/linux/iscsi_ibft.h
+++ b/include/linux/iscsi_ibft.h
@@ -21,21 +21,13 @@
 #ifndef ISCSI_IBFT_H
 #define ISCSI_IBFT_H
 
-struct ibft_table_header {
-	char signature[4];
-	u32 length;
-	u8 revision;
-	u8 checksum;
-	char oem_id[6];
-	char oem_table_id[8];
-	char reserved[24];
-} __attribute__((__packed__));
+#include <acpi/acpi.h>
 
 /*
  * Logical location of iSCSI Boot Format Table.
  * If the value is NULL there is no iBFT on the machine.
  */
-extern struct ibft_table_header *ibft_addr;
+extern struct acpi_table_ibft *ibft_addr;
 
 /*
  * Routine used to find and reserve the iSCSI Boot Format Table. The
-- 
cgit v1.2.3-59-g8ed1b


From ba4ee30c6c797de148dcc7254cf6d531aba71d9b Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Mon, 12 Apr 2010 18:06:17 +0000
Subject: ibft: separate ibft parsing from sysfs interface

Not all iscsi drivers support ibft. For drivers like be2iscsi
that do not but are bootable through a vendor firmware specific
format/process this patch moves the sysfs interface from the ibft code
to a lib module. This then allows userspace tools to search for iscsi boot
info in a common place and in a common format.

ibft iscsi boot info is exported in the same place as it was
before: /sys/firmware/ibft.

vendor/fw boot info gets export in /sys/firmware/iscsi_bootX, where X is the
scsi host number of the HBA. Underneath these parent dirs, the
target, ethernet, and initiator dirs are the same as they were before.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: Konrad Rzeszutek Wilk <konrad@kernel.org>
Signed-off-by: Peter Jones <pjones@redhat.com>
---
 drivers/firmware/Kconfig            |   8 +
 drivers/firmware/Makefile           |   1 +
 drivers/firmware/iscsi_boot_sysfs.c | 481 ++++++++++++++++++++++++++++++++++++
 include/linux/iscsi_boot_sysfs.h    | 123 +++++++++
 4 files changed, 613 insertions(+)
 create mode 100644 drivers/firmware/iscsi_boot_sysfs.c
 create mode 100644 include/linux/iscsi_boot_sysfs.h

(limited to 'include/linux')

diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 1b03ba1d0834..571d2182613d 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -122,6 +122,14 @@ config ISCSI_IBFT_FIND
 	  is necessary for iSCSI Boot Firmware Table Attributes module to work
 	  properly.
 
+config ISCSI_BOOT_SYSFS
+	tristate "iSCSI Boot Sysfs Interface"
+	default	n
+	help
+	  This option enables support for exposing iSCSI boot information
+	  via sysfs to userspace. If you wish to export this information,
+	  say Y. Otherwise, say N.
+
 config ISCSI_IBFT
 	tristate "iSCSI Boot Firmware Table Attributes module"
 	depends on ISCSI_IBFT_FIND
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 1c3c17343dbe..5fe7e1662922 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -10,4 +10,5 @@ obj-$(CONFIG_DCDBAS)		+= dcdbas.o
 obj-$(CONFIG_DMIID)		+= dmi-id.o
 obj-$(CONFIG_ISCSI_IBFT_FIND)	+= iscsi_ibft_find.o
 obj-$(CONFIG_ISCSI_IBFT)	+= iscsi_ibft.o
+obj-$(CONFIG_ISCSI_BOOT_SYSFS)	+= iscsi_boot_sysfs.o
 obj-$(CONFIG_FIRMWARE_MEMMAP)	+= memmap.o
diff --git a/drivers/firmware/iscsi_boot_sysfs.c b/drivers/firmware/iscsi_boot_sysfs.c
new file mode 100644
index 000000000000..df6bff7366cf
--- /dev/null
+++ b/drivers/firmware/iscsi_boot_sysfs.c
@@ -0,0 +1,481 @@
+/*
+ * Export the iSCSI boot info to userland via sysfs.
+ *
+ * Copyright (C) 2010 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2010 Mike Christie
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2.0 as published by
+ * the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/capability.h>
+#include <linux/iscsi_boot_sysfs.h>
+
+
+MODULE_AUTHOR("Mike Christie <michaelc@cs.wisc.edu>");
+MODULE_DESCRIPTION("sysfs interface and helpers to export iSCSI boot information");
+MODULE_LICENSE("GPL");
+/*
+ * The kobject and attribute structures.
+ */
+struct iscsi_boot_attr {
+	struct attribute attr;
+	int type;
+	ssize_t (*show) (void *data, int type, char *buf);
+};
+
+/*
+ * The routine called for all sysfs attributes.
+ */
+static ssize_t iscsi_boot_show_attribute(struct kobject *kobj,
+					 struct attribute *attr, char *buf)
+{
+	struct iscsi_boot_kobj *boot_kobj =
+			container_of(kobj, struct iscsi_boot_kobj, kobj);
+	struct iscsi_boot_attr *boot_attr =
+			container_of(attr, struct iscsi_boot_attr, attr);
+	ssize_t ret = -EIO;
+	char *str = buf;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (boot_kobj->show)
+		ret = boot_kobj->show(boot_kobj->data, boot_attr->type, str);
+	return ret;
+}
+
+static const struct sysfs_ops iscsi_boot_attr_ops = {
+	.show = iscsi_boot_show_attribute,
+};
+
+static void iscsi_boot_kobj_release(struct kobject *kobj)
+{
+	struct iscsi_boot_kobj *boot_kobj =
+			container_of(kobj, struct iscsi_boot_kobj, kobj);
+
+	kfree(boot_kobj->data);
+	kfree(boot_kobj);
+}
+
+static struct kobj_type iscsi_boot_ktype = {
+	.release = iscsi_boot_kobj_release,
+	.sysfs_ops = &iscsi_boot_attr_ops,
+};
+
+#define iscsi_boot_rd_attr(fnname, sysfs_name, attr_type)		\
+static struct iscsi_boot_attr iscsi_boot_attr_##fnname = {	\
+	.attr	= { .name = __stringify(sysfs_name), .mode = 0444 },	\
+	.type	= attr_type,						\
+}
+
+/* Target attrs */
+iscsi_boot_rd_attr(tgt_index, index, ISCSI_BOOT_TGT_INDEX);
+iscsi_boot_rd_attr(tgt_flags, flags, ISCSI_BOOT_TGT_FLAGS);
+iscsi_boot_rd_attr(tgt_ip, ip-addr, ISCSI_BOOT_TGT_IP_ADDR);
+iscsi_boot_rd_attr(tgt_port, port, ISCSI_BOOT_TGT_PORT);
+iscsi_boot_rd_attr(tgt_lun, lun, ISCSI_BOOT_TGT_LUN);
+iscsi_boot_rd_attr(tgt_chap, chap-type, ISCSI_BOOT_TGT_CHAP_TYPE);
+iscsi_boot_rd_attr(tgt_nic, nic-assoc, ISCSI_BOOT_TGT_NIC_ASSOC);
+iscsi_boot_rd_attr(tgt_name, target-name, ISCSI_BOOT_TGT_NAME);
+iscsi_boot_rd_attr(tgt_chap_name, chap-name, ISCSI_BOOT_TGT_CHAP_NAME);
+iscsi_boot_rd_attr(tgt_chap_secret, chap-secret, ISCSI_BOOT_TGT_CHAP_SECRET);
+iscsi_boot_rd_attr(tgt_chap_rev_name, rev-chap-name,
+		   ISCSI_BOOT_TGT_REV_CHAP_NAME);
+iscsi_boot_rd_attr(tgt_chap_rev_secret, rev-chap-name-secret,
+		   ISCSI_BOOT_TGT_REV_CHAP_SECRET);
+
+static struct attribute *target_attrs[] = {
+	&iscsi_boot_attr_tgt_index.attr,
+	&iscsi_boot_attr_tgt_flags.attr,
+	&iscsi_boot_attr_tgt_ip.attr,
+	&iscsi_boot_attr_tgt_port.attr,
+	&iscsi_boot_attr_tgt_lun.attr,
+	&iscsi_boot_attr_tgt_chap.attr,
+	&iscsi_boot_attr_tgt_nic.attr,
+	&iscsi_boot_attr_tgt_name.attr,
+	&iscsi_boot_attr_tgt_chap_name.attr,
+	&iscsi_boot_attr_tgt_chap_secret.attr,
+	&iscsi_boot_attr_tgt_chap_rev_name.attr,
+	&iscsi_boot_attr_tgt_chap_rev_secret.attr,
+	NULL
+};
+
+static mode_t iscsi_boot_tgt_attr_is_visible(struct kobject *kobj,
+					     struct attribute *attr, int i)
+{
+	struct iscsi_boot_kobj *boot_kobj =
+			container_of(kobj, struct iscsi_boot_kobj, kobj);
+
+	if (attr ==  &iscsi_boot_attr_tgt_index.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_INDEX);
+	else if (attr == &iscsi_boot_attr_tgt_flags.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_FLAGS);
+	else if (attr == &iscsi_boot_attr_tgt_ip.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					      ISCSI_BOOT_TGT_IP_ADDR);
+	else if (attr == &iscsi_boot_attr_tgt_port.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					      ISCSI_BOOT_TGT_PORT);
+	else if (attr == &iscsi_boot_attr_tgt_lun.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					      ISCSI_BOOT_TGT_LUN);
+	else if (attr == &iscsi_boot_attr_tgt_chap.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_CHAP_TYPE);
+	else if (attr == &iscsi_boot_attr_tgt_nic.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_NIC_ASSOC);
+	else if (attr == &iscsi_boot_attr_tgt_name.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_NAME);
+	else if (attr == &iscsi_boot_attr_tgt_chap_name.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_CHAP_NAME);
+	else if (attr == &iscsi_boot_attr_tgt_chap_secret.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_CHAP_SECRET);
+	else if (attr == &iscsi_boot_attr_tgt_chap_rev_name.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_REV_CHAP_NAME);
+	else if (attr == &iscsi_boot_attr_tgt_chap_rev_secret.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_TGT_REV_CHAP_SECRET);
+	return 0;
+}
+
+static struct attribute_group iscsi_boot_target_attr_group = {
+	.attrs = target_attrs,
+	.is_visible = iscsi_boot_tgt_attr_is_visible,
+};
+
+/* Ethernet attrs */
+iscsi_boot_rd_attr(eth_index, index, ISCSI_BOOT_ETH_INDEX);
+iscsi_boot_rd_attr(eth_flags, flags, ISCSI_BOOT_ETH_FLAGS);
+iscsi_boot_rd_attr(eth_ip, ip-addr, ISCSI_BOOT_ETH_IP_ADDR);
+iscsi_boot_rd_attr(eth_subnet, subnet-mask, ISCSI_BOOT_ETH_SUBNET_MASK);
+iscsi_boot_rd_attr(eth_origin, origin, ISCSI_BOOT_ETH_ORIGIN);
+iscsi_boot_rd_attr(eth_gateway, gateway, ISCSI_BOOT_ETH_GATEWAY);
+iscsi_boot_rd_attr(eth_primary_dns, primary-dns, ISCSI_BOOT_ETH_PRIMARY_DNS);
+iscsi_boot_rd_attr(eth_secondary_dns, secondary-dns,
+		   ISCSI_BOOT_ETH_SECONDARY_DNS);
+iscsi_boot_rd_attr(eth_dhcp, dhcp, ISCSI_BOOT_ETH_DHCP);
+iscsi_boot_rd_attr(eth_vlan, vlan, ISCSI_BOOT_ETH_VLAN);
+iscsi_boot_rd_attr(eth_mac, mac, ISCSI_BOOT_ETH_MAC);
+iscsi_boot_rd_attr(eth_hostname, hostname, ISCSI_BOOT_ETH_HOSTNAME);
+
+static struct attribute *ethernet_attrs[] = {
+	&iscsi_boot_attr_eth_index.attr,
+	&iscsi_boot_attr_eth_flags.attr,
+	&iscsi_boot_attr_eth_ip.attr,
+	&iscsi_boot_attr_eth_subnet.attr,
+	&iscsi_boot_attr_eth_origin.attr,
+	&iscsi_boot_attr_eth_gateway.attr,
+	&iscsi_boot_attr_eth_primary_dns.attr,
+	&iscsi_boot_attr_eth_secondary_dns.attr,
+	&iscsi_boot_attr_eth_dhcp.attr,
+	&iscsi_boot_attr_eth_vlan.attr,
+	&iscsi_boot_attr_eth_mac.attr,
+	&iscsi_boot_attr_eth_hostname.attr,
+	NULL
+};
+
+static mode_t iscsi_boot_eth_attr_is_visible(struct kobject *kobj,
+					     struct attribute *attr, int i)
+{
+	struct iscsi_boot_kobj *boot_kobj =
+			container_of(kobj, struct iscsi_boot_kobj, kobj);
+
+	if (attr ==  &iscsi_boot_attr_eth_index.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_INDEX);
+	else if (attr ==  &iscsi_boot_attr_eth_flags.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_FLAGS);
+	else if (attr ==  &iscsi_boot_attr_eth_ip.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_IP_ADDR);
+	else if (attr ==  &iscsi_boot_attr_eth_subnet.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_SUBNET_MASK);
+	else if (attr ==  &iscsi_boot_attr_eth_origin.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_ORIGIN);
+	else if (attr ==  &iscsi_boot_attr_eth_gateway.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_GATEWAY);
+	else if (attr ==  &iscsi_boot_attr_eth_primary_dns.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_PRIMARY_DNS);
+	else if (attr ==  &iscsi_boot_attr_eth_secondary_dns.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_SECONDARY_DNS);
+	else if (attr ==  &iscsi_boot_attr_eth_dhcp.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_DHCP);
+	else if (attr ==  &iscsi_boot_attr_eth_vlan.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_VLAN);
+	else if (attr ==  &iscsi_boot_attr_eth_mac.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_MAC);
+	else if (attr ==  &iscsi_boot_attr_eth_hostname.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_ETH_HOSTNAME);
+	return 0;
+}
+
+static struct attribute_group iscsi_boot_ethernet_attr_group = {
+	.attrs = ethernet_attrs,
+	.is_visible = iscsi_boot_eth_attr_is_visible,
+};
+
+/* Initiator attrs */
+iscsi_boot_rd_attr(ini_index, index, ISCSI_BOOT_INI_INDEX);
+iscsi_boot_rd_attr(ini_flags, flags, ISCSI_BOOT_INI_FLAGS);
+iscsi_boot_rd_attr(ini_isns, isns-server, ISCSI_BOOT_INI_ISNS_SERVER);
+iscsi_boot_rd_attr(ini_slp, slp-server, ISCSI_BOOT_INI_SLP_SERVER);
+iscsi_boot_rd_attr(ini_primary_radius, pri-radius-server,
+		   ISCSI_BOOT_INI_PRI_RADIUS_SERVER);
+iscsi_boot_rd_attr(ini_secondary_radius, sec-radius-server,
+		   ISCSI_BOOT_INI_SEC_RADIUS_SERVER);
+iscsi_boot_rd_attr(ini_name, initiator-name, ISCSI_BOOT_INI_INITIATOR_NAME);
+
+static struct attribute *initiator_attrs[] = {
+	&iscsi_boot_attr_ini_index.attr,
+	&iscsi_boot_attr_ini_flags.attr,
+	&iscsi_boot_attr_ini_isns.attr,
+	&iscsi_boot_attr_ini_slp.attr,
+	&iscsi_boot_attr_ini_primary_radius.attr,
+	&iscsi_boot_attr_ini_secondary_radius.attr,
+	&iscsi_boot_attr_ini_name.attr,
+	NULL
+};
+
+static mode_t iscsi_boot_ini_attr_is_visible(struct kobject *kobj,
+					     struct attribute *attr, int i)
+{
+	struct iscsi_boot_kobj *boot_kobj =
+			container_of(kobj, struct iscsi_boot_kobj, kobj);
+
+	if (attr ==  &iscsi_boot_attr_ini_index.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_INI_INDEX);
+	if (attr ==  &iscsi_boot_attr_ini_flags.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_INI_FLAGS);
+	if (attr ==  &iscsi_boot_attr_ini_isns.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_INI_ISNS_SERVER);
+	if (attr ==  &iscsi_boot_attr_ini_slp.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_INI_SLP_SERVER);
+	if (attr ==  &iscsi_boot_attr_ini_primary_radius.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_INI_PRI_RADIUS_SERVER);
+	if (attr ==  &iscsi_boot_attr_ini_secondary_radius.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_INI_SEC_RADIUS_SERVER);
+	if (attr ==  &iscsi_boot_attr_ini_name.attr)
+		return boot_kobj->is_visible(boot_kobj->data,
+					     ISCSI_BOOT_INI_INITIATOR_NAME);
+
+	return 0;
+}
+
+static struct attribute_group iscsi_boot_initiator_attr_group = {
+	.attrs = initiator_attrs,
+	.is_visible = iscsi_boot_ini_attr_is_visible,
+};
+
+static struct iscsi_boot_kobj *
+iscsi_boot_create_kobj(struct iscsi_boot_kset *boot_kset,
+		       struct attribute_group *attr_group,
+		       const char *name, int index, void *data,
+		       ssize_t (*show) (void *data, int type, char *buf),
+		       mode_t (*is_visible) (void *data, int type))
+{
+	struct iscsi_boot_kobj *boot_kobj;
+
+	boot_kobj = kzalloc(sizeof(*boot_kobj), GFP_KERNEL);
+	if (!boot_kobj)
+		return NULL;
+	INIT_LIST_HEAD(&boot_kobj->list);
+
+	boot_kobj->kobj.kset = boot_kset->kset;
+	if (kobject_init_and_add(&boot_kobj->kobj, &iscsi_boot_ktype,
+				 NULL, name, index)) {
+		kfree(boot_kobj);
+		return NULL;
+	}
+	boot_kobj->data = data;
+	boot_kobj->show = show;
+	boot_kobj->is_visible = is_visible;
+
+	if (sysfs_create_group(&boot_kobj->kobj, attr_group)) {
+		/*
+		 * We do not want to free this because the caller
+		 * will assume that since the creation call failed
+		 * the boot kobj was not setup and the normal release
+		 * path is not being run.
+		 */
+		boot_kobj->data = NULL;
+		kobject_put(&boot_kobj->kobj);
+		return NULL;
+	}
+	boot_kobj->attr_group = attr_group;
+
+	kobject_uevent(&boot_kobj->kobj, KOBJ_ADD);
+	/* Nothing broke so lets add it to the list. */
+	list_add_tail(&boot_kobj->list, &boot_kset->kobj_list);
+	return boot_kobj;
+}
+
+static void iscsi_boot_remove_kobj(struct iscsi_boot_kobj *boot_kobj)
+{
+	list_del(&boot_kobj->list);
+	sysfs_remove_group(&boot_kobj->kobj, boot_kobj->attr_group);
+	kobject_put(&boot_kobj->kobj);
+}
+
+/**
+ * iscsi_boot_create_target() - create boot target sysfs dir
+ * @boot_kset: boot kset
+ * @index: the target id
+ * @data: driver specific data for target
+ * @show: attr show function
+ * @is_visible: attr visibility function
+ *
+ * Note: The boot sysfs lib will free the data passed in for the caller
+ * when all refs to the target kobject have been released.
+ */
+struct iscsi_boot_kobj *
+iscsi_boot_create_target(struct iscsi_boot_kset *boot_kset, int index,
+			 void *data,
+			 ssize_t (*show) (void *data, int type, char *buf),
+			 mode_t (*is_visible) (void *data, int type))
+{
+	return iscsi_boot_create_kobj(boot_kset, &iscsi_boot_target_attr_group,
+				      "target%d", index, data, show, is_visible);
+}
+EXPORT_SYMBOL_GPL(iscsi_boot_create_target);
+
+/**
+ * iscsi_boot_create_initiator() - create boot initiator sysfs dir
+ * @boot_kset: boot kset
+ * @index: the initiator id
+ * @data: driver specific data
+ * @show: attr show function
+ * @is_visible: attr visibility function
+ *
+ * Note: The boot sysfs lib will free the data passed in for the caller
+ * when all refs to the initiator kobject have been released.
+ */
+struct iscsi_boot_kobj *
+iscsi_boot_create_initiator(struct iscsi_boot_kset *boot_kset, int index,
+			    void *data,
+			    ssize_t (*show) (void *data, int type, char *buf),
+			    mode_t (*is_visible) (void *data, int type))
+{
+	return iscsi_boot_create_kobj(boot_kset,
+				      &iscsi_boot_initiator_attr_group,
+				      "initiator", index, data, show,
+				      is_visible);
+}
+EXPORT_SYMBOL_GPL(iscsi_boot_create_initiator);
+
+/**
+ * iscsi_boot_create_ethernet() - create boot ethernet sysfs dir
+ * @boot_kset: boot kset
+ * @index: the ethernet device id
+ * @data: driver specific data
+ * @show: attr show function
+ * @is_visible: attr visibility function
+ *
+ * Note: The boot sysfs lib will free the data passed in for the caller
+ * when all refs to the ethernet kobject have been released.
+ */
+struct iscsi_boot_kobj *
+iscsi_boot_create_ethernet(struct iscsi_boot_kset *boot_kset, int index,
+			   void *data,
+			   ssize_t (*show) (void *data, int type, char *buf),
+			   mode_t (*is_visible) (void *data, int type))
+{
+	return iscsi_boot_create_kobj(boot_kset,
+				      &iscsi_boot_ethernet_attr_group,
+				      "ethernet%d", index, data, show,
+				      is_visible);
+}
+EXPORT_SYMBOL_GPL(iscsi_boot_create_ethernet);
+
+/**
+ * iscsi_boot_create_kset() - creates root sysfs tree
+ * @set_name: name of root dir
+ */
+struct iscsi_boot_kset *iscsi_boot_create_kset(const char *set_name)
+{
+	struct iscsi_boot_kset *boot_kset;
+
+	boot_kset = kzalloc(sizeof(*boot_kset), GFP_KERNEL);
+	if (!boot_kset)
+		return NULL;
+
+	boot_kset->kset = kset_create_and_add(set_name, NULL, firmware_kobj);
+	if (!boot_kset->kset) {
+		kfree(boot_kset);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&boot_kset->kobj_list);
+	return boot_kset;
+}
+EXPORT_SYMBOL_GPL(iscsi_boot_create_kset);
+
+/**
+ * iscsi_boot_create_host_kset() - creates root sysfs tree for a scsi host
+ * @hostno: host number of scsi host
+ */
+struct iscsi_boot_kset *iscsi_boot_create_host_kset(unsigned int hostno)
+{
+	struct iscsi_boot_kset *boot_kset;
+	char *set_name;
+
+	set_name = kasprintf(GFP_KERNEL, "iscsi_boot%u", hostno);
+	if (!set_name)
+		return NULL;
+
+	boot_kset = iscsi_boot_create_kset(set_name);
+	kfree(set_name);
+	return boot_kset;
+}
+EXPORT_SYMBOL_GPL(iscsi_boot_create_host_kset);
+
+/**
+ * iscsi_boot_destroy_kset() - destroy kset and kobjects under it
+ * @boot_kset: boot kset
+ *
+ * This will remove the kset and kobjects and attrs under it.
+ */
+void iscsi_boot_destroy_kset(struct iscsi_boot_kset *boot_kset)
+{
+	struct iscsi_boot_kobj *boot_kobj, *tmp_kobj;
+
+	list_for_each_entry_safe(boot_kobj, tmp_kobj,
+				 &boot_kset->kobj_list, list)
+		iscsi_boot_remove_kobj(boot_kobj);
+
+	kset_unregister(boot_kset->kset);
+}
+EXPORT_SYMBOL_GPL(iscsi_boot_destroy_kset);
diff --git a/include/linux/iscsi_boot_sysfs.h b/include/linux/iscsi_boot_sysfs.h
new file mode 100644
index 000000000000..f1e6c184f14f
--- /dev/null
+++ b/include/linux/iscsi_boot_sysfs.h
@@ -0,0 +1,123 @@
+/*
+ * Export the iSCSI boot info to userland via sysfs.
+ *
+ * Copyright (C) 2010 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2010 Mike Christie
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2.0 as published by
+ * the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _ISCSI_BOOT_SYSFS_
+#define _ISCSI_BOOT_SYSFS_
+
+/*
+ * The text attributes names for each of the kobjects.
+*/
+enum iscsi_boot_eth_properties_enum {
+	ISCSI_BOOT_ETH_INDEX,
+	ISCSI_BOOT_ETH_FLAGS,
+	ISCSI_BOOT_ETH_IP_ADDR,
+	ISCSI_BOOT_ETH_SUBNET_MASK,
+	ISCSI_BOOT_ETH_ORIGIN,
+	ISCSI_BOOT_ETH_GATEWAY,
+	ISCSI_BOOT_ETH_PRIMARY_DNS,
+	ISCSI_BOOT_ETH_SECONDARY_DNS,
+	ISCSI_BOOT_ETH_DHCP,
+	ISCSI_BOOT_ETH_VLAN,
+	ISCSI_BOOT_ETH_MAC,
+	/* eth_pci_bdf - this is replaced by link to the device itself. */
+	ISCSI_BOOT_ETH_HOSTNAME,
+	ISCSI_BOOT_ETH_END_MARKER,
+};
+
+enum iscsi_boot_tgt_properties_enum {
+	ISCSI_BOOT_TGT_INDEX,
+	ISCSI_BOOT_TGT_FLAGS,
+	ISCSI_BOOT_TGT_IP_ADDR,
+	ISCSI_BOOT_TGT_PORT,
+	ISCSI_BOOT_TGT_LUN,
+	ISCSI_BOOT_TGT_CHAP_TYPE,
+	ISCSI_BOOT_TGT_NIC_ASSOC,
+	ISCSI_BOOT_TGT_NAME,
+	ISCSI_BOOT_TGT_CHAP_NAME,
+	ISCSI_BOOT_TGT_CHAP_SECRET,
+	ISCSI_BOOT_TGT_REV_CHAP_NAME,
+	ISCSI_BOOT_TGT_REV_CHAP_SECRET,
+	ISCSI_BOOT_TGT_END_MARKER,
+};
+
+enum iscsi_boot_initiator_properties_enum {
+	ISCSI_BOOT_INI_INDEX,
+	ISCSI_BOOT_INI_FLAGS,
+	ISCSI_BOOT_INI_ISNS_SERVER,
+	ISCSI_BOOT_INI_SLP_SERVER,
+	ISCSI_BOOT_INI_PRI_RADIUS_SERVER,
+	ISCSI_BOOT_INI_SEC_RADIUS_SERVER,
+	ISCSI_BOOT_INI_INITIATOR_NAME,
+	ISCSI_BOOT_INI_END_MARKER,
+};
+
+struct attribute_group;
+
+struct iscsi_boot_kobj {
+	struct kobject kobj;
+	struct attribute_group *attr_group;
+	struct list_head list;
+
+	/*
+	 * Pointer to store driver specific info. If set this will
+	 * be freed for the LLD when the kobj release function is called.
+	 */
+	void *data;
+	/*
+	 * Driver specific show function.
+	 *
+	 * The enum of the type. This can be any value of the above
+	 * properties.
+	 */
+	ssize_t (*show) (void *data, int type, char *buf);
+
+	/*
+	 * Drivers specific visibility function.
+	 * The function should return if they the attr should be readable
+	 * writable or should not be shown.
+	 *
+	 * The enum of the type. This can be any value of the above
+	 * properties.
+	 */
+	mode_t (*is_visible) (void *data, int type);
+};
+
+struct iscsi_boot_kset {
+	struct list_head kobj_list;
+	struct kset *kset;
+};
+
+struct iscsi_boot_kobj *
+iscsi_boot_create_initiator(struct iscsi_boot_kset *boot_kset, int index,
+			    void *data,
+			    ssize_t (*show) (void *data, int type, char *buf),
+			    mode_t (*is_visible) (void *data, int type));
+
+struct iscsi_boot_kobj *
+iscsi_boot_create_ethernet(struct iscsi_boot_kset *boot_kset, int index,
+			   void *data,
+			   ssize_t (*show) (void *data, int type, char *buf),
+			   mode_t (*is_visible) (void *data, int type));
+struct iscsi_boot_kobj *
+iscsi_boot_create_target(struct iscsi_boot_kset *boot_kset, int index,
+			 void *data,
+			 ssize_t (*show) (void *data, int type, char *buf),
+			 mode_t (*is_visible) (void *data, int type));
+
+struct iscsi_boot_kset *iscsi_boot_create_kset(const char *set_name);
+struct iscsi_boot_kset *iscsi_boot_create_host_kset(unsigned int hostno);
+void iscsi_boot_destroy_kset(struct iscsi_boot_kset *boot_kset);
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 7 May 2010 17:11:44 -0400
Subject: lockup_detector: Combine nmi_watchdog and softlockup detector

The new nmi_watchdog (which uses the perf event subsystem) is very
similar in structure to the softlockup detector.  Using Ingo's
suggestion, I combined the two functionalities into one file:
kernel/watchdog.c.

Now both the nmi_watchdog (or hardlockup detector) and softlockup
detector sit on top of the perf event subsystem, which is run every
60 seconds or so to see if there are any lockups.

To detect hardlockups, cpus not responding to interrupts, I
implemented an hrtimer that runs 5 times for every perf event
overflow event.  If that stops counting on a cpu, then the cpu is
most likely in trouble.

To detect softlockups, tasks not yielding to the scheduler, I used the
previous kthread idea that now gets kicked every time the hrtimer fires.
If the kthread isn't being scheduled neither is anyone else and the
warning is printed to the console.

I tested this on x86_64 and both the softlockup and hardlockup paths
work.

V2:
- cleaned up the Kconfig and softlockup combination
- surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI
- seperated out the softlockup case from perf event subsystem
- re-arranged the enabling/disabling nmi watchdog from proc space
- added cpumasks for hardlockup failure cases
- removed fallback to soft events if no PMU exists for hard events

V3:
- comment cleanups
- drop support for older softlockup code
- per_cpu cleanups
- completely remove software clock base hardlockup detector
- use per_cpu masking on hard/soft lockup detection
- #ifdef cleanups
- rename config option NMI_WATCHDOG to LOCKUP_DETECTOR
- documentation additions

V4:
- documentation fixes
- convert per_cpu to __get_cpu_var
- powerpc compile fixes

V5:
- split apart warn flags for hard and soft lockups

TODO:
- figure out how to make an arch-agnostic clock2cycles call
  (if possible) to feed into perf events as a sample period

[fweisbec: merged conflict patch]

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/kernel-parameters.txt |   2 +
 arch/x86/include/asm/nmi.h          |   2 +-
 arch/x86/kernel/apic/Makefile       |   4 +-
 arch/x86/kernel/apic/hw_nmi.c       |   2 +-
 arch/x86/kernel/traps.c             |   4 +-
 include/linux/nmi.h                 |   8 +-
 include/linux/sched.h               |   6 +
 init/Kconfig                        |   5 +-
 kernel/Makefile                     |   3 +-
 kernel/sysctl.c                     |  21 +-
 kernel/watchdog.c                   | 592 ++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug                   |  30 +-
 12 files changed, 650 insertions(+), 29 deletions(-)
 create mode 100644 kernel/watchdog.c

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 839b21b0699a..dfe8d1c226c6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1777,6 +1777,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nousb		[USB] Disable the USB subsystem
 
+	nowatchdog	[KNL] Disable the lockup detector.
+
 	nowb		[ARM]
 
 	nox2apic	[X86-64,APIC] Do not enable x2APIC mode.
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 5b41b0feb6db..932f0f86b4b7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_NMI_WATCHDOG)
+#if !defined(CONFIG_LOCKUP_DETECTOR)
 extern int nmi_watchdog_enabled;
 #endif
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 1a4512e48d24..52f32e0ea194 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,10 @@
 #
 
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_NMI_WATCHDOG),y)
+ifneq ($(CONFIG_LOCKUP_DETECTOR),y)
 obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
 endif
-obj-$(CONFIG_NMI_WATCHDOG)	+= hw_nmi.o
+obj-$(CONFIG_LOCKUP_DETECTOR)	+= hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index e8b78a0be5de..79425f96fcee 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
 
 u64 hw_nmi_get_sample_period(void)
 {
-	return cpu_khz * 1000;
+	return (u64)(cpu_khz) * 1000 * 60;
 }
 
 #ifdef ARCH_HAS_NMI_WATCHDOG
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bdc7fab3ef3e..bd347c2b34dc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 							== NOTIFY_STOP)
 			return;
 
-#ifndef CONFIG_NMI_WATCHDOG
+#ifndef CONFIG_LOCKUP_DETECTOR
 		/*
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
@@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		if (nmi_watchdog_tick(regs, reason))
 			return;
 		if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_NMI_WATCHDOG */
+#endif /* !CONFIG_LOCKUP_DETECTOR */
 			unknown_nmi_error(reason, regs);
 #else
 		unknown_nmi_error(reason, regs);
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 22cc7960b649..abd48aacaf79 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void);
 extern void acpi_nmi_disable(void);
 extern void acpi_nmi_enable(void);
 #else
-#ifndef CONFIG_NMI_WATCHDOG
+#ifndef CONFIG_LOCKUP_DETECTOR
 static inline void touch_nmi_watchdog(void)
 {
 	touch_softlockup_watchdog();
@@ -51,12 +51,12 @@ static inline bool trigger_all_cpu_backtrace(void)
 }
 #endif
 
-#ifdef CONFIG_NMI_WATCHDOG
+#ifdef CONFIG_LOCKUP_DETECTOR
 int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(void);
-extern int nmi_watchdog_enabled;
+extern int watchdog_enabled;
 struct ctl_table;
-extern int proc_nmi_enabled(struct ctl_table *, int ,
+extern int proc_dowatchdog_enabled(struct ctl_table *, int ,
 			void __user *, size_t *, loff_t *);
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dad7f668ebf7..37efe8fa5306 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -346,6 +346,12 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_LOCKUP_DETECTOR
+extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
+				  void __user *buffer,
+				  size_t *lenp, loff_t *ppos);
+#endif
+
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 
diff --git a/init/Kconfig b/init/Kconfig
index c6c8903cb534..e44e25422f22 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -944,8 +944,11 @@ config PERF_USE_VMALLOC
 
 config PERF_EVENTS_NMI
 	bool
+	depends on PERF_EVENTS
 	help
-	  Arch has support for nmi_watchdog
+	  System hardware can generate an NMI using the perf event
+	  subsystem.  Also has support for calculating CPU cycle events
+	  to determine how many clock cycles in a given period.
 
 menu "Kernel Performance Events And Counters"
 
diff --git a/kernel/Makefile b/kernel/Makefile
index d5c30060ac14..6adeafc3e259 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -76,9 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
-obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
-obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a38af430f0d8..0f9adda85f97 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -74,7 +74,7 @@
 #include <scsi/sg.h>
 #endif
 
-#ifdef CONFIG_NMI_WATCHDOG
+#ifdef CONFIG_LOCKUP_DETECTOR
 #include <linux/nmi.h>
 #endif
 
@@ -686,16 +686,25 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
 	},
-#if defined(CONFIG_NMI_WATCHDOG)
+#if defined(CONFIG_LOCKUP_DETECTOR)
 	{
-		.procname       = "nmi_watchdog",
-		.data           = &nmi_watchdog_enabled,
+		.procname       = "watchdog",
+		.data           = &watchdog_enabled,
 		.maxlen         = sizeof (int),
 		.mode           = 0644,
-		.proc_handler   = proc_nmi_enabled,
+		.proc_handler   = proc_dowatchdog_enabled,
+	},
+	{
+		.procname	= "watchdog_thresh",
+		.data		= &softlockup_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dowatchdog_thresh,
+		.extra1		= &neg_one,
+		.extra2		= &sixty,
 	},
 #endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG)
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
 	{
 		.procname       = "unknown_nmi_panic",
 		.data           = &unknown_nmi_panic,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000000..6b7fad8497af
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,592 @@
+/*
+ * Detect hard and soft lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * this code detects hard lockups: incidents in where on a CPU
+ * the kernel does not respond to anything except NMI.
+ *
+ * Note: Most of this code is borrowed heavily from softlockup.c,
+ * so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
+ * to those contributors as well.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+int watchdog_enabled;
+int __read_mostly softlockup_thresh = 60;
+
+static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
+static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(bool, softlockup_touch_sync);
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+#ifdef CONFIG_PERF_EVENTS_NMI
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+#endif
+
+static int __read_mostly did_panic;
+static int __initdata no_watchdog;
+
+
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+#ifdef CONFIG_PERF_EVENTS_NMI
+static int hardlockup_panic;
+
+static int __init hardlockup_panic_setup(char *str)
+{
+	if (!strncmp(str, "panic", 5))
+		hardlockup_panic = 1;
+	return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+#endif
+
+unsigned int __read_mostly softlockup_panic =
+			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+
+static int __init softlockup_panic_setup(char *str)
+{
+	softlockup_panic = simple_strtoul(str, NULL, 0);
+
+	return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
+static int __init nowatchdog_setup(char *str)
+{
+	no_watchdog = 1;
+	return 1;
+}
+__setup("nowatchdog", nowatchdog_setup);
+
+/* deprecated */
+static int __init nosoftlockup_setup(char *str)
+{
+	no_watchdog = 1;
+	return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+/*  */
+
+
+/*
+ * Returns seconds, approximately.  We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(int this_cpu)
+{
+	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+}
+
+static unsigned long get_sample_period(void)
+{
+	/*
+	 * convert softlockup_thresh from seconds to ns
+	 * the divide by 5 is to give hrtimer 5 chances to
+	 * increment before the hardlockup detector generates
+	 * a warning
+	 */
+	return softlockup_thresh / 5 * NSEC_PER_SEC;
+}
+
+/* Commands for resetting the watchdog */
+static void __touch_watchdog(void)
+{
+	int this_cpu = raw_smp_processor_id();
+
+	__get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+}
+
+void touch_watchdog(void)
+{
+	__get_cpu_var(watchdog_touch_ts) = 0;
+}
+EXPORT_SYMBOL(touch_watchdog);
+
+void touch_all_watchdog(void)
+{
+	int cpu;
+
+	/*
+	 * this is done lockless
+	 * do we care if a 0 races with a timestamp?
+	 * all it means is the softlock check starts one cycle later
+	 */
+	for_each_online_cpu(cpu)
+		per_cpu(watchdog_touch_ts, cpu) = 0;
+}
+
+void touch_nmi_watchdog(void)
+{
+	touch_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+void touch_all_nmi_watchdog(void)
+{
+	touch_all_watchdog();
+}
+
+void touch_softlockup_watchdog(void)
+{
+	touch_watchdog();
+}
+
+void touch_all_softlockup_watchdogs(void)
+{
+	touch_all_watchdog();
+}
+
+void touch_softlockup_watchdog_sync(void)
+{
+	__raw_get_cpu_var(softlockup_touch_sync) = true;
+	__raw_get_cpu_var(watchdog_touch_ts) = 0;
+}
+
+void softlockup_tick(void)
+{
+}
+
+#ifdef CONFIG_PERF_EVENTS_NMI
+/* watchdog detector functions */
+static int is_hardlockup(int cpu)
+{
+	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+		return 1;
+
+	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+	return 0;
+}
+#endif
+
+static int is_softlockup(unsigned long touch_ts, int cpu)
+{
+	unsigned long now = get_timestamp(cpu);
+
+	/* Warn about unreasonable delays: */
+	if (time_after(now, touch_ts + softlockup_thresh))
+		return now - touch_ts;
+
+	return 0;
+}
+
+static int
+watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = watchdog_panic,
+};
+
+#ifdef CONFIG_PERF_EVENTS_NMI
+static struct perf_event_attr wd_hw_attr = {
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
+};
+
+/* Callback function for perf event subsystem */
+void watchdog_overflow_callback(struct perf_event *event, int nmi,
+		 struct perf_sample_data *data,
+		 struct pt_regs *regs)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long touch_ts = per_cpu(watchdog_touch_ts, this_cpu);
+
+	if (touch_ts == 0) {
+		__touch_watchdog();
+		return;
+	}
+
+	/* check for a hardlockup
+	 * This is done by making sure our timer interrupt
+	 * is incrementing.  The timer interrupt should have
+	 * fired multiple times before we overflow'd.  If it hasn't
+	 * then this is a good indication the cpu is stuck
+	 */
+	if (is_hardlockup(this_cpu)) {
+		/* only print hardlockups once */
+		if (__get_cpu_var(hard_watchdog_warn) == true)
+			return;
+
+		if (hardlockup_panic)
+			panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+		else
+			WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+
+		__get_cpu_var(hard_watchdog_warn) = true;
+		return;
+	}
+
+	__get_cpu_var(hard_watchdog_warn) = false;
+	return;
+}
+static void watchdog_interrupt_count(void)
+{
+	__get_cpu_var(hrtimer_interrupts)++;
+}
+#else
+static inline void watchdog_interrupt_count(void) { return; }
+#endif /* CONFIG_PERF_EVENTS_NMI */
+
+/* watchdog kicker functions */
+static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
+	struct pt_regs *regs = get_irq_regs();
+	int duration;
+
+	/* kick the hardlockup detector */
+	watchdog_interrupt_count();
+
+	/* kick the softlockup detector */
+	wake_up_process(__get_cpu_var(softlockup_watchdog));
+
+	/* .. and repeat */
+	hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
+
+	if (touch_ts == 0) {
+		if (unlikely(per_cpu(softlockup_touch_sync, this_cpu))) {
+			/*
+			 * If the time stamp was touched atomically
+			 * make sure the scheduler tick is up to date.
+			 */
+			per_cpu(softlockup_touch_sync, this_cpu) = false;
+			sched_clock_tick();
+		}
+		__touch_watchdog();
+		return HRTIMER_RESTART;
+	}
+
+	/* check for a softlockup
+	 * This is done by making sure a high priority task is
+	 * being scheduled.  The task touches the watchdog to
+	 * indicate it is getting cpu time.  If it hasn't then
+	 * this is a good indication some task is hogging the cpu
+	 */
+	duration = is_softlockup(touch_ts, this_cpu);
+	if (unlikely(duration)) {
+		/* only warn once */
+		if (__get_cpu_var(soft_watchdog_warn) == true)
+			return HRTIMER_RESTART;
+
+		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+			this_cpu, duration,
+			current->comm, task_pid_nr(current));
+		print_modules();
+		print_irqtrace_events(current);
+		if (regs)
+			show_regs(regs);
+		else
+			dump_stack();
+
+		if (softlockup_panic)
+			panic("softlockup: hung tasks");
+		__get_cpu_var(soft_watchdog_warn) = true;
+	} else
+		__get_cpu_var(soft_watchdog_warn) = false;
+
+	return HRTIMER_RESTART;
+}
+
+
+/*
+ * The watchdog thread - touches the timestamp.
+ */
+static int watchdog(void *__bind_cpu)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, (unsigned long)__bind_cpu);
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+
+	/* initialize timestamp */
+	__touch_watchdog();
+
+	/* kick off the timer for the hardlockup detector */
+	/* done here because hrtimer_start can only pin to smp_processor_id() */
+	hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
+		      HRTIMER_MODE_REL_PINNED);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	/*
+	 * Run briefly once per second to reset the softlockup timestamp.
+	 * If this gets delayed for more than 60 seconds then the
+	 * debug-printout triggers in softlockup_tick().
+	 */
+	while (!kthread_should_stop()) {
+		__touch_watchdog();
+		schedule();
+
+		if (kthread_should_stop())
+			break;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+
+#ifdef CONFIG_PERF_EVENTS_NMI
+static int watchdog_nmi_enable(int cpu)
+{
+	struct perf_event_attr *wd_attr;
+	struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+	/* is it already setup and enabled? */
+	if (event && event->state > PERF_EVENT_STATE_OFF)
+		goto out;
+
+	/* it is setup but not enabled */
+	if (event != NULL)
+		goto out_enable;
+
+	/* Try to register using hardware perf events */
+	wd_attr = &wd_hw_attr;
+	wd_attr->sample_period = hw_nmi_get_sample_period();
+	event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
+	if (!IS_ERR(event)) {
+		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
+		goto out_save;
+	}
+
+	printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
+	return -1;
+
+	/* success path */
+out_save:
+	per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+	perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+	return 0;
+}
+
+static void watchdog_nmi_disable(int cpu)
+{
+	struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+	if (event) {
+		perf_event_disable(event);
+		per_cpu(watchdog_ev, cpu) = NULL;
+
+		/* should be in cleanup, but blocks oprofile */
+		perf_event_release_kernel(event);
+	}
+	return;
+}
+#else
+static int watchdog_nmi_enable(int cpu) { return 0; }
+static void watchdog_nmi_disable(int cpu) { return; }
+#endif /* CONFIG_PERF_EVENTS_NMI */
+
+/* prepare/enable/disable routines */
+static int watchdog_prepare_cpu(int cpu)
+{
+	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
+
+	WARN_ON(per_cpu(softlockup_watchdog, cpu));
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = watchdog_timer_fn;
+
+	return 0;
+}
+
+static int watchdog_enable(int cpu)
+{
+	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+
+	/* enable the perf event */
+	if (watchdog_nmi_enable(cpu) != 0)
+		return -1;
+
+	/* create the watchdog thread */
+	if (!p) {
+		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
+		if (IS_ERR(p)) {
+			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
+			return -1;
+		}
+		kthread_bind(p, cpu);
+		per_cpu(watchdog_touch_ts, cpu) = 0;
+		per_cpu(softlockup_watchdog, cpu) = p;
+		wake_up_process(p);
+	}
+
+	return 0;
+}
+
+static void watchdog_disable(int cpu)
+{
+	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
+
+	/*
+	 * cancel the timer first to stop incrementing the stats
+	 * and waking up the kthread
+	 */
+	hrtimer_cancel(hrtimer);
+
+	/* disable the perf event */
+	watchdog_nmi_disable(cpu);
+
+	/* stop the watchdog thread */
+	if (p) {
+		per_cpu(softlockup_watchdog, cpu) = NULL;
+		kthread_stop(p);
+	}
+
+	/* if any cpu succeeds, watchdog is considered enabled for the system */
+	watchdog_enabled = 1;
+}
+
+static void watchdog_enable_all_cpus(void)
+{
+	int cpu;
+	int result;
+
+	for_each_online_cpu(cpu)
+		result += watchdog_enable(cpu);
+
+	if (result)
+		printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
+
+}
+
+static void watchdog_disable_all_cpus(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		watchdog_disable(cpu);
+
+	/* if all watchdogs are disabled, then they are disabled for the system */
+	watchdog_enabled = 0;
+}
+
+
+/* sysctl functions */
+#ifdef CONFIG_SYSCTL
+/*
+ * proc handler for /proc/sys/kernel/nmi_watchdog
+ */
+
+int proc_dowatchdog_enabled(struct ctl_table *table, int write,
+		     void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec(table, write, buffer, length, ppos);
+
+	if (watchdog_enabled)
+		watchdog_enable_all_cpus();
+	else
+		watchdog_disable_all_cpus();
+	return 0;
+}
+
+int proc_dowatchdog_thresh(struct ctl_table *table, int write,
+			     void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+/* stub functions */
+int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+			     void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos);
+}
+/* end of stub functions */
+#endif /* CONFIG_SYSCTL */
+
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __cpuinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		if (watchdog_prepare_cpu(hotcpu))
+			return NOTIFY_BAD;
+		break;
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		if (watchdog_enable(hotcpu))
+			return NOTIFY_BAD;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		watchdog_disable(hotcpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		watchdog_disable(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+static int __init spawn_watchdog_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	int err;
+
+	if (no_watchdog)
+		return 0;
+
+	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	WARN_ON(err == NOTIFY_BAD);
+
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+	return 0;
+}
+early_initcall(spawn_watchdog_task);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 220ae6063b6f..49e285dcaf57 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -153,7 +153,7 @@ config DEBUG_SHIRQ
 	  points; some don't and need to be caught.
 
 config DETECT_SOFTLOCKUP
-	bool "Detect Soft Lockups"
+	bool
 	depends on DEBUG_KERNEL && !S390
 	default y
 	help
@@ -171,17 +171,27 @@ config DETECT_SOFTLOCKUP
 	   can be detected via the NMI-watchdog, on platforms that
 	   support it.)
 
-config NMI_WATCHDOG
-	bool "Detect Hard Lockups with an NMI Watchdog"
-	depends on DEBUG_KERNEL && PERF_EVENTS && PERF_EVENTS_NMI
+config LOCKUP_DETECTOR
+	bool "Detect Hard and Soft Lockups"
+	depends on DEBUG_KERNEL
+	default DETECT_SOFTLOCKUP
 	help
-	  Say Y here to enable the kernel to use the NMI as a watchdog
-	  to detect hard lockups.  This is useful when a cpu hangs for no
-	  reason but can still respond to NMIs.  A backtrace is displayed
-	  for reviewing and reporting.
+	  Say Y here to enable the kernel to act as a watchdog to detect
+	  hard and soft lockups.
+
+	  Softlockups are bugs that cause the kernel to loop in kernel
+	  mode for more than 60 seconds, without giving other tasks a
+	  chance to run.  The current stack trace is displayed upon
+	  detection and the system will stay locked up.
+
+	  Hardlockups are bugs that cause the CPU to loop in kernel mode
+	  for more than 60 seconds, without letting other interrupts have a
+	  chance to run.  The current stack trace is displayed upon detection
+	  and the system will stay locked up.
 
-	  The overhead should be minimal, just an extra NMI every few
-	  seconds.
+	  The overhead should be minimal.  A periodic hrtimer runs to
+	  generate interrupts and kick the watchdog task every 10-12 seconds.
+	  An NMI is generated every 60 seconds or so to check for hardlockups.
 
 config BOOTPARAM_SOFTLOCKUP_PANIC
 	bool "Panic (Reboot) On Soft Lockups"
-- 
cgit v1.2.3-59-g8ed1b


From 332fbdbca3f7716c5620970755ae054d213bcc4e Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 7 May 2010 17:11:45 -0400
Subject: lockup_detector: Touch_softlockup cleanups and softlockup_tick
 removal

Just some code cleanup to make touch_softlockup clearer and remove the
softlockup_tick function as it is no longer needed.

Also remove the /proc softlockup_thres call as it has been changed to
watchdog_thres.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
LKML-Reference: <1273266711-18706-3-git-send-email-dzickus@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/sched.h | 16 +++-------------
 kernel/sysctl.c       |  9 ---------
 kernel/timer.c        |  1 -
 kernel/watchdog.c     | 35 +++--------------------------------
 4 files changed, 6 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 37efe8fa5306..33f9b2ad0bbb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -312,19 +312,15 @@ extern void scheduler_tick(void);
 extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
-extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
-extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
-				    void __user *buffer,
-				    size_t *lenp, loff_t *ppos);
+extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
+				  void __user *buffer,
+				  size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
 extern int softlockup_thresh;
 #else
-static inline void softlockup_tick(void)
-{
-}
 static inline void touch_softlockup_watchdog(void)
 {
 }
@@ -346,12 +342,6 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 size_t *lenp, loff_t *ppos);
 #endif
 
-#ifdef CONFIG_LOCKUP_DETECTOR
-extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
-				  void __user *buffer,
-				  size_t *lenp, loff_t *ppos);
-#endif
-
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f9adda85f97..999bc3fccf47 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -817,15 +817,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-	{
-		.procname	= "softlockup_thresh",
-		.data		= &softlockup_thresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dosoftlockup_thresh,
-		.extra1		= &neg_one,
-		.extra2		= &sixty,
-	},
 #endif
 #ifdef CONFIG_DETECT_HUNG_TASK
 	{
diff --git a/kernel/timer.c b/kernel/timer.c
index aeb6a54f2771..e8de5eb07a02 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1225,7 +1225,6 @@ void run_local_timers(void)
 {
 	hrtimer_run_queues();
 	raise_softirq(TIMER_SOFTIRQ);
-	softlockup_tick();
 }
 
 /*
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6b7fad8497af..f1541b7e3244 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -119,13 +119,12 @@ static void __touch_watchdog(void)
 	__get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
 }
 
-void touch_watchdog(void)
+void touch_softlockup_watchdog(void)
 {
 	__get_cpu_var(watchdog_touch_ts) = 0;
 }
-EXPORT_SYMBOL(touch_watchdog);
 
-void touch_all_watchdog(void)
+void touch_all_softlockup_watchdogs(void)
 {
 	int cpu;
 
@@ -140,35 +139,16 @@ void touch_all_watchdog(void)
 
 void touch_nmi_watchdog(void)
 {
-	touch_watchdog();
+	touch_softlockup_watchdog();
 }
 EXPORT_SYMBOL(touch_nmi_watchdog);
 
-void touch_all_nmi_watchdog(void)
-{
-	touch_all_watchdog();
-}
-
-void touch_softlockup_watchdog(void)
-{
-	touch_watchdog();
-}
-
-void touch_all_softlockup_watchdogs(void)
-{
-	touch_all_watchdog();
-}
-
 void touch_softlockup_watchdog_sync(void)
 {
 	__raw_get_cpu_var(softlockup_touch_sync) = true;
 	__raw_get_cpu_var(watchdog_touch_ts) = 0;
 }
 
-void softlockup_tick(void)
-{
-}
-
 #ifdef CONFIG_PERF_EVENTS_NMI
 /* watchdog detector functions */
 static int is_hardlockup(int cpu)
@@ -522,15 +502,6 @@ int proc_dowatchdog_thresh(struct ctl_table *table, int write,
 {
 	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 }
-
-/* stub functions */
-int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
-			     void __user *buffer,
-			     size_t *lenp, loff_t *ppos)
-{
-	return proc_dowatchdog_thresh(table, write, buffer, lenp, ppos);
-}
-/* end of stub functions */
 #endif /* CONFIG_SYSCTL */
 
 
-- 
cgit v1.2.3-59-g8ed1b


From 19cc36c0f0457e5c6629ec24036fbbe8255c88ec Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 13 May 2010 02:30:49 +0200
Subject: lockup_detector: Fix forgotten config conversion

Fix forgotten CONFIG_DETECT_SOFTLOCKUP -> CONFIG_LOCKUP_DETECTOR
in sched.h

Fixes:
	arch/x86/built-in.o: In function `touch_nmi_watchdog':
	(.text+0x1bd59): undefined reference to `touch_softlockup_watchdog'
	kernel/built-in.o: In function `show_state_filter':
	(.text+0x10d01): undefined reference to `touch_all_softlockup_watchdogs'
	kernel/built-in.o: In function `sched_clock_idle_wakeup_event':
	(.text+0x362f9): undefined reference to `touch_softlockup_watchdog'
	kernel/built-in.o: In function `timekeeping_resume':
	timekeeping.c:(.text+0x38757): undefined reference to `touch_softlockup_watchdog'
	kernel/built-in.o: In function `tick_nohz_handler':
	tick-sched.c:(.text+0x3e5b9): undefined reference to `touch_softlockup_watchdog'
	kernel/built-in.o: In function `tick_sched_timer':
	tick-sched.c:(.text+0x3e671): undefined reference to `touch_softlockup_watchdog'
	kernel/built-in.o: In function `tick_check_idle':
	(.text+0x3e90b): undefined reference to `touch_softlockup_watchdog'

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33f9b2ad0bbb..3958e0cd24f7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -311,7 +311,7 @@ extern void scheduler_tick(void);
 
 extern void sched_show_task(struct task_struct *p);
 
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_LOCKUP_DETECTOR
 extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
-- 
cgit v1.2.3-59-g8ed1b


From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 14 May 2010 11:11:21 -0400
Subject: lockup_detector: Cross arch compile fixes

Combining the softlockup and hardlockup code causes watchdog.c
to build even without the hardlockup detection support.

So if an arch, that has the previous and the new nmi watchdog
implementations cohabiting, wants to know if the generic one
is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check.
We need to use CONFIG_HARDLOCKUP_DETECTOR instead.

Fixes:
	kernel/built-in.o: In function `touch_nmi_watchdog':
	(.text+0x449bc): multiple definition of `touch_nmi_watchdog'
	arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
LKML-Reference: <20100514151121.GR15159@redhat.com>
[ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/kernel/apic/Makefile | 4 ++--
 include/linux/nmi.h           | 2 +-
 kernel/watchdog.c             | 7 +++++--
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 52f32e0ea194..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,10 @@
 #
 
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_LOCKUP_DETECTOR),y)
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
 obj-$(CONFIG_X86_LOCAL_APIC)	+= nmi.o
 endif
-obj-$(CONFIG_LOCKUP_DETECTOR)	+= hw_nmi.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR)	+= hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index abd48aacaf79..06aab5eee134 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void);
 extern void acpi_nmi_disable(void);
 extern void acpi_nmi_enable(void);
 #else
-#ifndef CONFIG_LOCKUP_DETECTOR
+#ifndef CONFIG_HARDLOCKUP_DETECTOR
 static inline void touch_nmi_watchdog(void)
 {
 	touch_softlockup_watchdog();
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 83fb63155cbc..e53622c1465e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,13 +31,13 @@ int watchdog_enabled;
 int __read_mostly softlockup_thresh = 60;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
@@ -139,6 +139,7 @@ void touch_all_softlockup_watchdogs(void)
 		per_cpu(watchdog_touch_ts, cpu) = 0;
 }
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
 void touch_nmi_watchdog(void)
 {
 	__get_cpu_var(watchdog_nmi_touch) = true;
@@ -146,6 +147,8 @@ void touch_nmi_watchdog(void)
 }
 EXPORT_SYMBOL(touch_nmi_watchdog);
 
+#endif
+
 void touch_softlockup_watchdog_sync(void)
 {
 	__raw_get_cpu_var(softlockup_touch_sync) = true;
-- 
cgit v1.2.3-59-g8ed1b


From 75b93489b449db4a34f0424c72f51821d985f52f Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Sun, 23 May 2010 16:39:02 +0000
Subject: serial: add a new port type, found on some sh-mobile SoCs

Such ports are found, e.g., on SH7372.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 include/linux/serial_core.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index f10db6e5f3b5..522832023a69 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -186,6 +186,9 @@
 #define PORT_ALTERA_JTAGUART	91
 #define PORT_ALTERA_UART	92
 
+/* SH-SCI */
+#define PORT_SCIFB	93
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-- 
cgit v1.2.3-59-g8ed1b


From 06c4648d46d1b757d6b9591a86810be79818b60c Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Wed, 26 May 2010 00:09:42 +0000
Subject: arp_notify: allow drivers to explicitly request a notification event.

Currently such notifications are only generated when the device comes up or the
address changes. However one use case for these notifications is to enable
faster network recovery after a virtual machine migration (by causing switches
to relearn their MAC tables). A migration appears to the network stack as a
temporary loss of carrier and therefore does not trigger either of the current
conditions. Rather than adding carrier up as a trigger (which can cause issues
when interfaces a flapping) simply add an interface which the driver can use
to explicitly trigger the notification.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Stephen Hemminger <shemminger@linux-foundation.org>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Cc: stable@kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 ++
 include/linux/notifier.h  |  1 +
 net/ipv4/devinet.c        |  1 +
 net/sched/sch_generic.c   | 18 ++++++++++++++++++
 4 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 40291f375024..a24916156f4e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1772,6 +1772,8 @@ extern void netif_carrier_on(struct net_device *dev);
 
 extern void netif_carrier_off(struct net_device *dev);
 
+extern void netif_notify_peers(struct net_device *dev);
+
 /**
  *	netif_dormant_on - mark device as dormant.
  *	@dev: network device
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 540703b555cb..22c2abb61974 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -210,6 +210,7 @@ static inline int notifier_to_errno(int ret)
 #define NETDEV_POST_INIT	0x0010
 #define NETDEV_UNREGISTER_BATCH 0x0011
 #define NETDEV_BONDING_DESLAVE  0x0012
+#define NETDEV_NOTIFY_PEERS	0x0012
 
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 382bc768ed56..da14c49284f4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1081,6 +1081,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		}
 		ip_mc_up(in_dev);
 		/* fall through */
+	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_CHANGEADDR:
 		/* Send gratuitous ARP to notify of link change */
 		if (IN_DEV_ARP_NOTIFY(in_dev)) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a63029ef3edd..bd1892fe4b21 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -327,6 +327,24 @@ void netif_carrier_off(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_carrier_off);
 
+/**
+ * 	netif_notify_peers - notify network peers about existence of @dev
+ * 	@dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netif_notify_peers(struct net_device *dev)
+{
+	rtnl_lock();
+	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL(netif_notify_peers);
+
 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
    under all circumstances. It is difficult to invent anything faster or
    cheaper.
-- 
cgit v1.2.3-59-g8ed1b


From 38117d1495e587fbb10d6e55733139a27893cef5 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 31 May 2010 00:28:35 -0700
Subject: net: Fix NETDEV_NOTIFY_PEERS to not conflict with
 NETDEV_BONDING_DESLAVE.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/notifier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 22c2abb61974..b2f1a4d83550 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -210,7 +210,7 @@ static inline int notifier_to_errno(int ret)
 #define NETDEV_POST_INIT	0x0010
 #define NETDEV_UNREGISTER_BATCH 0x0011
 #define NETDEV_BONDING_DESLAVE  0x0012
-#define NETDEV_NOTIFY_PEERS	0x0012
+#define NETDEV_NOTIFY_PEERS	0x0013
 
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN
-- 
cgit v1.2.3-59-g8ed1b


From c2d9ba9bce8d7323ca96f239e1f505c14d6244fb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 1 Jun 2010 06:51:19 +0000
Subject: net: CONFIG_NET_NS reduction

Use read_pnet() and write_pnet() to reduce number of ifdef CONFIG_NET_NS

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h            |  6 +-----
 include/net/cfg80211.h               | 15 ++-------------
 include/net/genetlink.h              | 15 ++-------------
 include/net/netfilter/nf_conntrack.h |  6 +-----
 include/net/sock.h                   | 10 ++--------
 net/ipv6/addrlabel.c                 |  6 +-----
 net/netfilter/nf_conntrack_core.c    |  8 ++------
 7 files changed, 11 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a24916156f4e..bd6b75317d5f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1087,11 +1087,7 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 static inline
 struct net *dev_net(const struct net_device *dev)
 {
-#ifdef CONFIG_NET_NS
-	return dev->nd_net;
-#else
-	return &init_net;
-#endif
+	return read_pnet(&dev->nd_net);
 }
 
 static inline
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index b44a2e5321a3..e7ebeb8bdf71 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1330,26 +1330,15 @@ struct wiphy {
 	char priv[0] __attribute__((__aligned__(NETDEV_ALIGN)));
 };
 
-#ifdef CONFIG_NET_NS
-static inline struct net *wiphy_net(struct wiphy *wiphy)
-{
-	return wiphy->_net;
-}
-
-static inline void wiphy_net_set(struct wiphy *wiphy, struct net *net)
-{
-	wiphy->_net = net;
-}
-#else
 static inline struct net *wiphy_net(struct wiphy *wiphy)
 {
-	return &init_net;
+	return read_pnet(&wiphy->_net);
 }
 
 static inline void wiphy_net_set(struct wiphy *wiphy, struct net *net)
 {
+	write_pnet(&wiphy->_net, net);
 }
-#endif
 
 /**
  * wiphy_priv - return priv from wiphy
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index eb551baafc04..f7dcd2c70412 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -68,26 +68,15 @@ struct genl_info {
 #endif
 };
 
-#ifdef CONFIG_NET_NS
 static inline struct net *genl_info_net(struct genl_info *info)
 {
-	return info->_net;
+	return read_pnet(&info->_net);
 }
 
 static inline void genl_info_net_set(struct genl_info *info, struct net *net)
 {
-	info->_net = net;
+	write_pnet(&info->_net, net);
 }
-#else
-static inline struct net *genl_info_net(struct genl_info *info)
-{
-	return &init_net;
-}
-
-static inline void genl_info_net_set(struct genl_info *info, struct net *net)
-{
-}
-#endif
 
 /**
  * struct genl_ops - generic netlink operations
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index bde095f7e845..bbfdd9453087 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -152,11 +152,7 @@ extern struct net init_net;
 
 static inline struct net *nf_ct_net(const struct nf_conn *ct)
 {
-#ifdef CONFIG_NET_NS
-	return ct->ct_net;
-#else
-	return &init_net;
-#endif
+	return read_pnet(&ct->ct_net);
 }
 
 /* Alter reply tuple (maybe alter helper). */
diff --git a/include/net/sock.h b/include/net/sock.h
index ca241ea14875..3461e5d1e9ad 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1724,19 +1724,13 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_e
 static inline
 struct net *sock_net(const struct sock *sk)
 {
-#ifdef CONFIG_NET_NS
-	return sk->sk_net;
-#else
-	return &init_net;
-#endif
+	return read_pnet(&sk->sk_net);
 }
 
 static inline
 void sock_net_set(struct sock *sk, struct net *net)
 {
-#ifdef CONFIG_NET_NS
-	sk->sk_net = net;
-#endif
+	write_pnet(&sk->sk_net, net);
 }
 
 /*
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 8c4348cb1950..f0e774cea386 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -53,11 +53,7 @@ static struct ip6addrlbl_table
 static inline
 struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl)
 {
-#ifdef CONFIG_NET_NS
-	return lbl->lbl_net;
-#else
-	return &init_net;
-#endif
+	return read_pnet(&lbl->lbl_net);
 }
 
 /*
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index eeeb8bc73982..77288980fae0 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -619,9 +619,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 	ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
 	/* Don't set timer yet: wait for confirmation */
 	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
-#ifdef CONFIG_NET_NS
-	ct->ct_net = net;
-#endif
+	write_pnet(&ct->ct_net, net);
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	if (zone) {
 		struct nf_conntrack_zone *nf_ct_zone;
@@ -1363,9 +1361,7 @@ static int nf_conntrack_init_init_net(void)
 		goto err_extend;
 #endif
 	/* Set up fake conntrack: to never be deleted, not in any hashes */
-#ifdef CONFIG_NET_NS
-	nf_conntrack_untracked.ct_net = &init_net;
-#endif
+	write_pnet(&nf_conntrack_untracked.ct_net, &init_net);
 	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
 	/*  - and look it like as a confirmed connection */
 	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
-- 
cgit v1.2.3-59-g8ed1b


From 614f60fa9d73a9e8fdff3df83381907fea7c5649 Mon Sep 17 00:00:00 2001
From: Scott McMillan <scott.a.mcmillan@intel.com>
Date: Wed, 2 Jun 2010 05:53:56 -0700
Subject: packet_mmap: expose hw packet timestamps to network packet capture
 utilities

This patch adds a setting, PACKET_TIMESTAMP, to specify the packet
timestamp source that is exported to capture utilities like tcpdump by
packet_mmap.

PACKET_TIMESTAMP accepts the same integer bit field as
SO_TIMESTAMPING.  However, only the SOF_TIMESTAMPING_SYS_HARDWARE and
SOF_TIMESTAMPING_RAW_HARDWARE values are currently recognized by
PACKET_TIMESTAMP.  SOF_TIMESTAMPING_SYS_HARDWARE takes precedence over
SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set.

If PACKET_TIMESTAMP is not set, a software timestamp generated inside
the networking stack is used (the behavior before this setting was
added).

Signed-off-by: Scott McMillan <scott.a.mcmillan@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/packet_mmap.txt | 26 ++++++++++++++++++++++
 include/linux/if_packet.h                |  1 +
 net/packet/af_packet.c                   | 37 ++++++++++++++++++++++++++++++--
 3 files changed, 62 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 98f71a5cef00..2546aa4dc232 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -493,6 +493,32 @@ The user can also use poll() to check if a buffer is available:
     pfd.events = POLLOUT;
     retval = poll(&pfd, 1, timeout);
 
+-------------------------------------------------------------------------------
++ PACKET_TIMESTAMP
+-------------------------------------------------------------------------------
+
+The PACKET_TIMESTAMP setting determines the source of the timestamp in
+the packet meta information.  If your NIC is capable of timestamping
+packets in hardware, you can request those hardware timestamps to used.
+Note: you may need to enable the generation of hardware timestamps with
+SIOCSHWTSTAMP.
+
+PACKET_TIMESTAMP accepts the same integer bit field as
+SO_TIMESTAMPING.  However, only the SOF_TIMESTAMPING_SYS_HARDWARE
+and SOF_TIMESTAMPING_RAW_HARDWARE values are recognized by
+PACKET_TIMESTAMP.  SOF_TIMESTAMPING_SYS_HARDWARE takes precedence over
+SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set.
+
+    int req = 0;
+    req |= SOF_TIMESTAMPING_SYS_HARDWARE;
+    setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req))
+
+If PACKET_TIMESTAMP is not set, a software timestamp generated inside
+the networking stack is used (the behavior before this setting was added).
+
+See include/linux/net_tstamp.h and Documentation/networking/timestamping
+for more information on hardware timestamps.
+
 --------------------------------------------------------------------------------
 + THANKS
 --------------------------------------------------------------------------------
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 6ac23ef1801a..72bfa5a034dd 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -48,6 +48,7 @@ struct sockaddr_ll {
 #define PACKET_LOSS			14
 #define PACKET_VNET_HDR			15
 #define PACKET_TX_TIMESTAMP		16
+#define PACKET_TIMESTAMP		17
 
 struct tpacket_stats {
 	unsigned int	tp_packets;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2078a277e06b..9a17f28b1253 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -83,6 +83,7 @@
 #include <linux/if_vlan.h>
 #include <linux/virtio_net.h>
 #include <linux/errqueue.h>
+#include <linux/net_tstamp.h>
 
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
@@ -202,6 +203,7 @@ struct packet_sock {
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
 	unsigned int		tp_loss:1;
+	unsigned int		tp_tstamp;
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 };
 
@@ -656,6 +658,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct sk_buff *copy_skb = NULL;
 	struct timeval tv;
 	struct timespec ts;
+	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
 
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
@@ -737,7 +740,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h1->tp_snaplen = snaplen;
 		h.h1->tp_mac = macoff;
 		h.h1->tp_net = netoff;
-		if (skb->tstamp.tv64)
+		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+				&& shhwtstamps->syststamp.tv64)
+			tv = ktime_to_timeval(shhwtstamps->syststamp);
+		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+				&& shhwtstamps->hwtstamp.tv64)
+			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
+		else if (skb->tstamp.tv64)
 			tv = ktime_to_timeval(skb->tstamp);
 		else
 			do_gettimeofday(&tv);
@@ -750,7 +759,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h2->tp_snaplen = snaplen;
 		h.h2->tp_mac = macoff;
 		h.h2->tp_net = netoff;
-		if (skb->tstamp.tv64)
+		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+				&& shhwtstamps->syststamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->syststamp);
+		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+				&& shhwtstamps->hwtstamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
+		else if (skb->tstamp.tv64)
 			ts = ktime_to_timespec(skb->tstamp);
 		else
 			getnstimeofday(&ts);
@@ -2027,6 +2042,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->has_vnet_hdr = !!val;
 		return 0;
 	}
+	case PACKET_TIMESTAMP:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->tp_tstamp = val;
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2119,6 +2146,12 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = po->tp_loss;
 		data = &val;
 		break;
+	case PACKET_TIMESTAMP:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = po->tp_tstamp;
+		data = &val;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From ab95bfe01f9872459c8678572ccadbf646badad0 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 1 Jun 2010 21:52:08 +0000
Subject: net: replace hooks in __netif_receive_skb V5

What this patch does is it removes two receive frame hooks (for bridge and for
macvlan) from __netif_receive_skb. These are replaced them with a single
hook for both. It only supports one hook per device because it makes no
sense to do bridging and macvlan on the same device.

Then a network driver (of virtual netdev like macvlan or bridge) can register
an rx_handler for needed net device.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c      |  19 +++++---
 include/linux/if_bridge.h  |   2 -
 include/linux/if_macvlan.h |   4 --
 include/linux/netdevice.h  |   7 +++
 net/bridge/br.c            |   2 -
 net/bridge/br_if.c         |   8 +++
 net/bridge/br_input.c      |  12 +++--
 net/bridge/br_private.h    |   3 +-
 net/core/dev.c             | 119 +++++++++++++++++++++------------------------
 9 files changed, 93 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 87e8d4cb4057..53422ce26f7f 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -145,15 +145,16 @@ static void macvlan_broadcast(struct sk_buff *skb,
 }
 
 /* called under rcu_read_lock() from netif_receive_skb */
-static struct sk_buff *macvlan_handle_frame(struct macvlan_port *port,
-					    struct sk_buff *skb)
+static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
 {
+	struct macvlan_port *port;
 	const struct ethhdr *eth = eth_hdr(skb);
 	const struct macvlan_dev *vlan;
 	const struct macvlan_dev *src;
 	struct net_device *dev;
 	unsigned int len;
 
+	port = rcu_dereference(skb->dev->macvlan_port);
 	if (is_multicast_ether_addr(eth->h_dest)) {
 		src = macvlan_hash_lookup(port, eth->h_source);
 		if (!src)
@@ -515,6 +516,7 @@ static int macvlan_port_create(struct net_device *dev)
 {
 	struct macvlan_port *port;
 	unsigned int i;
+	int err;
 
 	if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
 		return -EINVAL;
@@ -528,13 +530,21 @@ static int macvlan_port_create(struct net_device *dev)
 	for (i = 0; i < MACVLAN_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(&port->vlan_hash[i]);
 	rcu_assign_pointer(dev->macvlan_port, port);
-	return 0;
+
+	err = netdev_rx_handler_register(dev, macvlan_handle_frame);
+	if (err) {
+		rcu_assign_pointer(dev->macvlan_port, NULL);
+		kfree(port);
+	}
+
+	return err;
 }
 
 static void macvlan_port_destroy(struct net_device *dev)
 {
 	struct macvlan_port *port = dev->macvlan_port;
 
+	netdev_rx_handler_unregister(dev);
 	rcu_assign_pointer(dev->macvlan_port, NULL);
 	synchronize_rcu();
 	kfree(port);
@@ -767,14 +777,12 @@ static int __init macvlan_init_module(void)
 	int err;
 
 	register_netdevice_notifier(&macvlan_notifier_block);
-	macvlan_handle_frame_hook = macvlan_handle_frame;
 
 	err = macvlan_link_register(&macvlan_link_ops);
 	if (err < 0)
 		goto err1;
 	return 0;
 err1:
-	macvlan_handle_frame_hook = NULL;
 	unregister_netdevice_notifier(&macvlan_notifier_block);
 	return err;
 }
@@ -782,7 +790,6 @@ err1:
 static void __exit macvlan_cleanup_module(void)
 {
 	rtnl_link_unregister(&macvlan_link_ops);
-	macvlan_handle_frame_hook = NULL;
 	unregister_netdevice_notifier(&macvlan_notifier_block);
 }
 
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 938b7e81df95..0d241a5c4909 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -102,8 +102,6 @@ struct __fdb_entry {
 #include <linux/netdevice.h>
 
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
-extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
-					       struct sk_buff *skb);
 extern int (*br_should_route_hook)(struct sk_buff *skb);
 
 #endif
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 9ea047aca795..c26a0e4f0ce8 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -84,8 +84,4 @@ extern int macvlan_link_register(struct rtnl_link_ops *ops);
 extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 				      struct net_device *dev);
 
-
-extern struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *,
-						    struct sk_buff *);
-
 #endif /* _LINUX_IF_MACVLAN_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bd6b75317d5f..5156b806924c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -381,6 +381,8 @@ enum gro_result {
 };
 typedef enum gro_result gro_result_t;
 
+typedef struct sk_buff *rx_handler_func_t(struct sk_buff *skb);
+
 extern void __napi_schedule(struct napi_struct *n);
 
 static inline int napi_disable_pending(struct napi_struct *n)
@@ -957,6 +959,7 @@ struct net_device {
 #endif
 
 	struct netdev_queue	rx_queue;
+	rx_handler_func_t	*rx_handler;
 
 	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
 
@@ -1689,6 +1692,10 @@ static inline void napi_free_frags(struct napi_struct *napi)
 	napi->skb = NULL;
 }
 
+extern int netdev_rx_handler_register(struct net_device *dev,
+				      rx_handler_func_t *rx_handler);
+extern void netdev_rx_handler_unregister(struct net_device *dev);
+
 extern void		netif_nit_deliver(struct sk_buff *skb);
 extern int		dev_valid_name(const char *name);
 extern int		dev_ioctl(struct net *net, unsigned int cmd, void __user *);
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 76357b547752..c8436fa31344 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -63,7 +63,6 @@ static int __init br_init(void)
 		goto err_out4;
 
 	brioctl_set(br_ioctl_deviceless_stub);
-	br_handle_frame_hook = br_handle_frame;
 
 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
 	br_fdb_test_addr_hook = br_fdb_test_addr;
@@ -100,7 +99,6 @@ static void __exit br_deinit(void)
 	br_fdb_test_addr_hook = NULL;
 #endif
 
-	br_handle_frame_hook = NULL;
 	br_fdb_fini();
 }
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 18b245e2c00e..d9242342837e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -147,6 +147,7 @@ static void del_nbp(struct net_bridge_port *p)
 
 	list_del_rcu(&p->list);
 
+	netdev_rx_handler_unregister(dev);
 	rcu_assign_pointer(dev->br_port, NULL);
 
 	br_multicast_del_port(p);
@@ -429,6 +430,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 		goto err2;
 
 	rcu_assign_pointer(dev->br_port, p);
+
+	err = netdev_rx_handler_register(dev, br_handle_frame);
+	if (err)
+		goto err3;
+
 	dev_disable_lro(dev);
 
 	list_add_rcu(&p->list, &br->port_list);
@@ -451,6 +457,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	br_netpoll_enable(br, dev);
 
 	return 0;
+err3:
+	rcu_assign_pointer(dev->br_port, NULL);
 err2:
 	br_fdb_delete_by_port(br, p, 1);
 err1:
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index d36e700f7a26..99647d8f95c8 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -131,15 +131,19 @@ static inline int is_link_local(const unsigned char *dest)
 }
 
 /*
- * Called via br_handle_frame_hook.
  * Return NULL if skb is handled
- * note: already called with rcu_read_lock (preempt_disabled)
+ * note: already called with rcu_read_lock (preempt_disabled) from
+ * netif_receive_skb
  */
-struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
+struct sk_buff *br_handle_frame(struct sk_buff *skb)
 {
+	struct net_bridge_port *p;
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
 	int (*rhook)(struct sk_buff *skb);
 
+	if (skb->pkt_type == PACKET_LOOPBACK)
+		return skb;
+
 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
 		goto drop;
 
@@ -147,6 +151,8 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
 	if (!skb)
 		return NULL;
 
+	p = rcu_dereference(skb->dev->br_port);
+
 	if (unlikely(is_link_local(dest))) {
 		/* Pause frames shouldn't be passed up by driver anyway */
 		if (skb->protocol == htons(ETH_P_PAUSE))
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0f4a74bc6a9b..c83519b555bb 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -331,8 +331,7 @@ extern void br_features_recompute(struct net_bridge *br);
 
 /* br_input.c */
 extern int br_handle_frame_finish(struct sk_buff *skb);
-extern struct sk_buff *br_handle_frame(struct net_bridge_port *p,
-				       struct sk_buff *skb);
+extern struct sk_buff *br_handle_frame(struct sk_buff *skb);
 
 /* br_ioctl.c */
 extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
diff --git a/net/core/dev.c b/net/core/dev.c
index ffca5c1066fa..ec01a5998d70 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2604,70 +2604,14 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
-#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
-
-#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
+#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
+    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
 /* This hook is defined here for ATM LANE */
 int (*br_fdb_test_addr_hook)(struct net_device *dev,
 			     unsigned char *addr) __read_mostly;
 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 #endif
 
-/*
- * If bridge module is loaded call bridging hook.
- *  returns NULL if packet was consumed.
- */
-struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
-					struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL_GPL(br_handle_frame_hook);
-
-static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
-					    struct packet_type **pt_prev, int *ret,
-					    struct net_device *orig_dev)
-{
-	struct net_bridge_port *port;
-
-	if (skb->pkt_type == PACKET_LOOPBACK ||
-	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
-		return skb;
-
-	if (*pt_prev) {
-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
-		*pt_prev = NULL;
-	}
-
-	return br_handle_frame_hook(port, skb);
-}
-#else
-#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
-#endif
-
-#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
-struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
-					     struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
-
-static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
-					     struct packet_type **pt_prev,
-					     int *ret,
-					     struct net_device *orig_dev)
-{
-	struct macvlan_port *port;
-
-	port = rcu_dereference(skb->dev->macvlan_port);
-	if (!port)
-		return skb;
-
-	if (*pt_prev) {
-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
-		*pt_prev = NULL;
-	}
-	return macvlan_handle_frame_hook(port, skb);
-}
-#else
-#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
-#endif
-
 #ifdef CONFIG_NET_CLS_ACT
 /* TODO: Maybe we should just force sch_ingress to be compiled in
  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
@@ -2763,6 +2707,47 @@ void netif_nit_deliver(struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
+/**
+ *	netdev_rx_handler_register - register receive handler
+ *	@dev: device to register a handler for
+ *	@rx_handler: receive handler to register
+ *
+ *	Register a receive hander for a device. This handler will then be
+ *	called from __netif_receive_skb. A negative errno code is returned
+ *	on a failure.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+			       rx_handler_func_t *rx_handler)
+{
+	ASSERT_RTNL();
+
+	if (dev->rx_handler)
+		return -EBUSY;
+
+	rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
+
+/**
+ *	netdev_rx_handler_unregister - unregister receive handler
+ *	@dev: device to unregister a handler from
+ *
+ *	Unregister a receive hander from a device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+void netdev_rx_handler_unregister(struct net_device *dev)
+{
+
+	ASSERT_RTNL();
+	rcu_assign_pointer(dev->rx_handler, NULL);
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+
 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
 					      struct net_device *master)
 {
@@ -2815,6 +2800,7 @@ EXPORT_SYMBOL(__skb_bond_should_drop);
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
+	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
 	struct net_device *master;
 	struct net_device *null_or_orig;
@@ -2877,12 +2863,17 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
-	if (!skb)
-		goto out;
-	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
-	if (!skb)
-		goto out;
+	/* Handle special case of bridge or macvlan */
+	rx_handler = rcu_dereference(skb->dev->rx_handler);
+	if (rx_handler) {
+		if (pt_prev) {
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = NULL;
+		}
+		skb = rx_handler(skb);
+		if (!skb)
+			goto out;
+	}
 
 	/*
 	 * Make sure frames received on VLAN interfaces stacked on
-- 
cgit v1.2.3-59-g8ed1b


From 77c2061d10a408d0220c2b0e7faefe52d9c41008 Mon Sep 17 00:00:00 2001
From: Walter Goldens <goldenstranger@yahoo.com>
Date: Tue, 18 May 2010 04:44:54 -0700
Subject: wireless: fix several minor description typos

Signed-off-by: Walter Goldens <goldenstranger@yahoo.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/ath5k/eeprom.c   | 2 +-
 drivers/net/wireless/ath/ath9k/hw.h       | 2 +-
 drivers/net/wireless/iwmc3200wifi/hal.c   | 2 +-
 drivers/net/wireless/libertas/scan.c      | 2 +-
 drivers/net/wireless/orinoco/hermes_dld.c | 2 +-
 drivers/net/wireless/rt2x00/rt2x00dev.c   | 2 +-
 drivers/net/wireless/zd1211rw/zd_mac.c    | 2 +-
 drivers/net/wireless/zd1211rw/zd_usb.c    | 2 +-
 include/linux/nl80211.h                   | 2 +-
 include/net/cfg80211.h                    | 2 +-
 net/mac80211/mlme.c                       | 2 +-
 net/mac80211/status.c                     | 2 +-
 net/mac80211/work.c                       | 2 +-
 13 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/ath5k/eeprom.c b/drivers/net/wireless/ath/ath5k/eeprom.c
index ed0263672d6d..8490348379ae 100644
--- a/drivers/net/wireless/ath/ath5k/eeprom.c
+++ b/drivers/net/wireless/ath/ath5k/eeprom.c
@@ -715,7 +715,7 @@ ath5k_eeprom_convert_pcal_info_5111(struct ath5k_hw *ah, int mode,
 
 		/* Only one curve for RF5111
 		 * find out which one and place
-		 * in in pd_curves.
+		 * in pd_curves.
 		 * Note: ee_x_gain is reversed here */
 		for (idx = 0; idx < AR5K_EEPROM_N_PD_CURVES; idx++) {
 
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index 5cf0714f069c..ffc9249b02c5 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -461,7 +461,7 @@ struct ath9k_hw_version {
 #define AR_GENTMR_BIT(_index)	(1 << (_index))
 
 /*
- * Using de Bruijin sequence to to look up 1's index in a 32 bit number
+ * Using de Bruijin sequence to look up 1's index in a 32 bit number
  * debruijn32 = 0000 0111 0111 1100 1011 0101 0011 0001
  */
 #define debruijn32 0x077CB531U
diff --git a/drivers/net/wireless/iwmc3200wifi/hal.c b/drivers/net/wireless/iwmc3200wifi/hal.c
index 9531b18cf72a..907ac890997c 100644
--- a/drivers/net/wireless/iwmc3200wifi/hal.c
+++ b/drivers/net/wireless/iwmc3200wifi/hal.c
@@ -54,7 +54,7 @@
  *   LMAC. If you look at LMAC commands you'll se that they
  *   are actually regular iwlwifi target commands encapsulated
  *   into a special UMAC command called UMAC passthrough.
- *   This is due to the fact the the host talks exclusively
+ *   This is due to the fact the host talks exclusively
  *   to the UMAC and so there needs to be a special UMAC
  *   command for talking to the LMAC.
  *   This is how a wifi command is layed out:
diff --git a/drivers/net/wireless/libertas/scan.c b/drivers/net/wireless/libertas/scan.c
index 24cd54b3a806..7d82f13bdf1d 100644
--- a/drivers/net/wireless/libertas/scan.c
+++ b/drivers/net/wireless/libertas/scan.c
@@ -666,7 +666,7 @@ void lbs_scan_worker(struct work_struct *work)
 /**
  *  @brief Interpret a BSS scan response returned from the firmware
  *
- *  Parse the various fixed fields and IEs passed back for a a BSS probe
+ *  Parse the various fixed fields and IEs passed back for a BSS probe
  *  response or beacon from the scan command.  Record information as needed
  *  in the scan table struct bss_descriptor for that entry.
  *
diff --git a/drivers/net/wireless/orinoco/hermes_dld.c b/drivers/net/wireless/orinoco/hermes_dld.c
index 6da85e75fce0..f750f49bbd4e 100644
--- a/drivers/net/wireless/orinoco/hermes_dld.c
+++ b/drivers/net/wireless/orinoco/hermes_dld.c
@@ -68,7 +68,7 @@ struct dblock {
 } __attribute__ ((packed));
 
 /*
- * Plug Data References are located in in the image after the last data
+ * Plug Data References are located in the image after the last data
  * block.  They refer to areas in the adapter memory where the plug data
  * items with matching ID should be written.
  */
diff --git a/drivers/net/wireless/rt2x00/rt2x00dev.c b/drivers/net/wireless/rt2x00/rt2x00dev.c
index 3ae468c4d760..2ed32e02a06f 100644
--- a/drivers/net/wireless/rt2x00/rt2x00dev.c
+++ b/drivers/net/wireless/rt2x00/rt2x00dev.c
@@ -224,7 +224,7 @@ void rt2x00lib_txdone(struct queue_entry *entry,
 	/*
 	 * If the IV/EIV data was stripped from the frame before it was
 	 * passed to the hardware, we should now reinsert it again because
-	 * mac80211 will expect the the same data to be present it the
+	 * mac80211 will expect the same data to be present it the
 	 * frame as it was passed to us.
 	 */
 	if (test_bit(CONFIG_SUPPORT_HW_CRYPTO, &rt2x00dev->flags))
diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index b0b666019a93..163a8a06b22d 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -855,7 +855,7 @@ int zd_mac_rx(struct ieee80211_hw *hw, const u8 *buffer, unsigned int length)
 	if (skb == NULL)
 		return -ENOMEM;
 	if (need_padding) {
-		/* Make sure the the payload data is 4 byte aligned. */
+		/* Make sure the payload data is 4 byte aligned. */
 		skb_reserve(skb, 2);
 	}
 
diff --git a/drivers/net/wireless/zd1211rw/zd_usb.c b/drivers/net/wireless/zd1211rw/zd_usb.c
index c257940b71b6..818e1480ca93 100644
--- a/drivers/net/wireless/zd1211rw/zd_usb.c
+++ b/drivers/net/wireless/zd1211rw/zd_usb.c
@@ -844,7 +844,7 @@ out:
  * @usb: a &struct zd_usb pointer
  * @urb: URB to be freed
  *
- * Frees the the transmission URB, which means to put it on the free URB
+ * Frees the transmission URB, which means to put it on the free URB
  * list.
  */
 static void free_tx_urb(struct zd_usb *usb, struct urb *urb)
diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index b7c77f9712f4..64fb32b93a28 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -132,7 +132,7 @@
  * 	%NL80211_ATTR_REG_RULE_POWER_MAX_ANT_GAIN and
  * 	%NL80211_ATTR_REG_RULE_POWER_MAX_EIRP.
  * @NL80211_CMD_REQ_SET_REG: ask the wireless core to set the regulatory domain
- * 	to the the specified ISO/IEC 3166-1 alpha2 country code. The core will
+ * 	to the specified ISO/IEC 3166-1 alpha2 country code. The core will
  * 	store this as a valid request and then query userspace for it.
  *
  * @NL80211_CMD_GET_MESH_PARAMS: Get mesh networking properties for the
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index b44a2e5321a3..049e507d2f82 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -89,7 +89,7 @@ enum ieee80211_channel_flags {
  * @max_power: maximum transmission power (in dBm)
  * @beacon_found: helper to regulatory code to indicate when a beacon
  *	has been found on this channel. Use regulatory_hint_found_beacon()
- *	to enable this, this is is useful only on 5 GHz band.
+ *	to enable this, this is useful only on 5 GHz band.
  * @orig_mag: internal use
  * @orig_mpwr: internal use
  */
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0839c4e8fd2e..31e3386b8d43 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1763,7 +1763,7 @@ static void ieee80211_sta_work(struct work_struct *work)
 
 	/*
 	 * ieee80211_queue_work() should have picked up most cases,
-	 * here we'll pick the the rest.
+	 * here we'll pick the rest.
 	 */
 	if (WARN(local->suspended, "STA MLME work scheduled while "
 		 "going to suspend\n"))
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 94613af009f3..34da67995d94 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -47,7 +47,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 	/*
 	 * This skb 'survived' a round-trip through the driver, and
 	 * hopefully the driver didn't mangle it too badly. However,
-	 * we can definitely not rely on the the control information
+	 * we can definitely not rely on the control information
 	 * being correct. Clear it so we don't get junk there, and
 	 * indicate that it needs new processing, but must not be
 	 * modified/encrypted again.
diff --git a/net/mac80211/work.c b/net/mac80211/work.c
index be3d4a698692..4157717ed786 100644
--- a/net/mac80211/work.c
+++ b/net/mac80211/work.c
@@ -840,7 +840,7 @@ static void ieee80211_work_work(struct work_struct *work)
 
 	/*
 	 * ieee80211_queue_work() should have picked up most cases,
-	 * here we'll pick the the rest.
+	 * here we'll pick the rest.
 	 */
 	if (WARN(local->suspended, "work scheduled while going to suspend\n"))
 		return;
-- 
cgit v1.2.3-59-g8ed1b


From b5f7e7554753e2cc3ef3bef0271fdb32027df2ba Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 2 Jun 2010 12:05:27 +0000
Subject: ipv4: add LINUX_MIB_IPRPFILTER snmp counter

Christoph Lameter mentioned that packets could be dropped in input path
because of rp_filter settings, without any SNMP counter being
incremented. System administrator can have a hard time to track the
problem.

This patch introduces a new counter, LINUX_MIB_IPRPFILTER, incremented
each time we drop a packet because Reverse Path Filter triggers.

(We receive an IPv4 datagram on a given interface, and find the route to
send an answer would use another interface)

netstat -s | grep IPReversePathFilter
    IPReversePathFilter: 21714

Reported-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h    |  1 +
 net/ipv4/fib_frontend.c |  6 ++++--
 net/ipv4/ip_input.c     |  3 +++
 net/ipv4/proc.c         |  1 +
 net/ipv4/route.c        | 31 ++++++++++++++++++-------------
 5 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 52797714ade7..ebb0c80ffd6e 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -229,6 +229,7 @@ enum
 	LINUX_MIB_TCPBACKLOGDROP,
 	LINUX_MIB_TCPMINTTLDROP, /* RFC 5082 */
 	LINUX_MIB_TCPDEFERACCEPTDROP,
+	LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4f0ed458c883..e830f7a123bd 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -284,7 +284,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 	if (no_addr)
 		goto last_resort;
 	if (rpf == 1)
-		goto e_inval;
+		goto e_rpf;
 	fl.oif = dev->ifindex;
 
 	ret = 0;
@@ -299,7 +299,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 
 last_resort:
 	if (rpf)
-		goto e_inval;
+		goto e_rpf;
 	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 	*itag = 0;
 	return 0;
@@ -308,6 +308,8 @@ e_inval_res:
 	fib_res_put(&res);
 e_inval:
 	return -EINVAL;
+e_rpf:
+	return -EXDEV;
 }
 
 static inline __be32 sk_extract_addr(struct sockaddr *addr)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d930dc5e4d85..d52c9da644cf 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,6 +340,9 @@ static int ip_rcv_finish(struct sk_buff *skb)
 			else if (err == -ENETUNREACH)
 				IP_INC_STATS_BH(dev_net(skb->dev),
 						IPSTATS_MIB_INNOROUTES);
+			else if (err == -EXDEV)
+				NET_INC_STATS_BH(dev_net(skb->dev),
+						 LINUX_MIB_IPRPFILTER);
 			goto drop;
 		}
 	}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3dc9914c1dce..e320ca6b3ef3 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -252,6 +252,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
 	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8495bceec764..d377b45005fc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1851,6 +1851,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	__be32 spec_dst;
 	struct in_device *in_dev = in_dev_get(dev);
 	u32 itag = 0;
+	int err;
 
 	/* Primary sanity checks. */
 
@@ -1865,10 +1866,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
-	} else if (fib_validate_source(saddr, 0, tos, 0,
-					dev, &spec_dst, &itag, 0) < 0)
-		goto e_inval;
-
+	} else {
+		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+					  &itag, 0);
+		if (err < 0)
+			goto e_err;
+	}
 	rth = dst_alloc(&ipv4_dst_ops);
 	if (!rth)
 		goto e_nobufs;
@@ -1920,8 +1923,10 @@ e_nobufs:
 	return -ENOBUFS;
 
 e_inval:
+	err = -EINVAL;
+e_err:
 	in_dev_put(in_dev);
-	return -EINVAL;
+	return err;
 }
 
 
@@ -1985,7 +1990,6 @@ static int __mkroute_input(struct sk_buff *skb,
 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 					 saddr);
 
-		err = -EINVAL;
 		goto cleanup;
 	}
 
@@ -2157,13 +2161,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto brd_input;
 
 	if (res.type == RTN_LOCAL) {
-		int result;
-		result = fib_validate_source(saddr, daddr, tos,
+		err = fib_validate_source(saddr, daddr, tos,
 					     net->loopback_dev->ifindex,
 					     dev, &spec_dst, &itag, skb->mark);
-		if (result < 0)
-			goto martian_source;
-		if (result)
+		if (err < 0)
+			goto martian_source_keep_err;
+		if (err)
 			flags |= RTCF_DIRECTSRC;
 		spec_dst = daddr;
 		goto local_input;
@@ -2191,7 +2194,7 @@ brd_input:
 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
 					  &itag, skb->mark);
 		if (err < 0)
-			goto martian_source;
+			goto martian_source_keep_err;
 		if (err)
 			flags |= RTCF_DIRECTSRC;
 	}
@@ -2272,8 +2275,10 @@ e_nobufs:
 	goto done;
 
 martian_source:
+	err = -EINVAL;
+martian_source_keep_err:
 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
-	goto e_inval;
+	goto done;
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-- 
cgit v1.2.3-59-g8ed1b


From bc10502dba37d3b210efd9f3867212298f13b78e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 3 Jun 2010 03:21:52 -0700
Subject: net: use __packed annotation

cleanup patch.

Use new __packed annotation in net/ and include/
(except netfilter)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_ether.h   |  2 +-
 include/linux/if_fddi.h    |  8 ++---
 include/linux/if_frad.h    |  2 +-
 include/linux/if_hippi.h   |  8 ++---
 include/linux/if_pppox.h   |  8 ++---
 include/linux/ipv6.h       |  4 +--
 include/linux/isdnif.h     |  2 +-
 include/linux/mISDNif.h    |  2 +-
 include/linux/nbd.h        |  2 +-
 include/linux/ncp.h        | 10 +++---
 include/linux/ncp_fs_sb.h  | 14 ++++----
 include/linux/phonet.h     |  4 +--
 include/linux/rds.h        | 12 +++----
 include/linux/sctp.h       | 80 +++++++++++++++++++++++-----------------------
 include/linux/wlp.h        | 22 ++++++-------
 include/net/dn_dev.h       |  8 ++---
 include/net/dn_nsp.h       | 16 +++++-----
 include/net/ip6_tunnel.h   |  2 +-
 include/net/ipx.h          |  8 ++---
 include/net/mip6.h         |  2 +-
 include/net/ndisc.h        |  2 +-
 include/net/sctp/structs.h |  4 +--
 include/rxrpc/packet.h     |  8 ++---
 net/bluetooth/bnep/bnep.h  |  8 ++---
 net/compat.c               |  6 ++--
 net/iucv/iucv.c            | 14 ++++----
 net/mac80211/cfg.c         |  2 +-
 net/mac80211/ieee80211_i.h |  2 +-
 net/mac80211/rx.c          |  2 +-
 net/sctp/sm_make_chunk.c   |  2 +-
 30 files changed, 133 insertions(+), 133 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index bed7a4682b90..c831467774d0 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -119,7 +119,7 @@ struct ethhdr {
 	unsigned char	h_dest[ETH_ALEN];	/* destination eth addr	*/
 	unsigned char	h_source[ETH_ALEN];	/* source ether addr	*/
 	__be16		h_proto;		/* packet type ID field	*/
-} __attribute__((packed));
+} __packed;
 
 #ifdef __KERNEL__
 #include <linux/skbuff.h>
diff --git a/include/linux/if_fddi.h b/include/linux/if_fddi.h
index 5459c5c09930..9947c39e62f6 100644
--- a/include/linux/if_fddi.h
+++ b/include/linux/if_fddi.h
@@ -67,7 +67,7 @@ struct fddi_8022_1_hdr {
 	__u8	dsap;					/* destination service access point */
 	__u8	ssap;					/* source service access point */
 	__u8	ctrl;					/* control byte #1 */
-} __attribute__ ((packed));
+} __packed;
 
 /* Define 802.2 Type 2 header */
 struct fddi_8022_2_hdr {
@@ -75,7 +75,7 @@ struct fddi_8022_2_hdr {
 	__u8	ssap;					/* source service access point */
 	__u8	ctrl_1;					/* control byte #1 */
 	__u8	ctrl_2;					/* control byte #2 */
-} __attribute__ ((packed));
+} __packed;
 
 /* Define 802.2 SNAP header */
 #define FDDI_K_OUI_LEN	3
@@ -85,7 +85,7 @@ struct fddi_snap_hdr {
 	__u8	ctrl;					/* always 0x03 */
 	__u8	oui[FDDI_K_OUI_LEN];	/* organizational universal id */
 	__be16	ethertype;				/* packet type ID field */
-} __attribute__ ((packed));
+} __packed;
 
 /* Define FDDI LLC frame header */
 struct fddihdr {
@@ -98,7 +98,7 @@ struct fddihdr {
 		struct fddi_8022_2_hdr		llc_8022_2;
 		struct fddi_snap_hdr		llc_snap;
 		} hdr;
-} __attribute__ ((packed));
+} __packed;
 
 #ifdef __KERNEL__
 #include <linux/netdevice.h>
diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h
index 80b3a1056a5f..191ee0869bc1 100644
--- a/include/linux/if_frad.h
+++ b/include/linux/if_frad.h
@@ -135,7 +135,7 @@ struct frhdr
    __be16 PID;
 
 #define IP_NLPID pad 
-} __attribute__((packed));
+} __packed;
 
 /* see RFC 1490 for the definition of the following */
 #define FRAD_I_UI		0x03
diff --git a/include/linux/if_hippi.h b/include/linux/if_hippi.h
index 8d038eb8db5c..5fe5f307c6f5 100644
--- a/include/linux/if_hippi.h
+++ b/include/linux/if_hippi.h
@@ -104,7 +104,7 @@ struct hippi_fp_hdr {
 	__be32		fixed;
 #endif
 	__be32		d2_size;
-} __attribute__ ((packed));
+} __packed;
 
 struct hippi_le_hdr {
 #if defined (__BIG_ENDIAN_BITFIELD)
@@ -129,7 +129,7 @@ struct hippi_le_hdr {
 	__u8		daddr[HIPPI_ALEN];
 	__u16		locally_administered;
 	__u8		saddr[HIPPI_ALEN];
-} __attribute__ ((packed));
+} __packed;
 
 #define HIPPI_OUI_LEN	3
 /*
@@ -142,12 +142,12 @@ struct hippi_snap_hdr {
 	__u8	ctrl;			/* always 0x03 */
 	__u8	oui[HIPPI_OUI_LEN];	/* organizational universal id (zero)*/
 	__be16	ethertype;		/* packet type ID field */
-} __attribute__ ((packed));
+} __packed;
 
 struct hippi_hdr {
 	struct hippi_fp_hdr	fp;
 	struct hippi_le_hdr	le;
 	struct hippi_snap_hdr	snap;
-} __attribute__ ((packed));
+} __packed;
 
 #endif	/* _LINUX_IF_HIPPI_H */
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index a6577af0c4e6..1925e0c3f162 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -59,7 +59,7 @@ struct sockaddr_pppox {
        union{ 
                struct pppoe_addr       pppoe; 
        }sa_addr; 
-}__attribute__ ((packed)); 
+} __packed;
 
 /* The use of the above union isn't viable because the size of this
  * struct must stay fixed over time -- applications use sizeof(struct
@@ -70,7 +70,7 @@ struct sockaddr_pppol2tp {
 	sa_family_t     sa_family;      /* address family, AF_PPPOX */
 	unsigned int    sa_protocol;    /* protocol identifier */
 	struct pppol2tp_addr pppol2tp;
-}__attribute__ ((packed));
+} __packed;
 
 /* The L2TPv3 protocol changes tunnel and session ids from 16 to 32
  * bits. So we need a different sockaddr structure.
@@ -79,7 +79,7 @@ struct sockaddr_pppol2tpv3 {
 	sa_family_t     sa_family;      /* address family, AF_PPPOX */
 	unsigned int    sa_protocol;    /* protocol identifier */
 	struct pppol2tpv3_addr pppol2tp;
-} __attribute__ ((packed));
+} __packed;
 
 /*********************************************************************
  *
@@ -129,7 +129,7 @@ struct pppoe_hdr {
 	__be16 sid;
 	__be16 length;
 	struct pppoe_tag tag[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Length of entire PPPoE + PPP header */
 #define PPPOE_SES_HLEN	8
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 99e1ab7e3eec..940e21595351 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -58,7 +58,7 @@ struct ipv6_opt_hdr {
 	/* 
 	 * TLV encoded option data follows.
 	 */
-} __attribute__ ((packed));	/* required for some archs */
+} __packed;	/* required for some archs */
 
 #define ipv6_destopt_hdr ipv6_opt_hdr
 #define ipv6_hopopt_hdr  ipv6_opt_hdr
@@ -99,7 +99,7 @@ struct ipv6_destopt_hao {
 	__u8			type;
 	__u8			length;
 	struct in6_addr		addr;
-} __attribute__ ((__packed__));
+} __packed;
 
 /*
  *	IPv6 fixed header
diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h
index b9b5a684ed69..b8c23f88dd54 100644
--- a/include/linux/isdnif.h
+++ b/include/linux/isdnif.h
@@ -317,7 +317,7 @@ typedef struct T30_s {
 	__u8 r_scantime;
 	__u8 r_id[FAXIDLEN];
 	__u8 r_code;
-} __attribute__((packed)) T30_s;
+} __packed T30_s;
 
 #define ISDN_TTY_FAX_CONN_IN	0
 #define ISDN_TTY_FAX_CONN_OUT	1
diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h
index 78c3bed1c3f5..b5e7f2202484 100644
--- a/include/linux/mISDNif.h
+++ b/include/linux/mISDNif.h
@@ -251,7 +251,7 @@
 struct mISDNhead {
 	unsigned int	prim;
 	unsigned int	id;
-}  __attribute__((packed));
+}  __packed;
 
 #define MISDN_HEADER_LEN	sizeof(struct mISDNhead)
 #define MAX_DATA_SIZE		2048
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index 155719dab813..bb58854a8061 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -88,7 +88,7 @@ struct nbd_request {
 	char handle[8];
 	__be64 from;
 	__be32 len;
-} __attribute__ ((packed));
+} __packed;
 
 /*
  * This is the reply packet that nbd-server sends back to the client after
diff --git a/include/linux/ncp.h b/include/linux/ncp.h
index 99f0adeeb3f3..3ace8370e61e 100644
--- a/include/linux/ncp.h
+++ b/include/linux/ncp.h
@@ -27,7 +27,7 @@ struct ncp_request_header {
 	__u8 conn_high;
 	__u8 function;
 	__u8 data[0];
-} __attribute__((packed));
+} __packed;
 
 #define NCP_REPLY                (0x3333)
 #define NCP_WATCHDOG		 (0x3E3E)
@@ -42,7 +42,7 @@ struct ncp_reply_header {
 	__u8 completion_code;
 	__u8 connection_state;
 	__u8 data[0];
-} __attribute__((packed));
+} __packed;
 
 #define NCP_VOLNAME_LEN (16)
 #define NCP_NUMBER_OF_VOLUMES (256)
@@ -158,7 +158,7 @@ struct nw_info_struct {
 #ifdef __KERNEL__
 	struct nw_nfs_info nfs;
 #endif
-} __attribute__((packed));
+} __packed;
 
 /* modify mask - use with MODIFY_DOS_INFO structure */
 #define DM_ATTRIBUTES		  (cpu_to_le32(0x02))
@@ -190,12 +190,12 @@ struct nw_modify_dos_info {
 	__u16 inheritanceGrantMask;
 	__u16 inheritanceRevokeMask;
 	__u32 maximumSpace;
-} __attribute__((packed));
+} __packed;
 
 struct nw_search_sequence {
 	__u8 volNumber;
 	__u32 dirBase;
 	__u32 sequence;
-} __attribute__((packed));
+} __packed;
 
 #endif				/* _LINUX_NCP_H */
diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h
index 5ec9ca671687..8da05bc098ca 100644
--- a/include/linux/ncp_fs_sb.h
+++ b/include/linux/ncp_fs_sb.h
@@ -104,13 +104,13 @@ struct ncp_server {
 
 		unsigned int state;		/* STREAM only: receiver state */
 		struct {
-			__u32 magic __attribute__((packed));
-			__u32 len __attribute__((packed));
-			__u16 type __attribute__((packed));
-			__u16 p1 __attribute__((packed));
-			__u16 p2 __attribute__((packed));
-			__u16 p3 __attribute__((packed));
-			__u16 type2 __attribute__((packed));
+			__u32 magic __packed;
+			__u32 len __packed;
+			__u16 type __packed;
+			__u16 p1 __packed;
+			__u16 p2 __packed;
+			__u16 p3 __packed;
+			__u16 type2 __packed;
 		} buf;				/* STREAM only: temporary buffer */
 		unsigned char* ptr;		/* STREAM only: pointer to data */
 		size_t len;			/* STREAM only: length of data to receive */
diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index e5126cff9b2a..24426c3d6b5a 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -56,7 +56,7 @@ struct phonethdr {
 	__be16	pn_length;
 	__u8	pn_robj;
 	__u8	pn_sobj;
-} __attribute__((packed));
+} __packed;
 
 /* Common Phonet payload header */
 struct phonetmsg {
@@ -98,7 +98,7 @@ struct sockaddr_pn {
 	__u8 spn_dev;
 	__u8 spn_resource;
 	__u8 spn_zero[sizeof(struct sockaddr) - sizeof(sa_family_t) - 3];
-} __attribute__ ((packed));
+} __packed;
 
 /* Well known address */
 #define PN_DEV_PC	0x10
diff --git a/include/linux/rds.h b/include/linux/rds.h
index cab4994c2f63..24bce3ded9ea 100644
--- a/include/linux/rds.h
+++ b/include/linux/rds.h
@@ -100,7 +100,7 @@
 struct rds_info_counter {
 	u_int8_t	name[32];
 	u_int64_t	value;
-} __attribute__((packed));
+} __packed;
 
 #define RDS_INFO_CONNECTION_FLAG_SENDING	0x01
 #define RDS_INFO_CONNECTION_FLAG_CONNECTING	0x02
@@ -115,7 +115,7 @@ struct rds_info_connection {
 	__be32		faddr;
 	u_int8_t	transport[TRANSNAMSIZ];		/* null term ascii */
 	u_int8_t	flags;
-} __attribute__((packed));
+} __packed;
 
 struct rds_info_flow {
 	__be32		laddr;
@@ -123,7 +123,7 @@ struct rds_info_flow {
 	u_int32_t	bytes;
 	__be16		lport;
 	__be16		fport;
-} __attribute__((packed));
+} __packed;
 
 #define RDS_INFO_MESSAGE_FLAG_ACK               0x01
 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK          0x02
@@ -136,7 +136,7 @@ struct rds_info_message {
 	__be16		lport;
 	__be16		fport;
 	u_int8_t	flags;
-} __attribute__((packed));
+} __packed;
 
 struct rds_info_socket {
 	u_int32_t	sndbuf;
@@ -146,7 +146,7 @@ struct rds_info_socket {
 	__be16		connected_port;
 	u_int32_t	rcvbuf;
 	u_int64_t	inum;
-} __attribute__((packed));
+} __packed;
 
 struct rds_info_tcp_socket {
 	__be32          local_addr;
@@ -158,7 +158,7 @@ struct rds_info_tcp_socket {
 	u_int32_t       last_sent_nxt;
 	u_int32_t       last_expected_una;
 	u_int32_t       last_seen_una;
-} __attribute__((packed));
+} __packed;
 
 #define RDS_IB_GID_LEN	16
 struct rds_info_rdma_connection {
diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index c20d3ce673c0..c11a28706fa4 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -61,7 +61,7 @@ typedef struct sctphdr {
 	__be16 dest;
 	__be32 vtag;
 	__le32 checksum;
-} __attribute__((packed)) sctp_sctphdr_t;
+} __packed sctp_sctphdr_t;
 
 #ifdef __KERNEL__
 #include <linux/skbuff.h>
@@ -77,7 +77,7 @@ typedef struct sctp_chunkhdr {
 	__u8 type;
 	__u8 flags;
 	__be16 length;
-} __attribute__((packed)) sctp_chunkhdr_t;
+} __packed sctp_chunkhdr_t;
 
 
 /* Section 3.2.  Chunk Type Values.
@@ -167,7 +167,7 @@ enum { SCTP_CHUNK_FLAG_T = 0x01 };
 typedef struct sctp_paramhdr {
 	__be16 type;
 	__be16 length;
-} __attribute__((packed)) sctp_paramhdr_t;
+} __packed sctp_paramhdr_t;
 
 typedef enum {
 
@@ -228,12 +228,12 @@ typedef struct sctp_datahdr {
 	__be16 ssn;
 	__be32 ppid;
 	__u8  payload[0];
-} __attribute__((packed)) sctp_datahdr_t;
+} __packed sctp_datahdr_t;
 
 typedef struct sctp_data_chunk {
         sctp_chunkhdr_t chunk_hdr;
         sctp_datahdr_t  data_hdr;
-} __attribute__((packed)) sctp_data_chunk_t;
+} __packed sctp_data_chunk_t;
 
 /* DATA Chuck Specific Flags */
 enum {
@@ -259,78 +259,78 @@ typedef struct sctp_inithdr {
 	__be16 num_inbound_streams;
 	__be32 initial_tsn;
 	__u8  params[0];
-} __attribute__((packed)) sctp_inithdr_t;
+} __packed sctp_inithdr_t;
 
 typedef struct sctp_init_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_inithdr_t init_hdr;
-} __attribute__((packed)) sctp_init_chunk_t;
+} __packed sctp_init_chunk_t;
 
 
 /* Section 3.3.2.1. IPv4 Address Parameter (5) */
 typedef struct sctp_ipv4addr_param {
 	sctp_paramhdr_t param_hdr;
 	struct in_addr  addr;
-} __attribute__((packed)) sctp_ipv4addr_param_t;
+} __packed sctp_ipv4addr_param_t;
 
 /* Section 3.3.2.1. IPv6 Address Parameter (6) */
 typedef struct sctp_ipv6addr_param {
 	sctp_paramhdr_t param_hdr;
 	struct in6_addr addr;
-} __attribute__((packed)) sctp_ipv6addr_param_t;
+} __packed sctp_ipv6addr_param_t;
 
 /* Section 3.3.2.1 Cookie Preservative (9) */
 typedef struct sctp_cookie_preserve_param {
 	sctp_paramhdr_t param_hdr;
 	__be32          lifespan_increment;
-} __attribute__((packed)) sctp_cookie_preserve_param_t;
+} __packed sctp_cookie_preserve_param_t;
 
 /* Section 3.3.2.1 Host Name Address (11) */
 typedef struct sctp_hostname_param {
 	sctp_paramhdr_t param_hdr;
 	uint8_t hostname[0];
-} __attribute__((packed)) sctp_hostname_param_t;
+} __packed sctp_hostname_param_t;
 
 /* Section 3.3.2.1 Supported Address Types (12) */
 typedef struct sctp_supported_addrs_param {
 	sctp_paramhdr_t param_hdr;
 	__be16 types[0];
-} __attribute__((packed)) sctp_supported_addrs_param_t;
+} __packed sctp_supported_addrs_param_t;
 
 /* Appendix A. ECN Capable (32768) */
 typedef struct sctp_ecn_capable_param {
 	sctp_paramhdr_t param_hdr;
-} __attribute__((packed)) sctp_ecn_capable_param_t;
+} __packed sctp_ecn_capable_param_t;
 
 /* ADDIP Section 3.2.6 Adaptation Layer Indication */
 typedef struct sctp_adaptation_ind_param {
 	struct sctp_paramhdr param_hdr;
 	__be32 adaptation_ind;
-} __attribute__((packed)) sctp_adaptation_ind_param_t;
+} __packed sctp_adaptation_ind_param_t;
 
 /* ADDIP Section 4.2.7 Supported Extensions Parameter */
 typedef struct sctp_supported_ext_param {
 	struct sctp_paramhdr param_hdr;
 	__u8 chunks[0];
-} __attribute__((packed)) sctp_supported_ext_param_t;
+} __packed sctp_supported_ext_param_t;
 
 /* AUTH Section 3.1 Random */
 typedef struct sctp_random_param {
 	sctp_paramhdr_t param_hdr;
 	__u8 random_val[0];
-} __attribute__((packed)) sctp_random_param_t;
+} __packed sctp_random_param_t;
 
 /* AUTH Section 3.2 Chunk List */
 typedef struct sctp_chunks_param {
 	sctp_paramhdr_t param_hdr;
 	__u8 chunks[0];
-} __attribute__((packed)) sctp_chunks_param_t;
+} __packed sctp_chunks_param_t;
 
 /* AUTH Section 3.3 HMAC Algorithm */
 typedef struct sctp_hmac_algo_param {
 	sctp_paramhdr_t param_hdr;
 	__be16 hmac_ids[0];
-} __attribute__((packed)) sctp_hmac_algo_param_t;
+} __packed sctp_hmac_algo_param_t;
 
 /* RFC 2960.  Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2):
  *   The INIT ACK chunk is used to acknowledge the initiation of an SCTP
@@ -342,13 +342,13 @@ typedef sctp_init_chunk_t sctp_initack_chunk_t;
 typedef struct sctp_cookie_param {
 	sctp_paramhdr_t p;
 	__u8 body[0];
-} __attribute__((packed)) sctp_cookie_param_t;
+} __packed sctp_cookie_param_t;
 
 /* Section 3.3.3.1 Unrecognized Parameters (8) */
 typedef struct sctp_unrecognized_param {
 	sctp_paramhdr_t param_hdr;
 	sctp_paramhdr_t unrecognized;
-} __attribute__((packed)) sctp_unrecognized_param_t;
+} __packed sctp_unrecognized_param_t;
 
 
@@ -363,7 +363,7 @@ typedef struct sctp_unrecognized_param {
 typedef struct sctp_gap_ack_block {
 	__be16 start;
 	__be16 end;
-} __attribute__((packed)) sctp_gap_ack_block_t;
+} __packed sctp_gap_ack_block_t;
 
 typedef __be32 sctp_dup_tsn_t;
 
@@ -378,12 +378,12 @@ typedef struct sctp_sackhdr {
 	__be16 num_gap_ack_blocks;
 	__be16 num_dup_tsns;
 	sctp_sack_variable_t variable[0];
-} __attribute__((packed)) sctp_sackhdr_t;
+} __packed sctp_sackhdr_t;
 
 typedef struct sctp_sack_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_sackhdr_t sack_hdr;
-} __attribute__((packed)) sctp_sack_chunk_t;
+} __packed sctp_sack_chunk_t;
 
 
 /* RFC 2960.  Section 3.3.5 Heartbeat Request (HEARTBEAT) (4):
@@ -395,12 +395,12 @@ typedef struct sctp_sack_chunk {
 
 typedef struct sctp_heartbeathdr {
 	sctp_paramhdr_t info;
-} __attribute__((packed)) sctp_heartbeathdr_t;
+} __packed sctp_heartbeathdr_t;
 
 typedef struct sctp_heartbeat_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_heartbeathdr_t hb_hdr;
-} __attribute__((packed)) sctp_heartbeat_chunk_t;
+} __packed sctp_heartbeat_chunk_t;
 
 
 /* For the abort and shutdown ACK we must carry the init tag in the
@@ -409,7 +409,7 @@ typedef struct sctp_heartbeat_chunk {
  */
 typedef struct sctp_abort_chunk {
         sctp_chunkhdr_t uh;
-} __attribute__((packed)) sctp_abort_chunk_t;
+} __packed sctp_abort_chunk_t;
 
 
 /* For the graceful shutdown we must carry the tag (in common header)
@@ -417,12 +417,12 @@ typedef struct sctp_abort_chunk {
  */
 typedef struct sctp_shutdownhdr {
 	__be32 cum_tsn_ack;
-} __attribute__((packed)) sctp_shutdownhdr_t;
+} __packed sctp_shutdownhdr_t;
 
 struct sctp_shutdown_chunk_t {
         sctp_chunkhdr_t    chunk_hdr;
         sctp_shutdownhdr_t shutdown_hdr;
-} __attribute__ ((packed));
+} __packed;
 
 /* RFC 2960.  Section 3.3.10 Operation Error (ERROR) (9) */
 
@@ -430,12 +430,12 @@ typedef struct sctp_errhdr {
 	__be16 cause;
 	__be16 length;
 	__u8  variable[0];
-} __attribute__((packed)) sctp_errhdr_t;
+} __packed sctp_errhdr_t;
 
 typedef struct sctp_operr_chunk {
         sctp_chunkhdr_t chunk_hdr;
 	sctp_errhdr_t   err_hdr;
-} __attribute__((packed)) sctp_operr_chunk_t;
+} __packed sctp_operr_chunk_t;
 
 /* RFC 2960 3.3.10 - Operation Error
  *
@@ -525,7 +525,7 @@ typedef struct sctp_ecnehdr {
 typedef struct sctp_ecne_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_ecnehdr_t ence_hdr;
-} __attribute__((packed)) sctp_ecne_chunk_t;
+} __packed sctp_ecne_chunk_t;
 
 /* RFC 2960.  Appendix A.  Explicit Congestion Notification.
  *   Congestion Window Reduced (CWR) (13)
@@ -537,7 +537,7 @@ typedef struct sctp_cwrhdr {
 typedef struct sctp_cwr_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_cwrhdr_t cwr_hdr;
-} __attribute__((packed)) sctp_cwr_chunk_t;
+} __packed sctp_cwr_chunk_t;
 
 /* PR-SCTP
  * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN)
@@ -588,17 +588,17 @@ typedef struct sctp_cwr_chunk {
 struct sctp_fwdtsn_skip {
 	__be16 stream;
 	__be16 ssn;
-} __attribute__((packed));
+} __packed;
 
 struct sctp_fwdtsn_hdr {
 	__be32 new_cum_tsn;
 	struct sctp_fwdtsn_skip skip[0];
-} __attribute((packed));
+} __packed;
 
 struct sctp_fwdtsn_chunk {
 	struct sctp_chunkhdr chunk_hdr;
 	struct sctp_fwdtsn_hdr fwdtsn_hdr;
-} __attribute((packed));
+} __packed;
 
 
 /* ADDIP
@@ -636,17 +636,17 @@ struct sctp_fwdtsn_chunk {
 typedef struct sctp_addip_param {
 	sctp_paramhdr_t	param_hdr;
 	__be32		crr_id;
-} __attribute__((packed)) sctp_addip_param_t;
+} __packed sctp_addip_param_t;
 
 typedef struct sctp_addiphdr {
 	__be32	serial;
 	__u8	params[0];
-} __attribute__((packed)) sctp_addiphdr_t;
+} __packed sctp_addiphdr_t;
 
 typedef struct sctp_addip_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_addiphdr_t addip_hdr;
-} __attribute__((packed)) sctp_addip_chunk_t;
+} __packed sctp_addip_chunk_t;
 
 /* AUTH
  * Section 4.1  Authentication Chunk (AUTH)
@@ -701,11 +701,11 @@ typedef struct sctp_authhdr {
 	__be16 shkey_id;
 	__be16 hmac_id;
 	__u8   hmac[0];
-} __attribute__((packed)) sctp_authhdr_t;
+} __packed sctp_authhdr_t;
 
 typedef struct sctp_auth_chunk {
 	sctp_chunkhdr_t chunk_hdr;
 	sctp_authhdr_t auth_hdr;
-} __attribute__((packed)) sctp_auth_chunk_t;
+} __packed sctp_auth_chunk_t;
 
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/linux/wlp.h b/include/linux/wlp.h
index ac95ce6606ac..c76fe2392506 100644
--- a/include/linux/wlp.h
+++ b/include/linux/wlp.h
@@ -300,7 +300,7 @@ struct wlp_ie {
 	__le16 cycle_param;
 	__le16 acw_anchor_addr;
 	u8 wssid_hash_list[];
-} __attribute__((packed));
+} __packed;
 
 static inline int wlp_ie_hash_length(struct wlp_ie *ie)
 {
@@ -324,7 +324,7 @@ static inline void wlp_ie_set_hash_length(struct wlp_ie *ie, int hash_length)
  */
 struct wlp_nonce {
 	u8 data[16];
-} __attribute__((packed));
+} __packed;
 
 /**
  * WLP UUID
@@ -336,7 +336,7 @@ struct wlp_nonce {
  */
 struct wlp_uuid {
 	u8 data[16];
-} __attribute__((packed));
+} __packed;
 
 
 /**
@@ -348,7 +348,7 @@ struct wlp_dev_type {
 	u8 OUI[3];
 	u8 OUIsubdiv;
 	__le16 subID;
-} __attribute__((packed));
+} __packed;
 
 /**
  * WLP frame header
@@ -357,7 +357,7 @@ struct wlp_dev_type {
 struct wlp_frame_hdr {
 	__le16 mux_hdr;			/* WLP_PROTOCOL_ID */
 	enum wlp_frame_type type:8;
-} __attribute__((packed));
+} __packed;
 
 /**
  * WLP attribute field header
@@ -368,7 +368,7 @@ struct wlp_frame_hdr {
 struct wlp_attr_hdr {
 	__le16 type;
 	__le16 length;
-} __attribute__((packed));
+} __packed;
 
 /**
  * Device information commonly used together
@@ -401,13 +401,13 @@ struct wlp_device_info {
 struct wlp_attr_##name {						\
 	struct wlp_attr_hdr hdr;					\
 	type name;							\
-} __attribute__((packed));
+} __packed;
 
 #define wlp_attr_array(type, name)					\
 struct wlp_attr_##name {						\
 	struct wlp_attr_hdr hdr;					\
 	type name[];							\
-} __attribute__((packed));
+} __packed;
 
 /**
  * WLP association attribute fields
@@ -483,7 +483,7 @@ struct wlp_wss_info {
 	struct wlp_attr_accept_enrl accept;
 	struct wlp_attr_wss_sec_status sec_stat;
 	struct wlp_attr_wss_bcast bcast;
-} __attribute__((packed));
+} __packed;
 
 /* WLP WSS Information */
 wlp_attr_array(struct wlp_wss_info, wss_info)
@@ -520,7 +520,7 @@ wlp_attr(u8, wlp_assc_err)
 struct wlp_frame_std_abbrv_hdr {
 	struct wlp_frame_hdr hdr;
 	u8 tag;
-} __attribute__((packed));
+} __packed;
 
 /**
  * WLP association frames
@@ -533,7 +533,7 @@ struct wlp_frame_assoc {
 	struct wlp_attr_version version;
 	struct wlp_attr_msg_type msg_type;
 	u8 attr[];
-} __attribute__((packed));
+} __packed;
 
 /* Ethernet to dev address mapping */
 struct wlp_eda {
diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h
index 511a459ec10f..0916bbf3bdff 100644
--- a/include/net/dn_dev.h
+++ b/include/net/dn_dev.h
@@ -101,7 +101,7 @@ struct dn_short_packet {
 	__le16 dstnode;
 	__le16 srcnode;
 	__u8   forward;
-} __attribute__((packed));
+} __packed;
 
 struct dn_long_packet {
 	__u8   msgflg;
@@ -115,7 +115,7 @@ struct dn_long_packet {
 	__u8   visit_ct;
 	__u8   s_class;
 	__u8   pt;
-} __attribute__((packed));
+} __packed;
 
 /*------------------------- DRP - Routing messages ---------------------*/
 
@@ -132,7 +132,7 @@ struct endnode_hello_message {
 	__u8   mpd;
 	__u8   datalen;
 	__u8   data[2];
-} __attribute__((packed));
+} __packed;
 
 struct rtnode_hello_message {
 	__u8   msgflg;
@@ -144,7 +144,7 @@ struct rtnode_hello_message {
 	__u8   area;
 	__le16  timer;
 	__u8   mpd;
-} __attribute__((packed));
+} __packed;
 
 
 extern void dn_dev_init(void);
diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h
index 17d43d2db5ec..e43a2893f132 100644
--- a/include/net/dn_nsp.h
+++ b/include/net/dn_nsp.h
@@ -74,18 +74,18 @@ struct nsp_data_seg_msg {
 	__u8   msgflg;
 	__le16 dstaddr;
 	__le16 srcaddr;
-} __attribute__((packed));
+} __packed;
 
 struct nsp_data_opt_msg {
 	__le16 acknum;
 	__le16 segnum;
 	__le16 lsflgs;
-} __attribute__((packed));
+} __packed;
 
 struct nsp_data_opt_msg1 {
 	__le16 acknum;
 	__le16 segnum;
-} __attribute__((packed));
+} __packed;
 
 
 /* Acknowledgment Message (data/other data)                             */
@@ -94,13 +94,13 @@ struct nsp_data_ack_msg {
 	__le16 dstaddr;
 	__le16 srcaddr;
 	__le16 acknum;
-} __attribute__((packed));
+} __packed;
 
 /* Connect Acknowledgment Message */
 struct  nsp_conn_ack_msg {
 	__u8 msgflg;
 	__le16 dstaddr;
-} __attribute__((packed));
+} __packed;
 
 
 /* Connect Initiate/Retransmit Initiate/Connect Confirm */
@@ -117,7 +117,7 @@ struct  nsp_conn_init_msg {
 #define NSP_FC_MASK   0x0c            /* FC type mask         */
 	__u8   info;
 	__le16 segsize;
-} __attribute__((packed));
+} __packed;
 
 /* Disconnect Initiate/Disconnect Confirm */
 struct  nsp_disconn_init_msg {
@@ -125,7 +125,7 @@ struct  nsp_disconn_init_msg {
 	__le16 dstaddr;
 	__le16 srcaddr;
 	__le16 reason;
-} __attribute__((packed));
+} __packed;
 
 
@@ -135,7 +135,7 @@ struct  srcobj_fmt {
 	__le16 grpcode;
 	__le16 usrcode;
 	__u8   dlen;
-} __attribute__((packed));
+} __packed;
 
 /*
  * A collection of functions for manipulating the sequence
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index fbf9d1cda27b..fc94ec568a50 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -27,6 +27,6 @@ struct ipv6_tlv_tnl_enc_lim {
 	__u8 type;		/* type-code for option         */
 	__u8 length;		/* option length                */
 	__u8 encap_limit;	/* tunnel encapsulation limit   */
-} __attribute__ ((packed));
+} __packed;
 
 #endif
diff --git a/include/net/ipx.h b/include/net/ipx.h
index ef51a668ba19..05d7e4a88b49 100644
--- a/include/net/ipx.h
+++ b/include/net/ipx.h
@@ -27,9 +27,9 @@ struct ipx_address {
 #define IPX_MAX_PPROP_HOPS 8
 
 struct ipxhdr {
-	__be16			ipx_checksum __attribute__ ((packed));
+	__be16			ipx_checksum __packed;
 #define IPX_NO_CHECKSUM	cpu_to_be16(0xFFFF)
-	__be16			ipx_pktsize __attribute__ ((packed));
+	__be16			ipx_pktsize __packed;
 	__u8			ipx_tctrl;
 	__u8			ipx_type;
 #define IPX_TYPE_UNKNOWN	0x00
@@ -38,8 +38,8 @@ struct ipxhdr {
 #define IPX_TYPE_SPX		0x05	/* SPX protocol */
 #define IPX_TYPE_NCP		0x11	/* $lots for docs on this (SPIT) */
 #define IPX_TYPE_PPROP		0x14	/* complicated flood fill brdcast */
-	struct ipx_address	ipx_dest __attribute__ ((packed));
-	struct ipx_address	ipx_source __attribute__ ((packed));
+	struct ipx_address	ipx_dest __packed;
+	struct ipx_address	ipx_source __packed;
 };
 
 static __inline__ struct ipxhdr *ipx_hdr(struct sk_buff *skb)
diff --git a/include/net/mip6.h b/include/net/mip6.h
index a83ad1982a90..26ba99b5a4b1 100644
--- a/include/net/mip6.h
+++ b/include/net/mip6.h
@@ -39,7 +39,7 @@ struct ip6_mh {
 	__u16	ip6mh_cksum;
 	/* Followed by type specific messages */
 	__u8	data[0];
-} __attribute__ ((__packed__));
+} __packed;
 
 #define IP6_MH_TYPE_BRR		0   /* Binding Refresh Request */
 #define IP6_MH_TYPE_HOTI	1   /* HOTI Message   */
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index f76f22d05721..895997bc2ead 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -82,7 +82,7 @@ struct ra_msg {
 struct nd_opt_hdr {
 	__u8		nd_opt_type;
 	__u8		nd_opt_len;
-} __attribute__((__packed__));
+} __packed;
 
 
 extern int			ndisc_init(void);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 4b860116e096..f9e7473613bd 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -443,7 +443,7 @@ struct sctp_signed_cookie {
 	__u8 signature[SCTP_SECRET_SIZE];
 	__u32 __pad;		/* force sctp_cookie alignment to 64 bits */
 	struct sctp_cookie c;
-} __attribute__((packed));
+} __packed;
 
 /* This is another convenience type to allocate memory for address
  * params for the maximum size and pass such structures around
@@ -488,7 +488,7 @@ typedef struct sctp_sender_hb_info {
 	union sctp_addr daddr;
 	unsigned long sent_at;
 	__u64 hb_nonce;
-} __attribute__((packed)) sctp_sender_hb_info_t;
+} __packed sctp_sender_hb_info_t;
 
 /*
  *  RFC 2960 1.3.2 Sequenced Delivery within Streams
diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h
index b69e6e173ea1..9b2c30897e50 100644
--- a/include/rxrpc/packet.h
+++ b/include/rxrpc/packet.h
@@ -65,7 +65,7 @@ struct rxrpc_header {
 	};
 	__be16		serviceId;	/* service ID */
 
-} __attribute__((packed));
+} __packed;
 
 #define __rxrpc_header_off(X) offsetof(struct rxrpc_header,X)
 
@@ -120,7 +120,7 @@ struct rxrpc_ackpacket {
 #define RXRPC_ACK_TYPE_NACK		0
 #define RXRPC_ACK_TYPE_ACK		1
 
-} __attribute__((packed));
+} __packed;
 
 /*
  * ACK packets can have a further piece of information tagged on the end
@@ -141,7 +141,7 @@ struct rxkad_challenge {
 	__be32		nonce;		/* encrypted random number */
 	__be32		min_level;	/* minimum security level */
 	__be32		__padding;	/* padding to 8-byte boundary */
-} __attribute__((packed));
+} __packed;
 
 /*****************************************************************************/
 /*
@@ -164,7 +164,7 @@ struct rxkad_response {
 
 	__be32		kvno;		/* Kerberos key version number */
 	__be32		ticket_len;	/* Kerberos ticket length  */
-} __attribute__((packed));
+} __packed;
 
 /*****************************************************************************/
 /*
diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h
index 0d9e506f5d5a..70672544db86 100644
--- a/net/bluetooth/bnep/bnep.h
+++ b/net/bluetooth/bnep/bnep.h
@@ -86,26 +86,26 @@ struct bnep_setup_conn_req {
 	__u8  ctrl;
 	__u8  uuid_size;
 	__u8  service[0];
-} __attribute__((packed));
+} __packed;
 
 struct bnep_set_filter_req {
 	__u8  type;
 	__u8  ctrl;
 	__be16 len;
 	__u8  list[0];
-} __attribute__((packed));
+} __packed;
 
 struct bnep_control_rsp {
 	__u8  type;
 	__u8  ctrl;
 	__be16 resp;
-} __attribute__((packed));
+} __packed;
 
 struct bnep_ext_hdr {
 	__u8  type;
 	__u8  len;
 	__u8  data[0];
-} __attribute__((packed));
+} __packed;
 
 /* BNEP ioctl defines */
 #define BNEPCONNADD	_IOW('B', 200, int)
diff --git a/net/compat.c b/net/compat.c
index ec24d9edb025..1cf75905f132 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -531,7 +531,7 @@ struct compat_group_req {
 	__u32				 gr_interface;
 	struct __kernel_sockaddr_storage gr_group
 		__attribute__ ((aligned(4)));
-} __attribute__ ((packed));
+} __packed;
 
 struct compat_group_source_req {
 	__u32				 gsr_interface;
@@ -539,7 +539,7 @@ struct compat_group_source_req {
 		__attribute__ ((aligned(4)));
 	struct __kernel_sockaddr_storage gsr_source
 		__attribute__ ((aligned(4)));
-} __attribute__ ((packed));
+} __packed;
 
 struct compat_group_filter {
 	__u32				 gf_interface;
@@ -549,7 +549,7 @@ struct compat_group_filter {
 	__u32				 gf_numsrc;
 	struct __kernel_sockaddr_storage gf_slist[1]
 		__attribute__ ((aligned(4)));
-} __attribute__ ((packed));
+} __packed;
 
 #define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \
 			sizeof(struct __kernel_sockaddr_storage))
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index f28ad2cc8428..499c045d6910 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -1463,7 +1463,7 @@ struct iucv_path_pending {
 	u32 res3;
 	u8  ippollfg;
 	u8  res4[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_pending(struct iucv_irq_data *data)
 {
@@ -1524,7 +1524,7 @@ struct iucv_path_complete {
 	u32 res3;
 	u8  ippollfg;
 	u8  res4[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_complete(struct iucv_irq_data *data)
 {
@@ -1554,7 +1554,7 @@ struct iucv_path_severed {
 	u32 res4;
 	u8  ippollfg;
 	u8  res5[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_severed(struct iucv_irq_data *data)
 {
@@ -1590,7 +1590,7 @@ struct iucv_path_quiesced {
 	u32 res4;
 	u8  ippollfg;
 	u8  res5[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_quiesced(struct iucv_irq_data *data)
 {
@@ -1618,7 +1618,7 @@ struct iucv_path_resumed {
 	u32 res4;
 	u8  ippollfg;
 	u8  res5[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_resumed(struct iucv_irq_data *data)
 {
@@ -1649,7 +1649,7 @@ struct iucv_message_complete {
 	u32 ipbfln2f;
 	u8  ippollfg;
 	u8  res2[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_message_complete(struct iucv_irq_data *data)
 {
@@ -1694,7 +1694,7 @@ struct iucv_message_pending {
 	u32 ipbfln2f;
 	u8  ippollfg;
 	u8  res2[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_message_pending(struct iucv_irq_data *data)
 {
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index c7000a6ca379..a2ed0f7b5568 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -600,7 +600,7 @@ struct iapp_layer2_update {
 	u8 ssap;		/* 0 */
 	u8 control;
 	u8 xid_info[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void ieee80211_send_layer2_update(struct sta_info *sta)
 {
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 1a9e2da37a93..ec3e5c3e27bd 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1084,7 +1084,7 @@ struct ieee80211_tx_status_rtap_hdr {
 	u8 padding_for_rate;
 	__le16 tx_flags;
 	u8 data_retries;
-} __attribute__ ((packed));
+} __packed;
 
 
 /* HT */
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 6e2a7bcd8cb8..2d9a2ee94e17 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2139,7 +2139,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
 		u8 rate_or_pad;
 		__le16 chan_freq;
 		__le16 chan_flags;
-	} __attribute__ ((packed)) *rthdr;
+	} __packed *rthdr;
 	struct sk_buff *skb = rx->skb, *skb2;
 	struct net_device *prev_dev = NULL;
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index bd2a50b482ac..246f92924658 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1817,7 +1817,7 @@ malformed:
 struct __sctp_missing {
 	__be32 num_missing;
 	__be16 type;
-}  __attribute__((packed));
+}  __packed;
 
 /*
  * Report a missing mandatory parameter.
-- 
cgit v1.2.3-59-g8ed1b


From 724df615928b7050d33b6243f60b12bd87484fc7 Mon Sep 17 00:00:00 2001
From: "Justin P. Mattock" <justinmattock@gmail.com>
Date: Wed, 26 May 2010 09:22:40 -0700
Subject: fix comment typo in netdevice.h

Fix missing "of" in comment.

 Signed-off-by: Justin P. Mattock <justinmattock@gmail.com>

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a1bff6518166..c761c903772e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -775,7 +775,7 @@ struct net_device {
 	/*
 	 * This is the first field of the "visible" part of this structure
 	 * (i.e. as seen by users in the "Space.c" file).  It is the name
-	 * the interface.
+	 * of the interface.
 	 */
 	char			name[IFNAMSIZ];
 
-- 
cgit v1.2.3-59-g8ed1b


From 14f92952bf74a365ca7f9dfbec158e7c933ea723 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 3 Jun 2010 19:37:27 -0700
Subject: ssb: add dma_dev to ssb_device structure

Add dma_dev, a pointer to struct device, to struct ssb_device.  We pass it
to the generic DMA API with SSB_BUSTYPE_PCI and SSB_BUSTYPE_SSB.
ssb_devices_register() sets up it properly.

This is preparation for replacing the ssb bus specific DMA API (ssb_dma_*)
with the generic DMA API.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Michael Buesch <mb@bu3sch.de>
Cc: Gary Zambrano <zambrano@broadcom.com>
Cc: Stefano Brivio <stefano.brivio@polimi.it>
Cc: Larry Finger <Larry.Finger@lwfinger.net>
Cc: John W. Linville <linville@tuxdriver.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/ssb/main.c      | 2 ++
 include/linux/ssb/ssb.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c
index 51275aac5b34..a732b396e602 100644
--- a/drivers/ssb/main.c
+++ b/drivers/ssb/main.c
@@ -486,6 +486,7 @@ static int ssb_devices_register(struct ssb_bus *bus)
 #ifdef CONFIG_SSB_PCIHOST
 			sdev->irq = bus->host_pci->irq;
 			dev->parent = &bus->host_pci->dev;
+			sdev->dma_dev = dev->parent;
 #endif
 			break;
 		case SSB_BUSTYPE_PCMCIA:
@@ -501,6 +502,7 @@ static int ssb_devices_register(struct ssb_bus *bus)
 			break;
 		case SSB_BUSTYPE_SSB:
 			dev->dma_mask = &dev->coherent_dma_mask;
+			sdev->dma_dev = dev;
 			break;
 		}
 
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index a2608bff9c78..0d5f04316b35 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -167,7 +167,7 @@ struct ssb_device {
 	 * is an optimization. */
 	const struct ssb_bus_ops *ops;
 
-	struct device *dev;
+	struct device *dev, *dma_dev;
 
 	struct ssb_bus *bus;
 	struct ssb_device_id id;
-- 
cgit v1.2.3-59-g8ed1b


From 467429b475e56f154f93b3b14fd75f238d14597a Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 3 Jun 2010 19:37:44 -0700
Subject: ssb: remove the ssb DMA API

Now they are unnecessary.  We can use the generic DMA API with any bus.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Michael Buesch <mb@bu3sch.de>
Cc: Gary Zambrano <zambrano@broadcom.com>
Cc: Stefano Brivio <stefano.brivio@polimi.it>
Cc: Larry Finger <Larry.Finger@lwfinger.net>
Cc: John W. Linville <linville@tuxdriver.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/ssb/main.c      |  74 -----------------------
 include/linux/ssb/ssb.h | 157 ------------------------------------------------
 2 files changed, 231 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c
index a732b396e602..7cee7f4eb60b 100644
--- a/drivers/ssb/main.c
+++ b/drivers/ssb/main.c
@@ -1228,80 +1228,6 @@ u32 ssb_dma_translation(struct ssb_device *dev)
 }
 EXPORT_SYMBOL(ssb_dma_translation);
 
-int ssb_dma_set_mask(struct ssb_device *dev, u64 mask)
-{
-#ifdef CONFIG_SSB_PCIHOST
-	int err;
-#endif
-
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		err = pci_set_dma_mask(dev->bus->host_pci, mask);
-		if (err)
-			return err;
-		err = pci_set_consistent_dma_mask(dev->bus->host_pci, mask);
-		return err;
-#endif
-	case SSB_BUSTYPE_SSB:
-		return dma_set_mask(dev->dev, mask);
-	default:
-		__ssb_dma_not_implemented(dev);
-	}
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(ssb_dma_set_mask);
-
-void * ssb_dma_alloc_consistent(struct ssb_device *dev, size_t size,
-				dma_addr_t *dma_handle, gfp_t gfp_flags)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		if (gfp_flags & GFP_DMA) {
-			/* Workaround: The PCI API does not support passing
-			 * a GFP flag. */
-			return dma_alloc_coherent(&dev->bus->host_pci->dev,
-						  size, dma_handle, gfp_flags);
-		}
-		return pci_alloc_consistent(dev->bus->host_pci, size, dma_handle);
-#endif
-	case SSB_BUSTYPE_SSB:
-		return dma_alloc_coherent(dev->dev, size, dma_handle, gfp_flags);
-	default:
-		__ssb_dma_not_implemented(dev);
-	}
-	return NULL;
-}
-EXPORT_SYMBOL(ssb_dma_alloc_consistent);
-
-void ssb_dma_free_consistent(struct ssb_device *dev, size_t size,
-			     void *vaddr, dma_addr_t dma_handle,
-			     gfp_t gfp_flags)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		if (gfp_flags & GFP_DMA) {
-			/* Workaround: The PCI API does not support passing
-			 * a GFP flag. */
-			dma_free_coherent(&dev->bus->host_pci->dev,
-					  size, vaddr, dma_handle);
-			return;
-		}
-		pci_free_consistent(dev->bus->host_pci, size,
-				    vaddr, dma_handle);
-		return;
-#endif
-	case SSB_BUSTYPE_SSB:
-		dma_free_coherent(dev->dev, size, vaddr, dma_handle);
-		return;
-	default:
-		__ssb_dma_not_implemented(dev);
-	}
-}
-EXPORT_SYMBOL(ssb_dma_free_consistent);
-
 int ssb_bus_may_powerdown(struct ssb_bus *bus)
 {
 	struct ssb_chipcommon *cc;
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 0d5f04316b35..623b704fdc42 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -470,14 +470,6 @@ extern u32 ssb_dma_translation(struct ssb_device *dev);
 #define SSB_DMA_TRANSLATION_MASK	0xC0000000
 #define SSB_DMA_TRANSLATION_SHIFT	30
 
-extern int ssb_dma_set_mask(struct ssb_device *dev, u64 mask);
-
-extern void * ssb_dma_alloc_consistent(struct ssb_device *dev, size_t size,
-				       dma_addr_t *dma_handle, gfp_t gfp_flags);
-extern void ssb_dma_free_consistent(struct ssb_device *dev, size_t size,
-				    void *vaddr, dma_addr_t dma_handle,
-				    gfp_t gfp_flags);
-
 static inline void __cold __ssb_dma_not_implemented(struct ssb_device *dev)
 {
 #ifdef CONFIG_SSB_DEBUG
@@ -486,155 +478,6 @@ static inline void __cold __ssb_dma_not_implemented(struct ssb_device *dev)
 #endif /* DEBUG */
 }
 
-static inline int ssb_dma_mapping_error(struct ssb_device *dev, dma_addr_t addr)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		return pci_dma_mapping_error(dev->bus->host_pci, addr);
-#endif
-		break;
-	case SSB_BUSTYPE_SSB:
-		return dma_mapping_error(dev->dev, addr);
-	default:
-		break;
-	}
-	__ssb_dma_not_implemented(dev);
-	return -ENOSYS;
-}
-
-static inline dma_addr_t ssb_dma_map_single(struct ssb_device *dev, void *p,
-					    size_t size, enum dma_data_direction dir)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		return pci_map_single(dev->bus->host_pci, p, size, dir);
-#endif
-		break;
-	case SSB_BUSTYPE_SSB:
-		return dma_map_single(dev->dev, p, size, dir);
-	default:
-		break;
-	}
-	__ssb_dma_not_implemented(dev);
-	return 0;
-}
-
-static inline void ssb_dma_unmap_single(struct ssb_device *dev, dma_addr_t dma_addr,
-					size_t size, enum dma_data_direction dir)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		pci_unmap_single(dev->bus->host_pci, dma_addr, size, dir);
-		return;
-#endif
-		break;
-	case SSB_BUSTYPE_SSB:
-		dma_unmap_single(dev->dev, dma_addr, size, dir);
-		return;
-	default:
-		break;
-	}
-	__ssb_dma_not_implemented(dev);
-}
-
-static inline void ssb_dma_sync_single_for_cpu(struct ssb_device *dev,
-					       dma_addr_t dma_addr,
-					       size_t size,
-					       enum dma_data_direction dir)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		pci_dma_sync_single_for_cpu(dev->bus->host_pci, dma_addr,
-					    size, dir);
-		return;
-#endif
-		break;
-	case SSB_BUSTYPE_SSB:
-		dma_sync_single_for_cpu(dev->dev, dma_addr, size, dir);
-		return;
-	default:
-		break;
-	}
-	__ssb_dma_not_implemented(dev);
-}
-
-static inline void ssb_dma_sync_single_for_device(struct ssb_device *dev,
-						  dma_addr_t dma_addr,
-						  size_t size,
-						  enum dma_data_direction dir)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		pci_dma_sync_single_for_device(dev->bus->host_pci, dma_addr,
-					       size, dir);
-		return;
-#endif
-		break;
-	case SSB_BUSTYPE_SSB:
-		dma_sync_single_for_device(dev->dev, dma_addr, size, dir);
-		return;
-	default:
-		break;
-	}
-	__ssb_dma_not_implemented(dev);
-}
-
-static inline void ssb_dma_sync_single_range_for_cpu(struct ssb_device *dev,
-						     dma_addr_t dma_addr,
-						     unsigned long offset,
-						     size_t size,
-						     enum dma_data_direction dir)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		/* Just sync everything. That's all the PCI API can do. */
-		pci_dma_sync_single_for_cpu(dev->bus->host_pci, dma_addr,
-					    offset + size, dir);
-		return;
-#endif
-		break;
-	case SSB_BUSTYPE_SSB:
-		dma_sync_single_range_for_cpu(dev->dev, dma_addr, offset,
-					      size, dir);
-		return;
-	default:
-		break;
-	}
-	__ssb_dma_not_implemented(dev);
-}
-
-static inline void ssb_dma_sync_single_range_for_device(struct ssb_device *dev,
-							dma_addr_t dma_addr,
-							unsigned long offset,
-							size_t size,
-							enum dma_data_direction dir)
-{
-	switch (dev->bus->bustype) {
-	case SSB_BUSTYPE_PCI:
-#ifdef CONFIG_SSB_PCIHOST
-		/* Just sync everything. That's all the PCI API can do. */
-		pci_dma_sync_single_for_device(dev->bus->host_pci, dma_addr,
-					       offset + size, dir);
-		return;
-#endif
-		break;
-	case SSB_BUSTYPE_SSB:
-		dma_sync_single_range_for_device(dev->dev, dma_addr, offset,
-						 size, dir);
-		return;
-	default:
-		break;
-	}
-	__ssb_dma_not_implemented(dev);
-}
-
-
 #ifdef CONFIG_SSB_PCIHOST
 /* PCI-host wrapper driver */
 extern int ssb_pcihost_register(struct pci_driver *driver);
-- 
cgit v1.2.3-59-g8ed1b


From 5360bd776f73d0a7da571d72a09a03f237e99900 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Fri, 28 May 2010 23:01:00 -0400
Subject: Fix up the "generic" unistd.h ABI to be more useful.

Reserve 16 "architecture-specific" syscall numbers starting at 244.

Allow use of the sys_sync_file_range2() API with the generic unistd.h
by specifying __ARCH_WANT_SYNC_FILE_RANGE2 before including it.

Allow using the generic unistd.h to create the "compat" syscall table
by specifying __SYSCALL_COMPAT before including it.

Use sys_fadvise64_64 for __NR3264_fadvise64 in both 32- and 64-bit mode.

Request the appropriate __ARCH_WANT_COMPAT_SYS_xxx values when
some deprecated syscall modes are selected.

As part of this change to fix up the syscalls, also provide a couple
of missing signal-related syscall prototypes in <linux/syscalls.h>.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/unistd.h | 26 ++++++++++++++++++++------
 include/linux/syscalls.h     |  4 ++++
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index 6a0b30f78a62..30218b4fa4e0 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -18,7 +18,7 @@
 #define __SYSCALL(x, y)
 #endif
 
-#if __BITS_PER_LONG == 32
+#if __BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT)
 #define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _32)
 #else
 #define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64)
@@ -241,8 +241,13 @@ __SYSCALL(__NR_sync, sys_sync)
 __SYSCALL(__NR_fsync, sys_fsync)
 #define __NR_fdatasync 83
 __SYSCALL(__NR_fdatasync, sys_fdatasync)
+#ifdef __ARCH_WANT_SYNC_FILE_RANGE2
+#define __NR_sync_file_range2 84
+__SYSCALL(__NR_sync_file_range2, sys_sync_file_range2)
+#else
 #define __NR_sync_file_range 84
-__SYSCALL(__NR_sync_file_range, sys_sync_file_range) /* .long sys_sync_file_range2, */
+__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
+#endif
 
 /* fs/timerfd.c */
 #define __NR_timerfd_create 85
@@ -580,7 +585,7 @@ __SYSCALL(__NR_execve, sys_execve)	/* .long sys_execve_wrapper */
 __SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap)
 /* mm/fadvise.c */
 #define __NR3264_fadvise64 223
-__SC_3264(__NR3264_fadvise64, sys_fadvise64_64, sys_fadvise64)
+__SYSCALL(__NR3264_fadvise64, sys_fadvise64_64)
 
 /* mm/, CONFIG_MMU only */
 #ifndef __ARCH_NOMMU
@@ -627,8 +632,14 @@ __SYSCALL(__NR_accept4, sys_accept4)
 #define __NR_recvmmsg 243
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 
+/*
+ * Architectures may provide up to 16 syscalls of their own
+ * starting with this value.
+ */
+#define __NR_arch_specific_syscall 244
+
 #undef __NR_syscalls
-#define __NR_syscalls 244
+#define __NR_syscalls 260
 
 /*
  * All syscalls below here should go away really,
@@ -694,7 +705,8 @@ __SYSCALL(__NR_signalfd, sys_signalfd)
 #define __NR_syscalls (__NR_signalfd+1)
 #endif /* __ARCH_WANT_SYSCALL_NO_FLAGS */
 
-#if __BITS_PER_LONG == 32 && defined(__ARCH_WANT_SYSCALL_OFF_T)
+#if (__BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT)) && \
+     defined(__ARCH_WANT_SYSCALL_OFF_T)
 #define __NR_sendfile 1046
 __SYSCALL(__NR_sendfile, sys_sendfile)
 #define __NR_ftruncate 1047
@@ -740,6 +752,7 @@ __SYSCALL(__NR_getpgrp, sys_getpgrp)
 __SYSCALL(__NR_pause, sys_pause)
 #define __NR_time 1062
 #define __ARCH_WANT_SYS_TIME
+#define __ARCH_WANT_COMPAT_SYS_TIME
 __SYSCALL(__NR_time, sys_time)
 #define __NR_utime 1063
 #define __ARCH_WANT_SYS_UTIME
@@ -801,7 +814,7 @@ __SYSCALL(__NR_fork, sys_ni_syscall)
  * Here we map the numbers so that both versions
  * use the same syscall table layout.
  */
-#if __BITS_PER_LONG == 64
+#if __BITS_PER_LONG == 64 && !defined(__SYSCALL_COMPAT)
 #define __NR_fcntl __NR3264_fcntl
 #define __NR_statfs __NR3264_statfs
 #define __NR_fstatfs __NR3264_fstatfs
@@ -848,6 +861,7 @@ __SYSCALL(__NR_fork, sys_ni_syscall)
 #endif
 #define __ARCH_WANT_SYS_RT_SIGACTION
 #define __ARCH_WANT_SYS_RT_SIGSUSPEND
+#define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
 
 /*
  * "Conditional" syscalls
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a1a86a53bc73..4a19d9bb8368 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -364,9 +364,13 @@ asmlinkage long sys_init_module(void __user *umod, unsigned long len,
 asmlinkage long sys_delete_module(const char __user *name_user,
 				unsigned int flags);
 
+asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act,
+				 struct sigaction __user *oact,
+				 size_t sigsetsize);
 asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set,
 				sigset_t __user *oset, size_t sigsetsize);
 asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize);
+asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize);
 asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
 				siginfo_t __user *uinfo,
 				const struct timespec __user *uts,
-- 
cgit v1.2.3-59-g8ed1b


From b78462ebc6a4ef9074aa80abebcdd470dc5f0ce0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 2 Jun 2010 12:24:37 +0000
Subject: skbuff: add check for non-linear to warn_if_lro and needs_linearize

We can avoid an unecessary cache miss by checking if the skb is non-linear
before accessing gso_size/gso_type in skb_warn_if_lro, the same can also be
done to avoid a cache miss on nr_frags if data_len is 0.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 3 ++-
 net/core/dev.c         | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bf243fc54959..645e78d395fd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2129,7 +2129,8 @@ static inline bool skb_warn_if_lro(const struct sk_buff *skb)
 	/* LRO sets gso_size but not gso_type, whereas if GSO is really
 	 * wanted then gso_type will be set. */
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
-	if (shinfo->gso_size != 0 && unlikely(shinfo->gso_type == 0)) {
+	if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 &&
+	    unlikely(shinfo->gso_type == 0)) {
 		__skb_warn_lro_forwarding(skb);
 		return true;
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index ec01a5998d70..3abb3a6058be 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2103,9 +2103,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 static inline int skb_needs_linearize(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
-	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
-					      illegal_highdma(dev, skb)));
+	return skb_is_nonlinear(skb) &&
+	       ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
+	        (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
+					      illegal_highdma(dev, skb))));
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From bb1d912323d5dd50e1079e389f4e964be14f0ae3 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <andy@greyhouse.net>
Date: Wed, 2 Jun 2010 08:40:18 +0000
Subject: bonding: allow user-controlled output slave selection

v2: changed bonding module version, modified to apply on top of changes
from previous patch in series, and updated documentation to elaborate on
multiqueue awareness that now exists in bonding driver.

This patch give the user the ability to control the output slave for
round-robin and active-backup bonding.  Similar functionality was
discussed in the past, but Jay Vosburgh indicated he would rather see a
feature like this added to existing modes rather than creating a
completely new mode.  Jay's thoughts as well as Neil's input surrounding
some of the issues with the first implementation pushed us toward a
design that relied on the queue_mapping rather than skb marks.
Round-robin and active-backup modes were chosen as the first users of
this slave selection as they seemed like the most logical choices when
considering a multi-switch environment.

Round-robin mode works without any modification, but active-backup does
require inclusion of the first patch in this series and setting
the 'all_slaves_active' flag.  This will allow reception of unicast traffic on
any of the backup interfaces.

This was tested with IPv4-based filters as well as VLAN-based filters
with good results.

More information as well as a configuration example is available in the
patch to Documentation/networking/bonding.txt.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.txt |  84 ++++++++++++++++++++++++-
 drivers/net/bonding/bond_main.c      |  75 +++++++++++++++++++++-
 drivers/net/bonding/bond_sysfs.c     | 116 +++++++++++++++++++++++++++++++++++
 drivers/net/bonding/bonding.h        |   9 ++-
 include/linux/if_bonding.h           |   1 +
 5 files changed, 278 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 61f516b135b4..d0914781830e 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -49,6 +49,7 @@ Table of Contents
 3.3	Configuring Bonding Manually with Ifenslave
 3.3.1		Configuring Multiple Bonds Manually
 3.4	Configuring Bonding Manually via Sysfs
+3.5	Overriding Configuration for Special Cases
 
 4. Querying Bonding Configuration
 4.1	Bonding Configuration
@@ -1318,8 +1319,87 @@ echo 2000 > /sys/class/net/bond1/bonding/arp_interval
 echo +eth2 > /sys/class/net/bond1/bonding/slaves
 echo +eth3 > /sys/class/net/bond1/bonding/slaves
 
-
-4. Querying Bonding Configuration 
+3.5 Overriding Configuration for Special Cases
+----------------------------------------------
+When using the bonding driver, the physical port which transmits a frame is
+typically selected by the bonding driver, and is not relevant to the user or
+system administrator.  The output port is simply selected using the policies of
+the selected bonding mode.  On occasion however, it is helpful to direct certain
+classes of traffic to certain physical interfaces on output to implement
+slightly more complex policies.  For example, to reach a web server over a
+bonded interface in which eth0 connects to a private network, while eth1
+connects via a public network, it may be desirous to bias the bond to send said
+traffic over eth0 first, using eth1 only as a fall back, while all other traffic
+can safely be sent over either interface.  Such configurations may be achieved
+using the traffic control utilities inherent in linux.
+
+By default the bonding driver is multiqueue aware and 16 queues are created
+when the driver initializes (see Documentation/networking/multiqueue.txt
+for details).  If more or less queues are desired the module parameter
+tx_queues can be used to change this value.  There is no sysfs parameter
+available as the allocation is done at module init time.
+
+The output of the file /proc/net/bonding/bondX has changed so the output Queue
+ID is now printed for each slave:
+
+Bonding Mode: fault-tolerance (active-backup)
+Primary Slave: None
+Currently Active Slave: eth0
+MII Status: up
+MII Polling Interval (ms): 0
+Up Delay (ms): 0
+Down Delay (ms): 0
+
+Slave Interface: eth0
+MII Status: up
+Link Failure Count: 0
+Permanent HW addr: 00:1a:a0:12:8f:cb
+Slave queue ID: 0
+
+Slave Interface: eth1
+MII Status: up
+Link Failure Count: 0
+Permanent HW addr: 00:1a:a0:12:8f:cc
+Slave queue ID: 2
+
+The queue_id for a slave can be set using the command:
+
+# echo "eth1:2" > /sys/class/net/bond0/bonding/queue_id
+
+Any interface that needs a queue_id set should set it with multiple calls
+like the one above until proper priorities are set for all interfaces.  On
+distributions that allow configuration via initscripts, multiple 'queue_id'
+arguments can be added to BONDING_OPTS to set all needed slave queues.
+
+These queue id's can be used in conjunction with the tc utility to configure
+a multiqueue qdisc and filters to bias certain traffic to transmit on certain
+slave devices.  For instance, say we wanted, in the above configuration to
+force all traffic bound to 192.168.1.100 to use eth1 in the bond as its output
+device. The following commands would accomplish this:
+
+# tc qdisc add dev bond0 handle 1 root multiq
+
+# tc filter add dev bond0 protocol ip parent 1: prio 1 u32 match ip dst \
+	192.168.1.100 action skbedit queue_mapping 2
+
+These commands tell the kernel to attach a multiqueue queue discipline to the
+bond0 interface and filter traffic enqueued to it, such that packets with a dst
+ip of 192.168.1.100 have their output queue mapping value overwritten to 2.
+This value is then passed into the driver, causing the normal output path
+selection policy to be overridden, selecting instead qid 2, which maps to eth1.
+
+Note that qid values begin at 1.  Qid 0 is reserved to initiate to the driver
+that normal output policy selection should take place.  One benefit to simply
+leaving the qid for a slave to 0 is the multiqueue awareness in the bonding
+driver that is now present.  This awareness allows tc filters to be placed on
+slave devices as well as bond devices and the bonding driver will simply act as
+a pass-through for selecting output queues on the slave device rather than 
+output port selection.
+
+This feature first appeared in bonding driver version 3.7.0 and support for
+output slave selection was limited to round-robin and active-backup modes.
+
+4 Querying Bonding Configuration
 =================================
 
 4.1 Bonding Configuration
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f22f6bf43858..1b19276cff12 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -90,6 +90,7 @@
 #define BOND_LINK_ARP_INTERV	0
 
 static int max_bonds	= BOND_DEFAULT_MAX_BONDS;
+static int tx_queues	= BOND_DEFAULT_TX_QUEUES;
 static int num_grat_arp = 1;
 static int num_unsol_na = 1;
 static int miimon	= BOND_LINK_MON_INTERV;
@@ -111,6 +112,8 @@ static struct bond_params bonding_defaults;
 
 module_param(max_bonds, int, 0);
 MODULE_PARM_DESC(max_bonds, "Max number of bonded devices");
+module_param(tx_queues, int, 0);
+MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)");
 module_param(num_grat_arp, int, 0644);
 MODULE_PARM_DESC(num_grat_arp, "Number of gratuitous ARP packets to send on failover event");
 module_param(num_unsol_na, int, 0644);
@@ -1540,6 +1543,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		goto err_undo_flags;
 	}
 
+	/*
+	 * Set the new_slave's queue_id to be zero.  Queue ID mapping
+	 * is set via sysfs or module option if desired.
+	 */
+	new_slave->queue_id = 0;
+
 	/* Save slave's original mtu and then set it to match the bond */
 	new_slave->original_mtu = slave_dev->mtu;
 	res = dev_set_mtu(slave_dev, bond->dev->mtu);
@@ -3285,6 +3294,7 @@ static void bond_info_show_slave(struct seq_file *seq,
 		else
 			seq_puts(seq, "Aggregator ID: N/A\n");
 	}
+	seq_printf(seq, "Slave queue ID: %d\n", slave->queue_id);
 }
 
 static int bond_info_seq_show(struct seq_file *seq, void *v)
@@ -4421,9 +4431,59 @@ static void bond_set_xmit_hash_policy(struct bonding *bond)
 	}
 }
 
+/*
+ * Lookup the slave that corresponds to a qid
+ */
+static inline int bond_slave_override(struct bonding *bond,
+				      struct sk_buff *skb)
+{
+	int i, res = 1;
+	struct slave *slave = NULL;
+	struct slave *check_slave;
+
+	read_lock(&bond->lock);
+
+	if (!BOND_IS_OK(bond) || !skb->queue_mapping)
+		goto out;
+
+	/* Find out if any slaves have the same mapping as this skb. */
+	bond_for_each_slave(bond, check_slave, i) {
+		if (check_slave->queue_id == skb->queue_mapping) {
+			slave = check_slave;
+			break;
+		}
+	}
+
+	/* If the slave isn't UP, use default transmit policy. */
+	if (slave && slave->queue_id && IS_UP(slave->dev) &&
+	    (slave->link == BOND_LINK_UP)) {
+		res = bond_dev_queue_xmit(bond, skb, slave->dev);
+	}
+
+out:
+	read_unlock(&bond->lock);
+	return res;
+}
+
+static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	/*
+	 * This helper function exists to help dev_pick_tx get the correct
+	 * destination queue.  Using a helper function skips the a call to
+	 * skb_tx_hash and will put the skbs in the queue we expect on their
+	 * way down to the bonding driver.
+	 */
+	return skb->queue_mapping;
+}
+
 static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	const struct bonding *bond = netdev_priv(dev);
+	struct bonding *bond = netdev_priv(dev);
+
+	if (TX_QUEUE_OVERRIDE(bond->params.mode)) {
+		if (!bond_slave_override(bond, skb))
+			return NETDEV_TX_OK;
+	}
 
 	switch (bond->params.mode) {
 	case BOND_MODE_ROUNDROBIN:
@@ -4508,6 +4568,7 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_open		= bond_open,
 	.ndo_stop		= bond_close,
 	.ndo_start_xmit		= bond_start_xmit,
+	.ndo_select_queue	= bond_select_queue,
 	.ndo_get_stats		= bond_get_stats,
 	.ndo_do_ioctl		= bond_do_ioctl,
 	.ndo_set_multicast_list	= bond_set_multicast_list,
@@ -4776,6 +4837,13 @@ static int bond_check_params(struct bond_params *params)
 		}
 	}
 
+	if (tx_queues < 1 || tx_queues > 255) {
+		pr_warning("Warning: tx_queues (%d) should be between "
+			   "1 and 255, resetting to %d\n",
+			   tx_queues, BOND_DEFAULT_TX_QUEUES);
+		tx_queues = BOND_DEFAULT_TX_QUEUES;
+	}
+
 	if ((all_slaves_active != 0) && (all_slaves_active != 1)) {
 		pr_warning("Warning: all_slaves_active module parameter (%d), "
 			   "not of valid value (0/1), so it was set to "
@@ -4953,6 +5021,7 @@ static int bond_check_params(struct bond_params *params)
 	params->primary[0] = 0;
 	params->primary_reselect = primary_reselect_value;
 	params->fail_over_mac = fail_over_mac_value;
+	params->tx_queues = tx_queues;
 	params->all_slaves_active = all_slaves_active;
 
 	if (primary) {
@@ -5040,8 +5109,8 @@ int bond_create(struct net *net, const char *name)
 
 	rtnl_lock();
 
-	bond_dev = alloc_netdev(sizeof(struct bonding), name ? name : "",
-				bond_setup);
+	bond_dev = alloc_netdev_mq(sizeof(struct bonding), name ? name : "",
+				bond_setup, tx_queues);
 	if (!bond_dev) {
 		pr_err("%s: eek! can't alloc netdev!\n", name);
 		rtnl_unlock();
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 066311a5e084..f9a034361a8e 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -1411,6 +1411,121 @@ static ssize_t bonding_show_ad_partner_mac(struct device *d,
 }
 static DEVICE_ATTR(ad_partner_mac, S_IRUGO, bonding_show_ad_partner_mac, NULL);
 
+/*
+ * Show the queue_ids of the slaves in the current bond.
+ */
+static ssize_t bonding_show_queue_id(struct device *d,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct slave *slave;
+	int i, res = 0;
+	struct bonding *bond = to_bond(d);
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	read_lock(&bond->lock);
+	bond_for_each_slave(bond, slave, i) {
+		if (res > (PAGE_SIZE - 6)) {
+			/* not enough space for another interface name */
+			if ((PAGE_SIZE - res) > 10)
+				res = PAGE_SIZE - 10;
+			res += sprintf(buf + res, "++more++ ");
+			break;
+		}
+		res += sprintf(buf + res, "%s:%d ",
+			       slave->dev->name, slave->queue_id);
+	}
+	read_unlock(&bond->lock);
+	if (res)
+		buf[res-1] = '\n'; /* eat the leftover space */
+	rtnl_unlock();
+	return res;
+}
+
+/*
+ * Set the queue_ids of the  slaves in the current bond.  The bond
+ * interface must be enslaved for this to work.
+ */
+static ssize_t bonding_store_queue_id(struct device *d,
+				      struct device_attribute *attr,
+				      const char *buffer, size_t count)
+{
+	struct slave *slave, *update_slave;
+	struct bonding *bond = to_bond(d);
+	u16 qid;
+	int i, ret = count;
+	char *delim;
+	struct net_device *sdev = NULL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	/* delim will point to queue id if successful */
+	delim = strchr(buffer, ':');
+	if (!delim)
+		goto err_no_cmd;
+
+	/*
+	 * Terminate string that points to device name and bump it
+	 * up one, so we can read the queue id there.
+	 */
+	*delim = '\0';
+	if (sscanf(++delim, "%hd\n", &qid) != 1)
+		goto err_no_cmd;
+
+	/* Check buffer length, valid ifname and queue id */
+	if (strlen(buffer) > IFNAMSIZ ||
+	    !dev_valid_name(buffer) ||
+	    qid > bond->params.tx_queues)
+		goto err_no_cmd;
+
+	/* Get the pointer to that interface if it exists */
+	sdev = __dev_get_by_name(dev_net(bond->dev), buffer);
+	if (!sdev)
+		goto err_no_cmd;
+
+	read_lock(&bond->lock);
+
+	/* Search for thes slave and check for duplicate qids */
+	update_slave = NULL;
+	bond_for_each_slave(bond, slave, i) {
+		if (sdev == slave->dev)
+			/*
+			 * We don't need to check the matching
+			 * slave for dups, since we're overwriting it
+			 */
+			update_slave = slave;
+		else if (qid && qid == slave->queue_id) {
+			goto err_no_cmd_unlock;
+		}
+	}
+
+	if (!update_slave)
+		goto err_no_cmd_unlock;
+
+	/* Actually set the qids for the slave */
+	update_slave->queue_id = qid;
+
+	read_unlock(&bond->lock);
+out:
+	rtnl_unlock();
+	return ret;
+
+err_no_cmd_unlock:
+	read_unlock(&bond->lock);
+err_no_cmd:
+	pr_info("invalid input for queue_id set for %s.\n",
+		bond->dev->name);
+	ret = -EPERM;
+	goto out;
+}
+
+static DEVICE_ATTR(queue_id, S_IRUGO | S_IWUSR, bonding_show_queue_id,
+		   bonding_store_queue_id);
+
+
 /*
  * Show and set the all_slaves_active flag.
  */
@@ -1489,6 +1604,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_ad_actor_key.attr,
 	&dev_attr_ad_partner_key.attr,
 	&dev_attr_ad_partner_mac.attr,
+	&dev_attr_queue_id.attr,
 	&dev_attr_all_slaves_active.attr,
 	NULL,
 };
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index cecdea2a629f..c6fdd851579a 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -23,8 +23,8 @@
 #include "bond_3ad.h"
 #include "bond_alb.h"
 
-#define DRV_VERSION	"3.6.0"
-#define DRV_RELDATE	"September 26, 2009"
+#define DRV_VERSION	"3.7.0"
+#define DRV_RELDATE	"June 2, 2010"
 #define DRV_NAME	"bonding"
 #define DRV_DESCRIPTION	"Ethernet Channel Bonding Driver"
 
@@ -60,6 +60,9 @@
 		 ((mode) == BOND_MODE_TLB)          ||	\
 		 ((mode) == BOND_MODE_ALB))
 
+#define TX_QUEUE_OVERRIDE(mode)				\
+			(((mode) == BOND_MODE_ACTIVEBACKUP) ||	\
+			 ((mode) == BOND_MODE_ROUNDROBIN))
 /*
  * Less bad way to call ioctl from within the kernel; this needs to be
  * done some other way to get the call out of interrupt context.
@@ -131,6 +134,7 @@ struct bond_params {
 	char primary[IFNAMSIZ];
 	int primary_reselect;
 	__be32 arp_targets[BOND_MAX_ARP_TARGETS];
+	int tx_queues;
 	int all_slaves_active;
 };
 
@@ -165,6 +169,7 @@ struct slave {
 	u8     perm_hwaddr[ETH_ALEN];
 	u16    speed;
 	u8     duplex;
+	u16    queue_id;
 	struct ad_slave_info ad_info; /* HUGE - better to dynamically alloc */
 	struct tlb_slave_info tlb_info;
 };
diff --git a/include/linux/if_bonding.h b/include/linux/if_bonding.h
index cd525fae3c98..2c7994372bde 100644
--- a/include/linux/if_bonding.h
+++ b/include/linux/if_bonding.h
@@ -83,6 +83,7 @@
 
 #define BOND_DEFAULT_MAX_BONDS  1   /* Default maximum number of devices to support */
 
+#define BOND_DEFAULT_TX_QUEUES 16   /* Default number of tx queues per device */
 /* hashing types */
 #define BOND_XMIT_POLICY_LAYER2		0 /* layer 2 (MAC only), default */
 #define BOND_XMIT_POLICY_LAYER34	1 /* layer 3+4 (IP ^ (TCP || UDP)) */
-- 
cgit v1.2.3-59-g8ed1b


From 139ef32b0e6b88b00b5e3e74d052d938f178dc9b Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Mon, 7 Jun 2010 08:48:13 -0400
Subject: Revert adding some arch-specific signal syscalls to
 <linux/syscalls.h>.

It turns out there is some variance on the calling conventions for
these syscalls, and <asm-generic/syscalls.h> is already the mechanism
used to handle this.  Switch arch/tile over to using that mechanism and
tweak the calling conventions for a couple of tile syscalls to match
<asm-generic/syscalls.h>.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/include/asm/syscalls.h | 22 +---------------------
 arch/tile/kernel/process.c       |  2 +-
 arch/tile/kernel/signal.c        |  4 ++--
 arch/tile/kernel/sys.c           |  2 +-
 include/linux/syscalls.h         |  4 ----
 5 files changed, 5 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/arch/tile/include/asm/syscalls.h b/arch/tile/include/asm/syscalls.h
index e1be54d1a7d8..9f2b8e2f69d5 100644
--- a/arch/tile/include/asm/syscalls.h
+++ b/arch/tile/include/asm/syscalls.h
@@ -22,21 +22,7 @@
 #include <linux/linkage.h>
 #include <linux/signal.h>
 #include <linux/types.h>
-
-/* kernel/process.c */
-int sys_fork(struct pt_regs *);
-int sys_vfork(struct pt_regs *);
-int sys_clone(unsigned long clone_flags, unsigned long newsp,
-	      int __user *parent_tidptr, int __user *child_tidptr,
-	      struct pt_regs *);
-int sys_execve(char __user *path, char __user *__user *argv,
-	       char __user *__user *envp, struct pt_regs *);
-
-/* kernel/signal.c */
-int sys_sigaltstack(const stack_t __user *, stack_t __user *,
-		    struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
-int sys_raise_fpe(int code, unsigned long addr, struct pt_regs*);
+#include <asm-generic/syscalls.h>
 
 /* kernel/sys.c */
 ssize_t sys32_readahead(int fd, u32 offset_lo, u32 offset_hi, u32 count);
@@ -45,12 +31,6 @@ long sys32_fadvise64(int fd, u32 offset_lo, u32 offset_hi,
 int sys32_fadvise64_64(int fd, u32 offset_lo, u32 offset_hi,
 		       u32 len_lo, u32 len_hi, int advice);
 long sys_flush_cache(void);
-long sys_mmap(unsigned long addr, unsigned long len,
-	      unsigned long prot, unsigned long flags,
-	      unsigned long fd, unsigned long offset);
-long sys_mmap2(unsigned long addr, unsigned long len,
-	       unsigned long prot, unsigned long flags,
-	       unsigned long fd, unsigned long offset);
 
 #ifndef __tilegx__
 /* mm/fault.c */
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 824f230e6d1a..c70ff14a48e4 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -502,7 +502,7 @@ int _sys_fork(struct pt_regs *regs)
 }
 
 int _sys_clone(unsigned long clone_flags, unsigned long newsp,
-	       int __user *parent_tidptr, int __user *child_tidptr,
+	       void __user *parent_tidptr, void __user *child_tidptr,
 	       struct pt_regs *regs)
 {
 	if (!newsp)
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 7ea85eb85242..45835cfad407 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -43,8 +43,8 @@
 /* Caller before callee in this file; other callee is in assembler */
 void do_signal(struct pt_regs *regs);
 
-int _sys_sigaltstack(const stack_t __user *uss,
-		     stack_t __user *uoss, struct pt_regs *regs)
+long _sys_sigaltstack(const stack_t __user *uss,
+                      stack_t __user *uoss, struct pt_regs *regs)
 {
 	return do_sigaltstack(uss, uoss, regs->sp);
 }
diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c
index a3d982b212b4..0427978cea0a 100644
--- a/arch/tile/kernel/sys.c
+++ b/arch/tile/kernel/sys.c
@@ -95,7 +95,7 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len,
  */
 SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
 		unsigned long, prot, unsigned long, flags,
-		unsigned long, fd, unsigned long, offset)
+		unsigned long, fd, off_t, offset)
 {
 	if (offset & ((1 << PAGE_SHIFT) - 1))
 		return -EINVAL;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1e3cd5fec7ed..7f614ce274a9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -364,13 +364,9 @@ asmlinkage long sys_init_module(void __user *umod, unsigned long len,
 asmlinkage long sys_delete_module(const char __user *name_user,
 				unsigned int flags);
 
-asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act,
-				 struct sigaction __user *oact,
-				 size_t sigsetsize);
 asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set,
 				sigset_t __user *oset, size_t sigsetsize);
 asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize);
-asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize);
 asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
 				siginfo_t __user *uinfo,
 				const struct timespec __user *uts,
-- 
cgit v1.2.3-59-g8ed1b


From abbceff7d7a884968e876e52578da1db4a4f6b54 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 10 May 2010 15:15:12 -0400
Subject: swiotlb: add the swiotlb initialization function with iotlb memory

This enables the caller to initialize swiotlb with its own iotlb
memory.

See "swiotlb: swiotlb: add swiotlb_tbl_map_single library function" for
full description of patchset.

[v2: changed ..with_tlb to ..with_tbl]

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Albert Herranz <albert_herranz@yahoo.es>
---
 include/linux/swiotlb.h |  1 +
 lib/swiotlb.c           | 48 ++++++++++++++++++++++++++++++------------------
 2 files changed, 31 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 81a4e213c6cf..b406261d8887 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -23,6 +23,7 @@ extern int swiotlb_force;
 #define IO_TLB_SHIFT 11
 
 extern void swiotlb_init(int verbose);
+extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 783aff00024c..ec61e1507d0a 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -140,28 +140,14 @@ void swiotlb_print_info(void)
 	       (unsigned long long)pend);
 }
 
-/*
- * Statically reserve bounce buffer space and initialize bounce buffer data
- * structures for the software IO TLB used to implement the DMA API.
- */
-void __init
-swiotlb_init_with_default_size(size_t default_size, int verbose)
+void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
 	unsigned long i, bytes;
 
-	if (!io_tlb_nslabs) {
-		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
-		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
-	}
+	bytes = nslabs << IO_TLB_SHIFT;
 
-	bytes = io_tlb_nslabs << IO_TLB_SHIFT;
-
-	/*
-	 * Get IO TLB memory from the low pages
-	 */
-	io_tlb_start = alloc_bootmem_low_pages(bytes);
-	if (!io_tlb_start)
-		panic("Cannot allocate SWIOTLB buffer");
+	io_tlb_nslabs = nslabs;
+	io_tlb_start = tlb;
 	io_tlb_end = io_tlb_start + bytes;
 
 	/*
@@ -185,6 +171,32 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
 		swiotlb_print_info();
 }
 
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the DMA API.
+ */
+void __init
+swiotlb_init_with_default_size(size_t default_size, int verbose)
+{
+	unsigned long bytes;
+
+	if (!io_tlb_nslabs) {
+		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
+
+	bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
+	/*
+	 * Get IO TLB memory from the low pages
+	 */
+	io_tlb_start = alloc_bootmem_low_pages(bytes);
+	if (!io_tlb_start)
+		panic("Cannot allocate SWIOTLB buffer");
+
+	swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose);
+}
+
 void __init
 swiotlb_init(int verbose)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 22d48269984fc93a71f65a54aa422aacf5fdb926 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 10 May 2010 15:01:15 -0500
Subject: swiotlb: search and replace "int dir" with "enum dma_data_direction
 dir"

.. to catch anybody doing something funky.

See "swiotlb: swiotlb: add swiotlb_tbl_map_single library function" for
full description of patchset.

[v2: swiotlb_sync_single_range_* no more - removed usage]
[v3: enum dma_data_direction direction -> enum dma_data_direction dir]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Tested-by: Albert Herranz <albert_herranz@yahoo.es>
---
 include/linux/swiotlb.h |  4 ++--
 lib/swiotlb.c           | 23 +++++++++++++----------
 2 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b406261d8887..250d766f17f7 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -43,11 +43,11 @@ extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 
 extern int
 swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
-	       int direction);
+	       enum dma_data_direction dir);
 
 extern void
 swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
-		 int direction);
+		 enum dma_data_direction dir);
 
 extern int
 swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1fc15bf63945..5f60157f31d8 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -374,7 +374,8 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
 }
 
 void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
-			     phys_addr_t phys, size_t size, int dir)
+			     phys_addr_t phys, size_t size,
+			     enum dma_data_direction dir)
 {
 	unsigned long flags;
 	char *dma_addr;
@@ -481,7 +482,8 @@ found:
  */
 
 static void *
-map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
+map_single(struct device *hwdev, phys_addr_t phys, size_t size,
+	   enum dma_data_direction dir)
 {
 	dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
 
@@ -493,7 +495,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
  */
 static void
 swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
-			int dir)
+			enum dma_data_direction dir)
 {
 	unsigned long flags;
 	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
@@ -534,7 +536,7 @@ swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
 
 static void
 swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size,
-	    int dir, int target)
+	    enum dma_data_direction dir, int target)
 {
 	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
 	phys_addr_t phys = io_tlb_orig_addr[index];
@@ -624,7 +626,8 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 EXPORT_SYMBOL(swiotlb_free_coherent);
 
 static void
-swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
+swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
+	     int do_panic)
 {
 	/*
 	 * Ran out of IOMMU space for this operation. This is very bad.
@@ -702,7 +705,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
  * whatever the device wrote there.
  */
 static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
-			 size_t size, int dir)
+			 size_t size, enum dma_data_direction dir)
 {
 	phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
 
@@ -745,7 +748,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
  */
 static void
 swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
-		    size_t size, int dir, int target)
+		    size_t size, enum dma_data_direction dir, int target)
 {
 	phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
 
@@ -832,7 +835,7 @@ EXPORT_SYMBOL(swiotlb_map_sg_attrs);
 
 int
 swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
-	       int dir)
+	       enum dma_data_direction dir)
 {
 	return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
 }
@@ -859,7 +862,7 @@ EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
 
 void
 swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
-		 int dir)
+		 enum dma_data_direction dir)
 {
 	return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
 }
@@ -874,7 +877,7 @@ EXPORT_SYMBOL(swiotlb_unmap_sg);
  */
 static void
 swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
-		int nelems, int dir, int target)
+		int nelems, enum dma_data_direction dir, int target)
 {
 	struct scatterlist *sg;
 	int i;
-- 
cgit v1.2.3-59-g8ed1b


From d7ef1533a90f432615d25729c2477bac9e72051d Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 28 May 2010 11:37:10 -0400
Subject: swiotlb: Make swiotlb bookkeeping functions visible in the header
 file.

We put the functions dealing with the operations on
the SWIOTLB buffer in the header and make those functions non-static.
And also make the functions exported via EXPORT_SYMBOL_GPL.

See "swiotlb: swiotlb: add swiotlb_tbl_map_single library function" for
full description of patchset.

[v2: swiotlb_sync_single_range_for_* no more. Remove usage.]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Tested-by: Albert Herranz <albert_herranz@yahoo.es>
---
 include/linux/swiotlb.h | 22 ++++++++++++++++++++++
 lib/swiotlb.c           | 29 ++++++++++++++---------------
 2 files changed, 36 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 250d766f17f7..8c0e349f4a6c 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -25,6 +25,28 @@ extern int swiotlb_force;
 extern void swiotlb_init(int verbose);
 extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 
+/*
+ * Enumeration for sync targets
+ */
+enum dma_sync_target {
+	SYNC_FOR_CPU = 0,
+	SYNC_FOR_DEVICE = 1,
+};
+extern void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
+				    phys_addr_t phys, size_t size,
+				    enum dma_data_direction dir);
+
+extern void swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr,
+				     size_t size, enum dma_data_direction dir);
+
+extern void swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr,
+				    size_t size, enum dma_data_direction dir,
+				    enum dma_sync_target target);
+
+/* Accessory functions. */
+extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+			   enum dma_data_direction dir);
+
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			dma_addr_t *dma_handle, gfp_t flags);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 5f60157f31d8..34e3082632d8 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -50,14 +50,6 @@
  */
 #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
 
-/*
- * Enumeration for sync targets
- */
-enum dma_sync_target {
-	SYNC_FOR_CPU = 0,
-	SYNC_FOR_DEVICE = 1,
-};
-
 int swiotlb_force;
 
 /*
@@ -335,8 +327,8 @@ static int is_swiotlb_buffer(phys_addr_t paddr)
 /*
  * Bounce: copy the swiotlb buffer back to the original dma location
  */
-static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
-			   enum dma_data_direction dir)
+void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+		    enum dma_data_direction dir)
 {
 	unsigned long pfn = PFN_DOWN(phys);
 
@@ -372,6 +364,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
 			memcpy(phys_to_virt(phys), dma_addr, size);
 	}
 }
+EXPORT_SYMBOL_GPL(swiotlb_bounce);
 
 void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
 			     phys_addr_t phys, size_t size,
@@ -476,6 +469,7 @@ found:
 
 	return dma_addr;
 }
+EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
 
 /*
  * Allocates bounce buffer and returns its kernel virtual address.
@@ -493,7 +487,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
 /*
  * dma_addr is the kernel virtual address of the bounce buffer to unmap.
  */
-static void
+void
 swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
 			enum dma_data_direction dir)
 {
@@ -533,10 +527,12 @@ swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
 	}
 	spin_unlock_irqrestore(&io_tlb_lock, flags);
 }
+EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single);
 
-static void
+void
 swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size,
-	    enum dma_data_direction dir, int target)
+			enum dma_data_direction dir,
+			enum dma_sync_target target)
 {
 	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
 	phys_addr_t phys = io_tlb_orig_addr[index];
@@ -560,6 +556,7 @@ swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size,
 		BUG();
 	}
 }
+EXPORT_SYMBOL_GPL(swiotlb_tbl_sync_single);
 
 void *
 swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -748,7 +745,8 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
  */
 static void
 swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
-		    size_t size, enum dma_data_direction dir, int target)
+		    size_t size, enum dma_data_direction dir,
+		    enum dma_sync_target target)
 {
 	phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
 
@@ -877,7 +875,8 @@ EXPORT_SYMBOL(swiotlb_unmap_sg);
  */
 static void
 swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
-		int nelems, enum dma_data_direction dir, int target)
+		int nelems, enum dma_data_direction dir,
+		enum dma_sync_target target)
 {
 	struct scatterlist *sg;
 	int i;
-- 
cgit v1.2.3-59-g8ed1b


From bb69ae049fcc986fcd742eb90ca0d44a7a49c9f1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 7 Jun 2010 11:42:13 +0000
Subject: anycast: Some RCU conversions

- dev_get_by_flags() changed to dev_get_by_flags_rcu()

- ipv6_sock_ac_join() dont touch dev & idev refcounts
- ipv6_sock_ac_drop() dont touch dev & idev refcounts
- ipv6_sock_ac_close() dont touch dev & idev refcounts
- ipv6_dev_ac_dec() dount touch idev refcount
- ipv6_chk_acast_addr() dont touch idev refcount

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 +--
 net/core/dev.c            | 14 +++-----
 net/ipv6/anycast.c        | 90 ++++++++++++++++++++++-------------------------
 3 files changed, 49 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5156b806924c..c319f28d699d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1271,8 +1271,8 @@ extern void		dev_add_pack(struct packet_type *pt);
 extern void		dev_remove_pack(struct packet_type *pt);
 extern void		__dev_remove_pack(struct packet_type *pt);
 
-extern struct net_device	*dev_get_by_flags(struct net *net, unsigned short flags,
-						  unsigned short mask);
+extern struct net_device	*dev_get_by_flags_rcu(struct net *net, unsigned short flags,
+						      unsigned short mask);
 extern struct net_device	*dev_get_by_name(struct net *net, const char *name);
 extern struct net_device	*dev_get_by_name_rcu(struct net *net, const char *name);
 extern struct net_device	*__dev_get_by_name(struct net *net, const char *name);
diff --git a/net/core/dev.c b/net/core/dev.c
index c8d127718ff1..6f330cee79a6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -803,35 +803,31 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 
 /**
- *	dev_get_by_flags - find any device with given flags
+ *	dev_get_by_flags_rcu - find any device with given flags
  *	@net: the applicable net namespace
  *	@if_flags: IFF_* values
  *	@mask: bitmask of bits in if_flags to check
  *
  *	Search for any interface with the given flags. Returns NULL if a device
- *	is not found or a pointer to the device. The device returned has
- *	had a reference added and the pointer is safe until the user calls
- *	dev_put to indicate they have finished with it.
+ *	is not found or a pointer to the device. Must be called inside
+ *	rcu_read_lock(), and result refcount is unchanged.
  */
 
-struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
+struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 				    unsigned short mask)
 {
 	struct net_device *dev, *ret;
 
 	ret = NULL;
-	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
 		if (((dev->flags ^ if_flags) & mask) == 0) {
-			dev_hold(dev);
 			ret = dev;
 			break;
 		}
 	}
-	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL(dev_get_by_flags);
+EXPORT_SYMBOL(dev_get_by_flags_rcu);
 
 /**
  *	dev_valid_name - check if name is okay for network device
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index b5b07054508a..f058fbd808c8 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -77,41 +77,40 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr)
 	pac->acl_next = NULL;
 	ipv6_addr_copy(&pac->acl_addr, addr);
 
+	rcu_read_lock();
 	if (ifindex == 0) {
 		struct rt6_info *rt;
 
 		rt = rt6_lookup(net, addr, NULL, 0, 0);
 		if (rt) {
 			dev = rt->rt6i_dev;
-			dev_hold(dev);
 			dst_release(&rt->u.dst);
 		} else if (ishost) {
 			err = -EADDRNOTAVAIL;
-			goto out_free_pac;
+			goto error;
 		} else {
 			/* router, no matching interface: just pick one */
-
-			dev = dev_get_by_flags(net, IFF_UP, IFF_UP|IFF_LOOPBACK);
+			dev = dev_get_by_flags_rcu(net, IFF_UP,
+						   IFF_UP | IFF_LOOPBACK);
 		}
 	} else
-		dev = dev_get_by_index(net, ifindex);
+		dev = dev_get_by_index_rcu(net, ifindex);
 
 	if (dev == NULL) {
 		err = -ENODEV;
-		goto out_free_pac;
+		goto error;
 	}
 
-	idev = in6_dev_get(dev);
+	idev = __in6_dev_get(dev);
 	if (!idev) {
 		if (ifindex)
 			err = -ENODEV;
 		else
 			err = -EADDRNOTAVAIL;
-		goto out_dev_put;
+		goto error;
 	}
 	/* reset ishost, now that we have a specific device */
 	ishost = !idev->cnf.forwarding;
-	in6_dev_put(idev);
 
 	pac->acl_ifindex = dev->ifindex;
 
@@ -124,26 +123,22 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr)
 		if (ishost)
 			err = -EADDRNOTAVAIL;
 		if (err)
-			goto out_dev_put;
+			goto error;
 	}
 
 	err = ipv6_dev_ac_inc(dev, addr);
-	if (err)
-		goto out_dev_put;
-
-	write_lock_bh(&ipv6_sk_ac_lock);
-	pac->acl_next = np->ipv6_ac_list;
-	np->ipv6_ac_list = pac;
-	write_unlock_bh(&ipv6_sk_ac_lock);
-
-	dev_put(dev);
-
-	return 0;
+	if (!err) {
+		write_lock_bh(&ipv6_sk_ac_lock);
+		pac->acl_next = np->ipv6_ac_list;
+		np->ipv6_ac_list = pac;
+		write_unlock_bh(&ipv6_sk_ac_lock);
+		pac = NULL;
+	}
 
-out_dev_put:
-	dev_put(dev);
-out_free_pac:
-	sock_kfree_s(sk, pac, sizeof(*pac));
+error:
+	rcu_read_unlock();
+	if (pac)
+		sock_kfree_s(sk, pac, sizeof(*pac));
 	return err;
 }
 
@@ -176,11 +171,12 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
 
 	write_unlock_bh(&ipv6_sk_ac_lock);
 
-	dev = dev_get_by_index(net, pac->acl_ifindex);
-	if (dev) {
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, pac->acl_ifindex);
+	if (dev)
 		ipv6_dev_ac_dec(dev, &pac->acl_addr);
-		dev_put(dev);
-	}
+	rcu_read_unlock();
+
 	sock_kfree_s(sk, pac, sizeof(*pac));
 	return 0;
 }
@@ -199,13 +195,12 @@ void ipv6_sock_ac_close(struct sock *sk)
 	write_unlock_bh(&ipv6_sk_ac_lock);
 
 	prev_index = 0;
+	rcu_read_lock();
 	while (pac) {
 		struct ipv6_ac_socklist *next = pac->acl_next;
 
 		if (pac->acl_ifindex != prev_index) {
-			if (dev)
-				dev_put(dev);
-			dev = dev_get_by_index(net, pac->acl_ifindex);
+			dev = dev_get_by_index_rcu(net, pac->acl_ifindex);
 			prev_index = pac->acl_ifindex;
 		}
 		if (dev)
@@ -213,8 +208,7 @@ void ipv6_sock_ac_close(struct sock *sk)
 		sock_kfree_s(sk, pac, sizeof(*pac));
 		pac = next;
 	}
-	if (dev)
-		dev_put(dev);
+	rcu_read_unlock();
 }
 
 #if 0
@@ -363,33 +357,32 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
 	return 0;
 }
 
+/* called with rcu_read_lock() */
 static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr)
 {
-	int ret;
-	struct inet6_dev *idev = in6_dev_get(dev);
+	struct inet6_dev *idev = __in6_dev_get(dev);
+
 	if (idev == NULL)
 		return -ENODEV;
-	ret = __ipv6_dev_ac_dec(idev, addr);
-	in6_dev_put(idev);
-	return ret;
+	return __ipv6_dev_ac_dec(idev, addr);
 }
 
 /*
  *	check if the interface has this anycast address
+ *	called with rcu_read_lock()
  */
 static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr)
 {
 	struct inet6_dev *idev;
 	struct ifacaddr6 *aca;
 
-	idev = in6_dev_get(dev);
+	idev = __in6_dev_get(dev);
 	if (idev) {
 		read_lock_bh(&idev->lock);
 		for (aca = idev->ac_list; aca; aca = aca->aca_next)
 			if (ipv6_addr_equal(&aca->aca_addr, addr))
 				break;
 		read_unlock_bh(&idev->lock);
-		in6_dev_put(idev);
 		return aca != NULL;
 	}
 	return 0;
@@ -403,14 +396,15 @@ int ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
 {
 	int found = 0;
 
-	if (dev)
-		return ipv6_chk_acast_dev(dev, addr);
 	rcu_read_lock();
-	for_each_netdev_rcu(net, dev)
-		if (ipv6_chk_acast_dev(dev, addr)) {
-			found = 1;
-			break;
-		}
+	if (dev)
+		found = ipv6_chk_acast_dev(dev, addr);
+	else
+		for_each_netdev_rcu(net, dev)
+			if (ipv6_chk_acast_dev(dev, addr)) {
+				found = 1;
+				break;
+			}
 	rcu_read_unlock();
 	return found;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 5bfddbd46a95c978f4d3c992339cbdf4f4b790a3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 8 Jun 2010 16:09:52 +0200
Subject: netfilter: nf_conntrack: IPS_UNTRACKED bit

NOTRACK makes all cpus share a cache line on nf_conntrack_untracked
twice per packet. This is bad for performance.
__read_mostly annotation is also a bad choice.

This patch introduces IPS_UNTRACKED bit so that we can use later a
per_cpu untrack structure more easily.

A new helper, nf_ct_untracked_get() returns a pointer to
nf_conntrack_untracked.

Another one, nf_ct_untracked_status_or() is used by nf_nat_init() to add
IPS_NAT_DONE_MASK bits to untracked status.

nf_ct_is_untracked() prototype is changed to work on a nf_conn pointer.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nf_conntrack_common.h  |  4 ++++
 include/net/netfilter/nf_conntrack.h           | 12 +++++++++---
 include/net/netfilter/nf_conntrack_core.h      |  2 +-
 net/ipv4/netfilter/nf_nat_core.c               |  2 +-
 net/ipv4/netfilter/nf_nat_standalone.c         |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_core.c              | 11 ++++++++---
 net/netfilter/nf_conntrack_netlink.c           |  2 +-
 net/netfilter/xt_CT.c                          |  4 ++--
 net/netfilter/xt_NOTRACK.c                     |  2 +-
 net/netfilter/xt_TEE.c                         |  4 ++--
 net/netfilter/xt_cluster.c                     |  2 +-
 net/netfilter/xt_conntrack.c                   | 11 ++++++-----
 net/netfilter/xt_socket.c                      |  2 +-
 net/netfilter/xt_state.c                       | 14 ++++++++------
 15 files changed, 47 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 14e6d32002c4..1afd18c855ec 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -76,6 +76,10 @@ enum ip_conntrack_status {
 	/* Conntrack is a template */
 	IPS_TEMPLATE_BIT = 11,
 	IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT),
+
+	/* Conntrack is a fake untracked entry */
+	IPS_UNTRACKED_BIT = 12,
+	IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT),
 };
 
 /* Connection tracking event types */
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index bde095f7e845..3bc38c70bbbe 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -261,7 +261,13 @@ extern s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
 			       u32 seq);
 
 /* Fake conntrack entry for untracked connections */
-extern struct nf_conn nf_conntrack_untracked;
+static inline struct nf_conn *nf_ct_untracked_get(void)
+{
+	extern struct nf_conn nf_conntrack_untracked;
+
+	return &nf_conntrack_untracked;
+}
+extern void nf_ct_untracked_status_or(unsigned long bits);
 
 /* Iterate over all conntracks: if iter returns true, it's deleted. */
 extern void
@@ -289,9 +295,9 @@ static inline int nf_ct_is_dying(struct nf_conn *ct)
 	return test_bit(IPS_DYING_BIT, &ct->status);
 }
 
-static inline int nf_ct_is_untracked(const struct sk_buff *skb)
+static inline int nf_ct_is_untracked(const struct nf_conn *ct)
 {
-	return (skb->nfct == &nf_conntrack_untracked.ct_general);
+	return test_bit(IPS_UNTRACKED_BIT, &ct->status);
 }
 
 extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 3d7524fba194..aced085132e7 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -60,7 +60,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
 	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
 	int ret = NF_ACCEPT;
 
-	if (ct && ct != &nf_conntrack_untracked) {
+	if (ct && !nf_ct_is_untracked(ct)) {
 		if (!nf_ct_is_confirmed(ct))
 			ret = __nf_conntrack_confirm(skb);
 		if (likely(ret == NF_ACCEPT))
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 4f8bddb760c9..c7719b283ada 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -742,7 +742,7 @@ static int __init nf_nat_init(void)
 	spin_unlock_bh(&nf_nat_lock);
 
 	/* Initialize fake conntrack so that NAT will skip it */
-	nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
 
 	l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
 
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index beb25819c9c9..6723c682250d 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -98,7 +98,7 @@ nf_nat_fn(unsigned int hooknum,
 		return NF_ACCEPT;
 
 	/* Don't try to NAT if this packet is not conntracked */
-	if (ct == &nf_conntrack_untracked)
+	if (nf_ct_is_untracked(ct))
 		return NF_ACCEPT;
 
 	nat = nfct_nat(ct);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 9be81776415e..1df3c8b6bf47 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -208,7 +208,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	type = icmp6h->icmp6_type - 130;
 	if (type >= 0 && type < sizeof(noct_valid_new) &&
 	    noct_valid_new[type]) {
-		skb->nfct = &nf_conntrack_untracked.ct_general;
+		skb->nfct = &nf_ct_untracked_get()->ct_general;
 		skb->nfctinfo = IP_CT_NEW;
 		nf_conntrack_get(skb->nfct);
 		return NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index eeeb8bc73982..6c1da212380d 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 unsigned int nf_conntrack_max __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_max);
 
-struct nf_conn nf_conntrack_untracked __read_mostly;
+struct nf_conn nf_conntrack_untracked;
 EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
 
 static int nf_conntrack_hash_rnd_initted;
@@ -1321,6 +1321,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
+void nf_ct_untracked_status_or(unsigned long bits)
+{
+	nf_conntrack_untracked.status |= bits;
+}
+EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
+
 static int nf_conntrack_init_init_net(void)
 {
 	int max_factor = 8;
@@ -1368,8 +1374,7 @@ static int nf_conntrack_init_init_net(void)
 #endif
 	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
 	/*  - and look it like as a confirmed connection */
-	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
-
+	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
 	return 0;
 
 #ifdef CONFIG_NF_CONNTRACK_ZONES
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c42ff6aa441d..5bae1cd15eea 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -480,7 +480,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	int err;
 
 	/* ignore our fake conntrack entry */
-	if (ct == &nf_conntrack_untracked)
+	if (nf_ct_is_untracked(ct))
 		return 0;
 
 	if (events & (1 << IPCT_DESTROY)) {
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 562bf3266e04..0cb6053f02fd 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -67,7 +67,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 
 	if (info->flags & XT_CT_NOTRACK) {
-		ct = &nf_conntrack_untracked;
+		ct = nf_ct_untracked_get();
 		atomic_inc(&ct->ct_general.use);
 		goto out;
 	}
@@ -132,7 +132,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par)
 	struct nf_conn *ct = info->ct;
 	struct nf_conn_help *help;
 
-	if (ct != &nf_conntrack_untracked) {
+	if (!nf_ct_is_untracked(ct)) {
 		help = nfct_help(ct);
 		if (help)
 			module_put(help->helper->me);
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index 512b9123252f..9d782181b6c8 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -23,7 +23,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	   If there is a real ct entry correspondig to this packet,
 	   it'll hang aroun till timing out. We don't deal with it
 	   for performance reasons. JK */
-	skb->nfct = &nf_conntrack_untracked.ct_general;
+	skb->nfct = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
 
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 859d9fd429c8..7a118267c4c4 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -104,7 +104,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 #ifdef WITH_CONNTRACK
 	/* Avoid counting cloned packets towards the original connection. */
 	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_conntrack_untracked.ct_general;
+	skb->nfct     = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
 #endif
@@ -177,7 +177,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 #ifdef WITH_CONNTRACK
 	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_conntrack_untracked.ct_general;
+	skb->nfct     = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
 #endif
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 30b95a1c1c89..f4af1bfafb1c 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -120,7 +120,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	if (ct == NULL)
 		return false;
 
-	if (ct == &nf_conntrack_untracked)
+	if (nf_ct_is_untracked(ct))
 		return false;
 
 	if (ct->master)
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 39681f10291c..e536710ad916 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -123,11 +123,12 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
 
 	ct = nf_ct_get(skb, &ctinfo);
 
-	if (ct == &nf_conntrack_untracked)
-		statebit = XT_CONNTRACK_STATE_UNTRACKED;
-	else if (ct != NULL)
-		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
-	else
+	if (ct) {
+		if (nf_ct_is_untracked(ct))
+			statebit = XT_CONNTRACK_STATE_UNTRACKED;
+		else
+			statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+	} else
 		statebit = XT_CONNTRACK_STATE_INVALID;
 
 	if (info->match_flags & XT_CONNTRACK_STATE) {
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 3d54c236a1ba..1ca89908cbad 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -127,7 +127,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	 * reply packet of an established SNAT-ted connection. */
 
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct && (ct != &nf_conntrack_untracked) &&
+	if (ct && !nf_ct_is_untracked(ct) &&
 	    ((iph->protocol != IPPROTO_ICMP &&
 	      ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) ||
 	     (iph->protocol == IPPROTO_ICMP &&
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index e12e053d3782..a507922d80cd 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -26,14 +26,16 @@ state_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct xt_state_info *sinfo = par->matchinfo;
 	enum ip_conntrack_info ctinfo;
 	unsigned int statebit;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
-	if (nf_ct_is_untracked(skb))
-		statebit = XT_STATE_UNTRACKED;
-	else if (!nf_ct_get(skb, &ctinfo))
+	if (!ct)
 		statebit = XT_STATE_INVALID;
-	else
-		statebit = XT_STATE_BIT(ctinfo);
-
+	else {
+		if (nf_ct_is_untracked(ct))
+			statebit = XT_STATE_UNTRACKED;
+		else
+			statebit = XT_STATE_BIT(ctinfo);
+	}
 	return (sinfo->statemask & statebit);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 50a323b73069b169385a8ac65633dee837a7d13f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 8 Jun 2010 21:40:36 +0200
Subject: sched: define and use CPU_PRI_* enums for cpu notifier priorities

Instead of hardcoding priority 10 and 20 in sched and perf, collect
them into CPU_PRI_* enums.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/cpu.h        | 9 +++++++++
 include/linux/perf_event.h | 2 +-
 kernel/sched.c             | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e287863ac053..2d9073883ea9 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -48,6 +48,15 @@ extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 struct notifier_block;
 
+/*
+ * CPU notifier priorities.
+ */
+enum {
+	/* migration should happen before other stuff but after perf */
+	CPU_PRI_PERF		= 20,
+	CPU_PRI_MIGRATION	= 10,
+};
+
 #ifdef CONFIG_SMP
 /* Need to know about CPUs going up/down? */
 #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5d0266d94985..469e03e96fe7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1068,7 +1068,7 @@ static inline void perf_event_disable(struct perf_event *event)		{ }
 #define perf_cpu_notifier(fn)					\
 do {								\
 	static struct notifier_block fn##_nb __cpuinitdata =	\
-		{ .notifier_call = fn, .priority = 20 };	\
+		{ .notifier_call = fn, .priority = CPU_PRI_PERF }; \
 	fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE,		\
 		(void *)(unsigned long)smp_processor_id());	\
 	fn(&fn##_nb, (unsigned long)CPU_STARTING,		\
diff --git a/kernel/sched.c b/kernel/sched.c
index f8b8996228dd..552faf8d358c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5801,7 +5801,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
-	.priority = 10
+	.priority = CPU_PRI_MIGRATION,
 };
 
 static int __init migration_init(void)
-- 
cgit v1.2.3-59-g8ed1b


From 3a101d0548e925ab16ca6aaa8cf4f767d322ddb0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 8 Jun 2010 21:40:36 +0200
Subject: sched: adjust when cpu_active and cpuset configurations are updated
 during cpu on/offlining

Currently, when a cpu goes down, cpu_active is cleared before
CPU_DOWN_PREPARE starts and cpuset configuration is updated from a
default priority cpu notifier.  When a cpu is coming up, it's set
before CPU_ONLINE but cpuset configuration again is updated from the
same cpu notifier.

For cpu notifiers, this presents an inconsistent state.  Threads which
a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be
migrated to other cpus because the cpu is no more inactive.

Fix it by updating cpu_active in the highest priority cpu notifier and
cpuset configuration in the second highest when a cpu is coming up.
Down path is updated similarly.  This guarantees that all other cpu
notifiers see consistent cpu_active and cpuset configuration.

cpuset_track_online_cpus() notifier is converted to
cpuset_update_active_cpus() which just updates the configuration and
now called from cpuset_cpu_[in]active() notifiers registered from
sched_init_smp().  If cpuset is disabled, cpuset_update_active_cpus()
degenerates into partition_sched_domains() making separate notifier
for !CONFIG_CPUSETS unnecessary.

This problem is triggered by cmwq.  During CPU_DOWN_PREPARE, hotplug
callback creates a kthread and kthread_bind()s it to the target cpu,
and the thread is expected to run on that cpu.

* Ingo's test discovered __cpuinit/exit markups were incorrect.
  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Menage <menage@google.com>
---
 include/linux/cpu.h    | 16 ++++++++++++
 include/linux/cpuset.h |  6 +++++
 kernel/cpu.c           |  6 -----
 kernel/cpuset.c        | 21 ++--------------
 kernel/sched.c         | 67 +++++++++++++++++++++++++++++++++++++-------------
 5 files changed, 74 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2d9073883ea9..de6b1722cdca 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -52,6 +52,22 @@ struct notifier_block;
  * CPU notifier priorities.
  */
 enum {
+	/*
+	 * SCHED_ACTIVE marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
+	 * notifier.  CPUSET_ACTIVE adjusts cpuset according to
+	 * cpu_active mask right after SCHED_ACTIVE.  During
+	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
+	 * ordered in the similar way.
+	 *
+	 * This ordering guarantees consistent cpu_active mask and
+	 * migration behavior to all cpu notifiers.
+	 */
+	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
+	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
+	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 457ed765a116..f20eb8f16025 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,6 +20,7 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
+static inline void cpuset_update_active_cpus(void)
+{
+	partition_sched_domains(1, NULL, NULL);
+}
+
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	set_cpu_active(cpu, false);
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
-		set_cpu_active(cpu, true);
-
 		nr_calls--;
 		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
-		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	set_cpu_active(cpu, true);
-
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..05727dcaa80d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * but making no active use of cpusets.
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus().  Needs to call cgroup_lock()
  * before calling generate_sched_domains().
  */
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-				unsigned long phase, void *unused_cpu)
+void __cpuexit cpuset_update_active_cpus(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
-	switch (phase) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-
-	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_track_online_cpus, 0);
 	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 
 	cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/sched.c b/kernel/sched.c
index 552faf8d358c..2b942e49d0fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5804,17 +5804,46 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+				      unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	/* Start one for the boot CPU: */
+	/* Initialize migration for the boot CPU */
 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
 	return 0;
 }
 early_initcall(migration_init);
@@ -7273,29 +7302,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-#ifndef CONFIG_CPUSETS
 /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
  */
-static int update_sched_domains(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int __cpuexit cpuset_cpu_active(struct notifier_block *nfb,
+				       unsigned long action, void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		partition_sched_domains(1, NULL, NULL);
+		cpuset_update_active_cpus();
 		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
 
+static int __cpuexit cpuset_cpu_inactive(struct notifier_block *nfb,
+					 unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
 }
-#endif
 
 static int update_runtime(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
@@ -7341,10 +7376,8 @@ void __init sched_init_smp(void)
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
-#ifndef CONFIG_CPUSETS
-	/* XXX: Theoretical race here - CPU may be hotplugged now */
-	hotcpu_notifier(update_sched_domains, 0);
-#endif
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
 	/* RT runtime code needs to handle some hotplug events */
 	hotcpu_notifier(update_runtime, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 21aa9af03d06cb1d19a3738e5cf12acff984e69b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 8 Jun 2010 21:40:37 +0200
Subject: sched: add hooks for workqueue

Concurrency managed workqueue needs to know when workers are going to
sleep and waking up.  Using these two hooks, cmwq keeps track of the
current concurrency level and throttles execution of new works if it's
too high and wakes up another worker from the sleep hook if it becomes
too low.

This patch introduces PF_WQ_WORKER to identify workqueue workers and
adds the following two hooks.

* wq_worker_waking_up(): called when a worker is woken up.

* wq_worker_sleeping(): called when a worker is going to sleep and may
  return a pointer to a local task which should be woken up.  The
  returned task is woken up using try_to_wake_up_local() which is
  simplified ttwu which is called under rq lock and can only wake up
  local tasks.

Both hooks are currently defined as noop in kernel/workqueue_sched.h.
Later cmwq implementation will replace them with proper
implementation.

These hooks are hard coded as they'll always be enabled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    |  1 +
 kernel/fork.c            |  2 +-
 kernel/sched.c           | 53 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/workqueue_sched.h | 16 +++++++++++++++
 4 files changed, 69 insertions(+), 3 deletions(-)
 create mode 100644 kernel/workqueue_sched.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f118809c953f..edc3dd168d87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1696,6 +1696,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_WQ_WORKER	0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..a82a65cef741 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
 
-	new_flags &= ~PF_SUPERPRIV;
+	new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
 	new_flags |= PF_FORKNOEXEC;
 	new_flags |= PF_STARTING;
 	p->flags = new_flags;
diff --git a/kernel/sched.c b/kernel/sched.c
index 96eafd5f345f..edd5a54b95da 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
+#include "workqueue_sched.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -2306,6 +2307,9 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 		rq->idle_stamp = 0;
 	}
 #endif
+	/* if a worker is waking up, notify workqueue */
+	if ((p->flags & PF_WQ_WORKER) && success)
+		wq_worker_waking_up(p, cpu_of(rq));
 }
 
 /**
@@ -2413,6 +2417,37 @@ out:
 	return success;
 }
 
+/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not alredy there.  The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task.  this_rq() stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	bool success = false;
+
+	BUG_ON(rq != this_rq());
+	BUG_ON(p == current);
+	lockdep_assert_held(&rq->lock);
+
+	if (!(p->state & TASK_NORMAL))
+		return;
+
+	if (!p->se.on_rq) {
+		if (likely(!task_running(rq, p))) {
+			schedstat_inc(rq, ttwu_count);
+			schedstat_inc(rq, ttwu_local);
+		}
+		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+		success = true;
+	}
+	ttwu_post_activation(p, rq, 0, success);
+}
+
 /**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
@@ -3618,10 +3653,24 @@ need_resched_nonpreemptible:
 	clear_tsk_need_resched(prev);
 
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-		if (unlikely(signal_pending_state(prev->state, prev)))
+		if (unlikely(signal_pending_state(prev->state, prev))) {
 			prev->state = TASK_RUNNING;
-		else
+		} else {
+			/*
+			 * If a worker is going to sleep, notify and
+			 * ask workqueue whether it wants to wake up a
+			 * task to maintain concurrency.  If so, wake
+			 * up the task.
+			 */
+			if (prev->flags & PF_WQ_WORKER) {
+				struct task_struct *to_wakeup;
+
+				to_wakeup = wq_worker_sleeping(prev, cpu);
+				if (to_wakeup)
+					try_to_wake_up_local(to_wakeup);
+			}
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
+		}
 		switch_count = &prev->nvcsw;
 	}
 
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..af040babb742
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,16 @@
+/*
+ * kernel/workqueue_sched.h
+ *
+ * Scheduler hooks for concurrency managed workqueue.  Only to be
+ * included from sched.c and workqueue.c.
+ */
+static inline void wq_worker_waking_up(struct task_struct *task,
+				       unsigned int cpu)
+{
+}
+
+static inline struct task_struct *wq_worker_sleeping(struct task_struct *task,
+						     unsigned int cpu)
+{
+	return NULL;
+}
-- 
cgit v1.2.3-59-g8ed1b


From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 20 May 2010 07:47:21 +0200
Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller

Drop this argument now that we always want to rewind only to the
state of the first caller.
It means frame pointers are not necessary anymore to reliably get
the source of an event. But this also means we need this helper
to be a macro now, as an inline function is not an option since
we need to know when to provide a default implentation.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: David Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/powerpc/include/asm/perf_event.h | 12 ++++++++++++
 arch/powerpc/kernel/misc.S            | 26 --------------------------
 arch/sparc/include/asm/perf_event.h   |  8 ++++++++
 arch/sparc/kernel/helpers.S           |  6 +++---
 arch/x86/include/asm/perf_event.h     | 13 +++++++++++++
 arch/x86/include/asm/stacktrace.h     |  7 ++-----
 arch/x86/kernel/cpu/perf_event.c      | 16 ----------------
 include/linux/perf_event.h            | 32 +++++++-------------------------
 include/trace/ftrace.h                |  2 +-
 kernel/perf_event.c                   |  5 -----
 kernel/trace/trace_event_perf.c       |  2 --
 11 files changed, 46 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
index e6d4ce69b126..5c16b891d501 100644
--- a/arch/powerpc/include/asm/perf_event.h
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -21,3 +21,15 @@
 #ifdef CONFIG_FSL_EMB_PERF_EVENT
 #include <asm/perf_event_fsl_emb.h>
 #endif
+
+#ifdef CONFIG_PERF_EVENTS
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+
+#define perf_arch_fetch_caller_regs(regs, __ip)			\
+	do {							\
+		(regs)->nip = __ip;				\
+		(regs)->gpr[1] = *(unsigned long *)__get_SP();	\
+		asm volatile("mfmsr %0" : "=r" ((regs)->msr));	\
+	} while (0)
+#endif
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 22e507c8a556..2d29752cbe16 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -127,29 +127,3 @@ _GLOBAL(__setup_cpu_power7)
 _GLOBAL(__restore_cpu_power7)
 	/* place holder */
 	blr
-
-/*
- * Get a minimal set of registers for our caller's nth caller.
- * r3 = regs pointer, r5 = n.
- *
- * We only get R1 (stack pointer), NIP (next instruction pointer)
- * and LR (link register).  These are all we can get in the
- * general case without doing complicated stack unwinding, but
- * fortunately they are enough to do a stack backtrace, which
- * is all we need them for.
- */
-_GLOBAL(perf_arch_fetch_caller_regs)
-	mr	r6,r1
-	cmpwi	r5,0
-	mflr	r4
-	ble	2f
-	mtctr	r5
-1:	PPC_LL	r6,0(r6)
-	bdnz	1b
-	PPC_LL	r4,PPC_LR_STKOFF(r6)
-2:	PPC_LL	r7,0(r6)
-	PPC_LL	r7,PPC_LR_STKOFF(r7)
-	PPC_STL	r6,GPR1-STACK_FRAME_OVERHEAD(r3)
-	PPC_STL	r4,_NIP-STACK_FRAME_OVERHEAD(r3)
-	PPC_STL	r7,_LINK-STACK_FRAME_OVERHEAD(r3)
-	blr
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h
index 7e2669894ce8..74c4e0cd889c 100644
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -6,7 +6,15 @@ extern void set_perf_event_pending(void);
 #define	PERF_EVENT_INDEX_OFFSET	0
 
 #ifdef CONFIG_PERF_EVENTS
+#include <asm/ptrace.h>
+
 extern void init_hw_perf_events(void);
+
+extern void
+__perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
+
+#define perf_arch_fetch_caller_regs(pt_regs, ip)	\
+	__perf_arch_fetch_caller_regs(pt_regs, ip, 1);
 #else
 static inline void init_hw_perf_events(void)	{ }
 #endif
diff --git a/arch/sparc/kernel/helpers.S b/arch/sparc/kernel/helpers.S
index 92090cc9e829..682fee06a16b 100644
--- a/arch/sparc/kernel/helpers.S
+++ b/arch/sparc/kernel/helpers.S
@@ -47,9 +47,9 @@ stack_trace_flush:
 	.size		stack_trace_flush,.-stack_trace_flush
 
 #ifdef CONFIG_PERF_EVENTS
-	.globl		perf_arch_fetch_caller_regs
-	.type		perf_arch_fetch_caller_regs,#function
-perf_arch_fetch_caller_regs:
+	.globl		__perf_arch_fetch_caller_regs
+	.type		__perf_arch_fetch_caller_regs,#function
+__perf_arch_fetch_caller_regs:
 	/* We always read the %pstate into %o5 since we will use
 	 * that to construct a fake %tstate to store into the regs.
 	 */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 254883d0c7e0..02de29830ffe 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -140,6 +140,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)	perf_misc_flags(regs)
 
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip)		{	\
+	(regs)->ip = (__ip);					\
+	(regs)->bp = caller_frame_pointer();			\
+	(regs)->cs = __KERNEL_CS;				\
+	regs->flags = 0;					\
+}
+
 #else
 static inline void init_hw_perf_events(void)		{ }
 static inline void perf_events_lapic_init(void)	{ }
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index a957463d3c7a..2b16a2ad23dc 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -78,17 +78,14 @@ struct stack_frame_ia32 {
     u32 return_address;
 };
 
-static inline unsigned long rewind_frame_pointer(int n)
+static inline unsigned long caller_frame_pointer(void)
 {
 	struct stack_frame *frame;
 
 	get_bp(frame);
 
 #ifdef CONFIG_FRAME_POINTER
-	while (n--) {
-		if (probe_kernel_address(&frame->next_frame, frame))
-			break;
-	}
+	frame = frame->next_frame;
 #endif
 
 	return (unsigned long)frame;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9632fb61e8f9..2c075fe573d0 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return entry;
 }
 
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-	regs->ip = ip;
-	/*
-	 * perf_arch_fetch_caller_regs adds another call, we need to increment
-	 * the skip level
-	 */
-	regs->bp = rewind_frame_pointer(skip + 1);
-	regs->cs = __KERNEL_CS;
-	/*
-	 * We abuse bit 3 to pass exact information, see perf_misc_flags
-	 * and the comment with PERF_EFLAGS_EXACT.
-	 */
-	regs->flags = 0;
-}
-
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
 	unsigned long ip;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb6c91eac7e3..bea785cef493 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -905,8 +905,10 @@ extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
 
-extern void
-perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
+#ifndef perf_arch_fetch_caller_regs
+static inline void
+perf_arch_fetch_caller_regs(struct regs *regs, unsigned long ip) { }
+#endif
 
 /*
  * Take a snapshot of the regs. Skip ip and frame pointer to
@@ -916,31 +918,11 @@ perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
  * - bp for callchains
  * - eflags, for future purposes, just in case
  */
-static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip)
+static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-	unsigned long ip;
-
 	memset(regs, 0, sizeof(*regs));
 
-	switch (skip) {
-	case 1 :
-		ip = CALLER_ADDR0;
-		break;
-	case 2 :
-		ip = CALLER_ADDR1;
-		break;
-	case 3 :
-		ip = CALLER_ADDR2;
-		break;
-	case 4:
-		ip = CALLER_ADDR3;
-		break;
-	/* No need to support further for now */
-	default:
-		ip = 0;
-	}
-
-	return perf_arch_fetch_caller_regs(regs, ip, skip);
+	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
 static inline void
@@ -950,7 +932,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 		struct pt_regs hot_regs;
 
 		if (!regs) {
-			perf_fetch_caller_regs(&hot_regs, 1);
+			perf_fetch_caller_regs(&hot_regs);
 			regs = &hot_regs;
 		}
 		__perf_sw_event(event_id, nr, nmi, regs, addr);
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 3d685d1f2a03..8ee8b6e6b25e 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -705,7 +705,7 @@ perf_trace_##call(void *__data, proto)					\
 	int __data_size;						\
 	int rctx;							\
 									\
-	perf_fetch_caller_regs(&__regs, 1);				\
+	perf_fetch_caller_regs(&__regs);				\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e099650cd249..9ae4dbcdf469 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2851,11 +2851,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return NULL;
 }
 
-__weak
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-}
-
 
 /*
  * We assume there is only KVM supporting the callbacks.
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cb6f365016e4..21db1d3a48d0 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,8 +9,6 @@
 #include <linux/kprobes.h>
 #include "trace.h"
 
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
-
 static char *perf_trace_buf[4];
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From c676329abb2b8359d9a5d734dec0c81779823fd6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 25 May 2010 10:48:51 +0200
Subject: sched_clock: Add local_clock() API and improve documentation

For people who otherwise get to write: cpu_clock(smp_processor_id()),
there is now: local_clock().

Also, as per suggestion from Andrew, provide some documentation on
the various clock interfaces, and minimize the unsigned long long vs
u64 mess.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jens Axboe <jaxboe@fusionio.com>
LKML-Reference: <1275052414.1645.52.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/parisc/kernel/ftrace.c |  4 +-
 include/linux/sched.h       | 37 ++++++++++--------
 kernel/lockdep.c            |  2 +-
 kernel/perf_event.c         |  2 +-
 kernel/rcutorture.c         |  3 +-
 kernel/sched.c              |  2 +-
 kernel/sched_clock.c        | 95 ++++++++++++++++++++++++++++++++++++++++-----
 kernel/trace/trace_clock.c  |  2 +-
 8 files changed, 113 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
index 9877372ffdba..5beb97bafbb1 100644
--- a/arch/parisc/kernel/ftrace.c
+++ b/arch/parisc/kernel/ftrace.c
@@ -82,7 +82,7 @@ unsigned long ftrace_return_to_handler(unsigned long retval0,
 	unsigned long ret;
 
 	pop_return_trace(&trace, &ret);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
+	trace.rettime = local_clock();
 	ftrace_graph_return(&trace);
 
 	if (unlikely(!ret)) {
@@ -126,7 +126,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		return;
 	}
 
-	calltime = cpu_clock(raw_smp_processor_id());
+	calltime = local_clock();
 
 	if (push_return_trace(old, calltime,
 				self_addr, &trace.depth) == -EBUSY) {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edc3dd168d87..c2d4316a04bb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1791,20 +1791,23 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 #endif
 
 /*
- * Architectures can set this to 1 if they have specified
- * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- * but then during bootup it turns out that sched_clock()
- * is reliable after all:
+ * Do not use outside of architecture code which knows its limitations.
+ *
+ * sched_clock() has no promise of monotonicity or bounded drift between
+ * CPUs, use (which you should not) requires disabling IRQs.
+ *
+ * Please use one of the three interfaces below.
  */
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-extern int sched_clock_stable;
-#endif
-
-/* ftrace calls sched_clock() directly */
 extern unsigned long long notrace sched_clock(void);
+/*
+ * See the comment in kernel/sched_clock.c
+ */
+extern u64 cpu_clock(int cpu);
+extern u64 local_clock(void);
+extern u64 sched_clock_cpu(int cpu);
+
 
 extern void sched_clock_init(void);
-extern u64 sched_clock_cpu(int cpu);
 
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 static inline void sched_clock_tick(void)
@@ -1819,17 +1822,19 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 }
 #else
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+extern int sched_clock_stable;
+
 extern void sched_clock_tick(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 #endif
 
-/*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
- */
-extern unsigned long long cpu_clock(int cpu);
-
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
 
 static inline u64 lockstat_clock(void)
 {
-	return cpu_clock(smp_processor_id());
+	return local_clock();
 }
 
 static int lock_point(unsigned long points[], unsigned long ip)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 31d6afe92594..109c5ec88933 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 
 static inline u64 perf_clock(void)
 {
-	return cpu_clock(raw_smp_processor_id());
+	return local_clock();
 }
 
 /*
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
 rcu_random(struct rcu_random_state *rrsp)
 {
 	if (--rrsp->rrs_count < 0) {
-		rrsp->rrs_state +=
-			(unsigned long)cpu_clock(raw_smp_processor_id());
+		rrsp->rrs_state += (unsigned long)local_clock();
 		rrsp->rrs_count = RCU_RANDOM_REFRESH;
 	}
 	rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff --git a/kernel/sched.c b/kernel/sched.c
index 8f351c56567f..3abd8f780dae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1647,7 +1647,7 @@ static void update_shares(struct sched_domain *sd)
 	if (root_task_group_empty())
 		return;
 
-	now = cpu_clock(raw_smp_processor_id());
+	now = local_clock();
 	elapsed = now - sd->last_update;
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
  *   Ingo Molnar <mingo@redhat.com>
  *   Guillaume Chazarain <guichaz@gmail.com>
  *
- * Create a semi stable clock from a mixture of other events, including:
- *  - gtod
+ *
+ * What:
+ *
+ * cpu_clock(i) provides a fast (execution time) high resolution
+ * clock with bounded drift between CPUs. The value of cpu_clock(i)
+ * is monotonic for constant i. The timestamp returned is in nanoseconds.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !!                                                  #
+ * ####################################################################
+ *
+ * There is no strict promise about the base, although it tends to start
+ * at 0 on boot (but people really shouldn't rely on that).
+ *
+ * cpu_clock(i)       -- can be used from any context, including NMI.
+ * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
+ * local_clock()      -- is cpu_clock() on the current cpu.
+ *
+ * How:
+ *
+ * The implementation either uses sched_clock() when
+ * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
+ * sched_clock() is assumed to provide these properties (mostly it means
+ * the architecture provides a globally synchronized highres time source).
+ *
+ * Otherwise it tries to create a semi stable clock from a mixture of other
+ * clocks, including:
+ *
+ *  - GTOD (clock monotomic)
  *  - sched_clock()
  *  - explicit idle events
  *
- * We use gtod as base and the unstable clock deltas. The deltas are filtered,
- * making it monotonic and keeping it within an expected window.
+ * We use GTOD as base and use sched_clock() deltas to improve resolution. The
+ * deltas are filtered to provide monotonicity and keeping it within an
+ * expected window.
  *
  * Furthermore, explicit sleep and wakeup hooks allow us to account for time
  * that is otherwise invisible (TSC gets stopped).
  *
- * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
- * consistent between cpus (never more than 2 jiffies difference).
+ *
+ * Notes:
+ *
+ * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
+ * like cpufreq interrupts that can change the base clock (TSC) multiplier
+ * and cause funny jumps in time -- although the filtering provided by
+ * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
+ * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
+ * sched_clock().
  */
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
@@ -170,6 +206,11 @@ again:
 	return val;
 }
 
+/*
+ * Similar to cpu_clock(), but requires local IRQs to be disabled.
+ *
+ * See cpu_clock().
+ */
 u64 sched_clock_cpu(int cpu)
 {
 	struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-unsigned long long cpu_clock(int cpu)
+/*
+ * As outlined at the top, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !!                                                  #
+ * ####################################################################
+ */
+u64 cpu_clock(int cpu)
 {
-	unsigned long long clock;
+	u64 clock;
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
 	return clock;
 }
 
+/*
+ * Similar to cpu_clock() for the current cpu. Time will only be observed
+ * to be monotonic if care is taken to only compare timestampt taken on the
+ * same CPU.
+ *
+ * See cpu_clock().
+ */
+u64 local_clock(void)
+{
+	u64 clock;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	clock = sched_clock_cpu(smp_processor_id());
+	local_irq_restore(flags);
+
+	return clock;
+}
+
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
 	return sched_clock();
 }
 
-
-unsigned long long cpu_clock(int cpu)
+u64 cpu_clock(int cpu)
 {
 	return sched_clock_cpu(cpu);
 }
 
+u64 local_clock(void)
+{
+	return sched_clock_cpu(0);
+}
+
 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 EXPORT_SYMBOL_GPL(cpu_clock);
+EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..1723e2b8c589 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -56,7 +56,7 @@ u64 notrace trace_clock_local(void)
  */
 u64 notrace trace_clock(void)
 {
-	return cpu_clock(raw_smp_processor_id());
+	return local_clock();
 }
 
 
-- 
cgit v1.2.3-59-g8ed1b


From 83cd4fe27ad8446619b2e030b171b858501de87d Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Fri, 21 May 2010 17:09:41 -0700
Subject: sched: Change nohz idle load balancing logic to push model

In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of idle load balancer (performing the load balancing
on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz
balancer when any of the nohz CPUs need idle load balancing.
The kickee CPU does the idle load balancing on behalf of all idle CPUs
instead of the normal idle balance.

This addresses the below two problems with the current nohz ilb logic:
* the idle load balancer continued to have periodic ticks during idle and
  wokeup frequently, even though it did not have any rebalancing to do on
  behalf of any of the idle CPUs.
* On x86 and CPUs that have APIC timer stoppage on idle CPUs, this
  periodic wakeup can result in a periodic additional interrupt on a CPU
  doing the timer broadcast.

Also currently we are migrating the unpinned timers from an idle to the cpu
doing idle load balancing (when all the cpus in the system are idle,
there is no idle load balancing cpu and timers get added to the same idle cpu
where the request was made. So the existing optimization works only on semi idle
system).

And In semi idle system, we no longer have periodic ticks on the idle load
balancer CPU. Using that cpu will add more delays to the timers than intended
(as that cpu's timer base may not be uptodate wrt jiffies etc). This was
causing mysterious slowdowns during boot etc.

For now, in the semi idle case, use the nearest busy cpu for migrating timers
from an idle cpu.  This is good for power-savings anyway.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    |   9 +-
 kernel/hrtimer.c         |   8 +-
 kernel/sched.c           |  34 ++++-
 kernel/sched_fair.c      | 329 ++++++++++++++++++++++++++++-------------------
 kernel/time/tick-sched.c |   8 +-
 kernel/timer.c           |   8 +-
 6 files changed, 237 insertions(+), 159 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c2d4316a04bb..a3e5b1cd0438 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -271,14 +271,11 @@ extern int runqueue_is_locked(int cpu);
 
 extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern int select_nohz_load_balancer(int cpu);
-extern int get_nohz_load_balancer(void);
+extern void select_nohz_load_balancer(int stop_tick);
+extern int get_nohz_timer_target(void);
 extern int nohz_ratelimit(int cpu);
 #else
-static inline int select_nohz_load_balancer(int cpu)
-{
-	return 0;
-}
+static inline void select_nohz_load_balancer(int stop_tick) { }
 
 static inline int nohz_ratelimit(int cpu)
 {
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..e934339fbbef 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 static int hrtimer_get_target(int this_cpu, int pinned)
 {
 #ifdef CONFIG_NO_HZ
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			return preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+		return get_nohz_timer_target();
 #endif
 	return this_cpu;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index a757f6b11cbd..132950b33dde 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -460,7 +460,7 @@ struct rq {
 	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
 	u64 nohz_stamp;
-	unsigned char in_nohz_recently;
+	unsigned char nohz_balance_kick;
 #endif
 	unsigned int skip_clock_update;
 
@@ -1194,6 +1194,27 @@ static void resched_cpu(int cpu)
 }
 
 #ifdef CONFIG_NO_HZ
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd))
+			if (!idle_cpu(i))
+				return i;
+	}
+	return cpu;
+}
 /*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+		rq->nohz_balance_kick = 0;
+		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+	atomic_set(&nohz.load_balancer, nr_cpu_ids);
+	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 22b8b4f2b616..6ee2e0af665b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3091,13 +3091,40 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+	raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+	csd->func = trigger_sched_softirq;
+	csd->info = NULL;
+	csd->flags = 0;
+	csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ *   entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ *   it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ *   needed, they will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
 static struct {
 	atomic_t load_balancer;
-	cpumask_var_t cpu_mask;
-	cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-	.load_balancer = ATOMIC_INIT(-1),
-};
+	atomic_t first_pick_cpu;
+	atomic_t second_pick_cpu;
+	cpumask_var_t idle_cpus_mask;
+	cpumask_var_t grp_idle_mask;
+	unsigned long next_balance;     /* in jiffy units */
+} nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
 {
@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  */
 static inline int is_semi_idle_group(struct sched_group *ilb_group)
 {
-	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+	cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
 					sched_group_cpus(ilb_group));
 
 	/*
 	 * A sched_group is semi-idle when it has atleast one busy cpu
 	 * and atleast one idle cpu.
 	 */
-	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+	if (cpumask_empty(nohz.grp_idle_mask))
 		return 0;
 
-	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+	if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
 		return 0;
 
 	return 1;
@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu)
 	 * Optimize for the case when we have no idle CPUs or only one
 	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
 	 */
-	if (cpumask_weight(nohz.cpu_mask) < 2)
+	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu)
 
 		do {
 			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.ilb_grp_nohz_mask);
+				return cpumask_first(nohz.grp_idle_mask);
 
 			ilb_group = ilb_group->next;
 
@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu)
 	}
 
 out_done:
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-	return cpumask_first(nohz.cpu_mask);
+	return nr_cpu_ids;
 }
 #endif
 
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+	int ilb_cpu;
+
+	nohz.next_balance++;
+
+	ilb_cpu = get_nohz_load_balancer();
+
+	if (ilb_cpu >= nr_cpu_ids) {
+		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+		if (ilb_cpu >= nr_cpu_ids)
+			return;
+	}
+
+	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+		struct call_single_data *cp;
+
+		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+		cp = &per_cpu(remote_sched_softirq_cb, cpu);
+		__smp_call_function_single(ilb_cpu, cp, 0);
+	}
+	return;
+}
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
  *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
  *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
  */
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
 {
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpu_rq(cpu)->in_nohz_recently = 1;
-
 		if (!cpu_active(cpu)) {
 			if (atomic_read(&nohz.load_balancer) != cpu)
-				return 0;
+				return;
 
 			/*
 			 * If we are going offline and still the leader,
 			 * give up!
 			 */
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 
-			return 0;
+			return;
 		}
 
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
+		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-		/* time for ilb owner also to sleep */
-		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-			if (atomic_read(&nohz.load_balancer) == cpu)
-				atomic_set(&nohz.load_balancer, -1);
-			return 0;
-		}
+		if (atomic_read(&nohz.first_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+		if (atomic_read(&nohz.second_pick_cpu) == cpu)
+			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
 
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			/* make me the ilb owner */
-			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
 			int new_ilb;
 
-			if (!(sched_smt_power_savings ||
-						sched_mc_power_savings))
-				return 1;
+			/* make me the ilb owner */
+			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+					   cpu) != nr_cpu_ids)
+				return;
+
 			/*
 			 * Check to see if there is a more power-efficient
 			 * ilb.
 			 */
 			new_ilb = find_new_ilb(cpu);
 			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-				atomic_set(&nohz.load_balancer, -1);
+				atomic_set(&nohz.load_balancer, nr_cpu_ids);
 				resched_cpu(new_ilb);
-				return 0;
+				return;
 			}
-			return 1;
+			return;
 		}
 	} else {
-		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-			return 0;
+		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+			return;
 
-		cpumask_clear_cpu(cpu, nohz.cpu_mask);
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 
 		if (atomic_read(&nohz.load_balancer) == cpu)
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+					   nr_cpu_ids) != cpu)
 				BUG();
 	}
-	return 0;
+	return;
 }
 #endif
 
@@ -3383,11 +3428,101 @@ out:
 		rq->next_balance = next_balance;
 }
 
+#ifdef CONFIG_NO_HZ
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *rq;
+	int balance_cpu;
+
+	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+		return;
+
+	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+		if (balance_cpu == this_cpu)
+			continue;
+
+		/*
+		 * If this cpu gets work to do, stop the load balancing
+		 * work being done for other cpus. Next load
+		 * balancing owner will pick it up.
+		 */
+		if (need_resched()) {
+			this_rq->nohz_balance_kick = 0;
+			break;
+		}
+
+		raw_spin_lock_irq(&this_rq->lock);
+		update_cpu_load(this_rq);
+		raw_spin_unlock_irq(&this_rq->lock);
+
+		rebalance_domains(balance_cpu, CPU_IDLE);
+
+		rq = cpu_rq(balance_cpu);
+		if (time_after(this_rq->next_balance, rq->next_balance))
+			this_rq->next_balance = rq->next_balance;
+	}
+	nohz.next_balance = this_rq->next_balance;
+	this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ *   idle load balancer when it has more than one process active. This
+ *   eliminates the need for idle load balancing altogether when we have
+ *   only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ *   SMT or core siblings and can run better if they move to different
+ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ *   which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+	unsigned long now = jiffies;
+	int ret;
+	int first_pick_cpu, second_pick_cpu;
+
+	if (time_before(now, nohz.next_balance))
+		return 0;
+
+	if (!rq->nr_running)
+		return 0;
+
+	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+		return 0;
+
+	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+	if (ret == nr_cpu_ids || ret == cpu) {
+		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+		if (rq->nr_running > 1)
+			return 1;
+	} else {
+		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+		if (ret == nr_cpu_ids || ret == cpu) {
+			if (rq->nr_running)
+				return 1;
+		}
+	}
+	return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
 static void run_rebalance_domains(struct softirq_action *h)
 {
 	int this_cpu = smp_processor_id();
@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
 	rebalance_domains(this_cpu, idle);
 
-#ifdef CONFIG_NO_HZ
 	/*
-	 * If this cpu is the owner for idle load balancing, then do the
+	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
-	if (this_rq->idle_at_tick &&
-	    atomic_read(&nohz.load_balancer) == this_cpu) {
-		struct rq *rq;
-		int balance_cpu;
-
-		for_each_cpu(balance_cpu, nohz.cpu_mask) {
-			if (balance_cpu == this_cpu)
-				continue;
-
-			/*
-			 * If this cpu gets work to do, stop the load balancing
-			 * work being done for other cpus. Next load
-			 * balancing owner will pick it up.
-			 */
-			if (need_resched())
-				break;
-
-			rq = cpu_rq(balance_cpu);
-			raw_spin_lock_irq(&rq->lock);
-			update_cpu_load(rq);
-			raw_spin_unlock_irq(&rq->lock);
-			rebalance_domains(balance_cpu, CPU_IDLE);
-
-			if (time_after(this_rq->next_balance, rq->next_balance))
-				this_rq->next_balance = rq->next_balance;
-		}
-	}
-#endif
+	nohz_idle_balance(this_cpu, idle);
 }
 
 static inline int on_null_domain(int cpu)
@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu)
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
  */
 static inline void trigger_load_balance(struct rq *rq, int cpu)
 {
-#ifdef CONFIG_NO_HZ
-	/*
-	 * If we were in the nohz mode recently and busy at the current
-	 * scheduler tick, then check if we need to nominate new idle
-	 * load balancer.
-	 */
-	if (rq->in_nohz_recently && !rq->idle_at_tick) {
-		rq->in_nohz_recently = 0;
-
-		if (atomic_read(&nohz.load_balancer) == cpu) {
-			cpumask_clear_cpu(cpu, nohz.cpu_mask);
-			atomic_set(&nohz.load_balancer, -1);
-		}
-
-		if (atomic_read(&nohz.load_balancer) == -1) {
-			int ilb = find_new_ilb(cpu);
-
-			if (ilb < nr_cpu_ids)
-				resched_cpu(ilb);
-		}
-	}
-
-	/*
-	 * If this cpu is idle and doing idle load balancing for all the
-	 * cpus with ticks stopped, is it time for that to stop?
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-		resched_cpu(cpu);
-		return;
-	}
-
-	/*
-	 * If this cpu is idle and the idle load balancing is done by
-	 * someone else, then no need raise the SCHED_SOFTIRQ
-	 */
-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-	    cpumask_test_cpu(cpu, nohz.cpu_mask))
-		return;
-#endif
 	/* Don't need to rebalance while attached to NULL domain */
 	if (time_after_eq(jiffies, rq->next_balance) &&
 	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+		nohz_balancer_kick(cpu);
+#endif
 }
 
 static void rq_online_fair(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc1c034..5f171f04ab00 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * the scheduler tick in nohz_restart_sched_tick.
 		 */
 		if (!ts->tick_stopped) {
-			if (select_nohz_load_balancer(1)) {
-				/*
-				 * sched tick not stopped!
-				 */
-				cpumask_clear_cpu(cpu, nohz_cpu_mask);
-				goto out;
-			}
+			select_nohz_load_balancer(1);
 
 			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
 			ts->tick_stopped = 1;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..48d6aec0789c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 	cpu = smp_processor_id();
 
 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			cpu = preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
+		cpu = get_nohz_timer_target();
 #endif
 	new_base = per_cpu(tvec_bases, cpu);
 
-- 
cgit v1.2.3-59-g8ed1b


From 9d5efe05eb0c904545a28b19c18b949f23334de0 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Tue, 8 Jun 2010 14:57:02 +1000
Subject: sched: Fix capacity calculations for SMT4

Handle cpu capacity being reported as 0 on cores with more number of
hardware threads. For example on a Power7 core with 4 hardware
threads, core power is 1177 and thus power of each hardware thread is
1177/4 = 294. This low power can lead to capacity for each hardware
thread being calculated as 0, which leads to tasks bouncing within the
core madly!

Fix this by reporting capacity for hardware threads as 1, provided
their power is not scaled down significantly because of frequency
scaling or real-time tasks usage of cpu.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <20100608045702.21D03CC895@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 +-
 kernel/sched_fair.c   | 53 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 44 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a3e5b1cd0438..c731296e5e93 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -857,7 +857,7 @@ struct sched_group {
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU.
 	 */
-	unsigned int cpu_power;
+	unsigned int cpu_power, cpu_power_orig;
 
 	/*
 	 * The CPUs this group covers.
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6ee2e0af665b..b9b3462483b7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2285,13 +2285,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
 
-	if (sched_feat(ARCH_POWER))
-		power *= arch_scale_freq_power(sd, cpu);
-	else
-		power *= default_scale_freq_power(sd, cpu);
-
-	power >>= SCHED_LOAD_SHIFT;
-
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
 		if (sched_feat(ARCH_POWER))
 			power *= arch_scale_smt_power(sd, cpu);
@@ -2301,6 +2294,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power >>= SCHED_LOAD_SHIFT;
 	}
 
+	sdg->cpu_power_orig = power;
+
+	if (sched_feat(ARCH_POWER))
+		power *= arch_scale_freq_power(sd, cpu);
+	else
+		power *= default_scale_freq_power(sd, cpu);
+
+	power >>= SCHED_LOAD_SHIFT;
+
 	power *= scale_rt_power(cpu);
 	power >>= SCHED_LOAD_SHIFT;
 
@@ -2333,6 +2335,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 	sdg->cpu_power = power;
 }
 
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+	/*
+	 * Only siblings can have significantly less than SCHED_LOAD_SCALE
+	 */
+	if (sd->level != SD_LV_SIBLING)
+		return 0;
+
+	/*
+	 * If ~90% of the cpu_power is still there, we're good.
+	 */
+	if (group->cpu_power * 32 < group->cpu_power_orig * 29)
+		return 1;
+
+	return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @sd: The sched_domain whose statistics are to be updated.
@@ -2426,6 +2453,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
 	sgs->group_capacity =
 		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	if (!sgs->group_capacity)
+		sgs->group_capacity = fix_small_capacity(sd, group);
 }
 
 /**
@@ -2724,8 +2753,9 @@ ret:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-		   unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+		   enum cpu_idle_type idle, unsigned long imbalance,
+		   const struct cpumask *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
@@ -2736,6 +2766,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
 		unsigned long wl;
 
+		if (!capacity)
+			capacity = fix_small_capacity(sd, group);
+
 		if (!cpumask_test_cpu(i, cpus))
 			continue;
 
@@ -2852,7 +2885,7 @@ redo:
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle, imbalance, cpus);
+	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
-- 
cgit v1.2.3-59-g8ed1b


From 532cb4c401e225b084c14d6bd6a2f8ee561de2f1 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Tue, 8 Jun 2010 14:57:02 +1000
Subject: sched: Add asymmetric group packing option for sibling domain

Check to see if the group is packed in a sched doman.

This is primarily intended to used at the sibling level.  Some cores
like POWER7 prefer to use lower numbered SMT threads.  In the case of
POWER7, it can move to lower SMT modes only when higher threads are
idle.  When in lower SMT modes, the threads will perform better since
they share less core resources.  Hence when we have idle threads, we
want them to be the higher ones.

This adds a hook into f_b_g() called check_asym_packing() to check the
packing.  This packing function is run on idle threads.  It checks to
see if the busiest CPU in this domain (core in the P7 case) has a
higher CPU number than what where the packing function is being run
on.  If it is, calculate the imbalance and return the higher busier
thread as the busiest group to f_b_g().  Here we are assuming a lower
CPU number will be equivalent to a lower SMT thread number.

It also creates a new SD_ASYM_PACKING flag to enable this feature at
any scheduler domain level.

It also creates an arch hook to enable this feature at the sibling
level.  The default function doesn't enable this feature.

Based heavily on patch from Peter Zijlstra.
Fixes from Srivatsa Vaddagiri.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <20100608045702.2936CCC897@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    |   4 +-
 include/linux/topology.h |   1 +
 kernel/sched_fair.c      | 139 +++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 126 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c731296e5e93..ff154e10752b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -801,7 +801,7 @@ enum cpu_idle_type {
 #define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-
+#define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 
 enum powersavings_balance_level {
@@ -836,6 +836,8 @@ static inline int sd_balance_for_package_power(void)
 	return SD_PREFER_SIBLING;
 }
 
+extern int __weak arch_sd_sibiling_asym_packing(void);
+
 /*
  * Optimise SD flags for power savings:
  * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
diff --git a/include/linux/topology.h b/include/linux/topology.h
index c44df50a05ab..cf57f30d0dcb 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -103,6 +103,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
 				| 0*SD_PREFER_SIBLING			\
+				| arch_sd_sibiling_asym_packing()	\
 				,					\
 	.last_balance		= jiffies,				\
 	.balance_interval	= 1,					\
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b9b3462483b7..593424f91a8a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2457,12 +2457,54 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 		sgs->group_capacity = fix_small_capacity(sd, group);
 }
 
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sds: sched_group statistics
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+				   struct sd_lb_stats *sds,
+				   struct sched_group *sg,
+				   struct sg_lb_stats *sgs,
+				   int this_cpu)
+{
+	if (sgs->avg_load <= sds->max_load)
+		return false;
+
+	if (sgs->sum_nr_running > sgs->group_capacity)
+		return true;
+
+	if (sgs->group_imb)
+		return true;
+
+	/*
+	 * ASYM_PACKING needs to move all the work to the lowest
+	 * numbered CPUs in the group, therefore mark all groups
+	 * higher than ourself as busy.
+	 */
+	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+	    this_cpu < group_first_cpu(sg)) {
+		if (!sds->busiest)
+			return true;
+
+		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+			return true;
+	}
+
+	return false;
+}
+
 /**
  * update_sd_lb_stats - Update sched_group's statistics for load balancing.
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
@@ -2473,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
-	struct sched_group *group = sd->groups;
+	struct sched_group *sg = sd->groups;
 	struct sg_lb_stats sgs;
 	int load_idx, prefer_sibling = 0;
 
@@ -2486,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	do {
 		int local_group;
 
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
+		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
 				local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
 			return;
 
 		sds->total_load += sgs.group_load;
-		sds->total_pwr += group->cpu_power;
+		sds->total_pwr += sg->cpu_power;
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
-		 * first, lower the group capacity to one so that we'll try
+		 * first, lower the sg capacity to one so that we'll try
 		 * and move all the excess tasks away.
 		 */
 		if (prefer_sibling)
@@ -2508,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		if (local_group) {
 			sds->this_load = sgs.avg_load;
-			sds->this = group;
+			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > sds->max_load &&
-			   (sgs.sum_nr_running > sgs.group_capacity ||
-				sgs.group_imb)) {
+		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
-			sds->busiest = group;
+			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
 			sds->group_imb = sgs.group_imb;
 		}
 
-		update_sd_power_savings_stats(group, sds, local_group, &sgs);
-		group = group->next;
-	} while (group != sd->groups);
+		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		sg = sg->next;
+	} while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibiling_asym_packing(void)
+{
+       return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *			sched doman.
+ *
+ * This is primarily intended to used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+			      struct sd_lb_stats *sds,
+			      int this_cpu, unsigned long *imbalance)
+{
+	int busiest_cpu;
+
+	if (!(sd->flags & SD_ASYM_PACKING))
+		return 0;
+
+	if (!sds->busiest)
+		return 0;
+
+	busiest_cpu = group_first_cpu(sds->busiest);
+	if (this_cpu > busiest_cpu)
+		return 0;
+
+	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+				       SCHED_LOAD_SCALE);
+	return 1;
 }
 
 /**
@@ -2719,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!(*balance))
 		goto ret;
 
+	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+	    check_asym_packing(sd, &sds, this_cpu, imbalance))
+		return sds.busiest;
+
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
@@ -2808,9 +2902,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
+
+		/*
+		 * ASYM_PACKING needs to force migrate tasks from busy but
+		 * higher numbered CPUs in order to pack all tasks in the
+		 * lowest numbered CPUs.
+		 */
+		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+			return 1;
+
 		/*
 		 * The only task running in a non-idle cpu can be moved to this
 		 * cpu in an attempt to completely freeup the other CPU
@@ -2929,7 +3033,8 @@ redo:
 		schedstat_inc(sd, lb_failed[idle]);
 		sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle)) {
+		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+					this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
-- 
cgit v1.2.3-59-g8ed1b


From ecc55f84b2e9741f29daa787ded93986df6cbe17 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 21 May 2010 15:11:34 +0200
Subject: perf, trace: Inline perf_swevent_put_recursion_context()

Inline perf_swevent_put_recursion_context into perf_tp_event(), this
shrinks the per trace template code footprint and saves a function
call.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h | 3 +--
 include/linux/perf_event.h   | 2 +-
 kernel/perf_event.c          | 8 ++++----
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 3167f2df4126..0af31cd335d6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -257,8 +257,7 @@ static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
 		       u64 count, struct pt_regs *regs, void *head)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head);
-	perf_swevent_put_recursion_context(rctx);
+	perf_tp_event(addr, count, raw_data, size, regs, head, rctx);
 }
 #endif
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5d0266d94985..c691a0b27bcd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1001,7 +1001,7 @@ static inline bool perf_paranoid_kernel(void)
 extern void perf_event_init(void);
 extern void perf_tp_event(u64 addr, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
-			  struct hlist_head *head);
+			  struct hlist_head *head, int rctx);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..4bd3b597bcca 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4213,14 +4213,12 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-void perf_swevent_put_recursion_context(int rctx)
+void inline perf_swevent_put_recursion_context(int rctx)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	barrier();
 	cpuctx->recursion[rctx]--;
 }
-EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
-
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
@@ -4601,7 +4599,7 @@ static int perf_tp_event_match(struct perf_event *event,
 }
 
 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-		   struct pt_regs *regs, struct hlist_head *head)
+		   struct pt_regs *regs, struct hlist_head *head, int rctx)
 {
 	struct perf_sample_data data;
 	struct perf_event *event;
@@ -4621,6 +4619,8 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 			perf_swevent_add(event, count, 1, &data, regs);
 	}
 	rcu_read_unlock();
+
+	perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-- 
cgit v1.2.3-59-g8ed1b


From 3af9e859281bda7eb7c20b51879cf43aa788ac2e Mon Sep 17 00:00:00 2001
From: Eric B Munson <ebmunson@us.ibm.com>
Date: Tue, 18 May 2010 15:30:49 +0100
Subject: perf: Add non-exec mmap() tracking

Add the capacility to track data mmap()s. This can be used together
with PERF_SAMPLE_ADDR for data profiling.

Signed-off-by: Anton Blanchard <anton@samba.org>
[Updated code for stable perf ABI]
Signed-off-by: Eric B Munson <ebmunson@us.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1274193049-25997-1-git-send-email-ebmunson@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/exec.c                   |  1 +
 include/linux/perf_event.h  | 12 +++---------
 kernel/perf_event.c         | 34 +++++++++++++++++++++++-----------
 mm/mmap.c                   |  6 +++++-
 tools/perf/builtin-record.c |  4 +++-
 5 files changed, 35 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index e19de6a80339..97d91a03fb13 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -653,6 +653,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else
 		stack_base = vma->vm_start - stack_expand;
 #endif
+	current->mm->start_stack = bprm->p;
 	ret = expand_stack(vma, stack_base);
 	if (ret)
 		ret = -EFAULT;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c691a0b27bcd..36efad90cd43 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -214,8 +214,9 @@ struct perf_event_attr {
 				 *  See also PERF_RECORD_MISC_EXACT_IP
 				 */
 				precise_ip     :  2, /* skid constraint       */
+				mmap_data      :  1, /* non-exec mmap data    */
 
-				__reserved_1   : 47;
+				__reserved_1   : 46;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -962,14 +963,7 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 	}
 }
 
-extern void __perf_event_mmap(struct vm_area_struct *vma);
-
-static inline void perf_event_mmap(struct vm_area_struct *vma)
-{
-	if (vma->vm_flags & VM_EXEC)
-		__perf_event_mmap(vma);
-}
-
+extern void perf_event_mmap(struct vm_area_struct *vma);
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b39bec346e80..227ed9c8ec34 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1891,7 +1891,7 @@ static void free_event(struct perf_event *event)
 
 	if (!event->parent) {
 		atomic_dec(&nr_events);
-		if (event->attr.mmap)
+		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
 			atomic_dec(&nr_comm_events);
@@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event,
 /*
  * task tracking -- fork/exit
  *
- * enabled by: attr.comm | attr.mmap | attr.task
+ * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
  */
 
 struct perf_task_event {
@@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event)
 	if (event->cpu != -1 && event->cpu != smp_processor_id())
 		return 0;
 
-	if (event->attr.comm || event->attr.mmap || event->attr.task)
+	if (event->attr.comm || event->attr.mmap ||
+	    event->attr.mmap_data || event->attr.task)
 		return 1;
 
 	return 0;
@@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event,
 }
 
 static int perf_event_mmap_match(struct perf_event *event,
-				   struct perf_mmap_event *mmap_event)
+				   struct perf_mmap_event *mmap_event,
+				   int executable)
 {
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
@@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event,
 	if (event->cpu != -1 && event->cpu != smp_processor_id())
 		return 0;
 
-	if (event->attr.mmap)
+	if ((!executable && event->attr.mmap_data) ||
+	    (executable && event->attr.mmap))
 		return 1;
 
 	return 0;
 }
 
 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
-				  struct perf_mmap_event *mmap_event)
+				  struct perf_mmap_event *mmap_event,
+				  int executable)
 {
 	struct perf_event *event;
 
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (perf_event_mmap_match(event, mmap_event))
+		if (perf_event_mmap_match(event, mmap_event, executable))
 			perf_event_mmap_output(event, mmap_event);
 	}
 }
@@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		if (!vma->vm_mm) {
 			name = strncpy(tmp, "[vdso]", sizeof(tmp));
 			goto got_name;
+		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
+				vma->vm_end >= vma->vm_mm->brk) {
+			name = strncpy(tmp, "[heap]", sizeof(tmp));
+			goto got_name;
+		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
+				vma->vm_end >= vma->vm_mm->start_stack) {
+			name = strncpy(tmp, "[stack]", sizeof(tmp));
+			goto got_name;
 		}
 
 		name = strncpy(tmp, "//anon", sizeof(tmp));
@@ -3846,17 +3858,17 @@ got_name:
 
 	rcu_read_lock();
 	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
+	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
 	ctx = rcu_dereference(current->perf_event_ctxp);
 	if (ctx)
-		perf_event_mmap_ctx(ctx, mmap_event);
+		perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
 	put_cpu_var(perf_cpu_context);
 	rcu_read_unlock();
 
 	kfree(buf);
 }
 
-void __perf_event_mmap(struct vm_area_struct *vma)
+void perf_event_mmap(struct vm_area_struct *vma)
 {
 	struct perf_mmap_event mmap_event;
 
@@ -4911,7 +4923,7 @@ done:
 
 	if (!event->parent) {
 		atomic_inc(&nr_events);
-		if (event->attr.mmap)
+		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
 			atomic_inc(&nr_comm_events);
diff --git a/mm/mmap.c b/mm/mmap.c
index 456ec6f27889..e38e910cb756 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1734,8 +1734,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 		grow = (address - vma->vm_end) >> PAGE_SHIFT;
 
 		error = acct_stack_growth(vma, size, grow);
-		if (!error)
+		if (!error) {
 			vma->vm_end = address;
+			perf_event_mmap(vma);
+		}
 	}
 	anon_vma_unlock(vma);
 	return error;
@@ -1781,6 +1783,7 @@ static int expand_downwards(struct vm_area_struct *vma,
 		if (!error) {
 			vma->vm_start = address;
 			vma->vm_pgoff -= grow;
+			perf_event_mmap(vma);
 		}
 	}
 	anon_vma_unlock(vma);
@@ -2208,6 +2211,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	vma->vm_page_prot = vm_get_page_prot(flags);
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 out:
+	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED) {
 		if (!mlock_vma_pages_range(vma, addr, addr + len))
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 5e5c6403a315..39c7247bc54a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -268,8 +268,10 @@ static void create_counter(int counter, int cpu)
 	if (inherit_stat)
 		attr->inherit_stat = 1;
 
-	if (sample_address)
+	if (sample_address) {
 		attr->sample_type	|= PERF_SAMPLE_ADDR;
+		attr->mmap_data = track;
+	}
 
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
-- 
cgit v1.2.3-59-g8ed1b


From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 25 May 2010 17:49:05 +0200
Subject: perf: Cleanup {start,commit,cancel}_txn details

Clarify some of the transactional group scheduling API details
and change it so that a successfull ->commit_txn also closes
the transaction.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1274803086.5882.1752.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_event.c |  7 ++++---
 arch/sparc/kernel/perf_event.c   |  7 ++++---
 arch/x86/kernel/cpu/perf_event.c | 14 +++++---------
 include/linux/perf_event.h       | 27 ++++++++++++++++++++++-----
 kernel/perf_event.c              |  9 +--------
 5 files changed, 36 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 43b83c35cf54..ac2a8c2554d9 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -754,7 +754,7 @@ static int power_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuhw->group_flag & PERF_EVENT_TXN)
 		goto nocheck;
 
 	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
@@ -858,7 +858,7 @@ void power_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag |= PERF_EVENT_TXN;
 	cpuhw->n_txn_start = cpuhw->n_events;
 }
 
@@ -871,7 +871,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 }
 
 /*
@@ -897,6 +897,7 @@ int power_pmu_commit_txn(const struct pmu *pmu)
 	for (i = cpuhw->n_txn_start; i < n; ++i)
 		cpuhw->event[i]->hw.config = cpuhw->events[i];
 
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 	return 0;
 }
 
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 0ec92c8861dd..beeb92fa3acd 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1005,7 +1005,7 @@ static int sparc_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto nocheck;
 
 	if (check_excludes(cpuc->event, n0, 1))
@@ -1102,7 +1102,7 @@ static void sparc_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag |= PERF_EVENT_TXN;
 }
 
 /*
@@ -1114,7 +1114,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuhw->group_flag &= ~PERF_EVENT_TXN;
 }
 
 /*
@@ -1137,6 +1137,7 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu)
 	if (sparc_check_constraints(cpuc->event, cpuc->events, n))
 		return -EAGAIN;
 
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d65a18..af04c6fa59cb 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	 * skip the schedulability test here, it will be peformed
 	 * at commit time(->commit_txn) as a whole
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto out;
 
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event)
 	 * The events never got scheduled and ->cancel_txn will truncate
 	 * the event_list.
 	 */
-	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+	if (cpuc->group_flag & PERF_EVENT_TXN)
 		return;
 
 	x86_pmu_stop(event);
@@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag |= PERF_EVENT_TXN;
 	cpuc->n_txn = 0;
 }
 
@@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 	/*
 	 * Truncate the collected events.
 	 */
@@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
-	/*
-	 * Clear out the txn count so that ->cancel_txn() which gets
-	 * run after ->commit_txn() doesn't undo things.
-	 */
-	cpuc->n_txn = 0;
+	cpuc->group_flag &= ~PERF_EVENT_TXN;
 
 	return 0;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 36efad90cd43..f1b6ba0770e0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -549,7 +549,10 @@ struct hw_perf_event {
 
 struct perf_event;
 
-#define PERF_EVENT_TXN_STARTED 1
+/*
+ * Common implementation detail of pmu::{start,commit,cancel}_txn
+ */
+#define PERF_EVENT_TXN 0x1
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -563,14 +566,28 @@ struct pmu {
 	void (*unthrottle)		(struct perf_event *event);
 
 	/*
-	 * group events scheduling is treated as a transaction,
-	 * add group events as a whole and perform one schedulability test.
-	 * If test fails, roll back the whole group
+	 * Group events scheduling is treated as a transaction, add group
+	 * events as a whole and perform one schedulability test. If the test
+	 * fails, roll back the whole group
 	 */
 
+	/*
+	 * Start the transaction, after this ->enable() doesn't need
+	 * to do schedulability tests.
+	 */
 	void (*start_txn)	(const struct pmu *pmu);
-	void (*cancel_txn)	(const struct pmu *pmu);
+	/*
+	 * If ->start_txn() disabled the ->enable() schedulability test
+	 * then ->commit_txn() is required to perform one. On success
+	 * the transaction is closed. On error the transaction is kept
+	 * open until ->cancel_txn() is called.
+	 */
 	int  (*commit_txn)	(const struct pmu *pmu);
+	/*
+	 * Will cancel the transaction, assumes ->disable() is called for
+	 * each successfull ->enable() during the transaction.
+	 */
+	void (*cancel_txn)	(const struct pmu *pmu);
 };
 
 /**
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 227ed9c8ec34..6f60920772b3 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event,
 	struct perf_event *event, *partial_group = NULL;
 	const struct pmu *pmu = group_event->pmu;
 	bool txn = false;
-	int ret;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
@@ -703,15 +702,9 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!txn)
+	if (!txn || !pmu->commit_txn(pmu))
 		return 0;
 
-	ret = pmu->commit_txn(pmu);
-	if (!ret) {
-		pmu->cancel_txn(pmu);
-		return 0;
-	}
-
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
-- 
cgit v1.2.3-59-g8ed1b


From ca5135e6b4a3cbc7e187737520fbc4b508f6f7a2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 28 May 2010 19:33:23 +0200
Subject: perf: Rename perf_mmap_data to perf_buffer

Rename to clarify code.

s/perf_mmap_data/perf_buffer/g and selective s/data/buffer/g

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |   6 +-
 kernel/perf_event.c        | 308 ++++++++++++++++++++++-----------------------
 2 files changed, 157 insertions(+), 157 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f1b6ba0770e0..2a0da021c23f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -602,7 +602,7 @@ enum perf_event_active_state {
 
 struct file;
 
-struct perf_mmap_data {
+struct perf_buffer {
 	atomic_t			refcount;
 	struct rcu_head			rcu_head;
 #ifdef CONFIG_PERF_USE_VMALLOC
@@ -727,7 +727,7 @@ struct perf_event {
 	atomic_t			mmap_count;
 	int				mmap_locked;
 	struct user_struct		*mmap_user;
-	struct perf_mmap_data		*data;
+	struct perf_buffer		*buffer;
 
 	/* poll related */
 	wait_queue_head_t		waitq;
@@ -825,7 +825,7 @@ struct perf_cpu_context {
 
 struct perf_output_handle {
 	struct perf_event		*event;
-	struct perf_mmap_data		*data;
+	struct perf_buffer		*buffer;
 	unsigned long			wakeup;
 	unsigned long			size;
 	void				*addr;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6f60920772b3..93d545801e43 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1876,7 +1876,7 @@ static void free_event_rcu(struct rcu_head *head)
 }
 
 static void perf_pending_sync(struct perf_event *event);
-static void perf_mmap_data_put(struct perf_mmap_data *data);
+static void perf_buffer_put(struct perf_buffer *buffer);
 
 static void free_event(struct perf_event *event)
 {
@@ -1892,9 +1892,9 @@ static void free_event(struct perf_event *event)
 			atomic_dec(&nr_task_events);
 	}
 
-	if (event->data) {
-		perf_mmap_data_put(event->data);
-		event->data = NULL;
+	if (event->buffer) {
+		perf_buffer_put(event->buffer);
+		event->buffer = NULL;
 	}
 
 	if (event->destroy)
@@ -2119,13 +2119,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_event *event = file->private_data;
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 	unsigned int events = POLL_HUP;
 
 	rcu_read_lock();
-	data = rcu_dereference(event->data);
-	if (data)
-		events = atomic_xchg(&data->poll, 0);
+	buffer = rcu_dereference(event->buffer);
+	if (buffer)
+		events = atomic_xchg(&buffer->poll, 0);
 	rcu_read_unlock();
 
 	poll_wait(file, &event->waitq, wait);
@@ -2335,14 +2335,14 @@ static int perf_event_index(struct perf_event *event)
 void perf_event_update_userpage(struct perf_event *event)
 {
 	struct perf_event_mmap_page *userpg;
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 
 	rcu_read_lock();
-	data = rcu_dereference(event->data);
-	if (!data)
+	buffer = rcu_dereference(event->buffer);
+	if (!buffer)
 		goto unlock;
 
-	userpg = data->user_page;
+	userpg = buffer->user_page;
 
 	/*
 	 * Disable preemption so as to not let the corresponding user-space
@@ -2376,15 +2376,15 @@ unlock:
  */
 
 static struct page *
-perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
 {
-	if (pgoff > data->nr_pages)
+	if (pgoff > buffer->nr_pages)
 		return NULL;
 
 	if (pgoff == 0)
-		return virt_to_page(data->user_page);
+		return virt_to_page(buffer->user_page);
 
-	return virt_to_page(data->data_pages[pgoff - 1]);
+	return virt_to_page(buffer->data_pages[pgoff - 1]);
 }
 
 static void *perf_mmap_alloc_page(int cpu)
@@ -2400,42 +2400,42 @@ static void *perf_mmap_alloc_page(int cpu)
 	return page_address(page);
 }
 
-static struct perf_mmap_data *
-perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+static struct perf_buffer *
+perf_buffer_alloc(struct perf_event *event, int nr_pages)
 {
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 	unsigned long size;
 	int i;
 
-	size = sizeof(struct perf_mmap_data);
+	size = sizeof(struct perf_buffer);
 	size += nr_pages * sizeof(void *);
 
-	data = kzalloc(size, GFP_KERNEL);
-	if (!data)
+	buffer = kzalloc(size, GFP_KERNEL);
+	if (!buffer)
 		goto fail;
 
-	data->user_page = perf_mmap_alloc_page(event->cpu);
-	if (!data->user_page)
+	buffer->user_page = perf_mmap_alloc_page(event->cpu);
+	if (!buffer->user_page)
 		goto fail_user_page;
 
 	for (i = 0; i < nr_pages; i++) {
-		data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
-		if (!data->data_pages[i])
+		buffer->data_pages[i] = perf_mmap_alloc_page(event->cpu);
+		if (!buffer->data_pages[i])
 			goto fail_data_pages;
 	}
 
-	data->nr_pages = nr_pages;
+	buffer->nr_pages = nr_pages;
 
-	return data;
+	return buffer;
 
 fail_data_pages:
 	for (i--; i >= 0; i--)
-		free_page((unsigned long)data->data_pages[i]);
+		free_page((unsigned long)buffer->data_pages[i]);
 
-	free_page((unsigned long)data->user_page);
+	free_page((unsigned long)buffer->user_page);
 
 fail_user_page:
-	kfree(data);
+	kfree(buffer);
 
 fail:
 	return NULL;
@@ -2449,17 +2449,17 @@ static void perf_mmap_free_page(unsigned long addr)
 	__free_page(page);
 }
 
-static void perf_mmap_data_free(struct perf_mmap_data *data)
+static void perf_buffer_free(struct perf_buffer *buffer)
 {
 	int i;
 
-	perf_mmap_free_page((unsigned long)data->user_page);
-	for (i = 0; i < data->nr_pages; i++)
-		perf_mmap_free_page((unsigned long)data->data_pages[i]);
-	kfree(data);
+	perf_mmap_free_page((unsigned long)buffer->user_page);
+	for (i = 0; i < buffer->nr_pages; i++)
+		perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
+	kfree(buffer);
 }
 
-static inline int page_order(struct perf_mmap_data *data)
+static inline int page_order(struct perf_buffer *buffer)
 {
 	return 0;
 }
@@ -2472,18 +2472,18 @@ static inline int page_order(struct perf_mmap_data *data)
  * Required for architectures that have d-cache aliasing issues.
  */
 
-static inline int page_order(struct perf_mmap_data *data)
+static inline int page_order(struct perf_buffer *buffer)
 {
-	return data->page_order;
+	return buffer->page_order;
 }
 
 static struct page *
-perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
 {
-	if (pgoff > (1UL << page_order(data)))
+	if (pgoff > (1UL << page_order(buffer)))
 		return NULL;
 
-	return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
+	return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
 }
 
 static void perf_mmap_unmark_page(void *addr)
@@ -2493,57 +2493,57 @@ static void perf_mmap_unmark_page(void *addr)
 	page->mapping = NULL;
 }
 
-static void perf_mmap_data_free_work(struct work_struct *work)
+static void perf_buffer_free_work(struct work_struct *work)
 {
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 	void *base;
 	int i, nr;
 
-	data = container_of(work, struct perf_mmap_data, work);
-	nr = 1 << page_order(data);
+	buffer = container_of(work, struct perf_buffer, work);
+	nr = 1 << page_order(buffer);
 
-	base = data->user_page;
+	base = buffer->user_page;
 	for (i = 0; i < nr + 1; i++)
 		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
 
 	vfree(base);
-	kfree(data);
+	kfree(buffer);
 }
 
-static void perf_mmap_data_free(struct perf_mmap_data *data)
+static void perf_buffer_free(struct perf_buffer *buffer)
 {
-	schedule_work(&data->work);
+	schedule_work(&buffer->work);
 }
 
-static struct perf_mmap_data *
-perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+static struct perf_buffer *
+perf_buffer_alloc(struct perf_event *event, int nr_pages)
 {
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 	unsigned long size;
 	void *all_buf;
 
-	size = sizeof(struct perf_mmap_data);
+	size = sizeof(struct perf_buffer);
 	size += sizeof(void *);
 
-	data = kzalloc(size, GFP_KERNEL);
-	if (!data)
+	buffer = kzalloc(size, GFP_KERNEL);
+	if (!buffer)
 		goto fail;
 
-	INIT_WORK(&data->work, perf_mmap_data_free_work);
+	INIT_WORK(&buffer->work, perf_buffer_free_work);
 
 	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
 	if (!all_buf)
 		goto fail_all_buf;
 
-	data->user_page = all_buf;
-	data->data_pages[0] = all_buf + PAGE_SIZE;
-	data->page_order = ilog2(nr_pages);
-	data->nr_pages = 1;
+	buffer->user_page = all_buf;
+	buffer->data_pages[0] = all_buf + PAGE_SIZE;
+	buffer->page_order = ilog2(nr_pages);
+	buffer->nr_pages = 1;
 
-	return data;
+	return buffer;
 
 fail_all_buf:
-	kfree(data);
+	kfree(buffer);
 
 fail:
 	return NULL;
@@ -2551,15 +2551,15 @@ fail:
 
 #endif
 
-static unsigned long perf_data_size(struct perf_mmap_data *data)
+static unsigned long perf_data_size(struct perf_buffer *buffer)
 {
-	return data->nr_pages << (PAGE_SHIFT + page_order(data));
+	return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
 }
 
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_event *event = vma->vm_file->private_data;
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 	int ret = VM_FAULT_SIGBUS;
 
 	if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -2569,14 +2569,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 
 	rcu_read_lock();
-	data = rcu_dereference(event->data);
-	if (!data)
+	buffer = rcu_dereference(event->buffer);
+	if (!buffer)
 		goto unlock;
 
 	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
 		goto unlock;
 
-	vmf->page = perf_mmap_to_page(data, vmf->pgoff);
+	vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
 	if (!vmf->page)
 		goto unlock;
 
@@ -2592,51 +2592,51 @@ unlock:
 }
 
 static void
-perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
+perf_buffer_init(struct perf_event *event, struct perf_buffer *buffer)
 {
-	long max_size = perf_data_size(data);
+	long max_size = perf_data_size(buffer);
 
 	if (event->attr.watermark) {
-		data->watermark = min_t(long, max_size,
+		buffer->watermark = min_t(long, max_size,
 					event->attr.wakeup_watermark);
 	}
 
-	if (!data->watermark)
-		data->watermark = max_size / 2;
+	if (!buffer->watermark)
+		buffer->watermark = max_size / 2;
 
-	atomic_set(&data->refcount, 1);
-	rcu_assign_pointer(event->data, data);
+	atomic_set(&buffer->refcount, 1);
+	rcu_assign_pointer(event->buffer, buffer);
 }
 
-static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
+static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
 {
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 
-	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
-	perf_mmap_data_free(data);
+	buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
+	perf_buffer_free(buffer);
 }
 
-static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
+static struct perf_buffer *perf_buffer_get(struct perf_event *event)
 {
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 
 	rcu_read_lock();
-	data = rcu_dereference(event->data);
-	if (data) {
-		if (!atomic_inc_not_zero(&data->refcount))
-			data = NULL;
+	buffer = rcu_dereference(event->buffer);
+	if (buffer) {
+		if (!atomic_inc_not_zero(&buffer->refcount))
+			buffer = NULL;
 	}
 	rcu_read_unlock();
 
-	return data;
+	return buffer;
 }
 
-static void perf_mmap_data_put(struct perf_mmap_data *data)
+static void perf_buffer_put(struct perf_buffer *buffer)
 {
-	if (!atomic_dec_and_test(&data->refcount))
+	if (!atomic_dec_and_test(&buffer->refcount))
 		return;
 
-	call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
+	call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2651,16 +2651,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		unsigned long size = perf_data_size(event->data);
+		unsigned long size = perf_data_size(event->buffer);
 		struct user_struct *user = event->mmap_user;
-		struct perf_mmap_data *data = event->data;
+		struct perf_buffer *buffer = event->buffer;
 
 		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
 		vma->vm_mm->locked_vm -= event->mmap_locked;
-		rcu_assign_pointer(event->data, NULL);
+		rcu_assign_pointer(event->buffer, NULL);
 		mutex_unlock(&event->mmap_mutex);
 
-		perf_mmap_data_put(data);
+		perf_buffer_put(buffer);
 		free_uid(user);
 	}
 }
@@ -2678,7 +2678,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
 	unsigned long locked, lock_limit;
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 	unsigned long vma_size;
 	unsigned long nr_pages;
 	long user_extra, extra;
@@ -2699,7 +2699,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	nr_pages = (vma_size / PAGE_SIZE) - 1;
 
 	/*
-	 * If we have data pages ensure they're a power-of-two number, so we
+	 * If we have buffer pages ensure they're a power-of-two number, so we
 	 * can do bitmasks instead of modulo.
 	 */
 	if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -2713,9 +2713,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 	mutex_lock(&event->mmap_mutex);
-	if (event->data) {
-		if (event->data->nr_pages == nr_pages)
-			atomic_inc(&event->data->refcount);
+	if (event->buffer) {
+		if (event->buffer->nr_pages == nr_pages)
+			atomic_inc(&event->buffer->refcount);
 		else
 			ret = -EINVAL;
 		goto unlock;
@@ -2745,17 +2745,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	WARN_ON(event->data);
+	WARN_ON(event->buffer);
 
-	data = perf_mmap_data_alloc(event, nr_pages);
-	if (!data) {
+	buffer = perf_buffer_alloc(event, nr_pages);
+	if (!buffer) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
 
-	perf_mmap_data_init(event, data);
+	perf_buffer_init(event, buffer);
 	if (vma->vm_flags & VM_WRITE)
-		event->data->writable = 1;
+		event->buffer->writable = 1;
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	event->mmap_locked = extra;
@@ -2964,15 +2964,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 /*
  * Output
  */
-static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
 			      unsigned long offset, unsigned long head)
 {
 	unsigned long mask;
 
-	if (!data->writable)
+	if (!buffer->writable)
 		return true;
 
-	mask = perf_data_size(data) - 1;
+	mask = perf_data_size(buffer) - 1;
 
 	offset = (offset - tail) & mask;
 	head   = (head   - tail) & mask;
@@ -2985,7 +2985,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
 
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
-	atomic_set(&handle->data->poll, POLL_IN);
+	atomic_set(&handle->buffer->poll, POLL_IN);
 
 	if (handle->nmi) {
 		handle->event->pending_wakeup = 1;
@@ -3005,45 +3005,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
  */
 static void perf_output_get_handle(struct perf_output_handle *handle)
 {
-	struct perf_mmap_data *data = handle->data;
+	struct perf_buffer *buffer = handle->buffer;
 
 	preempt_disable();
-	local_inc(&data->nest);
-	handle->wakeup = local_read(&data->wakeup);
+	local_inc(&buffer->nest);
+	handle->wakeup = local_read(&buffer->wakeup);
 }
 
 static void perf_output_put_handle(struct perf_output_handle *handle)
 {
-	struct perf_mmap_data *data = handle->data;
+	struct perf_buffer *buffer = handle->buffer;
 	unsigned long head;
 
 again:
-	head = local_read(&data->head);
+	head = local_read(&buffer->head);
 
 	/*
 	 * IRQ/NMI can happen here, which means we can miss a head update.
 	 */
 
-	if (!local_dec_and_test(&data->nest))
+	if (!local_dec_and_test(&buffer->nest))
 		goto out;
 
 	/*
 	 * Publish the known good head. Rely on the full barrier implied
-	 * by atomic_dec_and_test() order the data->head read and this
+	 * by atomic_dec_and_test() order the buffer->head read and this
 	 * write.
 	 */
-	data->user_page->data_head = head;
+	buffer->user_page->data_head = head;
 
 	/*
 	 * Now check if we missed an update, rely on the (compiler)
-	 * barrier in atomic_dec_and_test() to re-read data->head.
+	 * barrier in atomic_dec_and_test() to re-read buffer->head.
 	 */
-	if (unlikely(head != local_read(&data->head))) {
-		local_inc(&data->nest);
+	if (unlikely(head != local_read(&buffer->head))) {
+		local_inc(&buffer->nest);
 		goto again;
 	}
 
-	if (handle->wakeup != local_read(&data->wakeup))
+	if (handle->wakeup != local_read(&buffer->wakeup))
 		perf_output_wakeup(handle);
 
  out:
@@ -3063,12 +3063,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
 		buf += size;
 		handle->size -= size;
 		if (!handle->size) {
-			struct perf_mmap_data *data = handle->data;
+			struct perf_buffer *buffer = handle->buffer;
 
 			handle->page++;
-			handle->page &= data->nr_pages - 1;
-			handle->addr = data->data_pages[handle->page];
-			handle->size = PAGE_SIZE << page_order(data);
+			handle->page &= buffer->nr_pages - 1;
+			handle->addr = buffer->data_pages[handle->page];
+			handle->size = PAGE_SIZE << page_order(buffer);
 		}
 	} while (len);
 }
@@ -3077,7 +3077,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 		      struct perf_event *event, unsigned int size,
 		      int nmi, int sample)
 {
-	struct perf_mmap_data *data;
+	struct perf_buffer *buffer;
 	unsigned long tail, offset, head;
 	int have_lost;
 	struct {
@@ -3093,19 +3093,19 @@ int perf_output_begin(struct perf_output_handle *handle,
 	if (event->parent)
 		event = event->parent;
 
-	data = rcu_dereference(event->data);
-	if (!data)
+	buffer = rcu_dereference(event->buffer);
+	if (!buffer)
 		goto out;
 
-	handle->data	= data;
+	handle->buffer	= buffer;
 	handle->event	= event;
 	handle->nmi	= nmi;
 	handle->sample	= sample;
 
-	if (!data->nr_pages)
+	if (!buffer->nr_pages)
 		goto out;
 
-	have_lost = local_read(&data->lost);
+	have_lost = local_read(&buffer->lost);
 	if (have_lost)
 		size += sizeof(lost_event);
 
@@ -3117,30 +3117,30 @@ int perf_output_begin(struct perf_output_handle *handle,
 		 * tail pointer. So that all reads will be completed before the
 		 * write is issued.
 		 */
-		tail = ACCESS_ONCE(data->user_page->data_tail);
+		tail = ACCESS_ONCE(buffer->user_page->data_tail);
 		smp_rmb();
-		offset = head = local_read(&data->head);
+		offset = head = local_read(&buffer->head);
 		head += size;
-		if (unlikely(!perf_output_space(data, tail, offset, head)))
+		if (unlikely(!perf_output_space(buffer, tail, offset, head)))
 			goto fail;
-	} while (local_cmpxchg(&data->head, offset, head) != offset);
+	} while (local_cmpxchg(&buffer->head, offset, head) != offset);
 
-	if (head - local_read(&data->wakeup) > data->watermark)
-		local_add(data->watermark, &data->wakeup);
+	if (head - local_read(&buffer->wakeup) > buffer->watermark)
+		local_add(buffer->watermark, &buffer->wakeup);
 
-	handle->page = offset >> (PAGE_SHIFT + page_order(data));
-	handle->page &= data->nr_pages - 1;
-	handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
-	handle->addr = data->data_pages[handle->page];
+	handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
+	handle->page &= buffer->nr_pages - 1;
+	handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
+	handle->addr = buffer->data_pages[handle->page];
 	handle->addr += handle->size;
-	handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
+	handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
 
 	if (have_lost) {
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.id          = event->id;
-		lost_event.lost        = local_xchg(&data->lost, 0);
+		lost_event.lost        = local_xchg(&buffer->lost, 0);
 
 		perf_output_put(handle, lost_event);
 	}
@@ -3148,7 +3148,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	local_inc(&data->lost);
+	local_inc(&buffer->lost);
 	perf_output_put_handle(handle);
 out:
 	rcu_read_unlock();
@@ -3159,15 +3159,15 @@ out:
 void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_event *event = handle->event;
-	struct perf_mmap_data *data = handle->data;
+	struct perf_buffer *buffer = handle->buffer;
 
 	int wakeup_events = event->attr.wakeup_events;
 
 	if (handle->sample && wakeup_events) {
-		int events = local_inc_return(&data->events);
+		int events = local_inc_return(&buffer->events);
 		if (events >= wakeup_events) {
-			local_sub(wakeup_events, &data->events);
-			local_inc(&data->wakeup);
+			local_sub(wakeup_events, &buffer->events);
+			local_inc(&buffer->wakeup);
 		}
 	}
 
@@ -5010,7 +5010,7 @@ err_size:
 static int
 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 {
-	struct perf_mmap_data *data = NULL, *old_data = NULL;
+	struct perf_buffer *buffer = NULL, *old_buffer = NULL;
 	int ret = -EINVAL;
 
 	if (!output_event)
@@ -5040,19 +5040,19 @@ set:
 
 	if (output_event) {
 		/* get the buffer we want to redirect to */
-		data = perf_mmap_data_get(output_event);
-		if (!data)
+		buffer = perf_buffer_get(output_event);
+		if (!buffer)
 			goto unlock;
 	}
 
-	old_data = event->data;
-	rcu_assign_pointer(event->data, data);
+	old_buffer = event->buffer;
+	rcu_assign_pointer(event->buffer, buffer);
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_data)
-		perf_mmap_data_put(old_data);
+	if (old_buffer)
+		perf_buffer_put(old_buffer);
 out:
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From d57e34fdd60be7ffd0b1d86bfa1a553df86b7172 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 28 May 2010 19:41:35 +0200
Subject: perf: Simplify the ring-buffer logic: make perf_buffer_alloc() do
 everything needed

Currently there are perf_buffer_alloc() + perf_buffer_init() + some
separate bits, fold it all into a single perf_buffer_alloc() and only
leave the attachment to the event separate.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  2 ++
 kernel/perf_event.c        | 61 ++++++++++++++++++++++++++--------------------
 2 files changed, 36 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2a0da021c23f..441992a9775c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -602,6 +602,8 @@ enum perf_event_active_state {
 
 struct file;
 
+#define PERF_BUFFER_WRITABLE		0x01
+
 struct perf_buffer {
 	atomic_t			refcount;
 	struct rcu_head			rcu_head;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 93d545801e43..f75c9c9c8177 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2369,6 +2369,25 @@ unlock:
 	rcu_read_unlock();
 }
 
+static unsigned long perf_data_size(struct perf_buffer *buffer);
+
+static void
+perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
+{
+	long max_size = perf_data_size(buffer);
+
+	if (watermark)
+		buffer->watermark = min(max_size, watermark);
+
+	if (!buffer->watermark)
+		buffer->watermark = max_size / 2;
+
+	if (flags & PERF_BUFFER_WRITABLE)
+		buffer->writable = 1;
+
+	atomic_set(&buffer->refcount, 1);
+}
+
 #ifndef CONFIG_PERF_USE_VMALLOC
 
 /*
@@ -2401,7 +2420,7 @@ static void *perf_mmap_alloc_page(int cpu)
 }
 
 static struct perf_buffer *
-perf_buffer_alloc(struct perf_event *event, int nr_pages)
+perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
 {
 	struct perf_buffer *buffer;
 	unsigned long size;
@@ -2414,18 +2433,20 @@ perf_buffer_alloc(struct perf_event *event, int nr_pages)
 	if (!buffer)
 		goto fail;
 
-	buffer->user_page = perf_mmap_alloc_page(event->cpu);
+	buffer->user_page = perf_mmap_alloc_page(cpu);
 	if (!buffer->user_page)
 		goto fail_user_page;
 
 	for (i = 0; i < nr_pages; i++) {
-		buffer->data_pages[i] = perf_mmap_alloc_page(event->cpu);
+		buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
 		if (!buffer->data_pages[i])
 			goto fail_data_pages;
 	}
 
 	buffer->nr_pages = nr_pages;
 
+	perf_buffer_init(buffer, watermark, flags);
+
 	return buffer;
 
 fail_data_pages:
@@ -2516,7 +2537,7 @@ static void perf_buffer_free(struct perf_buffer *buffer)
 }
 
 static struct perf_buffer *
-perf_buffer_alloc(struct perf_event *event, int nr_pages)
+perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
 {
 	struct perf_buffer *buffer;
 	unsigned long size;
@@ -2540,6 +2561,8 @@ perf_buffer_alloc(struct perf_event *event, int nr_pages)
 	buffer->page_order = ilog2(nr_pages);
 	buffer->nr_pages = 1;
 
+	perf_buffer_init(buffer, watermark, flags);
+
 	return buffer;
 
 fail_all_buf:
@@ -2591,23 +2614,6 @@ unlock:
 	return ret;
 }
 
-static void
-perf_buffer_init(struct perf_event *event, struct perf_buffer *buffer)
-{
-	long max_size = perf_data_size(buffer);
-
-	if (event->attr.watermark) {
-		buffer->watermark = min_t(long, max_size,
-					event->attr.wakeup_watermark);
-	}
-
-	if (!buffer->watermark)
-		buffer->watermark = max_size / 2;
-
-	atomic_set(&buffer->refcount, 1);
-	rcu_assign_pointer(event->buffer, buffer);
-}
-
 static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
 {
 	struct perf_buffer *buffer;
@@ -2682,7 +2688,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long vma_size;
 	unsigned long nr_pages;
 	long user_extra, extra;
-	int ret = 0;
+	int ret = 0, flags = 0;
 
 	/*
 	 * Don't allow mmap() of inherited per-task counters. This would
@@ -2747,15 +2753,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON(event->buffer);
 
-	buffer = perf_buffer_alloc(event, nr_pages);
+	if (vma->vm_flags & VM_WRITE)
+		flags |= PERF_BUFFER_WRITABLE;
+
+	buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
+				   event->cpu, flags);
 	if (!buffer) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
-
-	perf_buffer_init(event, buffer);
-	if (vma->vm_flags & VM_WRITE)
-		event->buffer->writable = 1;
+	rcu_assign_pointer(event->buffer, buffer);
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	event->mmap_locked = extra;
-- 
cgit v1.2.3-59-g8ed1b


From a6e6dea68c18f705957573ee5596097c7e82d0e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 21 May 2010 14:27:58 +0200
Subject: perf: Add perf_event::child_count

Only child counters adding back their values into the parent counter
are responsible for cross-cpu updates to event->count.

So if we pull that out into a new child_count variable, we get an
event->count that is only modified locally.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 1 +
 kernel/perf_event.c        | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 441992a9775c..f34dab9b275e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -671,6 +671,7 @@ struct perf_event {
 	enum perf_event_active_state	state;
 	unsigned int			attach_state;
 	atomic64_t			count;
+	atomic64_t			child_count;
 
 	/*
 	 * These are the total time in nanoseconds that the event
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ab4c0ffc271c..a395fda2d94c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info)
 
 static inline u64 perf_event_count(struct perf_event *event)
 {
-	return atomic64_read(&event->count);
+	return atomic64_read(&event->count) + atomic64_read(&event->child_count);
 }
 
 static u64 perf_event_read(struct perf_event *event)
@@ -5379,7 +5379,7 @@ static void sync_child_event(struct perf_event *child_event,
 	/*
 	 * Add back the child's count to the parent's count:
 	 */
-	atomic64_add(child_val, &parent_event->count);
+	atomic64_add(child_val, &parent_event->child_count);
 	atomic64_add(child_event->total_time_enabled,
 		     &parent_event->child_total_time_enabled);
 	atomic64_add(child_event->total_time_running,
-- 
cgit v1.2.3-59-g8ed1b


From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 21 May 2010 14:43:08 +0200
Subject: perf: Convert perf_event to local_t

Since now all modification to event->count (and ->prev_count
and ->period_left) are local to a cpu, change then to local64_t so we
avoid the LOCK'ed ops.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/perf_event.c     | 18 ++++++++---------
 arch/powerpc/kernel/perf_event.c | 34 ++++++++++++++++----------------
 arch/sh/kernel/perf_event.c      |  6 +++---
 arch/sparc/kernel/perf_event.c   | 18 ++++++++---------
 arch/x86/kernel/cpu/perf_event.c | 18 ++++++++---------
 include/linux/perf_event.h       |  7 ++++---
 kernel/perf_event.c              | 42 ++++++++++++++++++++--------------------
 7 files changed, 72 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index c45768614c8a..5b7cfafc0720 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -164,20 +164,20 @@ armpmu_event_set_period(struct perf_event *event,
 			struct hw_perf_event *hwc,
 			int idx)
 {
-	s64 left = atomic64_read(&hwc->period_left);
+	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
 	int ret = 0;
 
 	if (unlikely(left <= -period)) {
 		left = period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
@@ -185,7 +185,7 @@ armpmu_event_set_period(struct perf_event *event,
 	if (left > (s64)armpmu->max_period)
 		left = armpmu->max_period;
 
-	atomic64_set(&hwc->prev_count, (u64)-left);
+	local64_set(&hwc->prev_count, (u64)-left);
 
 	armpmu->write_counter(idx, (u64)(-left) & 0xffffffff);
 
@@ -204,18 +204,18 @@ armpmu_event_update(struct perf_event *event,
 	s64 delta;
 
 again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
+	prev_raw_count = local64_read(&hwc->prev_count);
 	new_raw_count = armpmu->read_counter(idx);
 
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	atomic64_add(delta, &event->count);
-	atomic64_sub(delta, &hwc->period_left);
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
 
 	return new_raw_count;
 }
@@ -478,7 +478,7 @@ __hw_perf_event_init(struct perf_event *event)
 	if (!hwc->sample_period) {
 		hwc->sample_period  = armpmu->max_period;
 		hwc->last_period    = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
+		local64_set(&hwc->period_left, hwc->sample_period);
 	}
 
 	err = 0;
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index ac2a8c2554d9..af1d9a7c65d1 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -410,15 +410,15 @@ static void power_pmu_read(struct perf_event *event)
 	 * Therefore we treat them like NMIs.
 	 */
 	do {
-		prev = atomic64_read(&event->hw.prev_count);
+		prev = local64_read(&event->hw.prev_count);
 		barrier();
 		val = read_pmc(event->hw.idx);
-	} while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
+	} while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
 
 	/* The counters are only 32 bits wide */
 	delta = (val - prev) & 0xfffffffful;
-	atomic64_add(delta, &event->count);
-	atomic64_sub(delta, &event->hw.period_left);
+	local64_add(delta, &event->count);
+	local64_sub(delta, &event->hw.period_left);
 }
 
 /*
@@ -444,10 +444,10 @@ static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
 		if (!event->hw.idx)
 			continue;
 		val = (event->hw.idx == 5) ? pmc5 : pmc6;
-		prev = atomic64_read(&event->hw.prev_count);
+		prev = local64_read(&event->hw.prev_count);
 		event->hw.idx = 0;
 		delta = (val - prev) & 0xfffffffful;
-		atomic64_add(delta, &event->count);
+		local64_add(delta, &event->count);
 	}
 }
 
@@ -462,7 +462,7 @@ static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
 		event = cpuhw->limited_counter[i];
 		event->hw.idx = cpuhw->limited_hwidx[i];
 		val = (event->hw.idx == 5) ? pmc5 : pmc6;
-		atomic64_set(&event->hw.prev_count, val);
+		local64_set(&event->hw.prev_count, val);
 		perf_event_update_userpage(event);
 	}
 }
@@ -666,11 +666,11 @@ void hw_perf_enable(void)
 		}
 		val = 0;
 		if (event->hw.sample_period) {
-			left = atomic64_read(&event->hw.period_left);
+			left = local64_read(&event->hw.period_left);
 			if (left < 0x80000000L)
 				val = 0x80000000L - left;
 		}
-		atomic64_set(&event->hw.prev_count, val);
+		local64_set(&event->hw.prev_count, val);
 		event->hw.idx = idx;
 		write_pmc(idx, val);
 		perf_event_update_userpage(event);
@@ -842,8 +842,8 @@ static void power_pmu_unthrottle(struct perf_event *event)
 	if (left < 0x80000000L)
 		val = 0x80000000L - left;
 	write_pmc(event->hw.idx, val);
-	atomic64_set(&event->hw.prev_count, val);
-	atomic64_set(&event->hw.period_left, left);
+	local64_set(&event->hw.prev_count, val);
+	local64_set(&event->hw.period_left, left);
 	perf_event_update_userpage(event);
 	perf_enable();
 	local_irq_restore(flags);
@@ -1109,7 +1109,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 	event->hw.config = events[n];
 	event->hw.event_base = cflags[n];
 	event->hw.last_period = event->hw.sample_period;
-	atomic64_set(&event->hw.period_left, event->hw.last_period);
+	local64_set(&event->hw.period_left, event->hw.last_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -1147,16 +1147,16 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	int record = 0;
 
 	/* we don't have to worry about interrupts here */
-	prev = atomic64_read(&event->hw.prev_count);
+	prev = local64_read(&event->hw.prev_count);
 	delta = (val - prev) & 0xfffffffful;
-	atomic64_add(delta, &event->count);
+	local64_add(delta, &event->count);
 
 	/*
 	 * See if the total period for this event has expired,
 	 * and update for the next period.
 	 */
 	val = 0;
-	left = atomic64_read(&event->hw.period_left) - delta;
+	left = local64_read(&event->hw.period_left) - delta;
 	if (period) {
 		if (left <= 0) {
 			left += period;
@@ -1194,8 +1194,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	}
 
 	write_pmc(event->hw.idx, val);
-	atomic64_set(&event->hw.prev_count, val);
-	atomic64_set(&event->hw.period_left, left);
+	local64_set(&event->hw.prev_count, val);
+	local64_set(&event->hw.period_left, left);
 	perf_event_update_userpage(event);
 }
 
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 81b6de41ae5d..7a3dc3567258 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -185,10 +185,10 @@ static void sh_perf_event_update(struct perf_event *event,
 	 * this is the simplest approach for maintaining consistency.
 	 */
 again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
+	prev_raw_count = local64_read(&hwc->prev_count);
 	new_raw_count = sh_pmu->read(idx);
 
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
@@ -203,7 +203,7 @@ again:
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	atomic64_add(delta, &event->count);
+	local64_add(delta, &event->count);
 }
 
 static void sh_pmu_disable(struct perf_event *event)
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index beeb92fa3acd..8a6660da8e08 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -572,18 +572,18 @@ static u64 sparc_perf_event_update(struct perf_event *event,
 	s64 delta;
 
 again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
+	prev_raw_count = local64_read(&hwc->prev_count);
 	new_raw_count = read_pmc(idx);
 
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	atomic64_add(delta, &event->count);
-	atomic64_sub(delta, &hwc->period_left);
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
 
 	return new_raw_count;
 }
@@ -591,27 +591,27 @@ again:
 static int sparc_perf_event_set_period(struct perf_event *event,
 				       struct hw_perf_event *hwc, int idx)
 {
-	s64 left = atomic64_read(&hwc->period_left);
+	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
 	int ret = 0;
 
 	if (unlikely(left <= -period)) {
 		left = period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
 	if (left > MAX_PERIOD)
 		left = MAX_PERIOD;
 
-	atomic64_set(&hwc->prev_count, (u64)-left);
+	local64_set(&hwc->prev_count, (u64)-left);
 
 	write_pmc(idx, (u64)(-left) & 0xffffffff);
 
@@ -1087,7 +1087,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if (!hwc->sample_period) {
 		hwc->sample_period = MAX_PERIOD;
 		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
+		local64_set(&hwc->period_left, hwc->sample_period);
 	}
 
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 79e199843db6..2d0d29069275 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event)
 	 * count to the generic event atomically:
 	 */
 again:
-	prev_raw_count = atomic64_read(&hwc->prev_count);
+	prev_raw_count = local64_read(&hwc->prev_count);
 	rdmsrl(hwc->event_base + idx, new_raw_count);
 
-	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 					new_raw_count) != prev_raw_count)
 		goto again;
 
@@ -314,8 +314,8 @@ again:
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	atomic64_add(delta, &event->count);
-	atomic64_sub(delta, &hwc->period_left);
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
 
 	return new_raw_count;
 }
@@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event)
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
+		local64_set(&hwc->period_left, hwc->sample_period);
 	} else {
 		/*
 		 * If we have a PMU initialized but no APIC
@@ -886,7 +886,7 @@ static int
 x86_perf_event_set_period(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	s64 left = atomic64_read(&hwc->period_left);
+	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
 	int ret = 0, idx = hwc->idx;
 
@@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event)
 	 */
 	if (unlikely(left <= -period)) {
 		left = period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
-		atomic64_set(&hwc->period_left, left);
+		local64_set(&hwc->period_left, left);
 		hwc->last_period = period;
 		ret = 1;
 	}
@@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	 * The hw event starts counting from this event offset,
 	 * mark it to be able to extra future deltas:
 	 */
-	atomic64_set(&hwc->prev_count, (u64)-left);
+	local64_set(&hwc->prev_count, (u64)-left);
 
 	wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f34dab9b275e..7342979f95f2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -487,6 +487,7 @@ struct perf_guest_info_callbacks {
 #include <linux/cpu.h>
 #include <asm/atomic.h>
 #include <asm/local.h>
+#include <asm/local64.h>
 
 #define PERF_MAX_STACK_DEPTH		255
 
@@ -536,10 +537,10 @@ struct hw_perf_event {
 		struct arch_hw_breakpoint	info;
 #endif
 	};
-	atomic64_t			prev_count;
+	local64_t			prev_count;
 	u64				sample_period;
 	u64				last_period;
-	atomic64_t			period_left;
+	local64_t			period_left;
 	u64				interrupts;
 
 	u64				freq_time_stamp;
@@ -670,7 +671,7 @@ struct perf_event {
 
 	enum perf_event_active_state	state;
 	unsigned int			attach_state;
-	atomic64_t			count;
+	local64_t			count;
 	atomic64_t			child_count;
 
 	/*
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a395fda2d94c..97c73018592e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1148,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
 	 * In order to keep per-task stats reliable we need to flip the event
 	 * values when we flip the contexts.
 	 */
-	value = atomic64_read(&next_event->count);
-	value = atomic64_xchg(&event->count, value);
-	atomic64_set(&next_event->count, value);
+	value = local64_read(&next_event->count);
+	value = local64_xchg(&event->count, value);
+	local64_set(&next_event->count, value);
 
 	swap(event->total_time_enabled, next_event->total_time_enabled);
 	swap(event->total_time_running, next_event->total_time_running);
@@ -1540,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 
 	hwc->sample_period = sample_period;
 
-	if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+	if (local64_read(&hwc->period_left) > 8*sample_period) {
 		perf_disable();
 		perf_event_stop(event);
-		atomic64_set(&hwc->period_left, 0);
+		local64_set(&hwc->period_left, 0);
 		perf_event_start(event);
 		perf_enable();
 	}
@@ -1584,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 
 		perf_disable();
 		event->pmu->read(event);
-		now = atomic64_read(&event->count);
+		now = local64_read(&event->count);
 		delta = now - hwc->freq_count_stamp;
 		hwc->freq_count_stamp = now;
 
@@ -1738,7 +1738,7 @@ static void __perf_event_read(void *info)
 
 static inline u64 perf_event_count(struct perf_event *event)
 {
-	return atomic64_read(&event->count) + atomic64_read(&event->child_count);
+	return local64_read(&event->count) + atomic64_read(&event->child_count);
 }
 
 static u64 perf_event_read(struct perf_event *event)
@@ -2141,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 static void perf_event_reset(struct perf_event *event)
 {
 	(void)perf_event_read(event);
-	atomic64_set(&event->count, 0);
+	local64_set(&event->count, 0);
 	perf_event_update_userpage(event);
 }
 
@@ -2359,7 +2359,7 @@ void perf_event_update_userpage(struct perf_event *event)
 	userpg->index = perf_event_index(event);
 	userpg->offset = perf_event_count(event);
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		userpg->offset -= atomic64_read(&event->hw.prev_count);
+		userpg->offset -= local64_read(&event->hw.prev_count);
 
 	userpg->time_enabled = event->total_time_enabled +
 			atomic64_read(&event->child_total_time_enabled);
@@ -4035,14 +4035,14 @@ static u64 perf_swevent_set_period(struct perf_event *event)
 	hwc->last_period = hwc->sample_period;
 
 again:
-	old = val = atomic64_read(&hwc->period_left);
+	old = val = local64_read(&hwc->period_left);
 	if (val < 0)
 		return 0;
 
 	nr = div64_u64(period + val, period);
 	offset = nr * period;
 	val -= offset;
-	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+	if (local64_cmpxchg(&hwc->period_left, old, val) != old)
 		goto again;
 
 	return nr;
@@ -4081,7 +4081,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	atomic64_add(nr, &event->count);
+	local64_add(nr, &event->count);
 
 	if (!regs)
 		return;
@@ -4092,7 +4092,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
 		return perf_swevent_overflow(event, 1, nmi, data, regs);
 
-	if (atomic64_add_negative(nr, &hwc->period_left))
+	if (local64_add_negative(nr, &hwc->period_left))
 		return;
 
 	perf_swevent_overflow(event, 0, nmi, data, regs);
@@ -4383,8 +4383,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
 	u64 now;
 
 	now = cpu_clock(cpu);
-	prev = atomic64_xchg(&event->hw.prev_count, now);
-	atomic64_add(now - prev, &event->count);
+	prev = local64_xchg(&event->hw.prev_count, now);
+	local64_add(now - prev, &event->count);
 }
 
 static int cpu_clock_perf_event_enable(struct perf_event *event)
@@ -4392,7 +4392,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	int cpu = raw_smp_processor_id();
 
-	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+	local64_set(&hwc->prev_count, cpu_clock(cpu));
 	perf_swevent_start_hrtimer(event);
 
 	return 0;
@@ -4424,9 +4424,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now)
 	u64 prev;
 	s64 delta;
 
-	prev = atomic64_xchg(&event->hw.prev_count, now);
+	prev = local64_xchg(&event->hw.prev_count, now);
 	delta = now - prev;
-	atomic64_add(delta, &event->count);
+	local64_add(delta, &event->count);
 }
 
 static int task_clock_perf_event_enable(struct perf_event *event)
@@ -4436,7 +4436,7 @@ static int task_clock_perf_event_enable(struct perf_event *event)
 
 	now = event->ctx->time;
 
-	atomic64_set(&hwc->prev_count, now);
+	local64_set(&hwc->prev_count, now);
 
 	perf_swevent_start_hrtimer(event);
 
@@ -4879,7 +4879,7 @@ perf_event_alloc(struct perf_event_attr *attr,
 		hwc->sample_period = 1;
 	hwc->last_period = hwc->sample_period;
 
-	atomic64_set(&hwc->period_left, hwc->sample_period);
+	local64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
 	 * we currently do not support PERF_FORMAT_GROUP on inherited events
@@ -5313,7 +5313,7 @@ inherit_event(struct perf_event *parent_event,
 		hwc->sample_period = sample_period;
 		hwc->last_period   = sample_period;
 
-		atomic64_set(&hwc->period_left, sample_period);
+		local64_set(&hwc->period_left, sample_period);
 	}
 
 	child_event->overflow_handler = parent_event->overflow_handler;
-- 
cgit v1.2.3-59-g8ed1b


From 7be7923633a142402266d642ccebf74f556a649b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 9 Jun 2010 11:57:23 +0200
Subject: perf: Fix build breakage for architecutes without atomic64_t

The local64.h include dependency was not dependent on PERF_EVENT=y,
which meant that arch's without atomic64_t support ended up including
it and failed to build.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7342979f95f2..1218d05728b9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -462,6 +462,7 @@ enum perf_callchain_context {
 
 #ifdef CONFIG_PERF_EVENTS
 # include <asm/perf_event.h>
+# include <asm/local64.h>
 #endif
 
 struct perf_guest_info_callbacks {
@@ -487,7 +488,6 @@ struct perf_guest_info_callbacks {
 #include <linux/cpu.h>
 #include <asm/atomic.h>
 #include <asm/local.h>
-#include <asm/local64.h>
 
 #define PERF_MAX_STACK_DEPTH		255
 
-- 
cgit v1.2.3-59-g8ed1b


From 039ca4e74a1cf60bd7487324a564ecf5c981f254 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 26 May 2010 17:22:17 +0800
Subject: tracing: Remove kmemtrace ftrace plugin

We have been resisting new ftrace plugins and removing existing
ones, and kmemtrace has been superseded by kmem trace events
and perf-kmem, so we remove it.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
[ remove kmemtrace from the makefile, handle slob too ]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/ABI/testing/debugfs-kmemtrace |  71 ----
 Documentation/trace/kmemtrace.txt           | 126 -------
 MAINTAINERS                                 |   7 -
 include/linux/kmemtrace.h                   |  25 --
 include/linux/slab_def.h                    |   3 +-
 include/linux/slub_def.h                    |   3 +-
 init/main.c                                 |   2 -
 kernel/trace/Kconfig                        |  20 --
 kernel/trace/Makefile                       |   1 -
 kernel/trace/kmemtrace.c                    | 529 ----------------------------
 kernel/trace/trace.h                        |  12 -
 kernel/trace/trace_entries.h                |  35 --
 mm/slab.c                                   |   1 -
 mm/slob.c                                   |   4 +-
 mm/slub.c                                   |   1 -
 15 files changed, 7 insertions(+), 833 deletions(-)
 delete mode 100644 Documentation/ABI/testing/debugfs-kmemtrace
 delete mode 100644 Documentation/trace/kmemtrace.txt
 delete mode 100644 include/linux/kmemtrace.h
 delete mode 100644 kernel/trace/kmemtrace.c

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace
deleted file mode 100644
index 5e6a92a02d85..000000000000
--- a/Documentation/ABI/testing/debugfs-kmemtrace
+++ /dev/null
@@ -1,71 +0,0 @@
-What:		/sys/kernel/debug/kmemtrace/
-Date:		July 2008
-Contact:	Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
-Description:
-
-In kmemtrace-enabled kernels, the following files are created:
-
-/sys/kernel/debug/kmemtrace/
-	cpu<n>		(0400)	Per-CPU tracing data, see below. (binary)
-	total_overruns	(0400)	Total number of bytes which were dropped from
-				cpu<n> files because of full buffer condition,
-				non-binary. (text)
-	abi_version	(0400)	Kernel's kmemtrace ABI version. (text)
-
-Each per-CPU file should be read according to the relay interface. That is,
-the reader should set affinity to that specific CPU and, as currently done by
-the userspace application (though there are other methods), use poll() with
-an infinite timeout before every read(). Otherwise, erroneous data may be
-read. The binary data has the following _core_ format:
-
-	Event ID	(1 byte)	Unsigned integer, one of:
-		0 - represents an allocation (KMEMTRACE_EVENT_ALLOC)
-		1 - represents a freeing of previously allocated memory
-		    (KMEMTRACE_EVENT_FREE)
-	Type ID		(1 byte)	Unsigned integer, one of:
-		0 - this is a kmalloc() / kfree()
-		1 - this is a kmem_cache_alloc() / kmem_cache_free()
-		2 - this is a __get_free_pages() et al.
-	Event size	(2 bytes)	Unsigned integer representing the
-					size of this event. Used to extend
-					kmemtrace. Discard the bytes you
-					don't know about.
-	Sequence number	(4 bytes)	Signed integer used to reorder data
-					logged on SMP machines. Wraparound
-					must be taken into account, although
-					it is unlikely.
-	Caller address	(8 bytes)	Return address to the caller.
-	Pointer to mem	(8 bytes)	Pointer to target memory area. Can be
-					NULL, but not all such calls might be
-					recorded.
-
-In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow:
-
-	Requested bytes	(8 bytes)	Total number of requested bytes,
-					unsigned, must not be zero.
-	Allocated bytes (8 bytes)	Total number of actually allocated
-					bytes, unsigned, must not be lower
-					than requested bytes.
-	Requested flags	(4 bytes)	GFP flags supplied by the caller.
-	Target CPU	(4 bytes)	Signed integer, valid for event id 1.
-					If equal to -1, target CPU is the same
-					as origin CPU, but the reverse might
-					not be true.
-
-The data is made available in the same endianness the machine has.
-
-Other event ids and type ids may be defined and added. Other fields may be
-added by increasing event size, but see below for details.
-Every modification to the ABI, including new id definitions, are followed
-by bumping the ABI version by one.
-
-Adding new data to the packet (features) is done at the end of the mandatory
-data:
-	Feature size	(2 byte)
-	Feature ID	(1 byte)
-	Feature data	(Feature size - 3 bytes)
-
-
-Users:
-	kmemtrace-user - git://repo.or.cz/kmemtrace-user.git
-
diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
deleted file mode 100644
index 6308735e58ca..000000000000
--- a/Documentation/trace/kmemtrace.txt
+++ /dev/null
@@ -1,126 +0,0 @@
-			kmemtrace - Kernel Memory Tracer
-
-			  by Eduard - Gabriel Munteanu
-			     <eduard.munteanu@linux360.ro>
-
-I. Introduction
-===============
-
-kmemtrace helps kernel developers figure out two things:
-1) how different allocators (SLAB, SLUB etc.) perform
-2) how kernel code allocates memory and how much
-
-To do this, we trace every allocation and export information to the userspace
-through the relay interface. We export things such as the number of requested
-bytes, the number of bytes actually allocated (i.e. including internal
-fragmentation), whether this is a slab allocation or a plain kmalloc() and so
-on.
-
-The actual analysis is performed by a userspace tool (see section III for
-details on where to get it from). It logs the data exported by the kernel,
-processes it and (as of writing this) can provide the following information:
-- the total amount of memory allocated and fragmentation per call-site
-- the amount of memory allocated and fragmentation per allocation
-- total memory allocated and fragmentation in the collected dataset
-- number of cross-CPU allocation and frees (makes sense in NUMA environments)
-
-Moreover, it can potentially find inconsistent and erroneous behavior in
-kernel code, such as using slab free functions on kmalloc'ed memory or
-allocating less memory than requested (but not truly failed allocations).
-
-kmemtrace also makes provisions for tracing on some arch and analysing the
-data on another.
-
-II. Design and goals
-====================
-
-kmemtrace was designed to handle rather large amounts of data. Thus, it uses
-the relay interface to export whatever is logged to userspace, which then
-stores it. Analysis and reporting is done asynchronously, that is, after the
-data is collected and stored. By design, it allows one to log and analyse
-on different machines and different arches.
-
-As of writing this, the ABI is not considered stable, though it might not
-change much. However, no guarantees are made about compatibility yet. When
-deemed stable, the ABI should still allow easy extension while maintaining
-backward compatibility. This is described further in Documentation/ABI.
-
-Summary of design goals:
-	- allow logging and analysis to be done across different machines
-	- be fast and anticipate usage in high-load environments (*)
-	- be reasonably extensible
-	- make it possible for GNU/Linux distributions to have kmemtrace
-	included in their repositories
-
-(*) - one of the reasons Pekka Enberg's original userspace data analysis
-    tool's code was rewritten from Perl to C (although this is more than a
-    simple conversion)
-
-
-III. Quick usage guide
-======================
-
-1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
-CONFIG_KMEMTRACE).
-
-2) Get the userspace tool and build it:
-$ git clone git://repo.or.cz/kmemtrace-user.git		# current repository
-$ cd kmemtrace-user/
-$ ./autogen.sh
-$ ./configure
-$ make
-
-3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
-'single' runlevel (so that relay buffers don't fill up easily), and run
-kmemtrace:
-# '$' does not mean user, but root here.
-$ mount -t debugfs none /sys/kernel/debug
-$ mount -t proc none /proc
-$ cd path/to/kmemtrace-user/
-$ ./kmemtraced
-Wait a bit, then stop it with CTRL+C.
-$ cat /sys/kernel/debug/kmemtrace/total_overruns	# Check if we didn't
-							# overrun, should
-							# be zero.
-$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
-		check its correctness]
-$ ./kmemtrace-report
-
-Now you should have a nice and short summary of how the allocator performs.
-
-IV. FAQ and known issues
-========================
-
-Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
-this? Should I worry?
-A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
-large the number is. You can fix it by supplying a higher
-'kmemtrace.subbufs=N' kernel parameter.
----
-
-Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
-A: This is a bug and should be reported. It can occur for a variety of
-reasons:
-	- possible bugs in relay code
-	- possible misuse of relay by kmemtrace
-	- timestamps being collected unorderly
-Or you may fix it yourself and send us a patch.
----
-
-Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
-A: This is a known issue and I'm working on it. These might be true errors
-in kernel code, which may have inconsistent behavior (e.g. allocating memory
-with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
-out this behavior may work with SLAB, but may fail with other allocators.
-
-It may also be due to lack of tracing in some unusual allocator functions.
-
-We don't want bug reports regarding this issue yet.
----
-
-V. See also
-===========
-
-Documentation/kernel-parameters.txt
-Documentation/ABI/testing/debugfs-kmemtrace
-
diff --git a/MAINTAINERS b/MAINTAINERS
index 33047a605438..67e6e9d848db 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3361,13 +3361,6 @@ F:	include/linux/kmemleak.h
 F:	mm/kmemleak.c
 F:	mm/kmemleak-test.c
 
-KMEMTRACE
-M:	Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
-S:	Maintained
-F:	Documentation/trace/kmemtrace.txt
-F:	include/linux/kmemtrace.h
-F:	kernel/trace/kmemtrace.c
-
 KPROBES
 M:	Ananth N Mavinakayanahalli <ananth@in.ibm.com>
 M:	Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
deleted file mode 100644
index b616d3930c3b..000000000000
--- a/include/linux/kmemtrace.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
-
-#include <trace/events/kmem.h>
-
-#ifdef CONFIG_KMEMTRACE
-extern void kmemtrace_init(void);
-#else
-static inline void kmemtrace_init(void)
-{
-}
-#endif
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 1812dac8c496..1acfa73ce2ac 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -14,7 +14,8 @@
 #include <asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
 #include <asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 #include <linux/compiler.h>
-#include <linux/kmemtrace.h>
+
+#include <trace/events/kmem.h>
 
 #ifndef ARCH_KMALLOC_MINALIGN
 /*
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 55695c8d2f8a..2345d3a033e6 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,9 +10,10 @@
 #include <linux/gfp.h>
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
-#include <linux/kmemtrace.h>
 #include <linux/kmemleak.h>
 
+#include <trace/events/kmem.h>
+
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
diff --git a/init/main.c b/init/main.c
index 94f65efdc65a..e2a2bf3a169f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -66,7 +66,6 @@
 #include <linux/ftrace.h>
 #include <linux/async.h>
 #include <linux/kmemcheck.h>
-#include <linux/kmemtrace.h>
 #include <linux/sfi.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
@@ -652,7 +651,6 @@ asmlinkage void __init start_kernel(void)
 #endif
 	page_cgroup_init();
 	enable_debug_pagealloc();
-	kmemtrace_init();
 	kmemleak_init();
 	debug_objects_mem_init();
 	idr_init_cache();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 572992abc71c..f669092fdead 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -354,26 +354,6 @@ config STACK_TRACER
 
 	  Say N if unsure.
 
-config KMEMTRACE
-	bool "Trace SLAB allocations"
-	select GENERIC_TRACER
-	help
-	  kmemtrace provides tracing for slab allocator functions, such as
-	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
-	  data is then fed to the userspace application in order to analyse
-	  allocation hotspots, internal fragmentation and so on, making it
-	  possible to see how well an allocator performs, as well as debug
-	  and profile kernel code.
-
-	  This requires an userspace application to use. See
-	  Documentation/trace/kmemtrace.txt for more information.
-
-	  Saying Y will make the kernel somewhat larger and slower. However,
-	  if you disable kmemtrace at run-time or boot-time, the performance
-	  impact is minimal (depending on the arch the kernel is built for).
-
-	  If unsure, say N.
-
 config WORKQUEUE_TRACER
 	bool "Trace workqueues"
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c3aaeba82372..469a1c7555a5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -40,7 +40,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
 ifeq ($(CONFIG_BLOCK),y)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index bbfc1bb1660b..000000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,529 +0,0 @@
-/*
- * Memory allocator tracing
- *
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- */
-
-#include <linux/tracepoint.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-#include <linux/kmemtrace.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-/* Select an alternative, minimalistic output than the original one */
-#define TRACE_KMEM_OPT_MINIMAL	0x1
-
-static struct tracer_opt kmem_opts[] = {
-	/* Default disable the minimalistic output */
-	{ TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
-	{ }
-};
-
-static struct tracer_flags kmem_tracer_flags = {
-	.val			= 0,
-	.opts			= kmem_opts
-};
-
-static struct trace_array *kmemtrace_array;
-
-/* Trace allocations */
-static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
-				   unsigned long call_site,
-				   const void *ptr,
-				   size_t bytes_req,
-				   size_t bytes_alloc,
-				   gfp_t gfp_flags,
-				   int node)
-{
-	struct ftrace_event_call *call = &event_kmem_alloc;
-	struct trace_array *tr = kmemtrace_array;
-	struct kmemtrace_alloc_entry *entry;
-	struct ring_buffer_event *event;
-
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
-	if (!event)
-		return;
-
-	entry = ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-
-	entry->ent.type		= TRACE_KMEM_ALLOC;
-	entry->type_id		= type_id;
-	entry->call_site	= call_site;
-	entry->ptr		= ptr;
-	entry->bytes_req	= bytes_req;
-	entry->bytes_alloc	= bytes_alloc;
-	entry->gfp_flags	= gfp_flags;
-	entry->node		= node;
-
-	if (!filter_check_discard(call, entry, tr->buffer, event))
-		ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-}
-
-static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
-				  unsigned long call_site,
-				  const void *ptr)
-{
-	struct ftrace_event_call *call = &event_kmem_free;
-	struct trace_array *tr = kmemtrace_array;
-	struct kmemtrace_free_entry *entry;
-	struct ring_buffer_event *event;
-
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-
-	entry->ent.type		= TRACE_KMEM_FREE;
-	entry->type_id		= type_id;
-	entry->call_site	= call_site;
-	entry->ptr		= ptr;
-
-	if (!filter_check_discard(call, entry, tr->buffer, event))
-		ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-}
-
-static void kmemtrace_kmalloc(void *ignore,
-			      unsigned long call_site,
-			      const void *ptr,
-			      size_t bytes_req,
-			      size_t bytes_alloc,
-			      gfp_t gfp_flags)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmem_cache_alloc(void *ignore,
-				       unsigned long call_site,
-				       const void *ptr,
-				       size_t bytes_req,
-				       size_t bytes_alloc,
-				       gfp_t gfp_flags)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmalloc_node(void *ignore,
-				   unsigned long call_site,
-				   const void *ptr,
-				   size_t bytes_req,
-				   size_t bytes_alloc,
-				   gfp_t gfp_flags,
-				   int node)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kmem_cache_alloc_node(void *ignore,
-					    unsigned long call_site,
-					    const void *ptr,
-					    size_t bytes_req,
-					    size_t bytes_alloc,
-					    gfp_t gfp_flags,
-					    int node)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void
-kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
-{
-	kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
-}
-
-static void kmemtrace_kmem_cache_free(void *ignore,
-				      unsigned long call_site, const void *ptr)
-{
-	kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
-}
-
-static int kmemtrace_start_probes(void)
-{
-	int err;
-
-	err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
-	if (err)
-		return err;
-	err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
-	if (err)
-		return err;
-	err = register_trace_kfree(kmemtrace_kfree, NULL);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
-
-	return err;
-}
-
-static void kmemtrace_stop_probes(void)
-{
-	unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
-	unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
-	unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
-	unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
-	unregister_trace_kfree(kmemtrace_kfree, NULL);
-	unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
-}
-
-static int kmem_trace_init(struct trace_array *tr)
-{
-	kmemtrace_array = tr;
-
-	tracing_reset_online_cpus(tr);
-
-	kmemtrace_start_probes();
-
-	return 0;
-}
-
-static void kmem_trace_reset(struct trace_array *tr)
-{
-	kmemtrace_stop_probes();
-}
-
-static void kmemtrace_headers(struct seq_file *s)
-{
-	/* Don't need headers for the original kmemtrace output */
-	if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
-		return;
-
-	seq_printf(s, "#\n");
-	seq_printf(s, "# ALLOC  TYPE  REQ   GIVEN  FLAGS     "
-			"      POINTER         NODE    CALLER\n");
-	seq_printf(s, "# FREE   |      |     |       |       "
-			"       |   |            |        |\n");
-	seq_printf(s, "# |\n\n");
-}
-
-/*
- * The following functions give the original output from kmemtrace,
- * plus the origin CPU, since reordering occurs in-kernel now.
- */
-
-#define KMEMTRACE_USER_ALLOC	0
-#define KMEMTRACE_USER_FREE	1
-
-struct kmemtrace_user_event {
-	u8			event_id;
-	u8			type_id;
-	u16			event_size;
-	u32			cpu;
-	u64			timestamp;
-	unsigned long		call_site;
-	unsigned long		ptr;
-};
-
-struct kmemtrace_user_event_alloc {
-	size_t			bytes_req;
-	size_t			bytes_alloc;
-	unsigned		gfp_flags;
-	int			node;
-};
-
-static enum print_line_t
-kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
-		      struct trace_event *event)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_alloc_entry *entry;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
-	    "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
-	    entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
-	    (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
-	    (unsigned long)entry->gfp_flags, entry->node);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free(struct trace_iterator *iter, int flags,
-		     struct trace_event *event)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_free_entry *entry;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
-			       entry->type_id, (void *)entry->call_site,
-			       (unsigned long)entry->ptr);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
-			   struct trace_event *event)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_alloc_entry *entry;
-	struct kmemtrace_user_event *ev;
-	struct kmemtrace_user_event_alloc *ev_alloc;
-
-	trace_assign_type(entry, iter->ent);
-
-	ev = trace_seq_reserve(s, sizeof(*ev));
-	if (!ev)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev->event_id		= KMEMTRACE_USER_ALLOC;
-	ev->type_id		= entry->type_id;
-	ev->event_size		= sizeof(*ev) + sizeof(*ev_alloc);
-	ev->cpu			= iter->cpu;
-	ev->timestamp		= iter->ts;
-	ev->call_site		= entry->call_site;
-	ev->ptr			= (unsigned long)entry->ptr;
-
-	ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
-	if (!ev_alloc)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev_alloc->bytes_req	= entry->bytes_req;
-	ev_alloc->bytes_alloc	= entry->bytes_alloc;
-	ev_alloc->gfp_flags	= entry->gfp_flags;
-	ev_alloc->node		= entry->node;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
-			  struct trace_event *event)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_free_entry *entry;
-	struct kmemtrace_user_event *ev;
-
-	trace_assign_type(entry, iter->ent);
-
-	ev = trace_seq_reserve(s, sizeof(*ev));
-	if (!ev)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev->event_id		= KMEMTRACE_USER_FREE;
-	ev->type_id		= entry->type_id;
-	ev->event_size		= sizeof(*ev);
-	ev->cpu			= iter->cpu;
-	ev->timestamp		= iter->ts;
-	ev->call_site		= entry->call_site;
-	ev->ptr			= (unsigned long)entry->ptr;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-/* The two other following provide a more minimalistic output */
-static enum print_line_t
-kmemtrace_print_alloc_compress(struct trace_iterator *iter)
-{
-	struct kmemtrace_alloc_entry *entry;
-	struct trace_seq *s = &iter->seq;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	/* Alloc entry */
-	ret = trace_seq_printf(s, "  +      ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Type */
-	switch (entry->type_id) {
-	case KMEMTRACE_TYPE_KMALLOC:
-		ret = trace_seq_printf(s, "K   ");
-		break;
-	case KMEMTRACE_TYPE_CACHE:
-		ret = trace_seq_printf(s, "C   ");
-		break;
-	case KMEMTRACE_TYPE_PAGES:
-		ret = trace_seq_printf(s, "P   ");
-		break;
-	default:
-		ret = trace_seq_printf(s, "?   ");
-	}
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Requested */
-	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_req);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Allocated */
-	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_alloc);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Flags
-	 * TODO: would be better to see the name of the GFP flag names
-	 */
-	ret = trace_seq_printf(s, "%08x   ", entry->gfp_flags);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Pointer to allocated */
-	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Node and call site*/
-	ret = trace_seq_printf(s, "%4d   %pf\n", entry->node,
-						 (void *)entry->call_site);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_compress(struct trace_iterator *iter)
-{
-	struct kmemtrace_free_entry *entry;
-	struct trace_seq *s = &iter->seq;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	/* Free entry */
-	ret = trace_seq_printf(s, "  -      ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Type */
-	switch (entry->type_id) {
-	case KMEMTRACE_TYPE_KMALLOC:
-		ret = trace_seq_printf(s, "K     ");
-		break;
-	case KMEMTRACE_TYPE_CACHE:
-		ret = trace_seq_printf(s, "C     ");
-		break;
-	case KMEMTRACE_TYPE_PAGES:
-		ret = trace_seq_printf(s, "P     ");
-		break;
-	default:
-		ret = trace_seq_printf(s, "?     ");
-	}
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Skip requested/allocated/flags */
-	ret = trace_seq_printf(s, "                       ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Pointer to allocated */
-	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Skip node and print call site*/
-	ret = trace_seq_printf(s, "       %pf\n", (void *)entry->call_site);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
-{
-	struct trace_entry *entry = iter->ent;
-
-	if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
-		return TRACE_TYPE_UNHANDLED;
-
-	switch (entry->type) {
-	case TRACE_KMEM_ALLOC:
-		return kmemtrace_print_alloc_compress(iter);
-	case TRACE_KMEM_FREE:
-		return kmemtrace_print_free_compress(iter);
-	default:
-		return TRACE_TYPE_UNHANDLED;
-	}
-}
-
-static struct trace_event_functions kmem_trace_alloc_funcs = {
-	.trace			= kmemtrace_print_alloc,
-	.binary			= kmemtrace_print_alloc_user,
-};
-
-static struct trace_event kmem_trace_alloc = {
-	.type			= TRACE_KMEM_ALLOC,
-	.funcs			= &kmem_trace_alloc_funcs,
-};
-
-static struct trace_event_functions kmem_trace_free_funcs = {
-	.trace			= kmemtrace_print_free,
-	.binary			= kmemtrace_print_free_user,
-};
-
-static struct trace_event kmem_trace_free = {
-	.type			= TRACE_KMEM_FREE,
-	.funcs			= &kmem_trace_free_funcs,
-};
-
-static struct tracer kmem_tracer __read_mostly = {
-	.name			= "kmemtrace",
-	.init			= kmem_trace_init,
-	.reset			= kmem_trace_reset,
-	.print_line		= kmemtrace_print_line,
-	.print_header		= kmemtrace_headers,
-	.flags			= &kmem_tracer_flags
-};
-
-void kmemtrace_init(void)
-{
-	/* earliest opportunity to start kmem tracing */
-}
-
-static int __init init_kmem_tracer(void)
-{
-	if (!register_ftrace_event(&kmem_trace_alloc)) {
-		pr_warning("Warning: could not register kmem events\n");
-		return 1;
-	}
-
-	if (!register_ftrace_event(&kmem_trace_free)) {
-		pr_warning("Warning: could not register kmem events\n");
-		return 1;
-	}
-
-	if (register_tracer(&kmem_tracer) != 0) {
-		pr_warning("Warning: could not register the kmem tracer\n");
-		return 1;
-	}
-
-	return 0;
-}
-device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 75a5e800a737..075cd2ea84a2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,7 +9,6 @@
 #include <linux/mmiotrace.h>
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
-#include <linux/kmemtrace.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -30,19 +29,12 @@ enum trace_type {
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
-	TRACE_KMEM_ALLOC,
-	TRACE_KMEM_FREE,
 	TRACE_BLK,
 	TRACE_KSYM,
 
 	__TRACE_LAST_TYPE,
 };
 
-enum kmemtrace_type_id {
-	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
-	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
-	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
-};
 
 #undef __field
 #define __field(type, item)		type	item;
@@ -208,10 +200,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
-		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
-			  TRACE_KMEM_ALLOC);	\
-		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
-			  TRACE_KMEM_FREE);	\
 		IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
 		__ftrace_bad_type();					\
 	} while (0)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c293364c984f..13abc157dbaf 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -291,41 +291,6 @@ FTRACE_ENTRY(branch, trace_branch,
 		 __entry->func, __entry->file, __entry->correct)
 );
 
-FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
-
-	TRACE_KMEM_ALLOC,
-
-	F_STRUCT(
-		__field(	enum kmemtrace_type_id,	type_id		)
-		__field(	unsigned long,		call_site	)
-		__field(	const void *,		ptr		)
-		__field(	size_t,			bytes_req	)
-		__field(	size_t,			bytes_alloc	)
-		__field(	gfp_t,			gfp_flags	)
-		__field(	int,			node		)
-	),
-
-	F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
-		 " flags:%x node:%d",
-		 __entry->type_id, __entry->call_site, __entry->ptr,
-		 __entry->bytes_req, __entry->bytes_alloc,
-		 __entry->gfp_flags, __entry->node)
-);
-
-FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
-
-	TRACE_KMEM_FREE,
-
-	F_STRUCT(
-		__field(	enum kmemtrace_type_id,	type_id		)
-		__field(	unsigned long,		call_site	)
-		__field(	const void *,		ptr		)
-	),
-
-	F_printk("type:%u call_site:%lx ptr:%p",
-		 __entry->type_id, __entry->call_site, __entry->ptr)
-);
-
 FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
 
 	TRACE_KSYM,
diff --git a/mm/slab.c b/mm/slab.c
index e49f8f46f46d..47360c3e5abd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,6 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
-#include	<linux/kmemtrace.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
 #include	<linux/uaccess.h>
diff --git a/mm/slob.c b/mm/slob.c
index 23631e2bb57a..a82ab5811bd9 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -66,8 +66,10 @@
 #include <linux/module.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
-#include <linux/kmemtrace.h>
 #include <linux/kmemleak.h>
+
+#include <trace/events/kmem.h>
+
 #include <asm/atomic.h>
 
 /*
diff --git a/mm/slub.c b/mm/slub.c
index 26f0cb9cc584..a61f1aad1070 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,7 +17,6 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/kmemtrace.h>
 #include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
-- 
cgit v1.2.3-59-g8ed1b


From 8e4b50f94e8c1435a3e0ece42b7f97bc857d0145 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Thu, 10 Jun 2010 08:26:28 +0200
Subject: firewire: core: add CSR SPLIT_TIMEOUT support

Implement the SPLIT_TIMEOUT registers.  Besides being required by the
spec, this is desirable for some IIDC devices and necessary for many
audio devices to be able to increase the timeout from userspace.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
---
 drivers/firewire/core-card.c        |  4 ++
 drivers/firewire/core-transaction.c | 76 +++++++++++++++++++++++++++++++------
 include/linux/firewire.h            |  5 +++
 3 files changed, 74 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 901435cdd5c2..d0f15c2f1e1d 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -428,6 +428,10 @@ void fw_card_initialize(struct fw_card *card,
 	card->device = device;
 	card->current_tlabel = 0;
 	card->tlabel_mask = 0;
+	card->split_timeout_hi = 0;
+	card->split_timeout_lo = 800 << 19;
+	card->split_timeout_cycles = 800;
+	card->split_timeout_jiffies = DIV_ROUND_UP(HZ, 10);
 	card->color = 0;
 	card->broadcast_channel = BROADCAST_CHANNEL_INITIAL;
 
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 0034229dfd14..9a7d3ec23f2b 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -339,7 +339,8 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode,
 	setup_timer(&t->split_timeout_timer,
 		    split_transaction_timeout_callback, (unsigned long)t);
 	/* FIXME: start this timer later, relative to t->timestamp */
-	mod_timer(&t->split_timeout_timer, jiffies + DIV_ROUND_UP(HZ, 10));
+	mod_timer(&t->split_timeout_timer,
+		  jiffies + card->split_timeout_jiffies);
 	t->callback = callback;
 	t->callback_data = callback_data;
 
@@ -673,11 +674,28 @@ void fw_fill_response(struct fw_packet *response, u32 *request_header,
 }
 EXPORT_SYMBOL(fw_fill_response);
 
-static struct fw_request *allocate_request(struct fw_packet *p)
+static u32 compute_split_timeout_timestamp(struct fw_card *card,
+					   u32 request_timestamp)
+{
+	unsigned int cycles;
+	u32 timestamp;
+
+	cycles = card->split_timeout_cycles;
+	cycles += request_timestamp & 0x1fff;
+
+	timestamp = request_timestamp & ~0x1fff;
+	timestamp += (cycles / 8000) << 13;
+	timestamp |= cycles % 8000;
+
+	return timestamp;
+}
+
+static struct fw_request *allocate_request(struct fw_card *card,
+					   struct fw_packet *p)
 {
 	struct fw_request *request;
 	u32 *data, length;
-	int request_tcode, t;
+	int request_tcode;
 
 	request_tcode = HEADER_GET_TCODE(p->header[0]);
 	switch (request_tcode) {
@@ -712,14 +730,9 @@ static struct fw_request *allocate_request(struct fw_packet *p)
 	if (request == NULL)
 		return NULL;
 
-	t = (p->timestamp & 0x1fff) + 4000;
-	if (t >= 8000)
-		t = (p->timestamp & ~0x1fff) + 0x2000 + t - 8000;
-	else
-		t = (p->timestamp & ~0x1fff) + t;
-
 	request->response.speed = p->speed;
-	request->response.timestamp = t;
+	request->response.timestamp =
+			compute_split_timeout_timestamp(card, p->timestamp);
 	request->response.generation = p->generation;
 	request->response.ack = 0;
 	request->response.callback = free_response_callback;
@@ -845,7 +858,7 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p)
 	if (p->ack != ACK_PENDING && p->ack != ACK_COMPLETE)
 		return;
 
-	request = allocate_request(p);
+	request = allocate_request(card, p);
 	if (request == NULL) {
 		/* FIXME: send statically allocated busy packet. */
 		return;
@@ -993,6 +1006,19 @@ static u32 read_state_register(struct fw_card *card)
 	return 0;
 }
 
+static void update_split_timeout(struct fw_card *card)
+{
+	unsigned int cycles;
+
+	cycles = card->split_timeout_hi * 8000 + (card->split_timeout_lo >> 19);
+
+	cycles = max(cycles, 800u); /* minimum as per the spec */
+	cycles = min(cycles, 3u * 8000u); /* maximum OHCI timeout */
+
+	card->split_timeout_cycles = cycles;
+	card->split_timeout_jiffies = DIV_ROUND_UP(cycles * HZ, 8000);
+}
+
 static void handle_registers(struct fw_card *card, struct fw_request *request,
 		int tcode, int destination, int source, int generation,
 		int speed, unsigned long long offset,
@@ -1001,6 +1027,7 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 	int reg = offset & ~CSR_REGISTER_BASE;
 	__be32 *data = payload;
 	int rcode = RCODE_COMPLETE;
+	unsigned long flags;
 
 	switch (reg) {
 	case CSR_STATE_CLEAR:
@@ -1039,6 +1066,33 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 			rcode = RCODE_TYPE_ERROR;
 		break;
 
+	case CSR_SPLIT_TIMEOUT_HI:
+		if (tcode == TCODE_READ_QUADLET_REQUEST) {
+			*data = cpu_to_be32(card->split_timeout_hi);
+		} else if (tcode == TCODE_WRITE_QUADLET_REQUEST) {
+			spin_lock_irqsave(&card->lock, flags);
+			card->split_timeout_hi = be32_to_cpu(*data) & 7;
+			update_split_timeout(card);
+			spin_unlock_irqrestore(&card->lock, flags);
+		} else {
+			rcode = RCODE_TYPE_ERROR;
+		}
+		break;
+
+	case CSR_SPLIT_TIMEOUT_LO:
+		if (tcode == TCODE_READ_QUADLET_REQUEST) {
+			*data = cpu_to_be32(card->split_timeout_lo);
+		} else if (tcode == TCODE_WRITE_QUADLET_REQUEST) {
+			spin_lock_irqsave(&card->lock, flags);
+			card->split_timeout_lo =
+					be32_to_cpu(*data) & 0xfff80000;
+			update_split_timeout(card);
+			spin_unlock_irqrestore(&card->lock, flags);
+		} else {
+			rcode = RCODE_TYPE_ERROR;
+		}
+		break;
+
 	case CSR_CYCLE_TIME:
 		if (TCODE_IS_READ_REQUEST(tcode) && length == 4)
 			*data = cpu_to_be32(card->driver->
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 72e2b8ac2a5a..cdf8213c68ca 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -89,6 +89,11 @@ struct fw_card {
 	struct list_head transaction_list;
 	unsigned long reset_jiffies;
 
+	u32 split_timeout_hi;
+	u32 split_timeout_lo;
+	unsigned int split_timeout_cycles;
+	unsigned int split_timeout_jiffies;
+
 	unsigned long long guid;
 	unsigned max_receive;
 	int link_speed;
-- 
cgit v1.2.3-59-g8ed1b


From a1a1132bd83d0aea51d4f19be4b4a58a064a0131 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Thu, 10 Jun 2010 08:35:06 +0200
Subject: firewire: add CSR PRIORITY_BUDGET support

If supported by the OHCI controller, implement the PRIORITY_BUDGET
register, which is required for nodes that can use asynchronous
priority arbitration.

To allow the core to determine what features the lowlevel device
supports, add a new card driver callback.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
---
 drivers/firewire/core-transaction.c | 14 ++++++++++++++
 drivers/firewire/core.h             |  4 ++++
 drivers/firewire/ohci.c             | 27 +++++++++++++++++++++++++++
 include/linux/firewire.h            |  1 +
 4 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 8146133818dc..a61eb3fb9573 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -1126,6 +1126,20 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 			rcode = RCODE_TYPE_ERROR;
 		break;
 
+	case CSR_PRIORITY_BUDGET:
+		if (!(card->driver->get_features(card) &
+						FEATURE_PRIORITY_BUDGET))
+			rcode = RCODE_ADDRESS_ERROR;
+		else if (tcode == TCODE_READ_QUADLET_REQUEST)
+			*data = cpu_to_be32(card->driver->
+				read_csr_reg(card, CSR_PRIORITY_BUDGET));
+		else if (tcode == TCODE_WRITE_QUADLET_REQUEST)
+			card->driver->write_csr_reg(card, CSR_PRIORITY_BUDGET,
+						    be32_to_cpu(*data));
+		else
+			rcode = RCODE_TYPE_ERROR;
+		break;
+
 	case CSR_BROADCAST_CHANNEL:
 		if (tcode == TCODE_READ_QUADLET_REQUEST)
 			*data = cpu_to_be32(card->broadcast_channel);
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index efcdeb2e31e6..3b8c0f042f49 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -38,6 +38,8 @@ struct fw_packet;
 #define BROADCAST_CHANNEL_INITIAL	(1 << 31 | 31)
 #define BROADCAST_CHANNEL_VALID		(1 << 30)
 
+#define FEATURE_PRIORITY_BUDGET		0x01
+
 struct fw_card_driver {
 	/*
 	 * Enable the given card with the given initial config rom.
@@ -78,6 +80,8 @@ struct fw_card_driver {
 	u32 (*read_csr_reg)(struct fw_card *card, int csr_offset);
 	void (*write_csr_reg)(struct fw_card *card, int csr_offset, u32 value);
 
+	unsigned int (*get_features)(struct fw_card *card);
+
 	struct fw_iso_context *
 	(*allocate_iso_context)(struct fw_card *card,
 				int type, int channel, size_t header_size);
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 9c588fd01250..0e5413531785 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -170,6 +170,7 @@ struct fw_ohci {
 	int generation;
 	int request_generation;	/* for timestamping incoming requests */
 	unsigned quirks;
+	unsigned int pri_req_max;
 	u32 bus_time;
 
 	/*
@@ -1738,6 +1739,11 @@ static int ohci_enable(struct fw_card *card,
 	reg_write(ohci, OHCI1394_IsochronousCycleTimer, seconds << 25);
 	ohci->bus_time = seconds & ~0x3f;
 
+	/* Get implemented bits of the priority arbitration request counter. */
+	reg_write(ohci, OHCI1394_FairnessControl, 0x3f);
+	ohci->pri_req_max = reg_read(ohci, OHCI1394_FairnessControl) & 0x3f;
+	reg_write(ohci, OHCI1394_FairnessControl, 0);
+
 	ar_context_run(&ohci->ar_request_ctx);
 	ar_context_run(&ohci->ar_response_ctx);
 
@@ -2028,6 +2034,10 @@ static u32 ohci_read_csr_reg(struct fw_card *card, int csr_offset)
 		value = reg_read(ohci, OHCI1394_ATRetries);
 		return (value >> 4) & 0x0ffff00f;
 
+	case CSR_PRIORITY_BUDGET:
+		return (reg_read(ohci, OHCI1394_FairnessControl) & 0x3f) |
+			(ohci->pri_req_max << 8);
+
 	default:
 		WARN_ON(1);
 		return 0;
@@ -2065,12 +2075,28 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value)
 		flush_writes(ohci);
 		break;
 
+	case CSR_PRIORITY_BUDGET:
+		reg_write(ohci, OHCI1394_FairnessControl, value & 0x3f);
+		flush_writes(ohci);
+		break;
+
 	default:
 		WARN_ON(1);
 		break;
 	}
 }
 
+static unsigned int ohci_get_features(struct fw_card *card)
+{
+	struct fw_ohci *ohci = fw_ohci(card);
+	unsigned int features = 0;
+
+	if (ohci->pri_req_max != 0)
+		features |= FEATURE_PRIORITY_BUDGET;
+
+	return features;
+}
+
 static void copy_iso_headers(struct iso_context *ctx, void *p)
 {
 	int i = ctx->header_length;
@@ -2510,6 +2536,7 @@ static const struct fw_card_driver ohci_driver = {
 	.enable_phys_dma	= ohci_enable_phys_dma,
 	.read_csr_reg		= ohci_read_csr_reg,
 	.write_csr_reg		= ohci_write_csr_reg,
+	.get_features		= ohci_get_features,
 
 	.allocate_iso_context	= ohci_allocate_iso_context,
 	.free_iso_context	= ohci_free_iso_context,
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index cdf8213c68ca..a50377d91254 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -32,6 +32,7 @@
 #define CSR_CYCLE_TIME			0x200
 #define CSR_BUS_TIME			0x204
 #define CSR_BUSY_TIMEOUT		0x210
+#define CSR_PRIORITY_BUDGET		0x218
 #define CSR_BUS_MANAGER_ID		0x21c
 #define CSR_BANDWIDTH_AVAILABLE		0x220
 #define CSR_CHANNELS_AVAILABLE		0x224
-- 
cgit v1.2.3-59-g8ed1b


From 3d1f46eb60b155c705e389ecdf313f11b4b91976 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Thu, 10 Jun 2010 08:35:37 +0200
Subject: firewire: core: add CSR MAINT_UTILITY support

Implement the MAIN_UTILITY register, which is utterly optional
but useful as a safe target for diagnostic read/write/broadcast
transactions.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
---
 drivers/firewire/core-transaction.c | 9 +++++++++
 include/linux/firewire.h            | 3 +++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index a61eb3fb9573..dd8ef650a7cb 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -1140,6 +1140,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 			rcode = RCODE_TYPE_ERROR;
 		break;
 
+	case CSR_MAINT_UTILITY:
+		if (tcode == TCODE_READ_QUADLET_REQUEST)
+			*data = card->maint_utility_register;
+		else if (tcode == TCODE_WRITE_QUADLET_REQUEST)
+			card->maint_utility_register = *data;
+		else
+			rcode = RCODE_TYPE_ERROR;
+		break;
+
 	case CSR_BROADCAST_CHANNEL:
 		if (tcode == TCODE_READ_QUADLET_REQUEST)
 			*data = cpu_to_be32(card->broadcast_channel);
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index a50377d91254..f1160e831dad 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -38,6 +38,7 @@
 #define CSR_CHANNELS_AVAILABLE		0x224
 #define CSR_CHANNELS_AVAILABLE_HI	0x224
 #define CSR_CHANNELS_AVAILABLE_LO	0x228
+#define CSR_MAINT_UTILITY		0x230
 #define CSR_BROADCAST_CHANNEL		0x234
 #define CSR_CONFIG_ROM			0x400
 #define CSR_CONFIG_ROM_END		0x800
@@ -122,6 +123,8 @@ struct fw_card {
 	bool broadcast_channel_allocated;
 	u32 broadcast_channel;
 	__be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
+
+	__be32 maint_utility_register;
 };
 
 struct fw_attribute_group {
-- 
cgit v1.2.3-59-g8ed1b


From 7e0e314f198d5048b74c8f0ef9f4c1c02e5ecfc9 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Thu, 10 Jun 2010 08:37:15 +0200
Subject: firewire: core: add CSR abdicate support

Implement the abdicate bit, which is required for bus manager
capable nodes and tested by the Base 1394 Test Suite.

Finally, something to do at a command reset!  :-)

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
---
 drivers/firewire/core-card.c        |  3 ++-
 drivers/firewire/core-topology.c    |  2 ++
 drivers/firewire/core-transaction.c | 13 +++++++++++--
 drivers/firewire/core.h             |  1 +
 include/linux/firewire.h            |  2 ++
 5 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index d0f15c2f1e1d..7c4cf6cfa746 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -260,7 +260,8 @@ static void fw_card_bm_work(struct work_struct *work)
 
 	grace = time_after(jiffies, card->reset_jiffies + DIV_ROUND_UP(HZ, 8));
 
-	if (is_next_generation(generation, card->bm_generation) ||
+	if ((is_next_generation(generation, card->bm_generation) &&
+	     !card->bm_abdicate) ||
 	    (card->bm_generation != generation && grace)) {
 		/*
 		 * This first step is to figure out who is IRM and
diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index 93ec64cdeef7..ca3c65318165 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -552,6 +552,8 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation,
 	smp_wmb();
 	card->generation = generation;
 	card->reset_jiffies = jiffies;
+	card->bm_abdicate = card->csr_abdicate;
+	card->csr_abdicate = false;
 	fw_schedule_bm_work(card, 0);
 
 	local_node = build_tree(card, self_ids, self_id_count);
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index e0c6cce894cf..85a54da243e2 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -1008,6 +1008,10 @@ static u32 read_state_register(struct fw_card *card)
 	/* Bit 8 (cmstr): */
 	value |= card->driver->read_csr_reg(card, CSR_STATE_CLEAR);
 
+	/* Bit 10 (abdicate): */
+	if (card->csr_abdicate)
+		value |= CSR_STATE_BIT_ABDICATE;
+
 	return value;
 }
 
@@ -1041,6 +1045,8 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 		} else if (tcode == TCODE_WRITE_QUADLET_REQUEST) {
 			card->driver->write_csr_reg(card, CSR_STATE_CLEAR,
 						    be32_to_cpu(*data));
+			if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE))
+				card->csr_abdicate = false;
 		} else {
 			rcode = RCODE_TYPE_ERROR;
 		}
@@ -1052,7 +1058,8 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 		} else if (tcode == TCODE_WRITE_QUADLET_REQUEST) {
 			card->driver->write_csr_reg(card, CSR_STATE_SET,
 						    be32_to_cpu(*data));
-			/* FIXME: implement abdicate */
+			if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE))
+				card->csr_abdicate = true;
 		} else {
 			rcode = RCODE_TYPE_ERROR;
 		}
@@ -1070,7 +1077,9 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 		break;
 
 	case CSR_RESET_START:
-		if (tcode != TCODE_WRITE_QUADLET_REQUEST)
+		if (tcode == TCODE_WRITE_QUADLET_REQUEST)
+			card->csr_abdicate = false;
+		else
 			rcode = RCODE_TYPE_ERROR;
 		break;
 
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index aaecdd1c1767..a9ace1f8dc3f 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -41,6 +41,7 @@ struct fw_packet;
 #define FEATURE_PRIORITY_BUDGET		0x01
 
 #define CSR_STATE_BIT_CMSTR	(1 << 8)
+#define CSR_STATE_BIT_ABDICATE	(1 << 10)
 
 struct fw_card_driver {
 	/*
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index f1160e831dad..4d22643215ef 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -119,6 +119,8 @@ struct fw_card {
 	int bm_retries;
 	int bm_generation;
 	__be32 bm_transaction_data[2];
+	bool bm_abdicate; /* value of csr_abdicate before last bus reset */
+	bool csr_abdicate; /* visible in CSR STATE_CLEAR/SET registers */
 
 	bool broadcast_channel_allocated;
 	u32 broadcast_channel;
-- 
cgit v1.2.3-59-g8ed1b


From fb76dd10b91146e9cefbb3cd4e6812c5a95ee43b Mon Sep 17 00:00:00 2001
From: Luotao Fu <l.fu@pengutronix.de>
Date: Thu, 10 Jun 2010 12:05:23 -0700
Subject: Input: matrix_keypad - add support for clustered irq

This one adds support of a combined irq source for the whole matrix keypad.
This can be useful if all rows and columns of the keypad are e.g. connected
to a GPIO expander, which only has one interrupt line for all events on
every single GPIO.

Signed-off-by: Luotao Fu <l.fu@pengutronix.de>
Acked-by: Eric Miao <eric.y.miao@gmail.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/matrix_keypad.c | 108 ++++++++++++++++++++++++---------
 include/linux/input/matrix_keypad.h    |   6 ++
 2 files changed, 86 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c
index b443e088fd3c..b02e4268e18f 100644
--- a/drivers/input/keyboard/matrix_keypad.c
+++ b/drivers/input/keyboard/matrix_keypad.c
@@ -37,6 +37,7 @@ struct matrix_keypad {
 	spinlock_t lock;
 	bool scan_pending;
 	bool stopped;
+	bool gpio_all_disabled;
 };
 
 /*
@@ -87,8 +88,12 @@ static void enable_row_irqs(struct matrix_keypad *keypad)
 	const struct matrix_keypad_platform_data *pdata = keypad->pdata;
 	int i;
 
-	for (i = 0; i < pdata->num_row_gpios; i++)
-		enable_irq(gpio_to_irq(pdata->row_gpios[i]));
+	if (pdata->clustered_irq > 0)
+		enable_irq(pdata->clustered_irq);
+	else {
+		for (i = 0; i < pdata->num_row_gpios; i++)
+			enable_irq(gpio_to_irq(pdata->row_gpios[i]));
+	}
 }
 
 static void disable_row_irqs(struct matrix_keypad *keypad)
@@ -96,8 +101,12 @@ static void disable_row_irqs(struct matrix_keypad *keypad)
 	const struct matrix_keypad_platform_data *pdata = keypad->pdata;
 	int i;
 
-	for (i = 0; i < pdata->num_row_gpios; i++)
-		disable_irq_nosync(gpio_to_irq(pdata->row_gpios[i]));
+	if (pdata->clustered_irq > 0)
+		disable_irq_nosync(pdata->clustered_irq);
+	else {
+		for (i = 0; i < pdata->num_row_gpios; i++)
+			disable_irq_nosync(gpio_to_irq(pdata->row_gpios[i]));
+	}
 }
 
 /*
@@ -216,45 +225,69 @@ static void matrix_keypad_stop(struct input_dev *dev)
 }
 
 #ifdef CONFIG_PM
-static int matrix_keypad_suspend(struct device *dev)
+static void matrix_keypad_enable_wakeup(struct matrix_keypad *keypad)
 {
-	struct platform_device *pdev = to_platform_device(dev);
-	struct matrix_keypad *keypad = platform_get_drvdata(pdev);
 	const struct matrix_keypad_platform_data *pdata = keypad->pdata;
+	unsigned int gpio;
 	int i;
 
-	matrix_keypad_stop(keypad->input_dev);
+	if (pdata->clustered_irq > 0) {
+		if (enable_irq_wake(pdata->clustered_irq) == 0)
+			keypad->gpio_all_disabled = true;
+	} else {
 
-	if (device_may_wakeup(&pdev->dev)) {
 		for (i = 0; i < pdata->num_row_gpios; i++) {
 			if (!test_bit(i, keypad->disabled_gpios)) {
-				unsigned int gpio = pdata->row_gpios[i];
+				gpio = pdata->row_gpios[i];
 
 				if (enable_irq_wake(gpio_to_irq(gpio)) == 0)
 					__set_bit(i, keypad->disabled_gpios);
 			}
 		}
 	}
-
-	return 0;
 }
 
-static int matrix_keypad_resume(struct device *dev)
+static void matrix_keypad_disable_wakeup(struct matrix_keypad *keypad)
 {
-	struct platform_device *pdev = to_platform_device(dev);
-	struct matrix_keypad *keypad = platform_get_drvdata(pdev);
 	const struct matrix_keypad_platform_data *pdata = keypad->pdata;
+	unsigned int gpio;
 	int i;
 
-	if (device_may_wakeup(&pdev->dev)) {
+	if (pdata->clustered_irq > 0) {
+		if (keypad->gpio_all_disabled) {
+			disable_irq_wake(pdata->clustered_irq);
+			keypad->gpio_all_disabled = false;
+		}
+	} else {
 		for (i = 0; i < pdata->num_row_gpios; i++) {
 			if (test_and_clear_bit(i, keypad->disabled_gpios)) {
-				unsigned int gpio = pdata->row_gpios[i];
-
+				gpio = pdata->row_gpios[i];
 				disable_irq_wake(gpio_to_irq(gpio));
 			}
 		}
 	}
+}
+
+static int matrix_keypad_suspend(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct matrix_keypad *keypad = platform_get_drvdata(pdev);
+
+	matrix_keypad_stop(keypad->input_dev);
+
+	if (device_may_wakeup(&pdev->dev))
+		matrix_keypad_enable_wakeup(keypad);
+
+	return 0;
+}
+
+static int matrix_keypad_resume(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct matrix_keypad *keypad = platform_get_drvdata(pdev);
+
+	if (device_may_wakeup(&pdev->dev))
+		matrix_keypad_disable_wakeup(keypad);
 
 	matrix_keypad_start(keypad->input_dev);
 
@@ -296,17 +329,31 @@ static int __devinit init_matrix_gpio(struct platform_device *pdev,
 		gpio_direction_input(pdata->row_gpios[i]);
 	}
 
-	for (i = 0; i < pdata->num_row_gpios; i++) {
-		err = request_irq(gpio_to_irq(pdata->row_gpios[i]),
+	if (pdata->clustered_irq > 0) {
+		err = request_irq(pdata->clustered_irq,
 				matrix_keypad_interrupt,
-				IRQF_DISABLED |
-				IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
+				pdata->clustered_irq_flags,
 				"matrix-keypad", keypad);
 		if (err) {
 			dev_err(&pdev->dev,
-				"Unable to acquire interrupt for GPIO line %i\n",
-				pdata->row_gpios[i]);
-			goto err_free_irqs;
+				"Unable to acquire clustered interrupt\n");
+			goto err_free_rows;
+		}
+	} else {
+		for (i = 0; i < pdata->num_row_gpios; i++) {
+			err = request_irq(gpio_to_irq(pdata->row_gpios[i]),
+					matrix_keypad_interrupt,
+					IRQF_DISABLED |
+					IRQF_TRIGGER_RISING |
+					IRQF_TRIGGER_FALLING,
+					"matrix-keypad", keypad);
+			if (err) {
+				dev_err(&pdev->dev,
+					"Unable to acquire interrupt "
+					"for GPIO line %i\n",
+					pdata->row_gpios[i]);
+				goto err_free_irqs;
+			}
 		}
 	}
 
@@ -418,11 +465,16 @@ static int __devexit matrix_keypad_remove(struct platform_device *pdev)
 
 	device_init_wakeup(&pdev->dev, 0);
 
-	for (i = 0; i < pdata->num_row_gpios; i++) {
-		free_irq(gpio_to_irq(pdata->row_gpios[i]), keypad);
-		gpio_free(pdata->row_gpios[i]);
+	if (pdata->clustered_irq > 0) {
+		free_irq(pdata->clustered_irq, keypad);
+	} else {
+		for (i = 0; i < pdata->num_row_gpios; i++)
+			free_irq(gpio_to_irq(pdata->row_gpios[i]), keypad);
 	}
 
+	for (i = 0; i < pdata->num_row_gpios; i++)
+		gpio_free(pdata->row_gpios[i]);
+
 	for (i = 0; i < pdata->num_col_gpios; i++)
 		gpio_free(pdata->col_gpios[i]);
 
diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h
index c964cd7f436a..80352ad6581a 100644
--- a/include/linux/input/matrix_keypad.h
+++ b/include/linux/input/matrix_keypad.h
@@ -41,6 +41,9 @@ struct matrix_keymap_data {
  * @col_scan_delay_us: delay, measured in microseconds, that is
  *	needed before we can keypad after activating column gpio
  * @debounce_ms: debounce interval in milliseconds
+ * @clustered_irq: may be specified if interrupts of all row/column GPIOs
+ *	are bundled to one single irq
+ * @clustered_irq_flags: flags that are needed for the clustered irq
  * @active_low: gpio polarity
  * @wakeup: controls whether the device should be set up as wakeup
  *	source
@@ -63,6 +66,9 @@ struct matrix_keypad_platform_data {
 	/* key debounce interval in milli-second */
 	unsigned int	debounce_ms;
 
+	unsigned int	clustered_irq;
+	unsigned int	clustered_irq_flags;
+
 	bool		active_low;
 	bool		wakeup;
 	bool		no_autorepeat;
-- 
cgit v1.2.3-59-g8ed1b


From c6de9f08912311ddc1b3502b90e10fd449acd401 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 31 May 2010 16:48:09 +0800
Subject: x86, mce: Add HW_ERR printk prefix for hardware error logging

This makes hardware error related log in printk log more explicit. So
that the users can report it to hardware vendor instead of LKML or
software vendor.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
LKML-Reference: <1275295689.3444.462.camel@yhuang-dev.sh.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 include/linux/kernel.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8317ec4b9f3b..3bf740bb069e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -247,6 +247,13 @@ extern struct pid *session_of_pgrp(struct pid *pgrp);
 #define FW_WARN		"[Firmware Warn]: "
 #define FW_INFO		"[Firmware Info]: "
 
+/*
+ * HW_ERR
+ * Add this to a message for hardware errors, so that user can report
+ * it to hardware vendor instead of LKML or software vendor.
+ */
+#define HW_ERR		"[Hardware Error]: "
+
 #ifdef CONFIG_PRINTK
 asmlinkage int vprintk(const char *fmt, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
-- 
cgit v1.2.3-59-g8ed1b


From be1f3c2c027cc5ad735df6a45a542ed1db7ec48b Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 8 Jun 2010 07:19:54 +0000
Subject: net: Enable 64-bit net device statistics on 32-bit architectures

Use struct rtnl_link_stats64 as the statistics structure.

On 32-bit architectures, insert 32 bits of padding after/before each
field of struct net_device_stats to make its layout compatible with
struct rtnl_link_stats64.  Add an anonymous union in net_device; move
stats into the union and add struct rtnl_link_stats64 stats64.

Add net_device_ops::ndo_get_stats64, implementations of which will
return a pointer to struct rtnl_link_stats64.  Drivers that implement
this operation must not update the structure asynchronously.

Change dev_get_stats() to call ndo_get_stats64 if available, and to
return a pointer to struct rtnl_link_stats64.  Change callers of
dev_get_stats() accordingly.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 13 +++---
 include/linux/if_link.h         |  3 +-
 include/linux/netdevice.h       | 91 ++++++++++++++++++++++++-----------------
 net/8021q/vlanproc.c            | 13 +++---
 net/core/dev.c                  | 19 +++++----
 net/core/net-sysfs.c            | 12 +++---
 net/core/rtnetlink.c            |  6 +--
 7 files changed, 90 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ac4f94b7da37..a95a41b74b4e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3804,20 +3804,21 @@ static int bond_close(struct net_device *bond_dev)
 	return 0;
 }
 
-static struct net_device_stats *bond_get_stats(struct net_device *bond_dev)
+static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
-	struct net_device_stats *stats = &bond_dev->stats;
-	struct net_device_stats local_stats;
+	struct rtnl_link_stats64 *stats = &bond_dev->stats64;
+	struct rtnl_link_stats64 local_stats;
 	struct slave *slave;
 	int i;
 
-	memset(&local_stats, 0, sizeof(struct net_device_stats));
+	memset(&local_stats, 0, sizeof(local_stats));
 
 	read_lock_bh(&bond->lock);
 
 	bond_for_each_slave(bond, slave, i) {
-		const struct net_device_stats *sstats = dev_get_stats(slave->dev);
+		const struct rtnl_link_stats64 *sstats =
+			dev_get_stats(slave->dev);
 
 		local_stats.rx_packets += sstats->rx_packets;
 		local_stats.rx_bytes += sstats->rx_bytes;
@@ -4569,7 +4570,7 @@ static const struct net_device_ops bond_netdev_ops = {
 	.ndo_stop		= bond_close,
 	.ndo_start_xmit		= bond_start_xmit,
 	.ndo_select_queue	= bond_select_queue,
-	.ndo_get_stats		= bond_get_stats,
+	.ndo_get_stats64	= bond_get_stats,
 	.ndo_do_ioctl		= bond_do_ioctl,
 	.ndo_set_multicast_list	= bond_set_multicast_list,
 	.ndo_change_mtu		= bond_change_mtu,
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 85c812db5a3f..7fcad2e1be3d 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -4,7 +4,7 @@
 #include <linux/types.h>
 #include <linux/netlink.h>
 
-/* The struct should be in sync with struct net_device_stats */
+/* This struct should be in sync with struct rtnl_link_stats64 */
 struct rtnl_link_stats {
 	__u32	rx_packets;		/* total packets received	*/
 	__u32	tx_packets;		/* total packets transmitted	*/
@@ -37,6 +37,7 @@ struct rtnl_link_stats {
 	__u32	tx_compressed;
 };
 
+/* The main device statistics structure */
 struct rtnl_link_stats64 {
 	__u64	rx_packets;		/* total packets received	*/
 	__u64	tx_packets;		/* total packets transmitted	*/
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c319f28d699d..4fbccc5f609a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -159,45 +159,49 @@ static inline bool dev_xmit_complete(int rc)
 #define MAX_HEADER (LL_MAX_HEADER + 48)
 #endif
 
-#endif  /*  __KERNEL__  */
-
 /*
- *	Network device statistics. Akin to the 2.0 ether stats but
- *	with byte counters.
+ *	Old network device statistics. Fields are native words
+ *	(unsigned long) so they can be read and written atomically.
+ *	Each field is padded to 64 bits for compatibility with
+ *	rtnl_link_stats64.
  */
 
+#if BITS_PER_LONG == 64
+#define NET_DEVICE_STATS_DEFINE(name)	unsigned long name
+#elif defined(__LITTLE_ENDIAN)
+#define NET_DEVICE_STATS_DEFINE(name)	unsigned long name, pad_ ## name
+#else
+#define NET_DEVICE_STATS_DEFINE(name)	unsigned long pad_ ## name, name
+#endif
+
 struct net_device_stats {
-	unsigned long	rx_packets;		/* total packets received	*/
-	unsigned long	tx_packets;		/* total packets transmitted	*/
-	unsigned long	rx_bytes;		/* total bytes received 	*/
-	unsigned long	tx_bytes;		/* total bytes transmitted	*/
-	unsigned long	rx_errors;		/* bad packets received		*/
-	unsigned long	tx_errors;		/* packet transmit problems	*/
-	unsigned long	rx_dropped;		/* no space in linux buffers	*/
-	unsigned long	tx_dropped;		/* no space available in linux	*/
-	unsigned long	multicast;		/* multicast packets received	*/
-	unsigned long	collisions;
-
-	/* detailed rx_errors: */
-	unsigned long	rx_length_errors;
-	unsigned long	rx_over_errors;		/* receiver ring buff overflow	*/
-	unsigned long	rx_crc_errors;		/* recved pkt with crc error	*/
-	unsigned long	rx_frame_errors;	/* recv'd frame alignment error */
-	unsigned long	rx_fifo_errors;		/* recv'r fifo overrun		*/
-	unsigned long	rx_missed_errors;	/* receiver missed packet	*/
-
-	/* detailed tx_errors */
-	unsigned long	tx_aborted_errors;
-	unsigned long	tx_carrier_errors;
-	unsigned long	tx_fifo_errors;
-	unsigned long	tx_heartbeat_errors;
-	unsigned long	tx_window_errors;
-	
-	/* for cslip etc */
-	unsigned long	rx_compressed;
-	unsigned long	tx_compressed;
+	NET_DEVICE_STATS_DEFINE(rx_packets);
+	NET_DEVICE_STATS_DEFINE(tx_packets);
+	NET_DEVICE_STATS_DEFINE(rx_bytes);
+	NET_DEVICE_STATS_DEFINE(tx_bytes);
+	NET_DEVICE_STATS_DEFINE(rx_errors);
+	NET_DEVICE_STATS_DEFINE(tx_errors);
+	NET_DEVICE_STATS_DEFINE(rx_dropped);
+	NET_DEVICE_STATS_DEFINE(tx_dropped);
+	NET_DEVICE_STATS_DEFINE(multicast);
+	NET_DEVICE_STATS_DEFINE(collisions);
+	NET_DEVICE_STATS_DEFINE(rx_length_errors);
+	NET_DEVICE_STATS_DEFINE(rx_over_errors);
+	NET_DEVICE_STATS_DEFINE(rx_crc_errors);
+	NET_DEVICE_STATS_DEFINE(rx_frame_errors);
+	NET_DEVICE_STATS_DEFINE(rx_fifo_errors);
+	NET_DEVICE_STATS_DEFINE(rx_missed_errors);
+	NET_DEVICE_STATS_DEFINE(tx_aborted_errors);
+	NET_DEVICE_STATS_DEFINE(tx_carrier_errors);
+	NET_DEVICE_STATS_DEFINE(tx_fifo_errors);
+	NET_DEVICE_STATS_DEFINE(tx_heartbeat_errors);
+	NET_DEVICE_STATS_DEFINE(tx_window_errors);
+	NET_DEVICE_STATS_DEFINE(rx_compressed);
+	NET_DEVICE_STATS_DEFINE(tx_compressed);
 };
 
+#endif  /*  __KERNEL__  */
+
 
 /* Media selection options. */
 enum {
@@ -662,10 +666,19 @@ struct netdev_rx_queue {
  *	Callback uses when the transmitter has not made any progress
  *	for dev->watchdog ticks.
  *
+ * struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev);
  * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
  *	Called when a user wants to get the network device usage
- *	statistics. If not defined, the counters in dev->stats will
- *	be used.
+ *	statistics. Drivers must do one of the following:
+ *	1. Define @ndo_get_stats64 to update a rtnl_link_stats64 structure
+ *	   (which should normally be dev->stats64) and return a ponter to
+ *	   it. The structure must not be changed asynchronously.
+ *	2. Define @ndo_get_stats to update a net_device_stats64 structure
+ *	   (which should normally be dev->stats) and return a pointer to
+ *	   it. The structure may be changed asynchronously only if each
+ *	   field is written atomically.
+ *	3. Update dev->stats asynchronously and atomically, and define
+ *	   neither operation.
  *
  * void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp);
  *	If device support VLAN receive accleration
@@ -720,6 +733,7 @@ struct net_device_ops {
 						   struct neigh_parms *);
 	void			(*ndo_tx_timeout) (struct net_device *dev);
 
+	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev);
 	struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 
 	void			(*ndo_vlan_rx_register)(struct net_device *dev,
@@ -869,7 +883,10 @@ struct net_device {
 	int			ifindex;
 	int			iflink;
 
-	struct net_device_stats	stats;
+	union {
+		struct rtnl_link_stats64 stats64;
+		struct net_device_stats stats;
+	};
 
 #ifdef CONFIG_WIRELESS_EXT
 	/* List of functions to handle Wireless Extensions (instead of ioctl).
@@ -2121,7 +2138,7 @@ extern void		netdev_features_change(struct net_device *dev);
 /* Load a device via the kmod */
 extern void		dev_load(struct net *net, const char *name);
 extern void		dev_mcast_init(void);
-extern const struct net_device_stats *dev_get_stats(struct net_device *dev);
+extern const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev);
 extern void		dev_txq_stats_fold(const struct net_device *dev, struct net_device_stats *stats);
 
 extern int		netdev_max_backlog;
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index afead353e215..df56f5ce887c 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -278,8 +278,9 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
 {
 	struct net_device *vlandev = (struct net_device *) seq->private;
 	const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev);
-	const struct net_device_stats *stats;
+	const struct rtnl_link_stats64 *stats;
 	static const char fmt[] = "%30s %12lu\n";
+	static const char fmt64[] = "%30s %12llu\n";
 	int i;
 
 	if (!is_vlan_dev(vlandev))
@@ -291,12 +292,12 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
 		   vlandev->name, dev_info->vlan_id,
 		   (int)(dev_info->flags & 1), vlandev->priv_flags);
 
-	seq_printf(seq, fmt, "total frames received", stats->rx_packets);
-	seq_printf(seq, fmt, "total bytes received", stats->rx_bytes);
-	seq_printf(seq, fmt, "Broadcast/Multicast Rcvd", stats->multicast);
+	seq_printf(seq, fmt64, "total frames received", stats->rx_packets);
+	seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes);
+	seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast);
 	seq_puts(seq, "\n");
-	seq_printf(seq, fmt, "total frames transmitted", stats->tx_packets);
-	seq_printf(seq, fmt, "total bytes transmitted", stats->tx_bytes);
+	seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets);
+	seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes);
 	seq_printf(seq, fmt, "total headroom inc",
 		   dev_info->cnt_inc_headroom_on_tx);
 	seq_printf(seq, fmt, "total encap on xmit",
diff --git a/net/core/dev.c b/net/core/dev.c
index 277844901ce3..a1abc10db08a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3701,10 +3701,10 @@ void dev_seq_stop(struct seq_file *seq, void *v)
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 {
-	const struct net_device_stats *stats = dev_get_stats(dev);
+	const struct rtnl_link_stats64 *stats = dev_get_stats(dev);
 
-	seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
-		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
+	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
 		   dev->name, stats->rx_bytes, stats->rx_packets,
 		   stats->rx_errors,
 		   stats->rx_dropped + stats->rx_missed_errors,
@@ -5281,18 +5281,21 @@ EXPORT_SYMBOL(dev_txq_stats_fold);
  *	@dev: device to get statistics from
  *
  *	Get network statistics from device. The device driver may provide
- *	its own method by setting dev->netdev_ops->get_stats; otherwise
- *	the internal statistics structure is used.
+ *	its own method by setting dev->netdev_ops->get_stats64 or
+ *	dev->netdev_ops->get_stats; otherwise the internal statistics
+ *	structure is used.
  */
-const struct net_device_stats *dev_get_stats(struct net_device *dev)
+const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
+	if (ops->ndo_get_stats64)
+		return ops->ndo_get_stats64(dev);
 	if (ops->ndo_get_stats)
-		return ops->ndo_get_stats(dev);
+		return (struct rtnl_link_stats64 *)ops->ndo_get_stats(dev);
 
 	dev_txq_stats_fold(dev, &dev->stats);
-	return &dev->stats;
+	return &dev->stats64;
 }
 EXPORT_SYMBOL(dev_get_stats);
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 99e7052d7323..ea3bb4c3b87d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -29,6 +29,7 @@ static const char fmt_hex[] = "%#x\n";
 static const char fmt_long_hex[] = "%#lx\n";
 static const char fmt_dec[] = "%d\n";
 static const char fmt_ulong[] = "%lu\n";
+static const char fmt_u64[] = "%llu\n";
 
 static inline int dev_isalive(const struct net_device *dev)
 {
@@ -324,14 +325,13 @@ static ssize_t netstat_show(const struct device *d,
 	struct net_device *dev = to_net_dev(d);
 	ssize_t ret = -EINVAL;
 
-	WARN_ON(offset > sizeof(struct net_device_stats) ||
-			offset % sizeof(unsigned long) != 0);
+	WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
+			offset % sizeof(u64) != 0);
 
 	read_lock(&dev_base_lock);
 	if (dev_isalive(dev)) {
-		const struct net_device_stats *stats = dev_get_stats(dev);
-		ret = sprintf(buf, fmt_ulong,
-			      *(unsigned long *)(((u8 *) stats) + offset));
+		const struct rtnl_link_stats64 *stats = dev_get_stats(dev);
+		ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset));
 	}
 	read_unlock(&dev_base_lock);
 	return ret;
@@ -343,7 +343,7 @@ static ssize_t show_##name(struct device *d,				\
 			   struct device_attribute *attr, char *buf) 	\
 {									\
 	return netstat_show(d, attr, buf,				\
-			    offsetof(struct net_device_stats, name));	\
+			    offsetof(struct rtnl_link_stats64, name));	\
 }									\
 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1a2af24e9e3d..e645778e9b7e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -579,7 +579,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
 }
 
 static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
-				 const struct net_device_stats *b)
+				 const struct rtnl_link_stats64 *b)
 {
 	a->rx_packets = b->rx_packets;
 	a->tx_packets = b->tx_packets;
@@ -610,7 +610,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 	a->tx_compressed = b->tx_compressed;
 }
 
-static void copy_rtnl_link_stats64(void *v, const struct net_device_stats *b)
+static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
 {
 	struct rtnl_link_stats64 a;
 
@@ -791,7 +791,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
-	const struct net_device_stats *stats;
+	const struct rtnl_link_stats64 *stats;
 	struct nlattr *attr;
 
 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
-- 
cgit v1.2.3-59-g8ed1b


From f5c5440d40a24c5dc8030cde0a03debe87de4afb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 14 Jun 2010 16:15:23 +0200
Subject: netfilter: nfnetlink_log: RCU conversion, part 2

- must use atomic_inc_not_zero() in instance_lookup_get()

- must use hlist_add_head_rcu() instead of hlist_add_head()

- must use hlist_del_rcu() instead of hlist_del()

- Introduce NFULNL_COPY_DISABLED to stop lockless reader from using an
instance, before we do final instance_put() on it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nfnetlink_log.h |  1 +
 net/netfilter/nfnetlink_log.c           | 18 ++++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h
index d3bab7a2c9b7..1d0b84aa1d42 100644
--- a/include/linux/netfilter/nfnetlink_log.h
+++ b/include/linux/netfilter/nfnetlink_log.h
@@ -89,6 +89,7 @@ enum nfulnl_attr_config {
 #define NFULNL_COPY_NONE	0x00
 #define NFULNL_COPY_META	0x01
 #define NFULNL_COPY_PACKET	0x02
+#define NFULNL_COPY_DISABLED	0x03
 
 #define NFULNL_CFG_F_SEQ	0x0001
 #define NFULNL_CFG_F_SEQ_GLOBAL	0x0002
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 8ec23ec568e7..fb86a51bb65a 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -109,8 +109,8 @@ instance_lookup_get(u_int16_t group_num)
 
 	rcu_read_lock_bh();
 	inst = __instance_lookup(group_num);
-	if (inst)
-		instance_get(inst);
+	if (inst && !atomic_inc_not_zero(&inst->use))
+		inst = NULL;
 	rcu_read_unlock_bh();
 
 	return inst;
@@ -171,7 +171,7 @@ instance_create(u_int16_t group_num, int pid)
 	inst->copy_mode 	= NFULNL_COPY_PACKET;
 	inst->copy_range 	= NFULNL_COPY_RANGE_MAX;
 
-	hlist_add_head(&inst->hlist,
+	hlist_add_head_rcu(&inst->hlist,
 		       &instance_table[instance_hashfn(group_num)]);
 
 	spin_unlock_bh(&instances_lock);
@@ -185,18 +185,23 @@ out_unlock:
 
 static void __nfulnl_flush(struct nfulnl_instance *inst);
 
+/* called with BH disabled */
 static void
 __instance_destroy(struct nfulnl_instance *inst)
 {
 	/* first pull it out of the global list */
-	hlist_del(&inst->hlist);
+	hlist_del_rcu(&inst->hlist);
 
 	/* then flush all pending packets from skb */
 
-	spin_lock_bh(&inst->lock);
+	spin_lock(&inst->lock);
+
+	/* lockless readers wont be able to use us */
+	inst->copy_mode = NFULNL_COPY_DISABLED;
+
 	if (inst->skb)
 		__nfulnl_flush(inst);
-	spin_unlock_bh(&inst->lock);
+	spin_unlock(&inst->lock);
 
 	/* and finally put the refcount */
 	instance_put(inst);
@@ -624,6 +629,7 @@ nfulnl_log_packet(u_int8_t pf,
 		size += nla_total_size(data_len);
 		break;
 
+	case NFULNL_COPY_DISABLED:
 	default:
 		goto unlock_and_release;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 551d55a944b143ef26fbd482d1c463199d6f65cf Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Sat, 17 Apr 2010 08:48:42 -0400
Subject: tree/tiny rcu: Add debug RCU head objects

Helps finding racy users of call_rcu(), which results in hangs because list
entries are overwritten and/or skipped.

Changelog since v4:
- Bissectability is now OK
- Now generate a WARN_ON_ONCE() for non-initialized rcu_head passed to
  call_rcu(). Statically initialized objects are detected with
  object_is_static().
- Rename rcu_head_init_on_stack to init_rcu_head_on_stack.
- Remove init_rcu_head() completely.

Changelog since v3:
- Include comments from Lai Jiangshan

This new patch version is based on the debugobjects with the newly introduced
"active state" tracker.

Non-initialized entries are all considered as "statically initialized". An
activation fixup (triggered by call_rcu()) takes care of performing the debug
object initialization without issuing any warning. Since we cannot increase the
size of struct rcu_head, I don't see much room to put an identifier for
statically initialized rcu_head structures. So for now, we have to live without
"activation without explicit init" detection. But the main purpose of this debug
option is to detect double-activations (double call_rcu() use of a rcu_head
before the callback is executed), which is correctly addressed here.

This also detects potential internal RCU callback corruption, which would cause
the callbacks to be executed twice.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: David S. Miller <davem@davemloft.net>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: akpm@linux-foundation.org
CC: mingo@elte.hu
CC: laijs@cn.fujitsu.com
CC: dipankar@in.ibm.com
CC: josh@joshtriplett.org
CC: dvhltc@us.ibm.com
CC: niv@us.ibm.com
CC: tglx@linutronix.de
CC: peterz@infradead.org
CC: rostedt@goodmis.org
CC: Valdis.Kletnieks@vt.edu
CC: dhowells@redhat.com
CC: eric.dumazet@gmail.com
CC: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 include/linux/rcupdate.h |  49 +++++++++++++++
 kernel/rcupdate.c        | 160 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcutiny.c         |   2 +
 kernel/rcutree.c         |   2 +
 lib/Kconfig.debug        |   6 ++
 5 files changed, 219 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b653b4aaa8a6..2b7fc506e479 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -40,6 +40,7 @@
 #include <linux/seqlock.h>
 #include <linux/lockdep.h>
 #include <linux/completion.h>
+#include <linux/debugobjects.h>
 
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable; /* for sysctl */
@@ -79,6 +80,16 @@ extern void rcu_init(void);
        (ptr)->next = NULL; (ptr)->func = NULL; \
 } while (0)
 
+/*
+ * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
+ * initialization and destruction of rcu_head on the stack. rcu_head structures
+ * allocated dynamically in the heap or defined statically don't need any
+ * initialization.
+ */
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+extern void init_rcu_head_on_stack(struct rcu_head *head);
+extern void destroy_rcu_head_on_stack(struct rcu_head *head);
+#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 static inline void init_rcu_head_on_stack(struct rcu_head *head)
 {
 }
@@ -86,6 +97,7 @@ static inline void init_rcu_head_on_stack(struct rcu_head *head)
 static inline void destroy_rcu_head_on_stack(struct rcu_head *head)
 {
 }
+#endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
@@ -517,4 +529,41 @@ extern void call_rcu(struct rcu_head *head,
 extern void call_rcu_bh(struct rcu_head *head,
 			void (*func)(struct rcu_head *head));
 
+/*
+ * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
+ * by call_rcu() and rcu callback execution, and are therefore not part of the
+ * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
+ */
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+# define STATE_RCU_HEAD_READY	0
+# define STATE_RCU_HEAD_QUEUED	1
+
+extern struct debug_obj_descr rcuhead_debug_descr;
+
+static inline void debug_rcu_head_queue(struct rcu_head *head)
+{
+	debug_object_activate(head, &rcuhead_debug_descr);
+	debug_object_active_state(head, &rcuhead_debug_descr,
+				  STATE_RCU_HEAD_READY,
+				  STATE_RCU_HEAD_QUEUED);
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+	debug_object_active_state(head, &rcuhead_debug_descr,
+				  STATE_RCU_HEAD_QUEUED,
+				  STATE_RCU_HEAD_READY);
+	debug_object_deactivate(head, &rcuhead_debug_descr);
+}
+#else	/* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline void debug_rcu_head_queue(struct rcu_head *head)
+{
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+}
+#endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..4d169835fb36 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void)
 }
 EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
 #endif /* #ifdef CONFIG_PROVE_RCU */
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+static inline void debug_init_rcu_head(struct rcu_head *head)
+{
+	debug_object_init(head, &rcuhead_debug_descr);
+}
+
+static inline void debug_rcu_head_free(struct rcu_head *head)
+{
+	debug_object_free(head, &rcuhead_debug_descr);
+}
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
+{
+	struct rcu_head *head = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_ACTIVE:
+		/*
+		 * Ensure that queued callbacks are all executed.
+		 * If we detect that we are nested in a RCU read-side critical
+		 * section, we should simply fail, otherwise we would deadlock.
+		 */
+		if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+		    irqs_disabled()) {
+			WARN_ON(1);
+			return 0;
+		}
+		rcu_barrier();
+		rcu_barrier_sched();
+		rcu_barrier_bh();
+		debug_object_init(head, &rcuhead_debug_descr);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ * Activation is performed internally by call_rcu().
+ */
+static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+{
+	struct rcu_head *head = addr;
+
+	switch (state) {
+
+	case ODEBUG_STATE_NOTAVAILABLE:
+		/*
+		 * This is not really a fixup. We just make sure that it is
+		 * tracked in the object tracker.
+		 */
+		debug_object_init(head, &rcuhead_debug_descr);
+		debug_object_activate(head, &rcuhead_debug_descr);
+		return 0;
+
+	case ODEBUG_STATE_ACTIVE:
+		/*
+		 * Ensure that queued callbacks are all executed.
+		 * If we detect that we are nested in a RCU read-side critical
+		 * section, we should simply fail, otherwise we would deadlock.
+		 */
+		if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+		    irqs_disabled()) {
+			WARN_ON(1);
+			return 0;
+		}
+		rcu_barrier();
+		rcu_barrier_sched();
+		rcu_barrier_bh();
+		debug_object_activate(head, &rcuhead_debug_descr);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
+{
+	struct rcu_head *head = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_ACTIVE:
+		/*
+		 * Ensure that queued callbacks are all executed.
+		 * If we detect that we are nested in a RCU read-side critical
+		 * section, we should simply fail, otherwise we would deadlock.
+		 */
+#ifndef CONFIG_PREEMPT
+		WARN_ON(1);
+		return 0;
+#else
+		if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+		    irqs_disabled()) {
+			WARN_ON(1);
+			return 0;
+		}
+		rcu_barrier();
+		rcu_barrier_sched();
+		rcu_barrier_bh();
+		debug_object_free(head, &rcuhead_debug_descr);
+		return 1;
+#endif
+	default:
+		return 0;
+	}
+}
+
+/**
+ * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
+ * @head: pointer to rcu_head structure to be initialized
+ *
+ * This function informs debugobjects of a new rcu_head structure that
+ * has been allocated as an auto variable on the stack.  This function
+ * is not required for rcu_head structures that are statically defined or
+ * that are dynamically allocated on the heap.  This function has no
+ * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
+ */
+void init_rcu_head_on_stack(struct rcu_head *head)
+{
+	debug_object_init_on_stack(head, &rcuhead_debug_descr);
+}
+EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
+
+/**
+ * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
+ * @head: pointer to rcu_head structure to be initialized
+ *
+ * This function informs debugobjects that an on-stack rcu_head structure
+ * is about to go out of scope.  As with init_rcu_head_on_stack(), this
+ * function is not required for rcu_head structures that are statically
+ * defined or that are dynamically allocated on the heap.  Also as with
+ * init_rcu_head_on_stack(), this function has no effect for
+ * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
+ */
+void destroy_rcu_head_on_stack(struct rcu_head *head)
+{
+	debug_object_free(head, &rcuhead_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
+
+struct debug_obj_descr rcuhead_debug_descr = {
+	.name = "rcu_head",
+	.fixup_init = rcuhead_fixup_init,
+	.fixup_activate = rcuhead_fixup_activate,
+	.fixup_free = rcuhead_fixup_free,
+};
+EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
+#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..196ec02f8be0 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 	while (list) {
 		next = list->next;
 		prefetch(next);
+		debug_rcu_head_unqueue(list);
 		list->func(list);
 		list = next;
 	}
@@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head,
 {
 	unsigned long flags;
 
+	debug_rcu_head_queue(head);
 	head->func = func;
 	head->next = NULL;
 
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..d5bc43976c5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	while (list) {
 		next = list->next;
 		prefetch(next);
+		debug_rcu_head_unqueue(list);
 		list->func(list);
 		list = next;
 		if (++count >= rdp->blimit)
@@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	unsigned long flags;
 	struct rcu_data *rdp;
 
+	debug_rcu_head_queue(head);
 	head->func = func;
 	head->next = NULL;
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e722e9d62221..142faa2ec665 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -307,6 +307,12 @@ config DEBUG_OBJECTS_WORK
 	  work queue routines to track the life time of work objects and
 	  validate the work operations.
 
+config DEBUG_OBJECTS_RCU_HEAD
+	bool "Debug RCU callbacks objects"
+	depends on DEBUG_OBJECTS && PREEMPT
+	help
+	  Enable this to turn on debugging of RCU list heads (call_rcu() usage).
+
 config DEBUG_OBJECTS_ENABLE_DEFAULT
 	int "debug_objects bootup default value (0-1)"
         range 0 1
-- 
cgit v1.2.3-59-g8ed1b


From f5155b33277c9678041a27869165619bb34f722f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 30 Apr 2010 06:42:01 -0700
Subject: rcu: add an rcu_dereference_index_check()

The sparse RCU-pointer checking relies on type magic that dereferences
the pointer in question.  This does not work if the pointer is in fact
an array index.  This commit therefore supplies a new RCU API that
omits the sparse checking to continue to support rcu_dereference()
on integers.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 2b7fc506e479..9fbc54a2585d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -566,4 +566,37 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
 }
 #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
+#ifndef CONFIG_PROVE_RCU
+#define __do_rcu_dereference_check(c) do { } while (0)
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+#define __rcu_dereference_index_check(p, c) \
+	({ \
+		typeof(p) _________p1 = ACCESS_ONCE(p); \
+		__do_rcu_dereference_check(c); \
+		smp_read_barrier_depends(); \
+		(_________p1); \
+	})
+
+/**
+ * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * Similar to rcu_dereference_check(), but omits the sparse checking.
+ * This allows rcu_dereference_index_check() to be used on integers,
+ * which can then be used as array indices.  Attempting to use
+ * rcu_dereference_check() on an integer will give compiler warnings
+ * because the sparse address-space mechanism relies on dereferencing
+ * the RCU-protected pointer.  Dereferencing integers is not something
+ * that even gcc will put up with.
+ *
+ * Note that this function does not implicitly check for RCU read-side
+ * critical sections.  If this function gains lots of uses, it might
+ * make sense to provide versions for each flavor of RCU, but it does
+ * not make sense as of early 2010.
+ */
+#define rcu_dereference_index_check(p, c) \
+	__rcu_dereference_index_check((p), (c))
+
 #endif /* __LINUX_RCUPDATE_H */
-- 
cgit v1.2.3-59-g8ed1b


From 71d1d5c722db9ae3b3f9c08ef7ddcd7759fbb1e0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 11 May 2010 16:13:14 -0700
Subject: rcu: add __rcu API for later sparse checking

This commit defines an __rcu API, but provides only vacuous definitions
for it.  This breaks dependencies among most of the subsequent patches,
allowing them to reach mainline asynchronously via whatever trees are
appropriate.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Christopher Li <sparse@chrisli.org>
Cc: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/compiler.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index a5a472b10746..c1a62c56a660 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -16,6 +16,7 @@
 # define __release(x)	__context__(x,-1)
 # define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
 # define __percpu	__attribute__((noderef, address_space(3)))
+# define __rcu
 extern void __chk_user_ptr(const volatile void __user *);
 extern void __chk_io_ptr(const volatile void __iomem *);
 #else
@@ -34,6 +35,7 @@ extern void __chk_io_ptr(const volatile void __iomem *);
 # define __release(x) (void)0
 # define __cond_lock(x,c) (c)
 # define __percpu
+# define __rcu
 #endif
 
 #ifdef __KERNEL__
-- 
cgit v1.2.3-59-g8ed1b


From a25909a4d4a29e272f953e12595bf2f04a292dbd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 13 May 2010 12:32:28 -0700
Subject: lockdep: Add an in_workqueue_context() lockdep-based test function

Some recent uses of RCU make use of workqueues.  In these uses, execution
within the context of a specific workqueue takes the place of the usual
RCU read-side primitives such as rcu_read_lock(), and flushing of workqueues
takes the place of the usual RCU grace-period primitives.  Checking for
correct use of rcu_dereference() in such cases requires a test of whether
the code is executing in the context of a particular workqueue.  This
commit adds an in_workqueue_context() function that provides this test.
This new function is only defined when lockdep is enabled, which allows
it to be used as the second argument of rcu_dereference_check().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/workqueue.h |  4 ++++
 kernel/workqueue.c        | 15 +++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 9466e860d8c2..d0f7c8178498 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -297,4 +297,8 @@ static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 #else
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg);
 #endif /* CONFIG_SMP */
+
+#ifdef CONFIG_LOCKDEP
+int in_workqueue_context(struct workqueue_struct *wq);
+#endif
 #endif
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..59fef1531dd2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,21 @@ struct workqueue_struct {
 #endif
 };
 
+#ifdef CONFIG_LOCKDEP
+/**
+ * in_workqueue_context() - in context of specified workqueue?
+ * @wq: the workqueue of interest
+ *
+ * Checks lockdep state to see if the current task is executing from
+ * within a workqueue item.  This function exists only if lockdep is
+ * enabled.
+ */
+int in_workqueue_context(struct workqueue_struct *wq)
+{
+	return lock_is_held(&wq->lockdep_map);
+}
+#endif
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
-- 
cgit v1.2.3-59-g8ed1b


From 2c666df80764389886110c942a7916ba9622583d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 29 Apr 2010 20:48:47 -0700
Subject: vfs: add fs.h to define struct file

The sparse RCU-pointer annotations require definition of the
underlying type of any pointer passed to rcu_dereference() and friends.
So fcheck_files() needs "struct file" to be defined, so include fs.h.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
---
 include/linux/fdtable.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 013dc529e95f..551671e87927 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -11,6 +11,7 @@
 #include <linux/rcupdate.h>
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/fs.h>
 
 #include <asm/atomic.h>
 
-- 
cgit v1.2.3-59-g8ed1b


From 81bdf5bd7349bd4523538cbd7878f334bc2bfe14 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 2 May 2010 18:10:06 -0700
Subject: net: Make accesses to ->br_port safe for sparse RCU

The new versions of the rcu_dereference() APIs requires that any pointers
passed to one of these APIs be fully defined.  The ->br_port field
in struct net_device points to a struct net_bridge_port, which is an
incomplete type.  This commit therefore changes ->br_port to be a void*,
and introduces a br_port() helper function to convert the type to struct
net_bridge_port, and applies this new helper function where required.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Stephen Hemminger <shemminger@linux-foundation.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/if_bridge.h           | 3 +++
 net/bridge/br_fdb.c                 | 2 +-
 net/bridge/br_private.h             | 5 +++++
 net/bridge/netfilter/ebt_redirect.c | 2 +-
 net/bridge/netfilter/ebt_ulog.c     | 4 ++--
 net/bridge/netfilter/ebtables.c     | 4 ++--
 net/netfilter/nfnetlink_log.c       | 4 ++--
 net/netfilter/nfnetlink_queue.c     | 4 ++--
 8 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 938b7e81df95..d001d782922d 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -101,6 +101,9 @@ struct __fdb_entry {
 
 #include <linux/netdevice.h>
 
+/* br_handle_frame_hook() needs the following forward declaration. */
+struct net_bridge_port;
+
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
 extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
 					       struct sk_buff *skb);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 26637439965b..845710bca49c 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -246,7 +246,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 		return 0;
 
 	rcu_read_lock();
-	fdb = __br_fdb_get(dev->br_port->br, addr);
+	fdb = __br_fdb_get(br_port(dev)->br, addr);
 	ret = fdb && fdb->dst->dev != dev &&
 		fdb->dst->state == BR_STATE_FORWARDING;
 	rcu_read_unlock();
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0f4a74bc6a9b..3255188355b5 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -268,6 +268,11 @@ static inline int br_is_root_bridge(const struct net_bridge *br)
 	return !memcmp(&br->bridge_id, &br->designated_root, 8);
 }
 
+static inline struct net_bridge_port *br_port(const struct net_device *dev)
+{
+	return rcu_dereference(dev->br_port);
+}
+
 /* br_device.c */
 extern void br_dev_setup(struct net_device *dev);
 extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 9e19166ba453..a39df0ae0f81 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -25,7 +25,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	if (par->hooknum != NF_BR_BROUTING)
 		memcpy(eth_hdr(skb)->h_dest,
-		       par->in->br_port->br->dev->dev_addr, ETH_ALEN);
+		       br_port(par->in)->br->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN);
 	skb->pkt_type = PACKET_HOST;
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index ae3c7cef1484..5a4996bbb090 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -178,7 +178,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 		strcpy(pm->physindev, in->name);
 		/* If in isn't a bridge, then physindev==indev */
 		if (in->br_port)
-			strcpy(pm->indev, in->br_port->br->dev->name);
+			strcpy(pm->indev, br_port(in)->br->dev->name);
 		else
 			strcpy(pm->indev, in->name);
 	} else
@@ -187,7 +187,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 	if (out) {
 		/* If out exists, then out is a bridge port */
 		strcpy(pm->physoutdev, out->name);
-		strcpy(pm->outdev, out->br_port->br->dev->name);
+		strcpy(pm->outdev, br_port(out)->br->dev->name);
 	} else
 		pm->outdev[0] = pm->physoutdev[0] = '\0';
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 59ca00e40dec..4c2aab8cbfc7 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -141,10 +141,10 @@ ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h,
 	if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
 		return 1;
 	if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check(
-	   e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN))
+	   e->logical_in, br_port(in)->br->dev), EBT_ILOGICALIN))
 		return 1;
 	if ((!out || !out->br_port) ? 0 : FWINV2(ebt_dev_check(
-	   e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT))
+	   e->logical_out, br_port(out)->br->dev), EBT_ILOGICALOUT))
 		return 1;
 
 	if (e->bitmask & EBT_SOURCEMAC) {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index fc9a211e629e..78957cfa3bdd 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -404,7 +404,7 @@ __build_packet_message(struct nfulnl_instance *inst,
 				     htonl(indev->ifindex));
 			/* this is the bridge group "brX" */
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
-				     htonl(indev->br_port->br->dev->ifindex));
+				     htonl(br_port(indev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is bridge group, we need to look for
 			 * physical device (when called from ipv4) */
@@ -431,7 +431,7 @@ __build_packet_message(struct nfulnl_instance *inst,
 				     htonl(outdev->ifindex));
 			/* this is the bridge group "brX" */
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
-				     htonl(outdev->br_port->br->dev->ifindex));
+				     htonl(br_port(outdev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is a bridge group, we need to look
 			 * for physical device (when called from ipv4) */
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 12e1ab37fcd8..c3c17498298e 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -297,7 +297,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 				     htonl(indev->ifindex));
 			/* this is the bridge group "brX" */
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
-				     htonl(indev->br_port->br->dev->ifindex));
+				     htonl(br_port(indev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is bridge group, we need to look for
 			 * physical device (when called from ipv4) */
@@ -322,7 +322,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 				     htonl(outdev->ifindex));
 			/* this is the bridge group "brX" */
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
-				     htonl(outdev->br_port->br->dev->ifindex));
+				     htonl(br_port(outdev)->br->dev->ifindex));
 		} else {
 			/* Case 2: outdev is bridge group, we need to look for
 			 * physical output device (when called from ipv4) */
-- 
cgit v1.2.3-59-g8ed1b


From 1be3b5fe9dffe3300f995584d8f996dd20e29412 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 14 Jun 2010 08:53:26 +0000
Subject: ethtool: Revert incorrect indentation changes

commit 97f8aefbbfb5aa5c9944e5fa8149f1fdaf71c7b6 "net: fix ethtool
coding style errors and warnings" changed the indentation of several
macro definitions in ethtool.h.  These definitions line up in the diff
where there is an extra character at the start of each line, but not
in the resulting file.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 276b40a16835..2c8af093d8b3 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -586,29 +586,29 @@ struct ethtool_ops {
 #define ETHTOOL_GREGS		0x00000004 /* Get NIC registers. */
 #define ETHTOOL_GWOL		0x00000005 /* Get wake-on-lan options. */
 #define ETHTOOL_SWOL		0x00000006 /* Set wake-on-lan options. */
-#define ETHTOOL_GMSGLVL	0x00000007 /* Get driver message level */
-#define ETHTOOL_SMSGLVL	0x00000008 /* Set driver msg level. */
+#define ETHTOOL_GMSGLVL		0x00000007 /* Get driver message level */
+#define ETHTOOL_SMSGLVL		0x00000008 /* Set driver msg level. */
 #define ETHTOOL_NWAY_RST	0x00000009 /* Restart autonegotiation. */
 #define ETHTOOL_GLINK		0x0000000a /* Get link status (ethtool_value) */
-#define ETHTOOL_GEEPROM	0x0000000b /* Get EEPROM data */
-#define ETHTOOL_SEEPROM	0x0000000c /* Set EEPROM data. */
+#define ETHTOOL_GEEPROM		0x0000000b /* Get EEPROM data */
+#define ETHTOOL_SEEPROM		0x0000000c /* Set EEPROM data. */
 #define ETHTOOL_GCOALESCE	0x0000000e /* Get coalesce config */
 #define ETHTOOL_SCOALESCE	0x0000000f /* Set coalesce config. */
 #define ETHTOOL_GRINGPARAM	0x00000010 /* Get ring parameters */
 #define ETHTOOL_SRINGPARAM	0x00000011 /* Set ring parameters. */
 #define ETHTOOL_GPAUSEPARAM	0x00000012 /* Get pause parameters */
 #define ETHTOOL_SPAUSEPARAM	0x00000013 /* Set pause parameters. */
-#define ETHTOOL_GRXCSUM	0x00000014 /* Get RX hw csum enable (ethtool_value) */
-#define ETHTOOL_SRXCSUM	0x00000015 /* Set RX hw csum enable (ethtool_value) */
-#define ETHTOOL_GTXCSUM	0x00000016 /* Get TX hw csum enable (ethtool_value) */
-#define ETHTOOL_STXCSUM	0x00000017 /* Set TX hw csum enable (ethtool_value) */
+#define ETHTOOL_GRXCSUM		0x00000014 /* Get RX hw csum enable (ethtool_value) */
+#define ETHTOOL_SRXCSUM		0x00000015 /* Set RX hw csum enable (ethtool_value) */
+#define ETHTOOL_GTXCSUM		0x00000016 /* Get TX hw csum enable (ethtool_value) */
+#define ETHTOOL_STXCSUM		0x00000017 /* Set TX hw csum enable (ethtool_value) */
 #define ETHTOOL_GSG		0x00000018 /* Get scatter-gather enable
 					    * (ethtool_value) */
 #define ETHTOOL_SSG		0x00000019 /* Set scatter-gather enable
 					    * (ethtool_value). */
 #define ETHTOOL_TEST		0x0000001a /* execute NIC self-test. */
 #define ETHTOOL_GSTRINGS	0x0000001b /* get specified string set */
-#define ETHTOOL_PHYS_ID	0x0000001c /* identify the NIC */
+#define ETHTOOL_PHYS_ID		0x0000001c /* identify the NIC */
 #define ETHTOOL_GSTATS		0x0000001d /* get NIC-specific statistics */
 #define ETHTOOL_GTSO		0x0000001e /* Get TSO enable (ethtool_value) */
 #define ETHTOOL_STSO		0x0000001f /* Set TSO enable (ethtool_value) */
@@ -619,8 +619,8 @@ struct ethtool_ops {
 #define ETHTOOL_SGSO		0x00000024 /* Set GSO enable (ethtool_value) */
 #define ETHTOOL_GFLAGS		0x00000025 /* Get flags bitmap(ethtool_value) */
 #define ETHTOOL_SFLAGS		0x00000026 /* Set flags bitmap(ethtool_value) */
-#define ETHTOOL_GPFLAGS	0x00000027 /* Get driver-private flags bitmap */
-#define ETHTOOL_SPFLAGS	0x00000028 /* Set driver-private flags bitmap */
+#define ETHTOOL_GPFLAGS		0x00000027 /* Get driver-private flags bitmap */
+#define ETHTOOL_SPFLAGS		0x00000028 /* Set driver-private flags bitmap */
 
 #define ETHTOOL_GRXFH		0x00000029 /* Get RX flow hash configuration */
 #define ETHTOOL_SRXFH		0x0000002a /* Set RX flow hash configuration */
@@ -645,18 +645,18 @@ struct ethtool_ops {
 /* Indicates what features are supported by the interface. */
 #define SUPPORTED_10baseT_Half		(1 << 0)
 #define SUPPORTED_10baseT_Full		(1 << 1)
-#define SUPPORTED_100baseT_Half	(1 << 2)
-#define SUPPORTED_100baseT_Full	(1 << 3)
+#define SUPPORTED_100baseT_Half		(1 << 2)
+#define SUPPORTED_100baseT_Full		(1 << 3)
 #define SUPPORTED_1000baseT_Half	(1 << 4)
 #define SUPPORTED_1000baseT_Full	(1 << 5)
 #define SUPPORTED_Autoneg		(1 << 6)
 #define SUPPORTED_TP			(1 << 7)
 #define SUPPORTED_AUI			(1 << 8)
 #define SUPPORTED_MII			(1 << 9)
-#define SUPPORTED_FIBRE		(1 << 10)
+#define SUPPORTED_FIBRE			(1 << 10)
 #define SUPPORTED_BNC			(1 << 11)
 #define SUPPORTED_10000baseT_Full	(1 << 12)
-#define SUPPORTED_Pause		(1 << 13)
+#define SUPPORTED_Pause			(1 << 13)
 #define SUPPORTED_Asym_Pause		(1 << 14)
 #define SUPPORTED_2500baseX_Full	(1 << 15)
 #define SUPPORTED_Backplane		(1 << 16)
@@ -666,8 +666,8 @@ struct ethtool_ops {
 #define SUPPORTED_10000baseR_FEC	(1 << 20)
 
 /* Indicates what features are advertised by the interface. */
-#define ADVERTISED_10baseT_Half	(1 << 0)
-#define ADVERTISED_10baseT_Full	(1 << 1)
+#define ADVERTISED_10baseT_Half		(1 << 0)
+#define ADVERTISED_10baseT_Full		(1 << 1)
 #define ADVERTISED_100baseT_Half	(1 << 2)
 #define ADVERTISED_100baseT_Full	(1 << 3)
 #define ADVERTISED_1000baseT_Half	(1 << 4)
@@ -706,12 +706,12 @@ struct ethtool_ops {
 #define DUPLEX_FULL		0x01
 
 /* Which connector port. */
-#define PORT_TP		0x00
+#define PORT_TP			0x00
 #define PORT_AUI		0x01
 #define PORT_MII		0x02
 #define PORT_FIBRE		0x03
 #define PORT_BNC		0x04
-#define PORT_DA		0x05
+#define PORT_DA			0x05
 #define PORT_NONE		0xef
 #define PORT_OTHER		0xff
 
@@ -725,7 +725,7 @@ struct ethtool_ops {
 /* Enable or disable autonegotiation.  If this is set to enable,
  * the forced link modes above are completely ignored.
  */
-#define AUTONEG_DISABLE	0x00
+#define AUTONEG_DISABLE		0x00
 #define AUTONEG_ENABLE		0x01
 
 /* Mode MDI or MDI-X */
-- 
cgit v1.2.3-59-g8ed1b


From 0902b469bd25065aa0688c3cee6f11744c817e7c Mon Sep 17 00:00:00 2001
From: Luciano Coelho <luciano.coelho@nokia.com>
Date: Tue, 15 Jun 2010 15:04:00 +0200
Subject: netfilter: xtables: idletimer target implementation

This patch implements an idletimer Xtables target that can be used to
identify when interfaces have been idle for a certain period of time.

Timers are identified by labels and are created when a rule is set with a new
label.  The rules also take a timeout value (in seconds) as an option.  If
more than one rule uses the same timer label, the timer will be restarted
whenever any of the rules get a hit.

One entry for each timer is created in sysfs.  This attribute contains the
timer remaining for the timer to expire.  The attributes are located under
the xt_idletimer class:

/sys/class/xt_idletimer/timers/<label>

When the timer expires, the target module sends a sysfs notification to the
userspace, which can then decide what to do (eg. disconnect to save power).

Cc: Timo Teras <timo.teras@iki.fi>
Signed-off-by: Luciano Coelho <luciano.coelho@nokia.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/Kbuild         |   1 +
 include/linux/netfilter/xt_IDLETIMER.h |  45 +++++
 net/netfilter/Kconfig                  |  12 ++
 net/netfilter/Makefile                 |   1 +
 net/netfilter/xt_IDLETIMER.c           | 314 +++++++++++++++++++++++++++++++++
 5 files changed, 373 insertions(+)
 create mode 100644 include/linux/netfilter/xt_IDLETIMER.h
 create mode 100644 net/netfilter/xt_IDLETIMER.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index 48767cd16453..bb103f43afa0 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -8,6 +8,7 @@ header-y += xt_CONNMARK.h
 header-y += xt_CONNSECMARK.h
 header-y += xt_CT.h
 header-y += xt_DSCP.h
+header-y += xt_IDLETIMER.h
 header-y += xt_LED.h
 header-y += xt_MARK.h
 header-y += xt_NFLOG.h
diff --git a/include/linux/netfilter/xt_IDLETIMER.h b/include/linux/netfilter/xt_IDLETIMER.h
new file mode 100644
index 000000000000..3e1aa1be942e
--- /dev/null
+++ b/include/linux/netfilter/xt_IDLETIMER.h
@@ -0,0 +1,45 @@
+/*
+ * linux/include/linux/netfilter/xt_IDLETIMER.h
+ *
+ * Header file for Xtables timer target module.
+ *
+ * Copyright (C) 2004, 2010 Nokia Corporation
+ * Written by Timo Teras <ext-timo.teras@nokia.com>
+ *
+ * Converted to x_tables and forward-ported to 2.6.34
+ * by Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * Contact: Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#ifndef _XT_IDLETIMER_H
+#define _XT_IDLETIMER_H
+
+#include <linux/types.h>
+
+#define MAX_IDLETIMER_LABEL_SIZE 28
+
+struct idletimer_tg_info {
+	__u32 timeout;
+
+	char label[MAX_IDLETIMER_LABEL_SIZE];
+
+	/* for kernel module internal use only */
+	struct idletimer_tg *timer __attribute((aligned(8)));
+};
+
+#endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8593a77cfea9..413ed24a968a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -424,6 +424,18 @@ config NETFILTER_XT_TARGET_HL
 	since you can easily create immortal packets that loop
 	forever on the network.
 
+config NETFILTER_XT_TARGET_IDLETIMER
+	tristate  "IDLETIMER target support"
+	depends on NETFILTER_ADVANCED
+	help
+
+	  This option adds the `IDLETIMER' target.  Each matching packet
+	  resets the timer associated with label specified when the rule is
+	  added.  When the timer expires, it triggers a sysfs notification.
+	  The remaining time for expiration can be read via sysfs.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_LED
 	tristate '"LED" target support'
 	depends on LEDS_CLASS && LEDS_TRIGGERS
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 14e3a8fd8180..e28420aac5ef 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
 
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
new file mode 100644
index 000000000000..e11090a0675c
--- /dev/null
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -0,0 +1,314 @@
+/*
+ * linux/net/netfilter/xt_IDLETIMER.c
+ *
+ * Netfilter module to trigger a timer when packet matches.
+ * After timer expires a kevent will be sent.
+ *
+ * Copyright (C) 2004, 2010 Nokia Corporation
+ * Written by Timo Teras <ext-timo.teras@nokia.com>
+ *
+ * Converted to x_tables and reworked for upstream inclusion
+ * by Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * Contact: Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_IDLETIMER.h>
+#include <linux/kobject.h>
+#include <linux/workqueue.h>
+#include <linux/sysfs.h>
+
+struct idletimer_tg_attr {
+	struct attribute attr;
+	ssize_t	(*show)(struct kobject *kobj,
+			struct attribute *attr, char *buf);
+};
+
+struct idletimer_tg {
+	struct list_head entry;
+	struct timer_list timer;
+	struct work_struct work;
+
+	struct kobject *kobj;
+	struct idletimer_tg_attr attr;
+
+	unsigned int refcnt;
+};
+
+static LIST_HEAD(idletimer_tg_list);
+static DEFINE_MUTEX(list_mutex);
+
+static struct kobject *idletimer_tg_kobj;
+
+static
+struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
+{
+	struct idletimer_tg *entry;
+
+	BUG_ON(!label);
+
+	list_for_each_entry(entry, &idletimer_tg_list, entry) {
+		if (!strcmp(label, entry->attr.attr.name))
+			return entry;
+	}
+
+	return NULL;
+}
+
+static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
+				 char *buf)
+{
+	struct idletimer_tg *timer;
+	unsigned long expires = 0;
+
+	mutex_lock(&list_mutex);
+
+	timer =	__idletimer_tg_find_by_label(attr->name);
+	if (timer)
+		expires = timer->timer.expires;
+
+	mutex_unlock(&list_mutex);
+
+	if (time_after(expires, jiffies))
+		return sprintf(buf, "%u\n",
+			       jiffies_to_msecs(expires - jiffies) / 1000);
+
+	return sprintf(buf, "0\n");
+}
+
+static void idletimer_tg_work(struct work_struct *work)
+{
+	struct idletimer_tg *timer = container_of(work, struct idletimer_tg,
+						  work);
+
+	sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+}
+
+static void idletimer_tg_expired(unsigned long data)
+{
+	struct idletimer_tg *timer = (struct idletimer_tg *) data;
+
+	pr_debug("timer %s expired\n", timer->attr.attr.name);
+
+	schedule_work(&timer->work);
+}
+
+static int idletimer_tg_create(struct idletimer_tg_info *info)
+{
+	int ret;
+
+	info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+	if (!info->timer) {
+		pr_debug("couldn't alloc timer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+	if (!info->timer->attr.attr.name) {
+		pr_debug("couldn't alloc attribute name\n");
+		ret = -ENOMEM;
+		goto out_free_timer;
+	}
+	info->timer->attr.attr.mode = S_IRUGO;
+	info->timer->attr.show = idletimer_tg_show;
+
+	ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+	if (ret < 0) {
+		pr_debug("couldn't add file to sysfs");
+		goto out_free_attr;
+	}
+
+	list_add(&info->timer->entry, &idletimer_tg_list);
+
+	setup_timer(&info->timer->timer, idletimer_tg_expired,
+		    (unsigned long) info->timer);
+	info->timer->refcnt = 1;
+
+	mod_timer(&info->timer->timer,
+		  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+	INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+	return 0;
+
+out_free_attr:
+	kfree(info->timer->attr.attr.name);
+out_free_timer:
+	kfree(info->timer);
+out:
+	return ret;
+}
+
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target(struct sk_buff *skb,
+					 const struct xt_action_param *par)
+{
+	const struct idletimer_tg_info *info = par->targinfo;
+
+	pr_debug("resetting timer %s, timeout period %u\n",
+		 info->label, info->timeout);
+
+	BUG_ON(!info->timer);
+
+	mod_timer(&info->timer->timer,
+		  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+	return XT_CONTINUE;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+	struct idletimer_tg_info *info = par->targinfo;
+	int ret;
+
+	pr_debug("checkentry targinfo%s\n", info->label);
+
+	if (info->timeout == 0) {
+		pr_debug("timeout value is zero\n");
+		return -EINVAL;
+	}
+
+	if (info->label[0] == '\0' ||
+	    strnlen(info->label,
+		    MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) {
+		pr_debug("label is empty or not nul-terminated\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&list_mutex);
+
+	info->timer = __idletimer_tg_find_by_label(info->label);
+	if (info->timer) {
+		info->timer->refcnt++;
+		mod_timer(&info->timer->timer,
+			  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+		pr_debug("increased refcnt of timer %s to %u\n",
+			 info->label, info->timer->refcnt);
+	} else {
+		ret = idletimer_tg_create(info);
+		if (ret < 0) {
+			pr_debug("failed to create timer\n");
+			mutex_unlock(&list_mutex);
+			return ret;
+		}
+	}
+
+	mutex_unlock(&list_mutex);
+	return 0;
+}
+
+static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	const struct idletimer_tg_info *info = par->targinfo;
+
+	pr_debug("destroy targinfo %s\n", info->label);
+
+	mutex_lock(&list_mutex);
+
+	if (--info->timer->refcnt == 0) {
+		pr_debug("deleting timer %s\n", info->label);
+
+		list_del(&info->timer->entry);
+		del_timer_sync(&info->timer->timer);
+		sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+		kfree(info->timer->attr.attr.name);
+		kfree(info->timer);
+	} else {
+		pr_debug("decreased refcnt of timer %s to %u\n",
+			 info->label, info->timer->refcnt);
+	}
+
+	mutex_unlock(&list_mutex);
+}
+
+static struct xt_target idletimer_tg __read_mostly = {
+	.name		= "IDLETIMER",
+	.family		= NFPROTO_UNSPEC,
+	.target		= idletimer_tg_target,
+	.targetsize     = sizeof(struct idletimer_tg_info),
+	.checkentry	= idletimer_tg_checkentry,
+	.destroy        = idletimer_tg_destroy,
+	.me		= THIS_MODULE,
+};
+
+static struct class *idletimer_tg_class;
+
+static struct device *idletimer_tg_device;
+
+static int __init idletimer_tg_init(void)
+{
+	int err;
+
+	idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+	err = PTR_ERR(idletimer_tg_class);
+	if (IS_ERR(idletimer_tg_class)) {
+		pr_debug("couldn't register device class\n");
+		goto out;
+	}
+
+	idletimer_tg_device = device_create(idletimer_tg_class, NULL,
+					    MKDEV(0, 0), NULL, "timers");
+	err = PTR_ERR(idletimer_tg_device);
+	if (IS_ERR(idletimer_tg_device)) {
+		pr_debug("couldn't register system device\n");
+		goto out_class;
+	}
+
+	idletimer_tg_kobj = &idletimer_tg_device->kobj;
+
+	err =  xt_register_target(&idletimer_tg);
+	if (err < 0) {
+		pr_debug("couldn't register xt target\n");
+		goto out_dev;
+	}
+
+	return 0;
+out_dev:
+	device_destroy(idletimer_tg_class, MKDEV(0, 0));
+out_class:
+	class_destroy(idletimer_tg_class);
+out:
+	return err;
+}
+
+static void __exit idletimer_tg_exit(void)
+{
+	xt_unregister_target(&idletimer_tg);
+
+	device_destroy(idletimer_tg_class, MKDEV(0, 0));
+	class_destroy(idletimer_tg_class);
+}
+
+module_init(idletimer_tg_init);
+module_exit(idletimer_tg_exit);
+
+MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
+MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
+MODULE_DESCRIPTION("Xtables: idle time monitor");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-59-g8ed1b


From de85d99eb7b595f6751550184b94c1e2f74a828b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 10 Jun 2010 16:12:44 +0000
Subject: netpoll: Fix RCU usage

The use of RCU in netpoll is incorrect in a number of places:

1) The initial setting is lacking a write barrier.
2) The synchronize_rcu is in the wrong place.
3) Read barriers are missing.
4) Some places are even missing rcu_read_lock.
5) npinfo is zeroed after freeing.

This patch fixes those issues.  As most users are in BH context,
this also converts the RCU usage to the BH variant.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 13 ++++++++-----
 net/core/netpoll.c      | 20 ++++++++++++--------
 2 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index e9e231215865..95c9f7e16776 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -57,12 +57,15 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb);
 #ifdef CONFIG_NETPOLL
 static inline bool netpoll_rx(struct sk_buff *skb)
 {
-	struct netpoll_info *npinfo = skb->dev->npinfo;
+	struct netpoll_info *npinfo;
 	unsigned long flags;
 	bool ret = false;
 
+	rcu_read_lock_bh();
+	npinfo = rcu_dereference(skb->dev->npinfo);
+
 	if (!npinfo || (list_empty(&npinfo->rx_np) && !npinfo->rx_flags))
-		return false;
+		goto out;
 
 	spin_lock_irqsave(&npinfo->rx_lock, flags);
 	/* check rx_flags again with the lock held */
@@ -70,12 +73,14 @@ static inline bool netpoll_rx(struct sk_buff *skb)
 		ret = true;
 	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
 
+out:
+	rcu_read_unlock_bh();
 	return ret;
 }
 
 static inline int netpoll_rx_on(struct sk_buff *skb)
 {
-	struct netpoll_info *npinfo = skb->dev->npinfo;
+	struct netpoll_info *npinfo = rcu_dereference(skb->dev->npinfo);
 
 	return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags);
 }
@@ -91,7 +96,6 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi)
 {
 	struct net_device *dev = napi->dev;
 
-	rcu_read_lock(); /* deal with race on ->npinfo */
 	if (dev && dev->npinfo) {
 		spin_lock(&napi->poll_lock);
 		napi->poll_owner = smp_processor_id();
@@ -108,7 +112,6 @@ static inline void netpoll_poll_unlock(void *have)
 		napi->poll_owner = -1;
 		spin_unlock(&napi->poll_lock);
 	}
-	rcu_read_unlock();
 }
 
 #else
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 19ff66079f76..e9ab4f0c454c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -261,6 +261,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 	unsigned long tries;
 	struct net_device *dev = np->dev;
 	const struct net_device_ops *ops = dev->netdev_ops;
+	/* It is up to the caller to keep npinfo alive. */
 	struct netpoll_info *npinfo = np->dev->npinfo;
 
 	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
@@ -810,10 +811,7 @@ int netpoll_setup(struct netpoll *np)
 	refill_skbs();
 
 	/* last thing to do is link it to the net device structure */
-	ndev->npinfo = npinfo;
-
-	/* avoid racing with NAPI reading npinfo */
-	synchronize_rcu();
+	rcu_assign_pointer(ndev->npinfo, npinfo);
 
 	return 0;
 
@@ -857,6 +855,16 @@ void netpoll_cleanup(struct netpoll *np)
 
 			if (atomic_dec_and_test(&npinfo->refcnt)) {
 				const struct net_device_ops *ops;
+
+				ops = np->dev->netdev_ops;
+				if (ops->ndo_netpoll_cleanup)
+					ops->ndo_netpoll_cleanup(np->dev);
+
+				rcu_assign_pointer(np->dev->npinfo, NULL);
+
+				/* avoid racing with NAPI reading npinfo */
+				synchronize_rcu_bh();
+
 				skb_queue_purge(&npinfo->arp_tx);
 				skb_queue_purge(&npinfo->txq);
 				cancel_rearming_delayed_work(&npinfo->tx_work);
@@ -864,10 +872,6 @@ void netpoll_cleanup(struct netpoll *np)
 				/* clean after last, unfinished work */
 				__skb_queue_purge(&npinfo->txq);
 				kfree(npinfo);
-				ops = np->dev->netdev_ops;
-				if (ops->ndo_netpoll_cleanup)
-					ops->ndo_netpoll_cleanup(np->dev);
-				np->dev->npinfo = NULL;
 			}
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From 4247e161b12f8dffb7ee3ee07bc5e61f714ebe2d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 10 Jun 2010 16:12:47 +0000
Subject: netpoll: Add ndo_netpoll_setup

This patch adds ndo_netpoll_setup as the initialisation primitive
to complement ndo_netpoll_cleanup.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 ++
 net/core/netpoll.c        | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4fbccc5f609a..fb20cc55ba52 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -744,6 +744,8 @@ struct net_device_ops {
 						        unsigned short vid);
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	void                    (*ndo_poll_controller)(struct net_device *dev);
+	int			(*ndo_netpoll_setup)(struct net_device *dev,
+						     struct netpoll_info *info);
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index d10c249bcc8f..7de6dcad5d79 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -698,6 +698,7 @@ int netpoll_setup(struct netpoll *np)
 	struct net_device *ndev = NULL;
 	struct in_device *in_dev;
 	struct netpoll_info *npinfo;
+	const struct net_device_ops *ops;
 	unsigned long flags;
 	int err;
 
@@ -797,6 +798,13 @@ int netpoll_setup(struct netpoll *np)
 		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
 
 		atomic_set(&npinfo->refcnt, 1);
+
+		ops = np->dev->netdev_ops;
+		if (ops->ndo_netpoll_setup) {
+			err = ops->ndo_netpoll_setup(ndev, npinfo);
+			if (err)
+				goto free_npinfo;
+		}
 	} else {
 		npinfo = ndev->npinfo;
 		atomic_inc(&npinfo->refcnt);
@@ -817,6 +825,8 @@ int netpoll_setup(struct netpoll *np)
 
 	return 0;
 
+free_npinfo:
+	kfree(npinfo);
 unlock:
 	rtnl_unlock();
 put:
-- 
cgit v1.2.3-59-g8ed1b


From 8fdd95ec162a8fbac7f41d6f54f90402fe3e8cb1 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 10 Jun 2010 16:12:48 +0000
Subject: netpoll: Allow netpoll_setup/cleanup recursion

This patch adds the functions __netpoll_setup/__netpoll_cleanup
which is designed to be called recursively through ndo_netpoll_seutp.

They must be called with RTNL held, and the caller must initialise
np->dev and ensure that it has a valid reference count.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h |   2 +
 net/core/netpoll.c      | 176 ++++++++++++++++++++++++++----------------------
 2 files changed, 99 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 95c9f7e16776..f3ad74af7e1f 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -46,9 +46,11 @@ void netpoll_poll(struct netpoll *np);
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len);
 void netpoll_print_options(struct netpoll *np);
 int netpoll_parse_options(struct netpoll *np, char *opt);
+int __netpoll_setup(struct netpoll *np);
 int netpoll_setup(struct netpoll *np);
 int netpoll_trap(void);
 void netpoll_set_trap(int trap);
+void __netpoll_cleanup(struct netpoll *np);
 void netpoll_cleanup(struct netpoll *np);
 int __netpoll_rx(struct sk_buff *skb);
 void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 7de6dcad5d79..560297ee55b4 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -693,15 +693,78 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 	return -1;
 }
 
-int netpoll_setup(struct netpoll *np)
+int __netpoll_setup(struct netpoll *np)
 {
-	struct net_device *ndev = NULL;
-	struct in_device *in_dev;
+	struct net_device *ndev = np->dev;
 	struct netpoll_info *npinfo;
 	const struct net_device_ops *ops;
 	unsigned long flags;
 	int err;
 
+	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+	    !ndev->netdev_ops->ndo_poll_controller) {
+		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
+		       np->name, np->dev_name);
+		err = -ENOTSUPP;
+		goto out;
+	}
+
+	if (!ndev->npinfo) {
+		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+		if (!npinfo) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		npinfo->rx_flags = 0;
+		INIT_LIST_HEAD(&npinfo->rx_np);
+
+		spin_lock_init(&npinfo->rx_lock);
+		skb_queue_head_init(&npinfo->arp_tx);
+		skb_queue_head_init(&npinfo->txq);
+		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
+
+		atomic_set(&npinfo->refcnt, 1);
+
+		ops = np->dev->netdev_ops;
+		if (ops->ndo_netpoll_setup) {
+			err = ops->ndo_netpoll_setup(ndev, npinfo);
+			if (err)
+				goto free_npinfo;
+		}
+	} else {
+		npinfo = ndev->npinfo;
+		atomic_inc(&npinfo->refcnt);
+	}
+
+	npinfo->netpoll = np;
+
+	if (np->rx_hook) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+		list_add_tail(&np->rx, &npinfo->rx_np);
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+
+	/* last thing to do is link it to the net device structure */
+	rcu_assign_pointer(ndev->npinfo, npinfo);
+	rtnl_unlock();
+
+	return 0;
+
+free_npinfo:
+	kfree(npinfo);
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(__netpoll_setup);
+
+int netpoll_setup(struct netpoll *np)
+{
+	struct net_device *ndev = NULL;
+	struct in_device *in_dev;
+	int err;
+
 	if (np->dev_name)
 		ndev = dev_get_by_name(&init_net, np->dev_name);
 	if (!ndev) {
@@ -774,61 +837,14 @@ int netpoll_setup(struct netpoll *np)
 	refill_skbs();
 
 	rtnl_lock();
-	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
-	    !ndev->netdev_ops->ndo_poll_controller) {
-		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
-		       np->name, np->dev_name);
-		err = -ENOTSUPP;
-		goto unlock;
-	}
-
-	if (!ndev->npinfo) {
-		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
-		if (!npinfo) {
-			err = -ENOMEM;
-			goto unlock;
-		}
-
-		npinfo->rx_flags = 0;
-		INIT_LIST_HEAD(&npinfo->rx_np);
-
-		spin_lock_init(&npinfo->rx_lock);
-		skb_queue_head_init(&npinfo->arp_tx);
-		skb_queue_head_init(&npinfo->txq);
-		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
-
-		atomic_set(&npinfo->refcnt, 1);
-
-		ops = np->dev->netdev_ops;
-		if (ops->ndo_netpoll_setup) {
-			err = ops->ndo_netpoll_setup(ndev, npinfo);
-			if (err)
-				goto free_npinfo;
-		}
-	} else {
-		npinfo = ndev->npinfo;
-		atomic_inc(&npinfo->refcnt);
-	}
-
-	npinfo->netpoll = np;
-
-	if (np->rx_hook) {
-		spin_lock_irqsave(&npinfo->rx_lock, flags);
-		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
-		list_add_tail(&np->rx, &npinfo->rx_np);
-		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-	}
-
-	/* last thing to do is link it to the net device structure */
-	rcu_assign_pointer(ndev->npinfo, npinfo);
+	err = __netpoll_setup(np);
 	rtnl_unlock();
 
+	if (err)
+		goto put;
+
 	return 0;
 
-free_npinfo:
-	kfree(npinfo);
-unlock:
-	rtnl_unlock();
 put:
 	dev_put(ndev);
 	return err;
@@ -841,40 +857,32 @@ static int __init netpoll_init(void)
 }
 core_initcall(netpoll_init);
 
-void netpoll_cleanup(struct netpoll *np)
+void __netpoll_cleanup(struct netpoll *np)
 {
 	struct netpoll_info *npinfo;
 	unsigned long flags;
-	int free = 0;
 
-	if (!np->dev)
+	npinfo = np->dev->npinfo;
+	if (!npinfo)
 		return;
 
-	rtnl_lock();
-	npinfo = np->dev->npinfo;
-	if (npinfo) {
-		if (!list_empty(&npinfo->rx_np)) {
-			spin_lock_irqsave(&npinfo->rx_lock, flags);
-			list_del(&np->rx);
-			if (list_empty(&npinfo->rx_np))
-				npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
-			spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-		}
+	if (!list_empty(&npinfo->rx_np)) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		list_del(&np->rx);
+		if (list_empty(&npinfo->rx_np))
+			npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
 
-		free = atomic_dec_and_test(&npinfo->refcnt);
-		if (free) {
-			const struct net_device_ops *ops;
+	if (atomic_dec_and_test(&npinfo->refcnt)) {
+		const struct net_device_ops *ops;
 
-			ops = np->dev->netdev_ops;
-			if (ops->ndo_netpoll_cleanup)
-				ops->ndo_netpoll_cleanup(np->dev);
+		ops = np->dev->netdev_ops;
+		if (ops->ndo_netpoll_cleanup)
+			ops->ndo_netpoll_cleanup(np->dev);
 
-			rcu_assign_pointer(np->dev->npinfo, NULL);
-		}
-	}
-	rtnl_unlock();
+		rcu_assign_pointer(np->dev->npinfo, NULL);
 
-	if (free) {
 		/* avoid racing with NAPI reading npinfo */
 		synchronize_rcu_bh();
 
@@ -886,9 +894,19 @@ void netpoll_cleanup(struct netpoll *np)
 		__skb_queue_purge(&npinfo->txq);
 		kfree(npinfo);
 	}
+}
+EXPORT_SYMBOL_GPL(__netpoll_cleanup);
 
-	dev_put(np->dev);
+void netpoll_cleanup(struct netpoll *np)
+{
+	if (!np->dev)
+		return;
 
+	rtnl_lock();
+	__netpoll_cleanup(np);
+	rtnl_unlock();
+
+	dev_put(np->dev);
 	np->dev = NULL;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From c18370f5b2949d9cca519355f33690b75e1e7c8b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 10 Jun 2010 16:12:49 +0000
Subject: netpoll: Add netpoll_tx_running

This patch adds the helper netpoll_tx_running for use within
ndo_start_xmit.  It returns non-zero if ndo_start_xmit is being
invoked by netpoll, and zero otherwise.

This is currently implemented by simply looking at the hardirq
count.  This is because for all non-netpoll uses of ndo_start_xmit,
IRQs must be enabled while netpoll always disables IRQs before
calling ndo_start_xmit.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index f3ad74af7e1f..4c77fe78ceff 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -116,6 +116,11 @@ static inline void netpoll_poll_unlock(void *have)
 	}
 }
 
+static inline int netpoll_tx_running(struct net_device *dev)
+{
+	return irqs_disabled();
+}
+
 #else
 static inline int netpoll_rx(struct sk_buff *skb)
 {
@@ -139,6 +144,10 @@ static inline void netpoll_poll_unlock(void *have)
 static inline void netpoll_netdev_init(struct net_device *dev)
 {
 }
+static inline int netpoll_tx_running(struct net_device *dev)
+{
+	return 0;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 93e2c32b5cb2ad92ceb1d7a4684f20a0d25bf530 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Thu, 10 Jun 2010 03:34:59 +0000
Subject: net: add rx_handler data pointer

Add possibility to register rx_handler data pointer along with a rx_handler.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c     | 2 +-
 include/linux/netdevice.h | 4 +++-
 net/bridge/br_if.c        | 2 +-
 net/core/dev.c            | 6 +++++-
 4 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 59c315556a30..87a3bf69c4a3 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -532,7 +532,7 @@ static int macvlan_port_create(struct net_device *dev)
 		INIT_HLIST_HEAD(&port->vlan_hash[i]);
 	rcu_assign_pointer(dev->macvlan_port, port);
 
-	err = netdev_rx_handler_register(dev, macvlan_handle_frame);
+	err = netdev_rx_handler_register(dev, macvlan_handle_frame, NULL);
 	if (err) {
 		rcu_assign_pointer(dev->macvlan_port, NULL);
 		kfree(port);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fb20cc55ba52..361ff1145cf1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -979,6 +979,7 @@ struct net_device {
 
 	struct netdev_queue	rx_queue;
 	rx_handler_func_t	*rx_handler;
+	void			*rx_handler_data;
 
 	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
 
@@ -1712,7 +1713,8 @@ static inline void napi_free_frags(struct napi_struct *napi)
 }
 
 extern int netdev_rx_handler_register(struct net_device *dev,
-				      rx_handler_func_t *rx_handler);
+				      rx_handler_func_t *rx_handler,
+				      void *rx_handler_data);
 extern void netdev_rx_handler_unregister(struct net_device *dev);
 
 extern void		netif_nit_deliver(struct sk_buff *skb);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 97ac9da4d76c..0d142ed0bbe3 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -433,7 +433,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 
 	rcu_assign_pointer(dev->br_port, p);
 
-	err = netdev_rx_handler_register(dev, br_handle_frame);
+	err = netdev_rx_handler_register(dev, br_handle_frame, NULL);
 	if (err)
 		goto err4;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index a1abc10db08a..abdb19e547a7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2703,6 +2703,7 @@ void netif_nit_deliver(struct sk_buff *skb)
  *	netdev_rx_handler_register - register receive handler
  *	@dev: device to register a handler for
  *	@rx_handler: receive handler to register
+ *	@rx_handler_data: data pointer that is used by rx handler
  *
  *	Register a receive hander for a device. This handler will then be
  *	called from __netif_receive_skb. A negative errno code is returned
@@ -2711,13 +2712,15 @@ void netif_nit_deliver(struct sk_buff *skb)
  *	The caller must hold the rtnl_mutex.
  */
 int netdev_rx_handler_register(struct net_device *dev,
-			       rx_handler_func_t *rx_handler)
+			       rx_handler_func_t *rx_handler,
+			       void *rx_handler_data)
 {
 	ASSERT_RTNL();
 
 	if (dev->rx_handler)
 		return -EBUSY;
 
+	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 	rcu_assign_pointer(dev->rx_handler, rx_handler);
 
 	return 0;
@@ -2737,6 +2740,7 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 
 	ASSERT_RTNL();
 	rcu_assign_pointer(dev->rx_handler, NULL);
+	rcu_assign_pointer(dev->rx_handler_data, NULL);
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-- 
cgit v1.2.3-59-g8ed1b


From a35e2c1b6d90544b3c688783869817628e5f9607 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 15 Jun 2010 03:27:57 +0000
Subject: macvlan: use rx_handler_data pointer to store macvlan_port pointer V2

Register macvlan_port pointer as rx_handler data pointer. As macvlan_port is
removed from struct net_device, another netdev priv_flag is added to indicate
the device serves as a macvlan port.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c     | 28 ++++++++++++++++------------
 include/linux/if.h        |  1 +
 include/linux/netdevice.h |  2 --
 3 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 87a3bf69c4a3..e096875aa055 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -40,6 +40,11 @@ struct macvlan_port {
 	struct rcu_head		rcu;
 };
 
+#define macvlan_port_get_rcu(dev) \
+	((struct macvlan_port *) rcu_dereference(dev->rx_handler_data))
+#define macvlan_port_get(dev) ((struct macvlan_port *) dev->rx_handler_data)
+#define macvlan_port_exists(dev) (dev->priv_flags & IFF_MACVLAN_PORT)
+
 static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
 					       const unsigned char *addr)
 {
@@ -155,7 +160,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
 	struct net_device *dev;
 	unsigned int len;
 
-	port = rcu_dereference(skb->dev->macvlan_port);
+	port = macvlan_port_get_rcu(skb->dev);
 	if (is_multicast_ether_addr(eth->h_dest)) {
 		src = macvlan_hash_lookup(port, eth->h_source);
 		if (!src)
@@ -530,14 +535,12 @@ static int macvlan_port_create(struct net_device *dev)
 	INIT_LIST_HEAD(&port->vlans);
 	for (i = 0; i < MACVLAN_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(&port->vlan_hash[i]);
-	rcu_assign_pointer(dev->macvlan_port, port);
 
-	err = netdev_rx_handler_register(dev, macvlan_handle_frame, NULL);
-	if (err) {
-		rcu_assign_pointer(dev->macvlan_port, NULL);
+	err = netdev_rx_handler_register(dev, macvlan_handle_frame, port);
+	if (err)
 		kfree(port);
-	}
 
+	dev->priv_flags |= IFF_MACVLAN_PORT;
 	return err;
 }
 
@@ -551,10 +554,10 @@ static void macvlan_port_rcu_free(struct rcu_head *head)
 
 static void macvlan_port_destroy(struct net_device *dev)
 {
-	struct macvlan_port *port = dev->macvlan_port;
+	struct macvlan_port *port = macvlan_port_get(dev);
 
+	dev->priv_flags &= ~IFF_MACVLAN_PORT;
 	netdev_rx_handler_unregister(dev);
-	rcu_assign_pointer(dev->macvlan_port, NULL);
 	call_rcu(&port->rcu, macvlan_port_rcu_free);
 }
 
@@ -633,12 +636,12 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 	if (!tb[IFLA_ADDRESS])
 		random_ether_addr(dev->dev_addr);
 
-	if (lowerdev->macvlan_port == NULL) {
+	if (!macvlan_port_exists(lowerdev)) {
 		err = macvlan_port_create(lowerdev);
 		if (err < 0)
 			return err;
 	}
-	port = lowerdev->macvlan_port;
+	port = macvlan_port_get(lowerdev);
 
 	vlan->lowerdev = lowerdev;
 	vlan->dev      = dev;
@@ -748,10 +751,11 @@ static int macvlan_device_event(struct notifier_block *unused,
 	struct macvlan_dev *vlan, *next;
 	struct macvlan_port *port;
 
-	port = dev->macvlan_port;
-	if (port == NULL)
+	if (!macvlan_port_exists(dev))
 		return NOTIFY_DONE;
 
+	port = macvlan_port_get(dev);
+
 	switch (event) {
 	case NETDEV_CHANGE:
 		list_for_each_entry(vlan, &port->vlans, list)
diff --git a/include/linux/if.h b/include/linux/if.h
index be350e62a905..31f2e27ebcd0 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -73,6 +73,7 @@
 #define IFF_DONT_BRIDGE 0x800		/* disallow bridging this ether dev */
 #define IFF_IN_NETPOLL	0x1000		/* whether we are processing netpoll */
 #define IFF_DISABLE_NETPOLL	0x2000	/* disable netpoll at run-time */
+#define IFF_MACVLAN_PORT	0x4000	/* device used as macvlan port */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 361ff1145cf1..5f231de2032f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1049,8 +1049,6 @@ struct net_device {
 
 	/* bridge stuff */
 	struct net_bridge_port	*br_port;
-	/* macvlan */
-	struct macvlan_port	*macvlan_port;
 	/* GARP */
 	struct garp_port	*garp_port;
 
-- 
cgit v1.2.3-59-g8ed1b


From f350a0a87374418635689471606454abc7beaa3a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 15 Jun 2010 06:50:45 +0000
Subject: bridge: use rx_handler_data pointer to store net_bridge_port pointer

Register net_bridge_port pointer as rx_handler data pointer. As br_port is
removed from struct net_device, another netdev priv_flag is added to indicate
the device serves as a bridge port. Also rcuized pointers are now correctly
dereferenced in br_fdb.c and in netfilter parts.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ksz884x.c                       |  2 +-
 drivers/staging/batman-adv/hard-interface.c |  2 +-
 include/linux/if.h                          |  1 +
 include/linux/netdevice.h                   |  2 --
 net/bridge/br_fdb.c                         |  4 ++--
 net/bridge/br_if.c                          | 23 +++++++++++++----------
 net/bridge/br_input.c                       |  9 ++++-----
 net/bridge/br_netfilter.c                   | 11 ++++++-----
 net/bridge/br_netlink.c                     |  9 +++++----
 net/bridge/br_notify.c                      |  5 +++--
 net/bridge/br_private.h                     |  5 +++++
 net/bridge/br_stp_bpdu.c                    |  5 +++--
 net/bridge/netfilter/ebt_redirect.c         |  3 ++-
 net/bridge/netfilter/ebt_ulog.c             |  8 +++++---
 net/bridge/netfilter/ebtables.c             | 11 +++++++----
 net/core/dev.c                              |  3 ++-
 net/netfilter/nfnetlink_log.c               |  6 ++++--
 net/netfilter/nfnetlink_queue.c             |  6 ++++--
 net/wireless/nl80211.c                      |  2 +-
 net/wireless/util.c                         |  4 ++--
 20 files changed, 71 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ksz884x.c b/drivers/net/ksz884x.c
index 7805bbf1d53a..62362b4a8c56 100644
--- a/drivers/net/ksz884x.c
+++ b/drivers/net/ksz884x.c
@@ -5718,7 +5718,7 @@ static void dev_set_promiscuous(struct net_device *dev, struct dev_priv *priv,
 		 * from the bridge.
 		 */
 		if ((hw->features & STP_SUPPORT) && !promiscuous &&
-				dev->br_port) {
+		    (dev->priv_flags & IFF_BRIDGE_PORT)) {
 			struct ksz_switch *sw = hw->ksz_switch;
 			int port = priv->port.first_port;
 
diff --git a/drivers/staging/batman-adv/hard-interface.c b/drivers/staging/batman-adv/hard-interface.c
index 7a582e80de18..5ede9c255094 100644
--- a/drivers/staging/batman-adv/hard-interface.c
+++ b/drivers/staging/batman-adv/hard-interface.c
@@ -71,7 +71,7 @@ static int is_valid_iface(struct net_device *net_dev)
 #endif
 
 	/* Device is being bridged */
-	/* if (net_dev->br_port != NULL)
+	/* if (net_dev->priv_flags & IFF_BRIDGE_PORT)
 		return 0; */
 
 	return 1;
diff --git a/include/linux/if.h b/include/linux/if.h
index 31f2e27ebcd0..53558ec59e1b 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -74,6 +74,7 @@
 #define IFF_IN_NETPOLL	0x1000		/* whether we are processing netpoll */
 #define IFF_DISABLE_NETPOLL	0x2000	/* disable netpoll at run-time */
 #define IFF_MACVLAN_PORT	0x4000	/* device used as macvlan port */
+#define IFF_BRIDGE_PORT	0x8000		/* device used as bridge port */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5f231de2032f..a7e0458029b5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1047,8 +1047,6 @@ struct net_device {
 	/* mid-layer private */
 	void			*ml_priv;
 
-	/* bridge stuff */
-	struct net_bridge_port	*br_port;
 	/* GARP */
 	struct garp_port	*garp_port;
 
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 26637439965b..6818e609b2c0 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -242,11 +242,11 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 	struct net_bridge_fdb_entry *fdb;
 	int ret;
 
-	if (!dev->br_port)
+	if (!br_port_exists(dev))
 		return 0;
 
 	rcu_read_lock();
-	fdb = __br_fdb_get(dev->br_port->br, addr);
+	fdb = __br_fdb_get(br_port_get_rcu(dev)->br, addr);
 	ret = fdb && fdb->dst->dev != dev &&
 		fdb->dst->state == BR_STATE_FORWARDING;
 	rcu_read_unlock();
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 0d142ed0bbe3..c03d2c3ff03e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -147,8 +147,9 @@ static void del_nbp(struct net_bridge_port *p)
 
 	list_del_rcu(&p->list);
 
+	dev->priv_flags &= ~IFF_BRIDGE_PORT;
+
 	netdev_rx_handler_unregister(dev);
-	rcu_assign_pointer(dev->br_port, NULL);
 
 	br_multicast_del_port(p);
 
@@ -400,7 +401,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 		return -ELOOP;
 
 	/* Device is already being bridged */
-	if (dev->br_port != NULL)
+	if (br_port_exists(dev))
 		return -EBUSY;
 
 	/* No bridging devices that dislike that (e.g. wireless) */
@@ -431,11 +432,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
 		goto err3;
 
-	rcu_assign_pointer(dev->br_port, p);
-
-	err = netdev_rx_handler_register(dev, br_handle_frame, NULL);
+	err = netdev_rx_handler_register(dev, br_handle_frame, p);
 	if (err)
-		goto err4;
+		goto err3;
+
+	dev->priv_flags |= IFF_BRIDGE_PORT;
 
 	dev_disable_lro(dev);
 
@@ -457,8 +458,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	kobject_uevent(&p->kobj, KOBJ_ADD);
 
 	return 0;
-err4:
-	rcu_assign_pointer(dev->br_port, NULL);
 err3:
 	sysfs_remove_link(br->ifobj, p->dev->name);
 err2:
@@ -477,9 +476,13 @@ put_back:
 /* called with RTNL */
 int br_del_if(struct net_bridge *br, struct net_device *dev)
 {
-	struct net_bridge_port *p = dev->br_port;
+	struct net_bridge_port *p;
+
+	if (!br_port_exists(dev))
+		return -EINVAL;
 
-	if (!p || p->br != br)
+	p = br_port_get(dev);
+	if (p->br != br)
 		return -EINVAL;
 
 	del_nbp(p);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 99647d8f95c8..f076c9d79d5e 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -41,7 +41,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
 int br_handle_frame_finish(struct sk_buff *skb)
 {
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
-	struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
 	struct net_bridge *br;
 	struct net_bridge_fdb_entry *dst;
 	struct net_bridge_mdb_entry *mdst;
@@ -111,10 +111,9 @@ drop:
 /* note: already called with rcu_read_lock (preempt_disabled) */
 static int br_handle_local_finish(struct sk_buff *skb)
 {
-	struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
 
-	if (p)
-		br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
 	return 0;	 /* process further */
 }
 
@@ -151,7 +150,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb)
 	if (!skb)
 		return NULL;
 
-	p = rcu_dereference(skb->dev->br_port);
+	p = br_port_get_rcu(skb->dev);
 
 	if (unlikely(is_link_local(dest))) {
 		/* Pause frames shouldn't be passed up by driver anyway */
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 0685b2558ab5..f54404ddee53 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -127,16 +127,17 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 
 static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
 {
-	struct net_bridge_port *port = rcu_dereference(dev->br_port);
-
-	return port ? &port->br->fake_rtable : NULL;
+	if (!br_port_exists(dev))
+		return NULL;
+	return &br_port_get_rcu(dev)->br->fake_rtable;
 }
 
 static inline struct net_device *bridge_parent(const struct net_device *dev)
 {
-	struct net_bridge_port *port = rcu_dereference(dev->br_port);
+	if (!br_port_exists(dev))
+		return NULL;
 
-	return port ? port->br->dev : NULL;
+	return br_port_get_rcu(dev)->br->dev;
 }
 
 static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index fe0a79018ab2..4a6a378c84e3 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -120,10 +120,11 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	idx = 0;
 	for_each_netdev(net, dev) {
 		/* not a bridge port */
-		if (dev->br_port == NULL || idx < cb->args[0])
+		if (!br_port_exists(dev) || idx < cb->args[0])
 			goto skip;
 
-		if (br_fill_ifinfo(skb, dev->br_port, NETLINK_CB(cb->skb).pid,
+		if (br_fill_ifinfo(skb, br_port_get(dev),
+				   NETLINK_CB(cb->skb).pid,
 				   cb->nlh->nlmsg_seq, RTM_NEWLINK,
 				   NLM_F_MULTI) < 0)
 			break;
@@ -168,9 +169,9 @@ static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
 	if (!dev)
 		return -ENODEV;
 
-	p = dev->br_port;
-	if (!p)
+	if (!br_port_exists(dev))
 		return -EINVAL;
+	p = br_port_get(dev);
 
 	/* if kernel STP is running, don't allow changes */
 	if (p->br->stp_enabled == BR_KERNEL_STP)
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 717e1fd6133c..404d4e14c6a7 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -32,14 +32,15 @@ struct notifier_block br_device_notifier = {
 static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct net_bridge_port *p = dev->br_port;
+	struct net_bridge_port *p = br_port_get(dev);
 	struct net_bridge *br;
 	int err;
 
 	/* not a port of a bridge */
-	if (p == NULL)
+	if (!br_port_exists(dev))
 		return NOTIFY_DONE;
 
+	p = br_port_get(dev);
 	br = p->br;
 
 	switch (event) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0f5394c4f2f1..f6bc979b1135 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -150,6 +150,11 @@ struct net_bridge_port
 #endif
 };
 
+#define br_port_get_rcu(dev) \
+	((struct net_bridge_port *) rcu_dereference(dev->rx_handler_data))
+#define br_port_get(dev) ((struct net_bridge_port *) dev->rx_handler_data)
+#define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT)
+
 struct br_cpu_netstats {
 	unsigned long	rx_packets;
 	unsigned long	rx_bytes;
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 217bd225a42f..70aecb48fb69 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -137,12 +137,13 @@ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
 		struct net_device *dev)
 {
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
-	struct net_bridge_port *p = rcu_dereference(dev->br_port);
+	struct net_bridge_port *p;
 	struct net_bridge *br;
 	const unsigned char *buf;
 
-	if (!p)
+	if (!br_port_exists(dev))
 		goto err;
+	p = br_port_get_rcu(dev);
 
 	if (!pskb_may_pull(skb, 4))
 		goto err;
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 9e19166ba453..46624bb6d9be 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -24,8 +24,9 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		return EBT_DROP;
 
 	if (par->hooknum != NF_BR_BROUTING)
+		/* rcu_read_lock()ed by nf_hook_slow */
 		memcpy(eth_hdr(skb)->h_dest,
-		       par->in->br_port->br->dev->dev_addr, ETH_ALEN);
+		       br_port_get_rcu(par->in)->br->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN);
 	skb->pkt_type = PACKET_HOST;
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index ae3c7cef1484..26377e96fa1c 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -177,8 +177,9 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 	if (in) {
 		strcpy(pm->physindev, in->name);
 		/* If in isn't a bridge, then physindev==indev */
-		if (in->br_port)
-			strcpy(pm->indev, in->br_port->br->dev->name);
+		if (br_port_exists(in))
+			/* rcu_read_lock()ed by nf_hook_slow */
+			strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name);
 		else
 			strcpy(pm->indev, in->name);
 	} else
@@ -187,7 +188,8 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 	if (out) {
 		/* If out exists, then out is a bridge port */
 		strcpy(pm->physoutdev, out->name);
-		strcpy(pm->outdev, out->br_port->br->dev->name);
+		/* rcu_read_lock()ed by nf_hook_slow */
+		strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name);
 	} else
 		pm->outdev[0] = pm->physoutdev[0] = '\0';
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 59ca00e40dec..bcc102e3be4d 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -140,11 +140,14 @@ ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h,
 		return 1;
 	if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
 		return 1;
-	if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check(
-	   e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN))
+	/* rcu_read_lock()ed by nf_hook_slow */
+	if (in && br_port_exists(in) &&
+	    FWINV2(ebt_dev_check(e->logical_in, br_port_get_rcu(in)->br->dev),
+		   EBT_ILOGICALIN))
 		return 1;
-	if ((!out || !out->br_port) ? 0 : FWINV2(ebt_dev_check(
-	   e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT))
+	if (out && br_port_exists(out) &&
+	    FWINV2(ebt_dev_check(e->logical_out, br_port_get_rcu(out)->br->dev),
+		   EBT_ILOGICALOUT))
 		return 1;
 
 	if (e->bitmask & EBT_SOURCEMAC) {
diff --git a/net/core/dev.c b/net/core/dev.c
index abdb19e547a7..5902426ef585 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2765,7 +2765,8 @@ int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
 	if (master->priv_flags & IFF_MASTER_ARPMON)
 		dev->last_rx = jiffies;
 
-	if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+	if ((master->priv_flags & IFF_MASTER_ALB) &&
+	    (master->priv_flags & IFF_BRIDGE_PORT)) {
 		/* Do address unmangle. The local destination address
 		 * will be always the one master has. Provides the right
 		 * functionality in a bridge.
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index fc9a211e629e..e0504e90a0f0 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -403,8 +403,9 @@ __build_packet_message(struct nfulnl_instance *inst,
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
 				     htonl(indev->ifindex));
 			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
-				     htonl(indev->br_port->br->dev->ifindex));
+				     htonl(br_port_get_rcu(indev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is bridge group, we need to look for
 			 * physical device (when called from ipv4) */
@@ -430,8 +431,9 @@ __build_packet_message(struct nfulnl_instance *inst,
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
 				     htonl(outdev->ifindex));
 			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
-				     htonl(outdev->br_port->br->dev->ifindex));
+				     htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is a bridge group, we need to look
 			 * for physical device (when called from ipv4) */
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 12e1ab37fcd8..cc3ae861e8f3 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -296,8 +296,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV,
 				     htonl(indev->ifindex));
 			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by __nf_queue */
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
-				     htonl(indev->br_port->br->dev->ifindex));
+				     htonl(br_port_get_rcu(indev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is bridge group, we need to look for
 			 * physical device (when called from ipv4) */
@@ -321,8 +322,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV,
 				     htonl(outdev->ifindex));
 			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by __nf_queue */
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
-				     htonl(outdev->br_port->br->dev->ifindex));
+				     htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
 		} else {
 			/* Case 2: outdev is bridge group, we need to look for
 			 * physical output device (when called from ipv4) */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 90ab3c8519be..3a7b8a2f2d5a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1107,7 +1107,7 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
 			       enum nl80211_iftype iftype)
 {
 	if (!use_4addr) {
-		if (netdev && netdev->br_port)
+		if (netdev && (netdev->priv_flags & IFF_BRIDGE_PORT))
 			return -EBUSY;
 		return 0;
 	}
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 3416373a9c0c..0c8a1e8b7690 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -770,8 +770,8 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 		return -EOPNOTSUPP;
 
 	/* if it's part of a bridge, reject changing type to station/ibss */
-	if (dev->br_port && (ntype == NL80211_IFTYPE_ADHOC ||
-			     ntype == NL80211_IFTYPE_STATION))
+	if ((dev->priv_flags & IFF_BRIDGE_PORT) &&
+	    (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION))
 		return -EBUSY;
 
 	if (ntype != otype) {
-- 
cgit v1.2.3-59-g8ed1b


From 82695d9b186dcefe9bd119b53521deec20858f19 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 15 Jun 2010 15:08:48 -0700
Subject: net: Fix error in comment on net_device_ops::ndo_get_stats

ndo_get_stats still returns struct net_device_stats *; there is
no struct net_device_stats64.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a7e0458029b5..398f6c28cf8a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -673,7 +673,7 @@ struct netdev_rx_queue {
  *	1. Define @ndo_get_stats64 to update a rtnl_link_stats64 structure
  *	   (which should normally be dev->stats64) and return a ponter to
  *	   it. The structure must not be changed asynchronously.
- *	2. Define @ndo_get_stats to update a net_device_stats64 structure
+ *	2. Define @ndo_get_stats to update a net_device_stats structure
  *	   (which should normally be dev->stats) and return a pointer to
  *	   it. The structure may be changed asynchronously only if each
  *	   field is written atomically.
-- 
cgit v1.2.3-59-g8ed1b


From 5933dd2f028cdcbb4b3169dca594324704ba10ae Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 15 Jun 2010 18:16:43 -0700
Subject: net: NET_SKB_PAD should depend on L1_CACHE_BYTES

In old kernels, NET_SKB_PAD was defined to 16.

Then commit d6301d3dd1c2 (net: Increase default NET_SKB_PAD to 32), and
commit 18e8c134f4e9 (net: Increase NET_SKB_PAD to 64 bytes) increased it
to 64.

While first patch was governed by network stack needs, second was more
driven by performance issues on current hardware. Real intent was to
align data on a cache line boundary.

So use max(32, L1_CACHE_BYTES) instead of 64, to be more generic.

Remove microblaze and powerpc own NET_SKB_PAD definitions.

Thanks to Alexander Duyck and David Miller for their comments.

Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/microblaze/include/asm/system.h | 3 ---
 arch/powerpc/include/asm/system.h    | 3 ---
 include/linux/skbuff.h               | 8 +++++---
 3 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/system.h b/arch/microblaze/include/asm/system.h
index 48c4f0335e3f..81e1f7d5b4cb 100644
--- a/arch/microblaze/include/asm/system.h
+++ b/arch/microblaze/include/asm/system.h
@@ -101,10 +101,7 @@ extern struct dentry *of_debugfs_root;
  * MicroBlaze doesn't handle unaligned accesses in hardware.
  *
  * Based on this we force the IP header alignment in network drivers.
- * We also modify NET_SKB_PAD to be a cacheline in size, thus maintaining
- * cacheline alignment of buffers.
  */
 #define NET_IP_ALIGN	2
-#define NET_SKB_PAD	L1_CACHE_BYTES
 
 #endif /* _ASM_MICROBLAZE_SYSTEM_H */
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index a6297c67c3d6..6c294acac848 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -515,11 +515,8 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
  * powers of 2 writes until it reaches sufficient alignment).
  *
  * Based on this we disable the IP header alignment in network drivers.
- * We also modify NET_SKB_PAD to be a cacheline in size, thus maintaining
- * cacheline alignment of buffers.
  */
 #define NET_IP_ALIGN	0
-#define NET_SKB_PAD	L1_CACHE_BYTES
 
 #define cmpxchg64(ptr, o, n)						\
   ({									\
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 122d08396e56..ac74ee085d74 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1414,12 +1414,14 @@ static inline int skb_network_offset(const struct sk_buff *skb)
  *
  * Various parts of the networking layer expect at least 32 bytes of
  * headroom, you should not reduce this.
- * With RPS, we raised NET_SKB_PAD to 64 so that get_rps_cpus() fetches span
- * a 64 bytes aligned block to fit modern (>= 64 bytes) cache line sizes
+ *
+ * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS)
+ * to reduce average number of cache lines per packet.
+ * get_rps_cpus() for example only access one 64 bytes aligned block :
  * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
  */
 #ifndef NET_SKB_PAD
-#define NET_SKB_PAD	64
+#define NET_SKB_PAD	max(32, L1_CACHE_BYTES)
 #endif
 
 extern int ___pskb_trim(struct sk_buff *skb, unsigned int len);
-- 
cgit v1.2.3-59-g8ed1b


From d5f31fbfd8fa3836a918592032853c41d1797c3d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 15 Jun 2010 21:44:29 -0700
Subject: netpoll: Use correct primitives for RCU dereferencing

Now that RCU debugging checks for matching rcu_dereference calls
and rcu_read_lock, we need to use the correct primitives or face
nasty warnings.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 4c77fe78ceff..413742c92d14 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -64,7 +64,7 @@ static inline bool netpoll_rx(struct sk_buff *skb)
 	bool ret = false;
 
 	rcu_read_lock_bh();
-	npinfo = rcu_dereference(skb->dev->npinfo);
+	npinfo = rcu_dereference_bh(skb->dev->npinfo);
 
 	if (!npinfo || (list_empty(&npinfo->rx_np) && !npinfo->rx_flags))
 		goto out;
@@ -82,7 +82,7 @@ out:
 
 static inline int netpoll_rx_on(struct sk_buff *skb)
 {
-	struct netpoll_info *npinfo = rcu_dereference(skb->dev->npinfo);
+	struct netpoll_info *npinfo = rcu_dereference_bh(skb->dev->npinfo);
 
 	return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 46cd09a7de52cad464d35a75924b79984646288d Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 11 Jun 2010 12:16:57 +0200
Subject: fix typos concerning "acquire"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/arm/mach-omap2/sleep34xx.S | 2 +-
 include/linux/lru_cache.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S
index d522cd70bf53..ba53191ae4c5 100644
--- a/arch/arm/mach-omap2/sleep34xx.S
+++ b/arch/arm/mach-omap2/sleep34xx.S
@@ -60,7 +60,7 @@
 #define SDRC_DLLA_CTRL_V	OMAP34XX_SDRC_REGADDR(SDRC_DLLA_CTRL)
 
         .text
-/* Function to aquire the semaphore in scratchpad */
+/* Function to acquire the semaphore in scratchpad */
 ENTRY(lock_scratchpad_sem)
 	stmfd	sp!, {lr}	@ save registers on stack
 wait_sem:
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index de48d167568b..78fbf24f357a 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -262,7 +262,7 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char
  * @lc: the lru cache to operate on
  *
  * Note that the reference counts and order on the active and lru lists may
- * still change.  Returns true if we aquired the lock.
+ * still change.  Returns true if we acquired the lock.
  */
 static inline int lc_try_lock(struct lru_cache *lc)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 65155b3708137fabee865dc4da822763c0c41208 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 11 Jun 2010 12:17:01 +0200
Subject: fix typos concerning "management"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 2 +-
 drivers/media/dvb/siano/smscoreapi.c     | 2 +-
 drivers/net/benet/be_hw.h                | 2 +-
 drivers/scsi/fcoe/fcoe.c                 | 4 ++--
 drivers/scsi/mpt2sas/mpt2sas_base.h      | 2 +-
 drivers/scsi/mpt2sas/mpt2sas_scsih.c     | 4 ++--
 drivers/scsi/pm8001/pm8001_hwi.c         | 2 +-
 drivers/scsi/qla2xxx/qla_iocb.c          | 2 +-
 drivers/scsi/qla2xxx/qla_nx.h            | 2 +-
 include/linux/ide.h                      | 2 +-
 include/linux/if_link.h                  | 2 +-
 11 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index f8fbbc67a406..7745394c3e63 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -1013,7 +1013,7 @@ int vmw_gmr_id_alloc(struct vmw_private *dev_priv, uint32_t *p_id)
 }
 
 /*
- * Stream managment
+ * Stream management
  */
 
 static void vmw_stream_destroy(struct vmw_resource *res)
diff --git a/drivers/media/dvb/siano/smscoreapi.c b/drivers/media/dvb/siano/smscoreapi.c
index 0c87a3c3899a..a19f649666d5 100644
--- a/drivers/media/dvb/siano/smscoreapi.c
+++ b/drivers/media/dvb/siano/smscoreapi.c
@@ -1297,7 +1297,7 @@ int smsclient_sendrequest(struct smscore_client_t *client,
 EXPORT_SYMBOL_GPL(smsclient_sendrequest);
 
 
-/* old GPIO managments implementation */
+/* old GPIO managements implementation */
 int smscore_configure_gpio(struct smscore_device_t *coredev, u32 pin,
 			   struct smscore_config_gpio *pinconfig)
 {
diff --git a/drivers/net/benet/be_hw.h b/drivers/net/benet/be_hw.h
index 063026de4957..3f1b7c3965bb 100644
--- a/drivers/net/benet/be_hw.h
+++ b/drivers/net/benet/be_hw.h
@@ -52,7 +52,7 @@
  */
 #define MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK	(1 << 29) /* bit 29 */
 
-/********* Power managment (WOL) **********/
+/********* Power management (WOL) **********/
 #define PCICFG_PM_CONTROL_OFFSET		0x44
 #define PCICFG_PM_CONTROL_MASK			0x108	/* bits 3 & 8 */
 
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 9276121db1ef..bc39542481a4 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -2452,7 +2452,7 @@ module_exit(fcoe_exit);
  * @fp: response frame, or error encoded in a pointer (timeout)
  * @arg: pointer the the fcoe_ctlr structure
  *
- * This handles MAC address managment for FCoE, then passes control on to
+ * This handles MAC address management for FCoE, then passes control on to
  * the libfc FLOGI response handler.
  */
 static void fcoe_flogi_resp(struct fc_seq *seq, struct fc_frame *fp, void *arg)
@@ -2484,7 +2484,7 @@ done:
  * @fp: response frame, or error encoded in a pointer (timeout)
  * @arg: pointer the the fcoe_ctlr structure
  *
- * This handles MAC address managment for FCoE, then passes control on to
+ * This handles MAC address management for FCoE, then passes control on to
  * the libfc LOGO response handler.
  */
 static void fcoe_logo_resp(struct fc_seq *seq, struct fc_frame *fp, void *arg)
diff --git a/drivers/scsi/mpt2sas/mpt2sas_base.h b/drivers/scsi/mpt2sas/mpt2sas_base.h
index b4afe431ac1e..41c29a86e834 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_base.h
+++ b/drivers/scsi/mpt2sas/mpt2sas_base.h
@@ -474,7 +474,7 @@ typedef void (*MPT_ADD_SGE)(void *paddr, u32 flags_length, dma_addr_t dma_addr);
  * @shost_recovery: host reset in progress
  * @ioc_reset_in_progress_lock:
  * @ioc_link_reset_in_progress: phy/hard reset in progress
- * @ignore_loginfos: ignore loginfos during task managment
+ * @ignore_loginfos: ignore loginfos during task management
  * @remove_host: flag for when driver unloads, to avoid sending dev resets
  * @wait_for_port_enable_to_complete:
  * @msix_enable: flag indicating msix is enabled
diff --git a/drivers/scsi/mpt2sas/mpt2sas_scsih.c b/drivers/scsi/mpt2sas/mpt2sas_scsih.c
index c5ff26a2a51d..06d645a36f1b 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_scsih.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_scsih.c
@@ -2979,7 +2979,7 @@ _scsih_qcmd(struct scsi_cmnd *scmd, void (*done)(struct scsi_cmnd *))
 	/* host recovery or link resets sent via IOCTLs */
 	if (ioc->shost_recovery || ioc->ioc_link_reset_in_progress)
 		return SCSI_MLQUEUE_HOST_BUSY;
-	/* device busy with task managment */
+	/* device busy with task management */
 	else if (sas_device_priv_data->block || sas_target_priv_data->tm_busy)
 		return SCSI_MLQUEUE_DEVICE_BUSY;
 	/* device has been deleted */
@@ -6845,7 +6845,7 @@ _scsih_init(void)
 	 /* queuecommand callback hander */
 	scsi_io_cb_idx = mpt2sas_base_register_callback_handler(_scsih_io_done);
 
-	/* task managment callback handler */
+	/* task management callback handler */
 	tm_cb_idx = mpt2sas_base_register_callback_handler(_scsih_tm_done);
 
 	/* base internal commands callback handler */
diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c
index 5ff8261c5d67..0e05e8a22167 100644
--- a/drivers/scsi/pm8001/pm8001_hwi.c
+++ b/drivers/scsi/pm8001/pm8001_hwi.c
@@ -4152,7 +4152,7 @@ static int pm8001_chip_abort_task(struct pm8001_hba_info *pm8001_ha,
 }
 
 /**
- * pm8001_chip_ssp_tm_req - built the task managment command.
+ * pm8001_chip_ssp_tm_req - built the task management command.
  * @pm8001_ha: our hba card information.
  * @ccb: the ccb information.
  * @tmf: task management function.
diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c
index 8ef945365412..782b30d0eea1 100644
--- a/drivers/scsi/qla2xxx/qla_iocb.c
+++ b/drivers/scsi/qla2xxx/qla_iocb.c
@@ -1129,7 +1129,7 @@ qla24xx_build_scsi_crc_2_iocbs(srb_t *sp, struct cmd_type_crc_2 *cmd_pkt,
 	cmd_pkt->fcp_cmnd_dseg_address[1] = cpu_to_le32(
 	    MSD(crc_ctx_dma + CRC_CONTEXT_FCPCMND_OFF));
 	fcp_cmnd->task_attribute = 0;
-	fcp_cmnd->task_managment = 0;
+	fcp_cmnd->task_management = 0;
 
 	cmd_pkt->fcp_rsp_dseg_len = 0; /* Let response come in status iocb */
 
diff --git a/drivers/scsi/qla2xxx/qla_nx.h b/drivers/scsi/qla2xxx/qla_nx.h
index f8f99a5ea532..1b44d013f151 100644
--- a/drivers/scsi/qla2xxx/qla_nx.h
+++ b/drivers/scsi/qla2xxx/qla_nx.h
@@ -832,7 +832,7 @@ struct fcp_cmnd {
 	struct scsi_lun lun;
 	uint8_t crn;
 	uint8_t task_attribute;
-	uint8_t task_managment;
+	uint8_t task_management;
 	uint8_t additional_cdb_len;
 	uint8_t cdb[260]; /* 256 for CDB len and 4 for FCP_DL */
 };
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 3239d1c10acb..c2c598ed4eed 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -458,7 +458,7 @@ enum {
 	IDE_DFLAG_DOORLOCKING		= (1 << 15),
 	/* disallow DMA */
 	IDE_DFLAG_NODMA			= (1 << 16),
-	/* powermanagment told us not to do anything, so sleep nicely */
+	/* powermanagement told us not to do anything, so sleep nicely */
 	IDE_DFLAG_BLOCKED		= (1 << 17),
 	/* sleeping & sleep field valid */
 	IDE_DFLAG_SLEEPING		= (1 << 18),
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 85c812db5a3f..9d8f0807daed 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -233,7 +233,7 @@ enum macvlan_mode {
 	MACVLAN_MODE_BRIDGE  = 4, /* talk to bridge ports directly */
 };
 
-/* SR-IOV virtual function managment section */
+/* SR-IOV virtual function management section */
 
 enum {
 	IFLA_VF_INFO_UNSPEC,
-- 
cgit v1.2.3-59-g8ed1b


From 5c1469de7545a35a16ff2b902e217044a7d2f8a5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 13 Jun 2010 03:28:03 +0000
Subject: user_ns: Introduce user_nsmap_uid and user_ns_map_gid.

Define what happens when a we view a uid from one user_namespace
in another user_namepece.

- If the user namespaces are the same no mapping is necessary.

- For most cases of difference use overflowuid and overflowgid,
  the uid and gid currently used for 16bit apis when we have a 32bit uid
  that does fit in 16bits.  Effectively the situation is the same,
  we want to return a uid or gid that is not assigned to any user.

- For the case when we happen to be mapping the uid or gid of the
  creator of the target user namespace use uid 0 and gid as confusing
  that user with root is not a problem.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/user_namespace.h | 14 ++++++++++++++
 kernel/user_namespace.c        | 44 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index cc4f45361dbb..8178156711f9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -36,6 +36,9 @@ static inline void put_user_ns(struct user_namespace *ns)
 		kref_put(&ns->kref, free_user_ns);
 }
 
+uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid);
+gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid);
+
 #else
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -52,6 +55,17 @@ static inline void put_user_ns(struct user_namespace *ns)
 {
 }
 
+static inline uid_t user_ns_map_uid(struct user_namespace *to,
+	const struct cred *cred, uid_t uid)
+{
+	return uid;
+}
+static inline gid_t user_ns_map_gid(struct user_namespace *to,
+	const struct cred *cred, gid_t gid)
+{
+	return gid;
+}
+
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/highuid.h>
 #include <linux/cred.h>
 
 /*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
 	schedule_work(&ns->destroyer);
 }
 EXPORT_SYMBOL(free_user_ns);
+
+uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
+{
+	struct user_namespace *tmp;
+
+	if (likely(to == cred->user->user_ns))
+		return uid;
+
+
+	/* Is cred->user the creator of the target user_ns
+	 * or the creator of one of it's parents?
+	 */
+	for ( tmp = to; tmp != &init_user_ns;
+	      tmp = tmp->creator->user_ns ) {
+		if (cred->user == tmp->creator) {
+			return (uid_t)0;
+		}
+	}
+
+	/* No useful relationship so no mapping */
+	return overflowuid;
+}
+
+gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
+{
+	struct user_namespace *tmp;
+
+	if (likely(to == cred->user->user_ns))
+		return gid;
+
+	/* Is cred->user the creator of the target user_ns
+	 * or the creator of one of it's parents?
+	 */
+	for ( tmp = to; tmp != &init_user_ns;
+	      tmp = tmp->creator->user_ns ) {
+		if (cred->user == tmp->creator) {
+			return (gid_t)0;
+		}
+	}
+
+	/* No useful relationship so no mapping */
+	return overflowgid;
+}
-- 
cgit v1.2.3-59-g8ed1b


From 3f551f9436c05a3b5eccdd6e94733df5bb98d2a5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 13 Jun 2010 03:28:59 +0000
Subject: sock: Introduce cred_to_ucred

To keep the coming code clear and to allow both the sock
code and the scm code to share the logic introduce a
fuction to translate from struct cred to struct ucred.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h |  5 +++++
 net/core/sock.c        | 14 ++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 032a19eb61b1..a2fada9becb6 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -24,6 +24,9 @@ struct __kernel_sockaddr_storage {
 #include <linux/types.h>		/* pid_t			*/
 #include <linux/compiler.h>		/* __user			*/
 
+struct pid;
+struct cred;
+
 #define __sockaddr_check_size(size)	\
 	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
 
@@ -309,6 +312,8 @@ struct ucred {
 #define IPX_TYPE	1
 
 #ifdef __KERNEL__
+extern void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred);
+
 extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
 extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
 			       int offset, int len);
diff --git a/net/core/sock.c b/net/core/sock.c
index f9ce0db41cd6..db8335ad7559 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -110,6 +110,7 @@
 #include <linux/tcp.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/user_namespace.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -749,6 +750,19 @@ set_rcvbuf:
 EXPORT_SYMBOL(sock_setsockopt);
 
 
+void cred_to_ucred(struct pid *pid, const struct cred *cred,
+		   struct ucred *ucred)
+{
+	ucred->pid = pid_vnr(pid);
+	ucred->uid = ucred->gid = -1;
+	if (cred) {
+		struct user_namespace *current_ns = current_user_ns();
+
+		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
+		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
+	}
+}
+
 int sock_getsockopt(struct socket *sock, int level, int optname,
 		    char __user *optval, int __user *optlen)
 {
-- 
cgit v1.2.3-59-g8ed1b


From d29c0c5c332131f1151cf33995e2f01299b9234f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 14 Jun 2010 20:21:04 +0000
Subject: udp: Add UFO to NETIF_F_SOFTWARE_GSO

This patch adds UFO to the list of GSO features with a software
fallback.  This allows UFO to be used even if the hardware does
not support it.

In particular, this allows us to test the UFO fallback, as it
has been reported to not work in some cases.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 398f6c28cf8a..8fa5e5aa879a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -865,7 +865,8 @@ struct net_device {
 #define NETIF_F_FSO		(SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)
 
 	/* List of features with software fallbacks. */
-#define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
+#define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
+				 NETIF_F_TSO6 | NETIF_F_UFO)
 
 
 #define NETIF_F_GEN_CSUM	(NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
-- 
cgit v1.2.3-59-g8ed1b


From db3c9cc105ee844f6cd7a1beb9926fb8e9a093ae Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sat, 12 Jun 2010 20:30:21 +0200
Subject: firewire: replace get_features card driver hook

by feature variables in the fw_card struct.  The hook appeared to be an
unnecessary abstraction in the card driver interface.

Cleaner would be to pass those feature flags as arguments to
fw_card_initialize() or fw_card_add(), but the FairnessControl register
is in the SCLK domain and may therefore not be accessible while Link
Power Status is off, i.e. before the card->driver->enable call from
fw_card_add().

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-topology.c    |  3 +--
 drivers/firewire/core-transaction.c |  3 +--
 drivers/firewire/core.h             |  5 -----
 drivers/firewire/ohci.c             | 14 ++------------
 include/linux/firewire.h            |  3 +++
 5 files changed, 7 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index 00a556f3a585..3b9667c37b67 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -543,8 +543,7 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation,
 
 	spin_lock_irqsave(&card->lock, flags);
 
-	card->broadcast_channel_allocated = (card->driver->get_features(card) &
-					     FEATURE_CHANNEL_31_ALLOCATED) != 0;
+	card->broadcast_channel_allocated = card->broadcast_channel_auto_allocated;
 	card->node_id = node_id;
 	/*
 	 * Update node_id before generation to prevent anybody from using
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 5069cfc75b50..62bf30560a3e 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -1129,8 +1129,7 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 		break;
 
 	case CSR_PRIORITY_BUDGET:
-		if (!(card->driver->get_features(card) &
-						FEATURE_PRIORITY_BUDGET))
+		if (!card->priority_budget_implemented)
 			rcode = RCODE_ADDRESS_ERROR;
 		else if (tcode == TCODE_READ_QUADLET_REQUEST)
 			*data = cpu_to_be32(card->driver->
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 3f9e39b60bca..8dc76d8711a5 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -38,9 +38,6 @@ struct fw_packet;
 #define BROADCAST_CHANNEL_INITIAL	(1 << 31 | 31)
 #define BROADCAST_CHANNEL_VALID		(1 << 30)
 
-#define FEATURE_PRIORITY_BUDGET		0x01
-#define FEATURE_CHANNEL_31_ALLOCATED	0x02
-
 #define CSR_STATE_BIT_CMSTR	(1 << 8)
 #define CSR_STATE_BIT_ABDICATE	(1 << 10)
 
@@ -84,8 +81,6 @@ struct fw_card_driver {
 	u32 (*read_csr_reg)(struct fw_card *card, int csr_offset);
 	void (*write_csr_reg)(struct fw_card *card, int csr_offset, u32 value);
 
-	unsigned int (*get_features)(struct fw_card *card);
-
 	struct fw_iso_context *
 	(*allocate_iso_context)(struct fw_card *card,
 				int type, int channel, size_t header_size);
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 2abdb3268a10..09bba9315de9 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -172,7 +172,6 @@ struct fw_ohci {
 	int request_generation;	/* for timestamping incoming requests */
 	unsigned quirks;
 	unsigned int pri_req_max;
-	unsigned int features;
 	u32 bus_time;
 	bool is_root;
 
@@ -1753,15 +1752,14 @@ static int ohci_enable(struct fw_card *card,
 	if (version >= OHCI_VERSION_1_1) {
 		reg_write(ohci, OHCI1394_InitialChannelsAvailableHi,
 			  0xfffffffe);
-		ohci->features |= FEATURE_CHANNEL_31_ALLOCATED;
+		card->broadcast_channel_auto_allocated = true;
 	}
 
 	/* Get implemented bits of the priority arbitration request counter. */
 	reg_write(ohci, OHCI1394_FairnessControl, 0x3f);
 	ohci->pri_req_max = reg_read(ohci, OHCI1394_FairnessControl) & 0x3f;
 	reg_write(ohci, OHCI1394_FairnessControl, 0);
-	if (ohci->pri_req_max != 0)
-		ohci->features |= FEATURE_PRIORITY_BUDGET;
+	card->priority_budget_implemented = ohci->pri_req_max != 0;
 
 	ar_context_run(&ohci->ar_request_ctx);
 	ar_context_run(&ohci->ar_response_ctx);
@@ -2132,13 +2130,6 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value)
 	}
 }
 
-static unsigned int ohci_get_features(struct fw_card *card)
-{
-	struct fw_ohci *ohci = fw_ohci(card);
-
-	return ohci->features;
-}
-
 static void copy_iso_headers(struct iso_context *ctx, void *p)
 {
 	int i = ctx->header_length;
@@ -2578,7 +2569,6 @@ static const struct fw_card_driver ohci_driver = {
 	.enable_phys_dma	= ohci_enable_phys_dma,
 	.read_csr_reg		= ohci_read_csr_reg,
 	.write_csr_reg		= ohci_write_csr_reg,
-	.get_features		= ohci_get_features,
 
 	.allocate_iso_context	= ohci_allocate_iso_context,
 	.free_iso_context	= ohci_free_iso_context,
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 4d22643215ef..5acb5fc19180 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -122,6 +122,9 @@ struct fw_card {
 	bool bm_abdicate; /* value of csr_abdicate before last bus reset */
 	bool csr_abdicate; /* visible in CSR STATE_CLEAR/SET registers */
 
+	bool priority_budget_implemented;	/* controller feature */
+	bool broadcast_channel_auto_allocated;	/* controller feature */
+
 	bool broadcast_channel_allocated;
 	u32 broadcast_channel;
 	__be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
-- 
cgit v1.2.3-59-g8ed1b


From c8a94ded57e9cc2498d401b2f5c856213a3e19fb Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sat, 12 Jun 2010 20:34:50 +0200
Subject: firewire: normalize STATE_CLEAR/SET CSR access interface

Push the maintenance of STATE_CLEAR/SET.abdicate down into the card
driver.  This way, the read/write_csr_reg driver method works uniformly
across all CSR offsets.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-topology.c    |  5 ++---
 drivers/firewire/core-transaction.c | 41 +++++++++++--------------------------
 drivers/firewire/core.h             |  2 +-
 drivers/firewire/ohci.c             | 19 ++++++++++++-----
 include/linux/firewire.h            |  3 +--
 5 files changed, 30 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index 3b9667c37b67..56e908ba43f1 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -524,7 +524,7 @@ static void update_topology_map(struct fw_card *card,
 }
 
 void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation,
-			      int self_id_count, u32 *self_ids)
+			      int self_id_count, u32 *self_ids, bool bm_abdicate)
 {
 	struct fw_node *local_node;
 	unsigned long flags;
@@ -552,8 +552,7 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation,
 	smp_wmb();
 	card->generation = generation;
 	card->reset_jiffies = jiffies;
-	card->bm_abdicate = card->csr_abdicate;
-	card->csr_abdicate = false;
+	card->bm_abdicate = bm_abdicate;
 	fw_schedule_bm_work(card, 0);
 
 	local_node = build_tree(card, self_ids, self_id_count);
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 62bf30560a3e..87d69cddb231 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -982,20 +982,6 @@ static const struct fw_address_region registers_region =
 	{ .start = CSR_REGISTER_BASE,
 	  .end   = CSR_REGISTER_BASE | CSR_CONFIG_ROM, };
 
-static u32 read_state_register(struct fw_card *card)
-{
-	u32 value;
-
-	/* Bit 8 (cmstr): */
-	value = card->driver->read_csr_reg(card, CSR_STATE_CLEAR);
-
-	/* Bit 10 (abdicate): */
-	if (card->csr_abdicate)
-		value |= CSR_STATE_BIT_ABDICATE;
-
-	return value;
-}
-
 static void update_split_timeout(struct fw_card *card)
 {
 	unsigned int cycles;
@@ -1021,29 +1007,25 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 
 	switch (reg) {
 	case CSR_STATE_CLEAR:
-		if (tcode == TCODE_READ_QUADLET_REQUEST) {
-			*data = cpu_to_be32(read_state_register(card));
-		} else if (tcode == TCODE_WRITE_QUADLET_REQUEST) {
+		if (tcode == TCODE_READ_QUADLET_REQUEST)
+			*data = cpu_to_be32(card->driver->
+					read_csr_reg(card, CSR_STATE_CLEAR));
+		else if (tcode == TCODE_WRITE_QUADLET_REQUEST)
 			card->driver->write_csr_reg(card, CSR_STATE_CLEAR,
 						    be32_to_cpu(*data));
-			if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE))
-				card->csr_abdicate = false;
-		} else {
+		else
 			rcode = RCODE_TYPE_ERROR;
-		}
 		break;
 
 	case CSR_STATE_SET:
-		if (tcode == TCODE_READ_QUADLET_REQUEST) {
-			*data = cpu_to_be32(read_state_register(card));
-		} else if (tcode == TCODE_WRITE_QUADLET_REQUEST) {
+		if (tcode == TCODE_READ_QUADLET_REQUEST)
+			*data = cpu_to_be32(card->driver->
+					read_csr_reg(card, CSR_STATE_SET));
+		else if (tcode == TCODE_WRITE_QUADLET_REQUEST)
 			card->driver->write_csr_reg(card, CSR_STATE_SET,
 						    be32_to_cpu(*data));
-			if (*data & cpu_to_be32(CSR_STATE_BIT_ABDICATE))
-				card->csr_abdicate = true;
-		} else {
+		else
 			rcode = RCODE_TYPE_ERROR;
-		}
 		break;
 
 	case CSR_NODE_IDS:
@@ -1063,7 +1045,8 @@ static void handle_registers(struct fw_card *card, struct fw_request *request,
 
 	case CSR_RESET_START:
 		if (tcode == TCODE_WRITE_QUADLET_REQUEST)
-			card->csr_abdicate = false;
+			card->driver->write_csr_reg(card, CSR_STATE_CLEAR,
+						    CSR_STATE_BIT_ABDICATE);
 		else
 			rcode = RCODE_TYPE_ERROR;
 		break;
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 8dc76d8711a5..8280c625170b 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -196,7 +196,7 @@ static inline void fw_node_put(struct fw_node *node)
 }
 
 void fw_core_handle_bus_reset(struct fw_card *card, int node_id,
-			      int generation, int self_id_count, u32 *self_ids);
+	int generation, int self_id_count, u32 *self_ids, bool bm_abdicate);
 void fw_destroy_nodes(struct fw_card *card);
 
 /*
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 09bba9315de9..a55cf0911b72 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -174,6 +174,7 @@ struct fw_ohci {
 	unsigned int pri_req_max;
 	u32 bus_time;
 	bool is_root;
+	bool csr_state_setclear_abdicate;
 
 	/*
 	 * Spinlock for accessing fw_ohci data.  Never call out of
@@ -1529,7 +1530,9 @@ static void bus_reset_tasklet(unsigned long data)
 		    self_id_count, ohci->self_id_buffer);
 
 	fw_core_handle_bus_reset(&ohci->card, ohci->node_id, generation,
-				 self_id_count, ohci->self_id_buffer);
+				 self_id_count, ohci->self_id_buffer,
+				 ohci->csr_state_setclear_abdicate);
+	ohci->csr_state_setclear_abdicate = false;
 }
 
 static irqreturn_t irq_handler(int irq, void *data)
@@ -2032,13 +2035,16 @@ static u32 ohci_read_csr_reg(struct fw_card *card, int csr_offset)
 	switch (csr_offset) {
 	case CSR_STATE_CLEAR:
 	case CSR_STATE_SET:
-		/* the controller driver handles only the cmstr bit */
 		if (ohci->is_root &&
 		    (reg_read(ohci, OHCI1394_LinkControlSet) &
 		     OHCI1394_LinkControl_cycleMaster))
-			return CSR_STATE_BIT_CMSTR;
+			value = CSR_STATE_BIT_CMSTR;
 		else
-			return 0;
+			value = 0;
+		if (ohci->csr_state_setclear_abdicate)
+			value |= CSR_STATE_BIT_ABDICATE;
+
+		return value;
 
 	case CSR_NODE_IDS:
 		return reg_read(ohci, OHCI1394_NodeID) << 16;
@@ -2078,12 +2084,13 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value)
 
 	switch (csr_offset) {
 	case CSR_STATE_CLEAR:
-		/* the controller driver handles only the cmstr bit */
 		if ((value & CSR_STATE_BIT_CMSTR) && ohci->is_root) {
 			reg_write(ohci, OHCI1394_LinkControlClear,
 				  OHCI1394_LinkControl_cycleMaster);
 			flush_writes(ohci);
 		}
+		if (value & CSR_STATE_BIT_ABDICATE)
+			ohci->csr_state_setclear_abdicate = false;
 		break;
 
 	case CSR_STATE_SET:
@@ -2092,6 +2099,8 @@ static void ohci_write_csr_reg(struct fw_card *card, int csr_offset, u32 value)
 				  OHCI1394_LinkControl_cycleMaster);
 			flush_writes(ohci);
 		}
+		if (value & CSR_STATE_BIT_ABDICATE)
+			ohci->csr_state_setclear_abdicate = true;
 		break;
 
 	case CSR_NODE_IDS:
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 5acb5fc19180..5553018d45d6 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -119,8 +119,7 @@ struct fw_card {
 	int bm_retries;
 	int bm_generation;
 	__be32 bm_transaction_data[2];
-	bool bm_abdicate; /* value of csr_abdicate before last bus reset */
-	bool csr_abdicate; /* visible in CSR STATE_CLEAR/SET registers */
+	bool bm_abdicate;
 
 	bool priority_budget_implemented;	/* controller feature */
 	bool broadcast_channel_auto_allocated;	/* controller feature */
-- 
cgit v1.2.3-59-g8ed1b


From 33e553fe2b4a983ef34a57ab1440d8d33397bb12 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 20 Jun 2010 22:50:35 +0200
Subject: firewire: remove an unused function argument

void (*fw_address_callback_t)(..., int speed, ...) is the speed that a
remote node chose to transmit a request to us.  In case of split
transactions, firewire-core will transmit the response at that speed.

Upper layer drivers on the other hand (firewire-net, -sbp2, firedtv, and
userspace drivers) cannot do anything useful with that speed datum,
except log it for debug purposes.  But data that is merely potentially
(not even actually) used for debug purposes does not belong into the API.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-cdev.c            |  3 +--
 drivers/firewire/core-transaction.c     | 14 +++++++-------
 drivers/firewire/net.c                  |  4 ++--
 drivers/firewire/sbp2.c                 |  3 +--
 drivers/media/dvb/firewire/firedtv-fw.c |  4 ++--
 include/linux/firewire.h                |  2 +-
 6 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index ca72cdaa68c9..4e0478d70d4d 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -632,8 +632,7 @@ static void release_request(struct client *client,
 
 static void handle_request(struct fw_card *card, struct fw_request *request,
 			   int tcode, int destination, int source,
-			   int generation, int speed,
-			   unsigned long long offset,
+			   int generation, unsigned long long offset,
 			   void *payload, size_t length, void *callback_data)
 {
 	struct address_handler_resource *handler = callback_data;
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index cb6390fe3686..2f67c8d5ce91 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -802,7 +802,7 @@ static void handle_exclusive_region_request(struct fw_card *card,
 	else
 		handler->address_callback(card, request,
 					  tcode, destination, source,
-					  p->generation, p->speed, offset,
+					  p->generation, offset,
 					  request->data, request->length,
 					  handler->callback_data);
 }
@@ -840,8 +840,8 @@ static void handle_fcp_region_request(struct fw_card *card,
 		if (is_enclosing_handler(handler, offset, request->length))
 			handler->address_callback(card, NULL, tcode,
 						  destination, source,
-						  p->generation, p->speed,
-						  offset, request->data,
+						  p->generation, offset,
+						  request->data,
 						  request->length,
 						  handler->callback_data);
 	}
@@ -951,8 +951,8 @@ static const struct fw_address_region topology_map_region =
 
 static void handle_topology_map(struct fw_card *card, struct fw_request *request,
 		int tcode, int destination, int source, int generation,
-		int speed, unsigned long long offset,
-		void *payload, size_t length, void *callback_data)
+		unsigned long long offset, void *payload, size_t length,
+		void *callback_data)
 {
 	int start;
 
@@ -996,8 +996,8 @@ static void update_split_timeout(struct fw_card *card)
 
 static void handle_registers(struct fw_card *card, struct fw_request *request,
 		int tcode, int destination, int source, int generation,
-		int speed, unsigned long long offset,
-		void *payload, size_t length, void *callback_data)
+		unsigned long long offset, void *payload, size_t length,
+		void *callback_data)
 {
 	int reg = offset & ~CSR_REGISTER_BASE;
 	__be32 *data = payload;
diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index 2d3dc7ded0a9..4bb3fb882f63 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -805,8 +805,8 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len,
 
 static void fwnet_receive_packet(struct fw_card *card, struct fw_request *r,
 		int tcode, int destination, int source, int generation,
-		int speed, unsigned long long offset, void *payload,
-		size_t length, void *callback_data)
+		unsigned long long offset, void *payload, size_t length,
+		void *callback_data)
 {
 	struct fwnet_device *dev = callback_data;
 	int rcode;
diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c
index ae715c82da2e..1931964c4fbf 100644
--- a/drivers/firewire/sbp2.c
+++ b/drivers/firewire/sbp2.c
@@ -410,8 +410,7 @@ static void free_orb(struct kref *kref)
 
 static void sbp2_status_write(struct fw_card *card, struct fw_request *request,
 			      int tcode, int destination, int source,
-			      int generation, int speed,
-			      unsigned long long offset,
+			      int generation, unsigned long long offset,
 			      void *payload, size_t length, void *callback_data)
 {
 	struct sbp2_logical_unit *lu = callback_data;
diff --git a/drivers/media/dvb/firewire/firedtv-fw.c b/drivers/media/dvb/firewire/firedtv-fw.c
index 4253b7ab0097..4dcae63f8cff 100644
--- a/drivers/media/dvb/firewire/firedtv-fw.c
+++ b/drivers/media/dvb/firewire/firedtv-fw.c
@@ -194,8 +194,8 @@ static const struct firedtv_backend backend = {
 
 static void handle_fcp(struct fw_card *card, struct fw_request *request,
 		       int tcode, int destination, int source, int generation,
-		       int speed, unsigned long long offset,
-		       void *payload, size_t length, void *callback_data)
+		       unsigned long long offset, void *payload, size_t length,
+		       void *callback_data)
 {
 	struct firedtv *f, *fdtv = NULL;
 	struct fw_device *device;
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 5553018d45d6..e44b502c8341 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -265,7 +265,7 @@ typedef void (*fw_transaction_callback_t)(struct fw_card *card, int rcode,
 typedef void (*fw_address_callback_t)(struct fw_card *card,
 				      struct fw_request *request,
 				      int tcode, int destination, int source,
-				      int generation, int speed,
+				      int generation,
 				      unsigned long long offset,
 				      void *data, size_t length,
 				      void *callback_data);
-- 
cgit v1.2.3-59-g8ed1b


From 604f45167824e18ad5766e51ecf1d4d65f15118d Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 20 Jun 2010 22:52:55 +0200
Subject: firewire: cdev: freeze FW_CDEV_VERSION due to libraw1394 bug

libraw1394 v2.0.0...v2.0.5 takes FW_CDEV_VERSION from an externally
installed header file and uses it to declare its own implementation
level in FW_CDEV_IOC_GET_INFO.  This is wrong; it should set the real
version for which it was actually written.

If we add features to the kernel ABI that require the kernel to check
a client's implementation level, we can not trust the client version if
it was set from FW_CDEV_VERSION.

Hence freeze FW_CDEV_VERSION at the current value (no damage has been
done yet), clearly document FW_CDEV_VERSION as a dummy version and what
clients are expected to do with fw_cdev_get_info.version, and use a new
defined constant (which is not placed into the exported header file) as
kernel implementation level.

Note, in order to check in client program source code which features are
present in an externally installed linux/firewire-cdev.h, use
preprocessor directives like
  #ifdef FW_CDEV_IOC_ALLOCATE_ISO_RESOURCE
or
  #ifdef FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED
instead of a check of FW_CDEV_VERSION.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-cdev.c  |  7 ++++++-
 include/linux/firewire-cdev.h | 22 ++++++++++++++++------
 2 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 8f8c8eeaf046..0cf86bcbeea2 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -46,6 +46,11 @@
 
 #include "core.h"
 
+/*
+ * ABI version history is documented in linux/firewire-cdev.h.
+ */
+#define FW_CDEV_KERNEL_VERSION 3
+
 struct client {
 	u32 version;
 	struct fw_device *device;
@@ -395,7 +400,7 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg)
 	unsigned long ret = 0;
 
 	client->version = a->version;
-	a->version = FW_CDEV_VERSION;
+	a->version = FW_CDEV_KERNEL_VERSION;
 	a->card = client->device->card->index;
 
 	down_read(&fw_device_rwsem);
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 6ffb24a1f2f2..0d0cc07358af 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -219,7 +219,7 @@ union fw_cdev_event {
 	struct fw_cdev_event_response		response;
 	struct fw_cdev_event_request		request;
 	struct fw_cdev_event_iso_interrupt	iso_interrupt;
-	struct fw_cdev_event_iso_resource	iso_resource;
+	struct fw_cdev_event_iso_resource	iso_resource; /* added in 2.6.30 */
 };
 
 /* available since kernel version 2.6.22 */
@@ -252,22 +252,32 @@ union fw_cdev_event {
 #define FW_CDEV_IOC_GET_CYCLE_TIMER2   _IOWR('#', 0x14, struct fw_cdev_get_cycle_timer2)
 
 /*
- * FW_CDEV_VERSION History
+ * ABI version history
  *  1  (2.6.22)  - initial version
+ *     (2.6.24)  - added %FW_CDEV_IOC_GET_CYCLE_TIMER
  *  2  (2.6.30)  - changed &fw_cdev_event_iso_interrupt.header if
  *                 &fw_cdev_create_iso_context.header_size is 8 or more
+ *               - added %FW_CDEV_IOC_*_ISO_RESOURCE*,
+ *                 %FW_CDEV_IOC_GET_SPEED, %FW_CDEV_IOC_SEND_BROADCAST_REQUEST,
+ *                 %FW_CDEV_IOC_SEND_STREAM_PACKET
  *     (2.6.32)  - added time stamp to xmit &fw_cdev_event_iso_interrupt
  *     (2.6.33)  - IR has always packet-per-buffer semantics now, not one of
  *                 dual-buffer or packet-per-buffer depending on hardware
  *  3  (2.6.34)  - made &fw_cdev_get_cycle_timer reliable
+ *               - added %FW_CDEV_IOC_GET_CYCLE_TIMER2
  */
-#define FW_CDEV_VERSION 3
+#define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */
 
 /**
  * struct fw_cdev_get_info - General purpose information ioctl
- * @version:	The version field is just a running serial number.
- *		We never break backwards compatibility, but may add more
- *		structs and ioctls in later revisions.
+ * @version:	The version field is just a running serial number.  Both an
+ *		input parameter (ABI version implemented by the client) and
+ *		output parameter (ABI version implemented by the kernel).
+ *		A client must not fill in an %FW_CDEV_VERSION defined from an
+ *		included kernel header file but the actual version for which
+ *		the client was implemented.  This is necessary for forward
+ *		compatibility.  We never break backwards compatibility, but
+ *		may add more structs, events, and ioctls in later revisions.
  * @rom_length:	If @rom is non-zero, at most rom_length bytes of configuration
  *		ROM will be copied into that user space address.  In either
  *		case, @rom_length is updated with the actual length of the
-- 
cgit v1.2.3-59-g8ed1b


From e205597d188a9ea69ce43f740a14f07b3f5b996a Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 20 Jun 2010 22:53:55 +0200
Subject: firewire: cdev: fix ABI for FCP and address range mapping, add
 fw_cdev_event_request2

The problem:

A target-like userspace driver, e.g. AV/C target or SBP-2/3 target,
needs to be able to act as responder and requester.  In the latter role,
it needs to send requests to nods from which it received requests.  This
is currently impossible because fw_cdev_event_request lacks information
about sender node ID.
Reported-by: Jay Fenlason <fenlason@redhat.com>

Libffado + libraw1394 + firewire-core is currently unable to drive two
or more audio devices on the same bus.
Reported-by: Arnold Krille <arnold@arnoldarts.de>

This is because libffado requires destination node ID of FCP requests
and sender node ID of FCP responses to match.  It even prohibits
libffado from working with a bus on which libraw1394 opens a /dev/fw* as
default ioctl device that does not correspond with the audio device.
This is because libraw1394 does not receive the sender node ID from the
kernel.

Moreover, fw_cdev_event_request makes it impossible to tell unicast and
broadcast write requests apart.

The fix:

Add a replacement of struct fw_cdev_event_request request, boringly
called struct fw_cdev_event_request2.  The new event will be sent to a
userspace client instead of the old one if the client claims
compatibility with <linux/firewire-cdev.h> ABI version 4 or later.

libraw1394 needs to be extended to make use of the new event, in order
to properly support libffado and other FCP or address range mapping
users who require correct sender node IDs.

Further notes:

While we are at it, change back the range of possible values of
fw_cdev_event_request.tcode to 0x0...0xb like in ABI version <= 3.
The preceding change "firewire: expose extended tcode of incoming lock
requests to (userspace) drivers" expanded it to 0x0...0x17 which could
catch sloppily coded clients by surprise.  The extended range of codes
is only used in the new fw_cdev_event_request2.tcode.

Jay and I also suggested an alternative approach to fix the ABI for
incoming requests:  Add an FW_CDEV_IOC_GET_REQUEST_INFO ioctl which can
be called after reception of an fw_cdev_event_request, before issuing of
the closing FW_CDEV_IOC_SEND_RESPONSE ioctl.  The new ioctl would reveal
the vital information about a request that fw_cdev_event_request lacks.
Jay showed an implementation of this approach.

The former event approach adds 27 LOC of rather trivial code to
core-cdev.c, the ioctl approach 34 LOC, some of which is nontrivial.
The ioctl approach would certainly also add more LOC to userspace
programs which require the expanded information on inbound requests.
This approach is probably only on the lighter-weight side in case of
clients that want to be compatible with kernels that lack the new
capability, like libraw1394.  However, the code to be added to such
libraw1394-like clients in case of the event approach is a straight-
forward additional switch () case in its event handler.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-cdev.c  | 45 ++++++++++++++++++++-----
 include/linux/firewire-cdev.h | 76 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 110 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 0cf86bcbeea2..9b8df2039155 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -49,7 +49,8 @@
 /*
  * ABI version history is documented in linux/firewire-cdev.h.
  */
-#define FW_CDEV_KERNEL_VERSION 3
+#define FW_CDEV_KERNEL_VERSION		4
+#define FW_CDEV_VERSION_EVENT_REQUEST2	4
 
 struct client {
 	u32 version;
@@ -176,7 +177,10 @@ struct outbound_transaction_event {
 
 struct inbound_transaction_event {
 	struct event event;
-	struct fw_cdev_event_request request;
+	union {
+		struct fw_cdev_event_request request;
+		struct fw_cdev_event_request2 request2;
+	} req;
 };
 
 struct iso_interrupt_event {
@@ -645,6 +649,7 @@ static void handle_request(struct fw_card *card, struct fw_request *request,
 	struct address_handler_resource *handler = callback_data;
 	struct inbound_transaction_resource *r;
 	struct inbound_transaction_event *e;
+	size_t event_size0;
 	void *fcp_frame = NULL;
 	int ret;
 
@@ -678,15 +683,37 @@ static void handle_request(struct fw_card *card, struct fw_request *request,
 	if (ret < 0)
 		goto failed;
 
-	e->request.type    = FW_CDEV_EVENT_REQUEST;
-	e->request.tcode   = tcode;
-	e->request.offset  = offset;
-	e->request.length  = length;
-	e->request.handle  = r->resource.handle;
-	e->request.closure = handler->closure;
+	if (handler->client->version < FW_CDEV_VERSION_EVENT_REQUEST2) {
+		struct fw_cdev_event_request *req = &e->req.request;
+
+		if (tcode & 0x10)
+			tcode = TCODE_LOCK_REQUEST;
+
+		req->type	= FW_CDEV_EVENT_REQUEST;
+		req->tcode	= tcode;
+		req->offset	= offset;
+		req->length	= length;
+		req->handle	= r->resource.handle;
+		req->closure	= handler->closure;
+		event_size0	= sizeof(*req);
+	} else {
+		struct fw_cdev_event_request2 *req = &e->req.request2;
+
+		req->type	= FW_CDEV_EVENT_REQUEST2;
+		req->tcode	= tcode;
+		req->offset	= offset;
+		req->source_node_id = source;
+		req->destination_node_id = destination;
+		req->card	= card->index;
+		req->generation	= generation;
+		req->length	= length;
+		req->handle	= r->resource.handle;
+		req->closure	= handler->closure;
+		event_size0	= sizeof(*req);
+	}
 
 	queue_event(handler->client, &e->event,
-		    &e->request, sizeof(e->request), r->data, length);
+		    &e->req, event_size0, r->data, length);
 	return;
 
  failed:
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 0d0cc07358af..52c7ffe934ad 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -32,6 +32,9 @@
 #define FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED	0x04
 #define FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED	0x05
 
+/* available since kernel version 2.6.36 */
+#define FW_CDEV_EVENT_REQUEST2			0x06
+
 /**
  * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types
  * @closure:	For arbitrary use by userspace
@@ -98,11 +101,46 @@ struct fw_cdev_event_response {
 };
 
 /**
- * struct fw_cdev_event_request - Sent on incoming request to an address region
+ * struct fw_cdev_event_request - Old version of &fw_cdev_event_request2
  * @closure:	See &fw_cdev_event_common; set by %FW_CDEV_IOC_ALLOCATE ioctl
  * @type:	See &fw_cdev_event_common; always %FW_CDEV_EVENT_REQUEST
+ * @tcode:	See &fw_cdev_event_request2
+ * @offset:	See &fw_cdev_event_request2
+ * @handle:	See &fw_cdev_event_request2
+ * @length:	See &fw_cdev_event_request2
+ * @data:	See &fw_cdev_event_request2
+ *
+ * This event is sent instead of &fw_cdev_event_request2 if the kernel or
+ * the client implements ABI version <= 3.
+ *
+ * Unlike &fw_cdev_event_request2, the sender identity cannot be established,
+ * broadcast write requests cannot be distinguished from unicast writes, and
+ * @tcode of lock requests is %TCODE_LOCK_REQUEST.
+ *
+ * Requests to the FCP_REQUEST or FCP_RESPONSE register are responded to as
+ * with &fw_cdev_event_request2, except in kernel 2.6.32 and older which send
+ * the response packet of the client's %FW_CDEV_IOC_SEND_RESPONSE ioctl.
+ */
+struct fw_cdev_event_request {
+	__u64 closure;
+	__u32 type;
+	__u32 tcode;
+	__u64 offset;
+	__u32 handle;
+	__u32 length;
+	__u32 data[0];
+};
+
+/**
+ * struct fw_cdev_event_request2 - Sent on incoming request to an address region
+ * @closure:	See &fw_cdev_event_common; set by %FW_CDEV_IOC_ALLOCATE ioctl
+ * @type:	See &fw_cdev_event_common; always %FW_CDEV_EVENT_REQUEST2
  * @tcode:	Transaction code of the incoming request
  * @offset:	The offset into the 48-bit per-node address space
+ * @source_node_id: Sender node ID
+ * @destination_node_id: Destination node ID
+ * @card:	The index of the card from which the request came
+ * @generation:	Bus generation in which the request is valid
  * @handle:	Reference to the kernel-side pending request
  * @length:	Data length, i.e. the request's payload size in bytes
  * @data:	Incoming data, if any
@@ -115,12 +153,42 @@ struct fw_cdev_event_response {
  *
  * The payload data for requests carrying data (write and lock requests)
  * follows immediately and can be accessed through the @data field.
+ *
+ * Unlike &fw_cdev_event_request, @tcode of lock requests is one of the
+ * firewire-core specific %TCODE_LOCK_MASK_SWAP...%TCODE_LOCK_VENDOR_DEPENDENT,
+ * i.e. encodes the extended transaction code.
+ *
+ * @card may differ from &fw_cdev_get_info.card because requests are received
+ * from all cards of the Linux host.  @source_node_id, @destination_node_id, and
+ * @generation pertain to that card.  Destination node ID and bus generation may
+ * therefore differ from the corresponding fields of the last
+ * &fw_cdev_event_bus_reset.
+ *
+ * @destination_node_id may also differ from the current node ID because of a
+ * non-local bus ID part or in case of a broadcast write request.  Note, a
+ * client must call an %FW_CDEV_IOC_SEND_RESPONSE ioctl even in case of a
+ * broadcast write request; the kernel will then release the kernel-side pending
+ * request but will not actually send a response packet.
+ *
+ * In case of a write request to FCP_REQUEST or FCP_RESPONSE, the kernel already
+ * sent a write response immediately after the request was received; in this
+ * case the client must still call an %FW_CDEV_IOC_SEND_RESPONSE ioctl to
+ * release the kernel-side pending request, though another response won't be
+ * sent.
+ *
+ * If the client subsequently needs to initiate requests to the sender node of
+ * an &fw_cdev_event_request2, it needs to use a device file with matching
+ * card index, node ID, and generation for outbound requests.
  */
-struct fw_cdev_event_request {
+struct fw_cdev_event_request2 {
 	__u64 closure;
 	__u32 type;
 	__u32 tcode;
 	__u64 offset;
+	__u32 source_node_id;
+	__u32 destination_node_id;
+	__u32 card;
+	__u32 generation;
 	__u32 handle;
 	__u32 length;
 	__u32 data[0];
@@ -200,6 +268,7 @@ struct fw_cdev_event_iso_resource {
  * @bus_reset:     Valid if @common.type == %FW_CDEV_EVENT_BUS_RESET
  * @response:      Valid if @common.type == %FW_CDEV_EVENT_RESPONSE
  * @request:       Valid if @common.type == %FW_CDEV_EVENT_REQUEST
+ * @request2:      Valid if @common.type == %FW_CDEV_EVENT_REQUEST2
  * @iso_interrupt: Valid if @common.type == %FW_CDEV_EVENT_ISO_INTERRUPT
  * @iso_resource:  Valid if @common.type ==
  *				%FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or
@@ -218,6 +287,7 @@ union fw_cdev_event {
 	struct fw_cdev_event_bus_reset		bus_reset;
 	struct fw_cdev_event_response		response;
 	struct fw_cdev_event_request		request;
+	struct fw_cdev_event_request2		request2;     /* added in 2.6.36 */
 	struct fw_cdev_event_iso_interrupt	iso_interrupt;
 	struct fw_cdev_event_iso_resource	iso_resource; /* added in 2.6.30 */
 };
@@ -263,8 +333,10 @@ union fw_cdev_event {
  *     (2.6.32)  - added time stamp to xmit &fw_cdev_event_iso_interrupt
  *     (2.6.33)  - IR has always packet-per-buffer semantics now, not one of
  *                 dual-buffer or packet-per-buffer depending on hardware
+ *               - shared use and auto-response for FCP registers
  *  3  (2.6.34)  - made &fw_cdev_get_cycle_timer reliable
  *               - added %FW_CDEV_IOC_GET_CYCLE_TIMER2
+ *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2
  */
 #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */
 
-- 
cgit v1.2.3-59-g8ed1b


From 3b2b65d68fc87b02ac393a031a4ebb3de84a8218 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 20 Jun 2010 22:54:22 +0200
Subject: firewire: cdev: extend fw_cdev_event_iso_interrupt documentation

Add information regarding the 2.6.32 update to the xmit variant of
fw_cdev_event_iso_interrupt.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 include/linux/firewire-cdev.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 52c7ffe934ad..8b9b27373219 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -204,10 +204,21 @@ struct fw_cdev_event_request2 {
  * @header:	Stripped headers, if any
  *
  * This event is sent when the controller has completed an &fw_cdev_iso_packet
- * with the %FW_CDEV_ISO_INTERRUPT bit set.  In the receive case, the headers
- * stripped of all packets up until and including the interrupt packet are
- * returned in the @header field.  The amount of header data per packet is as
- * specified at iso context creation by &fw_cdev_create_iso_context.header_size.
+ * with the %FW_CDEV_ISO_INTERRUPT bit set.
+ *
+ * Isochronous transmit events:
+ *
+ * In version 1 of the ABI, &header_length is 0.  In version 3 and some
+ * implementations of version 2 of the ABI, &header_length is a multiple of 4
+ * and &header contains timestamps of all packets up until the interrupt packet.
+ * The format of the timestamps is as described below for isochronous reception.
+ *
+ * Isochronous receive events:
+ *
+ * The headers stripped of all packets up until and including the interrupt
+ * packet are returned in the @header field.  The amount of header data per
+ * packet is as specified at iso context creation by
+ * &fw_cdev_create_iso_context.header_size.
  *
  * In version 1 of this ABI, header data consisted of the 1394 isochronous
  * packet header, followed by quadlets from the packet payload if
-- 
cgit v1.2.3-59-g8ed1b


From 69ad78208ecf4c392f3d323ed050423847c24104 Mon Sep 17 00:00:00 2001
From: Sjur Braendeland <sjur.brandeland@stericsson.com>
Date: Thu, 17 Jun 2010 06:55:41 +0000
Subject: caif: Add debug connection type for CAIF.

Added new CAIF protocol type CAIFPROTO_DEBUG for accessing
CAIF debug on the ST Ericsson modems.

There are two debug servers on the modem, one for radio related
debug (CAIF_RADIO_DEBUG_SERVICE) and the other for
communication/application related debug (CAIF_COM_DEBUG_SERVICE).

The debug connection can contain trace debug printouts or
interactive debug used for debugging and test.

Debug connections can be of type STREAM or SEQPACKET.

Signed-off-by: Sjur Braendeland <sjur.brandeland@stericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/caif/caif_socket.h | 34 ++++++++++++++++++++++++++++++++++
 net/caif/caif_config_util.c      |  5 +++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/caif/caif_socket.h b/include/linux/caif/caif_socket.h
index 2a61eb1beb85..d9cb19b7cff7 100644
--- a/include/linux/caif/caif_socket.h
+++ b/include/linux/caif/caif_socket.h
@@ -62,6 +62,7 @@ enum caif_channel_priority {
  * @CAIFPROTO_DATAGRAM_LOOP:	Datagram loopback channel, used for testing.
  * @CAIFPROTO_UTIL:		Utility (Psock) channel.
  * @CAIFPROTO_RFM:		Remote File Manager
+ * @CAIFPROTO_DEBUG:		Debug link
  *
  * This enum defines the CAIF Channel type to be used. This defines
  * the service to connect to on the modem.
@@ -72,6 +73,7 @@ enum caif_protocol_type {
 	CAIFPROTO_DATAGRAM_LOOP,
 	CAIFPROTO_UTIL,
 	CAIFPROTO_RFM,
+	CAIFPROTO_DEBUG,
 	_CAIFPROTO_MAX
 };
 #define	CAIFPROTO_MAX _CAIFPROTO_MAX
@@ -83,6 +85,28 @@ enum caif_protocol_type {
 enum caif_at_type {
 	CAIF_ATTYPE_PLAIN = 2
 };
+ /**
+ * enum caif_debug_type - Content selection for debug connection
+ * @CAIF_DEBUG_TRACE_INTERACTIVE: Connection will contain
+ *				both trace and interactive debug.
+ * @CAIF_DEBUG_TRACE:		Connection contains trace only.
+ * @CAIF_DEBUG_INTERACTIVE:	Connection to interactive debug.
+ */
+enum caif_debug_type {
+	CAIF_DEBUG_TRACE_INTERACTIVE = 0,
+	CAIF_DEBUG_TRACE,
+	CAIF_DEBUG_INTERACTIVE,
+};
+
+/**
+ * enum caif_debug_service - Debug Service Endpoint
+ * @CAIF_RADIO_DEBUG_SERVICE:	Debug service on the Radio sub-system
+ * @CAIF_APP_DEBUG_SERVICE:	Debug for the applications sub-system
+ */
+enum caif_debug_service {
+	CAIF_RADIO_DEBUG_SERVICE = 1,
+	CAIF_APP_DEBUG_SERVICE
+};
 
 /**
  * struct sockaddr_caif - the sockaddr structure for CAIF sockets.
@@ -109,6 +133,12 @@ enum caif_at_type {
  *
  * @u.rfm.volume:            Volume to mount.
  *
+ * @u.dbg:		      Applies when family = CAIFPROTO_DEBUG.
+ *
+ * @u.dbg.type:			     Type of debug connection to set up
+ *			      (caif_debug_type).
+ *
+ * @u.dbg.service:	      Service sub-system to connect (caif_debug_service
  * Description:
  * This structure holds the connect parameters used for setting up a
  * CAIF Channel. It defines the service to connect to on the modem.
@@ -130,6 +160,10 @@ struct sockaddr_caif {
 			__u32 connection_id;
 			char	  volume[16];
 		} rfm;				/* CAIFPROTO_RFM */
+		struct {
+			__u8  type;		/* type:enum caif_debug_type */
+			__u8  service;		/* service:caif_debug_service */
+		} dbg;				/* CAIFPROTO_DEBUG */
 	} u;
 };
 
diff --git a/net/caif/caif_config_util.c b/net/caif/caif_config_util.c
index 6f36580366f0..76ae68303d3a 100644
--- a/net/caif/caif_config_util.c
+++ b/net/caif/caif_config_util.c
@@ -79,6 +79,11 @@ int connect_req_to_link_param(struct cfcnfg *cnfg,
 		memcpy(l->u.utility.params, s->param.data,
 		       l->u.utility.paramlen);
 
+		break;
+	case CAIFPROTO_DEBUG:
+		l->linktype = CFCTRL_SRV_DBG;
+		l->endpoint = s->sockaddr.u.dbg.service;
+		l->chtype = s->sockaddr.u.dbg.type;
 		break;
 	default:
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 26ec037f9841e49cc5c615deb8e1e73e5beab2ca Mon Sep 17 00:00:00 2001
From: Nick Chalk <nick@loadbalancer.org>
Date: Tue, 22 Jun 2010 08:07:01 +0200
Subject: IPVS: one-packet scheduling

Allow one-packet scheduling for UDP connections. When the fwmark-based or
normal virtual service is marked with '-o' or '--ops' options all
connections are created only to schedule one packet. Useful to schedule UDP
packets from same client port to different real servers. Recommended with
RR or WRR schedulers (the connections are not visible with ipvsadm -L).

Signed-off-by: Nick Chalk <nick@loadbalancer.org>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/ip_vs.h           |  2 ++
 net/netfilter/ipvs/ip_vs_conn.c | 10 +++++++---
 net/netfilter/ipvs/ip_vs_core.c | 20 ++++++++++++++++----
 net/netfilter/ipvs/ip_vs_ctl.c  | 10 ++++++----
 4 files changed, 31 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index dfc170362842..9708de265bb1 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -19,6 +19,7 @@
  */
 #define IP_VS_SVC_F_PERSISTENT	0x0001		/* persistent port */
 #define IP_VS_SVC_F_HASHED	0x0002		/* hashed entry */
+#define IP_VS_SVC_F_ONEPACKET	0x0004		/* one-packet scheduling */
 
 /*
  *      Destination Server Flags
@@ -85,6 +86,7 @@
 #define IP_VS_CONN_F_SEQ_MASK	0x0600		/* in/out sequence mask */
 #define IP_VS_CONN_F_NO_CPORT	0x0800		/* no client port set yet */
 #define IP_VS_CONN_F_TEMPLATE	0x1000		/* template, not connection */
+#define IP_VS_CONN_F_ONE_PACKET	0x2000		/* forward only one packet */
 
 #define IP_VS_SCHEDNAME_MAXLEN	16
 #define IP_VS_IFNAME_MAXLEN	16
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index d8f7e8ef67b4..717e6233d50f 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -158,6 +158,9 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	unsigned hash;
 	int ret;
 
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		return 0;
+
 	/* Hash by protocol, client address and port */
 	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
 
@@ -355,8 +358,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
  */
 void ip_vs_conn_put(struct ip_vs_conn *cp)
 {
-	/* reset it expire in its timeout */
-	mod_timer(&cp->timer, jiffies+cp->timeout);
+	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
+		0 : cp->timeout;
+	mod_timer(&cp->timer, jiffies+t);
 
 	__ip_vs_conn_put(cp);
 }
@@ -649,7 +653,7 @@ static void ip_vs_conn_expire(unsigned long data)
 	/*
 	 *	unhash it if it is hashed in the conn table
 	 */
-	if (!ip_vs_conn_unhash(cp))
+	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
 		goto expire_later;
 
 	/*
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 1cd6e3fd058b..50907d8472a3 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -194,6 +194,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
 	__be16  dport;			/* destination port to forward */
+	__be16  flags;
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
 
@@ -340,6 +341,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		dport = ports[1];
 	}
 
+	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+		 && iph.protocol == IPPROTO_UDP)?
+		IP_VS_CONN_F_ONE_PACKET : 0;
+
 	/*
 	 *    Create a new connection according to the template
 	 */
@@ -347,7 +352,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 			    &iph.saddr, ports[0],
 			    &iph.daddr, ports[1],
 			    &dest->addr, dport,
-			    0,
+			    flags,
 			    dest);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
@@ -377,7 +382,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
-	__be16 _ports[2], *pptr;
+	__be16 _ports[2], *pptr, flags;
 
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -407,6 +412,10 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		return NULL;
 	}
 
+	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+		 && iph.protocol == IPPROTO_UDP)?
+		IP_VS_CONN_F_ONE_PACKET : 0;
+
 	/*
 	 *    Create a connection entry.
 	 */
@@ -414,7 +423,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 			    &iph.saddr, pptr[0],
 			    &iph.daddr, pptr[1],
 			    &dest->addr, dest->port ? dest->port : pptr[1],
-			    0,
+			    flags,
 			    dest);
 	if (cp == NULL)
 		return NULL;
@@ -464,6 +473,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
+		__u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+				iph.protocol == IPPROTO_UDP)?
+				IP_VS_CONN_F_ONE_PACKET : 0;
 		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 
 		ip_vs_service_put(svc);
@@ -474,7 +486,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 				    &iph.saddr, pptr[0],
 				    &iph.daddr, pptr[1],
 				    &daddr, 0,
-				    IP_VS_CONN_F_BYPASS,
+				    IP_VS_CONN_F_BYPASS | flags,
 				    NULL);
 		if (cp == NULL)
 			return NF_DROP;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 36dc1d88c2fa..0f0c079c422a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1864,14 +1864,16 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 					   svc->scheduler->name);
 			else
 #endif
-				seq_printf(seq, "%s  %08X:%04X %s ",
+				seq_printf(seq, "%s  %08X:%04X %s %s ",
 					   ip_vs_proto_name(svc->protocol),
 					   ntohl(svc->addr.ip),
 					   ntohs(svc->port),
-					   svc->scheduler->name);
+					   svc->scheduler->name,
+					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
 		} else {
-			seq_printf(seq, "FWM  %08X %s ",
-				   svc->fwmark, svc->scheduler->name);
+			seq_printf(seq, "FWM  %08X %s %s",
+				   svc->fwmark, svc->scheduler->name,
+				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
 		}
 
 		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
-- 
cgit v1.2.3-59-g8ed1b


From 97dc135947181a6670949a480da56c3ebf8d3715 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 16 Jun 2010 09:52:26 -0400
Subject: NFSv41: Clean up the NFSv4.1 minor version specific operations

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 14 ++++++--------
 fs/nfs/nfs4_fs.h          | 11 +++++++++++
 fs/nfs/nfs4proc.c         | 21 ++++++++++++++++++++-
 include/linux/nfs_fs_sb.h |  7 ++-----
 4 files changed, 39 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d25b5257b7a1..1df708fd4205 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	clp->cl_boot_time = CURRENT_TIME;
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_minorversion = cl_init->minorversion;
+	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 #endif
 	cred = rpc_lookup_machine_cred();
 	if (!IS_ERR(cred))
@@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 		clp->cl_session = NULL;
 	}
 
-	clp->cl_call_sync = _nfs4_call_sync;
+	clp->cl_mvops = nfs_v4_minor_ops[0];
 #endif /* CONFIG_NFS_V4_1 */
 }
 
@@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 static void nfs4_destroy_callback(struct nfs_client *clp)
 {
 	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
-		nfs_callback_down(clp->cl_minorversion);
+		nfs_callback_down(clp->cl_mvops->minor_version);
 }
 
 static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -1126,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
 				return error;
 		}
 
-		error = nfs_callback_up(clp->cl_minorversion,
+		error = nfs_callback_up(clp->cl_mvops->minor_version,
 					clp->cl_rpcclient->cl_xprt);
 		if (error < 0) {
 			dprintk("%s: failed to start callback. Error = %d\n",
@@ -1143,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp)
  */
 static int nfs4_init_client_minor_version(struct nfs_client *clp)
 {
-	clp->cl_call_sync = _nfs4_call_sync;
-
 #if defined(CONFIG_NFS_V4_1)
-	if (clp->cl_minorversion) {
+	if (clp->cl_mvops->minor_version) {
 		struct nfs4_session *session = NULL;
 		/*
 		 * Create the session and mark it expired.
@@ -1158,7 +1157,6 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 			return -ENOMEM;
 
 		clp->cl_session = session;
-		clp->cl_call_sync = _nfs4_call_sync_session;
 	}
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -1454,7 +1452,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				data->authflavor,
 				parent_server->client->cl_xprt->prot,
 				parent_server->client->cl_timeout,
-				parent_client->cl_minorversion);
+				parent_client->cl_mvops->minor_version);
 	if (error < 0)
 		goto error;
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index bb1e95530699..5b01705e30f9 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -52,6 +52,16 @@ enum nfs4_session_state {
 	NFS4_SESSION_DRAINING,
 };
 
+struct nfs4_minor_version_ops {
+	u32	minor_version;
+
+	int	(*call_sync)(struct nfs_server *server,
+			struct rpc_message *msg,
+			struct nfs4_sequence_args *args,
+			struct nfs4_sequence_res *res,
+			int cache_reply);
+};
+
 /*
  * struct rpc_sequence ensures that RPC calls are sent in the exact
  * order that they appear on the list.
@@ -260,6 +270,7 @@ static inline int nfs4_init_session(struct nfs_server *server)
 }
 #endif /* CONFIG_NFS_V4_1 */
 
+extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
 
 extern const u32 nfs4_fattr_bitmap[2];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fc972c6f1ce9..a938daf333da 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -667,7 +667,7 @@ int _nfs4_call_sync(struct nfs_server *server,
 }
 
 #define nfs4_call_sync(server, msg, args, res, cache_reply) \
-	(server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \
+	(server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \
 			&(res)->seq_res, (cache_reply))
 
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
@@ -5353,6 +5353,18 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 };
 #endif
 
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
+	.minor_version = 0,
+	.call_sync = _nfs4_call_sync,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
+	.minor_version = 1,
+	.call_sync = _nfs4_call_sync_session,
+};
+#endif
+
 /*
  * Per minor version reboot and network partition recovery ops
  */
@@ -5378,6 +5390,13 @@ struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
 #endif
 };
 
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
+	[0] = &nfs_v4_0_minor_ops,
+#if defined(CONFIG_NFS_V4_1)
+	[1] = &nfs_v4_1_minor_ops,
+#endif
+};
+
 static const struct inode_operations nfs4_file_inode_operations = {
 	.permission	= nfs_permission,
 	.getattr	= nfs_getattr,
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index d6e10a4c06e5..c82ee7cd6288 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -15,6 +15,7 @@ struct nlm_host;
 struct nfs4_sequence_args;
 struct nfs4_sequence_res;
 struct nfs_server;
+struct nfs4_minor_version_ops;
 
 /*
  * The nfs_client identifies our client state to the server.
@@ -70,11 +71,7 @@ struct nfs_client {
 	 */
 	char			cl_ipaddr[48];
 	unsigned char		cl_id_uniquifier;
-	int		     (* cl_call_sync)(struct nfs_server *server,
-					      struct rpc_message *msg,
-					      struct nfs4_sequence_args *args,
-					      struct nfs4_sequence_res *res,
-					      int cache_reply);
+	const struct nfs4_minor_version_ops *cl_mvops;
 #endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_V4_1
-- 
cgit v1.2.3-59-g8ed1b


From d77d76ffb638bd013782138cca6d8f4918c5afd6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 16 Jun 2010 09:52:27 -0400
Subject: NFSv41: Clean up exclusive create

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 17 ++++++-----------
 include/linux/nfs_xdr.h |  6 ++++--
 2 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d1ab0c36e939..5d87563d0c1a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -744,19 +744,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
-	if (flags & O_EXCL) {
-		if (nfs4_has_persistent_session(server->nfs_client)) {
-			/* GUARDED */
-			p->o_arg.u.attrs = &p->attrs;
-			memcpy(&p->attrs, attrs, sizeof(p->attrs));
-		} else { /* EXCLUSIVE4_1 */
-			u32 *s = (u32 *) p->o_arg.u.verifier.data;
-			s[0] = jiffies;
-			s[1] = current->pid;
-		}
-	} else if (flags & O_CREAT) {
+	if (flags & O_CREAT) {
+		u32 *s;
+
 		p->o_arg.u.attrs = &p->attrs;
 		memcpy(&p->attrs, attrs, sizeof(p->attrs));
+		s = (u32 *) p->o_arg.u.verifier.data;
+		s[0] = jiffies;
+		s[1] = current->pid;
 	}
 	p->c_arg.fh = &p->o_res.fh;
 	p->c_arg.stateid = &p->o_res.stateid;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 51914d7d6cc4..a319cb926abf 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -196,8 +196,10 @@ struct nfs_openargs {
 	__u64                   clientid;
 	__u64                   id;
 	union {
-		struct iattr *  attrs;    /* UNCHECKED, GUARDED */
-		nfs4_verifier   verifier; /* EXCLUSIVE */
+		struct {
+			struct iattr *  attrs;    /* UNCHECKED, GUARDED */
+			nfs4_verifier   verifier; /* EXCLUSIVE */
+		};
 		nfs4_stateid	delegation;		/* CLAIM_DELEGATE_CUR */
 		fmode_t		delegation_type;	/* CLAIM_PREVIOUS */
 	} u;
-- 
cgit v1.2.3-59-g8ed1b


From 69da9bcb98ccbfb5d5f751bc13418f1307332925 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Wed, 16 Jun 2010 17:57:28 +0200
Subject: ALSA: usb-audio: unify UAC macros and struct names

Get rid of the last occurances of _v1 suffixes, and move the version
number right after the "uac" string. Now things are consitent again.

Sorry for the forth and back, but it just looks much nicer this way.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 drivers/usb/gadget/f_audio.c |  6 +++---
 drivers/usb/gadget/gmidi.c   |  2 +-
 include/linux/usb/audio-v2.h |  2 +-
 include/linux/usb/audio.h    | 12 ++++++------
 sound/usb/card.c             |  2 +-
 sound/usb/endpoint.c         |  4 ++--
 sound/usb/mixer.c            | 14 +++++++-------
 7 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/f_audio.c b/drivers/usb/gadget/f_audio.c
index b91115f84b13..1f48ceb55a77 100644
--- a/drivers/usb/gadget/f_audio.c
+++ b/drivers/usb/gadget/f_audio.c
@@ -61,7 +61,7 @@ DECLARE_UAC_AC_HEADER_DESCRIPTOR(2);
 #define UAC_DT_TOTAL_LENGTH (UAC_DT_AC_HEADER_LENGTH + UAC_DT_INPUT_TERMINAL_SIZE \
 	+ UAC_DT_OUTPUT_TERMINAL_SIZE + UAC_DT_FEATURE_UNIT_SIZE(0))
 /* B.3.2  Class-Specific AC Interface Descriptor */
-static struct uac_ac_header_descriptor_v1_2 ac_header_desc = {
+static struct uac1_ac_header_descriptor_2 ac_header_desc = {
 	.bLength =		UAC_DT_AC_HEADER_LENGTH,
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
 	.bDescriptorSubtype =	UAC_HEADER,
@@ -125,7 +125,7 @@ static struct usb_audio_control_selector feature_unit = {
 };
 
 #define OUTPUT_TERMINAL_ID	3
-static struct uac_output_terminal_descriptor_v1 output_terminal_desc = {
+static struct uac1_output_terminal_descriptor output_terminal_desc = {
 	.bLength		= UAC_DT_OUTPUT_TERMINAL_SIZE,
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
 	.bDescriptorSubtype	= UAC_OUTPUT_TERMINAL,
@@ -155,7 +155,7 @@ static struct usb_interface_descriptor as_interface_alt_1_desc = {
 };
 
 /* B.4.2  Class-Specific AS Interface Descriptor */
-static struct uac_as_header_descriptor_v1 as_header_desc = {
+static struct uac1_as_header_descriptor as_header_desc = {
 	.bLength =		UAC_DT_AS_HEADER_SIZE,
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
 	.bDescriptorSubtype =	UAC_AS_GENERAL,
diff --git a/drivers/usb/gadget/gmidi.c b/drivers/usb/gadget/gmidi.c
index 2b56ce621852..b7bf88019b06 100644
--- a/drivers/usb/gadget/gmidi.c
+++ b/drivers/usb/gadget/gmidi.c
@@ -238,7 +238,7 @@ static const struct usb_interface_descriptor ac_interface_desc = {
 };
 
 /* B.3.2  Class-Specific AC Interface Descriptor */
-static const struct uac_ac_header_descriptor_v1_1 ac_header_desc = {
+static const struct uac1_ac_header_descriptor_1 ac_header_desc = {
 	.bLength =		UAC_DT_AC_HEADER_SIZE(1),
 	.bDescriptorType =	USB_DT_CS_INTERFACE,
 	.bDescriptorSubtype =	USB_MS_HEADER,
diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h
index 383b94ba8c20..716aebe339e8 100644
--- a/include/linux/usb/audio-v2.h
+++ b/include/linux/usb/audio-v2.h
@@ -121,7 +121,7 @@ struct uac2_feature_unit_descriptor {
 
 /* 4.9.2 Class-Specific AS Interface Descriptor */
 
-struct uac_as_header_descriptor_v2 {
+struct uac2_as_header_descriptor {
 	__u8 bLength;
 	__u8 bDescriptorType;
 	__u8 bDescriptorSubtype;
diff --git a/include/linux/usb/audio.h b/include/linux/usb/audio.h
index c51200c715e5..a54b8255d75f 100644
--- a/include/linux/usb/audio.h
+++ b/include/linux/usb/audio.h
@@ -39,8 +39,8 @@
 #define UAC_MIXER_UNIT			0x04
 #define UAC_SELECTOR_UNIT		0x05
 #define UAC_FEATURE_UNIT		0x06
-#define UAC_PROCESSING_UNIT_V1		0x07
-#define UAC_EXTENSION_UNIT_V1		0x08
+#define UAC1_PROCESSING_UNIT		0x07
+#define UAC1_EXTENSION_UNIT		0x08
 
 /* A.6 Audio Class-Specific AS Interface Descriptor Subtypes */
 #define UAC_AS_GENERAL			0x01
@@ -151,7 +151,7 @@
 
 /* Terminal Control Selectors */
 /* 4.3.2  Class-Specific AC Interface Descriptor */
-struct uac_ac_header_descriptor_v1 {
+struct uac1_ac_header_descriptor {
 	__u8  bLength;			/* 8 + n */
 	__u8  bDescriptorType;		/* USB_DT_CS_INTERFACE */
 	__u8  bDescriptorSubtype;	/* UAC_MS_HEADER */
@@ -165,7 +165,7 @@ struct uac_ac_header_descriptor_v1 {
 
 /* As above, but more useful for defining your own descriptors: */
 #define DECLARE_UAC_AC_HEADER_DESCRIPTOR(n)			\
-struct uac_ac_header_descriptor_v1_##n {			\
+struct uac1_ac_header_descriptor_##n {			\
 	__u8  bLength;						\
 	__u8  bDescriptorType;					\
 	__u8  bDescriptorSubtype;				\
@@ -205,7 +205,7 @@ struct uac_input_terminal_descriptor {
 #define UAC_TERMINAL_CS_COPY_PROTECT_CONTROL		0x01
 
 /* 4.3.2.2 Output Terminal Descriptor */
-struct uac_output_terminal_descriptor_v1 {
+struct uac1_output_terminal_descriptor {
 	__u8  bLength;			/* in bytes: 9 */
 	__u8  bDescriptorType;		/* CS_INTERFACE descriptor type */
 	__u8  bDescriptorSubtype;	/* OUTPUT_TERMINAL descriptor subtype */
@@ -395,7 +395,7 @@ static inline __u8 *uac_processing_unit_specific(struct uac_processing_unit_desc
 }
 
 /* 4.5.2 Class-Specific AS Interface Descriptor */
-struct uac_as_header_descriptor_v1 {
+struct uac1_as_header_descriptor {
 	__u8  bLength;			/* in bytes: 7 */
 	__u8  bDescriptorType;		/* USB_DT_CS_INTERFACE */
 	__u8  bDescriptorSubtype;	/* AS_GENERAL */
diff --git a/sound/usb/card.c b/sound/usb/card.c
index 7a8ac1d81be7..9feb00c831a0 100644
--- a/sound/usb/card.c
+++ b/sound/usb/card.c
@@ -217,7 +217,7 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif)
 
 	switch (protocol) {
 	case UAC_VERSION_1: {
-		struct uac_ac_header_descriptor_v1 *h1 = control_header;
+		struct uac1_ac_header_descriptor *h1 = control_header;
 
 		if (!h1->bInCollection) {
 			snd_printk(KERN_INFO "skipping empty audio interface (v1)\n");
diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c
index 6f6596cf2b19..2af0f9e3dcdf 100644
--- a/sound/usb/endpoint.c
+++ b/sound/usb/endpoint.c
@@ -275,7 +275,7 @@ int snd_usb_parse_audio_endpoints(struct snd_usb_audio *chip, int iface_no)
 		/* get audio formats */
 		switch (protocol) {
 		case UAC_VERSION_1: {
-			struct uac_as_header_descriptor_v1 *as =
+			struct uac1_as_header_descriptor *as =
 				snd_usb_find_csint_desc(alts->extra, alts->extralen, NULL, UAC_AS_GENERAL);
 
 			if (!as) {
@@ -297,7 +297,7 @@ int snd_usb_parse_audio_endpoints(struct snd_usb_audio *chip, int iface_no)
 		case UAC_VERSION_2: {
 			struct uac2_input_terminal_descriptor *input_term;
 			struct uac2_output_terminal_descriptor *output_term;
-			struct uac_as_header_descriptor_v2 *as =
+			struct uac2_as_header_descriptor *as =
 				snd_usb_find_csint_desc(alts->extra, alts->extralen, NULL, UAC_AS_GENERAL);
 
 			if (!as) {
diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index 736d134cc03c..ba54eb6bb0c9 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -582,9 +582,9 @@ static int get_term_name(struct mixer_build *state, struct usb_audio_term *iterm
 		switch (iterm->type >> 16) {
 		case UAC_SELECTOR_UNIT:
 			strcpy(name, "Selector"); return 8;
-		case UAC_PROCESSING_UNIT_V1:
+		case UAC1_PROCESSING_UNIT:
 			strcpy(name, "Process Unit"); return 12;
-		case UAC_EXTENSION_UNIT_V1:
+		case UAC1_EXTENSION_UNIT:
 			strcpy(name, "Ext Unit"); return 8;
 		case UAC_MIXER_UNIT:
 			strcpy(name, "Mixer"); return 5;
@@ -672,8 +672,8 @@ static int check_input_term(struct mixer_build *state, int id, struct usb_audio_
 			term->name = uac_selector_unit_iSelector(d);
 			return 0;
 		}
-		case UAC_PROCESSING_UNIT_V1:
-		case UAC_EXTENSION_UNIT_V1: {
+		case UAC1_PROCESSING_UNIT:
+		case UAC1_EXTENSION_UNIT: {
 			struct uac_processing_unit_descriptor *d = p1;
 			if (d->bNrInPins) {
 				id = d->baSourceID[0];
@@ -1855,13 +1855,13 @@ static int parse_audio_unit(struct mixer_build *state, int unitid)
 		return parse_audio_selector_unit(state, unitid, p1);
 	case UAC_FEATURE_UNIT:
 		return parse_audio_feature_unit(state, unitid, p1);
-	case UAC_PROCESSING_UNIT_V1:
+	case UAC1_PROCESSING_UNIT:
 	/*   UAC2_EFFECT_UNIT has the same value */
 		if (state->mixer->protocol == UAC_VERSION_1)
 			return parse_audio_processing_unit(state, unitid, p1);
 		else
 			return 0; /* FIXME - effect units not implemented yet */
-	case UAC_EXTENSION_UNIT_V1:
+	case UAC1_EXTENSION_UNIT:
 	/*   UAC2_PROCESSING_UNIT_V2 has the same value */
 		if (state->mixer->protocol == UAC_VERSION_1)
 			return parse_audio_extension_unit(state, unitid, p1);
@@ -1925,7 +1925,7 @@ static int snd_usb_mixer_controls(struct usb_mixer_interface *mixer)
 	p = NULL;
 	while ((p = snd_usb_find_csint_desc(hostif->extra, hostif->extralen, p, UAC_OUTPUT_TERMINAL)) != NULL) {
 		if (mixer->protocol == UAC_VERSION_1) {
-			struct uac_output_terminal_descriptor_v1 *desc = p;
+			struct uac1_output_terminal_descriptor *desc = p;
 
 			if (desc->bLength < sizeof(*desc))
 				continue; /* invalid descriptor? */
-- 
cgit v1.2.3-59-g8ed1b


From 157a57b6fae7d3c6d24b7623dcc6679c6d244621 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Wed, 16 Jun 2010 17:57:30 +0200
Subject: ALSA: usb-audio: move and add some comments

Also add a list of open topics.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/usb/audio-v2.h | 15 +++++++++++++++
 sound/usb/clock.c            | 16 ++++++++++++++--
 sound/usb/mixer.c            | 24 ++++++++++++++++--------
 3 files changed, 45 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h
index 716aebe339e8..964cb603f7c7 100644
--- a/include/linux/usb/audio-v2.h
+++ b/include/linux/usb/audio-v2.h
@@ -18,6 +18,21 @@
 /* v1.0 and v2.0 of this standard have many things in common. For the rest
  * of the definitions, please refer to audio.h */
 
+/*
+ * bmControl field decoders
+ *
+ * From the USB Audio spec v2.0:
+ *
+ *   bmaControls() is a (ch+1)-element array of 4-byte bitmaps,
+ *   each containing a set of bit pairs. If a Control is present,
+ *   it must be Host readable. If a certain Control is not
+ *   present then the bit pair must be set to 0b00.
+ *   If a Control is present but read-only, the bit pair must be
+ *   set to 0b01. If a Control is also Host programmable, the bit
+ *   pair must be set to 0b11. The value 0b10 is not allowed.
+ *
+ */
+
 static inline bool uac2_control_is_readable(u32 bmControls, u8 control)
 {
 	return (bmControls >> (control * 2)) & 0x1;
diff --git a/sound/usb/clock.c b/sound/usb/clock.c
index 386b09c5ce73..7279d6190875 100644
--- a/sound/usb/clock.c
+++ b/sound/usb/clock.c
@@ -120,8 +120,6 @@ static bool uac_clock_source_is_valid(struct snd_usb_audio *chip, int source_id)
 	return !!data;
 }
 
-/* Try to find the clock source ID of a given clock entity */
-
 static int __uac_clock_find_source(struct snd_usb_audio *chip,
 				   struct usb_host_interface *host_iface,
 				   int entity_id, unsigned long *visited)
@@ -154,6 +152,8 @@ static int __uac_clock_find_source(struct snd_usb_audio *chip,
 		if (ret < 0)
 			return ret;
 
+		/* Selector values are one-based */
+
 		if (ret > selector->bNrInPins || ret < 1) {
 			printk(KERN_ERR
 				"%s(): selector reported illegal value, id %d, ret %d\n",
@@ -176,6 +176,17 @@ static int __uac_clock_find_source(struct snd_usb_audio *chip,
 	return -EINVAL;
 }
 
+/*
+ * For all kinds of sample rate settings and other device queries,
+ * the clock source (end-leaf) must be used. However, clock selectors,
+ * clock multipliers and sample rate converters may be specified as
+ * clock source input to terminal. This functions walks the clock path
+ * to its end and tries to find the source.
+ *
+ * The 'visited' bitfield is used internally to detect recursive loops.
+ *
+ * Returns the clock source UnitID (>=0) on success, or an error.
+ */
 int snd_usb_clock_find_source(struct snd_usb_audio *chip,
 			      struct usb_host_interface *host_iface,
 			      int entity_id)
@@ -246,6 +257,7 @@ static int set_sample_rate_v2(struct snd_usb_audio *chip, int iface,
 		return clock;
 
 	if (!uac_clock_source_is_valid(chip, clock)) {
+		/* TODO: should we try to find valid clock setups by ourself? */
 		snd_printk(KERN_ERR "%d:%d:%d: clock source %d is not valid, cannot use\n",
 			   dev->devnum, iface, fmt->altsetting, clock);
 		return -ENXIO;
diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index ba54eb6bb0c9..1163ec3ca8a0 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -26,6 +26,22 @@
  *
  */
 
+/*
+ * TODOs, for both the mixer and the streaming interfaces:
+ *
+ *  - support for UAC2 effect units
+ *  - support for graphical equalizers
+ *  - RANGE and MEM set commands (UAC2)
+ *  - RANGE and MEM interrupt dispatchers (UAC2)
+ *  - audio channel clustering (UAC2)
+ *  - audio sample rate converter units (UAC2)
+ *  - proper handling of clock multipliers (UAC2)
+ *  - dispatch clock change notifications (UAC2)
+ *  	- stop PCM streams which use a clock that became invalid
+ *  	- stop PCM streams which use a clock selector that has changed
+ *  	- parse available sample rates again when clock sources changed
+ */
+
 #include <linux/bitops.h>
 #include <linux/init.h>
 #include <linux/list.h>
@@ -1199,14 +1215,6 @@ static int parse_audio_feature_unit(struct mixer_build *state, int unitid, void
 		}
 	} else { /* UAC_VERSION_2 */
 		for (i = 0; i < 30/2; i++) {
-			/* From the USB Audio spec v2.0:
-			   bmaControls() is a (ch+1)-element array of 4-byte bitmaps,
-			   each containing a set of bit pairs. If a Control is present,
-			   it must be Host readable. If a certain Control is not
-			   present then the bit pair must be set to 0b00.
-			   If a Control is present but read-only, the bit pair must be
-			   set to 0b01. If a Control is also Host programmable, the bit
-			   pair must be set to 0b11. The value 0b10 is not allowed. */
 			unsigned int ch_bits = 0;
 			unsigned int ch_read_only = 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From 16b8a4761cbe5082cd35641c066d7c4b6b83cdca Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 22 Jun 2010 10:22:17 -0700
Subject: net: Introduce u64_stats_sync infrastructure

To properly implement 64bits network statistics on 32bit or 64bit hosts,
we provide one new type and four methods, to ease conversions.

Stats producer should use following template granted it already got an
exclusive access to counters (include/linux/u64_stats_sync.h contains
some documentation about details)

    u64_stats_update_begin(&stats->syncp);
    stats->bytes64 += len;
    stats->packets64++;
    u64_stats_update_end(&stats->syncp);

While a consumer should use following template to get consistent
snapshot :

    u64 tbytes, tpackets;
    unsigned int start;

    do {
        start = u64_stats_fetch_begin(&stats->syncp);
        tbytes = stats->bytes64;
        tpackets = stats->packets64;
    } while (u64_stats_fetch_retry(&stats->lock, syncp));

Suggested by David Miller, and comments courtesy of Nick Piggin.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 107 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 include/linux/u64_stats_sync.h

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
new file mode 100644
index 000000000000..d0505156ed52
--- /dev/null
+++ b/include/linux/u64_stats_sync.h
@@ -0,0 +1,107 @@
+#ifndef _LINUX_U64_STATS_SYNC_H
+#define _LINUX_U64_STATS_SYNC_H
+
+/*
+ * To properly implement 64bits network statistics on 32bit and 64bit hosts,
+ * we provide a synchronization point, that is a noop on 64bit or UP kernels.
+ *
+ * Key points :
+ * 1) Use a seqcount on SMP 32bits, with low overhead.
+ * 2) Whole thing is a noop on 64bit arches or UP kernels.
+ * 3) Write side must ensure mutual exclusion or one seqcount update could
+ *    be lost, thus blocking readers forever.
+ *    If this synchronization point is not a mutex, but a spinlock or
+ *    spinlock_bh() or disable_bh() :
+ * 3.1) Write side should not sleep.
+ * 3.2) Write side should not allow preemption.
+ * 3.3) If applicable, interrupts should be disabled.
+ *
+ * 4) If reader fetches several counters, there is no guarantee the whole values
+ *    are consistent (remember point 1) : this is a noop on 64bit arches anyway)
+ *
+ * 5) readers are allowed to sleep or be preempted/interrupted : They perform
+ *    pure reads. But if they have to fetch many values, it's better to not allow
+ *    preemptions/interruptions to avoid many retries.
+ *
+ * Usage :
+ *
+ * Stats producer (writer) should use following template granted it already got
+ * an exclusive access to counters (a lock is already taken, or per cpu
+ * data is used [in a non preemptable context])
+ *
+ *   spin_lock_bh(...) or other synchronization to get exclusive access
+ *   ...
+ *   u64_stats_update_begin(&stats->syncp);
+ *   stats->bytes64 += len; // non atomic operation
+ *   stats->packets64++;    // non atomic operation
+ *   u64_stats_update_end(&stats->syncp);
+ *
+ * While a consumer (reader) should use following template to get consistent
+ * snapshot for each variable (but no guarantee on several ones)
+ *
+ * u64 tbytes, tpackets;
+ * unsigned int start;
+ *
+ * do {
+ *         start = u64_stats_fetch_begin(&stats->syncp);
+ *         tbytes = stats->bytes64; // non atomic operation
+ *         tpackets = stats->packets64; // non atomic operation
+ * } while (u64_stats_fetch_retry(&stats->lock, syncp));
+ *
+ *
+ * Example of use in drivers/net/loopback.c, using per_cpu containers,
+ * in BH disabled context.
+ */
+#include <linux/seqlock.h>
+
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+struct u64_stats_sync {
+	seqcount_t	seq;
+};
+
+static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
+{
+	write_seqcount_begin(&syncp->seq);
+}
+
+static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
+{
+	write_seqcount_end(&syncp->seq);
+}
+
+static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+{
+	return read_seqcount_begin(&syncp->seq);
+}
+
+static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+					 unsigned int start)
+{
+	return read_seqcount_retry(&syncp->seq, start);
+}
+
+#else
+struct u64_stats_sync {
+};
+
+static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
+{
+}
+
+static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
+{
+}
+
+static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+{
+	return 0;
+}
+
+static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+					 unsigned int start)
+{
+	return false;
+}
+#endif
+
+#endif /* _LINUX_U64_STATS_SYNC_H */
-- 
cgit v1.2.3-59-g8ed1b


From 63a6404d8ae693e71ab27c4f9c4032aa29113e92 Mon Sep 17 00:00:00 2001
From: Henrik Rydberg <rydberg@euromail.se>
Date: Thu, 10 Jun 2010 12:05:24 -0700
Subject: Input: evdev - use driver hint to compute size of event buffer

Some devices, in particular MT devices, produce a lot of data.  This
may lead to overflowing of the event queues in evdev driver, which
by default are fairly small. Let the drivers hint the average number
of events per packet generated by the device, and use that information
when computing the buffer size evdev should use for the device.

Signed-off-by: Henrik Rydberg <rydberg@euromail.se>
Acked-by: Chase Douglas <chase.douglas@canonical.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/evdev.c |  9 +++++++--
 include/linux/input.h | 21 +++++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index cff7bf9351a8..30836c05edd7 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -10,7 +10,8 @@
 
 #define EVDEV_MINOR_BASE	64
 #define EVDEV_MINORS		32
-#define EVDEV_MIN_BUFFER_SIZE	64
+#define EVDEV_MIN_BUFFER_SIZE	64U
+#define EVDEV_BUF_PACKETS	8
 
 #include <linux/poll.h>
 #include <linux/sched.h>
@@ -245,7 +246,11 @@ static int evdev_release(struct inode *inode, struct file *file)
 
 static unsigned int evdev_compute_buffer_size(struct input_dev *dev)
 {
-	return EVDEV_MIN_BUFFER_SIZE;
+	unsigned int n_events =
+		max(dev->hint_events_per_packet * EVDEV_BUF_PACKETS,
+		    EVDEV_MIN_BUFFER_SIZE);
+
+	return roundup_pow_of_two(n_events);
 }
 
 static int evdev_open(struct inode *inode, struct file *file)
diff --git a/include/linux/input.h b/include/linux/input.h
index 6fcc9101beeb..cc524c8b6703 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1063,6 +1063,10 @@ struct ff_effect {
  * @sndbit: bitmap of sound effects supported by the device
  * @ffbit: bitmap of force feedback effects supported by the device
  * @swbit: bitmap of switches present on the device
+ * @hint_events_per_packet: average number of events generated by the
+ *	device in a packet (between EV_SYN/SYN_REPORT events). Used by
+ *	event handlers to estimate size of the buffer needed to hold
+ *	events.
  * @keycodemax: size of keycode table
  * @keycodesize: size of elements in keycode table
  * @keycode: map of scancodes to keycodes for this device
@@ -1140,6 +1144,8 @@ struct input_dev {
 	unsigned long ffbit[BITS_TO_LONGS(FF_CNT)];
 	unsigned long swbit[BITS_TO_LONGS(SW_CNT)];
 
+	unsigned int hint_events_per_packet;
+
 	unsigned int keycodemax;
 	unsigned int keycodesize;
 	void *keycode;
@@ -1408,6 +1414,21 @@ static inline void input_mt_sync(struct input_dev *dev)
 
 void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code);
 
+/**
+ * input_set_events_per_packet - tell handlers about the driver event rate
+ * @dev: the input device used by the driver
+ * @n_events: the average number of events between calls to input_sync()
+ *
+ * If the event rate sent from a device is unusually large, use this
+ * function to set the expected event rate. This will allow handlers
+ * to set up an appropriate buffer size for the event stream, in order
+ * to minimize information loss.
+ */
+static inline void input_set_events_per_packet(struct input_dev *dev, int n_events)
+{
+	dev->hint_events_per_packet = n_events;
+}
+
 static inline void input_set_abs_params(struct input_dev *dev, int axis, int min, int max, int fuzz, int flat)
 {
 	dev->absmin[axis] = min;
-- 
cgit v1.2.3-59-g8ed1b


From 7b2ff18ee7b0ec4bc3162f821e221781aaca48bd Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 15 Jun 2010 01:07:31 +0000
Subject: net - IP_NODEFRAG option for IPv4 socket

this patch is implementing IP_NODEFRAG option for IPv4 socket.
The reason is, there's no other way to send out the packet with user
customized header of the reassembly part.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h                  | 1 +
 include/net/inet_sock.h             | 3 ++-
 net/ipv4/af_inet.c                  | 2 ++
 net/ipv4/ip_sockglue.c              | 9 ++++++++-
 net/ipv4/netfilter/nf_defrag_ipv4.c | 5 +++++
 5 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/in.h b/include/linux/in.h
index 583c76f9c30f..41d88a4689af 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -85,6 +85,7 @@ struct in_addr {
 #define IP_RECVORIGDSTADDR   IP_ORIGDSTADDR
 
 #define IP_MINTTL       21
+#define IP_NODEFRAG     22
 
 /* IP_MTU_DISCOVER values */
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 1653de515cee..1989cfd7405f 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -137,7 +137,8 @@ struct inet_sock {
 				hdrincl:1,
 				mc_loop:1,
 				transparent:1,
-				mc_all:1;
+				mc_all:1,
+				nodefrag:1;
 	int			mc_index;
 	__be32			mc_addr;
 	struct ip_mc_socklist	*mc_list;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d99e7e020189..b4c0969137cb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -355,6 +355,8 @@ lookup_protocol:
 	inet = inet_sk(sk);
 	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
+	inet->nodefrag = 0;
+
 	if (SOCK_RAW == sock->type) {
 		inet->inet_num = protocol;
 		if (IPPROTO_RAW == protocol)
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 47fff528ff39..6c40a8c46e79 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -465,7 +465,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			     (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
 			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
 			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
-			     (1<<IP_MINTTL))) ||
+			     (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
 	    optname == IP_MULTICAST_TTL ||
 	    optname == IP_MULTICAST_ALL ||
 	    optname == IP_MULTICAST_LOOP ||
@@ -588,6 +588,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		}
 		inet->hdrincl = val ? 1 : 0;
 		break;
+	case IP_NODEFRAG:
+		if (sk->sk_type != SOCK_RAW) {
+			err = -ENOPROTOOPT;
+			break;
+		}
+		inet->nodefrag = val ? 1 : 0;
+		break;
 	case IP_MTU_DISCOVER:
 		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
 			goto e_inval;
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index cb763ae9ed90..eab8de32f200 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,6 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
 					  const struct net_device *out,
 					  int (*okfn)(struct sk_buff *))
 {
+	struct inet_sock *inet = inet_sk(skb->sk);
+
+	if (inet && inet->nodefrag)
+		return NF_ACCEPT;
+
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
 	/* Previously seen (loopback)?  Ignore.  Do this before
-- 
cgit v1.2.3-59-g8ed1b


From fa61cf70a6ae1089e459e4b59b2e8d8e90d8535e Mon Sep 17 00:00:00 2001
From: Juuso Oikarinen <juuso.oikarinen@nokia.com>
Date: Wed, 23 Jun 2010 12:12:37 +0300
Subject: cfg80211/mac80211: Update set_tx_power to use mBm instead of dBm
 units

In preparation for a TX power setting interface in the nl80211, change the
.set_tx_power function to use mBm units instead of dBm for greater accuracy and
smaller power levels.

Also, already in advance move the tx_power_setting enumeration to nl80211.

This change affects the .tx_set_power function prototype. As a result, the
corresponding changes are needed to modules using it. These are mac80211,
iwmc3200wifi and rndis_wlan.

Cc: Samuel Ortiz <samuel.ortiz@intel.com>
Cc: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Juuso Oikarinen <juuso.oikarinen@nokia.com>
Acked-by: Samuel Ortiz <samuel.ortiz@intel.com>
Acked-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/iwmc3200wifi/cfg80211.c | 12 ++++++++----
 drivers/net/wireless/rndis_wlan.c            | 20 +++++++++++++-------
 include/linux/nl80211.h                      | 13 +++++++++++++
 include/net/cfg80211.h                       | 15 +--------------
 net/mac80211/cfg.c                           | 22 +++++++++++-----------
 net/wireless/wext-compat.c                   | 10 +++++-----
 6 files changed, 51 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/iwmc3200wifi/cfg80211.c b/drivers/net/wireless/iwmc3200wifi/cfg80211.c
index 902e95f70f6e..60619678f4ec 100644
--- a/drivers/net/wireless/iwmc3200wifi/cfg80211.c
+++ b/drivers/net/wireless/iwmc3200wifi/cfg80211.c
@@ -670,20 +670,24 @@ static int iwm_cfg80211_disconnect(struct wiphy *wiphy, struct net_device *dev,
 }
 
 static int iwm_cfg80211_set_txpower(struct wiphy *wiphy,
-				    enum tx_power_setting type, int dbm)
+				    enum nl80211_tx_power_setting type, int mbm)
 {
 	struct iwm_priv *iwm = wiphy_to_iwm(wiphy);
 	int ret;
 
 	switch (type) {
-	case TX_POWER_AUTOMATIC:
+	case NL80211_TX_POWER_AUTOMATIC:
 		return 0;
-	case TX_POWER_FIXED:
+	case NL80211_TX_POWER_FIXED:
+		if (mbm < 0 || (mbm % 100))
+			return -EOPNOTSUPP;
+
 		if (!test_bit(IWM_STATUS_READY, &iwm->status))
 			return 0;
 
 		ret = iwm_umac_set_config_fix(iwm, UMAC_PARAM_TBL_CFG_FIX,
-					      CFG_TX_PWR_LIMIT_USR, dbm * 2);
+					      CFG_TX_PWR_LIMIT_USR,
+					      MBM_TO_DBM(mbm) * 2);
 		if (ret < 0)
 			return ret;
 
diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c
index 4102cca54882..5e26edb57d82 100644
--- a/drivers/net/wireless/rndis_wlan.c
+++ b/drivers/net/wireless/rndis_wlan.c
@@ -520,8 +520,9 @@ static int rndis_scan(struct wiphy *wiphy, struct net_device *dev,
 
 static int rndis_set_wiphy_params(struct wiphy *wiphy, u32 changed);
 
-static int rndis_set_tx_power(struct wiphy *wiphy, enum tx_power_setting type,
-				int dbm);
+static int rndis_set_tx_power(struct wiphy *wiphy,
+			      enum nl80211_tx_power_setting type,
+			      int mbm);
 static int rndis_get_tx_power(struct wiphy *wiphy, int *dbm);
 
 static int rndis_connect(struct wiphy *wiphy, struct net_device *dev,
@@ -1856,20 +1857,25 @@ static int rndis_set_wiphy_params(struct wiphy *wiphy, u32 changed)
 	return 0;
 }
 
-static int rndis_set_tx_power(struct wiphy *wiphy, enum tx_power_setting type,
-				int dbm)
+static int rndis_set_tx_power(struct wiphy *wiphy,
+			      enum nl80211_tx_power_setting type,
+			      int mbm)
 {
 	struct rndis_wlan_private *priv = wiphy_priv(wiphy);
 	struct usbnet *usbdev = priv->usbdev;
 
-	netdev_dbg(usbdev->net, "%s(): type:0x%x dbm:%i\n",
-		   __func__, type, dbm);
+	netdev_dbg(usbdev->net, "%s(): type:0x%x mbm:%i\n",
+		   __func__, type, mbm);
+
+	if (mbm < 0 || (mbm % 100))
+		return -ENOTSUPP;
 
 	/* Device doesn't support changing txpower after initialization, only
 	 * turn off/on radio. Support 'auto' mode and setting same dBm that is
 	 * currently used.
 	 */
-	if (type == TX_POWER_AUTOMATIC || dbm == get_bcm4320_power_dbm(priv)) {
+	if (type == NL80211_TX_POWER_AUTOMATIC ||
+	    MBM_TO_DBM(mbm) == get_bcm4320_power_dbm(priv)) {
 		if (!priv->radio_on)
 			disassociate(usbdev, true); /* turn on radio */
 
diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 64fb32b93a28..07aa04693f94 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -1659,4 +1659,17 @@ enum nl80211_cqm_rssi_threshold_event {
 	NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
 };
 
+
+/**
+ * enum nl80211_tx_power_setting - TX power adjustment
+ * @NL80211_TX_POWER_AUTOMATIC: automatically determine transmit power
+ * @NL80211_TX_POWER_LIMITED: limit TX power by the mBm parameter
+ * @NL80211_TX_POWER_FIXED: fix TX power to the mBm parameter
+ */
+enum nl80211_tx_power_setting {
+	NL80211_TX_POWER_AUTOMATIC,
+	NL80211_TX_POWER_LIMITED,
+	NL80211_TX_POWER_FIXED,
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 64374f4cb7c6..9b8b3f486ec8 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -875,19 +875,6 @@ enum wiphy_params_flags {
 	WIPHY_PARAM_COVERAGE_CLASS	= 1 << 4,
 };
 
-/**
- * enum tx_power_setting - TX power adjustment
- *
- * @TX_POWER_AUTOMATIC: the dbm parameter is ignored
- * @TX_POWER_LIMITED: limit TX power by the dbm parameter
- * @TX_POWER_FIXED: fix TX power to the dbm parameter
- */
-enum tx_power_setting {
-	TX_POWER_AUTOMATIC,
-	TX_POWER_LIMITED,
-	TX_POWER_FIXED,
-};
-
 /*
  * cfg80211_bitrate_mask - masks for bitrate control
  */
@@ -1149,7 +1136,7 @@ struct cfg80211_ops {
 	int	(*set_wiphy_params)(struct wiphy *wiphy, u32 changed);
 
 	int	(*set_tx_power)(struct wiphy *wiphy,
-				enum tx_power_setting type, int dbm);
+				enum nl80211_tx_power_setting type, int mbm);
 	int	(*get_tx_power)(struct wiphy *wiphy, int *dbm);
 
 	int	(*set_wds_peer)(struct wiphy *wiphy, struct net_device *dev,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 003b6addf5fa..f4efbfa4f237 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1329,28 +1329,28 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
 }
 
 static int ieee80211_set_tx_power(struct wiphy *wiphy,
-				  enum tx_power_setting type, int dbm)
+				  enum nl80211_tx_power_setting type, int mbm)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_channel *chan = local->hw.conf.channel;
 	u32 changes = 0;
 
 	switch (type) {
-	case TX_POWER_AUTOMATIC:
+	case NL80211_TX_POWER_AUTOMATIC:
 		local->user_power_level = -1;
 		break;
-	case TX_POWER_LIMITED:
-		if (dbm < 0)
-			return -EINVAL;
-		local->user_power_level = dbm;
+	case NL80211_TX_POWER_LIMITED:
+		if (mbm < 0 || (mbm % 100))
+			return -EOPNOTSUPP;
+		local->user_power_level = MBM_TO_DBM(mbm);
 		break;
-	case TX_POWER_FIXED:
-		if (dbm < 0)
-			return -EINVAL;
+	case NL80211_TX_POWER_FIXED:
+		if (mbm < 0 || (mbm % 100))
+			return -EOPNOTSUPP;
 		/* TODO: move to cfg80211 when it knows the channel */
-		if (dbm > chan->max_power)
+		if (MBM_TO_DBM(mbm) > chan->max_power)
 			return -EINVAL;
-		local->user_power_level = dbm;
+		local->user_power_level = MBM_TO_DBM(mbm);
 		break;
 	}
 
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 96342993cf93..1ff1e9f49136 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -829,7 +829,7 @@ int cfg80211_wext_siwtxpower(struct net_device *dev,
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
-	enum tx_power_setting type;
+	enum nl80211_tx_power_setting type;
 	int dbm = 0;
 
 	if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM)
@@ -852,7 +852,7 @@ int cfg80211_wext_siwtxpower(struct net_device *dev,
 			if (data->txpower.value < 0)
 				return -EINVAL;
 			dbm = data->txpower.value;
-			type = TX_POWER_FIXED;
+			type = NL80211_TX_POWER_FIXED;
 			/* TODO: do regulatory check! */
 		} else {
 			/*
@@ -860,10 +860,10 @@ int cfg80211_wext_siwtxpower(struct net_device *dev,
 			 * passed in from userland.
 			 */
 			if (data->txpower.value < 0) {
-				type = TX_POWER_AUTOMATIC;
+				type = NL80211_TX_POWER_AUTOMATIC;
 			} else {
 				dbm = data->txpower.value;
-				type = TX_POWER_LIMITED;
+				type = NL80211_TX_POWER_LIMITED;
 			}
 		}
 	} else {
@@ -872,7 +872,7 @@ int cfg80211_wext_siwtxpower(struct net_device *dev,
 		return 0;
 	}
 
-	return rdev->ops->set_tx_power(wdev->wiphy, type, dbm);
+	return rdev->ops->set_tx_power(wdev->wiphy, type, DBM_TO_MBM(dbm));
 }
 EXPORT_SYMBOL_GPL(cfg80211_wext_siwtxpower);
 
-- 
cgit v1.2.3-59-g8ed1b


From 98d2ff8bec82fc35fe2008a187a5fef9241dab10 Mon Sep 17 00:00:00 2001
From: Juuso Oikarinen <juuso.oikarinen@nokia.com>
Date: Wed, 23 Jun 2010 12:12:38 +0300
Subject: nl80211: Add option to adjust transmit power

This patch adds transmit power setting type and transmit power level attributes
to NL80211_CMD_SET_WIPHY in order to facilitate adjusting of the transmit power
level of the device.

The added attributes allow selection of automatic, limited or fixed transmit
power level, with the level definable in signed mBm format.

Signed-off-by: Juuso Oikarinen <juuso.oikarinen@nokia.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h |  9 +++++++++
 net/wireless/nl80211.c  | 31 +++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 07aa04693f94..2c8701687336 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -725,6 +725,12 @@ enum nl80211_commands {
  * @NL80211_ATTR_AP_ISOLATE: (AP mode) Do not forward traffic between stations
  *	connected to this BSS.
  *
+ * @NL80211_ATTR_WIPHY_TX_POWER_SETTING: Transmit power setting type. See
+ *      &enum nl80211_tx_power_setting for possible values.
+ * @NL80211_ATTR_WIPHY_TX_POWER_LEVEL: Transmit power level in signed mBm units.
+ *      This is used in association with @NL80211_ATTR_WIPHY_TX_POWER_SETTING
+ *      for non-automatic settings.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -882,6 +888,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_AP_ISOLATE,
 
+	NL80211_ATTR_WIPHY_TX_POWER_SETTING,
+	NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 41529aca794c..a999fc154623 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -153,6 +153,9 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_CQM] = { .type = NLA_NESTED, },
 	[NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG },
 	[NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 },
+
+	[NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 },
 };
 
 /* policy for the attributes */
@@ -869,6 +872,34 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			goto bad_res;
 	}
 
+	if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) {
+		enum nl80211_tx_power_setting type;
+		int idx, mbm = 0;
+
+		if (!rdev->ops->set_tx_power) {
+			return -EOPNOTSUPP;
+			goto bad_res;
+		}
+
+		idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING;
+		type = nla_get_u32(info->attrs[idx]);
+
+		if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] &&
+		    (type != NL80211_TX_POWER_AUTOMATIC)) {
+			result = -EINVAL;
+			goto bad_res;
+		}
+
+		if (type != NL80211_TX_POWER_AUTOMATIC) {
+			idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL;
+			mbm = nla_get_u32(info->attrs[idx]);
+		}
+
+		result = rdev->ops->set_tx_power(&rdev->wiphy, type, mbm);
+		if (result)
+			goto bad_res;
+	}
+
 	changed = 0;
 
 	if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) {
-- 
cgit v1.2.3-59-g8ed1b


From 45a73372efe4a63f44aa2e1125d4a777c2fdc8d8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 23 Jun 2010 23:00:37 +0200
Subject: hw_breakpoints: Fix per task breakpoint tracking

Freeing a perf event can happen in several ways. A task
calls perf_event_exit_task() right before exiting. This helper
will detach all the events from the task context and queue their
removal through free_event() if they are child tasks. The task
also loses its context reference there.

Releasing the breakpoint slot from the constraint table is made
from free_event() that calls release_bp_slot(). We count the number
of breakpoints this task is running by looking at the task's
perf_event_ctxp and iterating through its attached events.
But at this time, the reference to this context has been cleaned up
already.

So looking at the event->ctx instead of task->perf_event_ctxp
to count the remaining breakpoints should solve the problem.
At least it would for child breakpoints, but not for parent ones.
If the parent exits before the child, it will remove all its
events from the context but free_event() will be called later,
on fd release time. And checking the number of breakpoints the
task has attached to its context at this time is unreliable as all
events have been removed from the context.

To solve this, we keep track of the list of per task breakpoints.
On top of it, we maintain our array of numbers of breakpoints used
by the tasks. We use the context address as a task id.

So, instead of looking at the number of events attached to a context,
we walk through our list of per task breakpoints and count the number
of breakpoints that use the same ctx than the one to be reserved or
released from the constraint table, and update the count on top of this
result.

In the meantime it solves a bad refcounting, it also solves a warning,
reported by Paul.

Badness at /home/paulus/kernel/perf/kernel/hw_breakpoint.c:114
NIP: c0000000000cb470 LR: c0000000000cb46c CTR: c00000000032d9b8
REGS: c000000118e7b570 TRAP: 0700   Not tainted  (2.6.35-rc3-perf-00008-g76b0f13
)
MSR: 9000000000029032 <EE,ME,CE,IR,DR>  CR: 44004424  XER: 000fffff
TASK = c0000001187dcad0[3143] 'perf' THREAD: c000000118e78000 CPU: 1
GPR00: c0000000000cb46c c000000118e7b7f0 c0000000009866a0 0000000000000020
GPR04: 0000000000000000 000000000000001d 0000000000000000 0000000000000001
GPR08: c0000000009bed68 c00000000086dff8 c000000000a5bf10 0000000000000001
GPR12: 0000000024004422 c00000000ffff200 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000018 00000000101150f4
GPR20: 0000000010206b40 0000000000000000 0000000000000000 00000000101150f4
GPR24: c0000001199090c0 0000000000000001 0000000000000000 0000000000000001
GPR28: 0000000000000000 0000000000000000 c0000000008ec290 0000000000000000
NIP [c0000000000cb470] .task_bp_pinned+0x5c/0x12c
LR [c0000000000cb46c] .task_bp_pinned+0x58/0x12c
Call Trace:
[c000000118e7b7f0] [c0000000000cb46c] .task_bp_pinned+0x58/0x12c (unreliable)
[c000000118e7b8a0] [c0000000000cb584] .toggle_bp_task_slot+0x44/0xe4
[c000000118e7b940] [c0000000000cb6c8] .toggle_bp_slot+0xa4/0x164
[c000000118e7b9f0] [c0000000000cbafc] .release_bp_slot+0x44/0x6c
[c000000118e7ba80] [c0000000000c4178] .bp_perf_event_destroy+0x10/0x24
[c000000118e7bb00] [c0000000000c4aec] .free_event+0x180/0x1bc
[c000000118e7bbc0] [c0000000000c54c4] .perf_event_release_kernel+0x14c/0x170

Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
---
 include/linux/perf_event.h |  6 ++--
 kernel/hw_breakpoint.c     | 78 ++++++++++++++++++++++++----------------------
 2 files changed, 45 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 63b5aa5dce69..0dd5f8ad77ac 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -533,8 +533,10 @@ struct hw_perf_event {
 			struct hrtimer	hrtimer;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-		/* breakpoint */
-		struct arch_hw_breakpoint	info;
+		struct { /* breakpoint */
+			struct arch_hw_breakpoint	info;
+			struct list_head		bp_list;
+		};
 #endif
 	};
 	local64_t			prev_count;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..e34d94d50924 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -41,6 +41,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
 
 static int nr_slots[TYPE_MAX];
 
+/* Keep track of the breakpoints attached to tasks */
+static LIST_HEAD(bp_task_head);
+
 static int constraints_initialized;
 
 /* Gather the number of total pinned and un-pinned bp in a cpuset */
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 	return 0;
 }
 
-static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
+/*
+ * Count the number of breakpoints of the same type and same task.
+ * The given event must be not on the list.
+ */
+static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
 {
-	struct perf_event_context *ctx = tsk->perf_event_ctxp;
-	struct list_head *list;
-	struct perf_event *bp;
-	unsigned long flags;
+	struct perf_event_context *ctx = bp->ctx;
+	struct perf_event *iter;
 	int count = 0;
 
-	if (WARN_ONCE(!ctx, "No perf context for this task"))
-		return 0;
-
-	list = &ctx->event_list;
-
-	raw_spin_lock_irqsave(&ctx->lock, flags);
-
-	/*
-	 * The current breakpoint counter is not included in the list
-	 * at the open() callback time
-	 */
-	list_for_each_entry(bp, list, event_entry) {
-		if (bp->attr.type == PERF_TYPE_BREAKPOINT)
-			if (find_slot_idx(bp) == type)
-				count += hw_breakpoint_weight(bp);
+	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
+		if (iter->ctx == ctx && find_slot_idx(iter) == type)
+			count += hw_breakpoint_weight(iter);
 	}
 
-	raw_spin_unlock_irqrestore(&ctx->lock, flags);
-
 	return count;
 }
 
@@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		if (!tsk)
 			slots->pinned += max_task_bp_pinned(cpu, type);
 		else
-			slots->pinned += task_bp_pinned(tsk, type);
+			slots->pinned += task_bp_pinned(bp, type);
 		slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
 
 		return;
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		if (!tsk)
 			nr += max_task_bp_pinned(cpu, type);
 		else
-			nr += task_bp_pinned(tsk, type);
+			nr += task_bp_pinned(bp, type);
 
 		if (nr > slots->pinned)
 			slots->pinned = nr;
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
 /*
  * Add a pinned breakpoint for the given task in our constraint table
  */
-static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
 				enum bp_type_idx type, int weight)
 {
 	unsigned int *tsk_pinned;
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
 	int old_idx = 0;
 	int idx = 0;
 
-	old_count = task_bp_pinned(tsk, type);
+	old_count = task_bp_pinned(bp, type);
 	old_idx = old_count - 1;
 	idx = old_idx + weight;
 
+	/* tsk_pinned[n] is the number of tasks having n breakpoints */
 	tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 	if (enable) {
 		tsk_pinned[idx]++;
@@ -222,23 +215,30 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	int cpu = bp->cpu;
 	struct task_struct *tsk = bp->ctx->task;
 
+	/* Pinned counter cpu profiling */
+	if (!tsk) {
+
+		if (enable)
+			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
+		else
+			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+		return;
+	}
+
 	/* Pinned counter task profiling */
-	if (tsk) {
-		if (cpu >= 0) {
-			toggle_bp_task_slot(tsk, cpu, enable, type, weight);
-			return;
-		}
 
+	if (!enable)
+		list_del(&bp->hw.bp_list);
+
+	if (cpu >= 0) {
+		toggle_bp_task_slot(bp, cpu, enable, type, weight);
+	} else {
 		for_each_online_cpu(cpu)
-			toggle_bp_task_slot(tsk, cpu, enable, type, weight);
-		return;
+			toggle_bp_task_slot(bp, cpu, enable, type, weight);
 	}
 
-	/* Pinned counter cpu profiling */
 	if (enable)
-		per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
-	else
-		per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+		list_add_tail(&bp->hw.bp_list, &bp_task_head);
 }
 
 /*
@@ -301,6 +301,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
 	weight = hw_breakpoint_weight(bp);
 
 	fetch_bp_busy_slots(&slots, bp, type);
+	/*
+	 * Simulate the addition of this breakpoint to the constraints
+	 * and see the result.
+	 */
 	fetch_this_slot(&slots, weight);
 
 	/* Flexible counters need to keep at least one slot */
-- 
cgit v1.2.3-59-g8ed1b


From 5cfaf214856eb934759ae500a0b812dd06a00bd9 Mon Sep 17 00:00:00 2001
From: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
Date: Wed, 23 Jun 2010 09:17:53 +0900
Subject: perf: Fix argument of perf_arch_fetch_caller_regs

"struct regs" was set to argument of perf_arch_fetch_caller_regs
off-case. It should be "struct pt_regs".

This fixes various build errors in archs that have CONFIG_PERF_EVENTS=y
but no overriden implementation of perf_arch_fetch_caller_regs.

cc1: warnings being treated as errors
In file included from include/linux/ftrace_event.h:8,
                 from include/trace/syscall.h:6,
                 from include/linux/syscalls.h:75,
                 from arch/sh/kernel/sys_sh32.c:9:
include/linux/perf_event.h:937: error: 'struct regs' declared inside parameter list
include/linux/perf_event.h:937: error: its scope is only this definition or declaration, which is probably not what you want
include/linux/perf_event.h: In function 'perf_fetch_caller_regs':
include/linux/perf_event.h:952: error: passing argument 1 of 'perf_arch_fetch_caller_regs' from incompatible pointer type

Signed-off-by: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: David Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <AANLkTinKKFKEBQrZ3Hkj-XCaMwaTqulb-XnFzqEYiFRr@mail.gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0dd5f8ad77ac..937495c25073 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -936,7 +936,7 @@ extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
 
 #ifndef perf_arch_fetch_caller_regs
 static inline void
-perf_arch_fetch_caller_regs(struct regs *regs, unsigned long ip) { }
+perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
 #endif
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 69a4af606ed4836faa2ec69b1d217f384b8235e7 Mon Sep 17 00:00:00 2001
From: Xiaolong CHEN <a21785@motorola.com>
Date: Thu, 24 Jun 2010 19:10:40 -0700
Subject: Input: adp5588-keys - support GPI events for ADP5588 devices

A column or row configured as a GPI can be programmed to be part
of the key event table and therefore also capable of generating a
key event interrupt. A key event interrupt caused by a GPI follows
the same process flow as a key event interrupt caused by a key
press. GPIs configured as part of the key event table allow single
key switches and other GPI interrupts to be monitored. As part of
the event table, GPIs are represented by the decimal value 97 (0x61
or 1100001) through the decimal value 114 (0x72 or 1110010). See
table below for GPI event number assignments for rows and columns.

GPI Event Number Assignments for Rows
Row0 Row1 Row2 Row3 Row4 Row5 Row6 Row7
97   98   99   100  101  102  103  104

GPI Event Number Assignments for Cols
Col0 Col1 Col2 Col3 Col4 Col5 Col6 Col7 Col8 Col9
105  106  107  108  109  110  111  112  113  114

Signed-off-by: Xiaolong Chen <xiao-long.chen@motorola.com>
Signed-off-by: Yuanbo Ye <yuan-bo.ye@motorola.com>
Signed-off-by: Tao Hu <taohu@motorola.com>
Acked-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/adp5588-keys.c | 134 ++++++++++++++++++++++++++++++++--
 include/linux/i2c/adp5588.h           |  36 +++++++++
 2 files changed, 163 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c
index 4771ab172b59..4ef789ef1042 100644
--- a/drivers/input/keyboard/adp5588-keys.c
+++ b/drivers/input/keyboard/adp5588-keys.c
@@ -67,6 +67,8 @@ struct adp5588_kpad {
 	struct delayed_work work;
 	unsigned long delay;
 	unsigned short keycode[ADP5588_KEYMAPSIZE];
+	const struct adp5588_gpi_map *gpimap;
+	unsigned short gpimapsize;
 };
 
 static int adp5588_read(struct i2c_client *client, u8 reg)
@@ -84,12 +86,37 @@ static int adp5588_write(struct i2c_client *client, u8 reg, u8 val)
 	return i2c_smbus_write_byte_data(client, reg, val);
 }
 
+static void adp5588_report_events(struct adp5588_kpad *kpad, int ev_cnt)
+{
+	int i, j;
+
+	for (i = 0; i < ev_cnt; i++) {
+		int key = adp5588_read(kpad->client, Key_EVENTA + i);
+		int key_val = key & KEY_EV_MASK;
+
+		if (key_val >= GPI_PIN_BASE && key_val <= GPI_PIN_END) {
+			for (j = 0; j < kpad->gpimapsize; j++) {
+				if (key_val == kpad->gpimap[j].pin) {
+					input_report_switch(kpad->input,
+							kpad->gpimap[j].sw_evt,
+							key & KEY_EV_PRESSED);
+					break;
+				}
+			}
+		} else {
+			input_report_key(kpad->input,
+					 kpad->keycode[key_val - 1],
+					 key & KEY_EV_PRESSED);
+		}
+	}
+}
+
 static void adp5588_work(struct work_struct *work)
 {
 	struct adp5588_kpad *kpad = container_of(work,
 						struct adp5588_kpad, work.work);
 	struct i2c_client *client = kpad->client;
-	int i, key, status, ev_cnt;
+	int status, ev_cnt;
 
 	status = adp5588_read(client, INT_STAT);
 
@@ -99,12 +126,7 @@ static void adp5588_work(struct work_struct *work)
 	if (status & KE_INT) {
 		ev_cnt = adp5588_read(client, KEY_LCK_EC_STAT) & KEC;
 		if (ev_cnt) {
-			for (i = 0; i < ev_cnt; i++) {
-				key = adp5588_read(client, Key_EVENTA + i);
-				input_report_key(kpad->input,
-					kpad->keycode[(key & KEY_EV_MASK) - 1],
-					key & KEY_EV_PRESSED);
-			}
+			adp5588_report_events(kpad, ev_cnt);
 			input_sync(kpad->input);
 		}
 	}
@@ -130,6 +152,7 @@ static int __devinit adp5588_setup(struct i2c_client *client)
 {
 	struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
 	int i, ret;
+	unsigned char evt_mode1 = 0, evt_mode2 = 0, evt_mode3 = 0;
 
 	ret = adp5588_write(client, KP_GPIO1, KP_SEL(pdata->rows));
 	ret |= adp5588_write(client, KP_GPIO2, KP_SEL(pdata->cols) & 0xFF);
@@ -144,6 +167,23 @@ static int __devinit adp5588_setup(struct i2c_client *client)
 	for (i = 0; i < KEYP_MAX_EVENT; i++)
 		ret |= adp5588_read(client, Key_EVENTA);
 
+	for (i = 0; i < pdata->gpimapsize; i++) {
+		unsigned short pin = pdata->gpimap[i].pin;
+
+		if (pin <= GPI_PIN_ROW_END) {
+			evt_mode1 |= (1 << (pin - GPI_PIN_ROW_BASE));
+		} else {
+			evt_mode2 |= ((1 << (pin - GPI_PIN_COL_BASE)) & 0xFF);
+			evt_mode3 |= ((1 << (pin - GPI_PIN_COL_BASE)) >> 8);
+		}
+	}
+
+	if (pdata->gpimapsize) {
+		ret |= adp5588_write(client, GPI_EM1, evt_mode1);
+		ret |= adp5588_write(client, GPI_EM2, evt_mode2);
+		ret |= adp5588_write(client, GPI_EM3, evt_mode3);
+	}
+
 	ret |= adp5588_write(client, INT_STAT, CMP2_INT | CMP1_INT |
 					OVR_FLOW_INT | K_LCK_INT |
 					GPI_INT | KE_INT); /* Status is W1C */
@@ -158,6 +198,44 @@ static int __devinit adp5588_setup(struct i2c_client *client)
 	return 0;
 }
 
+static void __devinit adp5588_report_switch_state(struct adp5588_kpad *kpad)
+{
+	int gpi_stat1 = adp5588_read(kpad->client, GPIO_DAT_STAT1);
+	int gpi_stat2 = adp5588_read(kpad->client, GPIO_DAT_STAT2);
+	int gpi_stat3 = adp5588_read(kpad->client, GPIO_DAT_STAT3);
+	int gpi_stat_tmp, pin_loc;
+	int i;
+
+	for (i = 0; i < kpad->gpimapsize; i++) {
+		unsigned short pin = kpad->gpimap[i].pin;
+
+		if (pin <= GPI_PIN_ROW_END) {
+			gpi_stat_tmp = gpi_stat1;
+			pin_loc = pin - GPI_PIN_ROW_BASE;
+		} else if ((pin - GPI_PIN_COL_BASE) < 8) {
+			gpi_stat_tmp = gpi_stat2;
+			pin_loc = pin - GPI_PIN_COL_BASE;
+		} else {
+			gpi_stat_tmp = gpi_stat3;
+			pin_loc = pin - GPI_PIN_COL_BASE - 8;
+		}
+
+		if (gpi_stat_tmp < 0) {
+			dev_err(&kpad->client->dev,
+				"Can't read GPIO_DAT_STAT switch %d default to OFF\n",
+				pin);
+			gpi_stat_tmp = 0;
+		}
+
+		input_report_switch(kpad->input,
+				    kpad->gpimap[i].sw_evt,
+				    !(gpi_stat_tmp & (1 << pin_loc)));
+	}
+
+	input_sync(kpad->input);
+}
+
+
 static int __devinit adp5588_probe(struct i2c_client *client,
 					const struct i2c_device_id *id)
 {
@@ -189,6 +267,37 @@ static int __devinit adp5588_probe(struct i2c_client *client,
 		return -EINVAL;
 	}
 
+	if (!pdata->gpimap && pdata->gpimapsize) {
+		dev_err(&client->dev, "invalid gpimap from pdata\n");
+		return -EINVAL;
+	}
+
+	if (pdata->gpimapsize > ADP5588_GPIMAPSIZE_MAX) {
+		dev_err(&client->dev, "invalid gpimapsize\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < pdata->gpimapsize; i++) {
+		unsigned short pin = pdata->gpimap[i].pin;
+
+		if (pin < GPI_PIN_BASE || pin > GPI_PIN_END) {
+			dev_err(&client->dev, "invalid gpi pin data\n");
+			return -EINVAL;
+		}
+
+		if (pin <= GPI_PIN_ROW_END) {
+			if (pin - GPI_PIN_ROW_BASE + 1 <= pdata->rows) {
+				dev_err(&client->dev, "invalid gpi row data\n");
+				return -EINVAL;
+			}
+		} else {
+			if (pin - GPI_PIN_COL_BASE + 1 <= pdata->cols) {
+				dev_err(&client->dev, "invalid gpi col data\n");
+				return -EINVAL;
+			}
+		}
+	}
+
 	if (!client->irq) {
 		dev_err(&client->dev, "no IRQ?\n");
 		return -EINVAL;
@@ -233,6 +342,9 @@ static int __devinit adp5588_probe(struct i2c_client *client,
 	memcpy(kpad->keycode, pdata->keymap,
 		pdata->keymapsize * input->keycodesize);
 
+	kpad->gpimap = pdata->gpimap;
+	kpad->gpimapsize = pdata->gpimapsize;
+
 	/* setup input device */
 	__set_bit(EV_KEY, input->evbit);
 
@@ -243,6 +355,11 @@ static int __devinit adp5588_probe(struct i2c_client *client,
 		__set_bit(kpad->keycode[i] & KEY_MAX, input->keybit);
 	__clear_bit(KEY_RESERVED, input->keybit);
 
+	if (kpad->gpimapsize)
+		__set_bit(EV_SW, input->evbit);
+	for (i = 0; i < kpad->gpimapsize; i++)
+		__set_bit(kpad->gpimap[i].sw_evt, input->swbit);
+
 	error = input_register_device(input);
 	if (error) {
 		dev_err(&client->dev, "unable to register input device\n");
@@ -261,6 +378,9 @@ static int __devinit adp5588_probe(struct i2c_client *client,
 	if (error)
 		goto err_free_irq;
 
+	if (kpad->gpimapsize)
+		adp5588_report_switch_state(kpad);
+
 	device_init_wakeup(&client->dev, 1);
 	i2c_set_clientdata(client, kpad);
 
diff --git a/include/linux/i2c/adp5588.h b/include/linux/i2c/adp5588.h
index 02c9af374741..b5f57c498e24 100644
--- a/include/linux/i2c/adp5588.h
+++ b/include/linux/i2c/adp5588.h
@@ -78,6 +78,40 @@
 
 #define ADP5588_KEYMAPSIZE	80
 
+#define GPI_PIN_ROW0 97
+#define GPI_PIN_ROW1 98
+#define GPI_PIN_ROW2 99
+#define GPI_PIN_ROW3 100
+#define GPI_PIN_ROW4 101
+#define GPI_PIN_ROW5 102
+#define GPI_PIN_ROW6 103
+#define GPI_PIN_ROW7 104
+#define GPI_PIN_COL0 105
+#define GPI_PIN_COL1 106
+#define GPI_PIN_COL2 107
+#define GPI_PIN_COL3 108
+#define GPI_PIN_COL4 109
+#define GPI_PIN_COL5 110
+#define GPI_PIN_COL6 111
+#define GPI_PIN_COL7 112
+#define GPI_PIN_COL8 113
+#define GPI_PIN_COL9 114
+
+#define GPI_PIN_ROW_BASE GPI_PIN_ROW0
+#define GPI_PIN_ROW_END GPI_PIN_ROW7
+#define GPI_PIN_COL_BASE GPI_PIN_COL0
+#define GPI_PIN_COL_END GPI_PIN_COL9
+
+#define GPI_PIN_BASE GPI_PIN_ROW_BASE
+#define GPI_PIN_END GPI_PIN_COL_END
+
+#define ADP5588_GPIMAPSIZE_MAX (GPI_PIN_END - GPI_PIN_BASE + 1)
+
+struct adp5588_gpi_map {
+	unsigned short pin;
+	unsigned short sw_evt;
+};
+
 struct adp5588_kpad_platform_data {
 	int rows;			/* Number of rows */
 	int cols;			/* Number of columns */
@@ -87,6 +121,8 @@ struct adp5588_kpad_platform_data {
 	unsigned en_keylock:1;		/* Enable Key Lock feature */
 	unsigned short unlock_key1;	/* Unlock Key 1 */
 	unsigned short unlock_key2;	/* Unlock Key 2 */
+	const struct adp5588_gpi_map *gpimap;
+	unsigned short gpimapsize;
 };
 
 struct adp5588_gpio_platform_data {
-- 
cgit v1.2.3-59-g8ed1b


From fcb26ec5b18d88bb22366799d056dc3630d0e895 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Wed, 16 Jun 2010 23:02:23 +0000
Subject: broadcom: move all PHY_ID's to header

Move all PHY IDs to brcmphy.h header for completeness and unification of code.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 24 ++++++++++++------------
 include/linux/brcmphy.h    |  6 ++++++
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index cecdbbd549ec..b743d37532ff 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -685,7 +685,7 @@ static int brcm_fet_config_intr(struct phy_device *phydev)
 }
 
 static struct phy_driver bcm5411_driver = {
-	.phy_id		= 0x00206070,
+	.phy_id		= PHY_ID_BCM5411,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Broadcom BCM5411",
 	.features	= PHY_GBIT_FEATURES |
@@ -700,7 +700,7 @@ static struct phy_driver bcm5411_driver = {
 };
 
 static struct phy_driver bcm5421_driver = {
-	.phy_id		= 0x002060e0,
+	.phy_id		= PHY_ID_BCM5421,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Broadcom BCM5421",
 	.features	= PHY_GBIT_FEATURES |
@@ -715,7 +715,7 @@ static struct phy_driver bcm5421_driver = {
 };
 
 static struct phy_driver bcm5461_driver = {
-	.phy_id		= 0x002060c0,
+	.phy_id		= PHY_ID_BCM5461,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Broadcom BCM5461",
 	.features	= PHY_GBIT_FEATURES |
@@ -730,7 +730,7 @@ static struct phy_driver bcm5461_driver = {
 };
 
 static struct phy_driver bcm5464_driver = {
-	.phy_id		= 0x002060b0,
+	.phy_id		= PHY_ID_BCM5464,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Broadcom BCM5464",
 	.features	= PHY_GBIT_FEATURES |
@@ -745,7 +745,7 @@ static struct phy_driver bcm5464_driver = {
 };
 
 static struct phy_driver bcm5481_driver = {
-	.phy_id		= 0x0143bca0,
+	.phy_id		= PHY_ID_BCM5481,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Broadcom BCM5481",
 	.features	= PHY_GBIT_FEATURES |
@@ -760,7 +760,7 @@ static struct phy_driver bcm5481_driver = {
 };
 
 static struct phy_driver bcm5482_driver = {
-	.phy_id		= 0x0143bcb0,
+	.phy_id		= PHY_ID_BCM5482,
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Broadcom BCM5482",
 	.features	= PHY_GBIT_FEATURES |
@@ -910,12 +910,12 @@ module_init(broadcom_init);
 module_exit(broadcom_exit);
 
 static struct mdio_device_id broadcom_tbl[] = {
-	{ 0x00206070, 0xfffffff0 },
-	{ 0x002060e0, 0xfffffff0 },
-	{ 0x002060c0, 0xfffffff0 },
-	{ 0x002060b0, 0xfffffff0 },
-	{ 0x0143bca0, 0xfffffff0 },
-	{ 0x0143bcb0, 0xfffffff0 },
+	{ PHY_ID_BCM5411, 0xfffffff0 },
+	{ PHY_ID_BCM5421, 0xfffffff0 },
+	{ PHY_ID_BCM5461, 0xfffffff0 },
+	{ PHY_ID_BCM5464, 0xfffffff0 },
+	{ PHY_ID_BCM5482, 0xfffffff0 },
+	{ PHY_ID_BCM5482, 0xfffffff0 },
 	{ PHY_ID_BCM50610, 0xfffffff0 },
 	{ PHY_ID_BCM50610M, 0xfffffff0 },
 	{ PHY_ID_BCM57780, 0xfffffff0 },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 7f437ca1ed44..c14c3a1b64d2 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -1,6 +1,12 @@
 #define PHY_ID_BCM50610			0x0143bd60
 #define PHY_ID_BCM50610M		0x0143bd70
 #define PHY_ID_BCMAC131			0x0143bc70
+#define PHY_ID_BCM5481			0x0143bca0
+#define PHY_ID_BCM5482			0x0143bcb0
+#define PHY_ID_BCM5411			0x00206070
+#define PHY_ID_BCM5421			0x002060e0
+#define PHY_ID_BCM5464			0x002060b0
+#define PHY_ID_BCM5461			0x002060c0
 #define PHY_ID_BCM57780			0x03625d90
 
 #define PHY_BCM_OUI_MASK		0xfffffc00
-- 
cgit v1.2.3-59-g8ed1b


From 7a938f80264f2cbfb0c0841b450eab42a8093281 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Wed, 16 Jun 2010 23:02:24 +0000
Subject: broadcom: Add 5241 support

This patch adds the 5241 PHY ID to the broadcom module.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 22 ++++++++++++++++++++++
 include/linux/brcmphy.h    |  1 +
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index b743d37532ff..4accd83d3dfe 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -834,6 +834,21 @@ static struct phy_driver bcmac131_driver = {
 	.driver		= { .owner = THIS_MODULE },
 };
 
+static struct phy_driver bcm5241_driver = {
+	.phy_id		= PHY_ID_BCM5241,
+	.phy_id_mask	= 0xfffffff0,
+	.name		= "Broadcom BCM5241",
+	.features	= PHY_BASIC_FEATURES |
+			  SUPPORTED_Pause | SUPPORTED_Asym_Pause,
+	.flags		= PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
+	.config_init	= brcm_fet_config_init,
+	.config_aneg	= genphy_config_aneg,
+	.read_status	= genphy_read_status,
+	.ack_interrupt	= brcm_fet_ack_interrupt,
+	.config_intr	= brcm_fet_config_intr,
+	.driver		= { .owner = THIS_MODULE },
+};
+
 static int __init broadcom_init(void)
 {
 	int ret;
@@ -868,8 +883,13 @@ static int __init broadcom_init(void)
 	ret = phy_driver_register(&bcmac131_driver);
 	if (ret)
 		goto out_ac131;
+	ret = phy_driver_register(&bcm5241_driver);
+	if (ret)
+		goto out_5241;
 	return ret;
 
+out_5241:
+	phy_driver_unregister(&bcmac131_driver);
 out_ac131:
 	phy_driver_unregister(&bcm57780_driver);
 out_57780:
@@ -894,6 +914,7 @@ out_5411:
 
 static void __exit broadcom_exit(void)
 {
+	phy_driver_unregister(&bcm5241_driver);
 	phy_driver_unregister(&bcmac131_driver);
 	phy_driver_unregister(&bcm57780_driver);
 	phy_driver_unregister(&bcm50610m_driver);
@@ -920,6 +941,7 @@ static struct mdio_device_id broadcom_tbl[] = {
 	{ PHY_ID_BCM50610M, 0xfffffff0 },
 	{ PHY_ID_BCM57780, 0xfffffff0 },
 	{ PHY_ID_BCMAC131, 0xfffffff0 },
+	{ PHY_ID_BCM5241, 0xfffffff0 },
 	{ }
 };
 
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index c14c3a1b64d2..b840a4960282 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -1,5 +1,6 @@
 #define PHY_ID_BCM50610			0x0143bd60
 #define PHY_ID_BCM50610M		0x0143bd70
+#define PHY_ID_BCM5241			0x0143bc30
 #define PHY_ID_BCMAC131			0x0143bc70
 #define PHY_ID_BCM5481			0x0143bca0
 #define PHY_ID_BCM5482			0x0143bcb0
-- 
cgit v1.2.3-59-g8ed1b


From e27c729219ad24c8ac9a4b34cf192e56917565c5 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Fri, 25 Jun 2010 08:44:10 -0700
Subject: Input: add driver for ADXL345/346 Digital Accelerometers

This is a driver for the ADXL345/346 Three-Axis Digital Accelerometers.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Chris Verges <chrisv@cyberswitching.com>
Signed-off-by: Luotao Fu <l.fu@pengutronix.de>
Signed-off-by: Barry Song <barry.song@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/misc/Kconfig       |  37 ++
 drivers/input/misc/Makefile      |   3 +
 drivers/input/misc/adxl34x-i2c.c | 163 ++++++++
 drivers/input/misc/adxl34x-spi.c | 145 +++++++
 drivers/input/misc/adxl34x.c     | 840 +++++++++++++++++++++++++++++++++++++++
 drivers/input/misc/adxl34x.h     |  30 ++
 include/linux/input/adxl34x.h    | 293 ++++++++++++++
 7 files changed, 1511 insertions(+)
 create mode 100644 drivers/input/misc/adxl34x-i2c.c
 create mode 100644 drivers/input/misc/adxl34x-spi.c
 create mode 100644 drivers/input/misc/adxl34x.c
 create mode 100644 drivers/input/misc/adxl34x.h
 create mode 100644 include/linux/input/adxl34x.h

(limited to 'include/linux')

diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index c44b9eafc556..ede6d52fe95c 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -390,4 +390,41 @@ config INPUT_PCAP
 	  To compile this driver as a module, choose M here: the
 	  module will be called pcap_keys.
 
+config INPUT_ADXL34X
+	tristate "Analog Devices ADXL34x Three-Axis Digital Accelerometer"
+	default n
+	help
+	  Say Y here if you have a Accelerometer interface using the
+	  ADXL345/6 controller, and your board-specific initialization
+	  code includes that in its table of devices.
+
+	  This driver can use either I2C or SPI communication to the
+	  ADXL345/6 controller.  Select the appropriate method for
+	  your system.
+
+	  If unsure, say N (but it's safe to say "Y").
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called adxl34x.
+
+config INPUT_ADXL34X_I2C
+	tristate "support I2C bus connection"
+	depends on INPUT_ADXL34X && I2C
+	default y
+	help
+	  Say Y here if you have ADXL345/6 hooked to an I2C bus.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called adxl34x-i2c.
+
+config INPUT_ADXL34X_SPI
+	tristate "support SPI bus connection"
+	depends on INPUT_ADXL34X && SPI
+	default y
+	help
+	  Say Y here if you have ADXL345/6 hooked to a SPI bus.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called adxl34x-spi.
+
 endif
diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile
index 71fe57d8023f..97b5dc32df19 100644
--- a/drivers/input/misc/Makefile
+++ b/drivers/input/misc/Makefile
@@ -8,6 +8,9 @@ obj-$(CONFIG_INPUT_88PM860X_ONKEY)	+= 88pm860x_onkey.o
 obj-$(CONFIG_INPUT_AD714X)		+= ad714x.o
 obj-$(CONFIG_INPUT_AD714X_I2C)		+= ad714x-i2c.o
 obj-$(CONFIG_INPUT_AD714X_SPI)		+= ad714x-spi.o
+obj-$(CONFIG_INPUT_ADXL34X)		+= adxl34x.o
+obj-$(CONFIG_INPUT_ADXL34X_I2C)		+= adxl34x-i2c.o
+obj-$(CONFIG_INPUT_ADXL34X_SPI)		+= adxl34x-spi.o
 obj-$(CONFIG_INPUT_APANEL)		+= apanel.o
 obj-$(CONFIG_INPUT_ATI_REMOTE)		+= ati_remote.o
 obj-$(CONFIG_INPUT_ATI_REMOTE2)		+= ati_remote2.o
diff --git a/drivers/input/misc/adxl34x-i2c.c b/drivers/input/misc/adxl34x-i2c.c
new file mode 100644
index 000000000000..76194b58bd0b
--- /dev/null
+++ b/drivers/input/misc/adxl34x-i2c.c
@@ -0,0 +1,163 @@
+/*
+ * ADLX345/346 Three-Axis Digital Accelerometers (I2C Interface)
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Copyright (C) 2009 Michael Hennerich, Analog Devices Inc.
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/input.h>	/* BUS_I2C */
+#include <linux/i2c.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include "adxl34x.h"
+
+static int adxl34x_smbus_read(struct device *dev, unsigned char reg)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+
+	return i2c_smbus_read_byte_data(client, reg);
+}
+
+static int adxl34x_smbus_write(struct device *dev,
+			       unsigned char reg, unsigned char val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+
+	return i2c_smbus_write_byte_data(client, reg, val);
+}
+
+static int adxl34x_smbus_read_block(struct device *dev,
+				    unsigned char reg, int count,
+				    void *buf)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+
+	return i2c_smbus_read_i2c_block_data(client, reg, count, buf);
+}
+
+static int adxl34x_i2c_read_block(struct device *dev,
+				  unsigned char reg, int count,
+				  void *buf)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	int ret;
+
+	ret = i2c_master_send(client, &reg, 1);
+	if (ret < 0)
+		return ret;
+
+	ret = i2c_master_recv(client, buf, count);
+	if (ret < 0)
+		return ret;
+
+	if (ret != count)
+		return -EIO;
+
+	return 0;
+}
+
+static const struct adxl34x_bus_ops adx134x_smbus_bops = {
+	.bustype	= BUS_I2C,
+	.write		= adxl34x_smbus_write,
+	.read		= adxl34x_smbus_read,
+	.read_block	= adxl34x_smbus_read_block,
+};
+
+static const struct adxl34x_bus_ops adx134x_i2c_bops = {
+	.bustype	= BUS_I2C,
+	.write		= adxl34x_smbus_write,
+	.read		= adxl34x_smbus_read,
+	.read_block	= adxl34x_i2c_read_block,
+};
+
+static int __devinit adxl34x_i2c_probe(struct i2c_client *client,
+				       const struct i2c_device_id *id)
+{
+	struct adxl34x *ac;
+	int error;
+
+	error = i2c_check_functionality(client->adapter,
+			I2C_FUNC_SMBUS_BYTE_DATA);
+	if (!error) {
+		dev_err(&client->dev, "SMBUS Byte Data not Supported\n");
+		return -EIO;
+	}
+
+	ac = adxl34x_probe(&client->dev, client->irq, false,
+			   i2c_check_functionality(client->adapter,
+						   I2C_FUNC_SMBUS_READ_I2C_BLOCK) ?
+				&adx134x_smbus_bops : &adx134x_i2c_bops);
+	if (IS_ERR(ac))
+		return PTR_ERR(ac);
+
+	i2c_set_clientdata(client, ac);
+
+	return 0;
+}
+
+static int __devexit adxl34x_i2c_remove(struct i2c_client *client)
+{
+	struct adxl34x *ac = i2c_get_clientdata(client);
+
+	return adxl34x_remove(ac);
+}
+
+#ifdef CONFIG_PM
+static int adxl34x_suspend(struct i2c_client *client, pm_message_t message)
+{
+	struct adxl34x *ac = i2c_get_clientdata(client);
+
+	adxl34x_disable(ac);
+
+	return 0;
+}
+
+static int adxl34x_resume(struct i2c_client *client)
+{
+	struct adxl34x *ac = i2c_get_clientdata(client);
+
+	adxl34x_enable(ac);
+
+	return 0;
+}
+#else
+# define adxl34x_suspend NULL
+# define adxl34x_resume  NULL
+#endif
+
+static const struct i2c_device_id adxl34x_id[] = {
+	{ "adxl34x", 0 },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(i2c, adxl34x_id);
+
+static struct i2c_driver adxl34x_driver = {
+	.driver = {
+		.name = "adxl34x",
+		.owner = THIS_MODULE,
+	},
+	.probe    = adxl34x_i2c_probe,
+	.remove   = __devexit_p(adxl34x_i2c_remove),
+	.suspend  = adxl34x_suspend,
+	.resume   = adxl34x_resume,
+	.id_table = adxl34x_id,
+};
+
+static int __init adxl34x_i2c_init(void)
+{
+	return i2c_add_driver(&adxl34x_driver);
+}
+module_init(adxl34x_i2c_init);
+
+static void __exit adxl34x_i2c_exit(void)
+{
+	i2c_del_driver(&adxl34x_driver);
+}
+module_exit(adxl34x_i2c_exit);
+
+MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_DESCRIPTION("ADXL345/346 Three-Axis Digital Accelerometer I2C Bus Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/input/misc/adxl34x-spi.c b/drivers/input/misc/adxl34x-spi.c
new file mode 100644
index 000000000000..7f992353ffdd
--- /dev/null
+++ b/drivers/input/misc/adxl34x-spi.c
@@ -0,0 +1,145 @@
+/*
+ * ADLX345/346 Three-Axis Digital Accelerometers (SPI Interface)
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Copyright (C) 2009 Michael Hennerich, Analog Devices Inc.
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/input.h>	/* BUS_SPI */
+#include <linux/module.h>
+#include <linux/spi/spi.h>
+#include <linux/types.h>
+#include "adxl34x.h"
+
+#define MAX_SPI_FREQ_HZ		5000000
+#define MAX_FREQ_NO_FIFODELAY	1500000
+#define ADXL34X_CMD_MULTB	(1 << 6)
+#define ADXL34X_CMD_READ	(1 << 7)
+#define ADXL34X_WRITECMD(reg)	(reg & 0x3F)
+#define ADXL34X_READCMD(reg)	(ADXL34X_CMD_READ | (reg & 0x3F))
+#define ADXL34X_READMB_CMD(reg) (ADXL34X_CMD_READ | ADXL34X_CMD_MULTB \
+					| (reg & 0x3F))
+
+static int adxl34x_spi_read(struct device *dev, unsigned char reg)
+{
+	struct spi_device *spi = to_spi_device(dev);
+	unsigned char cmd;
+
+	cmd = ADXL34X_READCMD(reg);
+
+	return spi_w8r8(spi, cmd);
+}
+
+static int adxl34x_spi_write(struct device *dev,
+			     unsigned char reg, unsigned char val)
+{
+	struct spi_device *spi = to_spi_device(dev);
+	unsigned char buf[2];
+
+	buf[0] = ADXL34X_WRITECMD(reg);
+	buf[1] = val;
+
+	return spi_write(spi, buf, sizeof(buf));
+}
+
+static int adxl34x_spi_read_block(struct device *dev,
+				  unsigned char reg, int count,
+				  void *buf)
+{
+	struct spi_device *spi = to_spi_device(dev);
+	ssize_t status;
+
+	reg = ADXL34X_READMB_CMD(reg);
+	status = spi_write_then_read(spi, &reg, 1, buf, count);
+
+	return (status < 0) ? status : 0;
+}
+
+static const struct adxl34x_bus_ops adx134x_spi_bops = {
+	.bustype	= BUS_SPI,
+	.write		= adxl34x_spi_write,
+	.read		= adxl34x_spi_read,
+	.read_block	= adxl34x_spi_read_block,
+};
+
+static int __devinit adxl34x_spi_probe(struct spi_device *spi)
+{
+	struct adxl34x *ac;
+
+	/* don't exceed max specified SPI CLK frequency */
+	if (spi->max_speed_hz > MAX_SPI_FREQ_HZ) {
+		dev_err(&spi->dev, "SPI CLK %d Hz too fast\n", spi->max_speed_hz);
+		return -EINVAL;
+	}
+
+	ac = adxl34x_probe(&spi->dev, spi->irq,
+			   spi->max_speed_hz > MAX_FREQ_NO_FIFODELAY,
+			   &adx134x_spi_bops);
+
+	if (IS_ERR(ac))
+		return PTR_ERR(ac);
+
+	spi_set_drvdata(spi, ac);
+
+	return 0;
+}
+
+static int __devexit adxl34x_spi_remove(struct spi_device *spi)
+{
+	struct adxl34x *ac = dev_get_drvdata(&spi->dev);
+
+	return adxl34x_remove(ac);
+}
+
+#ifdef CONFIG_PM
+static int adxl34x_suspend(struct spi_device *spi, pm_message_t message)
+{
+	struct adxl34x *ac = dev_get_drvdata(&spi->dev);
+
+	adxl34x_disable(ac);
+
+	return 0;
+}
+
+static int adxl34x_resume(struct spi_device *spi)
+{
+	struct adxl34x *ac = dev_get_drvdata(&spi->dev);
+
+	adxl34x_enable(ac);
+
+	return 0;
+}
+#else
+# define adxl34x_suspend NULL
+# define adxl34x_resume  NULL
+#endif
+
+static struct spi_driver adxl34x_driver = {
+	.driver = {
+		.name = "adxl34x",
+		.bus = &spi_bus_type,
+		.owner = THIS_MODULE,
+	},
+	.probe   = adxl34x_spi_probe,
+	.remove  = __devexit_p(adxl34x_spi_remove),
+	.suspend = adxl34x_suspend,
+	.resume  = adxl34x_resume,
+};
+
+static int __init adxl34x_spi_init(void)
+{
+	return spi_register_driver(&adxl34x_driver);
+}
+module_init(adxl34x_spi_init);
+
+static void __exit adxl34x_spi_exit(void)
+{
+	spi_unregister_driver(&adxl34x_driver);
+}
+module_exit(adxl34x_spi_exit);
+
+MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_DESCRIPTION("ADXL345/346 Three-Axis Digital Accelerometer SPI Bus Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/input/misc/adxl34x.c b/drivers/input/misc/adxl34x.c
new file mode 100644
index 000000000000..07f9ef631540
--- /dev/null
+++ b/drivers/input/misc/adxl34x.c
@@ -0,0 +1,840 @@
+/*
+ * ADXL345/346 Three-Axis Digital Accelerometers
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Copyright (C) 2009 Michael Hennerich, Analog Devices Inc.
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/input/adxl34x.h>
+
+#include "adxl34x.h"
+
+/* ADXL345/6 Register Map */
+#define DEVID		0x00	/* R   Device ID */
+#define THRESH_TAP	0x1D	/* R/W Tap threshold */
+#define OFSX		0x1E	/* R/W X-axis offset */
+#define OFSY		0x1F	/* R/W Y-axis offset */
+#define OFSZ		0x20	/* R/W Z-axis offset */
+#define DUR		0x21	/* R/W Tap duration */
+#define LATENT		0x22	/* R/W Tap latency */
+#define WINDOW		0x23	/* R/W Tap window */
+#define THRESH_ACT	0x24	/* R/W Activity threshold */
+#define THRESH_INACT	0x25	/* R/W Inactivity threshold */
+#define TIME_INACT	0x26	/* R/W Inactivity time */
+#define ACT_INACT_CTL	0x27	/* R/W Axis enable control for activity and */
+				/* inactivity detection */
+#define THRESH_FF	0x28	/* R/W Free-fall threshold */
+#define TIME_FF		0x29	/* R/W Free-fall time */
+#define TAP_AXES	0x2A	/* R/W Axis control for tap/double tap */
+#define ACT_TAP_STATUS	0x2B	/* R   Source of tap/double tap */
+#define BW_RATE		0x2C	/* R/W Data rate and power mode control */
+#define POWER_CTL	0x2D	/* R/W Power saving features control */
+#define INT_ENABLE	0x2E	/* R/W Interrupt enable control */
+#define INT_MAP		0x2F	/* R/W Interrupt mapping control */
+#define INT_SOURCE	0x30	/* R   Source of interrupts */
+#define DATA_FORMAT	0x31	/* R/W Data format control */
+#define DATAX0		0x32	/* R   X-Axis Data 0 */
+#define DATAX1		0x33	/* R   X-Axis Data 1 */
+#define DATAY0		0x34	/* R   Y-Axis Data 0 */
+#define DATAY1		0x35	/* R   Y-Axis Data 1 */
+#define DATAZ0		0x36	/* R   Z-Axis Data 0 */
+#define DATAZ1		0x37	/* R   Z-Axis Data 1 */
+#define FIFO_CTL	0x38	/* R/W FIFO control */
+#define FIFO_STATUS	0x39	/* R   FIFO status */
+#define TAP_SIGN	0x3A	/* R   Sign and source for tap/double tap */
+/* Orientation ADXL346 only */
+#define ORIENT_CONF	0x3B	/* R/W Orientation configuration */
+#define ORIENT		0x3C	/* R   Orientation status */
+
+/* DEVIDs */
+#define ID_ADXL345	0xE5
+#define ID_ADXL346	0xE6
+
+/* INT_ENABLE/INT_MAP/INT_SOURCE Bits */
+#define DATA_READY	(1 << 7)
+#define SINGLE_TAP	(1 << 6)
+#define DOUBLE_TAP	(1 << 5)
+#define ACTIVITY	(1 << 4)
+#define INACTIVITY	(1 << 3)
+#define FREE_FALL	(1 << 2)
+#define WATERMARK	(1 << 1)
+#define OVERRUN		(1 << 0)
+
+/* ACT_INACT_CONTROL Bits */
+#define ACT_ACDC	(1 << 7)
+#define ACT_X_EN	(1 << 6)
+#define ACT_Y_EN	(1 << 5)
+#define ACT_Z_EN	(1 << 4)
+#define INACT_ACDC	(1 << 3)
+#define INACT_X_EN	(1 << 2)
+#define INACT_Y_EN	(1 << 1)
+#define INACT_Z_EN	(1 << 0)
+
+/* TAP_AXES Bits */
+#define SUPPRESS	(1 << 3)
+#define TAP_X_EN	(1 << 2)
+#define TAP_Y_EN	(1 << 1)
+#define TAP_Z_EN	(1 << 0)
+
+/* ACT_TAP_STATUS Bits */
+#define ACT_X_SRC	(1 << 6)
+#define ACT_Y_SRC	(1 << 5)
+#define ACT_Z_SRC	(1 << 4)
+#define ASLEEP		(1 << 3)
+#define TAP_X_SRC	(1 << 2)
+#define TAP_Y_SRC	(1 << 1)
+#define TAP_Z_SRC	(1 << 0)
+
+/* BW_RATE Bits */
+#define LOW_POWER	(1 << 4)
+#define RATE(x)		((x) & 0xF)
+
+/* POWER_CTL Bits */
+#define PCTL_LINK	(1 << 5)
+#define PCTL_AUTO_SLEEP (1 << 4)
+#define PCTL_MEASURE	(1 << 3)
+#define PCTL_SLEEP	(1 << 2)
+#define PCTL_WAKEUP(x)	((x) & 0x3)
+
+/* DATA_FORMAT Bits */
+#define SELF_TEST	(1 << 7)
+#define SPI		(1 << 6)
+#define INT_INVERT	(1 << 5)
+#define FULL_RES	(1 << 3)
+#define JUSTIFY		(1 << 2)
+#define RANGE(x)	((x) & 0x3)
+#define RANGE_PM_2g	0
+#define RANGE_PM_4g	1
+#define RANGE_PM_8g	2
+#define RANGE_PM_16g	3
+
+/*
+ * Maximum value our axis may get in full res mode for the input device
+ * (signed 13 bits)
+ */
+#define ADXL_FULLRES_MAX_VAL 4096
+
+/*
+ * Maximum value our axis may get in fixed res mode for the input device
+ * (signed 10 bits)
+ */
+#define ADXL_FIXEDRES_MAX_VAL 512
+
+/* FIFO_CTL Bits */
+#define FIFO_MODE(x)	(((x) & 0x3) << 6)
+#define FIFO_BYPASS	0
+#define FIFO_FIFO	1
+#define FIFO_STREAM	2
+#define FIFO_TRIGGER	3
+#define TRIGGER		(1 << 5)
+#define SAMPLES(x)	((x) & 0x1F)
+
+/* FIFO_STATUS Bits */
+#define FIFO_TRIG	(1 << 7)
+#define ENTRIES(x)	((x) & 0x3F)
+
+/* TAP_SIGN Bits ADXL346 only */
+#define XSIGN		(1 << 6)
+#define YSIGN		(1 << 5)
+#define ZSIGN		(1 << 4)
+#define XTAP		(1 << 3)
+#define YTAP		(1 << 2)
+#define ZTAP		(1 << 1)
+
+/* ORIENT_CONF ADXL346 only */
+#define ORIENT_DEADZONE(x)	(((x) & 0x7) << 4)
+#define ORIENT_DIVISOR(x)	((x) & 0x7)
+
+/* ORIENT ADXL346 only */
+#define ADXL346_2D_VALID		(1 << 6)
+#define ADXL346_2D_ORIENT(x)		(((x) & 0x3) >> 4)
+#define ADXL346_3D_VALID		(1 << 3)
+#define ADXL346_3D_ORIENT(x)		((x) & 0x7)
+#define ADXL346_2D_PORTRAIT_POS		0	/* +X */
+#define ADXL346_2D_PORTRAIT_NEG		1	/* -X */
+#define ADXL346_2D_LANDSCAPE_POS	2	/* +Y */
+#define ADXL346_2D_LANDSCAPE_NEG	3	/* -Y */
+
+#define ADXL346_3D_FRONT		3	/* +X */
+#define ADXL346_3D_BACK			4	/* -X */
+#define ADXL346_3D_RIGHT		2	/* +Y */
+#define ADXL346_3D_LEFT			5	/* -Y */
+#define ADXL346_3D_TOP			1	/* +Z */
+#define ADXL346_3D_BOTTOM		6	/* -Z */
+
+#undef ADXL_DEBUG
+
+#define ADXL_X_AXIS			0
+#define ADXL_Y_AXIS			1
+#define ADXL_Z_AXIS			2
+
+#define AC_READ(ac, reg)	((ac)->bops->read((ac)->dev, reg))
+#define AC_WRITE(ac, reg, val)	((ac)->bops->write((ac)->dev, reg, val))
+
+struct axis_triple {
+	int x;
+	int y;
+	int z;
+};
+
+struct adxl34x {
+	struct device *dev;
+	struct input_dev *input;
+	struct mutex mutex;	/* reentrant protection for struct */
+	struct adxl34x_platform_data pdata;
+	struct axis_triple swcal;
+	struct axis_triple hwcal;
+	struct axis_triple saved;
+	char phys[32];
+	bool disabled;	/* P: mutex */
+	bool opened;	/* P: mutex */
+	bool fifo_delay;
+	int irq;
+	unsigned model;
+	unsigned int_mask;
+
+	const struct adxl34x_bus_ops *bops;
+};
+
+static const struct adxl34x_platform_data adxl34x_default_init = {
+	.tap_threshold = 35,
+	.tap_duration = 3,
+	.tap_latency = 20,
+	.tap_window = 20,
+	.tap_axis_control = ADXL_TAP_X_EN | ADXL_TAP_Y_EN | ADXL_TAP_Z_EN,
+	.act_axis_control = 0xFF,
+	.activity_threshold = 6,
+	.inactivity_threshold = 4,
+	.inactivity_time = 3,
+	.free_fall_threshold = 8,
+	.free_fall_time = 0x20,
+	.data_rate = 8,
+	.data_range = ADXL_FULL_RES,
+
+	.ev_type = EV_ABS,
+	.ev_code_x = ABS_X,	/* EV_REL */
+	.ev_code_y = ABS_Y,	/* EV_REL */
+	.ev_code_z = ABS_Z,	/* EV_REL */
+
+	.ev_code_tap = {BTN_TOUCH, BTN_TOUCH, BTN_TOUCH}, /* EV_KEY {x,y,z} */
+	.power_mode = ADXL_AUTO_SLEEP | ADXL_LINK,
+	.fifo_mode = FIFO_STREAM,
+	.watermark = 0,
+};
+
+static void adxl34x_get_triple(struct adxl34x *ac, struct axis_triple *axis)
+{
+	short buf[3];
+
+	ac->bops->read_block(ac->dev, DATAX0, DATAZ1 - DATAX0 + 1, buf);
+
+	mutex_lock(&ac->mutex);
+	ac->saved.x = (s16) le16_to_cpu(buf[0]);
+	axis->x = ac->saved.x;
+
+	ac->saved.y = (s16) le16_to_cpu(buf[1]);
+	axis->y = ac->saved.y;
+
+	ac->saved.z = (s16) le16_to_cpu(buf[2]);
+	axis->z = ac->saved.z;
+	mutex_unlock(&ac->mutex);
+}
+
+static void adxl34x_service_ev_fifo(struct adxl34x *ac)
+{
+	struct adxl34x_platform_data *pdata = &ac->pdata;
+	struct axis_triple axis;
+
+	adxl34x_get_triple(ac, &axis);
+
+	input_event(ac->input, pdata->ev_type, pdata->ev_code_x,
+		    axis.x - ac->swcal.x);
+	input_event(ac->input, pdata->ev_type, pdata->ev_code_y,
+		    axis.y - ac->swcal.y);
+	input_event(ac->input, pdata->ev_type, pdata->ev_code_z,
+		    axis.z - ac->swcal.z);
+}
+
+static void adxl34x_report_key_single(struct input_dev *input, int key)
+{
+	input_report_key(input, key, true);
+	input_sync(input);
+	input_report_key(input, key, false);
+}
+
+static void adxl34x_send_key_events(struct adxl34x *ac,
+		struct adxl34x_platform_data *pdata, int status, int press)
+{
+	int i;
+
+	for (i = ADXL_X_AXIS; i <= ADXL_Z_AXIS; i++) {
+		if (status & (1 << (ADXL_Z_AXIS - i)))
+			input_report_key(ac->input,
+					 pdata->ev_code_tap[i], press);
+	}
+}
+
+static void adxl34x_do_tap(struct adxl34x *ac,
+		struct adxl34x_platform_data *pdata, int status)
+{
+	adxl34x_send_key_events(ac, pdata, status, true);
+	input_sync(ac->input);
+	adxl34x_send_key_events(ac, pdata, status, false);
+}
+
+static irqreturn_t adxl34x_irq(int irq, void *handle)
+{
+	struct adxl34x *ac = handle;
+	struct adxl34x_platform_data *pdata = &ac->pdata;
+	int int_stat, tap_stat, samples;
+
+	/*
+	 * ACT_TAP_STATUS should be read before clearing the interrupt
+	 * Avoid reading ACT_TAP_STATUS in case TAP detection is disabled
+	 */
+
+	if (pdata->tap_axis_control & (TAP_X_EN | TAP_Y_EN | TAP_Z_EN))
+		tap_stat = AC_READ(ac, ACT_TAP_STATUS);
+	else
+		tap_stat = 0;
+
+	int_stat = AC_READ(ac, INT_SOURCE);
+
+	if (int_stat & FREE_FALL)
+		adxl34x_report_key_single(ac->input, pdata->ev_code_ff);
+
+	if (int_stat & OVERRUN)
+		dev_dbg(ac->dev, "OVERRUN\n");
+
+	if (int_stat & (SINGLE_TAP | DOUBLE_TAP)) {
+		adxl34x_do_tap(ac, pdata, tap_stat);
+
+		if (int_stat & DOUBLE_TAP)
+			adxl34x_do_tap(ac, pdata, tap_stat);
+	}
+
+	if (pdata->ev_code_act_inactivity) {
+		if (int_stat & ACTIVITY)
+			input_report_key(ac->input,
+					 pdata->ev_code_act_inactivity, 1);
+		if (int_stat & INACTIVITY)
+			input_report_key(ac->input,
+					 pdata->ev_code_act_inactivity, 0);
+	}
+
+	if (int_stat & (DATA_READY | WATERMARK)) {
+
+		if (pdata->fifo_mode)
+			samples = ENTRIES(AC_READ(ac, FIFO_STATUS)) + 1;
+		else
+			samples = 1;
+
+		for (; samples > 0; samples--) {
+			adxl34x_service_ev_fifo(ac);
+			/*
+			 * To ensure that the FIFO has
+			 * completely popped, there must be at least 5 us between
+			 * the end of reading the data registers, signified by the
+			 * transition to register 0x38 from 0x37 or the CS pin
+			 * going high, and the start of new reads of the FIFO or
+			 * reading the FIFO_STATUS register. For SPI operation at
+			 * 1.5 MHz or lower, the register addressing portion of the
+			 * transmission is sufficient delay to ensure the FIFO has
+			 * completely popped. It is necessary for SPI operation
+			 * greater than 1.5 MHz to de-assert the CS pin to ensure a
+			 * total of 5 us, which is at most 3.4 us at 5 MHz
+			 * operation.
+			 */
+			if (ac->fifo_delay && (samples > 1))
+				udelay(3);
+		}
+	}
+
+	input_sync(ac->input);
+
+	return IRQ_HANDLED;
+}
+
+static void __adxl34x_disable(struct adxl34x *ac)
+{
+	if (!ac->disabled && ac->opened) {
+		/*
+		 * A '0' places the ADXL34x into standby mode
+		 * with minimum power consumption.
+		 */
+		AC_WRITE(ac, POWER_CTL, 0);
+
+		ac->disabled = true;
+	}
+}
+
+static void __adxl34x_enable(struct adxl34x *ac)
+{
+	if (ac->disabled && ac->opened) {
+		AC_WRITE(ac, POWER_CTL, ac->pdata.power_mode | PCTL_MEASURE);
+		ac->disabled = false;
+	}
+}
+
+void adxl34x_disable(struct adxl34x *ac)
+{
+	mutex_lock(&ac->mutex);
+	__adxl34x_disable(ac);
+	mutex_unlock(&ac->mutex);
+}
+EXPORT_SYMBOL_GPL(adxl34x_disable);
+
+void adxl34x_enable(struct adxl34x *ac)
+{
+	mutex_lock(&ac->mutex);
+	__adxl34x_enable(ac);
+	mutex_unlock(&ac->mutex);
+}
+
+EXPORT_SYMBOL_GPL(adxl34x_enable);
+
+static ssize_t adxl34x_disable_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%u\n", ac->disabled);
+}
+
+static ssize_t adxl34x_disable_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+	unsigned long val;
+	int error;
+
+	error = strict_strtoul(buf, 10, &val);
+	if (error)
+		return error;
+
+	if (val)
+		adxl34x_disable(ac);
+	else
+		adxl34x_enable(ac);
+
+	return count;
+}
+
+static DEVICE_ATTR(disable, 0664, adxl34x_disable_show, adxl34x_disable_store);
+
+static ssize_t adxl34x_calibrate_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+	ssize_t count;
+
+	mutex_lock(&ac->mutex);
+	count = sprintf(buf, "%d,%d,%d\n",
+			ac->hwcal.x * 4 + ac->swcal.x,
+			ac->hwcal.y * 4 + ac->swcal.y,
+			ac->hwcal.z * 4 + ac->swcal.z);
+	mutex_unlock(&ac->mutex);
+
+	return count;
+}
+
+static ssize_t adxl34x_calibrate_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+
+	/*
+	 * Hardware offset calibration has a resolution of 15.6 mg/LSB.
+	 * We use HW calibration and handle the remaining bits in SW. (4mg/LSB)
+	 */
+
+	mutex_lock(&ac->mutex);
+	ac->hwcal.x -= (ac->saved.x / 4);
+	ac->swcal.x = ac->saved.x % 4;
+
+	ac->hwcal.y -= (ac->saved.y / 4);
+	ac->swcal.y = ac->saved.y % 4;
+
+	ac->hwcal.z -= (ac->saved.z / 4);
+	ac->swcal.z = ac->saved.z % 4;
+
+	AC_WRITE(ac, OFSX, (s8) ac->hwcal.x);
+	AC_WRITE(ac, OFSY, (s8) ac->hwcal.y);
+	AC_WRITE(ac, OFSZ, (s8) ac->hwcal.z);
+	mutex_unlock(&ac->mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR(calibrate, 0664,
+		   adxl34x_calibrate_show, adxl34x_calibrate_store);
+
+static ssize_t adxl34x_rate_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%u\n", RATE(ac->pdata.data_rate));
+}
+
+static ssize_t adxl34x_rate_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+	unsigned long val;
+	int error;
+
+	error = strict_strtoul(buf, 10, &val);
+	if (error)
+		return error;
+
+	mutex_lock(&ac->mutex);
+
+	ac->pdata.data_rate = RATE(val);
+	AC_WRITE(ac, BW_RATE,
+		 ac->pdata.data_rate |
+			(ac->pdata.low_power_mode ? LOW_POWER : 0));
+
+	mutex_unlock(&ac->mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR(rate, 0664, adxl34x_rate_show, adxl34x_rate_store);
+
+static ssize_t adxl34x_autosleep_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%u\n",
+		ac->pdata.power_mode & (PCTL_AUTO_SLEEP | PCTL_LINK) ? 1 : 0);
+}
+
+static ssize_t adxl34x_autosleep_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+	unsigned long val;
+	int error;
+
+	error = strict_strtoul(buf, 10, &val);
+	if (error)
+		return error;
+
+	mutex_lock(&ac->mutex);
+
+	if (val)
+		ac->pdata.power_mode |= (PCTL_AUTO_SLEEP | PCTL_LINK);
+	else
+		ac->pdata.power_mode &= ~(PCTL_AUTO_SLEEP | PCTL_LINK);
+
+	if (!ac->disabled && ac->opened)
+		AC_WRITE(ac, POWER_CTL, ac->pdata.power_mode | PCTL_MEASURE);
+
+	mutex_unlock(&ac->mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR(autosleep, 0664,
+		   adxl34x_autosleep_show, adxl34x_autosleep_store);
+
+static ssize_t adxl34x_position_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+	ssize_t count;
+
+	mutex_lock(&ac->mutex);
+	count = sprintf(buf, "(%d, %d, %d)\n",
+			ac->saved.x, ac->saved.y, ac->saved.z);
+	mutex_unlock(&ac->mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR(position, S_IRUGO, adxl34x_position_show, NULL);
+
+#ifdef ADXL_DEBUG
+static ssize_t adxl34x_write_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	struct adxl34x *ac = dev_get_drvdata(dev);
+	unsigned long val;
+	int error;
+
+	/*
+	 * This allows basic ADXL register write access for debug purposes.
+	 */
+	error = strict_strtoul(buf, 16, &val);
+	if (error)
+		return error;
+
+	mutex_lock(&ac->mutex);
+	AC_WRITE(ac, val >> 8, val & 0xFF);
+	mutex_unlock(&ac->mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR(write, 0664, NULL, adxl34x_write_store);
+#endif
+
+static struct attribute *adxl34x_attributes[] = {
+	&dev_attr_disable.attr,
+	&dev_attr_calibrate.attr,
+	&dev_attr_rate.attr,
+	&dev_attr_autosleep.attr,
+	&dev_attr_position.attr,
+#ifdef ADXL_DEBUG
+	&dev_attr_write.attr,
+#endif
+	NULL
+};
+
+static const struct attribute_group adxl34x_attr_group = {
+	.attrs = adxl34x_attributes,
+};
+
+static int adxl34x_input_open(struct input_dev *input)
+{
+	struct adxl34x *ac = input_get_drvdata(input);
+
+	mutex_lock(&ac->mutex);
+	ac->opened = true;
+	__adxl34x_enable(ac);
+	mutex_unlock(&ac->mutex);
+
+	return 0;
+}
+
+static void adxl34x_input_close(struct input_dev *input)
+{
+	struct adxl34x *ac = input_get_drvdata(input);
+
+	mutex_lock(&ac->mutex);
+	__adxl34x_disable(ac);
+	ac->opened = false;
+	mutex_unlock(&ac->mutex);
+}
+
+struct adxl34x *adxl34x_probe(struct device *dev, int irq,
+			      bool fifo_delay_default,
+			      const struct adxl34x_bus_ops *bops)
+{
+	struct adxl34x *ac;
+	struct input_dev *input_dev;
+	const struct adxl34x_platform_data *pdata;
+	int err, range;
+	unsigned char revid;
+
+	if (!irq) {
+		dev_err(dev, "no IRQ?\n");
+		err = -ENODEV;
+		goto err_out;
+	}
+
+	ac = kzalloc(sizeof(*ac), GFP_KERNEL);
+	input_dev = input_allocate_device();
+	if (!ac || !input_dev) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	ac->fifo_delay = fifo_delay_default;
+
+	pdata = dev->platform_data;
+	if (!pdata) {
+		dev_dbg(dev,
+			"No platfrom data: Using default initialization\n");
+		pdata = &adxl34x_default_init;
+	}
+
+	ac->pdata = *pdata;
+	pdata = &ac->pdata;
+
+	ac->input = input_dev;
+	ac->disabled = true;
+	ac->dev = dev;
+	ac->irq = irq;
+	ac->bops = bops;
+
+	mutex_init(&ac->mutex);
+
+	input_dev->name = "ADXL34x accelerometer";
+	revid = ac->bops->read(dev, DEVID);
+
+	switch (revid) {
+	case ID_ADXL345:
+		ac->model = 345;
+		break;
+	case ID_ADXL346:
+		ac->model = 346;
+		break;
+	default:
+		dev_err(dev, "Failed to probe %s\n", input_dev->name);
+		err = -ENODEV;
+		goto err_free_mem;
+	}
+
+	snprintf(ac->phys, sizeof(ac->phys), "%s/input0", dev_name(dev));
+
+	input_dev->phys = ac->phys;
+	input_dev->dev.parent = dev;
+	input_dev->id.product = ac->model;
+	input_dev->id.bustype = bops->bustype;
+	input_dev->open = adxl34x_input_open;
+	input_dev->close = adxl34x_input_close;
+
+	input_set_drvdata(input_dev, ac);
+
+	__set_bit(ac->pdata.ev_type, input_dev->evbit);
+
+	if (ac->pdata.ev_type == EV_REL) {
+		__set_bit(REL_X, input_dev->relbit);
+		__set_bit(REL_Y, input_dev->relbit);
+		__set_bit(REL_Z, input_dev->relbit);
+	} else {
+		/* EV_ABS */
+		__set_bit(ABS_X, input_dev->absbit);
+		__set_bit(ABS_Y, input_dev->absbit);
+		__set_bit(ABS_Z, input_dev->absbit);
+
+		if (pdata->data_range & FULL_RES)
+			range = ADXL_FULLRES_MAX_VAL;	/* Signed 13-bit */
+		else
+			range = ADXL_FIXEDRES_MAX_VAL;	/* Signed 10-bit */
+
+		input_set_abs_params(input_dev, ABS_X, -range, range, 3, 3);
+		input_set_abs_params(input_dev, ABS_Y, -range, range, 3, 3);
+		input_set_abs_params(input_dev, ABS_Z, -range, range, 3, 3);
+	}
+
+	__set_bit(EV_KEY, input_dev->evbit);
+	__set_bit(pdata->ev_code_tap[ADXL_X_AXIS], input_dev->keybit);
+	__set_bit(pdata->ev_code_tap[ADXL_Y_AXIS], input_dev->keybit);
+	__set_bit(pdata->ev_code_tap[ADXL_Z_AXIS], input_dev->keybit);
+
+	if (pdata->ev_code_ff) {
+		ac->int_mask = FREE_FALL;
+		__set_bit(pdata->ev_code_ff, input_dev->keybit);
+	}
+
+	if (pdata->ev_code_act_inactivity)
+		__set_bit(pdata->ev_code_act_inactivity, input_dev->keybit);
+
+	ac->int_mask |= ACTIVITY | INACTIVITY;
+
+	if (pdata->watermark) {
+		ac->int_mask |= WATERMARK;
+		if (!FIFO_MODE(pdata->fifo_mode))
+			ac->pdata.fifo_mode |= FIFO_STREAM;
+	} else {
+		ac->int_mask |= DATA_READY;
+	}
+
+	if (pdata->tap_axis_control & (TAP_X_EN | TAP_Y_EN | TAP_Z_EN))
+		ac->int_mask |= SINGLE_TAP | DOUBLE_TAP;
+
+	if (FIFO_MODE(pdata->fifo_mode) == FIFO_BYPASS)
+		ac->fifo_delay = false;
+
+	ac->bops->write(dev, POWER_CTL, 0);
+
+	err = request_threaded_irq(ac->irq, NULL, adxl34x_irq,
+				   IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
+				   dev_name(dev), ac);
+	if (err) {
+		dev_err(dev, "irq %d busy?\n", ac->irq);
+		goto err_free_mem;
+	}
+
+	err = sysfs_create_group(&dev->kobj, &adxl34x_attr_group);
+	if (err)
+		goto err_free_irq;
+
+	err = input_register_device(input_dev);
+	if (err)
+		goto err_remove_attr;
+
+	AC_WRITE(ac, THRESH_TAP, pdata->tap_threshold);
+	AC_WRITE(ac, OFSX, pdata->x_axis_offset);
+	ac->hwcal.x = pdata->x_axis_offset;
+	AC_WRITE(ac, OFSY, pdata->y_axis_offset);
+	ac->hwcal.y = pdata->y_axis_offset;
+	AC_WRITE(ac, OFSZ, pdata->z_axis_offset);
+	ac->hwcal.z = pdata->z_axis_offset;
+	AC_WRITE(ac, THRESH_TAP, pdata->tap_threshold);
+	AC_WRITE(ac, DUR, pdata->tap_duration);
+	AC_WRITE(ac, LATENT, pdata->tap_latency);
+	AC_WRITE(ac, WINDOW, pdata->tap_window);
+	AC_WRITE(ac, THRESH_ACT, pdata->activity_threshold);
+	AC_WRITE(ac, THRESH_INACT, pdata->inactivity_threshold);
+	AC_WRITE(ac, TIME_INACT, pdata->inactivity_time);
+	AC_WRITE(ac, THRESH_FF, pdata->free_fall_threshold);
+	AC_WRITE(ac, TIME_FF, pdata->free_fall_time);
+	AC_WRITE(ac, TAP_AXES, pdata->tap_axis_control);
+	AC_WRITE(ac, ACT_INACT_CTL, pdata->act_axis_control);
+	AC_WRITE(ac, BW_RATE, RATE(ac->pdata.data_rate) |
+		 (pdata->low_power_mode ? LOW_POWER : 0));
+	AC_WRITE(ac, DATA_FORMAT, pdata->data_range);
+	AC_WRITE(ac, FIFO_CTL, FIFO_MODE(pdata->fifo_mode) |
+			SAMPLES(pdata->watermark));
+
+	if (pdata->use_int2)
+		/* Map all INTs to INT2 */
+		AC_WRITE(ac, INT_MAP, ac->int_mask | OVERRUN);
+	else
+		/* Map all INTs to INT1 */
+		AC_WRITE(ac, INT_MAP, 0);
+
+	AC_WRITE(ac, INT_ENABLE, ac->int_mask | OVERRUN);
+
+	ac->pdata.power_mode &= (PCTL_AUTO_SLEEP | PCTL_LINK);
+
+	return ac;
+
+ err_remove_attr:
+	sysfs_remove_group(&dev->kobj, &adxl34x_attr_group);
+ err_free_irq:
+	free_irq(ac->irq, ac);
+ err_free_mem:
+	input_free_device(input_dev);
+	kfree(ac);
+ err_out:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(adxl34x_probe);
+
+int adxl34x_remove(struct adxl34x *ac)
+{
+	adxl34x_disable(ac);
+	sysfs_remove_group(&ac->dev->kobj, &adxl34x_attr_group);
+	free_irq(ac->irq, ac);
+	input_unregister_device(ac->input);
+	kfree(ac);
+
+	dev_dbg(ac->dev, "unregistered accelerometer\n");
+	return 0;
+}
+EXPORT_SYMBOL_GPL(adxl34x_remove);
+
+MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_DESCRIPTION("ADXL345/346 Three-Axis Digital Accelerometer Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/input/misc/adxl34x.h b/drivers/input/misc/adxl34x.h
new file mode 100644
index 000000000000..ea9093c15c81
--- /dev/null
+++ b/drivers/input/misc/adxl34x.h
@@ -0,0 +1,30 @@
+/*
+ * ADXL345/346 Three-Axis Digital Accelerometers (I2C/SPI Interface)
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Copyright (C) 2009 Michael Hennerich, Analog Devices Inc.
+ * Licensed under the GPL-2 or later.
+ */
+
+#ifndef _ADXL34X_H_
+#define _ADXL34X_H_
+
+struct device;
+struct adxl34x;
+
+struct adxl34x_bus_ops {
+	u16 bustype;
+	int (*read)(struct device *, unsigned char);
+	int (*read_block)(struct device *, unsigned char, int, void *);
+	int (*write)(struct device *, unsigned char, unsigned char);
+};
+
+void adxl34x_disable(struct adxl34x *ac);
+void adxl34x_enable(struct adxl34x *ac);
+struct adxl34x *adxl34x_probe(struct device *dev, int irq,
+			      bool fifo_delay_default,
+			      const struct adxl34x_bus_ops *bops);
+int adxl34x_remove(struct adxl34x *ac);
+
+#endif
diff --git a/include/linux/input/adxl34x.h b/include/linux/input/adxl34x.h
new file mode 100644
index 000000000000..712118238038
--- /dev/null
+++ b/include/linux/input/adxl34x.h
@@ -0,0 +1,293 @@
+/*
+ * include/linux/input/adxl34x.h
+ *
+ * Digital Accelerometer characteristics are highly application specific
+ * and may vary between boards and models. The platform_data for the
+ * device's "struct device" holds this information.
+ *
+ * Copyright 2009 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#ifndef __LINUX_INPUT_ADXL34X_H__
+#define __LINUX_INPUT_ADXL34X_H__
+
+struct adxl34x_platform_data {
+
+	/*
+	 * X,Y,Z Axis Offset:
+	 * offer user offset adjustments in twoscompliment
+	 * form with a scale factor of 15.6 mg/LSB (i.e. 0x7F = +2 g)
+	 */
+
+	s8 x_axis_offset;
+	s8 y_axis_offset;
+	s8 z_axis_offset;
+
+	/*
+	 * TAP_X/Y/Z Enable: Setting TAP_X, Y, or Z Enable enables X,
+	 * Y, or Z participation in Tap detection. A '0' excludes the
+	 * selected axis from participation in Tap detection.
+	 * Setting the SUPPRESS bit suppresses Double Tap detection if
+	 * acceleration greater than tap_threshold is present between
+	 * taps.
+	 */
+
+#define ADXL_SUPPRESS	(1 << 3)
+#define ADXL_TAP_X_EN	(1 << 2)
+#define ADXL_TAP_Y_EN	(1 << 1)
+#define ADXL_TAP_Z_EN	(1 << 0)
+
+	u8 tap_axis_control;
+
+	/*
+	 * tap_threshold:
+	 * holds the threshold value for tap detection/interrupts.
+	 * The data format is unsigned. The scale factor is 62.5 mg/LSB
+	 * (i.e. 0xFF = +16 g). A zero value may result in undesirable
+	 * behavior if Tap/Double Tap is enabled.
+	 */
+
+	u8 tap_threshold;
+
+	/*
+	 * tap_duration:
+	 * is an unsigned time value representing the maximum
+	 * time that an event must be above the tap_threshold threshold
+	 * to qualify as a tap event. The scale factor is 625 us/LSB. A zero
+	 * value will prevent Tap/Double Tap functions from working.
+	 */
+
+	u8 tap_duration;
+
+	/*
+	 * tap_latency:
+	 * is an unsigned time value representing the wait time
+	 * from the detection of a tap event to the opening of the time
+	 * window tap_window for a possible second tap event. The scale
+	 * factor is 1.25 ms/LSB. A zero value will disable the Double Tap
+	 * function.
+	 */
+
+	u8 tap_latency;
+
+	/*
+	 * tap_window:
+	 * is an unsigned time value representing the amount
+	 * of time after the expiration of tap_latency during which a second
+	 * tap can begin. The scale factor is 1.25 ms/LSB. A zero value will
+	 * disable the Double Tap function.
+	 */
+
+	u8 tap_window;
+
+	/*
+	 * act_axis_control:
+	 * X/Y/Z Enable: A '1' enables X, Y, or Z participation in activity
+	 * or inactivity detection. A '0' excludes the selected axis from
+	 * participation. If all of the axes are excluded, the function is
+	 * disabled.
+	 * AC/DC: A '0' = DC coupled operation and a '1' = AC coupled
+	 * operation. In DC coupled operation, the current acceleration is
+	 * compared with activity_threshold and inactivity_threshold directly
+	 * to determine whether activity or inactivity is detected. In AC
+	 * coupled operation for activity detection, the acceleration value
+	 * at the start of activity detection is taken as a reference value.
+	 * New samples of acceleration are then compared to this
+	 * reference value and if the magnitude of the difference exceeds
+	 * activity_threshold the device will trigger an activity interrupt. In
+	 * AC coupled operation for inactivity detection, a reference value
+	 * is used again for comparison and is updated whenever the
+	 * device exceeds the inactivity threshold. Once the reference
+	 * value is selected, the device compares the magnitude of the
+	 * difference between the reference value and the current
+	 * acceleration with inactivity_threshold. If the difference is below
+	 * inactivity_threshold for a total of inactivity_time, the device is
+	 * considered inactive and the inactivity interrupt is triggered.
+	 */
+
+#define ADXL_ACT_ACDC		(1 << 7)
+#define ADXL_ACT_X_EN		(1 << 6)
+#define ADXL_ACT_Y_EN		(1 << 5)
+#define ADXL_ACT_Z_EN		(1 << 4)
+#define ADXL_INACT_ACDC		(1 << 3)
+#define ADXL_INACT_X_EN		(1 << 2)
+#define ADXL_INACT_Y_EN		(1 << 1)
+#define ADXL_INACT_Z_EN		(1 << 0)
+
+	u8 act_axis_control;
+
+	/*
+	 * activity_threshold:
+	 * holds the threshold value for activity detection.
+	 * The data format is unsigned. The scale factor is
+	 * 62.5 mg/LSB. A zero value may result in undesirable behavior if
+	 * Activity interrupt is enabled.
+	 */
+
+	u8 activity_threshold;
+
+	/*
+	 * inactivity_threshold:
+	 * holds the threshold value for inactivity
+	 * detection. The data format is unsigned. The scale
+	 * factor is 62.5 mg/LSB. A zero value may result in undesirable
+	 * behavior if Inactivity interrupt is enabled.
+	 */
+
+	u8 inactivity_threshold;
+
+	/*
+	 * inactivity_time:
+	 * is an unsigned time value representing the
+	 * amount of time that acceleration must be below the value in
+	 * inactivity_threshold for inactivity to be declared. The scale factor
+	 * is 1 second/LSB. Unlike the other interrupt functions, which
+	 * operate on unfiltered data, the inactivity function operates on the
+	 * filtered output data. At least one output sample must be
+	 * generated for the inactivity interrupt to be triggered. This will
+	 * result in the function appearing un-responsive if the
+	 * inactivity_time register is set with a value less than the time
+	 * constant of the Output Data Rate. A zero value will result in an
+	 * interrupt when the output data is below inactivity_threshold.
+	 */
+
+	u8 inactivity_time;
+
+	/*
+	 * free_fall_threshold:
+	 * holds the threshold value for Free-Fall detection.
+	 * The data format is unsigned. The root-sum-square(RSS) value
+	 * of all axes is calculated and compared to the value in
+	 * free_fall_threshold to determine if a free fall event may be
+	 * occurring.  The scale factor is 62.5 mg/LSB. A zero value may
+	 * result in undesirable behavior if Free-Fall interrupt is
+	 * enabled. Values between 300 and 600 mg (0x05 to 0x09) are
+	 * recommended.
+	 */
+
+	u8 free_fall_threshold;
+
+	/*
+	 * free_fall_time:
+	 * is an unsigned time value representing the minimum
+	 * time that the RSS value of all axes must be less than
+	 * free_fall_threshold to generate a Free-Fall interrupt. The
+	 * scale factor is 5 ms/LSB. A zero value may result in
+	 * undesirable behavior if Free-Fall interrupt is enabled.
+	 * Values between 100 to 350 ms (0x14 to 0x46) are recommended.
+	 */
+
+	u8 free_fall_time;
+
+	/*
+	 * data_rate:
+	 * Selects device bandwidth and output data rate.
+	 * RATE = 3200 Hz / (2^(15 - x)). Default value is 0x0A, or 100 Hz
+	 * Output Data Rate. An Output Data Rate should be selected that
+	 * is appropriate for the communication protocol and frequency
+	 * selected. Selecting too high of an Output Data Rate with a low
+	 * communication speed will result in samples being discarded.
+	 */
+
+	u8 data_rate;
+
+	/*
+	 * data_range:
+	 * FULL_RES: When this bit is set with the device is
+	 * in Full-Resolution Mode, where the output resolution increases
+	 * with RANGE to maintain a 4 mg/LSB scale factor. When this
+	 * bit is cleared the device is in 10-bit Mode and RANGE determine the
+	 * maximum g-Range and scale factor.
+	 */
+
+#define ADXL_FULL_RES		(1 << 3)
+#define ADXL_RANGE_PM_2g	0
+#define ADXL_RANGE_PM_4g	1
+#define ADXL_RANGE_PM_8g	2
+#define ADXL_RANGE_PM_16g	3
+
+	u8 data_range;
+
+	/*
+	 * low_power_mode:
+	 * A '0' = Normal operation and a '1' = Reduced
+	 * power operation with somewhat higher noise.
+	 */
+
+	u8 low_power_mode;
+
+	/*
+	 * power_mode:
+	 * LINK: A '1' with both the activity and inactivity functions
+	 * enabled will delay the start of the activity function until
+	 * inactivity is detected. Once activity is detected, inactivity
+	 * detection will begin and prevent the detection of activity. This
+	 * bit serially links the activity and inactivity functions. When '0'
+	 * the inactivity and activity functions are concurrent. Additional
+	 * information can be found in the Application section under Link
+	 * Mode.
+	 * AUTO_SLEEP: A '1' sets the ADXL34x to switch to Sleep Mode
+	 * when inactivity (acceleration has been below inactivity_threshold
+	 * for at least inactivity_time) is detected and the LINK bit is set.
+	 * A '0' disables automatic switching to Sleep Mode. See SLEEP
+	 * for further description.
+	 */
+
+#define ADXL_LINK	(1 << 5)
+#define ADXL_AUTO_SLEEP	(1 << 4)
+
+	u8 power_mode;
+
+	/*
+	 * fifo_mode:
+	 * BYPASS The FIFO is bypassed
+	 * FIFO   FIFO collects up to 32 values then stops collecting data
+	 * STREAM FIFO holds the last 32 data values. Once full, the FIFO's
+	 *        oldest data is lost as it is replaced with newer data
+	 *
+	 * DEFAULT should be ADXL_FIFO_STREAM
+	 */
+
+#define ADXL_FIFO_BYPASS	0
+#define ADXL_FIFO_FIFO		1
+#define ADXL_FIFO_STREAM	2
+
+	u8 fifo_mode;
+
+	/*
+	 * watermark:
+	 * The Watermark feature can be used to reduce the interrupt load
+	 * of the system. The FIFO fills up to the value stored in watermark
+	 * [1..32] and then generates an interrupt.
+	 * A '0' disables the watermark feature.
+	 */
+
+	u8 watermark;
+
+	u32 ev_type;	/* EV_ABS or EV_REL */
+
+	u32 ev_code_x;	/* ABS_X,Y,Z or REL_X,Y,Z */
+	u32 ev_code_y;	/* ABS_X,Y,Z or REL_X,Y,Z */
+	u32 ev_code_z;	/* ABS_X,Y,Z or REL_X,Y,Z */
+
+	/*
+	 * A valid BTN or KEY Code; use tap_axis_control to disable
+	 * event reporting
+	 */
+
+	u32 ev_code_tap[3];	/* EV_KEY {X-Axis, Y-Axis, Z-Axis} */
+
+	/*
+	 * A valid BTN or KEY Code for Free-Fall or Activity enables
+	 * input event reporting. A '0' disables the Free-Fall or
+	 * Activity reporting.
+	 */
+
+	u32 ev_code_ff;	/* EV_KEY */
+	u32 ev_code_act_inactivity;	/* EV_KEY */
+
+	u8 use_int2;
+};
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 671386bb23c57e5448f386a41101ed65ad1d488c Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Fri, 25 Jun 2010 08:44:10 -0700
Subject: Input: adxl34x - add support for ADXL346 orientation sensing

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/misc/adxl34x.c  | 62 ++++++++++++++++++++++++++++++++++++++++---
 include/linux/input/adxl34x.h | 56 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/misc/adxl34x.c b/drivers/input/misc/adxl34x.c
index 07f9ef631540..77fb40987059 100644
--- a/drivers/input/misc/adxl34x.c
+++ b/drivers/input/misc/adxl34x.c
@@ -196,6 +196,8 @@ struct adxl34x {
 	struct axis_triple hwcal;
 	struct axis_triple saved;
 	char phys[32];
+	unsigned orient2d_saved;
+	unsigned orient3d_saved;
 	bool disabled;	/* P: mutex */
 	bool opened;	/* P: mutex */
 	bool fifo_delay;
@@ -296,7 +298,7 @@ static irqreturn_t adxl34x_irq(int irq, void *handle)
 {
 	struct adxl34x *ac = handle;
 	struct adxl34x_platform_data *pdata = &ac->pdata;
-	int int_stat, tap_stat, samples;
+	int int_stat, tap_stat, samples, orient, orient_code;
 
 	/*
 	 * ACT_TAP_STATUS should be read before clearing the interrupt
@@ -332,6 +334,36 @@ static irqreturn_t adxl34x_irq(int irq, void *handle)
 					 pdata->ev_code_act_inactivity, 0);
 	}
 
+	/*
+	 * ORIENTATION SENSING ADXL346 only
+	 */
+	if (pdata->orientation_enable) {
+		orient = AC_READ(ac, ORIENT);
+		if ((pdata->orientation_enable & ADXL_EN_ORIENTATION_2D) &&
+		    (orient & ADXL346_2D_VALID)) {
+
+			orient_code = ADXL346_2D_ORIENT(orient);
+			/* Report orientation only when it changes */
+			if (ac->orient2d_saved != orient_code) {
+				ac->orient2d_saved = orient_code;
+				adxl34x_report_key_single(ac->input,
+					pdata->ev_codes_orient_2d[orient_code]);
+			}
+		}
+
+		if ((pdata->orientation_enable & ADXL_EN_ORIENTATION_3D) &&
+		    (orient & ADXL346_3D_VALID)) {
+
+			orient_code = ADXL346_3D_ORIENT(orient) - 1;
+			/* Report orientation only when it changes */
+			if (ac->orient3d_saved != orient_code) {
+				ac->orient3d_saved = orient_code;
+				adxl34x_report_key_single(ac->input,
+					pdata->ev_codes_orient_3d[orient_code]);
+			}
+		}
+	}
+
 	if (int_stat & (DATA_READY | WATERMARK)) {
 
 		if (pdata->fifo_mode)
@@ -641,7 +673,7 @@ struct adxl34x *adxl34x_probe(struct device *dev, int irq,
 	struct adxl34x *ac;
 	struct input_dev *input_dev;
 	const struct adxl34x_platform_data *pdata;
-	int err, range;
+	int err, range, i;
 	unsigned char revid;
 
 	if (!irq) {
@@ -797,12 +829,34 @@ struct adxl34x *adxl34x_probe(struct device *dev, int irq,
 	AC_WRITE(ac, FIFO_CTL, FIFO_MODE(pdata->fifo_mode) |
 			SAMPLES(pdata->watermark));
 
-	if (pdata->use_int2)
+	if (pdata->use_int2) {
 		/* Map all INTs to INT2 */
 		AC_WRITE(ac, INT_MAP, ac->int_mask | OVERRUN);
-	else
+	} else {
 		/* Map all INTs to INT1 */
 		AC_WRITE(ac, INT_MAP, 0);
+	}
+
+	if (ac->model == 346 && ac->pdata.orientation_enable) {
+		AC_WRITE(ac, ORIENT_CONF,
+			ORIENT_DEADZONE(ac->pdata.deadzone_angle) |
+			ORIENT_DIVISOR(ac->pdata.divisor_length));
+
+		ac->orient2d_saved = 1234;
+		ac->orient3d_saved = 1234;
+
+		if (pdata->orientation_enable & ADXL_EN_ORIENTATION_3D)
+			for (i = 0; i < ARRAY_SIZE(pdata->ev_codes_orient_3d); i++)
+				__set_bit(pdata->ev_codes_orient_3d[i],
+					  input_dev->keybit);
+
+		if (pdata->orientation_enable & ADXL_EN_ORIENTATION_2D)
+			for (i = 0; i < ARRAY_SIZE(pdata->ev_codes_orient_2d); i++)
+				__set_bit(pdata->ev_codes_orient_2d[i],
+					  input_dev->keybit);
+	} else {
+		ac->pdata.orientation_enable = 0;
+	}
 
 	AC_WRITE(ac, INT_ENABLE, ac->int_mask | OVERRUN);
 
diff --git a/include/linux/input/adxl34x.h b/include/linux/input/adxl34x.h
index 712118238038..df00d998a44a 100644
--- a/include/linux/input/adxl34x.h
+++ b/include/linux/input/adxl34x.h
@@ -288,6 +288,62 @@ struct adxl34x_platform_data {
 	u32 ev_code_ff;	/* EV_KEY */
 	u32 ev_code_act_inactivity;	/* EV_KEY */
 
+	/*
+	 * Use ADXL34x INT2 instead of INT1
+	 */
 	u8 use_int2;
+
+	/*
+	 * ADXL346 only ORIENTATION SENSING feature
+	 * The orientation function of the ADXL346 reports both 2-D and
+	 * 3-D orientation concurrently.
+	 */
+
+#define ADXL_EN_ORIENTATION_2D		1
+#define ADXL_EN_ORIENTATION_3D		2
+#define ADXL_EN_ORIENTATION_2D_3D	3
+
+	u8 orientation_enable;
+
+	/*
+	 * The width of the deadzone region between two or more
+	 * orientation positions is determined by setting the Deadzone
+	 * value. The deadzone region size can be specified with a
+	 * resolution of 3.6deg. The deadzone angle represents the total
+	 * angle where the orientation is considered invalid.
+	 */
+
+#define ADXL_DEADZONE_ANGLE_0p0		0	/* !!!0.0 [deg] */
+#define ADXL_DEADZONE_ANGLE_3p6		1	/* 3.6 [deg] */
+#define ADXL_DEADZONE_ANGLE_7p2		2	/* 7.2 [deg] */
+#define ADXL_DEADZONE_ANGLE_10p8	3	/* 10.8 [deg] */
+#define ADXL_DEADZONE_ANGLE_14p4	4	/* 14.4 [deg] */
+#define ADXL_DEADZONE_ANGLE_18p0	5	/* 18.0 [deg] */
+#define ADXL_DEADZONE_ANGLE_21p6	6	/* 21.6 [deg] */
+#define ADXL_DEADZONE_ANGLE_25p2	7	/* 25.2 [deg] */
+
+	u8 deadzone_angle;
+
+	/*
+	 * To eliminate most human motion such as walking or shaking,
+	 * a Divisor value should be selected to effectively limit the
+	 * orientation bandwidth. Set the depth of the filter used to
+	 * low-pass filter the measured acceleration for stable
+	 * orientation sensing
+	 */
+
+#define ADXL_LP_FILTER_DIVISOR_2	0
+#define ADXL_LP_FILTER_DIVISOR_4	1
+#define ADXL_LP_FILTER_DIVISOR_8	2
+#define ADXL_LP_FILTER_DIVISOR_16	3
+#define ADXL_LP_FILTER_DIVISOR_32	4
+#define ADXL_LP_FILTER_DIVISOR_64	5
+#define ADXL_LP_FILTER_DIVISOR_128	6
+#define ADXL_LP_FILTER_DIVISOR_256	7
+
+	u8 divisor_length;
+
+	u32 ev_codes_orient_2d[4];	/* EV_KEY {+X, -X, +Y, -Y} */
+	u32 ev_codes_orient_3d[6];	/* EV_KEY {+Z, +Y, +X, -X, -Y, -Z} */
 };
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 01f2f3f6ef4d076c0c10a8a7b42624416d56b523 Mon Sep 17 00:00:00 2001
From: Hagen Paul Pfeifer <hagen@jauu.net>
Date: Sat, 19 Jun 2010 17:05:36 +0000
Subject: net: optimize Berkeley Packet Filter (BPF) processing

Gcc is currenlty not in the ability to optimize the switch statement in
sk_run_filter() because of dense case labels. This patch replace the
OR'd labels with ordered sequenced case labels. The sk_chk_filter()
function is modified to patch/replace the original OPCODES in a
ordered but equivalent form. gcc is now in the ability to transform the
switch statement in sk_run_filter into a jump table of complexity O(1).

Until this patch gcc generates a sequence of conditional branches (O(n) of 567
byte .text segment size (arch x86_64):

7ff: 8b 06                 mov    (%rsi),%eax
801: 66 83 f8 35           cmp    $0x35,%ax
805: 0f 84 d0 02 00 00     je     adb <sk_run_filter+0x31d>
80b: 0f 87 07 01 00 00     ja     918 <sk_run_filter+0x15a>
811: 66 83 f8 15           cmp    $0x15,%ax
815: 0f 84 c5 02 00 00     je     ae0 <sk_run_filter+0x322>
81b: 77 73                 ja     890 <sk_run_filter+0xd2>
81d: 66 83 f8 04           cmp    $0x4,%ax
821: 0f 84 17 02 00 00     je     a3e <sk_run_filter+0x280>
827: 77 29                 ja     852 <sk_run_filter+0x94>
829: 66 83 f8 01           cmp    $0x1,%ax
[...]

With the modification the compiler translate the switch statement into
the following jump table fragment:

7ff: 66 83 3e 2c           cmpw   $0x2c,(%rsi)
803: 0f 87 1f 02 00 00     ja     a28 <sk_run_filter+0x26a>
809: 0f b7 06              movzwl (%rsi),%eax
80c: ff 24 c5 00 00 00 00  jmpq   *0x0(,%rax,8)
813: 44 89 e3              mov    %r12d,%ebx
816: e9 43 03 00 00        jmpq   b5e <sk_run_filter+0x3a0>
81b: 41 89 dc              mov    %ebx,%r12d
81e: e9 3b 03 00 00        jmpq   b5e <sk_run_filter+0x3a0>

Furthermore, I reordered the instructions to reduce cache line misses by
order the most common instruction to the start.

Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  48 +++++++++++
 net/core/filter.c      | 212 +++++++++++++++++++++++++++++++++++++------------
 2 files changed, 209 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 151f5d703b7e..69b43dbea6c6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -91,6 +91,54 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define         BPF_TAX         0x00
 #define         BPF_TXA         0x80
 
+enum {
+	BPF_S_RET_K = 0,
+	BPF_S_RET_A,
+	BPF_S_ALU_ADD_K,
+	BPF_S_ALU_ADD_X,
+	BPF_S_ALU_SUB_K,
+	BPF_S_ALU_SUB_X,
+	BPF_S_ALU_MUL_K,
+	BPF_S_ALU_MUL_X,
+	BPF_S_ALU_DIV_X,
+	BPF_S_ALU_AND_K,
+	BPF_S_ALU_AND_X,
+	BPF_S_ALU_OR_K,
+	BPF_S_ALU_OR_X,
+	BPF_S_ALU_LSH_K,
+	BPF_S_ALU_LSH_X,
+	BPF_S_ALU_RSH_K,
+	BPF_S_ALU_RSH_X,
+	BPF_S_ALU_NEG,
+	BPF_S_LD_W_ABS,
+	BPF_S_LD_H_ABS,
+	BPF_S_LD_B_ABS,
+	BPF_S_LD_W_LEN,
+	BPF_S_LD_W_IND,
+	BPF_S_LD_H_IND,
+	BPF_S_LD_B_IND,
+	BPF_S_LD_IMM,
+	BPF_S_LDX_W_LEN,
+	BPF_S_LDX_B_MSH,
+	BPF_S_LDX_IMM,
+	BPF_S_MISC_TAX,
+	BPF_S_MISC_TXA,
+	BPF_S_ALU_DIV_K,
+	BPF_S_LD_MEM,
+	BPF_S_LDX_MEM,
+	BPF_S_ST,
+	BPF_S_STX,
+	BPF_S_JMP_JA,
+	BPF_S_JMP_JEQ_K,
+	BPF_S_JMP_JEQ_X,
+	BPF_S_JMP_JGE_K,
+	BPF_S_JMP_JGE_X,
+	BPF_S_JMP_JGT_K,
+	BPF_S_JMP_JGT_X,
+	BPF_S_JMP_JSET_K,
+	BPF_S_JMP_JSET_X,
+};
+
 #ifndef BPF_MAXINSNS
 #define BPF_MAXINSNS 4096
 #endif
diff --git a/net/core/filter.c b/net/core/filter.c
index da69fb728d32..52b051f82a01 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -128,87 +128,87 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int
 		fentry = &filter[pc];
 
 		switch (fentry->code) {
-		case BPF_ALU|BPF_ADD|BPF_X:
+		case BPF_S_ALU_ADD_X:
 			A += X;
 			continue;
-		case BPF_ALU|BPF_ADD|BPF_K:
+		case BPF_S_ALU_ADD_K:
 			A += fentry->k;
 			continue;
-		case BPF_ALU|BPF_SUB|BPF_X:
+		case BPF_S_ALU_SUB_X:
 			A -= X;
 			continue;
-		case BPF_ALU|BPF_SUB|BPF_K:
+		case BPF_S_ALU_SUB_K:
 			A -= fentry->k;
 			continue;
-		case BPF_ALU|BPF_MUL|BPF_X:
+		case BPF_S_ALU_MUL_X:
 			A *= X;
 			continue;
-		case BPF_ALU|BPF_MUL|BPF_K:
+		case BPF_S_ALU_MUL_K:
 			A *= fentry->k;
 			continue;
-		case BPF_ALU|BPF_DIV|BPF_X:
+		case BPF_S_ALU_DIV_X:
 			if (X == 0)
 				return 0;
 			A /= X;
 			continue;
-		case BPF_ALU|BPF_DIV|BPF_K:
+		case BPF_S_ALU_DIV_K:
 			A /= fentry->k;
 			continue;
-		case BPF_ALU|BPF_AND|BPF_X:
+		case BPF_S_ALU_AND_X:
 			A &= X;
 			continue;
-		case BPF_ALU|BPF_AND|BPF_K:
+		case BPF_S_ALU_AND_K:
 			A &= fentry->k;
 			continue;
-		case BPF_ALU|BPF_OR|BPF_X:
+		case BPF_S_ALU_OR_X:
 			A |= X;
 			continue;
-		case BPF_ALU|BPF_OR|BPF_K:
+		case BPF_S_ALU_OR_K:
 			A |= fentry->k;
 			continue;
-		case BPF_ALU|BPF_LSH|BPF_X:
+		case BPF_S_ALU_LSH_X:
 			A <<= X;
 			continue;
-		case BPF_ALU|BPF_LSH|BPF_K:
+		case BPF_S_ALU_LSH_K:
 			A <<= fentry->k;
 			continue;
-		case BPF_ALU|BPF_RSH|BPF_X:
+		case BPF_S_ALU_RSH_X:
 			A >>= X;
 			continue;
-		case BPF_ALU|BPF_RSH|BPF_K:
+		case BPF_S_ALU_RSH_K:
 			A >>= fentry->k;
 			continue;
-		case BPF_ALU|BPF_NEG:
+		case BPF_S_ALU_NEG:
 			A = -A;
 			continue;
-		case BPF_JMP|BPF_JA:
+		case BPF_S_JMP_JA:
 			pc += fentry->k;
 			continue;
-		case BPF_JMP|BPF_JGT|BPF_K:
+		case BPF_S_JMP_JGT_K:
 			pc += (A > fentry->k) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JGE|BPF_K:
+		case BPF_S_JMP_JGE_K:
 			pc += (A >= fentry->k) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JEQ|BPF_K:
+		case BPF_S_JMP_JEQ_K:
 			pc += (A == fentry->k) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JSET|BPF_K:
+		case BPF_S_JMP_JSET_K:
 			pc += (A & fentry->k) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JGT|BPF_X:
+		case BPF_S_JMP_JGT_X:
 			pc += (A > X) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JGE|BPF_X:
+		case BPF_S_JMP_JGE_X:
 			pc += (A >= X) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JEQ|BPF_X:
+		case BPF_S_JMP_JEQ_X:
 			pc += (A == X) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JSET|BPF_X:
+		case BPF_S_JMP_JSET_X:
 			pc += (A & X) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_LD|BPF_W|BPF_ABS:
+		case BPF_S_LD_W_ABS:
 			k = fentry->k;
 load_w:
 			ptr = load_pointer(skb, k, 4, &tmp);
@@ -217,7 +217,7 @@ load_w:
 				continue;
 			}
 			break;
-		case BPF_LD|BPF_H|BPF_ABS:
+		case BPF_S_LD_H_ABS:
 			k = fentry->k;
 load_h:
 			ptr = load_pointer(skb, k, 2, &tmp);
@@ -226,7 +226,7 @@ load_h:
 				continue;
 			}
 			break;
-		case BPF_LD|BPF_B|BPF_ABS:
+		case BPF_S_LD_B_ABS:
 			k = fentry->k;
 load_b:
 			ptr = load_pointer(skb, k, 1, &tmp);
@@ -235,54 +235,54 @@ load_b:
 				continue;
 			}
 			break;
-		case BPF_LD|BPF_W|BPF_LEN:
+		case BPF_S_LD_W_LEN:
 			A = skb->len;
 			continue;
-		case BPF_LDX|BPF_W|BPF_LEN:
+		case BPF_S_LDX_W_LEN:
 			X = skb->len;
 			continue;
-		case BPF_LD|BPF_W|BPF_IND:
+		case BPF_S_LD_W_IND:
 			k = X + fentry->k;
 			goto load_w;
-		case BPF_LD|BPF_H|BPF_IND:
+		case BPF_S_LD_H_IND:
 			k = X + fentry->k;
 			goto load_h;
-		case BPF_LD|BPF_B|BPF_IND:
+		case BPF_S_LD_B_IND:
 			k = X + fentry->k;
 			goto load_b;
-		case BPF_LDX|BPF_B|BPF_MSH:
+		case BPF_S_LDX_B_MSH:
 			ptr = load_pointer(skb, fentry->k, 1, &tmp);
 			if (ptr != NULL) {
 				X = (*(u8 *)ptr & 0xf) << 2;
 				continue;
 			}
 			return 0;
-		case BPF_LD|BPF_IMM:
+		case BPF_S_LD_IMM:
 			A = fentry->k;
 			continue;
-		case BPF_LDX|BPF_IMM:
+		case BPF_S_LDX_IMM:
 			X = fentry->k;
 			continue;
-		case BPF_LD|BPF_MEM:
+		case BPF_S_LD_MEM:
 			A = mem[fentry->k];
 			continue;
-		case BPF_LDX|BPF_MEM:
+		case BPF_S_LDX_MEM:
 			X = mem[fentry->k];
 			continue;
-		case BPF_MISC|BPF_TAX:
+		case BPF_S_MISC_TAX:
 			X = A;
 			continue;
-		case BPF_MISC|BPF_TXA:
+		case BPF_S_MISC_TXA:
 			A = X;
 			continue;
-		case BPF_RET|BPF_K:
+		case BPF_S_RET_K:
 			return fentry->k;
-		case BPF_RET|BPF_A:
+		case BPF_S_RET_A:
 			return A;
-		case BPF_ST:
+		case BPF_S_ST:
 			mem[fentry->k] = A;
 			continue;
-		case BPF_STX:
+		case BPF_S_STX:
 			mem[fentry->k] = X;
 			continue;
 		default:
@@ -390,53 +390,128 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 		/* Only allow valid instructions */
 		switch (ftest->code) {
 		case BPF_ALU|BPF_ADD|BPF_K:
+			ftest->code = BPF_S_ALU_ADD_K;
+			break;
 		case BPF_ALU|BPF_ADD|BPF_X:
+			ftest->code = BPF_S_ALU_ADD_X;
+			break;
 		case BPF_ALU|BPF_SUB|BPF_K:
+			ftest->code = BPF_S_ALU_SUB_K;
+			break;
 		case BPF_ALU|BPF_SUB|BPF_X:
+			ftest->code = BPF_S_ALU_SUB_X;
+			break;
 		case BPF_ALU|BPF_MUL|BPF_K:
+			ftest->code = BPF_S_ALU_MUL_K;
+			break;
 		case BPF_ALU|BPF_MUL|BPF_X:
+			ftest->code = BPF_S_ALU_MUL_X;
+			break;
 		case BPF_ALU|BPF_DIV|BPF_X:
+			ftest->code = BPF_S_ALU_DIV_X;
+			break;
 		case BPF_ALU|BPF_AND|BPF_K:
+			ftest->code = BPF_S_ALU_AND_K;
+			break;
 		case BPF_ALU|BPF_AND|BPF_X:
+			ftest->code = BPF_S_ALU_AND_X;
+			break;
 		case BPF_ALU|BPF_OR|BPF_K:
+			ftest->code = BPF_S_ALU_OR_K;
+			break;
 		case BPF_ALU|BPF_OR|BPF_X:
+			ftest->code = BPF_S_ALU_OR_X;
+			break;
 		case BPF_ALU|BPF_LSH|BPF_K:
+			ftest->code = BPF_S_ALU_LSH_K;
+			break;
 		case BPF_ALU|BPF_LSH|BPF_X:
+			ftest->code = BPF_S_ALU_LSH_X;
+			break;
 		case BPF_ALU|BPF_RSH|BPF_K:
+			ftest->code = BPF_S_ALU_RSH_K;
+			break;
 		case BPF_ALU|BPF_RSH|BPF_X:
+			ftest->code = BPF_S_ALU_RSH_X;
+			break;
 		case BPF_ALU|BPF_NEG:
+			ftest->code = BPF_S_ALU_NEG;
+			break;
 		case BPF_LD|BPF_W|BPF_ABS:
+			ftest->code = BPF_S_LD_W_ABS;
+			break;
 		case BPF_LD|BPF_H|BPF_ABS:
+			ftest->code = BPF_S_LD_H_ABS;
+			break;
 		case BPF_LD|BPF_B|BPF_ABS:
+			ftest->code = BPF_S_LD_B_ABS;
+			break;
 		case BPF_LD|BPF_W|BPF_LEN:
+			ftest->code = BPF_S_LD_W_LEN;
+			break;
 		case BPF_LD|BPF_W|BPF_IND:
+			ftest->code = BPF_S_LD_W_IND;
+			break;
 		case BPF_LD|BPF_H|BPF_IND:
+			ftest->code = BPF_S_LD_H_IND;
+			break;
 		case BPF_LD|BPF_B|BPF_IND:
+			ftest->code = BPF_S_LD_B_IND;
+			break;
 		case BPF_LD|BPF_IMM:
+			ftest->code = BPF_S_LD_IMM;
+			break;
 		case BPF_LDX|BPF_W|BPF_LEN:
+			ftest->code = BPF_S_LDX_W_LEN;
+			break;
 		case BPF_LDX|BPF_B|BPF_MSH:
+			ftest->code = BPF_S_LDX_B_MSH;
+			break;
 		case BPF_LDX|BPF_IMM:
+			ftest->code = BPF_S_LDX_IMM;
+			break;
 		case BPF_MISC|BPF_TAX:
+			ftest->code = BPF_S_MISC_TAX;
+			break;
 		case BPF_MISC|BPF_TXA:
+			ftest->code = BPF_S_MISC_TXA;
+			break;
 		case BPF_RET|BPF_K:
+			ftest->code = BPF_S_RET_K;
+			break;
 		case BPF_RET|BPF_A:
+			ftest->code = BPF_S_RET_A;
 			break;
 
 		/* Some instructions need special checks */
 
-		case BPF_ALU|BPF_DIV|BPF_K:
 			/* check for division by zero */
+		case BPF_ALU|BPF_DIV|BPF_K:
 			if (ftest->k == 0)
 				return -EINVAL;
+			ftest->code = BPF_S_ALU_DIV_K;
 			break;
 
+		/* check for invalid memory addresses */
 		case BPF_LD|BPF_MEM:
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			ftest->code = BPF_S_LD_MEM;
+			break;
 		case BPF_LDX|BPF_MEM:
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			ftest->code = BPF_S_LDX_MEM;
+			break;
 		case BPF_ST:
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			ftest->code = BPF_S_ST;
+			break;
 		case BPF_STX:
-			/* check for invalid memory addresses */
 			if (ftest->k >= BPF_MEMWORDS)
 				return -EINVAL;
+			ftest->code = BPF_S_STX;
 			break;
 
 		case BPF_JMP|BPF_JA:
@@ -447,28 +522,63 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 			 */
 			if (ftest->k >= (unsigned)(flen-pc-1))
 				return -EINVAL;
+			ftest->code = BPF_S_JMP_JA;
 			break;
 
 		case BPF_JMP|BPF_JEQ|BPF_K:
+			ftest->code = BPF_S_JMP_JEQ_K;
+			break;
 		case BPF_JMP|BPF_JEQ|BPF_X:
+			ftest->code = BPF_S_JMP_JEQ_X;
+			break;
 		case BPF_JMP|BPF_JGE|BPF_K:
+			ftest->code = BPF_S_JMP_JGE_K;
+			break;
 		case BPF_JMP|BPF_JGE|BPF_X:
+			ftest->code = BPF_S_JMP_JGE_X;
+			break;
 		case BPF_JMP|BPF_JGT|BPF_K:
+			ftest->code = BPF_S_JMP_JGT_K;
+			break;
 		case BPF_JMP|BPF_JGT|BPF_X:
+			ftest->code = BPF_S_JMP_JGT_X;
+			break;
 		case BPF_JMP|BPF_JSET|BPF_K:
+			ftest->code = BPF_S_JMP_JSET_K;
+			break;
 		case BPF_JMP|BPF_JSET|BPF_X:
+			ftest->code = BPF_S_JMP_JSET_X;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
 			/* for conditionals both must be safe */
+		switch (ftest->code) {
+		case BPF_S_JMP_JEQ_K:
+		case BPF_S_JMP_JEQ_X:
+		case BPF_S_JMP_JGE_K:
+		case BPF_S_JMP_JGE_X:
+		case BPF_S_JMP_JGT_K:
+		case BPF_S_JMP_JGT_X:
+		case BPF_S_JMP_JSET_X:
+		case BPF_S_JMP_JSET_K:
 			if (pc + ftest->jt + 1 >= flen ||
 			    pc + ftest->jf + 1 >= flen)
 				return -EINVAL;
-			break;
+		}
+	}
 
+	/* last instruction must be a RET code */
+	switch (filter[flen - 1].code) {
+	case BPF_S_RET_K:
+	case BPF_S_RET_A:
+		return 0;
+		break;
 		default:
 			return -EINVAL;
 		}
-	}
-
-	return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
 }
 EXPORT_SYMBOL(sk_chk_filter);
 
-- 
cgit v1.2.3-59-g8ed1b


From 4ba6ce250e406b20bcd6f0f3aed6b3d80965e6c2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 27 Jun 2010 18:49:59 +0200
Subject: percpu: make @dyn_size always mean min dyn_size in first chunk init
 functions

In pcpu_build_alloc_info() and pcpu_embed_first_chunk(), @dyn_size was
ssize_t, -1 meant auto-size, 0 forced 0 and positive meant minimum
size.  There's no use case for forcing 0 and the upcoming early alloc
support always requires non-zero dynamic size.  Make @dyn_size always
mean minimum dyn_size.

While at it, make pcpu_build_alloc_info() static which doesn't have
any external caller as suggested by David Rientjes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
---
 include/linux/percpu.h |  7 +------
 mm/percpu.c            | 35 ++++++++++-------------------------
 2 files changed, 11 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index d3a38d687104..3ffd05e550de 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -104,16 +104,11 @@ extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
 							     int nr_units);
 extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai);
 
-extern struct pcpu_alloc_info * __init pcpu_build_alloc_info(
-				size_t reserved_size, ssize_t dyn_size,
-				size_t atom_size,
-				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
-
 extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 					 void *base_addr);
 
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
-extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
 				pcpu_fc_alloc_fn_t alloc_fn,
diff --git a/mm/percpu.c b/mm/percpu.c
index 6470e7710231..c3e7010c6d71 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1013,20 +1013,6 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
 		return page_to_phys(pcpu_addr_to_page(addr));
 }
 
-static inline size_t pcpu_calc_fc_sizes(size_t static_size,
-					size_t reserved_size,
-					ssize_t *dyn_sizep)
-{
-	size_t size_sum;
-
-	size_sum = PFN_ALIGN(static_size + reserved_size +
-			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
-	if (*dyn_sizep != 0)
-		*dyn_sizep = size_sum - static_size - reserved_size;
-
-	return size_sum;
-}
-
 /**
  * pcpu_alloc_alloc_info - allocate percpu allocation info
  * @nr_groups: the number of groups
@@ -1085,7 +1071,7 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
 /**
  * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
  * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: minimum free size for dynamic allocation in bytes
  * @atom_size: allocation atom size
  * @cpu_distance_fn: callback to determine distance between cpus, optional
  *
@@ -1103,8 +1089,8 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
  * On success, pointer to the new allocation_info is returned.  On
  * failure, ERR_PTR value is returned.
  */
-struct pcpu_alloc_info * __init pcpu_build_alloc_info(
-				size_t reserved_size, ssize_t dyn_size,
+static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+				size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 {
@@ -1123,13 +1109,15 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 	memset(group_map, 0, sizeof(group_map));
 	memset(group_cnt, 0, sizeof(group_cnt));
 
+	size_sum = PFN_ALIGN(static_size + reserved_size + dyn_size);
+	dyn_size = size_sum - static_size - reserved_size;
+
 	/*
 	 * Determine min_unit_size, alloc_size and max_upa such that
 	 * alloc_size is multiple of atom_size and is the smallest
 	 * which can accomodate 4k aligned segments which are equal to
 	 * or larger than min_unit_size.
 	 */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
 
 	alloc_size = roundup(min_unit_size, atom_size);
@@ -1532,7 +1520,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
 /**
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: minimum free size for dynamic allocation in bytes
  * @atom_size: allocation atom size
  * @cpu_distance_fn: callback to determine distance between cpus, optional
  * @alloc_fn: function to allocate percpu page
@@ -1553,10 +1541,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * vmalloc space is not orders of magnitude larger than distances
  * between node memory addresses (ie. 32bit NUMA machines).
  *
- * When @dyn_size is positive, dynamic area might be larger than
- * specified to fill page alignment.  When @dyn_size is auto,
- * @dyn_size is just big enough to fill page alignment after static
- * and reserved areas.
+ * @dyn_size specifies the minimum dynamic area size.
  *
  * If the needed size is smaller than the minimum or specified unit
  * size, the leftover is returned using @free_fn.
@@ -1564,7 +1549,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 				  size_t atom_size,
 				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
 				  pcpu_fc_alloc_fn_t alloc_fn,
@@ -1695,7 +1680,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
 
 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
 
-	ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
+	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
 	if (IS_ERR(ai))
 		return PTR_ERR(ai);
 	BUG_ON(ai->nr_groups != 1);
-- 
cgit v1.2.3-59-g8ed1b


From 099a19d91ca429944743d51bef8fee240e94d8e3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 27 Jun 2010 18:50:00 +0200
Subject: percpu: allow limited allocation before slab is online

This patch updates percpu allocator such that it can serve limited
amount of allocation before slab comes online.  This is primarily to
allow slab to depend on working percpu allocator.

Two parameters, PERCPU_DYNAMIC_EARLY_SIZE and SLOTS, determine how
much memory space and allocation map slots are reserved.  If this
reserved area is exhausted, WARN_ON_ONCE() will trigger and allocation
will fail till slab comes online.

The following changes are made to implement early alloc.

* pcpu_mem_alloc() now checks slab_is_available()

* Chunks are allocated using pcpu_mem_alloc()

* Init paths make sure ai->dyn_size is at least as large as
  PERCPU_DYNAMIC_EARLY_SIZE.

* Initial alloc maps are allocated in __initdata and copied to
  kmalloc'd areas once slab is online.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
---
 include/linux/percpu.h | 13 +++++++++++++
 init/main.c            |  1 +
 mm/percpu.c            | 52 ++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 54 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 3ffd05e550de..b8b9084527b1 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -44,6 +44,16 @@
 /* minimum unit size, also is the maximum supported allocation size */
 #define PCPU_MIN_UNIT_SIZE		PFN_ALIGN(64 << 10)
 
+/*
+ * Percpu allocator can serve percpu allocations before slab is
+ * initialized which allows slab to depend on the percpu allocator.
+ * The following two parameters decide how much resource to
+ * preallocate for this.  Keep PERCPU_DYNAMIC_RESERVE equal to or
+ * larger than PERCPU_DYNAMIC_EARLY_SIZE.
+ */
+#define PERCPU_DYNAMIC_EARLY_SLOTS	128
+#define PERCPU_DYNAMIC_EARLY_SIZE	(12 << 10)
+
 /*
  * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
  * back on the first chunk for dynamic percpu allocation if arch is
@@ -135,6 +145,7 @@ extern bool is_kernel_percpu_address(unsigned long addr);
 #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 extern void __init setup_per_cpu_areas(void);
 #endif
+extern void __init percpu_init_late(void);
 
 #else /* CONFIG_SMP */
 
@@ -148,6 +159,8 @@ static inline bool is_kernel_percpu_address(unsigned long addr)
 
 static inline void __init setup_per_cpu_areas(void) { }
 
+static inline void __init percpu_init_late(void) { }
+
 static inline void *pcpu_lpage_remapped(void *kaddr)
 {
 	return NULL;
diff --git a/init/main.c b/init/main.c
index 3bdb152f412f..3ff8dd0fb512 100644
--- a/init/main.c
+++ b/init/main.c
@@ -522,6 +522,7 @@ static void __init mm_init(void)
 	page_cgroup_init_flatmem();
 	mem_init();
 	kmem_cache_init();
+	percpu_init_late();
 	pgtable_cache_init();
 	vmalloc_init();
 }
diff --git a/mm/percpu.c b/mm/percpu.c
index c3e7010c6d71..e61dc2cc5873 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -282,6 +282,9 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
  */
 static void *pcpu_mem_alloc(size_t size)
 {
+	if (WARN_ON_ONCE(!slab_is_available()))
+		return NULL;
+
 	if (size <= PAGE_SIZE)
 		return kzalloc(size, GFP_KERNEL);
 	else {
@@ -392,13 +395,6 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
 	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, old_size);
 
-	/*
-	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
-	 * one of the first chunks and still using static map.
-	 */
-	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-		old = chunk->map;
-
 	chunk->map_alloc = new_alloc;
 	chunk->map = new;
 	new = NULL;
@@ -604,7 +600,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
 {
 	struct pcpu_chunk *chunk;
 
-	chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
+	chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
 	if (!chunk)
 		return NULL;
 
@@ -1109,7 +1105,9 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 	memset(group_map, 0, sizeof(group_map));
 	memset(group_cnt, 0, sizeof(group_cnt));
 
-	size_sum = PFN_ALIGN(static_size + reserved_size + dyn_size);
+	/* calculate size_sum and ensure dyn_size is enough for early alloc */
+	size_sum = PFN_ALIGN(static_size + reserved_size +
+			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
 	dyn_size = size_sum - static_size - reserved_size;
 
 	/*
@@ -1338,7 +1336,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 				  void *base_addr)
 {
 	static char cpus_buf[4096] __initdata;
-	static int smap[2], dmap[2];
+	static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
+	static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
 	size_t dyn_size = ai->dyn_size;
 	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
 	struct pcpu_chunk *schunk, *dchunk = NULL;
@@ -1361,14 +1360,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 } while (0)
 
 	/* sanity checks */
-	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
-		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
 	PCPU_SETUP_BUG_ON(!ai->static_size);
 	PCPU_SETUP_BUG_ON(!base_addr);
 	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
 	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
 	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
 
 	/* process group information and build config tables accordingly */
@@ -1806,3 +1804,33 @@ void __init setup_per_cpu_areas(void)
 		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
 }
 #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+/*
+ * First and reserved chunks are initialized with temporary allocation
+ * map in initdata so that they can be used before slab is online.
+ * This function is called after slab is brought up and replaces those
+ * with properly allocated maps.
+ */
+void __init percpu_init_late(void)
+{
+	struct pcpu_chunk *target_chunks[] =
+		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
+	struct pcpu_chunk *chunk;
+	unsigned long flags;
+	int i;
+
+	for (i = 0; (chunk = target_chunks[i]); i++) {
+		int *map;
+		const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
+
+		BUILD_BUG_ON(size > PAGE_SIZE);
+
+		map = pcpu_mem_alloc(size);
+		BUG_ON(!map);
+
+		spin_lock_irqsave(&pcpu_lock, flags);
+		memcpy(map, chunk->map, size);
+		chunk->map = map;
+		spin_unlock_irqrestore(&pcpu_lock, flags);
+	}
+}
-- 
cgit v1.2.3-59-g8ed1b


From 7804302b14032d357d889e4a23e463eb6a6c5136 Mon Sep 17 00:00:00 2001
From: Anatolij Gustschin <agust@denx.de>
Date: Mon, 28 Jun 2010 01:25:19 -0700
Subject: Input: ads7846 - allow specifying irq trigger type in platform data

On some platforms, for example with GPIO interrupts on mpc5121,
it is not possible to configure falling edge interrupts.

Specifying irq trigger type in platform data structure
allows using ads7846 driver on such platforms.

Signed-off-by: Anatolij Gustschin <agust@denx.de>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/ads7846.c | 5 ++++-
 include/linux/spi/ads7846.h         | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index a9fdf55c0238..69210cb56c54 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -1174,7 +1174,10 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 		goto err_put_regulator;
 	}
 
-	if (request_irq(spi->irq, ads7846_irq, IRQF_TRIGGER_FALLING,
+	if (!pdata->irq_flags)
+		pdata->irq_flags = IRQF_TRIGGER_FALLING;
+
+	if (request_irq(spi->irq, ads7846_irq, pdata->irq_flags,
 			spi->dev.driver->name, ts)) {
 		dev_info(&spi->dev,
 			"trying pin change workaround on irq %d\n", spi->irq);
diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index b4ae570d3c98..95d36bfb34bc 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -54,5 +54,6 @@ struct ads7846_platform_data {
 	void	(*filter_cleanup)(void *filter_data);
 	void	(*wait_for_sync)(void);
 	bool	wakeup;
+	unsigned long irq_flags;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 7eb9282cd0efac08b8377cbd5037ba297c77e3f7 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 28 Jun 2010 14:16:08 +0200
Subject: netfilter: ipt_LOG/ip6t_LOG: add option to print decoded MAC header

The LOG targets print the entire MAC header as one long string, which is not
readable very well:

IN=eth0 OUT= MAC=00:15:f2:24:91:f8:00:1b:24:dc:61:e6:08:00 ...

Add an option to decode known header formats (currently just ARPHRD_ETHER devices)
in their individual fields:

IN=eth0 OUT= MACSRC=00:1b:24:dc:61:e6 MACDST=00:15:f2:24:91:f8 MACPROTO=0800 ...
IN=eth0 OUT= MACSRC=00:1b:24:dc:61:e6 MACDST=00:15:f2:24:91:f8 MACPROTO=86dd ...

The option needs to be explicitly enabled by userspace to avoid breaking
existing parsers.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter_ipv4/ipt_LOG.h  |  3 +-
 include/linux/netfilter_ipv6/ip6t_LOG.h |  3 +-
 net/ipv4/netfilter/ipt_LOG.c            | 54 ++++++++++++++++------
 net/ipv6/netfilter/ip6t_LOG.c           | 81 +++++++++++++++++++++------------
 4 files changed, 97 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ipt_LOG.h b/include/linux/netfilter_ipv4/ipt_LOG.h
index 90fa6525ef9c..dcdbadf9fd4a 100644
--- a/include/linux/netfilter_ipv4/ipt_LOG.h
+++ b/include/linux/netfilter_ipv4/ipt_LOG.h
@@ -7,7 +7,8 @@
 #define IPT_LOG_IPOPT		0x04	/* Log IP options */
 #define IPT_LOG_UID		0x08	/* Log UID owning local socket */
 #define IPT_LOG_NFLOG		0x10	/* Unsupported, don't reuse */
-#define IPT_LOG_MASK		0x1f
+#define IPT_LOG_MACDECODE	0x20	/* Decode MAC header */
+#define IPT_LOG_MASK		0x2f
 
 struct ipt_log_info {
 	unsigned char level;
diff --git a/include/linux/netfilter_ipv6/ip6t_LOG.h b/include/linux/netfilter_ipv6/ip6t_LOG.h
index 0d0119b0458c..9dd5579e02ec 100644
--- a/include/linux/netfilter_ipv6/ip6t_LOG.h
+++ b/include/linux/netfilter_ipv6/ip6t_LOG.h
@@ -7,7 +7,8 @@
 #define IP6T_LOG_IPOPT		0x04	/* Log IP options */
 #define IP6T_LOG_UID		0x08	/* Log UID owning local socket */
 #define IP6T_LOG_NFLOG		0x10	/* Unsupported, don't use */
-#define IP6T_LOG_MASK		0x1f
+#define IP6T_LOG_MACDECODE	0x20	/* Decode MAC header */
+#define IP6T_LOG_MASK		0x2f
 
 struct ip6t_log_info {
 	unsigned char level;
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 0a452a54adbe..915fc17d7ce2 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/skbuff.h>
+#include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <net/icmp.h>
 #include <net/udp.h>
@@ -363,6 +364,42 @@ static void dump_packet(const struct nf_loginfo *info,
 	/* maxlen = 230+   91  + 230 + 252 = 803 */
 }
 
+static void dump_mac_header(const struct nf_loginfo *info,
+			    const struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int logflags = 0;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+
+	if (!(logflags & IPT_LOG_MACDECODE))
+		goto fallback;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+		printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+		       ntohs(eth_hdr(skb)->h_proto));
+		return;
+	default:
+		break;
+	}
+
+fallback:
+	printk("MAC=");
+	if (dev->hard_header_len &&
+	    skb->mac_header != skb->network_header) {
+		const unsigned char *p = skb_mac_header(skb);
+		unsigned int i;
+
+		printk("%02x", *p++);
+		for (i = 1; i < dev->hard_header_len; i++, p++)
+			printk(":%02x", *p);
+	}
+	printk(" ");
+}
+
 static struct nf_loginfo default_loginfo = {
 	.type	= NF_LOG_TYPE_LOG,
 	.u = {
@@ -404,20 +441,9 @@ ipt_log_packet(u_int8_t pf,
 	}
 #endif
 
-	if (in && !out) {
-		/* MAC logging for input chain only. */
-		printk("MAC=");
-		if (skb->dev && skb->dev->hard_header_len &&
-		    skb->mac_header != skb->network_header) {
-			int i;
-			const unsigned char *p = skb_mac_header(skb);
-
-			printk("%02x", *p++);
-			for (i = 1; i < skb->dev->hard_header_len; i++, p++)
-				printk(":%02x", *p);
-		}
-		printk(" ");
-	}
+	/* MAC logging for input path only. */
+	if (in && !out)
+		dump_mac_header(loginfo, skb);
 
 	dump_packet(loginfo, skb, 0);
 	printk("\n");
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 4c7ddac7c62b..0a07ae7b933f 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -373,6 +373,56 @@ static void dump_packet(const struct nf_loginfo *info,
 		printk("MARK=0x%x ", skb->mark);
 }
 
+static void dump_mac_header(const struct nf_loginfo *info,
+			    const struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int logflags = 0;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+
+	if (!(logflags & IP6T_LOG_MACDECODE))
+		goto fallback;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+		printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+		       ntohs(eth_hdr(skb)->h_proto));
+		return;
+	default:
+		break;
+	}
+
+fallback:
+	printk("MAC=");
+	if (dev->hard_header_len &&
+	    skb->mac_header != skb->network_header) {
+		const unsigned char *p = skb_mac_header(skb);
+		unsigned int len = dev->hard_header_len;
+		unsigned int i;
+
+		if (dev->type == ARPHRD_SIT &&
+		    (p -= ETH_HLEN) < skb->head)
+			p = NULL;
+
+		if (p != NULL) {
+			printk("%02x", *p++);
+			for (i = 1; i < len; i++)
+				printk(":%02x", p[i]);
+		}
+		printk(" ");
+
+		if (dev->type == ARPHRD_SIT) {
+			const struct iphdr *iph =
+				(struct iphdr *)skb_mac_header(skb);
+			printk("TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
+		}
+	} else
+		printk(" ");
+}
+
 static struct nf_loginfo default_loginfo = {
 	.type	= NF_LOG_TYPE_LOG,
 	.u = {
@@ -400,35 +450,10 @@ ip6t_log_packet(u_int8_t pf,
 		prefix,
 		in ? in->name : "",
 		out ? out->name : "");
-	if (in && !out) {
-		unsigned int len;
-		/* MAC logging for input chain only. */
-		printk("MAC=");
-		if (skb->dev && (len = skb->dev->hard_header_len) &&
-		    skb->mac_header != skb->network_header) {
-			const unsigned char *p = skb_mac_header(skb);
-			int i;
-
-			if (skb->dev->type == ARPHRD_SIT &&
-			    (p -= ETH_HLEN) < skb->head)
-				p = NULL;
-
-			if (p != NULL) {
-				printk("%02x", *p++);
-				for (i = 1; i < len; i++)
-					printk(":%02x", p[i]);
-			}
-			printk(" ");
 
-			if (skb->dev->type == ARPHRD_SIT) {
-				const struct iphdr *iph =
-					(struct iphdr *)skb_mac_header(skb);
-				printk("TUNNEL=%pI4->%pI4 ",
-				       &iph->saddr, &iph->daddr);
-			}
-		} else
-			printk(" ");
-	}
+	/* MAC logging for input path only. */
+	if (in && !out)
+		dump_mac_header(loginfo, skb);
 
 	dump_packet(loginfo, skb, skb_network_offset(skb), 1);
 	printk("\n");
-- 
cgit v1.2.3-59-g8ed1b


From b505ff5e7291cca6379549297e3852ce3622d550 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Fri, 18 Jun 2010 11:09:59 -0600
Subject: of: kill struct of_device

Now that the device tree node pointer has been moved out of struct
of_device and into the common struct device, there isn't anything
unique about of_device anymore.  In fact, there isn't much need
for a separate of_bus when all busses have access to OF style
probing.

arch/powerpc and arch/microblaze are moving away from using the of_bus
and using the regular platform bus instead for mmio devices.  This
patch makes of_device the same as platform_device as a stepping stone
in migrating of_platform_drivers over to the platform bus.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/of_device.h | 10 ----------
 arch/powerpc/include/asm/of_device.h    | 11 -----------
 arch/powerpc/include/asm/smu.h          |  4 ++--
 arch/sparc/include/asm/device.h         |  4 ++--
 arch/sparc/include/asm/of_device.h      | 14 --------------
 drivers/net/niu.h                       |  4 ++--
 include/linux/of_device.h               | 17 +++++++++++++++++
 7 files changed, 23 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h
index 73cb98040982..0a5f3f914b42 100644
--- a/arch/microblaze/include/asm/of_device.h
+++ b/arch/microblaze/include/asm/of_device.h
@@ -15,16 +15,6 @@
 #include <linux/device.h>
 #include <linux/of.h>
 
-/*
- * The of_device is a kind of "base class" that is a superset of
- * struct device for use by devices attached to an OF node and
- * probed using OF properties.
- */
-struct of_device {
-	struct device		dev; /* Generic device interface */
-	struct pdev_archdata	archdata;
-};
-
 extern ssize_t of_device_get_modalias(struct of_device *ofdev,
 					char *str, ssize_t len);
 
diff --git a/arch/powerpc/include/asm/of_device.h b/arch/powerpc/include/asm/of_device.h
index 444e97e2982e..cb36632f953c 100644
--- a/arch/powerpc/include/asm/of_device.h
+++ b/arch/powerpc/include/asm/of_device.h
@@ -5,17 +5,6 @@
 #include <linux/device.h>
 #include <linux/of.h>
 
-/*
- * The of_device is a kind of "base class" that is a superset of
- * struct device for use by devices attached to an OF node and
- * probed using OF properties.
- */
-struct of_device
-{
-	struct device		dev;		/* Generic device interface */
-	struct pdev_archdata	archdata;
-};
-
 extern struct of_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
 					 struct device *parent);
diff --git a/arch/powerpc/include/asm/smu.h b/arch/powerpc/include/asm/smu.h
index 7ae2753da565..e3bdada8c542 100644
--- a/arch/powerpc/include/asm/smu.h
+++ b/arch/powerpc/include/asm/smu.h
@@ -457,8 +457,8 @@ extern void smu_poll(void);
  */
 extern int smu_init(void);
 extern int smu_present(void);
-struct of_device;
-extern struct of_device *smu_get_ofdev(void);
+struct platform_device;
+extern struct platform_device *smu_get_ofdev(void);
 
 
 /*
diff --git a/arch/sparc/include/asm/device.h b/arch/sparc/include/asm/device.h
index f9740d065fe7..fb220e482039 100644
--- a/arch/sparc/include/asm/device.h
+++ b/arch/sparc/include/asm/device.h
@@ -9,13 +9,13 @@
 #include <asm/openprom.h>
 
 struct device_node;
-struct of_device;
+struct platform_device;
 
 struct dev_archdata {
 	void			*iommu;
 	void			*stc;
 	void			*host_controller;
-	struct of_device	*op;
+	struct platform_device	*op;
 	int			numa_node;
 };
 
diff --git a/arch/sparc/include/asm/of_device.h b/arch/sparc/include/asm/of_device.h
index 6d1844a547b4..22b9828fe693 100644
--- a/arch/sparc/include/asm/of_device.h
+++ b/arch/sparc/include/asm/of_device.h
@@ -7,20 +7,6 @@
 #include <linux/mod_devicetable.h>
 #include <asm/openprom.h>
 
-/*
- * The of_device is a kind of "base class" that is a superset of
- * struct device for use by devices attached to an OF node and
- * probed using OF properties.
- */
-struct of_device
-{
-	struct device			dev;
-	u32				num_resources;
-	struct resource			*resource;
-
-	struct pdev_archdata		archdata;
-};
-
 extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name);
 extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size);
 
diff --git a/drivers/net/niu.h b/drivers/net/niu.h
index d6715465f35d..a41fa8ebe05f 100644
--- a/drivers/net/niu.h
+++ b/drivers/net/niu.h
@@ -3236,7 +3236,7 @@ struct niu_phy_ops {
 	int (*link_status)(struct niu *np, int *);
 };
 
-struct of_device;
+struct platform_device;
 struct niu {
 	void __iomem			*regs;
 	struct net_device		*dev;
@@ -3297,7 +3297,7 @@ struct niu {
 	struct niu_vpd			vpd;
 	u32				eeprom_len;
 
-	struct of_device		*op;
+	struct platform_device		*op;
 	void __iomem			*vir_regs_1;
 	void __iomem			*vir_regs_2;
 };
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 11651facc5f1..a3ae5900fc58 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -3,9 +3,26 @@
 
 #ifdef CONFIG_OF_DEVICE
 #include <linux/device.h>
+#include <linux/platform_device.h>
 #include <linux/of.h>
 #include <linux/mod_devicetable.h>
 
+
+/*
+ * The of_device *was* a kind of "base class" that was a superset of
+ * struct device for use by devices attached to an OF node and probed
+ * using OF properties.  However, the important bit of OF-style
+ * probing, namely the device node pointer, has been moved into the
+ * common struct device when CONFIG_OF is set to make OF-style probing
+ * available to all bus types.  So now, just make of_device and
+ * platform_device equivalent so that current of_platform bus users
+ * can be transparently migrated over to using the platform bus.
+ *
+ * This line will go away once all references to of_device are removed
+ * from the kernel.
+ */
+#define of_device platform_device
+
 #include <asm/of_device.h>
 
 #define	to_of_device(d) container_of(d, struct of_device, dev)
-- 
cgit v1.2.3-59-g8ed1b


From e3873444990dd6f8a095d1f72b5ad45192f8c506 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Fri, 18 Jun 2010 11:09:59 -0600
Subject: of/irq: Move irq_of_parse_and_map() to common code

Merge common code between PowerPC and Microblaze.  SPARC implements
irq_of_parse_and_map(), but the implementation is different, so it
does not use this code.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Jeremy Kerr <jeremy.kerr@canonical.com>
---
 arch/microblaze/include/asm/irq.h  | 24 --------------------
 arch/microblaze/include/asm/prom.h | 26 +---------------------
 arch/microblaze/kernel/irq.c       | 14 ++----------
 arch/powerpc/include/asm/irq.h     | 28 ------------------------
 arch/powerpc/include/asm/prom.h    | 27 +----------------------
 arch/powerpc/kernel/irq.c          | 14 ++----------
 arch/sparc/include/asm/prom.h      |  1 -
 drivers/of/Kconfig                 |  4 ++++
 drivers/of/Makefile                |  1 +
 drivers/of/irq.c                   | 45 ++++++++++++++++++++++++++++++++++++++
 drivers/of/of_mdio.c               |  1 +
 drivers/of/of_spi.c                |  1 +
 include/linux/of_irq.h             | 41 ++++++++++++++++++++++++++++++++++
 13 files changed, 99 insertions(+), 128 deletions(-)
 create mode 100644 drivers/of/irq.c
 create mode 100644 include/linux/of_irq.h

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/irq.h b/arch/microblaze/include/asm/irq.h
index 31a35c33df63..ec5583d6111c 100644
--- a/arch/microblaze/include/asm/irq.h
+++ b/arch/microblaze/include/asm/irq.h
@@ -27,17 +27,6 @@ extern unsigned int nr_irq;
 struct pt_regs;
 extern void do_IRQ(struct pt_regs *regs);
 
-/**
- * irq_of_parse_and_map - Parse and Map an interrupt into linux virq space
- * @device: Device node of the device whose interrupt is to be mapped
- * @index: Index of the interrupt to map
- *
- * This function is a wrapper that chains of_irq_map_one() and
- * irq_create_of_mapping() to make things easier to callers
- */
-struct device_node;
-extern unsigned int irq_of_parse_and_map(struct device_node *dev, int index);
-
 /** FIXME - not implement
  * irq_dispose_mapping - Unmap an interrupt
  * @virq: linux virq number of the interrupt to unmap
@@ -62,17 +51,4 @@ struct irq_host;
 extern unsigned int irq_create_mapping(struct irq_host *host,
 					irq_hw_number_t hwirq);
 
-/**
- * irq_create_of_mapping - Map a hardware interrupt into linux virq space
- * @controller: Device node of the interrupt controller
- * @inspec: Interrupt specifier from the device-tree
- * @intsize: Size of the interrupt specifier from the device-tree
- *
- * This function is identical to irq_create_mapping except that it takes
- * as input informations straight from the device-tree (typically the results
- * of the of_irq_map_*() functions.
- */
-extern unsigned int irq_create_of_mapping(struct device_node *controller,
-					u32 *intspec, unsigned int intsize);
-
 #endif /* _ASM_MICROBLAZE_IRQ_H */
diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index e7d67a329bd7..e9fb2eb0035d 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -20,6 +20,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <linux/of_irq.h>
 #include <linux/of_fdt.h>
 #include <linux/proc_fs.h>
 #include <linux/platform_device.h>
@@ -92,18 +93,6 @@ extern const void *of_get_mac_address(struct device_node *np);
  * OF interrupt mapping
  */
 
-/* This structure is returned when an interrupt is mapped. The controller
- * field needs to be put() after use
- */
-
-#define OF_MAX_IRQ_SPEC		4 /* We handle specifiers of at most 4 cells */
-
-struct of_irq {
-	struct device_node *controller; /* Interrupt controller node */
-	u32 size; /* Specifier size */
-	u32 specifier[OF_MAX_IRQ_SPEC]; /* Specifier copy */
-};
-
 /**
  * of_irq_map_init - Initialize the irq remapper
  * @flags:	flags defining workarounds to enable
@@ -138,19 +127,6 @@ extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec,
 			u32 ointsize, const u32 *addr,
 			struct of_irq *out_irq);
 
-/**
- * of_irq_map_one - Resolve an interrupt for a device
- * @device:	the device whose interrupt is to be resolved
- * @index:	index of the interrupt to resolve
- * @out_irq:	structure of_irq filled by this function
- *
- * This function resolves an interrupt, walking the tree, for a given
- * device-tree node. It's the high level pendant to of_irq_map_raw().
- * It also implements the workarounds for OldWolrd Macs.
- */
-extern int of_irq_map_one(struct device_node *device, int index,
-			struct of_irq *out_irq);
-
 /**
  * of_irq_map_pci - Resolve the interrupt for a PCI device
  * @pdev:	the device whose interrupt is to be resolved
diff --git a/arch/microblaze/kernel/irq.c b/arch/microblaze/kernel/irq.c
index 8f120aca123d..dd32b09b4a3c 100644
--- a/arch/microblaze/kernel/irq.c
+++ b/arch/microblaze/kernel/irq.c
@@ -17,20 +17,10 @@
 #include <linux/seq_file.h>
 #include <linux/kernel_stat.h>
 #include <linux/irq.h>
+#include <linux/of_irq.h>
 
 #include <asm/prom.h>
 
-unsigned int irq_of_parse_and_map(struct device_node *dev, int index)
-{
-	struct of_irq oirq;
-
-	if (of_irq_map_one(dev, index, &oirq))
-		return NO_IRQ;
-
-	return oirq.specifier[0];
-}
-EXPORT_SYMBOL_GPL(irq_of_parse_and_map);
-
 static u32 concurrent_irq;
 
 void __irq_entry do_IRQ(struct pt_regs *regs)
@@ -104,7 +94,7 @@ unsigned int irq_create_mapping(struct irq_host *host, irq_hw_number_t hwirq)
 EXPORT_SYMBOL_GPL(irq_create_mapping);
 
 unsigned int irq_create_of_mapping(struct device_node *controller,
-					u32 *intspec, unsigned int intsize)
+				   const u32 *intspec, unsigned int intsize)
 {
 	return intspec[0];
 }
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index e054baef1845..4e3051595b2b 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -300,34 +300,6 @@ extern unsigned int irq_alloc_virt(struct irq_host *host,
  */
 extern void irq_free_virt(unsigned int virq, unsigned int count);
 
-
-/* -- OF helpers -- */
-
-/**
- * irq_create_of_mapping - Map a hardware interrupt into linux virq space
- * @controller: Device node of the interrupt controller
- * @inspec: Interrupt specifier from the device-tree
- * @intsize: Size of the interrupt specifier from the device-tree
- *
- * This function is identical to irq_create_mapping except that it takes
- * as input informations straight from the device-tree (typically the results
- * of the of_irq_map_*() functions.
- */
-extern unsigned int irq_create_of_mapping(struct device_node *controller,
-					  const u32 *intspec, unsigned int intsize);
-
-/**
- * irq_of_parse_and_map - Parse and Map an interrupt into linux virq space
- * @device: Device node of the device whose interrupt is to be mapped
- * @index: Index of the interrupt to map
- *
- * This function is a wrapper that chains of_irq_map_one() and
- * irq_create_of_mapping() to make things easier to callers
- */
-extern unsigned int irq_of_parse_and_map(struct device_node *dev, int index);
-
-/* -- End OF helpers -- */
-
 /**
  * irq_early_init - Init irq remapping subsystem
  */
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index ddd408a93b5a..47d41b67c94d 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -18,6 +18,7 @@
  */
 #include <linux/types.h>
 #include <linux/of_fdt.h>
+#include <linux/of_irq.h>
 #include <linux/proc_fs.h>
 #include <linux/platform_device.h>
 #include <asm/irq.h>
@@ -108,18 +109,6 @@ extern const void *of_get_mac_address(struct device_node *np);
  * OF interrupt mapping
  */
 
-/* This structure is returned when an interrupt is mapped. The controller
- * field needs to be put() after use
- */
-
-#define OF_MAX_IRQ_SPEC		 4 /* We handle specifiers of at most 4 cells */
-
-struct of_irq {
-	struct device_node *controller;	/* Interrupt controller node */
-	u32 size;			/* Specifier size */
-	u32 specifier[OF_MAX_IRQ_SPEC];	/* Specifier copy */
-};
-
 /**
  * of_irq_map_init - Initialize the irq remapper
  * @flags:	flags defining workarounds to enable
@@ -154,20 +143,6 @@ extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec,
 			  u32 ointsize, const u32 *addr,
 			  struct of_irq *out_irq);
 
-
-/**
- * of_irq_map_one - Resolve an interrupt for a device
- * @device:	the device whose interrupt is to be resolved
- * @index:     	index of the interrupt to resolve
- * @out_irq:	structure of_irq filled by this function
- *
- * This function resolves an interrupt, walking the tree, for a given
- * device-tree node. It's the high level pendant to of_irq_map_raw().
- * It also implements the workarounds for OldWolrd Macs.
- */
-extern int of_irq_map_one(struct device_node *device, int index,
-			  struct of_irq *out_irq);
-
 /**
  * of_irq_map_pci - Resolve the interrupt for a PCI device
  * @pdev:	the device whose interrupt is to be resolved
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 30817d9b20cb..2676ef288bf5 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -53,6 +53,8 @@
 #include <linux/bootmem.h>
 #include <linux/pci.h>
 #include <linux/debugfs.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -813,18 +815,6 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
 }
 EXPORT_SYMBOL_GPL(irq_create_of_mapping);
 
-unsigned int irq_of_parse_and_map(struct device_node *dev, int index)
-{
-	struct of_irq oirq;
-
-	if (of_irq_map_one(dev, index, &oirq))
-		return NO_IRQ;
-
-	return irq_create_of_mapping(oirq.controller, oirq.specifier,
-				     oirq.size);
-}
-EXPORT_SYMBOL_GPL(irq_of_parse_and_map);
-
 void irq_dispose_mapping(unsigned int virq)
 {
 	struct irq_host *host;
diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index f845828ca4c6..ac695742df85 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -56,7 +56,6 @@ extern void of_fill_in_cpu_data(void);
  * register them in the of_device objects, whereas powerpc computes them
  * on request.
  */
-extern unsigned int irq_of_parse_and_map(struct device_node *node, int index);
 static inline void irq_dispose_mapping(unsigned int virq)
 {
 }
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index 7cecc8fea9bd..b87495efa16e 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -6,6 +6,10 @@ config OF_DYNAMIC
 	def_bool y
 	depends on OF && PPC_OF
 
+config OF_IRQ
+	def_bool y
+	depends on OF && !SPARC
+
 config OF_DEVICE
 	def_bool y
 	depends on OF && (SPARC || PPC_OF || MICROBLAZE)
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index f232cc98ce00..3631a5ea0b47 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -1,5 +1,6 @@
 obj-y = base.o
 obj-$(CONFIG_OF_FLATTREE) += fdt.o
+obj-$(CONFIG_OF_IRQ)    += irq.o
 obj-$(CONFIG_OF_DEVICE) += device.o platform.o
 obj-$(CONFIG_OF_GPIO)   += gpio.o
 obj-$(CONFIG_OF_I2C)	+= of_i2c.o
diff --git a/drivers/of/irq.c b/drivers/of/irq.c
new file mode 100644
index 000000000000..9b3397c27096
--- /dev/null
+++ b/drivers/of/irq.c
@@ -0,0 +1,45 @@
+/*
+ *  Derived from arch/i386/kernel/irq.c
+ *    Copyright (C) 1992 Linus Torvalds
+ *  Adapted from arch/i386 by Gary Thomas
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *  Updated and modified by Cort Dougan <cort@fsmlabs.com>
+ *    Copyright (C) 1996-2001 Cort Dougan
+ *  Adapted for Power Macintosh by Paul Mackerras
+ *    Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This file contains the code used to make IRQ descriptions in the
+ * device tree to actual irq numbers on an interrupt controller
+ * driver.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/string.h>
+
+/**
+ * irq_of_parse_and_map - Parse and map an interrupt into linux virq space
+ * @device: Device node of the device whose interrupt is to be mapped
+ * @index: Index of the interrupt to map
+ *
+ * This function is a wrapper that chains of_irq_map_one() and
+ * irq_create_of_mapping() to make things easier to callers
+ */
+unsigned int irq_of_parse_and_map(struct device_node *dev, int index)
+{
+	struct of_irq oirq;
+
+	if (of_irq_map_one(dev, index, &oirq))
+		return NO_IRQ;
+
+	return irq_create_of_mapping(oirq.controller, oirq.specifier,
+				     oirq.size);
+}
+EXPORT_SYMBOL_GPL(irq_of_parse_and_map);
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index 42a6715f8e84..1fce00eb421b 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/phy.h>
 #include <linux/of.h>
+#include <linux/of_irq.h>
 #include <linux/of_mdio.h>
 #include <linux/module.h>
 
diff --git a/drivers/of/of_spi.c b/drivers/of/of_spi.c
index 5fed7e3c7da3..d504f1d1324b 100644
--- a/drivers/of/of_spi.c
+++ b/drivers/of/of_spi.c
@@ -9,6 +9,7 @@
 #include <linux/of.h>
 #include <linux/device.h>
 #include <linux/spi/spi.h>
+#include <linux/of_irq.h>
 #include <linux/of_spi.h>
 
 /**
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
new file mode 100644
index 000000000000..0e37c05b7dd8
--- /dev/null
+++ b/include/linux/of_irq.h
@@ -0,0 +1,41 @@
+#ifndef __OF_IRQ_H
+#define __OF_IRQ_H
+
+#if defined(CONFIG_OF)
+struct of_irq;
+#include <linux/types.h>
+#include <linux/of.h>
+
+/*
+ * irq_of_parse_and_map() is used ba all OF enabled platforms; but SPARC
+ * implements it differently.  However, the prototype is the same for all,
+ * so declare it here regardless of the CONFIG_OF_IRQ setting.
+ */
+extern unsigned int irq_of_parse_and_map(struct device_node *node, int index);
+
+#if defined(CONFIG_OF_IRQ)
+/**
+ * of_irq - container for device_node/irq_specifier pair for an irq controller
+ * @controller: pointer to interrupt controller device tree node
+ * @size: size of interrupt specifier
+ * @specifier: array of cells @size long specifing the specific interrupt
+ *
+ * This structure is returned when an interrupt is mapped. The controller
+ * field needs to be put() after use
+ */
+#define OF_MAX_IRQ_SPEC		4 /* We handle specifiers of at most 4 cells */
+struct of_irq {
+	struct device_node *controller; /* Interrupt controller node */
+	u32 size; /* Specifier size */
+	u32 specifier[OF_MAX_IRQ_SPEC]; /* Specifier copy */
+};
+
+extern int of_irq_map_one(struct device_node *device, int index,
+			  struct of_irq *out_irq);
+extern unsigned int irq_create_of_mapping(struct device_node *controller,
+					  const u32 *intspec,
+					  unsigned int intsize);
+
+#endif /* CONFIG_OF_IRQ */
+#endif /* CONFIG_OF */
+#endif /* __OF_IRQ_H */
-- 
cgit v1.2.3-59-g8ed1b


From c9642c49aae1272d7c24008a40ae614470b957a6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 24 May 2010 16:22:30 +0800
Subject: tracing: Use a global field list for all syscall exit events

All syscall exit events have the same fields.

The kernel size drops 2.5K:

   text    data     bss     dec     hex filename
7018612 2034376 7251132 16304120         f8c7f8 vmlinux.o.orig
7018612 2031888 7251132 16301632         f8be40 vmlinux.o

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BFA3746.8070100@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/syscalls.h      | 2 --
 include/trace/syscall.h       | 1 -
 kernel/trace/trace_syscalls.c | 7 ++++---
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7f614ce274a9..7994bd44eb56 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -165,7 +165,6 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.enter_event	= &event_enter_##sname,		\
 		.exit_event	= &event_exit_##sname,		\
 		.enter_fields	= LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
-		.exit_fields	= LIST_HEAD_INIT(__syscall_meta_##sname.exit_fields), \
 	};
 
 #define SYSCALL_DEFINE0(sname)					\
@@ -180,7 +179,6 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.enter_event	= &event_enter__##sname,	\
 		.exit_event	= &event_exit__##sname,		\
 		.enter_fields	= LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
-		.exit_fields	= LIST_HEAD_INIT(__syscall_meta__##sname.exit_fields), \
 	};							\
 	asmlinkage long sys_##sname(void)
 #else
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 257e08960d7b..31966a4fb8cc 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -26,7 +26,6 @@ struct syscall_metadata {
 	const char	**types;
 	const char	**args;
 	struct list_head enter_fields;
-	struct list_head exit_fields;
 
 	struct ftrace_event_call *enter_event;
 	struct ftrace_event_call *exit_event;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 34e35804304b..bac752f0cfb5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 static int syscall_enter_define_fields(struct ftrace_event_call *call);
 static int syscall_exit_define_fields(struct ftrace_event_call *call);
 
+/* All syscall exit events have the same fields */
+static LIST_HEAD(syscall_exit_fields);
+
 static struct list_head *
 syscall_get_enter_fields(struct ftrace_event_call *call)
 {
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
 static struct list_head *
 syscall_get_exit_fields(struct ftrace_event_call *call)
 {
-	struct syscall_metadata *entry = call->data;
-
-	return &entry->exit_fields;
+	return &syscall_exit_fields;
 }
 
 struct trace_event_functions enter_syscall_print_funcs = {
-- 
cgit v1.2.3-59-g8ed1b


From a1d0ce8213e9ddf4046ef5ba95c55762d075f541 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 8 Jun 2010 11:22:06 -0400
Subject: tracing: Use class->reg() for all registering of events

Because kprobes and syscalls need special processing to register
events, the class->reg() method was created to handle the differences.

But instead of creating a default ->reg for perf and ftrace events,
the code was scattered with:

	if (class->reg)
		class->reg();
	else
		default_reg();

This is messy and can also lead to bugs.

This patch cleans up this code and creates a default reg() entry for
the events allowing for the code to directly call the class->reg()
without the condition.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h    |  3 +++
 include/trace/ftrace.h          |  2 ++
 kernel/trace/trace_event_perf.c | 19 +++-----------
 kernel/trace/trace_events.c     | 55 +++++++++++++++++++++++++++--------------
 4 files changed, 44 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0af31cd335d6..01df7ca4ead7 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -146,6 +146,9 @@ struct ftrace_event_class {
 	int			(*raw_init)(struct ftrace_event_call *);
 };
 
+extern int ftrace_event_reg(struct ftrace_event_call *event,
+			    enum trace_reg type);
+
 enum {
 	TRACE_EVENT_FL_ENABLED_BIT,
 	TRACE_EVENT_FL_FILTERED_BIT,
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index fc013a8201e9..55c1fd1bbc3d 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -439,6 +439,7 @@ static inline notrace int ftrace_get_offsets_##call(			\
  *	.fields			= LIST_HEAD_INIT(event_class_##call.fields),
  *	.raw_init		= trace_event_raw_init,
  *	.probe			= ftrace_raw_event_##call,
+ *	.reg			= ftrace_event_reg,
  * };
  *
  * static struct ftrace_event_call __used
@@ -567,6 +568,7 @@ static struct ftrace_event_class __used event_class_##call = {		\
 	.fields			= LIST_HEAD_INIT(event_class_##call.fields),\
 	.raw_init		= trace_event_raw_init,			\
 	.probe			= ftrace_raw_event_##call,		\
+	.reg			= ftrace_event_reg,			\
 	_TRACE_PERF_INIT(call)						\
 };
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 6053982dc302..23751659582e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -54,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 		}
 	}
 
-	if (tp_event->class->reg)
-		ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
-	else
-		ret = tracepoint_probe_register(tp_event->name,
-						tp_event->class->perf_probe,
-						tp_event);
-
+	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
 	if (ret)
 		goto fail;
 
@@ -94,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event)
 	mutex_lock(&event_mutex);
 	list_for_each_entry(tp_event, &ftrace_events, list) {
 		if (tp_event->event.type == event_id &&
-		    tp_event->class &&
-		    (tp_event->class->perf_probe ||
-		     tp_event->class->reg) &&
+		    tp_event->class && tp_event->class->reg &&
 		    try_module_get(tp_event->mod)) {
 			ret = perf_trace_event_init(tp_event, p_event);
 			break;
@@ -136,12 +128,7 @@ void perf_trace_destroy(struct perf_event *p_event)
 	if (--tp_event->perf_refcount > 0)
 		goto out;
 
-	if (tp_event->class->reg)
-		tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
-	else
-		tracepoint_probe_unregister(tp_event->name,
-					    tp_event->class->perf_probe,
-					    tp_event);
+	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
 
 	/*
 	 * Ensure our callback won't be called anymore. See
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 69bee4cc0e10..e8e6043f4d29 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -141,6 +141,35 @@ int trace_event_raw_init(struct ftrace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(trace_event_raw_init);
 
+int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
+{
+	switch (type) {
+	case TRACE_REG_REGISTER:
+		return tracepoint_probe_register(call->name,
+						 call->class->probe,
+						 call);
+	case TRACE_REG_UNREGISTER:
+		tracepoint_probe_unregister(call->name,
+					    call->class->probe,
+					    call);
+		return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+	case TRACE_REG_PERF_REGISTER:
+		return tracepoint_probe_register(call->name,
+						 call->class->perf_probe,
+						 call);
+	case TRACE_REG_PERF_UNREGISTER:
+		tracepoint_probe_unregister(call->name,
+					    call->class->perf_probe,
+					    call);
+		return 0;
+#endif
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_event_reg);
+
 static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 {
@@ -151,23 +180,13 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 		if (call->flags & TRACE_EVENT_FL_ENABLED) {
 			call->flags &= ~TRACE_EVENT_FL_ENABLED;
 			tracing_stop_cmdline_record();
-			if (call->class->reg)
-				call->class->reg(call, TRACE_REG_UNREGISTER);
-			else
-				tracepoint_probe_unregister(call->name,
-							    call->class->probe,
-							    call);
+			call->class->reg(call, TRACE_REG_UNREGISTER);
 		}
 		break;
 	case 1:
 		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
 			tracing_start_cmdline_record();
-			if (call->class->reg)
-				ret = call->class->reg(call, TRACE_REG_REGISTER);
-			else
-				ret = tracepoint_probe_register(call->name,
-								call->class->probe,
-								call);
+			ret = call->class->reg(call, TRACE_REG_REGISTER);
 			if (ret) {
 				tracing_stop_cmdline_record();
 				pr_info("event trace: Could not enable event "
@@ -205,8 +224,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 
-		if (!call->name || !call->class ||
-		    (!call->class->probe && !call->class->reg))
+		if (!call->name || !call->class || !call->class->reg)
 			continue;
 
 		if (match &&
@@ -332,7 +350,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		 * The ftrace subsystem is for showing formats only.
 		 * They can not be enabled or disabled via the event files.
 		 */
-		if (call->class && (call->class->probe || call->class->reg))
+		if (call->class && call->class->reg)
 			return call;
 	}
 
@@ -485,8 +503,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
-		if (!call->name || !call->class ||
-		    (!call->class->probe && !call->class->reg))
+		if (!call->name || !call->class || !call->class->reg)
 			continue;
 
 		if (system && strcmp(call->class->system, system) != 0)
@@ -977,12 +994,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 		return -1;
 	}
 
-	if (call->class->probe || call->class->reg)
+	if (call->class->reg)
 		trace_create_file("enable", 0644, call->dir, call,
 				  enable);
 
 #ifdef CONFIG_PERF_EVENTS
-	if (call->event.type && (call->class->perf_probe || call->class->reg))
+	if (call->event.type && call->class->reg)
 		trace_create_file("id", 0444, call->dir, call,
 		 		  id);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From b6b3ecc71a0664d44ed8d087d583aee98fbf492a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 00:04:38 +0000
Subject: net: u64_stats_sync improvements

- Add a comment about interrupts:

6) If counter might be written by an interrupt, readers should block
interrupts.

- Fix a typo in sample of use.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index d0505156ed52..b38e3a58de83 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -23,6 +23,10 @@
  *    pure reads. But if they have to fetch many values, it's better to not allow
  *    preemptions/interruptions to avoid many retries.
  *
+ * 6) If counter might be written by an interrupt, readers should block interrupts.
+ *    (On UP, there is no seqcount_t protection, a reader allowing interrupts could
+ *     read partial values)
+ *
  * Usage :
  *
  * Stats producer (writer) should use following template granted it already got
@@ -46,7 +50,7 @@
  *         start = u64_stats_fetch_begin(&stats->syncp);
  *         tbytes = stats->bytes64; // non atomic operation
  *         tpackets = stats->packets64; // non atomic operation
- * } while (u64_stats_fetch_retry(&stats->lock, syncp));
+ * } while (u64_stats_fetch_retry(&stats->syncp, start));
  *
  *
  * Example of use in drivers/net/loopback.c, using per_cpu containers,
-- 
cgit v1.2.3-59-g8ed1b


From 33d91f00c73ba0012bce18c1690cb8313ca7adaa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 00:54:06 +0000
Subject: net: u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh()

- Must disable preemption in case of 32bit UP in u64_stats_fetch_begin()
and u64_stats_fetch_retry()

- Add new u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh() for
network usage, disabling BH on 32bit UP only.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 59 +++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index b38e3a58de83..fa261a0da280 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -27,6 +27,9 @@
  *    (On UP, there is no seqcount_t protection, a reader allowing interrupts could
  *     read partial values)
  *
+ * 7) For softirq uses, readers can use u64_stats_fetch_begin_bh() and
+ *    u64_stats_fetch_retry_bh() helpers
+ *
  * Usage :
  *
  * Stats producer (writer) should use following template granted it already got
@@ -58,54 +61,80 @@
  */
 #include <linux/seqlock.h>
 
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 struct u64_stats_sync {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	seqcount_t	seq;
+#endif
 };
 
 static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	write_seqcount_begin(&syncp->seq);
+#endif
 }
 
 static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	write_seqcount_end(&syncp->seq);
+#endif
 }
 
 static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_begin(&syncp->seq);
+#else
+#if BITS_PER_LONG==32
+	preempt_disable();
+#endif
+	return 0;
+#endif
 }
 
 static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_retry(&syncp->seq, start);
-}
-
 #else
-struct u64_stats_sync {
-};
-
-static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
-{
-}
-
-static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
-{
+#if BITS_PER_LONG==32
+	preempt_enable();
+#endif
+	return false;
+#endif
 }
 
-static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+/*
+ * In case softirq handlers can update u64 counters, readers can use following helpers
+ * - SMP 32bit arches use seqcount protection, irq safe.
+ * - UP 32bit must disable BH.
+ * - 64bit have no problem atomically reading u64 values, irq safe.
+ */
+static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	return read_seqcount_begin(&syncp->seq);
+#else
+#if BITS_PER_LONG==32
+	local_bh_disable();
+#endif
 	return 0;
+#endif
 }
 
-static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+static bool inline u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	return read_seqcount_retry(&syncp->seq, start);
+#else
+#if BITS_PER_LONG==32
+	local_bh_enable();
+#endif
 	return false;
-}
 #endif
+}
 
 #endif /* _LINUX_U64_STATS_SYNC_H */
-- 
cgit v1.2.3-59-g8ed1b


From bc66154efe163a80f269d448572f7906756e9338 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 00:54:21 +0000
Subject: macvlan: 64 bit rx counters

Use u64_stats_sync infrastructure to implement 64bit stats.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c      | 37 +++++++++++++++++++++++--------------
 include/linux/if_macvlan.h | 19 ++++++++++++-------
 2 files changed, 35 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index e096875aa055..e6d626e78515 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -431,29 +431,38 @@ static void macvlan_uninit(struct net_device *dev)
 	free_percpu(vlan->rx_stats);
 }
 
-static struct net_device_stats *macvlan_dev_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev)
 {
-	struct net_device_stats *stats = &dev->stats;
+	struct rtnl_link_stats64 *stats = &dev->stats64;
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
-	dev_txq_stats_fold(dev, stats);
+	dev_txq_stats_fold(dev, &dev->stats);
 
 	if (vlan->rx_stats) {
-		struct macvlan_rx_stats *p, rx = {0};
+		struct macvlan_rx_stats *p, accum = {0};
+		u64 rx_packets, rx_bytes, rx_multicast;
+		unsigned int start;
 		int i;
 
 		for_each_possible_cpu(i) {
 			p = per_cpu_ptr(vlan->rx_stats, i);
-			rx.rx_packets += p->rx_packets;
-			rx.rx_bytes   += p->rx_bytes;
-			rx.rx_errors  += p->rx_errors;
-			rx.multicast  += p->multicast;
+			do {
+				start = u64_stats_fetch_begin_bh(&p->syncp);
+				rx_packets	= p->rx_packets;
+				rx_bytes	= p->rx_bytes;
+				rx_multicast	= p->rx_multicast;
+			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
+			accum.rx_packets	+= rx_packets;
+			accum.rx_bytes		+= rx_bytes;
+			accum.rx_multicast	+= rx_multicast;
+			/* rx_errors is an ulong, updated without syncp protection */
+			accum.rx_errors		+= p->rx_errors;
 		}
-		stats->rx_packets = rx.rx_packets;
-		stats->rx_bytes   = rx.rx_bytes;
-		stats->rx_errors  = rx.rx_errors;
-		stats->rx_dropped = rx.rx_errors;
-		stats->multicast  = rx.multicast;
+		stats->rx_packets = accum.rx_packets;
+		stats->rx_bytes   = accum.rx_bytes;
+		stats->rx_errors  = accum.rx_errors;
+		stats->rx_dropped = accum.rx_errors;
+		stats->multicast  = accum.rx_multicast;
 	}
 	return stats;
 }
@@ -502,7 +511,7 @@ static const struct net_device_ops macvlan_netdev_ops = {
 	.ndo_change_rx_flags	= macvlan_change_rx_flags,
 	.ndo_set_mac_address	= macvlan_set_mac_address,
 	.ndo_set_multicast_list	= macvlan_set_multicast_list,
-	.ndo_get_stats		= macvlan_dev_get_stats,
+	.ndo_get_stats64	= macvlan_dev_get_stats64,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index c26a0e4f0ce8..e24ce6ea1fa3 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -6,6 +6,7 @@
 #include <linux/netdevice.h>
 #include <linux/netlink.h>
 #include <net/netlink.h>
+#include <linux/u64_stats_sync.h>
 
 #if defined(CONFIG_MACVTAP) || defined(CONFIG_MACVTAP_MODULE)
 struct socket *macvtap_get_socket(struct file *);
@@ -27,14 +28,16 @@ struct macvtap_queue;
  *	struct macvlan_rx_stats - MACVLAN percpu rx stats
  *	@rx_packets: number of received packets
  *	@rx_bytes: number of received bytes
- *	@multicast: number of received multicast packets
+ *	@rx_multicast: number of received multicast packets
+ *	@syncp: synchronization point for 64bit counters
  *	@rx_errors: number of errors
  */
 struct macvlan_rx_stats {
-	unsigned long rx_packets;
-	unsigned long rx_bytes;
-	unsigned long multicast;
-	unsigned long rx_errors;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_multicast;
+	struct u64_stats_sync	syncp;
+	unsigned long		rx_errors;
 };
 
 struct macvlan_dev {
@@ -56,12 +59,14 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
 {
 	struct macvlan_rx_stats *rx_stats;
 
-	rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
+	rx_stats = this_cpu_ptr(vlan->rx_stats);
 	if (likely(success)) {
+		u64_stats_update_begin(&rx_stats->syncp);
 		rx_stats->rx_packets++;;
 		rx_stats->rx_bytes += len;
 		if (multicast)
-			rx_stats->multicast++;
+			rx_stats->rx_multicast++;
+		u64_stats_update_end(&rx_stats->syncp);
 	} else {
 		rx_stats->rx_errors++;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From b56c0d8937e665a27d90517ee7a746d0aa05af46 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:09 +0200
Subject: kthread: implement kthread_worker

Implement simple work processor for kthread.  This is to ease using
kthread.  Single thread workqueue used to be used for things like this
but workqueue won't guarantee fixed kthread association anymore to
enable worker sharing.

This can be used in cases where specific kthread association is
necessary, for example, when it should have RT priority or be assigned
to certain cgroup.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kthread.h |  64 +++++++++++++++++++++
 kernel/kthread.c        | 149 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 213 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index aabc8a13ba71..f93cb6979edc 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -34,4 +34,68 @@ int kthread_should_stop(void);
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
 
+/*
+ * Simple work processor based on kthread.
+ *
+ * This provides easier way to make use of kthreads.  A kthread_work
+ * can be queued and flushed using queue/flush_kthread_work()
+ * respectively.  Queued kthread_works are processed by a kthread
+ * running kthread_worker_fn().
+ *
+ * A kthread_work can't be freed while it is executing.
+ */
+struct kthread_work;
+typedef void (*kthread_work_func_t)(struct kthread_work *work);
+
+struct kthread_worker {
+	spinlock_t		lock;
+	struct list_head	work_list;
+	struct task_struct	*task;
+};
+
+struct kthread_work {
+	struct list_head	node;
+	kthread_work_func_t	func;
+	wait_queue_head_t	done;
+	atomic_t		flushing;
+	int			queue_seq;
+	int			done_seq;
+};
+
+#define KTHREAD_WORKER_INIT(worker)	{				\
+	.lock = SPIN_LOCK_UNLOCKED,					\
+	.work_list = LIST_HEAD_INIT((worker).work_list),		\
+	}
+
+#define KTHREAD_WORK_INIT(work, fn)	{				\
+	.node = LIST_HEAD_INIT((work).node),				\
+	.func = (fn),							\
+	.done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done),		\
+	.flushing = ATOMIC_INIT(0),					\
+	}
+
+#define DEFINE_KTHREAD_WORKER(worker)					\
+	struct kthread_worker worker = KTHREAD_WORKER_INIT(worker)
+
+#define DEFINE_KTHREAD_WORK(work, fn)					\
+	struct kthread_work work = KTHREAD_WORK_INIT(work, fn)
+
+static inline void init_kthread_worker(struct kthread_worker *worker)
+{
+	*worker = (struct kthread_worker)KTHREAD_WORKER_INIT(*worker);
+}
+
+static inline void init_kthread_work(struct kthread_work *work,
+				     kthread_work_func_t fn)
+{
+	*work = (struct kthread_work)KTHREAD_WORK_INIT(*work, fn);
+}
+
+int kthread_worker_fn(void *worker_ptr);
+
+bool queue_kthread_work(struct kthread_worker *worker,
+			struct kthread_work *work);
+void flush_kthread_work(struct kthread_work *work);
+void flush_kthread_worker(struct kthread_worker *worker);
+
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..8b63c7fee73b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
 #include <linux/file.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/freezer.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -247,3 +249,150 @@ int kthreadd(void *unused)
 
 	return 0;
 }
+
+/**
+ * kthread_worker_fn - kthread function to process kthread_worker
+ * @worker_ptr: pointer to initialized kthread_worker
+ *
+ * This function can be used as @threadfn to kthread_create() or
+ * kthread_run() with @worker_ptr argument pointing to an initialized
+ * kthread_worker.  The started kthread will process work_list until
+ * the it is stopped with kthread_stop().  A kthread can also call
+ * this function directly after extra initialization.
+ *
+ * Different kthreads can be used for the same kthread_worker as long
+ * as there's only one kthread attached to it at any given time.  A
+ * kthread_worker without an attached kthread simply collects queued
+ * kthread_works.
+ */
+int kthread_worker_fn(void *worker_ptr)
+{
+	struct kthread_worker *worker = worker_ptr;
+	struct kthread_work *work;
+
+	WARN_ON(worker->task);
+	worker->task = current;
+repeat:
+	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
+
+	if (kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
+		spin_lock_irq(&worker->lock);
+		worker->task = NULL;
+		spin_unlock_irq(&worker->lock);
+		return 0;
+	}
+
+	work = NULL;
+	spin_lock_irq(&worker->lock);
+	if (!list_empty(&worker->work_list)) {
+		work = list_first_entry(&worker->work_list,
+					struct kthread_work, node);
+		list_del_init(&work->node);
+	}
+	spin_unlock_irq(&worker->lock);
+
+	if (work) {
+		__set_current_state(TASK_RUNNING);
+		work->func(work);
+		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
+		work->done_seq = work->queue_seq;
+		smp_mb();	/* mb worker-b1 paired with flush-b0 */
+		if (atomic_read(&work->flushing))
+			wake_up_all(&work->done);
+	} else if (!freezing(current))
+		schedule();
+
+	try_to_freeze();
+	goto repeat;
+}
+EXPORT_SYMBOL_GPL(kthread_worker_fn);
+
+/**
+ * queue_kthread_work - queue a kthread_work
+ * @worker: target kthread_worker
+ * @work: kthread_work to queue
+ *
+ * Queue @work to work processor @task for async execution.  @task
+ * must have been created with kthread_worker_create().  Returns %true
+ * if @work was successfully queued, %false if it was already pending.
+ */
+bool queue_kthread_work(struct kthread_worker *worker,
+			struct kthread_work *work)
+{
+	bool ret = false;
+	unsigned long flags;
+
+	spin_lock_irqsave(&worker->lock, flags);
+	if (list_empty(&work->node)) {
+		list_add_tail(&work->node, &worker->work_list);
+		work->queue_seq++;
+		if (likely(worker->task))
+			wake_up_process(worker->task);
+		ret = true;
+	}
+	spin_unlock_irqrestore(&worker->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(queue_kthread_work);
+
+/**
+ * flush_kthread_work - flush a kthread_work
+ * @work: work to flush
+ *
+ * If @work is queued or executing, wait for it to finish execution.
+ */
+void flush_kthread_work(struct kthread_work *work)
+{
+	int seq = work->queue_seq;
+
+	atomic_inc(&work->flushing);
+
+	/*
+	 * mb flush-b0 paired with worker-b1, to make sure either
+	 * worker sees the above increment or we see done_seq update.
+	 */
+	smp_mb__after_atomic_inc();
+
+	/* A - B <= 0 tests whether B is in front of A regardless of overflow */
+	wait_event(work->done, seq - work->done_seq <= 0);
+	atomic_dec(&work->flushing);
+
+	/*
+	 * rmb flush-b1 paired with worker-b0, to make sure our caller
+	 * sees every change made by work->func().
+	 */
+	smp_mb__after_atomic_dec();
+}
+EXPORT_SYMBOL_GPL(flush_kthread_work);
+
+struct kthread_flush_work {
+	struct kthread_work	work;
+	struct completion	done;
+};
+
+static void kthread_flush_work_fn(struct kthread_work *work)
+{
+	struct kthread_flush_work *fwork =
+		container_of(work, struct kthread_flush_work, work);
+	complete(&fwork->done);
+}
+
+/**
+ * flush_kthread_worker - flush all current works on a kthread_worker
+ * @worker: worker to flush
+ *
+ * Wait until all currently executing or pending works on @worker are
+ * finished.
+ */
+void flush_kthread_worker(struct kthread_worker *worker)
+{
+	struct kthread_flush_work fwork = {
+		KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+		COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+	};
+
+	queue_kthread_work(worker, &fwork.work);
+	wait_for_completion(&fwork.done);
+}
+EXPORT_SYMBOL_GPL(flush_kthread_worker);
-- 
cgit v1.2.3-59-g8ed1b


From 82805ab77d25643f579d90397dcd34f05d1b750a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:09 +0200
Subject: kthread: implement kthread_data()

Implement kthread_data() which takes @task pointing to a kthread and
returns @data specified when creating the kthread.  The caller is
responsible for ensuring the validity of @task when calling this
function.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/kthread.h |  1 +
 kernel/kthread.c        | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index f93cb6979edc..685ea65eb803 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -30,6 +30,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 int kthread_stop(struct task_struct *k);
 int kthread_should_stop(void);
+void *kthread_data(struct task_struct *k);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8b63c7fee73b..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -37,6 +37,7 @@ struct kthread_create_info
 
 struct kthread {
 	int should_stop;
+	void *data;
 	struct completion exited;
 };
 
@@ -56,6 +57,19 @@ int kthread_should_stop(void)
 }
 EXPORT_SYMBOL(kthread_should_stop);
 
+/**
+ * kthread_data - return data value specified on kthread creation
+ * @task: kthread task in question
+ *
+ * Return the data value specified when kthread @task was created.
+ * The caller is responsible for ensuring the validity of @task when
+ * calling this function.
+ */
+void *kthread_data(struct task_struct *task)
+{
+	return to_kthread(task)->data;
+}
+
 static int kthread(void *_create)
 {
 	/* Copy data: it's on kthread's stack */
@@ -66,6 +80,7 @@ static int kthread(void *_create)
 	int ret;
 
 	self.should_stop = 0;
+	self.data = data;
 	init_completion(&self.exited);
 	current->vfork_done = &self.exited;
 
-- 
cgit v1.2.3-59-g8ed1b


From c790bce0481857412c964c5e9d46d56e41c4b051 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:09 +0200
Subject: workqueue: kill RT workqueue

With stop_machine() converted to use cpu_stop, RT workqueue doesn't
have any user left.  Kill RT workqueue support.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 20 +++++++++-----------
 kernel/workqueue.c        |  6 ------
 2 files changed, 9 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 9466e860d8c2..0697946c66a1 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -181,12 +181,11 @@ static inline void destroy_work_on_stack(struct work_struct *work) { }
 
 
 extern struct workqueue_struct *
-__create_workqueue_key(const char *name, int singlethread,
-		       int freezeable, int rt, struct lock_class_key *key,
-		       const char *lock_name);
+__create_workqueue_key(const char *name, int singlethread, int freezeable,
+		       struct lock_class_key *key, const char *lock_name);
 
 #ifdef CONFIG_LOCKDEP
-#define __create_workqueue(name, singlethread, freezeable, rt)	\
+#define __create_workqueue(name, singlethread, freezeable)	\
 ({								\
 	static struct lock_class_key __key;			\
 	const char *__lock_name;				\
@@ -197,19 +196,18 @@ __create_workqueue_key(const char *name, int singlethread,
 		__lock_name = #name;				\
 								\
 	__create_workqueue_key((name), (singlethread),		\
-			       (freezeable), (rt), &__key,	\
+			       (freezeable), &__key,		\
 			       __lock_name);			\
 })
 #else
-#define __create_workqueue(name, singlethread, freezeable, rt)	\
-	__create_workqueue_key((name), (singlethread), (freezeable), (rt), \
+#define __create_workqueue(name, singlethread, freezeable)	\
+	__create_workqueue_key((name), (singlethread), (freezeable), \
 			       NULL, NULL)
 #endif
 
-#define create_workqueue(name) __create_workqueue((name), 0, 0, 0)
-#define create_rt_workqueue(name) __create_workqueue((name), 0, 0, 1)
-#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0)
-#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0)
+#define create_workqueue(name) __create_workqueue((name), 0, 0)
+#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1)
+#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..1a47fbf92fae 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -62,7 +62,6 @@ struct workqueue_struct {
 	const char *name;
 	int singlethread;
 	int freezeable;		/* Freeze threads during suspend */
-	int rt;
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map lockdep_map;
 #endif
@@ -947,7 +946,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
 
 static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
-	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct workqueue_struct *wq = cwq->wq;
 	const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
 	struct task_struct *p;
@@ -963,8 +961,6 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 	 */
 	if (IS_ERR(p))
 		return PTR_ERR(p);
-	if (cwq->wq->rt)
-		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 	cwq->thread = p;
 
 	trace_workqueue_creation(cwq->thread, cpu);
@@ -986,7 +982,6 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 struct workqueue_struct *__create_workqueue_key(const char *name,
 						int singlethread,
 						int freezeable,
-						int rt,
 						struct lock_class_key *key,
 						const char *lock_name)
 {
@@ -1008,7 +1003,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	wq->singlethread = singlethread;
 	wq->freezeable = freezeable;
-	wq->rt = rt;
 	INIT_LIST_HEAD(&wq->list);
 
 	if (singlethread) {
-- 
cgit v1.2.3-59-g8ed1b


From 4690c4ab56c71919893ca25252f2dd65b58188c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:10 +0200
Subject: workqueue: misc/cosmetic updates

Make the following updates in preparation of concurrency managed
workqueue.  None of these changes causes any visible behavior
difference.

* Add comments and adjust indentations to data structures and several
  functions.

* Rename wq_per_cpu() to get_cwq() and swap the position of two
  parameters for consistency.  Convert a direct per_cpu_ptr() access
  to wq->cpu_wq to get_cwq().

* Add work_static() and Update set_wq_data() such that it sets the
  flags part to WORK_STRUCT_PENDING | WORK_STRUCT_STATIC if static |
  @extra_flags.

* Move santiy check on work->entry emptiness from queue_work_on() to
  __queue_work() which all queueing paths share.

* Make __queue_work() take @cpu and @wq instead of @cwq.

* Restructure flush_work() and __create_workqueue_key() to make them
  easier to modify.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |   5 ++
 kernel/workqueue.c        | 131 +++++++++++++++++++++++++++++-----------------
 2 files changed, 89 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0697946c66a1..e724dafc9e6d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -96,9 +96,14 @@ struct execute_work {
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 extern void __init_work(struct work_struct *work, int onstack);
 extern void destroy_work_on_stack(struct work_struct *work);
+static inline unsigned int work_static(struct work_struct *work)
+{
+	return *work_data_bits(work) & (1 << WORK_STRUCT_STATIC);
+}
 #else
 static inline void __init_work(struct work_struct *work, int onstack) { }
 static inline void destroy_work_on_stack(struct work_struct *work) { }
+static inline unsigned int work_static(struct work_struct *work) { return 0; }
 #endif
 
 /*
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1a47fbf92fae..c56146a755e5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,6 +36,16 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
 
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Set during initialization and read-only afterwards.
+ *
+ * L: cwq->lock protected.  Access with cwq->lock held.
+ *
+ * W: workqueue_lock protected.
+ */
+
 /*
  * The per-CPU workqueue (if single thread, we always use the first
  * possible cpu).
@@ -48,8 +58,8 @@ struct cpu_workqueue_struct {
 	wait_queue_head_t more_work;
 	struct work_struct *current_work;
 
-	struct workqueue_struct *wq;
-	struct task_struct *thread;
+	struct workqueue_struct *wq;		/* I: the owning workqueue */
+	struct task_struct	*thread;
 } ____cacheline_aligned;
 
 /*
@@ -57,13 +67,13 @@ struct cpu_workqueue_struct {
  * per-CPU workqueues:
  */
 struct workqueue_struct {
-	struct cpu_workqueue_struct *cpu_wq;
-	struct list_head list;
-	const char *name;
+	struct cpu_workqueue_struct *cpu_wq;	/* I: cwq's */
+	struct list_head	list;		/* W: list of all workqueues */
+	const char		*name;		/* I: workqueue name */
 	int singlethread;
 	int freezeable;		/* Freeze threads during suspend */
 #ifdef CONFIG_LOCKDEP
-	struct lockdep_map lockdep_map;
+	struct lockdep_map	lockdep_map;
 #endif
 };
 
@@ -204,8 +214,8 @@ static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
 		? cpu_singlethread_map : cpu_populated_map;
 }
 
-static
-struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
+static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
+					    struct workqueue_struct *wq)
 {
 	if (unlikely(is_wq_single_threaded(wq)))
 		cpu = singlethread_cpu;
@@ -217,15 +227,13 @@ struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
  * - Must *only* be called if the pending flag is set
  */
 static inline void set_wq_data(struct work_struct *work,
-				struct cpu_workqueue_struct *cwq)
+			       struct cpu_workqueue_struct *cwq,
+			       unsigned long extra_flags)
 {
-	unsigned long new;
-
 	BUG_ON(!work_pending(work));
 
-	new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
-	new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
-	atomic_long_set(&work->data, new);
+	atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) |
+			(1UL << WORK_STRUCT_PENDING) | extra_flags);
 }
 
 /*
@@ -233,9 +241,7 @@ static inline void set_wq_data(struct work_struct *work,
  */
 static inline void clear_wq_data(struct work_struct *work)
 {
-	unsigned long flags = *work_data_bits(work) &
-				(1UL << WORK_STRUCT_STATIC);
-	atomic_long_set(&work->data, flags);
+	atomic_long_set(&work->data, work_static(work));
 }
 
 static inline
@@ -244,29 +250,47 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
 }
 
+/**
+ * insert_work - insert a work into cwq
+ * @cwq: cwq @work belongs to
+ * @work: work to insert
+ * @head: insertion point
+ * @extra_flags: extra WORK_STRUCT_* flags to set
+ *
+ * Insert @work into @cwq after @head.
+ *
+ * CONTEXT:
+ * spin_lock_irq(cwq->lock).
+ */
 static void insert_work(struct cpu_workqueue_struct *cwq,
-			struct work_struct *work, struct list_head *head)
+			struct work_struct *work, struct list_head *head,
+			unsigned int extra_flags)
 {
 	trace_workqueue_insertion(cwq->thread, work);
 
-	set_wq_data(work, cwq);
+	/* we own @work, set data and link */
+	set_wq_data(work, cwq, extra_flags);
+
 	/*
 	 * Ensure that we get the right work->data if we see the
 	 * result of list_add() below, see try_to_grab_pending().
 	 */
 	smp_wmb();
+
 	list_add_tail(&work->entry, head);
 	wake_up(&cwq->more_work);
 }
 
-static void __queue_work(struct cpu_workqueue_struct *cwq,
+static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			 struct work_struct *work)
 {
+	struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 	unsigned long flags;
 
 	debug_work_activate(work);
 	spin_lock_irqsave(&cwq->lock, flags);
-	insert_work(cwq, work, &cwq->worklist);
+	BUG_ON(!list_empty(&work->entry));
+	insert_work(cwq, work, &cwq->worklist, 0);
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
@@ -308,8 +332,7 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
 	int ret = 0;
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
-		BUG_ON(!list_empty(&work->entry));
-		__queue_work(wq_per_cpu(wq, cpu), work);
+		__queue_work(cpu, wq, work);
 		ret = 1;
 	}
 	return ret;
@@ -320,9 +343,8 @@ static void delayed_work_timer_fn(unsigned long __data)
 {
 	struct delayed_work *dwork = (struct delayed_work *)__data;
 	struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
-	struct workqueue_struct *wq = cwq->wq;
 
-	__queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
+	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
 }
 
 /**
@@ -366,7 +388,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 		timer_stats_timer_set_start_info(&dwork->timer);
 
 		/* This stores cwq for the moment, for the timer_fn */
-		set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
+		set_wq_data(work, get_cwq(raw_smp_processor_id(), wq), 0);
 		timer->expires = jiffies + delay;
 		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
@@ -430,6 +452,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 	spin_unlock_irq(&cwq->lock);
 }
 
+/**
+ * worker_thread - the worker thread function
+ * @__cwq: cwq to serve
+ *
+ * The cwq worker thread function.
+ */
 static int worker_thread(void *__cwq)
 {
 	struct cpu_workqueue_struct *cwq = __cwq;
@@ -468,6 +496,17 @@ static void wq_barrier_func(struct work_struct *work)
 	complete(&barr->done);
 }
 
+/**
+ * insert_wq_barrier - insert a barrier work
+ * @cwq: cwq to insert barrier into
+ * @barr: wq_barrier to insert
+ * @head: insertion point
+ *
+ * Insert barrier @barr into @cwq before @head.
+ *
+ * CONTEXT:
+ * spin_lock_irq(cwq->lock).
+ */
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 			struct wq_barrier *barr, struct list_head *head)
 {
@@ -479,11 +518,10 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 	 */
 	INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
-
 	init_completion(&barr->done);
 
 	debug_work_activate(&barr->work);
-	insert_work(cwq, &barr->work, head);
+	insert_work(cwq, &barr->work, head, 0);
 }
 
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -517,9 +555,6 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
  *
  * We sleep until all works which were queued on entry have been handled,
  * but we are not livelocked by new incoming ones.
- *
- * This function used to run the workqueues itself.  Now we just wait for the
- * helper threads to do it.
  */
 void flush_workqueue(struct workqueue_struct *wq)
 {
@@ -558,7 +593,6 @@ int flush_work(struct work_struct *work)
 	lock_map_acquire(&cwq->wq->lockdep_map);
 	lock_map_release(&cwq->wq->lockdep_map);
 
-	prev = NULL;
 	spin_lock_irq(&cwq->lock);
 	if (!list_empty(&work->entry)) {
 		/*
@@ -567,22 +601,22 @@ int flush_work(struct work_struct *work)
 		 */
 		smp_rmb();
 		if (unlikely(cwq != get_wq_data(work)))
-			goto out;
+			goto already_gone;
 		prev = &work->entry;
 	} else {
 		if (cwq->current_work != work)
-			goto out;
+			goto already_gone;
 		prev = &cwq->worklist;
 	}
 	insert_wq_barrier(cwq, &barr, prev->next);
-out:
-	spin_unlock_irq(&cwq->lock);
-	if (!prev)
-		return 0;
 
+	spin_unlock_irq(&cwq->lock);
 	wait_for_completion(&barr.done);
 	destroy_work_on_stack(&barr.work);
 	return 1;
+already_gone:
+	spin_unlock_irq(&cwq->lock);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(flush_work);
 
@@ -665,7 +699,7 @@ static void wait_on_work(struct work_struct *work)
 	cpu_map = wq_cpu_map(wq);
 
 	for_each_cpu(cpu, cpu_map)
-		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+		wait_on_cpu_work(get_cwq(cpu, wq), work);
 }
 
 static int __cancel_work_timer(struct work_struct *work,
@@ -782,9 +816,8 @@ EXPORT_SYMBOL(schedule_delayed_work);
 void flush_delayed_work(struct delayed_work *dwork)
 {
 	if (del_timer_sync(&dwork->timer)) {
-		struct cpu_workqueue_struct *cwq;
-		cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
-		__queue_work(cwq, &dwork->work);
+		__queue_work(get_cpu(), get_wq_data(&dwork->work)->wq,
+			     &dwork->work);
 		put_cpu();
 	}
 	flush_work(&dwork->work);
@@ -991,13 +1024,11 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
-		return NULL;
+		goto err;
 
 	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
-	if (!wq->cpu_wq) {
-		kfree(wq);
-		return NULL;
-	}
+	if (!wq->cpu_wq)
+		goto err;
 
 	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
@@ -1041,6 +1072,12 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		wq = NULL;
 	}
 	return wq;
+err:
+	if (wq) {
+		free_percpu(wq->cpu_wq);
+		kfree(wq);
+	}
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
-- 
cgit v1.2.3-59-g8ed1b


From 97e37d7b9e65a6ac939f796f91081135b7a08acc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:10 +0200
Subject: workqueue: merge feature parameters into flags

Currently, __create_workqueue_key() takes @singlethread and
@freezeable paramters and store them separately in workqueue_struct.
Merge them into a single flags parameter and field and use
WQ_FREEZEABLE and WQ_SINGLE_THREAD.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 25 +++++++++++++++----------
 kernel/workqueue.c        | 17 +++++++----------
 2 files changed, 22 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index e724dafc9e6d..d89cfc143b1a 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -184,13 +184,17 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 #define work_clear_pending(work) \
 	clear_bit(WORK_STRUCT_PENDING, work_data_bits(work))
 
+enum {
+	WQ_FREEZEABLE		= 1 << 0, /* freeze during suspend */
+	WQ_SINGLE_THREAD	= 1 << 1, /* no per-cpu worker */
+};
 
 extern struct workqueue_struct *
-__create_workqueue_key(const char *name, int singlethread, int freezeable,
+__create_workqueue_key(const char *name, unsigned int flags,
 		       struct lock_class_key *key, const char *lock_name);
 
 #ifdef CONFIG_LOCKDEP
-#define __create_workqueue(name, singlethread, freezeable)	\
+#define __create_workqueue(name, flags)				\
 ({								\
 	static struct lock_class_key __key;			\
 	const char *__lock_name;				\
@@ -200,19 +204,20 @@ __create_workqueue_key(const char *name, int singlethread, int freezeable,
 	else							\
 		__lock_name = #name;				\
 								\
-	__create_workqueue_key((name), (singlethread),		\
-			       (freezeable), &__key,		\
+	__create_workqueue_key((name), (flags), &__key,		\
 			       __lock_name);			\
 })
 #else
-#define __create_workqueue(name, singlethread, freezeable)	\
-	__create_workqueue_key((name), (singlethread), (freezeable), \
-			       NULL, NULL)
+#define __create_workqueue(name, flags)				\
+	__create_workqueue_key((name), (flags), NULL, NULL)
 #endif
 
-#define create_workqueue(name) __create_workqueue((name), 0, 0)
-#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1)
-#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)
+#define create_workqueue(name)					\
+	__create_workqueue((name), 0)
+#define create_freezeable_workqueue(name)			\
+	__create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD)
+#define create_singlethread_workqueue(name)			\
+	__create_workqueue((name), WQ_SINGLE_THREAD)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c56146a755e5..68e4dd808ec0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -67,11 +67,10 @@ struct cpu_workqueue_struct {
  * per-CPU workqueues:
  */
 struct workqueue_struct {
+	unsigned int		flags;		/* I: WQ_* flags */
 	struct cpu_workqueue_struct *cpu_wq;	/* I: cwq's */
 	struct list_head	list;		/* W: list of all workqueues */
 	const char		*name;		/* I: workqueue name */
-	int singlethread;
-	int freezeable;		/* Freeze threads during suspend */
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	lockdep_map;
 #endif
@@ -203,9 +202,9 @@ static const struct cpumask *cpu_singlethread_map __read_mostly;
 static cpumask_var_t cpu_populated_map __read_mostly;
 
 /* If it's single threaded, it isn't in the list of workqueues. */
-static inline int is_wq_single_threaded(struct workqueue_struct *wq)
+static inline bool is_wq_single_threaded(struct workqueue_struct *wq)
 {
-	return wq->singlethread;
+	return wq->flags & WQ_SINGLE_THREAD;
 }
 
 static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
@@ -463,7 +462,7 @@ static int worker_thread(void *__cwq)
 	struct cpu_workqueue_struct *cwq = __cwq;
 	DEFINE_WAIT(wait);
 
-	if (cwq->wq->freezeable)
+	if (cwq->wq->flags & WQ_FREEZEABLE)
 		set_freezable();
 
 	for (;;) {
@@ -1013,8 +1012,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 }
 
 struct workqueue_struct *__create_workqueue_key(const char *name,
-						int singlethread,
-						int freezeable,
+						unsigned int flags,
 						struct lock_class_key *key,
 						const char *lock_name)
 {
@@ -1030,13 +1028,12 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	if (!wq->cpu_wq)
 		goto err;
 
+	wq->flags = flags;
 	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
-	wq->singlethread = singlethread;
-	wq->freezeable = freezeable;
 	INIT_LIST_HEAD(&wq->list);
 
-	if (singlethread) {
+	if (flags & WQ_SINGLE_THREAD) {
 		cwq = init_cpu_workqueue(wq, singlethread_cpu);
 		err = create_workqueue_thread(cwq, singlethread_cpu);
 		start_workqueue_thread(cwq, -1);
-- 
cgit v1.2.3-59-g8ed1b


From 22df02bb3fab24af97bff4c69cc6fd8529fc66fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:10 +0200
Subject: workqueue: define masks for work flags and conditionalize STATIC
 flags

Work flags are about to see more traditional mask handling.  Define
WORK_STRUCT_*_BIT as the bit position constant and redefine
WORK_STRUCT_* as bit masks.  Also, make WORK_STRUCT_STATIC_* flags
conditional

While at it, re-define these constants as enums and use
WORK_STRUCT_STATIC instead of hard-coding 2 in
WORK_DATA_STATIC_INIT().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 29 +++++++++++++++++++++--------
 kernel/workqueue.c        | 12 ++++++------
 2 files changed, 27 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index d89cfc143b1a..d60c5701ab45 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -22,12 +22,25 @@ typedef void (*work_func_t)(struct work_struct *work);
  */
 #define work_data_bits(work) ((unsigned long *)(&(work)->data))
 
+enum {
+	WORK_STRUCT_PENDING_BIT	= 0,	/* work item is pending execution */
+#ifdef CONFIG_DEBUG_OBJECTS_WORK
+	WORK_STRUCT_STATIC_BIT	= 1,	/* static initializer (debugobjects) */
+#endif
+
+	WORK_STRUCT_PENDING	= 1 << WORK_STRUCT_PENDING_BIT,
+#ifdef CONFIG_DEBUG_OBJECTS_WORK
+	WORK_STRUCT_STATIC	= 1 << WORK_STRUCT_STATIC_BIT,
+#else
+	WORK_STRUCT_STATIC	= 0,
+#endif
+
+	WORK_STRUCT_FLAG_MASK	= 3UL,
+	WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK,
+};
+
 struct work_struct {
 	atomic_long_t data;
-#define WORK_STRUCT_PENDING 0		/* T if work item pending execution */
-#define WORK_STRUCT_STATIC  1		/* static initializer (debugobjects) */
-#define WORK_STRUCT_FLAG_MASK (3UL)
-#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
 	struct list_head entry;
 	work_func_t func;
 #ifdef CONFIG_LOCKDEP
@@ -36,7 +49,7 @@ struct work_struct {
 };
 
 #define WORK_DATA_INIT()	ATOMIC_LONG_INIT(0)
-#define WORK_DATA_STATIC_INIT()	ATOMIC_LONG_INIT(2)
+#define WORK_DATA_STATIC_INIT()	ATOMIC_LONG_INIT(WORK_STRUCT_STATIC)
 
 struct delayed_work {
 	struct work_struct work;
@@ -98,7 +111,7 @@ extern void __init_work(struct work_struct *work, int onstack);
 extern void destroy_work_on_stack(struct work_struct *work);
 static inline unsigned int work_static(struct work_struct *work)
 {
-	return *work_data_bits(work) & (1 << WORK_STRUCT_STATIC);
+	return *work_data_bits(work) & WORK_STRUCT_STATIC;
 }
 #else
 static inline void __init_work(struct work_struct *work, int onstack) { }
@@ -167,7 +180,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
  * @work: The work item in question
  */
 #define work_pending(work) \
-	test_bit(WORK_STRUCT_PENDING, work_data_bits(work))
+	test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))
 
 /**
  * delayed_work_pending - Find out whether a delayable work item is currently
@@ -182,7 +195,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
  * @work: The work item in question
  */
 #define work_clear_pending(work) \
-	clear_bit(WORK_STRUCT_PENDING, work_data_bits(work))
+	clear_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))
 
 enum {
 	WQ_FREEZEABLE		= 1 << 0, /* freeze during suspend */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 68e4dd808ec0..5c49d762293b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -115,7 +115,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
 		 * statically initialized. We just make sure that it
 		 * is tracked in the object tracker.
 		 */
-		if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
+		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
 			debug_object_init(work, &work_debug_descr);
 			debug_object_activate(work, &work_debug_descr);
 			return 0;
@@ -232,7 +232,7 @@ static inline void set_wq_data(struct work_struct *work,
 	BUG_ON(!work_pending(work));
 
 	atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) |
-			(1UL << WORK_STRUCT_PENDING) | extra_flags);
+			WORK_STRUCT_PENDING | extra_flags);
 }
 
 /*
@@ -330,7 +330,7 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
 {
 	int ret = 0;
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		__queue_work(cpu, wq, work);
 		ret = 1;
 	}
@@ -380,7 +380,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct timer_list *timer = &dwork->timer;
 	struct work_struct *work = &dwork->work;
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
@@ -516,7 +516,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 	 * might deadlock.
 	 */
 	INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
-	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
+	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
 	init_completion(&barr->done);
 
 	debug_work_activate(&barr->work);
@@ -628,7 +628,7 @@ static int try_to_grab_pending(struct work_struct *work)
 	struct cpu_workqueue_struct *cwq;
 	int ret = -1;
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
 		return 0;
 
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 0f900049cbe2767d47c2a62b54f0e822e1d66840 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:11 +0200
Subject: workqueue: update cwq alignement

work->data field is used for two purposes.  It points to cwq it's
queued on and the lower bits are used for flags.  Currently, two bits
are reserved which is always safe as 4 byte alignment is guaranteed on
every architecture.  However, future changes will need more flag bits.

On SMP, the percpu allocator is capable of honoring larger alignment
(there are other users which depend on it) and larger alignment works
just fine.  On UP, percpu allocator is a thin wrapper around
kzalloc/kfree() and don't honor alignment request.

This patch introduces WORK_STRUCT_FLAG_BITS and implements
alloc/free_cwqs() which guarantees max(1 << WORK_STRUCT_FLAG_BITS,
__alignof__(unsigned long long) alignment both on SMP and UP.  On SMP,
simply wrapping percpu allocator is enough.  On UP, extra space is
allocated so that cwq can be aligned and the original pointer can be
stored after it which is used in the free path.

* Alignment problem on UP is reported by Michal Simek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Reported-by: Michal Simek <michal.simek@petalogix.com>
---
 include/linux/workqueue.h |  5 +++-
 kernel/workqueue.c        | 60 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 59 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index d60c5701ab45..b90958a037dc 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -26,6 +26,9 @@ enum {
 	WORK_STRUCT_PENDING_BIT	= 0,	/* work item is pending execution */
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 	WORK_STRUCT_STATIC_BIT	= 1,	/* static initializer (debugobjects) */
+	WORK_STRUCT_FLAG_BITS	= 2,
+#else
+	WORK_STRUCT_FLAG_BITS	= 1,
 #endif
 
 	WORK_STRUCT_PENDING	= 1 << WORK_STRUCT_PENDING_BIT,
@@ -35,7 +38,7 @@ enum {
 	WORK_STRUCT_STATIC	= 0,
 #endif
 
-	WORK_STRUCT_FLAG_MASK	= 3UL,
+	WORK_STRUCT_FLAG_MASK	= (1UL << WORK_STRUCT_FLAG_BITS) - 1,
 	WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK,
 };
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dc78956ccf03..74a38499b19a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -46,7 +46,9 @@
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
- * possible cpu).
+ * possible cpu).  The lower WORK_STRUCT_FLAG_BITS of
+ * work_struct->data are used for flags and thus cwqs need to be
+ * aligned at two's power of the number of flag bits.
  */
 struct cpu_workqueue_struct {
 
@@ -59,7 +61,7 @@ struct cpu_workqueue_struct {
 
 	struct workqueue_struct *wq;		/* I: the owning workqueue */
 	struct task_struct	*thread;
-} ____cacheline_aligned;
+};
 
 /*
  * The externally visible workqueue abstraction is an array of
@@ -967,6 +969,53 @@ int current_is_keventd(void)
 
 }
 
+static struct cpu_workqueue_struct *alloc_cwqs(void)
+{
+	/*
+	 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
+	 * Make sure that the alignment isn't lower than that of
+	 * unsigned long long.
+	 */
+	const size_t size = sizeof(struct cpu_workqueue_struct);
+	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
+				   __alignof__(unsigned long long));
+	struct cpu_workqueue_struct *cwqs;
+#ifndef CONFIG_SMP
+	void *ptr;
+
+	/*
+	 * On UP, percpu allocator doesn't honor alignment parameter
+	 * and simply uses arch-dependent default.  Allocate enough
+	 * room to align cwq and put an extra pointer at the end
+	 * pointing back to the originally allocated pointer which
+	 * will be used for free.
+	 *
+	 * FIXME: This really belongs to UP percpu code.  Update UP
+	 * percpu code to honor alignment and remove this ugliness.
+	 */
+	ptr = __alloc_percpu(size + align + sizeof(void *), 1);
+	cwqs = PTR_ALIGN(ptr, align);
+	*(void **)per_cpu_ptr(cwqs + 1, 0) = ptr;
+#else
+	/* On SMP, percpu allocator can do it itself */
+	cwqs = __alloc_percpu(size, align);
+#endif
+	/* just in case, make sure it's actually aligned */
+	BUG_ON(!IS_ALIGNED((unsigned long)cwqs, align));
+	return cwqs;
+}
+
+static void free_cwqs(struct cpu_workqueue_struct *cwqs)
+{
+#ifndef CONFIG_SMP
+	/* on UP, the pointer to free is stored right after the cwq */
+	if (cwqs)
+		free_percpu(*(void **)per_cpu_ptr(cwqs + 1, 0));
+#else
+	free_percpu(cwqs);
+#endif
+}
+
 static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 	struct workqueue_struct *wq = cwq->wq;
@@ -1012,7 +1061,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	if (!wq)
 		goto err;
 
-	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
+	wq->cpu_wq = alloc_cwqs();
 	if (!wq->cpu_wq)
 		goto err;
 
@@ -1031,6 +1080,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	for_each_possible_cpu(cpu) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
+		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
 		cwq->wq = wq;
 		cwq->cpu = cpu;
 		spin_lock_init(&cwq->lock);
@@ -1059,7 +1109,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	return wq;
 err:
 	if (wq) {
-		free_percpu(wq->cpu_wq);
+		free_cwqs(wq->cpu_wq);
 		kfree(wq);
 	}
 	return NULL;
@@ -1112,7 +1162,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	for_each_possible_cpu(cpu)
 		cleanup_workqueue_thread(get_cwq(cpu, wq));
 
-	free_percpu(wq->cpu_wq);
+	free_cwqs(wq->cpu_wq);
 	kfree(wq);
 }
 EXPORT_SYMBOL_GPL(destroy_workqueue);
-- 
cgit v1.2.3-59-g8ed1b


From 73f53c4aa732eced5fcb1844d3d452c30905f20f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:11 +0200
Subject: workqueue: reimplement workqueue flushing using color coded works

Reimplement workqueue flushing using color coded works.  wq has the
current work color which is painted on the works being issued via
cwqs.  Flushing a workqueue is achieved by advancing the current work
colors of cwqs and waiting for all the works which have any of the
previous colors to drain.

Currently there are 16 possible colors, one is reserved for no color
and 15 colors are useable allowing 14 concurrent flushes.  When color
space gets full, flush attempts are batched up and processed together
when color frees up, so even with many concurrent flushers, the new
implementation won't build up huge queue of flushers which has to be
processed one after another.

Only works which are queued via __queue_work() are colored.  Works
which are directly put on queue using insert_work() use NO_COLOR and
don't participate in workqueue flushing.  Currently only works used
for work-specific flush fall in this category.

This new implementation leaves only cleanup_workqueue_thread() as the
user of flush_cpu_workqueue().  Just make its users use
flush_workqueue() and kthread_stop() directly and kill
cleanup_workqueue_thread().  As workqueue flushing doesn't use barrier
request anymore, the comment describing the complex synchronization
around it in cleanup_workqueue_thread() is removed together with the
function.

This new implementation is to allow having and sharing multiple
workers per cpu.

Please note that one more bit is reserved for a future work flag by
this patch.  This is to avoid shifting bits and updating comments
later.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |  21 ++-
 kernel/workqueue.c        | 355 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 322 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index b90958a037dc..8762f62103d8 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -26,11 +26,13 @@ enum {
 	WORK_STRUCT_PENDING_BIT	= 0,	/* work item is pending execution */
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 	WORK_STRUCT_STATIC_BIT	= 1,	/* static initializer (debugobjects) */
-	WORK_STRUCT_FLAG_BITS	= 2,
+	WORK_STRUCT_COLOR_SHIFT	= 3,	/* color for workqueue flushing */
 #else
-	WORK_STRUCT_FLAG_BITS	= 1,
+	WORK_STRUCT_COLOR_SHIFT	= 2,	/* color for workqueue flushing */
 #endif
 
+	WORK_STRUCT_COLOR_BITS	= 4,
+
 	WORK_STRUCT_PENDING	= 1 << WORK_STRUCT_PENDING_BIT,
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 	WORK_STRUCT_STATIC	= 1 << WORK_STRUCT_STATIC_BIT,
@@ -38,6 +40,21 @@ enum {
 	WORK_STRUCT_STATIC	= 0,
 #endif
 
+	/*
+	 * The last color is no color used for works which don't
+	 * participate in workqueue flushing.
+	 */
+	WORK_NR_COLORS		= (1 << WORK_STRUCT_COLOR_BITS) - 1,
+	WORK_NO_COLOR		= WORK_NR_COLORS,
+
+	/*
+	 * Reserve 6 bits off of cwq pointer w/ debugobjects turned
+	 * off.  This makes cwqs aligned to 64 bytes which isn't too
+	 * excessive while allowing 15 workqueue flush colors.
+	 */
+	WORK_STRUCT_FLAG_BITS	= WORK_STRUCT_COLOR_SHIFT +
+				  WORK_STRUCT_COLOR_BITS,
+
 	WORK_STRUCT_FLAG_MASK	= (1UL << WORK_STRUCT_FLAG_BITS) - 1,
 	WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK,
 };
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 74a38499b19a..56e47c59d73b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,6 +41,8 @@
  *
  * L: cwq->lock protected.  Access with cwq->lock held.
  *
+ * F: wq->flush_mutex protected.
+ *
  * W: workqueue_lock protected.
  */
 
@@ -60,9 +62,22 @@ struct cpu_workqueue_struct {
 	unsigned int		cpu;
 
 	struct workqueue_struct *wq;		/* I: the owning workqueue */
+	int			work_color;	/* L: current color */
+	int			flush_color;	/* L: flushing color */
+	int			nr_in_flight[WORK_NR_COLORS];
+						/* L: nr of in_flight works */
 	struct task_struct	*thread;
 };
 
+/*
+ * Structure used to wait for workqueue flush.
+ */
+struct wq_flusher {
+	struct list_head	list;		/* F: list of flushers */
+	int			flush_color;	/* F: flush color waiting for */
+	struct completion	done;		/* flush completion */
+};
+
 /*
  * The externally visible workqueue abstraction is an array of
  * per-CPU workqueues:
@@ -71,6 +86,15 @@ struct workqueue_struct {
 	unsigned int		flags;		/* I: WQ_* flags */
 	struct cpu_workqueue_struct *cpu_wq;	/* I: cwq's */
 	struct list_head	list;		/* W: list of all workqueues */
+
+	struct mutex		flush_mutex;	/* protects wq flushing */
+	int			work_color;	/* F: current work color */
+	int			flush_color;	/* F: current flush color */
+	atomic_t		nr_cwqs_to_flush; /* flush in progress */
+	struct wq_flusher	*first_flusher;	/* F: first flusher */
+	struct list_head	flusher_queue;	/* F: flush waiters */
+	struct list_head	flusher_overflow; /* F: flush overflow list */
+
 	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	lockdep_map;
@@ -207,6 +231,22 @@ static struct cpu_workqueue_struct *target_cwq(unsigned int cpu,
 	return get_cwq(cpu, wq);
 }
 
+static unsigned int work_color_to_flags(int color)
+{
+	return color << WORK_STRUCT_COLOR_SHIFT;
+}
+
+static int get_work_color(struct work_struct *work)
+{
+	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
+		((1 << WORK_STRUCT_COLOR_BITS) - 1);
+}
+
+static int work_next_color(int color)
+{
+	return (color + 1) % WORK_NR_COLORS;
+}
+
 /*
  * Set the workqueue on which a work item is to be run
  * - Must *only* be called if the pending flag is set
@@ -273,7 +313,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	debug_work_activate(work);
 	spin_lock_irqsave(&cwq->lock, flags);
 	BUG_ON(!list_empty(&work->entry));
-	insert_work(cwq, work, &cwq->worklist, 0);
+	cwq->nr_in_flight[cwq->work_color]++;
+	insert_work(cwq, work, &cwq->worklist,
+		    work_color_to_flags(cwq->work_color));
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
@@ -386,6 +428,44 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
+/**
+ * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
+ * @cwq: cwq of interest
+ * @color: color of work which left the queue
+ *
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ *
+ * CONTEXT:
+ * spin_lock_irq(cwq->lock).
+ */
+static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
+{
+	/* ignore uncolored works */
+	if (color == WORK_NO_COLOR)
+		return;
+
+	cwq->nr_in_flight[color]--;
+
+	/* is flush in progress and are we at the flushing tip? */
+	if (likely(cwq->flush_color != color))
+		return;
+
+	/* are there still in-flight works? */
+	if (cwq->nr_in_flight[color])
+		return;
+
+	/* this cwq is done, clear flush_color */
+	cwq->flush_color = -1;
+
+	/*
+	 * If this was the last cwq, wake up the first flusher.  It
+	 * will handle the rest.
+	 */
+	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+		complete(&cwq->wq->first_flusher->done);
+}
+
 /**
  * process_one_work - process single work
  * @cwq: cwq to process work for
@@ -404,6 +484,7 @@ static void process_one_work(struct cpu_workqueue_struct *cwq,
 			     struct work_struct *work)
 {
 	work_func_t f = work->func;
+	int work_color;
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * It is permissible to free the struct work_struct from
@@ -417,6 +498,7 @@ static void process_one_work(struct cpu_workqueue_struct *cwq,
 	/* claim and process */
 	debug_work_deactivate(work);
 	cwq->current_work = work;
+	work_color = get_work_color(work);
 	list_del_init(&work->entry);
 
 	spin_unlock_irq(&cwq->lock);
@@ -443,6 +525,7 @@ static void process_one_work(struct cpu_workqueue_struct *cwq,
 
 	/* we're done with it, release */
 	cwq->current_work = NULL;
+	cwq_dec_nr_in_flight(cwq, work_color);
 }
 
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
@@ -529,29 +612,78 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 	init_completion(&barr->done);
 
 	debug_work_activate(&barr->work);
-	insert_work(cwq, &barr->work, head, 0);
+	insert_work(cwq, &barr->work, head, work_color_to_flags(WORK_NO_COLOR));
 }
 
-static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
+/**
+ * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * @wq: workqueue being flushed
+ * @flush_color: new flush color, < 0 for no-op
+ * @work_color: new work color, < 0 for no-op
+ *
+ * Prepare cwqs for workqueue flushing.
+ *
+ * If @flush_color is non-negative, flush_color on all cwqs should be
+ * -1.  If no cwq has in-flight commands at the specified color, all
+ * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
+ * has in flight commands, its cwq->flush_color is set to
+ * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
+ * wakeup logic is armed and %true is returned.
+ *
+ * The caller should have initialized @wq->first_flusher prior to
+ * calling this function with non-negative @flush_color.  If
+ * @flush_color is negative, no flush color update is done and %false
+ * is returned.
+ *
+ * If @work_color is non-negative, all cwqs should have the same
+ * work_color which is previous to @work_color and all will be
+ * advanced to @work_color.
+ *
+ * CONTEXT:
+ * mutex_lock(wq->flush_mutex).
+ *
+ * RETURNS:
+ * %true if @flush_color >= 0 and there's something to flush.  %false
+ * otherwise.
+ */
+static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
+				      int flush_color, int work_color)
 {
-	int active = 0;
-	struct wq_barrier barr;
+	bool wait = false;
+	unsigned int cpu;
 
-	WARN_ON(cwq->thread == current);
-
-	spin_lock_irq(&cwq->lock);
-	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-		insert_wq_barrier(cwq, &barr, &cwq->worklist);
-		active = 1;
+	if (flush_color >= 0) {
+		BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
+		atomic_set(&wq->nr_cwqs_to_flush, 1);
 	}
-	spin_unlock_irq(&cwq->lock);
 
-	if (active) {
-		wait_for_completion(&barr.done);
-		destroy_work_on_stack(&barr.work);
+	for_each_possible_cpu(cpu) {
+		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+		spin_lock_irq(&cwq->lock);
+
+		if (flush_color >= 0) {
+			BUG_ON(cwq->flush_color != -1);
+
+			if (cwq->nr_in_flight[flush_color]) {
+				cwq->flush_color = flush_color;
+				atomic_inc(&wq->nr_cwqs_to_flush);
+				wait = true;
+			}
+		}
+
+		if (work_color >= 0) {
+			BUG_ON(work_color != work_next_color(cwq->work_color));
+			cwq->work_color = work_color;
+		}
+
+		spin_unlock_irq(&cwq->lock);
 	}
 
-	return active;
+	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+		complete(&wq->first_flusher->done);
+
+	return wait;
 }
 
 /**
@@ -566,13 +698,143 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
  */
 void flush_workqueue(struct workqueue_struct *wq)
 {
-	int cpu;
+	struct wq_flusher this_flusher = {
+		.list = LIST_HEAD_INIT(this_flusher.list),
+		.flush_color = -1,
+		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+	};
+	int next_color;
 
-	might_sleep();
 	lock_map_acquire(&wq->lockdep_map);
 	lock_map_release(&wq->lockdep_map);
-	for_each_possible_cpu(cpu)
-		flush_cpu_workqueue(get_cwq(cpu, wq));
+
+	mutex_lock(&wq->flush_mutex);
+
+	/*
+	 * Start-to-wait phase
+	 */
+	next_color = work_next_color(wq->work_color);
+
+	if (next_color != wq->flush_color) {
+		/*
+		 * Color space is not full.  The current work_color
+		 * becomes our flush_color and work_color is advanced
+		 * by one.
+		 */
+		BUG_ON(!list_empty(&wq->flusher_overflow));
+		this_flusher.flush_color = wq->work_color;
+		wq->work_color = next_color;
+
+		if (!wq->first_flusher) {
+			/* no flush in progress, become the first flusher */
+			BUG_ON(wq->flush_color != this_flusher.flush_color);
+
+			wq->first_flusher = &this_flusher;
+
+			if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
+						       wq->work_color)) {
+				/* nothing to flush, done */
+				wq->flush_color = next_color;
+				wq->first_flusher = NULL;
+				goto out_unlock;
+			}
+		} else {
+			/* wait in queue */
+			BUG_ON(wq->flush_color == this_flusher.flush_color);
+			list_add_tail(&this_flusher.list, &wq->flusher_queue);
+			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+		}
+	} else {
+		/*
+		 * Oops, color space is full, wait on overflow queue.
+		 * The next flush completion will assign us
+		 * flush_color and transfer to flusher_queue.
+		 */
+		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
+	}
+
+	mutex_unlock(&wq->flush_mutex);
+
+	wait_for_completion(&this_flusher.done);
+
+	/*
+	 * Wake-up-and-cascade phase
+	 *
+	 * First flushers are responsible for cascading flushes and
+	 * handling overflow.  Non-first flushers can simply return.
+	 */
+	if (wq->first_flusher != &this_flusher)
+		return;
+
+	mutex_lock(&wq->flush_mutex);
+
+	wq->first_flusher = NULL;
+
+	BUG_ON(!list_empty(&this_flusher.list));
+	BUG_ON(wq->flush_color != this_flusher.flush_color);
+
+	while (true) {
+		struct wq_flusher *next, *tmp;
+
+		/* complete all the flushers sharing the current flush color */
+		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
+			if (next->flush_color != wq->flush_color)
+				break;
+			list_del_init(&next->list);
+			complete(&next->done);
+		}
+
+		BUG_ON(!list_empty(&wq->flusher_overflow) &&
+		       wq->flush_color != work_next_color(wq->work_color));
+
+		/* this flush_color is finished, advance by one */
+		wq->flush_color = work_next_color(wq->flush_color);
+
+		/* one color has been freed, handle overflow queue */
+		if (!list_empty(&wq->flusher_overflow)) {
+			/*
+			 * Assign the same color to all overflowed
+			 * flushers, advance work_color and append to
+			 * flusher_queue.  This is the start-to-wait
+			 * phase for these overflowed flushers.
+			 */
+			list_for_each_entry(tmp, &wq->flusher_overflow, list)
+				tmp->flush_color = wq->work_color;
+
+			wq->work_color = work_next_color(wq->work_color);
+
+			list_splice_tail_init(&wq->flusher_overflow,
+					      &wq->flusher_queue);
+			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+		}
+
+		if (list_empty(&wq->flusher_queue)) {
+			BUG_ON(wq->flush_color != wq->work_color);
+			break;
+		}
+
+		/*
+		 * Need to flush more colors.  Make the next flusher
+		 * the new first flusher and arm cwqs.
+		 */
+		BUG_ON(wq->flush_color == wq->work_color);
+		BUG_ON(wq->flush_color != next->flush_color);
+
+		list_del_init(&next->list);
+		wq->first_flusher = next;
+
+		if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
+			break;
+
+		/*
+		 * Meh... this color is already done, clear first
+		 * flusher and repeat cascading.
+		 */
+		wq->first_flusher = NULL;
+	}
+
+out_unlock:
+	mutex_unlock(&wq->flush_mutex);
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
@@ -659,6 +921,7 @@ static int try_to_grab_pending(struct work_struct *work)
 		if (cwq == get_wq_data(work)) {
 			debug_work_deactivate(work);
 			list_del_init(&work->entry);
+			cwq_dec_nr_in_flight(cwq, get_work_color(work));
 			ret = 1;
 		}
 	}
@@ -1066,6 +1329,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		goto err;
 
 	wq->flags = flags;
+	mutex_init(&wq->flush_mutex);
+	atomic_set(&wq->nr_cwqs_to_flush, 0);
+	INIT_LIST_HEAD(&wq->flusher_queue);
+	INIT_LIST_HEAD(&wq->flusher_overflow);
 	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	INIT_LIST_HEAD(&wq->list);
@@ -1083,6 +1350,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
 		cwq->wq = wq;
 		cwq->cpu = cpu;
+		cwq->flush_color = -1;
 		spin_lock_init(&cwq->lock);
 		INIT_LIST_HEAD(&cwq->worklist);
 		init_waitqueue_head(&cwq->more_work);
@@ -1116,33 +1384,6 @@ err:
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
-static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
-{
-	/*
-	 * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
-	 * cpu_add_remove_lock protects cwq->thread.
-	 */
-	if (cwq->thread == NULL)
-		return;
-
-	lock_map_acquire(&cwq->wq->lockdep_map);
-	lock_map_release(&cwq->wq->lockdep_map);
-
-	flush_cpu_workqueue(cwq);
-	/*
-	 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
-	 * a concurrent flush_workqueue() can insert a barrier after us.
-	 * However, in that case run_workqueue() won't return and check
-	 * kthread_should_stop() until it flushes all work_struct's.
-	 * When ->worklist becomes empty it is safe to exit because no
-	 * more work_structs can be queued on this cwq: flush_workqueue
-	 * checks list_empty(), and a "normal" queue_work() can't use
-	 * a dead CPU.
-	 */
-	kthread_stop(cwq->thread);
-	cwq->thread = NULL;
-}
-
 /**
  * destroy_workqueue - safely terminate a workqueue
  * @wq: target workqueue
@@ -1159,8 +1400,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	spin_unlock(&workqueue_lock);
 	cpu_maps_update_done();
 
-	for_each_possible_cpu(cpu)
-		cleanup_workqueue_thread(get_cwq(cpu, wq));
+	flush_workqueue(wq);
+
+	for_each_possible_cpu(cpu) {
+		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+		int i;
+
+		if (cwq->thread) {
+			kthread_stop(cwq->thread);
+			cwq->thread = NULL;
+		}
+
+		for (i = 0; i < WORK_NR_COLORS; i++)
+			BUG_ON(cwq->nr_in_flight[i]);
+	}
 
 	free_cwqs(wq->cpu_wq);
 	kfree(wq);
@@ -1185,9 +1438,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 		switch (action) {
 		case CPU_POST_DEAD:
-			lock_map_acquire(&cwq->wq->lockdep_map);
-			lock_map_release(&cwq->wq->lockdep_map);
-			flush_cpu_workqueue(cwq);
+			flush_workqueue(wq);
 			break;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From affee4b294a0fc97d67c8a77dc080c4dd262a79e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:12 +0200
Subject: workqueue: reimplement work flushing using linked works

A work is linked to the next one by having WORK_STRUCT_LINKED bit set
and these links can be chained.  When a linked work is dispatched to a
worker, all linked works are dispatched to the worker's newly added
->scheduled queue and processed back-to-back.

Currently, as there's only single worker per cwq, having linked works
doesn't make any visible behavior difference.  This change is to
prepare for multiple shared workers per cpu.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |   4 +-
 kernel/workqueue.c        | 152 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 134 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 8762f62103d8..4f4fdba722c3 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -24,8 +24,9 @@ typedef void (*work_func_t)(struct work_struct *work);
 
 enum {
 	WORK_STRUCT_PENDING_BIT	= 0,	/* work item is pending execution */
+	WORK_STRUCT_LINKED_BIT	= 1,	/* next work is linked to this one */
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
-	WORK_STRUCT_STATIC_BIT	= 1,	/* static initializer (debugobjects) */
+	WORK_STRUCT_STATIC_BIT	= 2,	/* static initializer (debugobjects) */
 	WORK_STRUCT_COLOR_SHIFT	= 3,	/* color for workqueue flushing */
 #else
 	WORK_STRUCT_COLOR_SHIFT	= 2,	/* color for workqueue flushing */
@@ -34,6 +35,7 @@ enum {
 	WORK_STRUCT_COLOR_BITS	= 4,
 
 	WORK_STRUCT_PENDING	= 1 << WORK_STRUCT_PENDING_BIT,
+	WORK_STRUCT_LINKED	= 1 << WORK_STRUCT_LINKED_BIT,
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 	WORK_STRUCT_STATIC	= 1 << WORK_STRUCT_STATIC_BIT,
 #else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 600db10a4dbf..9953d3c7bd10 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -51,6 +51,7 @@ struct cpu_workqueue_struct;
 
 struct worker {
 	struct work_struct	*current_work;	/* L: work being processed */
+	struct list_head	scheduled;	/* L: scheduled works */
 	struct task_struct	*task;		/* I: worker task */
 	struct cpu_workqueue_struct *cwq;	/* I: the associated cwq */
 	int			id;		/* I: worker id */
@@ -445,6 +446,8 @@ static struct worker *alloc_worker(void)
 	struct worker *worker;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+	if (worker)
+		INIT_LIST_HEAD(&worker->scheduled);
 	return worker;
 }
 
@@ -530,6 +533,7 @@ static void destroy_worker(struct worker *worker)
 
 	/* sanity check frenzy */
 	BUG_ON(worker->current_work);
+	BUG_ON(!list_empty(&worker->scheduled));
 
 	kthread_stop(worker->task);
 	kfree(worker);
@@ -539,6 +543,47 @@ static void destroy_worker(struct worker *worker)
 	spin_unlock(&workqueue_lock);
 }
 
+/**
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out paramter for nested worklist walking
+ *
+ * Schedule linked works starting from @work to @head.  Work series to
+ * be scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of
+ * the last scheduled work.  This allows move_linked_works() to be
+ * nested inside outer list_for_each_entry_safe().
+ *
+ * CONTEXT:
+ * spin_lock_irq(cwq->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+			      struct work_struct **nextp)
+{
+	struct work_struct *n;
+
+	/*
+	 * Linked worklist will always end before the end of the list,
+	 * use NULL for list head.
+	 */
+	list_for_each_entry_safe_from(work, n, NULL, entry) {
+		list_move_tail(&work->entry, head);
+		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
+			break;
+	}
+
+	/*
+	 * If we're already inside safe list traversal and have moved
+	 * multiple works to the scheduled queue, the next position
+	 * needs to be updated.
+	 */
+	if (nextp)
+		*nextp = n;
+}
+
 /**
  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
  * @cwq: cwq of interest
@@ -639,17 +684,25 @@ static void process_one_work(struct worker *worker, struct work_struct *work)
 	cwq_dec_nr_in_flight(cwq, work_color);
 }
 
-static void run_workqueue(struct worker *worker)
+/**
+ * process_scheduled_works - process scheduled works
+ * @worker: self
+ *
+ * Process all scheduled works.  Please note that the scheduled list
+ * may change while processing a work, so this function repeatedly
+ * fetches a work from the top and executes it.
+ *
+ * CONTEXT:
+ * spin_lock_irq(cwq->lock) which may be released and regrabbed
+ * multiple times.
+ */
+static void process_scheduled_works(struct worker *worker)
 {
-	struct cpu_workqueue_struct *cwq = worker->cwq;
-
-	spin_lock_irq(&cwq->lock);
-	while (!list_empty(&cwq->worklist)) {
-		struct work_struct *work = list_entry(cwq->worklist.next,
+	while (!list_empty(&worker->scheduled)) {
+		struct work_struct *work = list_first_entry(&worker->scheduled,
 						struct work_struct, entry);
 		process_one_work(worker, work);
 	}
-	spin_unlock_irq(&cwq->lock);
 }
 
 /**
@@ -684,7 +737,28 @@ static int worker_thread(void *__worker)
 					    get_cpu_mask(cwq->cpu))))
 			set_cpus_allowed_ptr(worker->task,
 					     get_cpu_mask(cwq->cpu));
-		run_workqueue(worker);
+
+		spin_lock_irq(&cwq->lock);
+
+		while (!list_empty(&cwq->worklist)) {
+			struct work_struct *work =
+				list_first_entry(&cwq->worklist,
+						 struct work_struct, entry);
+
+			if (likely(!(*work_data_bits(work) &
+				     WORK_STRUCT_LINKED))) {
+				/* optimization path, not strictly necessary */
+				process_one_work(worker, work);
+				if (unlikely(!list_empty(&worker->scheduled)))
+					process_scheduled_works(worker);
+			} else {
+				move_linked_works(work, &worker->scheduled,
+						  NULL);
+				process_scheduled_works(worker);
+			}
+		}
+
+		spin_unlock_irq(&cwq->lock);
 	}
 
 	return 0;
@@ -705,16 +779,33 @@ static void wq_barrier_func(struct work_struct *work)
  * insert_wq_barrier - insert a barrier work
  * @cwq: cwq to insert barrier into
  * @barr: wq_barrier to insert
- * @head: insertion point
+ * @target: target work to attach @barr to
+ * @worker: worker currently executing @target, NULL if @target is not executing
  *
- * Insert barrier @barr into @cwq before @head.
+ * @barr is linked to @target such that @barr is completed only after
+ * @target finishes execution.  Please note that the ordering
+ * guarantee is observed only with respect to @target and on the local
+ * cpu.
+ *
+ * Currently, a queued barrier can't be canceled.  This is because
+ * try_to_grab_pending() can't determine whether the work to be
+ * grabbed is at the head of the queue and thus can't clear LINKED
+ * flag of the previous work while there must be a valid next work
+ * after a work with LINKED flag set.
+ *
+ * Note that when @worker is non-NULL, @target may be modified
+ * underneath us, so we can't reliably determine cwq from @target.
  *
  * CONTEXT:
  * spin_lock_irq(cwq->lock).
  */
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
-			struct wq_barrier *barr, struct list_head *head)
+			      struct wq_barrier *barr,
+			      struct work_struct *target, struct worker *worker)
 {
+	struct list_head *head;
+	unsigned int linked = 0;
+
 	/*
 	 * debugobject calls are safe here even with cwq->lock locked
 	 * as we know for sure that this will not trigger any of the
@@ -725,8 +816,24 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
 	init_completion(&barr->done);
 
+	/*
+	 * If @target is currently being executed, schedule the
+	 * barrier to the worker; otherwise, put it after @target.
+	 */
+	if (worker)
+		head = worker->scheduled.next;
+	else {
+		unsigned long *bits = work_data_bits(target);
+
+		head = target->entry.next;
+		/* there can already be other linked works, inherit and set */
+		linked = *bits & WORK_STRUCT_LINKED;
+		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
+	}
+
 	debug_work_activate(&barr->work);
-	insert_work(cwq, &barr->work, head, work_color_to_flags(WORK_NO_COLOR));
+	insert_work(cwq, &barr->work, head,
+		    work_color_to_flags(WORK_NO_COLOR) | linked);
 }
 
 /**
@@ -964,8 +1071,8 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
  */
 int flush_work(struct work_struct *work)
 {
+	struct worker *worker = NULL;
 	struct cpu_workqueue_struct *cwq;
-	struct list_head *prev;
 	struct wq_barrier barr;
 
 	might_sleep();
@@ -985,14 +1092,14 @@ int flush_work(struct work_struct *work)
 		smp_rmb();
 		if (unlikely(cwq != get_wq_data(work)))
 			goto already_gone;
-		prev = &work->entry;
 	} else {
-		if (!cwq->worker || cwq->worker->current_work != work)
+		if (cwq->worker && cwq->worker->current_work == work)
+			worker = cwq->worker;
+		if (!worker)
 			goto already_gone;
-		prev = &cwq->worklist;
 	}
-	insert_wq_barrier(cwq, &barr, prev->next);
 
+	insert_wq_barrier(cwq, &barr, work, worker);
 	spin_unlock_irq(&cwq->lock);
 	wait_for_completion(&barr.done);
 	destroy_work_on_stack(&barr.work);
@@ -1048,16 +1155,19 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
 				struct work_struct *work)
 {
 	struct wq_barrier barr;
-	int running = 0;
+	struct worker *worker;
 
 	spin_lock_irq(&cwq->lock);
+
+	worker = NULL;
 	if (unlikely(cwq->worker && cwq->worker->current_work == work)) {
-		insert_wq_barrier(cwq, &barr, cwq->worklist.next);
-		running = 1;
+		worker = cwq->worker;
+		insert_wq_barrier(cwq, &barr, work, worker);
 	}
+
 	spin_unlock_irq(&cwq->lock);
 
-	if (unlikely(running)) {
+	if (unlikely(worker)) {
 		wait_for_completion(&barr.done);
 		destroy_work_on_stack(&barr.work);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 1e19ffc63dbbaea7a7d1c63d99c38d3e5a4c7edf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:12 +0200
Subject: workqueue: implement per-cwq active work limit

Add cwq->nr_active, cwq->max_active and cwq->delayed_work.  nr_active
counts the number of active works per cwq.  A work is active if it's
flushable (colored) and is on cwq's worklist.  If nr_active reaches
max_active, new works are queued on cwq->delayed_work and activated
later as works on the cwq complete and decrement nr_active.

cwq->max_active can be specified via the new @max_active parameter to
__create_workqueue() and is set to 1 for all workqueues for now.  As
each cwq has only single worker now, this double queueing doesn't
cause any behavior difference visible to its users.

This will be used to reimplement freeze/thaw and implement shared
worker pool.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 18 +++++++++---------
 kernel/workqueue.c        | 39 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 46 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 4f4fdba722c3..eb753b7790e5 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -225,11 +225,11 @@ enum {
 };
 
 extern struct workqueue_struct *
-__create_workqueue_key(const char *name, unsigned int flags,
+__create_workqueue_key(const char *name, unsigned int flags, int max_active,
 		       struct lock_class_key *key, const char *lock_name);
 
 #ifdef CONFIG_LOCKDEP
-#define __create_workqueue(name, flags)				\
+#define __create_workqueue(name, flags, max_active)		\
 ({								\
 	static struct lock_class_key __key;			\
 	const char *__lock_name;				\
@@ -239,20 +239,20 @@ __create_workqueue_key(const char *name, unsigned int flags,
 	else							\
 		__lock_name = #name;				\
 								\
-	__create_workqueue_key((name), (flags), &__key,		\
-			       __lock_name);			\
+	__create_workqueue_key((name), (flags), (max_active),	\
+				&__key, __lock_name);		\
 })
 #else
-#define __create_workqueue(name, flags)				\
-	__create_workqueue_key((name), (flags), NULL, NULL)
+#define __create_workqueue(name, flags, max_active)		\
+	__create_workqueue_key((name), (flags), (max_active), NULL, NULL)
 #endif
 
 #define create_workqueue(name)					\
-	__create_workqueue((name), 0)
+	__create_workqueue((name), 0, 1)
 #define create_freezeable_workqueue(name)			\
-	__create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD)
+	__create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD, 1)
 #define create_singlethread_workqueue(name)			\
-	__create_workqueue((name), WQ_SINGLE_THREAD)
+	__create_workqueue((name), WQ_SINGLE_THREAD, 1)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9953d3c7bd10..e541b5db67dd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -77,6 +77,9 @@ struct cpu_workqueue_struct {
 	int			flush_color;	/* L: flushing color */
 	int			nr_in_flight[WORK_NR_COLORS];
 						/* L: nr of in_flight works */
+	int			nr_active;	/* L: nr of active works */
+	int			max_active;	/* I: max active works */
+	struct list_head	delayed_works;	/* L: delayed works */
 };
 
 /*
@@ -321,14 +324,24 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			 struct work_struct *work)
 {
 	struct cpu_workqueue_struct *cwq = target_cwq(cpu, wq);
+	struct list_head *worklist;
 	unsigned long flags;
 
 	debug_work_activate(work);
+
 	spin_lock_irqsave(&cwq->lock, flags);
 	BUG_ON(!list_empty(&work->entry));
+
 	cwq->nr_in_flight[cwq->work_color]++;
-	insert_work(cwq, work, &cwq->worklist,
-		    work_color_to_flags(cwq->work_color));
+
+	if (likely(cwq->nr_active < cwq->max_active)) {
+		cwq->nr_active++;
+		worklist = &cwq->worklist;
+	} else
+		worklist = &cwq->delayed_works;
+
+	insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color));
+
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
@@ -584,6 +597,15 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
 		*nextp = n;
 }
 
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+	struct work_struct *work = list_first_entry(&cwq->delayed_works,
+						    struct work_struct, entry);
+
+	move_linked_works(work, &cwq->worklist, NULL);
+	cwq->nr_active++;
+}
+
 /**
  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
  * @cwq: cwq of interest
@@ -602,6 +624,12 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
 		return;
 
 	cwq->nr_in_flight[color]--;
+	cwq->nr_active--;
+
+	/* one down, submit a delayed one */
+	if (!list_empty(&cwq->delayed_works) &&
+	    cwq->nr_active < cwq->max_active)
+		cwq_activate_first_delayed(cwq);
 
 	/* is flush in progress and are we at the flushing tip? */
 	if (likely(cwq->flush_color != color))
@@ -1505,6 +1533,7 @@ static void free_cwqs(struct cpu_workqueue_struct *cwqs)
 
 struct workqueue_struct *__create_workqueue_key(const char *name,
 						unsigned int flags,
+						int max_active,
 						struct lock_class_key *key,
 						const char *lock_name)
 {
@@ -1513,6 +1542,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	bool failed = false;
 	unsigned int cpu;
 
+	max_active = clamp_val(max_active, 1, INT_MAX);
+
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
 		goto err;
@@ -1544,8 +1575,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		cwq->cpu = cpu;
 		cwq->wq = wq;
 		cwq->flush_color = -1;
+		cwq->max_active = max_active;
 		spin_lock_init(&cwq->lock);
 		INIT_LIST_HEAD(&cwq->worklist);
+		INIT_LIST_HEAD(&cwq->delayed_works);
 		init_waitqueue_head(&cwq->more_work);
 
 		if (failed)
@@ -1607,6 +1640,8 @@ void destroy_workqueue(struct workqueue_struct *wq)
 
 		for (i = 0; i < WORK_NR_COLORS; i++)
 			BUG_ON(cwq->nr_in_flight[i]);
+		BUG_ON(cwq->nr_active);
+		BUG_ON(!list_empty(&cwq->delayed_works));
 	}
 
 	free_cwqs(wq->cpu_wq);
-- 
cgit v1.2.3-59-g8ed1b


From a0a1a5fd4fb15ec61117c759fe9f5c16c53d9e9c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:12 +0200
Subject: workqueue: reimplement workqueue freeze using max_active

Currently, workqueue freezing is implemented by marking the worker
freezeable and calling try_to_freeze() from dispatch loop.
Reimplement it using cwq->limit so that the workqueue is frozen
instead of the worker.

* workqueue_struct->saved_max_active is added which stores the
  specified max_active on initialization.

* On freeze, all cwq->max_active's are quenched to zero.  Freezing is
  complete when nr_active on all cwqs reach zero.

* On thaw, all cwq->max_active's are restored to wq->saved_max_active
  and the worklist is repopulated.

This new implementation allows having single shared pool of workers
per cpu.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |   7 ++
 kernel/power/process.c    |  21 +++++-
 kernel/workqueue.c        | 163 +++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 179 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index eb753b7790e5..ab0b7fb99bc2 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -340,4 +340,11 @@ static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 #else
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg);
 #endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FREEZER
+extern void freeze_workqueues_begin(void);
+extern bool freeze_workqueues_busy(void);
+extern void thaw_workqueues(void);
+#endif /* CONFIG_FREEZER */
+
 #endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..028a99598f49 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
 #include <linux/syscalls.h>
 #include <linux/freezer.h>
 #include <linux/delay.h>
+#include <linux/workqueue.h>
 
 /* 
  * Timeout for stopping processes
@@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only)
 	struct task_struct *g, *p;
 	unsigned long end_time;
 	unsigned int todo;
+	bool wq_busy = false;
 	struct timeval start, end;
 	u64 elapsed_csecs64;
 	unsigned int elapsed_csecs;
@@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only)
 	do_gettimeofday(&start);
 
 	end_time = jiffies + TIMEOUT;
+
+	if (!sig_only)
+		freeze_workqueues_begin();
+
 	while (true) {
 		todo = 0;
 		read_lock(&tasklist_lock);
@@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only)
 				todo++;
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
+
+		if (!sig_only) {
+			wq_busy = freeze_workqueues_busy();
+			todo += wq_busy;
+		}
+
 		if (!todo || time_after(jiffies, end_time))
 			break;
 
@@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only)
 		 */
 		printk("\n");
 		printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
-				"(%d tasks refusing to freeze):\n",
-				elapsed_csecs / 100, elapsed_csecs % 100, todo);
+		       "(%d tasks refusing to freeze, wq_busy=%d):\n",
+		       elapsed_csecs / 100, elapsed_csecs % 100,
+		       todo - wq_busy, wq_busy);
+
+		thaw_workqueues();
+
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			task_lock(p);
@@ -157,6 +173,7 @@ void thaw_processes(void)
 	oom_killer_enable();
 
 	printk("Restarting tasks ... ");
+	thaw_workqueues();
 	thaw_tasks(true);
 	thaw_tasks(false);
 	schedule();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e541b5db67dd..4d059c532792 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -78,7 +78,7 @@ struct cpu_workqueue_struct {
 	int			nr_in_flight[WORK_NR_COLORS];
 						/* L: nr of in_flight works */
 	int			nr_active;	/* L: nr of active works */
-	int			max_active;	/* I: max active works */
+	int			max_active;	/* L: max active works */
 	struct list_head	delayed_works;	/* L: delayed works */
 };
 
@@ -108,6 +108,7 @@ struct workqueue_struct {
 	struct list_head	flusher_queue;	/* F: flush waiters */
 	struct list_head	flusher_overflow; /* F: flush overflow list */
 
+	int			saved_max_active; /* I: saved cwq max_active */
 	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	lockdep_map;
@@ -228,6 +229,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
 static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
 static DEFINE_PER_CPU(struct ida, worker_ida);
+static bool workqueue_freezing;		/* W: have wqs started freezing? */
 
 static int worker_thread(void *__worker);
 
@@ -745,19 +747,13 @@ static int worker_thread(void *__worker)
 	struct cpu_workqueue_struct *cwq = worker->cwq;
 	DEFINE_WAIT(wait);
 
-	if (cwq->wq->flags & WQ_FREEZEABLE)
-		set_freezable();
-
 	for (;;) {
 		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
-		if (!freezing(current) &&
-		    !kthread_should_stop() &&
+		if (!kthread_should_stop() &&
 		    list_empty(&cwq->worklist))
 			schedule();
 		finish_wait(&cwq->more_work, &wait);
 
-		try_to_freeze();
-
 		if (kthread_should_stop())
 			break;
 
@@ -1553,6 +1549,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		goto err;
 
 	wq->flags = flags;
+	wq->saved_max_active = max_active;
 	mutex_init(&wq->flush_mutex);
 	atomic_set(&wq->nr_cwqs_to_flush, 0);
 	INIT_LIST_HEAD(&wq->flusher_queue);
@@ -1591,8 +1588,19 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 			failed = true;
 	}
 
+	/*
+	 * workqueue_lock protects global freeze state and workqueues
+	 * list.  Grab it, set max_active accordingly and add the new
+	 * workqueue to workqueues list.
+	 */
 	spin_lock(&workqueue_lock);
+
+	if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
+		for_each_possible_cpu(cpu)
+			get_cwq(cpu, wq)->max_active = 0;
+
 	list_add(&wq->list, &workqueues);
+
 	spin_unlock(&workqueue_lock);
 
 	cpu_maps_update_done();
@@ -1621,14 +1629,18 @@ void destroy_workqueue(struct workqueue_struct *wq)
 {
 	int cpu;
 
+	flush_workqueue(wq);
+
+	/*
+	 * wq list is used to freeze wq, remove from list after
+	 * flushing is complete in case freeze races us.
+	 */
 	cpu_maps_update_begin();
 	spin_lock(&workqueue_lock);
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 	cpu_maps_update_done();
 
-	flush_workqueue(wq);
-
 	for_each_possible_cpu(cpu) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		int i;
@@ -1722,6 +1734,137 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 EXPORT_SYMBOL_GPL(work_on_cpu);
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_FREEZER
+
+/**
+ * freeze_workqueues_begin - begin freezing workqueues
+ *
+ * Start freezing workqueues.  After this function returns, all
+ * freezeable workqueues will queue new works to their frozen_works
+ * list instead of the cwq ones.
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock and cwq->lock's.
+ */
+void freeze_workqueues_begin(void)
+{
+	struct workqueue_struct *wq;
+	unsigned int cpu;
+
+	spin_lock(&workqueue_lock);
+
+	BUG_ON(workqueue_freezing);
+	workqueue_freezing = true;
+
+	for_each_possible_cpu(cpu) {
+		list_for_each_entry(wq, &workqueues, list) {
+			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+			spin_lock_irq(&cwq->lock);
+
+			if (wq->flags & WQ_FREEZEABLE)
+				cwq->max_active = 0;
+
+			spin_unlock_irq(&cwq->lock);
+		}
+	}
+
+	spin_unlock(&workqueue_lock);
+}
+
+/**
+ * freeze_workqueues_busy - are freezeable workqueues still busy?
+ *
+ * Check whether freezing is complete.  This function must be called
+ * between freeze_workqueues_begin() and thaw_workqueues().
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock.
+ *
+ * RETURNS:
+ * %true if some freezeable workqueues are still busy.  %false if
+ * freezing is complete.
+ */
+bool freeze_workqueues_busy(void)
+{
+	struct workqueue_struct *wq;
+	unsigned int cpu;
+	bool busy = false;
+
+	spin_lock(&workqueue_lock);
+
+	BUG_ON(!workqueue_freezing);
+
+	for_each_possible_cpu(cpu) {
+		/*
+		 * nr_active is monotonically decreasing.  It's safe
+		 * to peek without lock.
+		 */
+		list_for_each_entry(wq, &workqueues, list) {
+			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+			if (!(wq->flags & WQ_FREEZEABLE))
+				continue;
+
+			BUG_ON(cwq->nr_active < 0);
+			if (cwq->nr_active) {
+				busy = true;
+				goto out_unlock;
+			}
+		}
+	}
+out_unlock:
+	spin_unlock(&workqueue_lock);
+	return busy;
+}
+
+/**
+ * thaw_workqueues - thaw workqueues
+ *
+ * Thaw workqueues.  Normal queueing is restored and all collected
+ * frozen works are transferred to their respective cwq worklists.
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock and cwq->lock's.
+ */
+void thaw_workqueues(void)
+{
+	struct workqueue_struct *wq;
+	unsigned int cpu;
+
+	spin_lock(&workqueue_lock);
+
+	if (!workqueue_freezing)
+		goto out_unlock;
+
+	for_each_possible_cpu(cpu) {
+		list_for_each_entry(wq, &workqueues, list) {
+			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+			if (!(wq->flags & WQ_FREEZEABLE))
+				continue;
+
+			spin_lock_irq(&cwq->lock);
+
+			/* restore max_active and repopulate worklist */
+			cwq->max_active = wq->saved_max_active;
+
+			while (!list_empty(&cwq->delayed_works) &&
+			       cwq->nr_active < cwq->max_active)
+				cwq_activate_first_delayed(cwq);
+
+			wake_up(&cwq->more_work);
+
+			spin_unlock_irq(&cwq->lock);
+		}
+	}
+
+	workqueue_freezing = false;
+out_unlock:
+	spin_unlock(&workqueue_lock);
+}
+#endif /* CONFIG_FREEZER */
+
 void __init init_workqueues(void)
 {
 	unsigned int cpu;
-- 
cgit v1.2.3-59-g8ed1b


From db7bccf45cb87522096b8f43144e31ca605a9f24 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:12 +0200
Subject: workqueue: reimplement CPU hotplugging support using trustee

Reimplement CPU hotplugging support using trustee thread.  On CPU
down, a trustee thread is created and each step of CPU down is
executed by the trustee and workqueue_cpu_callback() simply drives and
waits for trustee state transitions.

CPU down operation no longer waits for works to be drained but trustee
sticks around till all pending works have been completed.  If CPU is
brought back up while works are still draining,
workqueue_cpu_callback() tells trustee to step down and tell workers
to rebind to the cpu.

As it's difficult to tell whether cwqs are empty if it's freezing or
frozen, trustee doesn't consider draining to be complete while a gcwq
is freezing or frozen (tracked by new GCWQ_FREEZING flag).  Also,
workers which get unbound from their cpu are marked with WORKER_ROGUE.

Trustee based implementation doesn't bring any new feature at this
point but it will be used to manage worker pool when dynamic shared
worker pool is implemented.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpu.h |   2 +
 kernel/workqueue.c  | 293 +++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 279 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index de6b1722cdca..4823af64e9db 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -71,6 +71,8 @@ enum {
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
+	/* prepare workqueues for other notifiers */
+	CPU_PRI_WORKQUEUE	= 5,
 };
 
 #ifdef CONFIG_SMP
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d64913aa486a..f57855f718d7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,14 +36,27 @@
 #include <linux/idr.h>
 
 enum {
+	/* global_cwq flags */
+	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */
+
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
 	WORKER_DIE		= 1 << 1,	/* die die die */
 	WORKER_IDLE		= 1 << 2,	/* is idle */
+	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */
+
+	/* gcwq->trustee_state */
+	TRUSTEE_START		= 0,		/* start */
+	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
+	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
+	TRUSTEE_RELEASE		= 3,		/* release workers */
+	TRUSTEE_DONE		= 4,		/* trustee is done */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
 	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,
+
+	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
 };
 
 /*
@@ -83,6 +96,7 @@ struct worker {
 struct global_cwq {
 	spinlock_t		lock;		/* the gcwq lock */
 	unsigned int		cpu;		/* I: the associated cpu */
+	unsigned int		flags;		/* L: GCWQ_* flags */
 
 	int			nr_workers;	/* L: total number of workers */
 	int			nr_idle;	/* L: currently idle ones */
@@ -93,6 +107,10 @@ struct global_cwq {
 						/* L: hash of busy workers */
 
 	struct ida		worker_ida;	/* L: for worker IDs */
+
+	struct task_struct	*trustee;	/* L: for gcwq shutdown */
+	unsigned int		trustee_state;	/* L: trustee state */
+	wait_queue_head_t	trustee_wait;	/* trustee wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -148,6 +166,10 @@ struct workqueue_struct {
 #endif
 };
 
+#define for_each_busy_worker(worker, i, pos, gcwq)			\
+	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
+		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
@@ -546,6 +568,9 @@ static void worker_enter_idle(struct worker *worker)
 
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &gcwq->idle_list);
+
+	if (unlikely(worker->flags & WORKER_ROGUE))
+		wake_up_all(&gcwq->trustee_wait);
 }
 
 /**
@@ -622,8 +647,15 @@ static struct worker *create_worker(struct cpu_workqueue_struct *cwq, bool bind)
 	if (IS_ERR(worker->task))
 		goto fail;
 
+	/*
+	 * A rogue worker will become a regular one if CPU comes
+	 * online later on.  Make sure every worker has
+	 * PF_THREAD_BOUND set.
+	 */
 	if (bind)
 		kthread_bind(worker->task, gcwq->cpu);
+	else
+		worker->task->flags |= PF_THREAD_BOUND;
 
 	return worker;
 fail:
@@ -882,10 +914,6 @@ static int worker_thread(void *__worker)
 	struct cpu_workqueue_struct *cwq = worker->cwq;
 
 woke_up:
-	if (unlikely(!cpumask_equal(&worker->task->cpus_allowed,
-				    get_cpu_mask(gcwq->cpu))))
-		set_cpus_allowed_ptr(worker->task, get_cpu_mask(gcwq->cpu));
-
 	spin_lock_irq(&gcwq->lock);
 
 	/* DIE can be set only while we're idle, checking here is enough */
@@ -895,7 +923,7 @@ woke_up:
 	}
 
 	worker_leave_idle(worker);
-
+recheck:
 	/*
 	 * ->scheduled list can only be filled while a worker is
 	 * preparing to process a work or actually processing it.
@@ -908,6 +936,22 @@ woke_up:
 			list_first_entry(&cwq->worklist,
 					 struct work_struct, entry);
 
+		/*
+		 * The following is a rather inefficient way to close
+		 * race window against cpu hotplug operations.  Will
+		 * be replaced soon.
+		 */
+		if (unlikely(!(worker->flags & WORKER_ROGUE) &&
+			     !cpumask_equal(&worker->task->cpus_allowed,
+					    get_cpu_mask(gcwq->cpu)))) {
+			spin_unlock_irq(&gcwq->lock);
+			set_cpus_allowed_ptr(worker->task,
+					     get_cpu_mask(gcwq->cpu));
+			cpu_relax();
+			spin_lock_irq(&gcwq->lock);
+			goto recheck;
+		}
+
 		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
 			/* optimization path, not strictly necessary */
 			process_one_work(worker, work);
@@ -1812,29 +1856,237 @@ void destroy_workqueue(struct workqueue_struct *wq)
 }
 EXPORT_SYMBOL_GPL(destroy_workqueue);
 
+/*
+ * CPU hotplug.
+ *
+ * CPU hotplug is implemented by allowing cwqs to be detached from
+ * CPU, running with unbound workers and allowing them to be
+ * reattached later if the cpu comes back online.  A separate thread
+ * is created to govern cwqs in such state and is called the trustee.
+ *
+ * Trustee states and their descriptions.
+ *
+ * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
+ *		new trustee is started with this state.
+ *
+ * IN_CHARGE	Once started, trustee will enter this state after
+ *		making all existing workers rogue.  DOWN_PREPARE waits
+ *		for trustee to enter this state.  After reaching
+ *		IN_CHARGE, trustee tries to execute the pending
+ *		worklist until it's empty and the state is set to
+ *		BUTCHER, or the state is set to RELEASE.
+ *
+ * BUTCHER	Command state which is set by the cpu callback after
+ *		the cpu has went down.  Once this state is set trustee
+ *		knows that there will be no new works on the worklist
+ *		and once the worklist is empty it can proceed to
+ *		killing idle workers.
+ *
+ * RELEASE	Command state which is set by the cpu callback if the
+ *		cpu down has been canceled or it has come online
+ *		again.  After recognizing this state, trustee stops
+ *		trying to drain or butcher and transits to DONE.
+ *
+ * DONE		Trustee will enter this state after BUTCHER or RELEASE
+ *		is complete.
+ *
+ *          trustee                 CPU                draining
+ *         took over                down               complete
+ * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
+ *                        |                     |                  ^
+ *                        | CPU is back online  v   return workers |
+ *                         ----------------> RELEASE --------------
+ */
+
+/**
+ * trustee_wait_event_timeout - timed event wait for trustee
+ * @cond: condition to wait for
+ * @timeout: timeout in jiffies
+ *
+ * wait_event_timeout() for trustee to use.  Handles locking and
+ * checks for RELEASE request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by trustee.
+ *
+ * RETURNS:
+ * Positive indicating left time if @cond is satisfied, 0 if timed
+ * out, -1 if canceled.
+ */
+#define trustee_wait_event_timeout(cond, timeout) ({			\
+	long __ret = (timeout);						\
+	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
+	       __ret) {							\
+		spin_unlock_irq(&gcwq->lock);				\
+		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
+			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
+			__ret);						\
+		spin_lock_irq(&gcwq->lock);				\
+	}								\
+	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
+})
+
+/**
+ * trustee_wait_event - event wait for trustee
+ * @cond: condition to wait for
+ *
+ * wait_event() for trustee to use.  Automatically handles locking and
+ * checks for CANCEL request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by trustee.
+ *
+ * RETURNS:
+ * 0 if @cond is satisfied, -1 if canceled.
+ */
+#define trustee_wait_event(cond) ({					\
+	long __ret1;							\
+	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
+	__ret1 < 0 ? -1 : 0;						\
+})
+
+static int __cpuinit trustee_thread(void *__gcwq)
+{
+	struct global_cwq *gcwq = __gcwq;
+	struct worker *worker;
+	struct hlist_node *pos;
+	int i;
+
+	BUG_ON(gcwq->cpu != smp_processor_id());
+
+	spin_lock_irq(&gcwq->lock);
+	/*
+	 * Make all multithread workers rogue.  Trustee must be bound
+	 * to the target cpu and can't be cancelled.
+	 */
+	BUG_ON(gcwq->cpu != smp_processor_id());
+
+	list_for_each_entry(worker, &gcwq->idle_list, entry)
+		if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
+			worker->flags |= WORKER_ROGUE;
+
+	for_each_busy_worker(worker, i, pos, gcwq)
+		if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
+			worker->flags |= WORKER_ROGUE;
+
+	/*
+	 * We're now in charge.  Notify and proceed to drain.  We need
+	 * to keep the gcwq running during the whole CPU down
+	 * procedure as other cpu hotunplug callbacks may need to
+	 * flush currently running tasks.
+	 */
+	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
+	wake_up_all(&gcwq->trustee_wait);
+
+	/*
+	 * The original cpu is in the process of dying and may go away
+	 * anytime now.  When that happens, we and all workers would
+	 * be migrated to other cpus.  Try draining any left work.
+	 * Note that if the gcwq is frozen, there may be frozen works
+	 * in freezeable cwqs.  Don't declare completion while frozen.
+	 */
+	while (gcwq->nr_workers != gcwq->nr_idle ||
+	       gcwq->flags & GCWQ_FREEZING ||
+	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
+		/* give a breather */
+		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
+			break;
+	}
+
+	/* notify completion */
+	gcwq->trustee = NULL;
+	gcwq->trustee_state = TRUSTEE_DONE;
+	wake_up_all(&gcwq->trustee_wait);
+	spin_unlock_irq(&gcwq->lock);
+	return 0;
+}
+
+/**
+ * wait_trustee_state - wait for trustee to enter the specified state
+ * @gcwq: gcwq the trustee of interest belongs to
+ * @state: target state to wait for
+ *
+ * Wait for the trustee to reach @state.  DONE is already matched.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by cpu_callback.
+ */
+static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
+{
+	if (!(gcwq->trustee_state == state ||
+	      gcwq->trustee_state == TRUSTEE_DONE)) {
+		spin_unlock_irq(&gcwq->lock);
+		__wait_event(gcwq->trustee_wait,
+			     gcwq->trustee_state == state ||
+			     gcwq->trustee_state == TRUSTEE_DONE);
+		spin_lock_irq(&gcwq->lock);
+	}
+}
+
 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 						unsigned long action,
 						void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct cpu_workqueue_struct *cwq;
-	struct workqueue_struct *wq;
+	struct global_cwq *gcwq = get_gcwq(cpu);
+	struct task_struct *new_trustee = NULL;
+	struct worker *worker;
+	struct hlist_node *pos;
+	unsigned long flags;
+	int i;
 
 	action &= ~CPU_TASKS_FROZEN;
 
-	list_for_each_entry(wq, &workqueues, list) {
-		if (wq->flags & WQ_SINGLE_THREAD)
-			continue;
+	switch (action) {
+	case CPU_DOWN_PREPARE:
+		new_trustee = kthread_create(trustee_thread, gcwq,
+					     "workqueue_trustee/%d\n", cpu);
+		if (IS_ERR(new_trustee))
+			return notifier_from_errno(PTR_ERR(new_trustee));
+		kthread_bind(new_trustee, cpu);
+	}
 
-		cwq = get_cwq(cpu, wq);
+	/* some are called w/ irq disabled, don't disturb irq status */
+	spin_lock_irqsave(&gcwq->lock, flags);
 
-		switch (action) {
-		case CPU_POST_DEAD:
-			flush_workqueue(wq);
-			break;
+	switch (action) {
+	case CPU_DOWN_PREPARE:
+		/* initialize trustee and tell it to acquire the gcwq */
+		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
+		gcwq->trustee = new_trustee;
+		gcwq->trustee_state = TRUSTEE_START;
+		wake_up_process(gcwq->trustee);
+		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
+		break;
+
+	case CPU_POST_DEAD:
+		gcwq->trustee_state = TRUSTEE_BUTCHER;
+		break;
+
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+		if (gcwq->trustee_state != TRUSTEE_DONE) {
+			gcwq->trustee_state = TRUSTEE_RELEASE;
+			wake_up_process(gcwq->trustee);
+			wait_trustee_state(gcwq, TRUSTEE_DONE);
 		}
+
+		/* clear ROGUE from all multithread workers */
+		list_for_each_entry(worker, &gcwq->idle_list, entry)
+			if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
+				worker->flags &= ~WORKER_ROGUE;
+
+		for_each_busy_worker(worker, i, pos, gcwq)
+			if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
+				worker->flags &= ~WORKER_ROGUE;
+		break;
 	}
 
+	spin_unlock_irqrestore(&gcwq->lock, flags);
+
 	return notifier_from_errno(0);
 }
 
@@ -1912,6 +2164,9 @@ void freeze_workqueues_begin(void)
 
 		spin_lock_irq(&gcwq->lock);
 
+		BUG_ON(gcwq->flags & GCWQ_FREEZING);
+		gcwq->flags |= GCWQ_FREEZING;
+
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
@@ -1995,6 +2250,9 @@ void thaw_workqueues(void)
 
 		spin_lock_irq(&gcwq->lock);
 
+		BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
+		gcwq->flags &= ~GCWQ_FREEZING;
+
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
@@ -2026,7 +2284,7 @@ void __init init_workqueues(void)
 	int i;
 
 	singlethread_cpu = cpumask_first(cpu_possible_mask);
-	hotcpu_notifier(workqueue_cpu_callback, 0);
+	hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
 	/* initialize gcwqs */
 	for_each_possible_cpu(cpu) {
@@ -2040,6 +2298,9 @@ void __init init_workqueues(void)
 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
 
 		ida_init(&gcwq->worker_ida);
+
+		gcwq->trustee_state = TRUSTEE_DONE;
+		init_waitqueue_head(&gcwq->trustee_wait);
 	}
 
 	keventd_wq = create_workqueue("events");
-- 
cgit v1.2.3-59-g8ed1b


From 502ca9d819792e7d79b6e002afe9094c641fe410 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:13 +0200
Subject: workqueue: make single thread workqueue shared worker pool friendly

Reimplement st (single thread) workqueue so that it's friendly to
shared worker pool.  It was originally implemented by confining st
workqueues to use cwq of a fixed cpu and always having a worker for
the cpu.  This implementation isn't very friendly to shared worker
pool and suboptimal in that it ends up crossing cpu boundaries often.

Reimplement st workqueue using dynamic single cpu binding and
cwq->limit.  WQ_SINGLE_THREAD is replaced with WQ_SINGLE_CPU.  In a
single cpu workqueue, at most single cwq is bound to the wq at any
given time.  Arbitration is done using atomic accesses to
wq->single_cpu when queueing a work.  Once bound, the binding stays
till the workqueue is drained.

Note that the binding is never broken while a workqueue is frozen.
This is because idle cwqs may have works waiting in delayed_works
queue while frozen.  On thaw, the cwq is restarted if there are any
delayed works or unbound otherwise.

When combined with max_active limit of 1, single cpu workqueue has
exactly the same execution properties as the original single thread
workqueue while allowing sharing of per-cpu workers.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |   6 +--
 kernel/workqueue.c        | 135 ++++++++++++++++++++++++++++++++++------------
 2 files changed, 103 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index ab0b7fb99bc2..10611f7fc809 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -221,7 +221,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 
 enum {
 	WQ_FREEZEABLE		= 1 << 0, /* freeze during suspend */
-	WQ_SINGLE_THREAD	= 1 << 1, /* no per-cpu worker */
+	WQ_SINGLE_CPU		= 1 << 1, /* only single cpu at a time */
 };
 
 extern struct workqueue_struct *
@@ -250,9 +250,9 @@ __create_workqueue_key(const char *name, unsigned int flags, int max_active,
 #define create_workqueue(name)					\
 	__create_workqueue((name), 0, 1)
 #define create_freezeable_workqueue(name)			\
-	__create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD, 1)
+	__create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_CPU, 1)
 #define create_singlethread_workqueue(name)			\
-	__create_workqueue((name), WQ_SINGLE_THREAD, 1)
+	__create_workqueue((name), WQ_SINGLE_CPU, 1)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f57855f718d7..cfb8aa567e17 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -114,8 +114,7 @@ struct global_cwq {
 } ____cacheline_aligned_in_smp;
 
 /*
- * The per-CPU workqueue (if single thread, we always use the first
- * possible cpu).  The lower WORK_STRUCT_FLAG_BITS of
+ * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of
  * work_struct->data are used for flags and thus cwqs need to be
  * aligned at two's power of the number of flag bits.
  */
@@ -159,6 +158,8 @@ struct workqueue_struct {
 	struct list_head	flusher_queue;	/* F: flush waiters */
 	struct list_head	flusher_overflow; /* F: flush overflow list */
 
+	unsigned long		single_cpu;	/* cpu for single cpu wq */
+
 	int			saved_max_active; /* I: saved cwq max_active */
 	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
@@ -289,8 +290,6 @@ static DEFINE_PER_CPU(struct global_cwq, global_cwq);
 
 static int worker_thread(void *__worker);
 
-static int singlethread_cpu __read_mostly;
-
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
 	return &per_cpu(global_cwq, cpu);
@@ -302,14 +301,6 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 	return per_cpu_ptr(wq->cpu_wq, cpu);
 }
 
-static struct cpu_workqueue_struct *target_cwq(unsigned int cpu,
-					       struct workqueue_struct *wq)
-{
-	if (unlikely(wq->flags & WQ_SINGLE_THREAD))
-		cpu = singlethread_cpu;
-	return get_cwq(cpu, wq);
-}
-
 static unsigned int work_color_to_flags(int color)
 {
 	return color << WORK_STRUCT_COLOR_SHIFT;
@@ -410,17 +401,87 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 	wake_up_process(cwq->worker->task);
 }
 
+/**
+ * cwq_unbind_single_cpu - unbind cwq from single cpu workqueue processing
+ * @cwq: cwq to unbind
+ *
+ * Try to unbind @cwq from single cpu workqueue processing.  If
+ * @cwq->wq is frozen, unbind is delayed till the workqueue is thawed.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_unbind_single_cpu(struct cpu_workqueue_struct *cwq)
+{
+	struct workqueue_struct *wq = cwq->wq;
+	struct global_cwq *gcwq = cwq->gcwq;
+
+	BUG_ON(wq->single_cpu != gcwq->cpu);
+	/*
+	 * Unbind from workqueue if @cwq is not frozen.  If frozen,
+	 * thaw_workqueues() will either restart processing on this
+	 * cpu or unbind if empty.  This keeps works queued while
+	 * frozen fully ordered and flushable.
+	 */
+	if (likely(!(gcwq->flags & GCWQ_FREEZING))) {
+		smp_wmb();	/* paired with cmpxchg() in __queue_work() */
+		wq->single_cpu = NR_CPUS;
+	}
+}
+
 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			 struct work_struct *work)
 {
-	struct cpu_workqueue_struct *cwq = target_cwq(cpu, wq);
-	struct global_cwq *gcwq = cwq->gcwq;
+	struct global_cwq *gcwq;
+	struct cpu_workqueue_struct *cwq;
 	struct list_head *worklist;
 	unsigned long flags;
+	bool arbitrate;
 
 	debug_work_activate(work);
 
-	spin_lock_irqsave(&gcwq->lock, flags);
+	/* determine gcwq to use */
+	if (!(wq->flags & WQ_SINGLE_CPU)) {
+		/* just use the requested cpu for multicpu workqueues */
+		gcwq = get_gcwq(cpu);
+		spin_lock_irqsave(&gcwq->lock, flags);
+	} else {
+		unsigned int req_cpu = cpu;
+
+		/*
+		 * It's a bit more complex for single cpu workqueues.
+		 * We first need to determine which cpu is going to be
+		 * used.  If no cpu is currently serving this
+		 * workqueue, arbitrate using atomic accesses to
+		 * wq->single_cpu; otherwise, use the current one.
+		 */
+	retry:
+		cpu = wq->single_cpu;
+		arbitrate = cpu == NR_CPUS;
+		if (arbitrate)
+			cpu = req_cpu;
+
+		gcwq = get_gcwq(cpu);
+		spin_lock_irqsave(&gcwq->lock, flags);
+
+		/*
+		 * The following cmpxchg() is a full barrier paired
+		 * with smp_wmb() in cwq_unbind_single_cpu() and
+		 * guarantees that all changes to wq->st_* fields are
+		 * visible on the new cpu after this point.
+		 */
+		if (arbitrate)
+			cmpxchg(&wq->single_cpu, NR_CPUS, cpu);
+
+		if (unlikely(wq->single_cpu != cpu)) {
+			spin_unlock_irqrestore(&gcwq->lock, flags);
+			goto retry;
+		}
+	}
+
+	/* gcwq determined, get cwq and queue */
+	cwq = get_cwq(gcwq->cpu, wq);
+
 	BUG_ON(!list_empty(&work->entry));
 
 	cwq->nr_in_flight[cwq->work_color]++;
@@ -530,7 +591,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 		timer_stats_timer_set_start_info(&dwork->timer);
 
 		/* This stores cwq for the moment, for the timer_fn */
-		set_wq_data(work, target_cwq(raw_smp_processor_id(), wq), 0);
+		set_wq_data(work, get_cwq(raw_smp_processor_id(), wq), 0);
 		timer->expires = jiffies + delay;
 		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
@@ -790,10 +851,14 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
 	cwq->nr_in_flight[color]--;
 	cwq->nr_active--;
 
-	/* one down, submit a delayed one */
-	if (!list_empty(&cwq->delayed_works) &&
-	    cwq->nr_active < cwq->max_active)
-		cwq_activate_first_delayed(cwq);
+	if (!list_empty(&cwq->delayed_works)) {
+		/* one down, submit a delayed one */
+		if (cwq->nr_active < cwq->max_active)
+			cwq_activate_first_delayed(cwq);
+	} else if (!cwq->nr_active && cwq->wq->flags & WQ_SINGLE_CPU) {
+		/* this was the last work, unbind from single cpu */
+		cwq_unbind_single_cpu(cwq);
+	}
 
 	/* is flush in progress and are we at the flushing tip? */
 	if (likely(cwq->flush_color != color))
@@ -1727,7 +1792,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 						struct lock_class_key *key,
 						const char *lock_name)
 {
-	bool singlethread = flags & WQ_SINGLE_THREAD;
 	struct workqueue_struct *wq;
 	bool failed = false;
 	unsigned int cpu;
@@ -1748,6 +1812,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	atomic_set(&wq->nr_cwqs_to_flush, 0);
 	INIT_LIST_HEAD(&wq->flusher_queue);
 	INIT_LIST_HEAD(&wq->flusher_overflow);
+	wq->single_cpu = NR_CPUS;
+
 	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	INIT_LIST_HEAD(&wq->list);
@@ -1773,8 +1839,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 
 		if (failed)
 			continue;
-		cwq->worker = create_worker(cwq,
-					    cpu_online(cpu) && !singlethread);
+		cwq->worker = create_worker(cwq, cpu_online(cpu));
 		if (cwq->worker)
 			start_worker(cwq->worker);
 		else
@@ -1958,18 +2023,16 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 	spin_lock_irq(&gcwq->lock);
 	/*
-	 * Make all multithread workers rogue.  Trustee must be bound
-	 * to the target cpu and can't be cancelled.
+	 * Make all workers rogue.  Trustee must be bound to the
+	 * target cpu and can't be cancelled.
 	 */
 	BUG_ON(gcwq->cpu != smp_processor_id());
 
 	list_for_each_entry(worker, &gcwq->idle_list, entry)
-		if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
-			worker->flags |= WORKER_ROGUE;
+		worker->flags |= WORKER_ROGUE;
 
 	for_each_busy_worker(worker, i, pos, gcwq)
-		if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
-			worker->flags |= WORKER_ROGUE;
+		worker->flags |= WORKER_ROGUE;
 
 	/*
 	 * We're now in charge.  Notify and proceed to drain.  We need
@@ -2074,14 +2137,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 			wait_trustee_state(gcwq, TRUSTEE_DONE);
 		}
 
-		/* clear ROGUE from all multithread workers */
+		/* clear ROGUE from all workers */
 		list_for_each_entry(worker, &gcwq->idle_list, entry)
-			if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
-				worker->flags &= ~WORKER_ROGUE;
+			worker->flags &= ~WORKER_ROGUE;
 
 		for_each_busy_worker(worker, i, pos, gcwq)
-			if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
-				worker->flags &= ~WORKER_ROGUE;
+			worker->flags &= ~WORKER_ROGUE;
 		break;
 	}
 
@@ -2266,6 +2327,11 @@ void thaw_workqueues(void)
 			       cwq->nr_active < cwq->max_active)
 				cwq_activate_first_delayed(cwq);
 
+			/* perform delayed unbind from single cpu if empty */
+			if (wq->single_cpu == gcwq->cpu &&
+			    !cwq->nr_active && list_empty(&cwq->delayed_works))
+				cwq_unbind_single_cpu(cwq);
+
 			wake_up_process(cwq->worker->task);
 		}
 
@@ -2283,7 +2349,6 @@ void __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
-	singlethread_cpu = cpumask_first(cpu_possible_mask);
 	hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
 	/* initialize gcwqs */
-- 
cgit v1.2.3-59-g8ed1b


From 7a22ad757ec75186ad43a5b4670fa7423ee8f480 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:13 +0200
Subject: workqueue: carry cpu number in work data once execution starts

To implement non-reentrant workqueue, the last gcwq a work was
executed on must be reliably obtainable as long as the work structure
is valid even if the previous workqueue has been destroyed.

To achieve this, work->data will be overloaded to carry the last cpu
number once execution starts so that the previous gcwq can be located
reliably.  This means that cwq can't be obtained from work after
execution starts but only gcwq.

Implement set_work_{cwq|cpu}(), get_work_[g]cwq() and
clear_work_data() to set work data to the cpu number when starting
execution, access the overloaded work data and clear it after
cancellation.

queue_delayed_work_on() is updated to preserve the last cpu while
in-flight in timer and other callers which depended on getting cwq
from work after execution starts are converted to depend on gcwq
instead.

* Anton Blanchard fixed compile error on powerpc due to missing
  linux/threads.h include.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Anton Blanchard <anton@samba.org>
---
 include/linux/workqueue.h |   7 +-
 kernel/workqueue.c        | 163 +++++++++++++++++++++++++++++-----------------
 2 files changed, 109 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 10611f7fc809..0a7814131e66 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -9,6 +9,7 @@
 #include <linux/linkage.h>
 #include <linux/bitops.h>
 #include <linux/lockdep.h>
+#include <linux/threads.h>
 #include <asm/atomic.h>
 
 struct workqueue_struct;
@@ -59,6 +60,7 @@ enum {
 
 	WORK_STRUCT_FLAG_MASK	= (1UL << WORK_STRUCT_FLAG_BITS) - 1,
 	WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK,
+	WORK_STRUCT_NO_CPU	= NR_CPUS << WORK_STRUCT_FLAG_BITS,
 };
 
 struct work_struct {
@@ -70,8 +72,9 @@ struct work_struct {
 #endif
 };
 
-#define WORK_DATA_INIT()	ATOMIC_LONG_INIT(0)
-#define WORK_DATA_STATIC_INIT()	ATOMIC_LONG_INIT(WORK_STRUCT_STATIC)
+#define WORK_DATA_INIT()	ATOMIC_LONG_INIT(WORK_STRUCT_NO_CPU)
+#define WORK_DATA_STATIC_INIT()	\
+	ATOMIC_LONG_INIT(WORK_STRUCT_NO_CPU | WORK_STRUCT_STATIC)
 
 struct delayed_work {
 	struct work_struct work;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c276dec75ea4..c68277c204ab 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -319,31 +319,71 @@ static int work_next_color(int color)
 }
 
 /*
- * Set the workqueue on which a work item is to be run
- * - Must *only* be called if the pending flag is set
+ * Work data points to the cwq while a work is on queue.  Once
+ * execution starts, it points to the cpu the work was last on.  This
+ * can be distinguished by comparing the data value against
+ * PAGE_OFFSET.
+ *
+ * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
+ * cwq, cpu or clear work->data.  These functions should only be
+ * called while the work is owned - ie. while the PENDING bit is set.
+ *
+ * get_work_[g]cwq() can be used to obtain the gcwq or cwq
+ * corresponding to a work.  gcwq is available once the work has been
+ * queued anywhere after initialization.  cwq is available only from
+ * queueing until execution starts.
  */
-static inline void set_wq_data(struct work_struct *work,
-			       struct cpu_workqueue_struct *cwq,
-			       unsigned long extra_flags)
+static inline void set_work_data(struct work_struct *work, unsigned long data,
+				 unsigned long flags)
 {
 	BUG_ON(!work_pending(work));
+	atomic_long_set(&work->data, data | flags | work_static(work));
+}
 
-	atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) |
-			WORK_STRUCT_PENDING | extra_flags);
+static void set_work_cwq(struct work_struct *work,
+			 struct cpu_workqueue_struct *cwq,
+			 unsigned long extra_flags)
+{
+	set_work_data(work, (unsigned long)cwq,
+		      WORK_STRUCT_PENDING | extra_flags);
 }
 
-/*
- * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
- */
-static inline void clear_wq_data(struct work_struct *work)
+static void set_work_cpu(struct work_struct *work, unsigned int cpu)
+{
+	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
+}
+
+static void clear_work_data(struct work_struct *work)
+{
+	set_work_data(work, WORK_STRUCT_NO_CPU, 0);
+}
+
+static inline unsigned long get_work_data(struct work_struct *work)
+{
+	return atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK;
+}
+
+static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
 {
-	atomic_long_set(&work->data, work_static(work));
+	unsigned long data = get_work_data(work);
+
+	return data >= PAGE_OFFSET ? (void *)data : NULL;
 }
 
-static inline struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
+static struct global_cwq *get_work_gcwq(struct work_struct *work)
 {
-	return (void *)(atomic_long_read(&work->data) &
-			WORK_STRUCT_WQ_DATA_MASK);
+	unsigned long data = get_work_data(work);
+	unsigned int cpu;
+
+	if (data >= PAGE_OFFSET)
+		return ((struct cpu_workqueue_struct *)data)->gcwq;
+
+	cpu = data >> WORK_STRUCT_FLAG_BITS;
+	if (cpu == NR_CPUS)
+		return NULL;
+
+	BUG_ON(cpu >= num_possible_cpus());
+	return get_gcwq(cpu);
 }
 
 /**
@@ -443,7 +483,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 			unsigned int extra_flags)
 {
 	/* we own @work, set data and link */
-	set_wq_data(work, cwq, extra_flags);
+	set_work_cwq(work, cwq, extra_flags);
 
 	/*
 	 * Ensure that we get the right work->data if we see the
@@ -599,7 +639,7 @@ EXPORT_SYMBOL_GPL(queue_work_on);
 static void delayed_work_timer_fn(unsigned long __data)
 {
 	struct delayed_work *dwork = (struct delayed_work *)__data;
-	struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
+	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
 
 	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
 }
@@ -639,13 +679,19 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct work_struct *work = &dwork->work;
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+		struct global_cwq *gcwq = get_work_gcwq(work);
+		unsigned int lcpu = gcwq ? gcwq->cpu : raw_smp_processor_id();
+
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
 		timer_stats_timer_set_start_info(&dwork->timer);
-
-		/* This stores cwq for the moment, for the timer_fn */
-		set_wq_data(work, get_cwq(raw_smp_processor_id(), wq), 0);
+		/*
+		 * This stores cwq for the moment, for the timer_fn.
+		 * Note that the work's gcwq is preserved to allow
+		 * reentrance detection for delayed works.
+		 */
+		set_work_cwq(work, get_cwq(lcpu, wq), 0);
 		timer->expires = jiffies + delay;
 		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
@@ -970,11 +1016,14 @@ static void process_one_work(struct worker *worker, struct work_struct *work)
 	worker->current_work = work;
 	worker->current_cwq = cwq;
 	work_color = get_work_color(work);
+
+	BUG_ON(get_work_cwq(work) != cwq);
+	/* record the current cpu number in the work data and dequeue */
+	set_work_cpu(work, gcwq->cpu);
 	list_del_init(&work->entry);
 
 	spin_unlock_irq(&gcwq->lock);
 
-	BUG_ON(get_wq_data(work) != cwq);
 	work_clear_pending(work);
 	lock_map_acquire(&cwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
@@ -1406,37 +1455,39 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
 int flush_work(struct work_struct *work)
 {
 	struct worker *worker = NULL;
-	struct cpu_workqueue_struct *cwq;
 	struct global_cwq *gcwq;
+	struct cpu_workqueue_struct *cwq;
 	struct wq_barrier barr;
 
 	might_sleep();
-	cwq = get_wq_data(work);
-	if (!cwq)
+	gcwq = get_work_gcwq(work);
+	if (!gcwq)
 		return 0;
-	gcwq = cwq->gcwq;
-
-	lock_map_acquire(&cwq->wq->lockdep_map);
-	lock_map_release(&cwq->wq->lockdep_map);
 
 	spin_lock_irq(&gcwq->lock);
 	if (!list_empty(&work->entry)) {
 		/*
 		 * See the comment near try_to_grab_pending()->smp_rmb().
-		 * If it was re-queued under us we are not going to wait.
+		 * If it was re-queued to a different gcwq under us, we
+		 * are not going to wait.
 		 */
 		smp_rmb();
-		if (unlikely(cwq != get_wq_data(work)))
+		cwq = get_work_cwq(work);
+		if (unlikely(!cwq || gcwq != cwq->gcwq))
 			goto already_gone;
 	} else {
-		if (cwq->worker && cwq->worker->current_work == work)
-			worker = cwq->worker;
+		worker = find_worker_executing_work(gcwq, work);
 		if (!worker)
 			goto already_gone;
+		cwq = worker->current_cwq;
 	}
 
 	insert_wq_barrier(cwq, &barr, work, worker);
 	spin_unlock_irq(&gcwq->lock);
+
+	lock_map_acquire(&cwq->wq->lockdep_map);
+	lock_map_release(&cwq->wq->lockdep_map);
+
 	wait_for_completion(&barr.done);
 	destroy_work_on_stack(&barr.work);
 	return 1;
@@ -1453,7 +1504,6 @@ EXPORT_SYMBOL_GPL(flush_work);
 static int try_to_grab_pending(struct work_struct *work)
 {
 	struct global_cwq *gcwq;
-	struct cpu_workqueue_struct *cwq;
 	int ret = -1;
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
@@ -1463,24 +1513,23 @@ static int try_to_grab_pending(struct work_struct *work)
 	 * The queueing is in progress, or it is already queued. Try to
 	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
 	 */
-
-	cwq = get_wq_data(work);
-	if (!cwq)
+	gcwq = get_work_gcwq(work);
+	if (!gcwq)
 		return ret;
-	gcwq = cwq->gcwq;
 
 	spin_lock_irq(&gcwq->lock);
 	if (!list_empty(&work->entry)) {
 		/*
-		 * This work is queued, but perhaps we locked the wrong cwq.
+		 * This work is queued, but perhaps we locked the wrong gcwq.
 		 * In that case we must see the new value after rmb(), see
 		 * insert_work()->wmb().
 		 */
 		smp_rmb();
-		if (cwq == get_wq_data(work)) {
+		if (gcwq == get_work_gcwq(work)) {
 			debug_work_deactivate(work);
 			list_del_init(&work->entry);
-			cwq_dec_nr_in_flight(cwq, get_work_color(work));
+			cwq_dec_nr_in_flight(get_work_cwq(work),
+					     get_work_color(work));
 			ret = 1;
 		}
 	}
@@ -1489,20 +1538,16 @@ static int try_to_grab_pending(struct work_struct *work)
 	return ret;
 }
 
-static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
-				struct work_struct *work)
+static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
 {
-	struct global_cwq *gcwq = cwq->gcwq;
 	struct wq_barrier barr;
 	struct worker *worker;
 
 	spin_lock_irq(&gcwq->lock);
 
-	worker = NULL;
-	if (unlikely(cwq->worker && cwq->worker->current_work == work)) {
-		worker = cwq->worker;
-		insert_wq_barrier(cwq, &barr, work, worker);
-	}
+	worker = find_worker_executing_work(gcwq, work);
+	if (unlikely(worker))
+		insert_wq_barrier(worker->current_cwq, &barr, work, worker);
 
 	spin_unlock_irq(&gcwq->lock);
 
@@ -1514,8 +1559,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
 
 static void wait_on_work(struct work_struct *work)
 {
-	struct cpu_workqueue_struct *cwq;
-	struct workqueue_struct *wq;
 	int cpu;
 
 	might_sleep();
@@ -1523,14 +1566,8 @@ static void wait_on_work(struct work_struct *work)
 	lock_map_acquire(&work->lockdep_map);
 	lock_map_release(&work->lockdep_map);
 
-	cwq = get_wq_data(work);
-	if (!cwq)
-		return;
-
-	wq = cwq->wq;
-
 	for_each_possible_cpu(cpu)
-		wait_on_cpu_work(get_cwq(cpu, wq), work);
+		wait_on_cpu_work(get_gcwq(cpu), work);
 }
 
 static int __cancel_work_timer(struct work_struct *work,
@@ -1545,7 +1582,7 @@ static int __cancel_work_timer(struct work_struct *work,
 		wait_on_work(work);
 	} while (unlikely(ret < 0));
 
-	clear_wq_data(work);
+	clear_work_data(work);
 	return ret;
 }
 
@@ -1647,7 +1684,7 @@ EXPORT_SYMBOL(schedule_delayed_work);
 void flush_delayed_work(struct delayed_work *dwork)
 {
 	if (del_timer_sync(&dwork->timer)) {
-		__queue_work(get_cpu(), get_wq_data(&dwork->work)->wq,
+		__queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
 			     &dwork->work);
 		put_cpu();
 	}
@@ -2405,6 +2442,14 @@ void __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
+	/*
+	 * The pointer part of work->data is either pointing to the
+	 * cwq or contains the cpu number the work ran last on.  Make
+	 * sure cpu number won't overflow into kernel pointer area so
+	 * that they can be distinguished.
+	 */
+	BUILD_BUG_ON(NR_CPUS << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET);
+
 	hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
 	/* initialize gcwqs */
-- 
cgit v1.2.3-59-g8ed1b


From 18aa9effad4adb2c1efe123af4eb24fec9f59b30 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:13 +0200
Subject: workqueue: implement WQ_NON_REENTRANT

With gcwq managing all the workers and work->data pointing to the last
gcwq it was on, non-reentrance can be easily implemented by checking
whether the work is still running on the previous gcwq on queueing.
Implement it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 32 +++++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0a7814131e66..07cf5e5f91cb 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -225,6 +225,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 enum {
 	WQ_FREEZEABLE		= 1 << 0, /* freeze during suspend */
 	WQ_SINGLE_CPU		= 1 << 1, /* only single cpu at a time */
+	WQ_NON_REENTRANT	= 1 << 2, /* guarantee non-reentrance */
 };
 
 extern struct workqueue_struct *
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c68277c204ab..bce1074bdec1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -534,11 +534,37 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 
 	debug_work_activate(work);
 
-	/* determine gcwq to use */
+	/*
+	 * Determine gcwq to use.  SINGLE_CPU is inherently
+	 * NON_REENTRANT, so test it first.
+	 */
 	if (!(wq->flags & WQ_SINGLE_CPU)) {
-		/* just use the requested cpu for multicpu workqueues */
+		struct global_cwq *last_gcwq;
+
+		/*
+		 * It's multi cpu.  If @wq is non-reentrant and @work
+		 * was previously on a different cpu, it might still
+		 * be running there, in which case the work needs to
+		 * be queued on that cpu to guarantee non-reentrance.
+		 */
 		gcwq = get_gcwq(cpu);
-		spin_lock_irqsave(&gcwq->lock, flags);
+		if (wq->flags & WQ_NON_REENTRANT &&
+		    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
+			struct worker *worker;
+
+			spin_lock_irqsave(&last_gcwq->lock, flags);
+
+			worker = find_worker_executing_work(last_gcwq, work);
+
+			if (worker && worker->current_cwq->wq == wq)
+				gcwq = last_gcwq;
+			else {
+				/* meh... not running there, queue here */
+				spin_unlock_irqrestore(&last_gcwq->lock, flags);
+				spin_lock_irqsave(&gcwq->lock, flags);
+			}
+		} else
+			spin_lock_irqsave(&gcwq->lock, flags);
 	} else {
 		unsigned int req_cpu = cpu;
 
-- 
cgit v1.2.3-59-g8ed1b


From e22bee782b3b00bd4534ae9b1c5fb2e8e6573c5c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:14 +0200
Subject: workqueue: implement concurrency managed dynamic worker pool

Instead of creating a worker for each cwq and putting it into the
shared pool, manage per-cpu workers dynamically.

Works aren't supposed to be cpu cycle hogs and maintaining just enough
concurrency to prevent work processing from stalling due to lack of
processing context is optimal.  gcwq keeps the number of concurrent
active workers to minimum but no less.  As long as there's one or more
running workers on the cpu, no new worker is scheduled so that works
can be processed in batch as much as possible but when the last
running worker blocks, gcwq immediately schedules new worker so that
the cpu doesn't sit idle while there are works to be processed.

gcwq always keeps at least single idle worker around.  When a new
worker is necessary and the worker is the last idle one, the worker
assumes the role of "manager" and manages the worker pool -
ie. creates another worker.  Forward-progress is guaranteed by having
dedicated rescue workers for workqueues which may be necessary while
creating a new worker.  When the manager is having problem creating a
new worker, mayday timer activates and rescue workers are summoned to
the cpu and execute works which might be necessary to create new
workers.

Trustee is expanded to serve the role of manager while a CPU is being
taken down and stays down.  As no new works are supposed to be queued
on a dead cpu, it just needs to drain all the existing ones.  Trustee
continues to try to create new workers and summon rescuers as long as
there are pending works.  If the CPU is brought back up while the
trustee is still trying to drain the gcwq from the previous offlining,
the trustee will kill all idles ones and tell workers which are still
busy to rebind to the cpu, and pass control over to gcwq which assumes
the manager role as necessary.

Concurrency managed worker pool reduces the number of workers
drastically.  Only workers which are necessary to keep the processing
going are created and kept.  Also, it reduces cache footprint by
avoiding unnecessarily switching contexts between different workers.

Please note that this patch does not increase max_active of any
workqueue.  All workqueues can still only process one work per cpu.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |   8 +-
 kernel/workqueue.c        | 936 +++++++++++++++++++++++++++++++++++++++++-----
 kernel/workqueue_sched.h  |  13 +-
 3 files changed, 841 insertions(+), 116 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 07cf5e5f91cb..b8f4ec45c40a 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -226,6 +226,7 @@ enum {
 	WQ_FREEZEABLE		= 1 << 0, /* freeze during suspend */
 	WQ_SINGLE_CPU		= 1 << 1, /* only single cpu at a time */
 	WQ_NON_REENTRANT	= 1 << 2, /* guarantee non-reentrance */
+	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
 };
 
 extern struct workqueue_struct *
@@ -252,11 +253,12 @@ __create_workqueue_key(const char *name, unsigned int flags, int max_active,
 #endif
 
 #define create_workqueue(name)					\
-	__create_workqueue((name), 0, 1)
+	__create_workqueue((name), WQ_RESCUER, 1)
 #define create_freezeable_workqueue(name)			\
-	__create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_CPU, 1)
+	__create_workqueue((name),				\
+			   WQ_FREEZEABLE | WQ_SINGLE_CPU | WQ_RESCUER, 1)
 #define create_singlethread_workqueue(name)			\
-	__create_workqueue((name), WQ_SINGLE_CPU, 1)
+	__create_workqueue((name), WQ_SINGLE_CPU | WQ_RESCUER, 1)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c31fde092c6..0ad46523b423 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -34,17 +34,25 @@
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
 #include <linux/idr.h>
-#include <linux/delay.h>
+
+#include "workqueue_sched.h"
 
 enum {
 	/* global_cwq flags */
+	GCWQ_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
+	GCWQ_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
+	GCWQ_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
 	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */
 
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
 	WORKER_DIE		= 1 << 1,	/* die die die */
 	WORKER_IDLE		= 1 << 2,	/* is idle */
+	WORKER_PREP		= 1 << 3,	/* preparing to run works */
 	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */
+	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
+
+	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_ROGUE | WORKER_REBIND,
 
 	/* gcwq->trustee_state */
 	TRUSTEE_START		= 0,		/* start */
@@ -57,7 +65,19 @@ enum {
 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
 	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,
 
+	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
+	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */
+
+	MAYDAY_INITIAL_TIMEOUT	= HZ / 100,	/* call for help after 10ms */
+	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
+	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */
 	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
+
+	/*
+	 * Rescue workers are used only on emergencies and shared by
+	 * all cpus.  Give -20.
+	 */
+	RESCUER_NICE_LEVEL	= -20,
 };
 
 /*
@@ -65,8 +85,16 @@ enum {
  *
  * I: Set during initialization and read-only afterwards.
  *
+ * P: Preemption protected.  Disabling preemption is enough and should
+ *    only be modified and accessed from the local cpu.
+ *
  * L: gcwq->lock protected.  Access with gcwq->lock held.
  *
+ * X: During normal operation, modification requires gcwq->lock and
+ *    should be done only from local cpu.  Either disabling preemption
+ *    on local cpu or grabbing gcwq->lock is enough for read access.
+ *    While trustee is in charge, it's identical to L.
+ *
  * F: wq->flush_mutex protected.
  *
  * W: workqueue_lock protected.
@@ -74,6 +102,10 @@ enum {
 
 struct global_cwq;
 
+/*
+ * The poor guys doing the actual heavy lifting.  All on-duty workers
+ * are either serving the manager role, on idle list or on busy hash.
+ */
 struct worker {
 	/* on idle list while idle, on busy hash table while busy */
 	union {
@@ -86,12 +118,17 @@ struct worker {
 	struct list_head	scheduled;	/* L: scheduled works */
 	struct task_struct	*task;		/* I: worker task */
 	struct global_cwq	*gcwq;		/* I: the associated gcwq */
-	unsigned int		flags;		/* L: flags */
+	/* 64 bytes boundary on 64bit, 32 on 32bit */
+	unsigned long		last_active;	/* L: last active timestamp */
+	unsigned int		flags;		/* X: flags */
 	int			id;		/* I: worker id */
+	struct work_struct	rebind_work;	/* L: rebind worker to cpu */
 };
 
 /*
- * Global per-cpu workqueue.
+ * Global per-cpu workqueue.  There's one and only one for each cpu
+ * and all works are queued and processed here regardless of their
+ * target workqueues.
  */
 struct global_cwq {
 	spinlock_t		lock;		/* the gcwq lock */
@@ -103,15 +140,19 @@ struct global_cwq {
 	int			nr_idle;	/* L: currently idle ones */
 
 	/* workers are chained either in the idle_list or busy_hash */
-	struct list_head	idle_list;	/* L: list of idle workers */
+	struct list_head	idle_list;	/* X: list of idle workers */
 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
 						/* L: hash of busy workers */
 
+	struct timer_list	idle_timer;	/* L: worker idle timeout */
+	struct timer_list	mayday_timer;	/* L: SOS timer for dworkers */
+
 	struct ida		worker_ida;	/* L: for worker IDs */
 
 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
 	unsigned int		trustee_state;	/* L: trustee state */
 	wait_queue_head_t	trustee_wait;	/* trustee wait */
+	struct worker		*first_idle;	/* L: first idle worker */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -121,7 +162,6 @@ struct global_cwq {
  */
 struct cpu_workqueue_struct {
 	struct global_cwq	*gcwq;		/* I: the associated gcwq */
-	struct worker		*worker;
 	struct workqueue_struct *wq;		/* I: the owning workqueue */
 	int			work_color;	/* L: current color */
 	int			flush_color;	/* L: flushing color */
@@ -160,6 +200,9 @@ struct workqueue_struct {
 
 	unsigned long		single_cpu;	/* cpu for single cpu wq */
 
+	cpumask_var_t		mayday_mask;	/* cpus requesting rescue */
+	struct worker		*rescuer;	/* I: rescue worker */
+
 	int			saved_max_active; /* I: saved cwq max_active */
 	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
@@ -286,7 +329,13 @@ static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
 static bool workqueue_freezing;		/* W: have wqs started freezing? */
 
+/*
+ * The almighty global cpu workqueues.  nr_running is the only field
+ * which is expected to be used frequently by other cpus via
+ * try_to_wake_up().  Put it in a separate cacheline.
+ */
 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
 
 static int worker_thread(void *__worker);
 
@@ -295,6 +344,11 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
 	return &per_cpu(global_cwq, cpu);
 }
 
+static atomic_t *get_gcwq_nr_running(unsigned int cpu)
+{
+	return &per_cpu(gcwq_nr_running, cpu);
+}
+
 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 					    struct workqueue_struct *wq)
 {
@@ -385,6 +439,63 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 	return get_gcwq(cpu);
 }
 
+/*
+ * Policy functions.  These define the policies on how the global
+ * worker pool is managed.  Unless noted otherwise, these functions
+ * assume that they're being called with gcwq->lock held.
+ */
+
+/*
+ * Need to wake up a worker?  Called from anything but currently
+ * running workers.
+ */
+static bool need_more_worker(struct global_cwq *gcwq)
+{
+	atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+
+	return !list_empty(&gcwq->worklist) && !atomic_read(nr_running);
+}
+
+/* Can I start working?  Called from busy but !running workers. */
+static bool may_start_working(struct global_cwq *gcwq)
+{
+	return gcwq->nr_idle;
+}
+
+/* Do I need to keep working?  Called from currently running workers. */
+static bool keep_working(struct global_cwq *gcwq)
+{
+	atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+
+	return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
+}
+
+/* Do we need a new worker?  Called from manager. */
+static bool need_to_create_worker(struct global_cwq *gcwq)
+{
+	return need_more_worker(gcwq) && !may_start_working(gcwq);
+}
+
+/* Do I need to be the manager? */
+static bool need_to_manage_workers(struct global_cwq *gcwq)
+{
+	return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
+}
+
+/* Do we have too many workers and should some go away? */
+static bool too_many_workers(struct global_cwq *gcwq)
+{
+	bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
+	int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
+	int nr_busy = gcwq->nr_workers - nr_idle;
+
+	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
+}
+
+/*
+ * Wake up functions.
+ */
+
 /* Return the first worker.  Safe with preemption disabled */
 static struct worker *first_worker(struct global_cwq *gcwq)
 {
@@ -412,12 +523,77 @@ static void wake_up_worker(struct global_cwq *gcwq)
 }
 
 /**
- * worker_set_flags - set worker flags
+ * wq_worker_waking_up - a worker is waking up
+ * @task: task waking up
+ * @cpu: CPU @task is waking up to
+ *
+ * This function is called during try_to_wake_up() when a worker is
+ * being awoken.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
+{
+	struct worker *worker = kthread_data(task);
+
+	if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
+		atomic_inc(get_gcwq_nr_running(cpu));
+}
+
+/**
+ * wq_worker_sleeping - a worker is going to sleep
+ * @task: task going to sleep
+ * @cpu: CPU in question, must be the current CPU number
+ *
+ * This function is called during schedule() when a busy worker is
+ * going to sleep.  Worker on the same cpu can be woken up by
+ * returning pointer to its task.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ *
+ * RETURNS:
+ * Worker task on @cpu to wake up, %NULL if none.
+ */
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+				       unsigned int cpu)
+{
+	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+	struct global_cwq *gcwq = get_gcwq(cpu);
+	atomic_t *nr_running = get_gcwq_nr_running(cpu);
+
+	if (unlikely(worker->flags & WORKER_NOT_RUNNING))
+		return NULL;
+
+	/* this can only happen on the local cpu */
+	BUG_ON(cpu != raw_smp_processor_id());
+
+	/*
+	 * The counterpart of the following dec_and_test, implied mb,
+	 * worklist not empty test sequence is in insert_work().
+	 * Please read comment there.
+	 *
+	 * NOT_RUNNING is clear.  This means that trustee is not in
+	 * charge and we're running on the local cpu w/ rq lock held
+	 * and preemption disabled, which in turn means that none else
+	 * could be manipulating idle_list, so dereferencing idle_list
+	 * without gcwq lock is safe.
+	 */
+	if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
+		to_wakeup = first_worker(gcwq);
+	return to_wakeup ? to_wakeup->task : NULL;
+}
+
+/**
+ * worker_set_flags - set worker flags and adjust nr_running accordingly
  * @worker: worker to set flags for
  * @flags: flags to set
  * @wakeup: wakeup an idle worker if necessary
  *
- * Set @flags in @worker->flags.
+ * Set @flags in @worker->flags and adjust nr_running accordingly.  If
+ * nr_running becomes zero and @wakeup is %true, an idle worker is
+ * woken up.
  *
  * LOCKING:
  * spin_lock_irq(gcwq->lock).
@@ -425,22 +601,49 @@ static void wake_up_worker(struct global_cwq *gcwq)
 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 				    bool wakeup)
 {
+	struct global_cwq *gcwq = worker->gcwq;
+
+	/*
+	 * If transitioning into NOT_RUNNING, adjust nr_running and
+	 * wake up an idle worker as necessary if requested by
+	 * @wakeup.
+	 */
+	if ((flags & WORKER_NOT_RUNNING) &&
+	    !(worker->flags & WORKER_NOT_RUNNING)) {
+		atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+
+		if (wakeup) {
+			if (atomic_dec_and_test(nr_running) &&
+			    !list_empty(&gcwq->worklist))
+				wake_up_worker(gcwq);
+		} else
+			atomic_dec(nr_running);
+	}
+
 	worker->flags |= flags;
 }
 
 /**
- * worker_clr_flags - clear worker flags
+ * worker_clr_flags - clear worker flags and adjust nr_running accordingly
  * @worker: worker to set flags for
  * @flags: flags to clear
  *
- * Clear @flags in @worker->flags.
+ * Clear @flags in @worker->flags and adjust nr_running accordingly.
  *
  * LOCKING:
  * spin_lock_irq(gcwq->lock).
  */
 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 {
+	struct global_cwq *gcwq = worker->gcwq;
+	unsigned int oflags = worker->flags;
+
 	worker->flags &= ~flags;
+
+	/* if transitioning out of NOT_RUNNING, increment nr_running */
+	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
+		if (!(worker->flags & WORKER_NOT_RUNNING))
+			atomic_inc(get_gcwq_nr_running(gcwq->cpu));
 }
 
 /**
@@ -540,6 +743,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 			struct work_struct *work, struct list_head *head,
 			unsigned int extra_flags)
 {
+	struct global_cwq *gcwq = cwq->gcwq;
+
 	/* we own @work, set data and link */
 	set_work_cwq(work, cwq, extra_flags);
 
@@ -550,7 +755,16 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 	smp_wmb();
 
 	list_add_tail(&work->entry, head);
-	wake_up_worker(cwq->gcwq);
+
+	/*
+	 * Ensure either worker_sched_deactivated() sees the above
+	 * list_add_tail() or we see zero nr_running to avoid workers
+	 * lying around lazily while there are works to be processed.
+	 */
+	smp_mb();
+
+	if (!atomic_read(get_gcwq_nr_running(gcwq->cpu)))
+		wake_up_worker(gcwq);
 }
 
 /**
@@ -810,11 +1024,16 @@ static void worker_enter_idle(struct worker *worker)
 
 	worker_set_flags(worker, WORKER_IDLE, false);
 	gcwq->nr_idle++;
+	worker->last_active = jiffies;
 
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &gcwq->idle_list);
 
-	if (unlikely(worker->flags & WORKER_ROGUE))
+	if (likely(!(worker->flags & WORKER_ROGUE))) {
+		if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
+			mod_timer(&gcwq->idle_timer,
+				  jiffies + IDLE_WORKER_TIMEOUT);
+	} else
 		wake_up_all(&gcwq->trustee_wait);
 }
 
@@ -837,6 +1056,81 @@ static void worker_leave_idle(struct worker *worker)
 	list_del_init(&worker->entry);
 }
 
+/**
+ * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
+ * @worker: self
+ *
+ * Works which are scheduled while the cpu is online must at least be
+ * scheduled to a worker which is bound to the cpu so that if they are
+ * flushed from cpu callbacks while cpu is going down, they are
+ * guaranteed to execute on the cpu.
+ *
+ * This function is to be used by rogue workers and rescuers to bind
+ * themselves to the target cpu and may race with cpu going down or
+ * coming online.  kthread_bind() can't be used because it may put the
+ * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
+ * verbatim as it's best effort and blocking and gcwq may be
+ * [dis]associated in the meantime.
+ *
+ * This function tries set_cpus_allowed() and locks gcwq and verifies
+ * the binding against GCWQ_DISASSOCIATED which is set during
+ * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
+ * idle state or fetches works without dropping lock, it can guarantee
+ * the scheduling requirement described in the first paragraph.
+ *
+ * CONTEXT:
+ * Might sleep.  Called without any lock but returns with gcwq->lock
+ * held.
+ *
+ * RETURNS:
+ * %true if the associated gcwq is online (@worker is successfully
+ * bound), %false if offline.
+ */
+static bool worker_maybe_bind_and_lock(struct worker *worker)
+{
+	struct global_cwq *gcwq = worker->gcwq;
+	struct task_struct *task = worker->task;
+
+	while (true) {
+		/*
+		 * The following call may fail, succeed or succeed
+		 * without actually migrating the task to the cpu if
+		 * it races with cpu hotunplug operation.  Verify
+		 * against GCWQ_DISASSOCIATED.
+		 */
+		set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
+
+		spin_lock_irq(&gcwq->lock);
+		if (gcwq->flags & GCWQ_DISASSOCIATED)
+			return false;
+		if (task_cpu(task) == gcwq->cpu &&
+		    cpumask_equal(&current->cpus_allowed,
+				  get_cpu_mask(gcwq->cpu)))
+			return true;
+		spin_unlock_irq(&gcwq->lock);
+
+		/* CPU has come up inbetween, retry migration */
+		cpu_relax();
+	}
+}
+
+/*
+ * Function for worker->rebind_work used to rebind rogue busy workers
+ * to the associated cpu which is coming back online.  This is
+ * scheduled by cpu up but can race with other cpu hotplug operations
+ * and may be executed twice without intervening cpu down.
+ */
+static void worker_rebind_fn(struct work_struct *work)
+{
+	struct worker *worker = container_of(work, struct worker, rebind_work);
+	struct global_cwq *gcwq = worker->gcwq;
+
+	if (worker_maybe_bind_and_lock(worker))
+		worker_clr_flags(worker, WORKER_REBIND);
+
+	spin_unlock_irq(&gcwq->lock);
+}
+
 static struct worker *alloc_worker(void)
 {
 	struct worker *worker;
@@ -845,6 +1139,9 @@ static struct worker *alloc_worker(void)
 	if (worker) {
 		INIT_LIST_HEAD(&worker->entry);
 		INIT_LIST_HEAD(&worker->scheduled);
+		INIT_WORK(&worker->rebind_work, worker_rebind_fn);
+		/* on creation a worker is in !idle && prep state */
+		worker->flags = WORKER_PREP;
 	}
 	return worker;
 }
@@ -963,6 +1260,220 @@ static void destroy_worker(struct worker *worker)
 	ida_remove(&gcwq->worker_ida, id);
 }
 
+static void idle_worker_timeout(unsigned long __gcwq)
+{
+	struct global_cwq *gcwq = (void *)__gcwq;
+
+	spin_lock_irq(&gcwq->lock);
+
+	if (too_many_workers(gcwq)) {
+		struct worker *worker;
+		unsigned long expires;
+
+		/* idle_list is kept in LIFO order, check the last one */
+		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+
+		if (time_before(jiffies, expires))
+			mod_timer(&gcwq->idle_timer, expires);
+		else {
+			/* it's been idle for too long, wake up manager */
+			gcwq->flags |= GCWQ_MANAGE_WORKERS;
+			wake_up_worker(gcwq);
+		}
+	}
+
+	spin_unlock_irq(&gcwq->lock);
+}
+
+static bool send_mayday(struct work_struct *work)
+{
+	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+	struct workqueue_struct *wq = cwq->wq;
+
+	if (!(wq->flags & WQ_RESCUER))
+		return false;
+
+	/* mayday mayday mayday */
+	if (!cpumask_test_and_set_cpu(cwq->gcwq->cpu, wq->mayday_mask))
+		wake_up_process(wq->rescuer->task);
+	return true;
+}
+
+static void gcwq_mayday_timeout(unsigned long __gcwq)
+{
+	struct global_cwq *gcwq = (void *)__gcwq;
+	struct work_struct *work;
+
+	spin_lock_irq(&gcwq->lock);
+
+	if (need_to_create_worker(gcwq)) {
+		/*
+		 * We've been trying to create a new worker but
+		 * haven't been successful.  We might be hitting an
+		 * allocation deadlock.  Send distress signals to
+		 * rescuers.
+		 */
+		list_for_each_entry(work, &gcwq->worklist, entry)
+			send_mayday(work);
+	}
+
+	spin_unlock_irq(&gcwq->lock);
+
+	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
+}
+
+/**
+ * maybe_create_worker - create a new worker if necessary
+ * @gcwq: gcwq to create a new worker for
+ *
+ * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
+ * have at least one idle worker on return from this function.  If
+ * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
+ * sent to all rescuers with works scheduled on @gcwq to resolve
+ * possible allocation deadlock.
+ *
+ * On return, need_to_create_worker() is guaranteed to be false and
+ * may_start_working() true.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  Does GFP_KERNEL allocations.  Called only from
+ * manager.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true
+ * otherwise.
+ */
+static bool maybe_create_worker(struct global_cwq *gcwq)
+{
+	if (!need_to_create_worker(gcwq))
+		return false;
+restart:
+	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
+	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+
+	while (true) {
+		struct worker *worker;
+
+		spin_unlock_irq(&gcwq->lock);
+
+		worker = create_worker(gcwq, true);
+		if (worker) {
+			del_timer_sync(&gcwq->mayday_timer);
+			spin_lock_irq(&gcwq->lock);
+			start_worker(worker);
+			BUG_ON(need_to_create_worker(gcwq));
+			return true;
+		}
+
+		if (!need_to_create_worker(gcwq))
+			break;
+
+		spin_unlock_irq(&gcwq->lock);
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(CREATE_COOLDOWN);
+		spin_lock_irq(&gcwq->lock);
+		if (!need_to_create_worker(gcwq))
+			break;
+	}
+
+	spin_unlock_irq(&gcwq->lock);
+	del_timer_sync(&gcwq->mayday_timer);
+	spin_lock_irq(&gcwq->lock);
+	if (need_to_create_worker(gcwq))
+		goto restart;
+	return true;
+}
+
+/**
+ * maybe_destroy_worker - destroy workers which have been idle for a while
+ * @gcwq: gcwq to destroy workers for
+ *
+ * Destroy @gcwq workers which have been idle for longer than
+ * IDLE_WORKER_TIMEOUT.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  Called only from manager.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true
+ * otherwise.
+ */
+static bool maybe_destroy_workers(struct global_cwq *gcwq)
+{
+	bool ret = false;
+
+	while (too_many_workers(gcwq)) {
+		struct worker *worker;
+		unsigned long expires;
+
+		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+
+		if (time_before(jiffies, expires)) {
+			mod_timer(&gcwq->idle_timer, expires);
+			break;
+		}
+
+		destroy_worker(worker);
+		ret = true;
+	}
+
+	return ret;
+}
+
+/**
+ * manage_workers - manage worker pool
+ * @worker: self
+ *
+ * Assume the manager role and manage gcwq worker pool @worker belongs
+ * to.  At any given time, there can be only zero or one manager per
+ * gcwq.  The exclusion is handled automatically by this function.
+ *
+ * The caller can safely start processing works on false return.  On
+ * true return, it's guaranteed that need_to_create_worker() is false
+ * and may_start_working() is true.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  Does GFP_KERNEL allocations.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true if
+ * some action was taken.
+ */
+static bool manage_workers(struct worker *worker)
+{
+	struct global_cwq *gcwq = worker->gcwq;
+	bool ret = false;
+
+	if (gcwq->flags & GCWQ_MANAGING_WORKERS)
+		return ret;
+
+	gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
+	gcwq->flags |= GCWQ_MANAGING_WORKERS;
+
+	/*
+	 * Destroy and then create so that may_start_working() is true
+	 * on return.
+	 */
+	ret |= maybe_destroy_workers(gcwq);
+	ret |= maybe_create_worker(gcwq);
+
+	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
+	/*
+	 * The trustee might be waiting to take over the manager
+	 * position, tell it we're done.
+	 */
+	if (unlikely(gcwq->trustee))
+		wake_up_all(&gcwq->trustee_wait);
+
+	return ret;
+}
+
 /**
  * move_linked_works - move linked works to a list
  * @work: start of series of works to be scheduled
@@ -1169,24 +1680,39 @@ static void process_scheduled_works(struct worker *worker)
  * worker_thread - the worker thread function
  * @__worker: self
  *
- * The cwq worker thread function.
+ * The gcwq worker thread function.  There's a single dynamic pool of
+ * these per each cpu.  These workers process all works regardless of
+ * their specific target workqueue.  The only exception is works which
+ * belong to workqueues with a rescuer which will be explained in
+ * rescuer_thread().
  */
 static int worker_thread(void *__worker)
 {
 	struct worker *worker = __worker;
 	struct global_cwq *gcwq = worker->gcwq;
 
+	/* tell the scheduler that this is a workqueue worker */
+	worker->task->flags |= PF_WQ_WORKER;
 woke_up:
 	spin_lock_irq(&gcwq->lock);
 
 	/* DIE can be set only while we're idle, checking here is enough */
 	if (worker->flags & WORKER_DIE) {
 		spin_unlock_irq(&gcwq->lock);
+		worker->task->flags &= ~PF_WQ_WORKER;
 		return 0;
 	}
 
 	worker_leave_idle(worker);
 recheck:
+	/* no more worker necessary? */
+	if (!need_more_worker(gcwq))
+		goto sleep;
+
+	/* do we need to manage? */
+	if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+		goto recheck;
+
 	/*
 	 * ->scheduled list can only be filled while a worker is
 	 * preparing to process a work or actually processing it.
@@ -1194,27 +1720,18 @@ recheck:
 	 */
 	BUG_ON(!list_empty(&worker->scheduled));
 
-	while (!list_empty(&gcwq->worklist)) {
+	/*
+	 * When control reaches this point, we're guaranteed to have
+	 * at least one idle worker or that someone else has already
+	 * assumed the manager role.
+	 */
+	worker_clr_flags(worker, WORKER_PREP);
+
+	do {
 		struct work_struct *work =
 			list_first_entry(&gcwq->worklist,
 					 struct work_struct, entry);
 
-		/*
-		 * The following is a rather inefficient way to close
-		 * race window against cpu hotplug operations.  Will
-		 * be replaced soon.
-		 */
-		if (unlikely(!(worker->flags & WORKER_ROGUE) &&
-			     !cpumask_equal(&worker->task->cpus_allowed,
-					    get_cpu_mask(gcwq->cpu)))) {
-			spin_unlock_irq(&gcwq->lock);
-			set_cpus_allowed_ptr(worker->task,
-					     get_cpu_mask(gcwq->cpu));
-			cpu_relax();
-			spin_lock_irq(&gcwq->lock);
-			goto recheck;
-		}
-
 		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
 			/* optimization path, not strictly necessary */
 			process_one_work(worker, work);
@@ -1224,13 +1741,19 @@ recheck:
 			move_linked_works(work, &worker->scheduled, NULL);
 			process_scheduled_works(worker);
 		}
-	}
+	} while (keep_working(gcwq));
+
+	worker_set_flags(worker, WORKER_PREP, false);
 
+	if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
+		goto recheck;
+sleep:
 	/*
-	 * gcwq->lock is held and there's no work to process, sleep.
-	 * Workers are woken up only while holding gcwq->lock, so
-	 * setting the current state before releasing gcwq->lock is
-	 * enough to prevent losing any event.
+	 * gcwq->lock is held and there's no work to process and no
+	 * need to manage, sleep.  Workers are woken up only while
+	 * holding gcwq->lock or from local cpu, so setting the
+	 * current state before releasing gcwq->lock is enough to
+	 * prevent losing any event.
 	 */
 	worker_enter_idle(worker);
 	__set_current_state(TASK_INTERRUPTIBLE);
@@ -1239,6 +1762,68 @@ recheck:
 	goto woke_up;
 }
 
+/**
+ * rescuer_thread - the rescuer thread function
+ * @__wq: the associated workqueue
+ *
+ * Workqueue rescuer thread function.  There's one rescuer for each
+ * workqueue which has WQ_RESCUER set.
+ *
+ * Regular work processing on a gcwq may block trying to create a new
+ * worker which uses GFP_KERNEL allocation which has slight chance of
+ * developing into deadlock if some works currently on the same queue
+ * need to be processed to satisfy the GFP_KERNEL allocation.  This is
+ * the problem rescuer solves.
+ *
+ * When such condition is possible, the gcwq summons rescuers of all
+ * workqueues which have works queued on the gcwq and let them process
+ * those works so that forward progress can be guaranteed.
+ *
+ * This should happen rarely.
+ */
+static int rescuer_thread(void *__wq)
+{
+	struct workqueue_struct *wq = __wq;
+	struct worker *rescuer = wq->rescuer;
+	struct list_head *scheduled = &rescuer->scheduled;
+	unsigned int cpu;
+
+	set_user_nice(current, RESCUER_NICE_LEVEL);
+repeat:
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	if (kthread_should_stop())
+		return 0;
+
+	for_each_cpu(cpu, wq->mayday_mask) {
+		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+		struct global_cwq *gcwq = cwq->gcwq;
+		struct work_struct *work, *n;
+
+		__set_current_state(TASK_RUNNING);
+		cpumask_clear_cpu(cpu, wq->mayday_mask);
+
+		/* migrate to the target cpu if possible */
+		rescuer->gcwq = gcwq;
+		worker_maybe_bind_and_lock(rescuer);
+
+		/*
+		 * Slurp in all works issued via this workqueue and
+		 * process'em.
+		 */
+		BUG_ON(!list_empty(&rescuer->scheduled));
+		list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
+			if (get_work_cwq(work) == cwq)
+				move_linked_works(work, scheduled, &n);
+
+		process_scheduled_works(rescuer);
+		spin_unlock_irq(&gcwq->lock);
+	}
+
+	schedule();
+	goto repeat;
+}
+
 struct wq_barrier {
 	struct work_struct	work;
 	struct completion	done;
@@ -1998,7 +2583,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 						const char *lock_name)
 {
 	struct workqueue_struct *wq;
-	bool failed = false;
 	unsigned int cpu;
 
 	max_active = clamp_val(max_active, 1, INT_MAX);
@@ -2023,13 +2607,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	INIT_LIST_HEAD(&wq->list);
 
-	cpu_maps_update_begin();
-	/*
-	 * We must initialize cwqs for each possible cpu even if we
-	 * are going to call destroy_workqueue() finally. Otherwise
-	 * cpu_up() can hit the uninitialized cwq once we drop the
-	 * lock.
-	 */
 	for_each_possible_cpu(cpu) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		struct global_cwq *gcwq = get_gcwq(cpu);
@@ -2040,14 +2617,25 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		cwq->flush_color = -1;
 		cwq->max_active = max_active;
 		INIT_LIST_HEAD(&cwq->delayed_works);
+	}
 
-		if (failed)
-			continue;
-		cwq->worker = create_worker(gcwq, cpu_online(cpu));
-		if (cwq->worker)
-			start_worker(cwq->worker);
-		else
-			failed = true;
+	if (flags & WQ_RESCUER) {
+		struct worker *rescuer;
+
+		if (!alloc_cpumask_var(&wq->mayday_mask, GFP_KERNEL))
+			goto err;
+
+		wq->rescuer = rescuer = alloc_worker();
+		if (!rescuer)
+			goto err;
+
+		rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+		if (IS_ERR(rescuer->task))
+			goto err;
+
+		wq->rescuer = rescuer;
+		rescuer->task->flags |= PF_THREAD_BOUND;
+		wake_up_process(rescuer->task);
 	}
 
 	/*
@@ -2065,16 +2653,12 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 
 	spin_unlock(&workqueue_lock);
 
-	cpu_maps_update_done();
-
-	if (failed) {
-		destroy_workqueue(wq);
-		wq = NULL;
-	}
 	return wq;
 err:
 	if (wq) {
 		free_cwqs(wq->cpu_wq);
+		free_cpumask_var(wq->mayday_mask);
+		kfree(wq->rescuer);
 		kfree(wq);
 	}
 	return NULL;
@@ -2097,42 +2681,26 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	 * wq list is used to freeze wq, remove from list after
 	 * flushing is complete in case freeze races us.
 	 */
-	cpu_maps_update_begin();
 	spin_lock(&workqueue_lock);
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
-	cpu_maps_update_done();
 
+	/* sanity check */
 	for_each_possible_cpu(cpu) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-		struct global_cwq *gcwq = cwq->gcwq;
 		int i;
 
-		if (cwq->worker) {
-		retry:
-			spin_lock_irq(&gcwq->lock);
-			/*
-			 * Worker can only be destroyed while idle.
-			 * Wait till it becomes idle.  This is ugly
-			 * and prone to starvation.  It will go away
-			 * once dynamic worker pool is implemented.
-			 */
-			if (!(cwq->worker->flags & WORKER_IDLE)) {
-				spin_unlock_irq(&gcwq->lock);
-				msleep(100);
-				goto retry;
-			}
-			destroy_worker(cwq->worker);
-			cwq->worker = NULL;
-			spin_unlock_irq(&gcwq->lock);
-		}
-
 		for (i = 0; i < WORK_NR_COLORS; i++)
 			BUG_ON(cwq->nr_in_flight[i]);
 		BUG_ON(cwq->nr_active);
 		BUG_ON(!list_empty(&cwq->delayed_works));
 	}
 
+	if (wq->flags & WQ_RESCUER) {
+		kthread_stop(wq->rescuer->task);
+		free_cpumask_var(wq->mayday_mask);
+	}
+
 	free_cwqs(wq->cpu_wq);
 	kfree(wq);
 }
@@ -2141,10 +2709,18 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
 /*
  * CPU hotplug.
  *
- * CPU hotplug is implemented by allowing cwqs to be detached from
- * CPU, running with unbound workers and allowing them to be
- * reattached later if the cpu comes back online.  A separate thread
- * is created to govern cwqs in such state and is called the trustee.
+ * There are two challenges in supporting CPU hotplug.  Firstly, there
+ * are a lot of assumptions on strong associations among work, cwq and
+ * gcwq which make migrating pending and scheduled works very
+ * difficult to implement without impacting hot paths.  Secondly,
+ * gcwqs serve mix of short, long and very long running works making
+ * blocked draining impractical.
+ *
+ * This is solved by allowing a gcwq to be detached from CPU, running
+ * it with unbound (rogue) workers and allowing it to be reattached
+ * later if the cpu comes back online.  A separate thread is created
+ * to govern a gcwq in such state and is called the trustee of the
+ * gcwq.
  *
  * Trustee states and their descriptions.
  *
@@ -2152,11 +2728,12 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
  *		new trustee is started with this state.
  *
  * IN_CHARGE	Once started, trustee will enter this state after
- *		making all existing workers rogue.  DOWN_PREPARE waits
- *		for trustee to enter this state.  After reaching
- *		IN_CHARGE, trustee tries to execute the pending
- *		worklist until it's empty and the state is set to
- *		BUTCHER, or the state is set to RELEASE.
+ *		assuming the manager role and making all existing
+ *		workers rogue.  DOWN_PREPARE waits for trustee to
+ *		enter this state.  After reaching IN_CHARGE, trustee
+ *		tries to execute the pending worklist until it's empty
+ *		and the state is set to BUTCHER, or the state is set
+ *		to RELEASE.
  *
  * BUTCHER	Command state which is set by the cpu callback after
  *		the cpu has went down.  Once this state is set trustee
@@ -2167,7 +2744,9 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
  * RELEASE	Command state which is set by the cpu callback if the
  *		cpu down has been canceled or it has come online
  *		again.  After recognizing this state, trustee stops
- *		trying to drain or butcher and transits to DONE.
+ *		trying to drain or butcher and clears ROGUE, rebinds
+ *		all remaining workers back to the cpu and releases
+ *		manager role.
  *
  * DONE		Trustee will enter this state after BUTCHER or RELEASE
  *		is complete.
@@ -2233,17 +2812,24 @@ static int __cpuinit trustee_thread(void *__gcwq)
 {
 	struct global_cwq *gcwq = __gcwq;
 	struct worker *worker;
+	struct work_struct *work;
 	struct hlist_node *pos;
+	long rc;
 	int i;
 
 	BUG_ON(gcwq->cpu != smp_processor_id());
 
 	spin_lock_irq(&gcwq->lock);
 	/*
-	 * Make all workers rogue.  Trustee must be bound to the
-	 * target cpu and can't be cancelled.
+	 * Claim the manager position and make all workers rogue.
+	 * Trustee must be bound to the target cpu and can't be
+	 * cancelled.
 	 */
 	BUG_ON(gcwq->cpu != smp_processor_id());
+	rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
+	BUG_ON(rc < 0);
+
+	gcwq->flags |= GCWQ_MANAGING_WORKERS;
 
 	list_for_each_entry(worker, &gcwq->idle_list, entry)
 		worker_set_flags(worker, WORKER_ROGUE, false);
@@ -2251,6 +2837,28 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	for_each_busy_worker(worker, i, pos, gcwq)
 		worker_set_flags(worker, WORKER_ROGUE, false);
 
+	/*
+	 * Call schedule() so that we cross rq->lock and thus can
+	 * guarantee sched callbacks see the rogue flag.  This is
+	 * necessary as scheduler callbacks may be invoked from other
+	 * cpus.
+	 */
+	spin_unlock_irq(&gcwq->lock);
+	schedule();
+	spin_lock_irq(&gcwq->lock);
+
+	/*
+	 * Sched callbacks are disabled now.  gcwq->nr_running should
+	 * be zero and will stay that way, making need_more_worker()
+	 * and keep_working() always return true as long as the
+	 * worklist is not empty.
+	 */
+	WARN_ON_ONCE(atomic_read(get_gcwq_nr_running(gcwq->cpu)) != 0);
+
+	spin_unlock_irq(&gcwq->lock);
+	del_timer_sync(&gcwq->idle_timer);
+	spin_lock_irq(&gcwq->lock);
+
 	/*
 	 * We're now in charge.  Notify and proceed to drain.  We need
 	 * to keep the gcwq running during the whole CPU down
@@ -2263,18 +2871,90 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	/*
 	 * The original cpu is in the process of dying and may go away
 	 * anytime now.  When that happens, we and all workers would
-	 * be migrated to other cpus.  Try draining any left work.
-	 * Note that if the gcwq is frozen, there may be frozen works
-	 * in freezeable cwqs.  Don't declare completion while frozen.
+	 * be migrated to other cpus.  Try draining any left work.  We
+	 * want to get it over with ASAP - spam rescuers, wake up as
+	 * many idlers as necessary and create new ones till the
+	 * worklist is empty.  Note that if the gcwq is frozen, there
+	 * may be frozen works in freezeable cwqs.  Don't declare
+	 * completion while frozen.
 	 */
 	while (gcwq->nr_workers != gcwq->nr_idle ||
 	       gcwq->flags & GCWQ_FREEZING ||
 	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
+		int nr_works = 0;
+
+		list_for_each_entry(work, &gcwq->worklist, entry) {
+			send_mayday(work);
+			nr_works++;
+		}
+
+		list_for_each_entry(worker, &gcwq->idle_list, entry) {
+			if (!nr_works--)
+				break;
+			wake_up_process(worker->task);
+		}
+
+		if (need_to_create_worker(gcwq)) {
+			spin_unlock_irq(&gcwq->lock);
+			worker = create_worker(gcwq, false);
+			spin_lock_irq(&gcwq->lock);
+			if (worker) {
+				worker_set_flags(worker, WORKER_ROGUE, false);
+				start_worker(worker);
+			}
+		}
+
 		/* give a breather */
 		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
 			break;
 	}
 
+	/*
+	 * Either all works have been scheduled and cpu is down, or
+	 * cpu down has already been canceled.  Wait for and butcher
+	 * all workers till we're canceled.
+	 */
+	do {
+		rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
+		while (!list_empty(&gcwq->idle_list))
+			destroy_worker(list_first_entry(&gcwq->idle_list,
+							struct worker, entry));
+	} while (gcwq->nr_workers && rc >= 0);
+
+	/*
+	 * At this point, either draining has completed and no worker
+	 * is left, or cpu down has been canceled or the cpu is being
+	 * brought back up.  There shouldn't be any idle one left.
+	 * Tell the remaining busy ones to rebind once it finishes the
+	 * currently scheduled works by scheduling the rebind_work.
+	 */
+	WARN_ON(!list_empty(&gcwq->idle_list));
+
+	for_each_busy_worker(worker, i, pos, gcwq) {
+		struct work_struct *rebind_work = &worker->rebind_work;
+
+		/*
+		 * Rebind_work may race with future cpu hotplug
+		 * operations.  Use a separate flag to mark that
+		 * rebinding is scheduled.
+		 */
+		worker_set_flags(worker, WORKER_REBIND, false);
+		worker_clr_flags(worker, WORKER_ROGUE);
+
+		/* queue rebind_work, wq doesn't matter, use the default one */
+		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+				     work_data_bits(rebind_work)))
+			continue;
+
+		debug_work_activate(rebind_work);
+		insert_work(get_cwq(gcwq->cpu, keventd_wq), rebind_work,
+			    worker->scheduled.next,
+			    work_color_to_flags(WORK_NO_COLOR));
+	}
+
+	/* relinquish manager role */
+	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
 	/* notify completion */
 	gcwq->trustee = NULL;
 	gcwq->trustee_state = TRUSTEE_DONE;
@@ -2313,10 +2993,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
 	struct task_struct *new_trustee = NULL;
-	struct worker *worker;
-	struct hlist_node *pos;
+	struct worker *uninitialized_var(new_worker);
 	unsigned long flags;
-	int i;
 
 	action &= ~CPU_TASKS_FROZEN;
 
@@ -2327,6 +3005,15 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		if (IS_ERR(new_trustee))
 			return notifier_from_errno(PTR_ERR(new_trustee));
 		kthread_bind(new_trustee, cpu);
+		/* fall through */
+	case CPU_UP_PREPARE:
+		BUG_ON(gcwq->first_idle);
+		new_worker = create_worker(gcwq, false);
+		if (!new_worker) {
+			if (new_trustee)
+				kthread_stop(new_trustee);
+			return NOTIFY_BAD;
+		}
 	}
 
 	/* some are called w/ irq disabled, don't disturb irq status */
@@ -2340,26 +3027,50 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		gcwq->trustee_state = TRUSTEE_START;
 		wake_up_process(gcwq->trustee);
 		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
+		/* fall through */
+	case CPU_UP_PREPARE:
+		BUG_ON(gcwq->first_idle);
+		gcwq->first_idle = new_worker;
+		break;
+
+	case CPU_DYING:
+		/*
+		 * Before this, the trustee and all workers except for
+		 * the ones which are still executing works from
+		 * before the last CPU down must be on the cpu.  After
+		 * this, they'll all be diasporas.
+		 */
+		gcwq->flags |= GCWQ_DISASSOCIATED;
 		break;
 
 	case CPU_POST_DEAD:
 		gcwq->trustee_state = TRUSTEE_BUTCHER;
+		/* fall through */
+	case CPU_UP_CANCELED:
+		destroy_worker(gcwq->first_idle);
+		gcwq->first_idle = NULL;
 		break;
 
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
+		gcwq->flags &= ~GCWQ_DISASSOCIATED;
 		if (gcwq->trustee_state != TRUSTEE_DONE) {
 			gcwq->trustee_state = TRUSTEE_RELEASE;
 			wake_up_process(gcwq->trustee);
 			wait_trustee_state(gcwq, TRUSTEE_DONE);
 		}
 
-		/* clear ROGUE from all workers */
-		list_for_each_entry(worker, &gcwq->idle_list, entry)
-			worker_clr_flags(worker, WORKER_ROGUE);
-
-		for_each_busy_worker(worker, i, pos, gcwq)
-			worker_clr_flags(worker, WORKER_ROGUE);
+		/*
+		 * Trustee is done and there might be no worker left.
+		 * Put the first_idle in and request a real manager to
+		 * take a look.
+		 */
+		spin_unlock_irq(&gcwq->lock);
+		kthread_bind(gcwq->first_idle->task, cpu);
+		spin_lock_irq(&gcwq->lock);
+		gcwq->flags |= GCWQ_MANAGE_WORKERS;
+		start_worker(gcwq->first_idle);
+		gcwq->first_idle = NULL;
 		break;
 	}
 
@@ -2548,10 +3259,10 @@ void thaw_workqueues(void)
 			if (wq->single_cpu == gcwq->cpu &&
 			    !cwq->nr_active && list_empty(&cwq->delayed_works))
 				cwq_unbind_single_cpu(cwq);
-
-			wake_up_process(cwq->worker->task);
 		}
 
+		wake_up_worker(gcwq);
+
 		spin_unlock_irq(&gcwq->lock);
 	}
 
@@ -2588,12 +3299,31 @@ void __init init_workqueues(void)
 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
 
+		init_timer_deferrable(&gcwq->idle_timer);
+		gcwq->idle_timer.function = idle_worker_timeout;
+		gcwq->idle_timer.data = (unsigned long)gcwq;
+
+		setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
+			    (unsigned long)gcwq);
+
 		ida_init(&gcwq->worker_ida);
 
 		gcwq->trustee_state = TRUSTEE_DONE;
 		init_waitqueue_head(&gcwq->trustee_wait);
 	}
 
+	/* create the initial worker */
+	for_each_online_cpu(cpu) {
+		struct global_cwq *gcwq = get_gcwq(cpu);
+		struct worker *worker;
+
+		worker = create_worker(gcwq, true);
+		BUG_ON(!worker);
+		spin_lock_irq(&gcwq->lock);
+		start_worker(worker);
+		spin_unlock_irq(&gcwq->lock);
+	}
+
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
 }
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
index af040babb742..2d10fc98dc79 100644
--- a/kernel/workqueue_sched.h
+++ b/kernel/workqueue_sched.h
@@ -4,13 +4,6 @@
  * Scheduler hooks for concurrency managed workqueue.  Only to be
  * included from sched.c and workqueue.c.
  */
-static inline void wq_worker_waking_up(struct task_struct *task,
-				       unsigned int cpu)
-{
-}
-
-static inline struct task_struct *wq_worker_sleeping(struct task_struct *task,
-						     unsigned int cpu)
-{
-	return NULL;
-}
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+				       unsigned int cpu);
-- 
cgit v1.2.3-59-g8ed1b


From b71ab8c2025caef8db719aa41af0ed735dc543cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:14 +0200
Subject: workqueue: increase max_active of keventd and kill
 current_is_keventd()

Define WQ_MAX_ACTIVE and create keventd with max_active set to half of
it which means that keventd now can process upto WQ_MAX_ACTIVE / 2 - 1
works concurrently.  Unless some combination can result in dependency
loop longer than max_active, deadlock won't happen and thus it's
unnecessary to check whether current_is_keventd() before trying to
schedule a work.  Kill current_is_keventd().

(Lockdep annotations are broken.  We need lock_map_acquire_read_norecurse())

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 arch/ia64/kernel/smpboot.c |  2 +-
 arch/x86/kernel/smpboot.c  |  2 +-
 include/linux/workqueue.h  |  4 ++-
 kernel/workqueue.c         | 63 ++++++++++------------------------------------
 4 files changed, 18 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 6a1380e90f87..99dcc85193c9 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -519,7 +519,7 @@ do_boot_cpu (int sapicid, int cpu)
 	/*
 	 * We can't use kernel_thread since we must avoid to reschedule the child.
 	 */
-	if (!keventd_up() || current_is_keventd())
+	if (!keventd_up())
 		c_idle.work.func(&c_idle.work);
 	else {
 		schedule_work(&c_idle.work);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d6..4d90f376e985 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -735,7 +735,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 		goto do_rest;
 	}
 
-	if (!keventd_up() || current_is_keventd())
+	if (!keventd_up())
 		c_idle.work.func(&c_idle.work);
 	else {
 		schedule_work(&c_idle.work);
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index b8f4ec45c40a..33e24e734d50 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -227,6 +227,9 @@ enum {
 	WQ_SINGLE_CPU		= 1 << 1, /* only single cpu at a time */
 	WQ_NON_REENTRANT	= 1 << 2, /* guarantee non-reentrance */
 	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
+
+	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
+	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
 };
 
 extern struct workqueue_struct *
@@ -280,7 +283,6 @@ extern int schedule_delayed_work(struct delayed_work *work, unsigned long delay)
 extern int schedule_delayed_work_on(int cpu, struct delayed_work *work,
 					unsigned long delay);
 extern int schedule_on_each_cpu(work_func_t func);
-extern int current_is_keventd(void);
 extern int keventd_up(void);
 
 extern void init_workqueues(void);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ad46523b423..4190e84cf995 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2398,7 +2398,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
 	int cpu;
-	int orig = -1;
 	struct work_struct *works;
 
 	works = alloc_percpu(struct work_struct);
@@ -2407,23 +2406,12 @@ int schedule_on_each_cpu(work_func_t func)
 
 	get_online_cpus();
 
-	/*
-	 * When running in keventd don't schedule a work item on
-	 * itself.  Can just call directly because the work queue is
-	 * already bound.  This also is faster.
-	 */
-	if (current_is_keventd())
-		orig = raw_smp_processor_id();
-
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
-		if (cpu != orig)
-			schedule_work_on(cpu, work);
+		schedule_work_on(cpu, work);
 	}
-	if (orig >= 0)
-		func(per_cpu_ptr(works, orig));
 
 	for_each_online_cpu(cpu)
 		flush_work(per_cpu_ptr(works, cpu));
@@ -2494,41 +2482,6 @@ int keventd_up(void)
 	return keventd_wq != NULL;
 }
 
-int current_is_keventd(void)
-{
-	bool found = false;
-	unsigned int cpu;
-
-	/*
-	 * There no longer is one-to-one relation between worker and
-	 * work queue and a worker task might be unbound from its cpu
-	 * if the cpu was offlined.  Match all busy workers.  This
-	 * function will go away once dynamic pool is implemented.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker *worker;
-		struct hlist_node *pos;
-		unsigned long flags;
-		int i;
-
-		spin_lock_irqsave(&gcwq->lock, flags);
-
-		for_each_busy_worker(worker, i, pos, gcwq) {
-			if (worker->task == current) {
-				found = true;
-				break;
-			}
-		}
-
-		spin_unlock_irqrestore(&gcwq->lock, flags);
-		if (found)
-			break;
-	}
-
-	return found;
-}
-
 static struct cpu_workqueue_struct *alloc_cwqs(void)
 {
 	/*
@@ -2576,6 +2529,16 @@ static void free_cwqs(struct cpu_workqueue_struct *cwqs)
 #endif
 }
 
+static int wq_clamp_max_active(int max_active, const char *name)
+{
+	if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
+		printk(KERN_WARNING "workqueue: max_active %d requested for %s "
+		       "is out of range, clamping between %d and %d\n",
+		       max_active, name, 1, WQ_MAX_ACTIVE);
+
+	return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
+}
+
 struct workqueue_struct *__create_workqueue_key(const char *name,
 						unsigned int flags,
 						int max_active,
@@ -2585,7 +2548,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 	struct workqueue_struct *wq;
 	unsigned int cpu;
 
-	max_active = clamp_val(max_active, 1, INT_MAX);
+	max_active = wq_clamp_max_active(max_active, name);
 
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
@@ -3324,6 +3287,6 @@ void __init init_workqueues(void)
 		spin_unlock_irq(&gcwq->lock);
 	}
 
-	keventd_wq = create_workqueue("events");
+	keventd_wq = __create_workqueue("events", 0, WQ_DFL_ACTIVE);
 	BUG_ON(!keventd_wq);
 }
-- 
cgit v1.2.3-59-g8ed1b


From d320c03830b17af64e4547075003b1eeb274bc6c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:14 +0200
Subject: workqueue: s/__create_workqueue()/alloc_workqueue()/, and add system
 workqueues

This patch makes changes to make new workqueue features available to
its users.

* Now that workqueue is more featureful, there should be a public
  workqueue creation function which takes paramters to control them.
  Rename __create_workqueue() to alloc_workqueue() and make 0
  max_active mean WQ_DFL_ACTIVE.  In the long run, all
  create_workqueue_*() will be converted over to alloc_workqueue().

* To further unify access interface, rename keventd_wq to system_wq
  and export it.

* Add system_long_wq and system_nrt_wq.  The former is to host long
  running works separately (so that flush_scheduled_work() dosen't
  take so long) and the latter guarantees any queued work item is
  never executed in parallel by multiple CPUs.  These will be used by
  future patches to update workqueue users.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 40 +++++++++++++++++++++++++++++-----------
 kernel/workqueue.c        | 42 +++++++++++++++++++++++++-----------------
 2 files changed, 54 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 33e24e734d50..48b7422f25ae 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -232,12 +232,31 @@ enum {
 	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
 };
 
+/*
+ * System-wide workqueues which are always present.
+ *
+ * system_wq is the one used by schedule[_delayed]_work[_on]().
+ * Multi-CPU multi-threaded.  There are users which expect relatively
+ * short queue flush time.  Don't queue works which can run for too
+ * long.
+ *
+ * system_long_wq is similar to system_wq but may host long running
+ * works.  Queue flushing might take relatively long.
+ *
+ * system_nrt_wq is non-reentrant and guarantees that any given work
+ * item is never executed in parallel by multiple CPUs.  Queue
+ * flushing might take relatively long.
+ */
+extern struct workqueue_struct *system_wq;
+extern struct workqueue_struct *system_long_wq;
+extern struct workqueue_struct *system_nrt_wq;
+
 extern struct workqueue_struct *
-__create_workqueue_key(const char *name, unsigned int flags, int max_active,
-		       struct lock_class_key *key, const char *lock_name);
+__alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
+		      struct lock_class_key *key, const char *lock_name);
 
 #ifdef CONFIG_LOCKDEP
-#define __create_workqueue(name, flags, max_active)		\
+#define alloc_workqueue(name, flags, max_active)		\
 ({								\
 	static struct lock_class_key __key;			\
 	const char *__lock_name;				\
@@ -247,21 +266,20 @@ __create_workqueue_key(const char *name, unsigned int flags, int max_active,
 	else							\
 		__lock_name = #name;				\
 								\
-	__create_workqueue_key((name), (flags), (max_active),	\
-				&__key, __lock_name);		\
+	__alloc_workqueue_key((name), (flags), (max_active),	\
+			      &__key, __lock_name);		\
 })
 #else
-#define __create_workqueue(name, flags, max_active)		\
-	__create_workqueue_key((name), (flags), (max_active), NULL, NULL)
+#define alloc_workqueue(name, flags, max_active)		\
+	__alloc_workqueue_key((name), (flags), (max_active), NULL, NULL)
 #endif
 
 #define create_workqueue(name)					\
-	__create_workqueue((name), WQ_RESCUER, 1)
+	alloc_workqueue((name), WQ_RESCUER, 1)
 #define create_freezeable_workqueue(name)			\
-	__create_workqueue((name),				\
-			   WQ_FREEZEABLE | WQ_SINGLE_CPU | WQ_RESCUER, 1)
+	alloc_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_CPU | WQ_RESCUER, 1)
 #define create_singlethread_workqueue(name)			\
-	__create_workqueue((name), WQ_SINGLE_CPU | WQ_RESCUER, 1)
+	alloc_workqueue((name), WQ_SINGLE_CPU | WQ_RESCUER, 1)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4190e84cf995..16ce617974d2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -210,6 +210,13 @@ struct workqueue_struct {
 #endif
 };
 
+struct workqueue_struct *system_wq __read_mostly;
+struct workqueue_struct *system_long_wq __read_mostly;
+struct workqueue_struct *system_nrt_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_wq);
+EXPORT_SYMBOL_GPL(system_long_wq);
+EXPORT_SYMBOL_GPL(system_nrt_wq);
+
 #define for_each_busy_worker(worker, i, pos, gcwq)			\
 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
 		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -2306,8 +2313,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork)
 }
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
-static struct workqueue_struct *keventd_wq __read_mostly;
-
 /**
  * schedule_work - put work task in global workqueue
  * @work: job to be done
@@ -2321,7 +2326,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
  */
 int schedule_work(struct work_struct *work)
 {
-	return queue_work(keventd_wq, work);
+	return queue_work(system_wq, work);
 }
 EXPORT_SYMBOL(schedule_work);
 
@@ -2334,7 +2339,7 @@ EXPORT_SYMBOL(schedule_work);
  */
 int schedule_work_on(int cpu, struct work_struct *work)
 {
-	return queue_work_on(cpu, keventd_wq, work);
+	return queue_work_on(cpu, system_wq, work);
 }
 EXPORT_SYMBOL(schedule_work_on);
 
@@ -2349,7 +2354,7 @@ EXPORT_SYMBOL(schedule_work_on);
 int schedule_delayed_work(struct delayed_work *dwork,
 					unsigned long delay)
 {
-	return queue_delayed_work(keventd_wq, dwork, delay);
+	return queue_delayed_work(system_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work);
 
@@ -2382,7 +2387,7 @@ EXPORT_SYMBOL(flush_delayed_work);
 int schedule_delayed_work_on(int cpu,
 			struct delayed_work *dwork, unsigned long delay)
 {
-	return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
+	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work_on);
 
@@ -2447,7 +2452,7 @@ int schedule_on_each_cpu(work_func_t func)
  */
 void flush_scheduled_work(void)
 {
-	flush_workqueue(keventd_wq);
+	flush_workqueue(system_wq);
 }
 EXPORT_SYMBOL(flush_scheduled_work);
 
@@ -2479,7 +2484,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
 
 int keventd_up(void)
 {
-	return keventd_wq != NULL;
+	return system_wq != NULL;
 }
 
 static struct cpu_workqueue_struct *alloc_cwqs(void)
@@ -2539,15 +2544,16 @@ static int wq_clamp_max_active(int max_active, const char *name)
 	return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
 }
 
-struct workqueue_struct *__create_workqueue_key(const char *name,
-						unsigned int flags,
-						int max_active,
-						struct lock_class_key *key,
-						const char *lock_name)
+struct workqueue_struct *__alloc_workqueue_key(const char *name,
+					       unsigned int flags,
+					       int max_active,
+					       struct lock_class_key *key,
+					       const char *lock_name)
 {
 	struct workqueue_struct *wq;
 	unsigned int cpu;
 
+	max_active = max_active ?: WQ_DFL_ACTIVE;
 	max_active = wq_clamp_max_active(max_active, name);
 
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
@@ -2626,7 +2632,7 @@ err:
 	}
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__create_workqueue_key);
+EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
 
 /**
  * destroy_workqueue - safely terminate a workqueue
@@ -2910,7 +2916,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 			continue;
 
 		debug_work_activate(rebind_work);
-		insert_work(get_cwq(gcwq->cpu, keventd_wq), rebind_work,
+		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
 			    worker->scheduled.next,
 			    work_color_to_flags(WORK_NO_COLOR));
 	}
@@ -3287,6 +3293,8 @@ void __init init_workqueues(void)
 		spin_unlock_irq(&gcwq->lock);
 	}
 
-	keventd_wq = __create_workqueue("events", 0, WQ_DFL_ACTIVE);
-	BUG_ON(!keventd_wq);
+	system_wq = alloc_workqueue("events", 0, 0);
+	system_long_wq = alloc_workqueue("events_long", 0, 0);
+	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
+	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
 }
-- 
cgit v1.2.3-59-g8ed1b


From dcd989cb73ab0f7b722d64ab6516f101d9f43f88 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:14 +0200
Subject: workqueue: implement several utility APIs

Implement the following utility APIs.

 workqueue_set_max_active()	: adjust max_active of a wq
 workqueue_congested()		: test whether a wq is contested
 work_cpu()			: determine the last / current cpu of a work
 work_busy()			: query whether a work is busy

* Anton Blanchard fixed missing ret initialization in work_busy().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Anton Blanchard <anton@samba.org>
---
 include/linux/workqueue.h |  11 ++++-
 kernel/workqueue.c        | 108 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 117 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 48b7422f25ae..0a7f79729380 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -61,6 +61,10 @@ enum {
 	WORK_STRUCT_FLAG_MASK	= (1UL << WORK_STRUCT_FLAG_BITS) - 1,
 	WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK,
 	WORK_STRUCT_NO_CPU	= NR_CPUS << WORK_STRUCT_FLAG_BITS,
+
+	/* bit mask for work_busy() return values */
+	WORK_BUSY_PENDING	= 1 << 0,
+	WORK_BUSY_RUNNING	= 1 << 1,
 };
 
 struct work_struct {
@@ -307,9 +311,14 @@ extern void init_workqueues(void);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
 extern int flush_work(struct work_struct *work);
-
 extern int cancel_work_sync(struct work_struct *work);
 
+extern void workqueue_set_max_active(struct workqueue_struct *wq,
+				     int max_active);
+extern bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq);
+extern unsigned int work_cpu(struct work_struct *work);
+extern unsigned int work_busy(struct work_struct *work);
+
 /*
  * Kill off a pending schedule_delayed_work().  Note that the work callback
  * function may still be running on return from cancel_delayed_work(), unless
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 16ce617974d2..c1aa65c2ff38 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -203,7 +203,7 @@ struct workqueue_struct {
 	cpumask_var_t		mayday_mask;	/* cpus requesting rescue */
 	struct worker		*rescuer;	/* I: rescue worker */
 
-	int			saved_max_active; /* I: saved cwq max_active */
+	int			saved_max_active; /* W: saved cwq max_active */
 	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	lockdep_map;
@@ -2675,6 +2675,112 @@ void destroy_workqueue(struct workqueue_struct *wq)
 }
 EXPORT_SYMBOL_GPL(destroy_workqueue);
 
+/**
+ * workqueue_set_max_active - adjust max_active of a workqueue
+ * @wq: target workqueue
+ * @max_active: new max_active value.
+ *
+ * Set max_active of @wq to @max_active.
+ *
+ * CONTEXT:
+ * Don't call from IRQ context.
+ */
+void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
+{
+	unsigned int cpu;
+
+	max_active = wq_clamp_max_active(max_active, wq->name);
+
+	spin_lock(&workqueue_lock);
+
+	wq->saved_max_active = max_active;
+
+	for_each_possible_cpu(cpu) {
+		struct global_cwq *gcwq = get_gcwq(cpu);
+
+		spin_lock_irq(&gcwq->lock);
+
+		if (!(wq->flags & WQ_FREEZEABLE) ||
+		    !(gcwq->flags & GCWQ_FREEZING))
+			get_cwq(gcwq->cpu, wq)->max_active = max_active;
+
+		spin_unlock_irq(&gcwq->lock);
+	}
+
+	spin_unlock(&workqueue_lock);
+}
+EXPORT_SYMBOL_GPL(workqueue_set_max_active);
+
+/**
+ * workqueue_congested - test whether a workqueue is congested
+ * @cpu: CPU in question
+ * @wq: target workqueue
+ *
+ * Test whether @wq's cpu workqueue for @cpu is congested.  There is
+ * no synchronization around this function and the test result is
+ * unreliable and only useful as advisory hints or for debugging.
+ *
+ * RETURNS:
+ * %true if congested, %false otherwise.
+ */
+bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
+{
+	struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+	return !list_empty(&cwq->delayed_works);
+}
+EXPORT_SYMBOL_GPL(workqueue_congested);
+
+/**
+ * work_cpu - return the last known associated cpu for @work
+ * @work: the work of interest
+ *
+ * RETURNS:
+ * CPU number if @work was ever queued.  NR_CPUS otherwise.
+ */
+unsigned int work_cpu(struct work_struct *work)
+{
+	struct global_cwq *gcwq = get_work_gcwq(work);
+
+	return gcwq ? gcwq->cpu : NR_CPUS;
+}
+EXPORT_SYMBOL_GPL(work_cpu);
+
+/**
+ * work_busy - test whether a work is currently pending or running
+ * @work: the work to be tested
+ *
+ * Test whether @work is currently pending or running.  There is no
+ * synchronization around this function and the test result is
+ * unreliable and only useful as advisory hints or for debugging.
+ * Especially for reentrant wqs, the pending state might hide the
+ * running state.
+ *
+ * RETURNS:
+ * OR'd bitmask of WORK_BUSY_* bits.
+ */
+unsigned int work_busy(struct work_struct *work)
+{
+	struct global_cwq *gcwq = get_work_gcwq(work);
+	unsigned long flags;
+	unsigned int ret = 0;
+
+	if (!gcwq)
+		return false;
+
+	spin_lock_irqsave(&gcwq->lock, flags);
+
+	if (work_pending(work))
+		ret |= WORK_BUSY_PENDING;
+	if (find_worker_executing_work(gcwq, work))
+		ret |= WORK_BUSY_RUNNING;
+
+	spin_unlock_irqrestore(&gcwq->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(work_busy);
+
 /*
  * CPU hotplug.
  *
-- 
cgit v1.2.3-59-g8ed1b


From 649027d73a6309ac34dc2886362e662bd73456dc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:14 +0200
Subject: workqueue: implement high priority workqueue

This patch implements high priority workqueue which can be specified
with WQ_HIGHPRI flag on creation.  A high priority workqueue has the
following properties.

* A work queued to it is queued at the head of the worklist of the
  respective gcwq after other highpri works, while normal works are
  always appended at the end.

* As long as there are highpri works on gcwq->worklist,
  [__]need_more_worker() remains %true and process_one_work() wakes up
  another worker before it start executing a work.

The above two properties guarantee that works queued to high priority
workqueues are dispatched to workers and start execution as soon as
possible regardless of the state of other works.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 70 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0a7f79729380..006dcf7e808a 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -231,6 +231,7 @@ enum {
 	WQ_SINGLE_CPU		= 1 << 1, /* only single cpu at a time */
 	WQ_NON_REENTRANT	= 1 << 2, /* guarantee non-reentrance */
 	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
+	WQ_HIGHPRI		= 1 << 4, /* high priority */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c1aa65c2ff38..5775717288d5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -43,6 +43,7 @@ enum {
 	GCWQ_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
 	GCWQ_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
 	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */
+	GCWQ_HIGHPRI_PENDING	= 1 << 4,	/* highpri works on queue */
 
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
@@ -452,15 +453,19 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
  * assume that they're being called with gcwq->lock held.
  */
 
+static bool __need_more_worker(struct global_cwq *gcwq)
+{
+	return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
+		gcwq->flags & GCWQ_HIGHPRI_PENDING;
+}
+
 /*
  * Need to wake up a worker?  Called from anything but currently
  * running workers.
  */
 static bool need_more_worker(struct global_cwq *gcwq)
 {
-	atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
-
-	return !list_empty(&gcwq->worklist) && !atomic_read(nr_running);
+	return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
 }
 
 /* Can I start working?  Called from busy but !running workers. */
@@ -733,6 +738,43 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 					    work);
 }
 
+/**
+ * gcwq_determine_ins_pos - find insertion position
+ * @gcwq: gcwq of interest
+ * @cwq: cwq a work is being queued for
+ *
+ * A work for @cwq is about to be queued on @gcwq, determine insertion
+ * position for the work.  If @cwq is for HIGHPRI wq, the work is
+ * queued at the head of the queue but in FIFO order with respect to
+ * other HIGHPRI works; otherwise, at the end of the queue.  This
+ * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
+ * there are HIGHPRI works pending.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to inserstion position.
+ */
+static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
+					       struct cpu_workqueue_struct *cwq)
+{
+	struct work_struct *twork;
+
+	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
+		return &gcwq->worklist;
+
+	list_for_each_entry(twork, &gcwq->worklist, entry) {
+		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
+
+		if (!(tcwq->wq->flags & WQ_HIGHPRI))
+			break;
+	}
+
+	gcwq->flags |= GCWQ_HIGHPRI_PENDING;
+	return &twork->entry;
+}
+
 /**
  * insert_work - insert a work into gcwq
  * @cwq: cwq @work belongs to
@@ -770,7 +812,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 	 */
 	smp_mb();
 
-	if (!atomic_read(get_gcwq_nr_running(gcwq->cpu)))
+	if (__need_more_worker(gcwq))
 		wake_up_worker(gcwq);
 }
 
@@ -887,7 +929,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 
 	if (likely(cwq->nr_active < cwq->max_active)) {
 		cwq->nr_active++;
-		worklist = &gcwq->worklist;
+		worklist = gcwq_determine_ins_pos(gcwq, cwq);
 	} else
 		worklist = &cwq->delayed_works;
 
@@ -1526,8 +1568,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
 {
 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
 						    struct work_struct, entry);
+	struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
 
-	move_linked_works(work, &cwq->gcwq->worklist, NULL);
+	move_linked_works(work, pos, NULL);
 	cwq->nr_active++;
 }
 
@@ -1634,6 +1677,21 @@ static void process_one_work(struct worker *worker, struct work_struct *work)
 	set_work_cpu(work, gcwq->cpu);
 	list_del_init(&work->entry);
 
+	/*
+	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
+	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
+	 */
+	if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
+		struct work_struct *nwork = list_first_entry(&gcwq->worklist,
+						struct work_struct, entry);
+
+		if (!list_empty(&gcwq->worklist) &&
+		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
+			wake_up_worker(gcwq);
+		else
+			gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
+	}
+
 	spin_unlock_irq(&gcwq->lock);
 
 	work_clear_pending(work);
-- 
cgit v1.2.3-59-g8ed1b


From fb0e7beb5c1b6fb4da786ba709d7138373d5fb22 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Jun 2010 10:07:15 +0200
Subject: workqueue: implement cpu intensive workqueue

This patch implements cpu intensive workqueue which can be specified
with WQ_CPU_INTENSIVE flag on creation.  Works queued to a cpu
intensive workqueue don't participate in concurrency management.  IOW,
it doesn't contribute to gcwq->nr_running and thus doesn't delay
excution of other works.

Note that although cpu intensive works won't delay other works, they
can be delayed by other works.  Combine with WQ_HIGHPRI to avoid being
delayed by other works too.

As the name suggests this is useful when using workqueue for cpu
intensive works.  Workers executing cpu intensive works are not
considered for workqueue concurrency management and left for the
scheduler to manage.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 006dcf7e808a..3f36d37ac5ba 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -232,6 +232,7 @@ enum {
 	WQ_NON_REENTRANT	= 1 << 2, /* guarantee non-reentrance */
 	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
 	WQ_HIGHPRI		= 1 << 4, /* high priority */
+	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu instensive workqueue */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5775717288d5..6fa847c5c5e9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -52,8 +52,10 @@ enum {
 	WORKER_PREP		= 1 << 3,	/* preparing to run works */
 	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */
 	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
+	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
 
-	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_ROGUE | WORKER_REBIND,
+	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
+				  WORKER_CPU_INTENSIVE,
 
 	/* gcwq->trustee_state */
 	TRUSTEE_START		= 0,		/* start */
@@ -1641,6 +1643,7 @@ static void process_one_work(struct worker *worker, struct work_struct *work)
 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
 	struct global_cwq *gcwq = cwq->gcwq;
 	struct hlist_head *bwh = busy_worker_head(gcwq, work);
+	bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
 	work_func_t f = work->func;
 	int work_color;
 	struct worker *collision;
@@ -1692,6 +1695,13 @@ static void process_one_work(struct worker *worker, struct work_struct *work)
 			gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
 	}
 
+	/*
+	 * CPU intensive works don't participate in concurrency
+	 * management.  They're the scheduler's responsibility.
+	 */
+	if (unlikely(cpu_intensive))
+		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+
 	spin_unlock_irq(&gcwq->lock);
 
 	work_clear_pending(work);
@@ -1713,6 +1723,10 @@ static void process_one_work(struct worker *worker, struct work_struct *work)
 
 	spin_lock_irq(&gcwq->lock);
 
+	/* clear cpu intensive status */
+	if (unlikely(cpu_intensive))
+		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+
 	/* we're done with it, release */
 	hlist_del_init(&worker->hentry);
 	worker->current_work = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 2ec57d448b2e8fcfba539a46701b43f14f037f17 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Tue, 29 Jun 2010 12:02:01 +1000
Subject: sched: Fix spelling of sibling

No logic changes, only spelling.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Cc: linuxppc-dev@ozlabs.org
Cc: David Howells <dhowells@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <15249.1277776921@neuling.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/process.c | 2 +-
 include/linux/topology.h      | 2 +-
 kernel/sched_fair.c           | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9b41ece010b6..22f08cb7e7d1 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1270,7 +1270,7 @@ unsigned long randomize_et_dyn(unsigned long base)
 }
 
 #ifdef CONFIG_SMP
-int arch_sd_sibiling_asym_packing(void)
+int arch_sd_sibling_asym_packing(void)
 {
 	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
 		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
diff --git a/include/linux/topology.h b/include/linux/topology.h
index cf57f30d0dcb..b572e432d2f3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -103,7 +103,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
 				| 0*SD_PREFER_SIBLING			\
-				| arch_sd_sibiling_asym_packing()	\
+				| arch_sd_sibling_asym_packing()	\
 				,					\
 	.last_balance		= jiffies,				\
 	.balance_interval	= 1,					\
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5e8f98c103fb..b4da534f4b8c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2567,7 +2567,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	} while (sg != sd->groups);
 }
 
-int __weak arch_sd_sibiling_asym_packing(void)
+int __weak arch_sd_sibling_asym_packing(void)
 {
        return 0*SD_ASYM_PACKING;
 }
-- 
cgit v1.2.3-59-g8ed1b


From afd2a5ca1ef6ffe1f9fd0846ae39795527ead555 Mon Sep 17 00:00:00 2001
From: Gertjan van Wingerde <gwingerde@gmail.com>
Date: Tue, 29 Jun 2010 21:43:44 +0200
Subject: eeprom_93cx6: Add support for 93c86 EEPROMs.

Signed-off-by: Gertjan van Wingerde <gwingerde@gmail.com>
Signed-off-by: Ivo van Doorn <IvDoorn@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/eeprom_93cx6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/eeprom_93cx6.h b/include/linux/eeprom_93cx6.h
index a55c873e8b66..c4627cbdb8e0 100644
--- a/include/linux/eeprom_93cx6.h
+++ b/include/linux/eeprom_93cx6.h
@@ -30,6 +30,7 @@
 #define PCI_EEPROM_WIDTH_93C46	6
 #define PCI_EEPROM_WIDTH_93C56	8
 #define PCI_EEPROM_WIDTH_93C66	8
+#define PCI_EEPROM_WIDTH_93C86	8
 #define PCI_EEPROM_WIDTH_OPCODE	3
 #define PCI_EEPROM_WRITE_OPCODE	0x05
 #define PCI_EEPROM_READ_OPCODE	0x06
-- 
cgit v1.2.3-59-g8ed1b


From 1437ce3983bcbc0447a0dedcd644c14fe833d266 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 30 Jun 2010 02:44:32 +0000
Subject: ethtool: Change ethtool_op_set_flags to validate flags

ethtool_op_set_flags() does not check for unsupported flags, and has
no way of doing so.  This means it is not suitable for use as a
default implementation of ethtool_ops::set_flags.

Add a 'supported' parameter specifying the flags that the driver and
hardware support, validate the requested flags against this, and
change all current callers to pass this parameter.

Change some other trivial implementations of ethtool_ops::set_flags to
call ethtool_op_set_flags().

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Reviewed-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/cxgb4/cxgb4_main.c    |  9 +--------
 drivers/net/enic/enic_main.c      |  1 -
 drivers/net/ixgbe/ixgbe_ethtool.c |  5 ++++-
 drivers/net/mv643xx_eth.c         |  7 ++++++-
 drivers/net/myri10ge/myri10ge.c   | 10 +++++++---
 drivers/net/niu.c                 |  9 +--------
 drivers/net/sfc/ethtool.c         |  5 +----
 drivers/net/sky2.c                | 16 ++++++----------
 include/linux/ethtool.h           |  2 +-
 net/core/ethtool.c                | 28 +++++-----------------------
 10 files changed, 32 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/cxgb4/cxgb4_main.c b/drivers/net/cxgb4/cxgb4_main.c
index 65281674de91..55a720e4abdc 100644
--- a/drivers/net/cxgb4/cxgb4_main.c
+++ b/drivers/net/cxgb4/cxgb4_main.c
@@ -1799,14 +1799,7 @@ static int set_tso(struct net_device *dev, u32 value)
 
 static int set_flags(struct net_device *dev, u32 flags)
 {
-	if (flags & ~ETH_FLAG_RXHASH)
-		return -EOPNOTSUPP;
-
-	if (flags & ETH_FLAG_RXHASH)
-		dev->features |= NETIF_F_RXHASH;
-	else
-		dev->features &= ~NETIF_F_RXHASH;
-	return 0;
+	return ethtool_op_set_flags(dev, flags, ETH_FLAG_RXHASH);
 }
 
 static struct ethtool_ops cxgb_ethtool_ops = {
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index 6c6795b90fa6..77a7f87d498e 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -365,7 +365,6 @@ static const struct ethtool_ops enic_ethtool_ops = {
 	.get_coalesce = enic_get_coalesce,
 	.set_coalesce = enic_set_coalesce,
 	.get_flags = ethtool_op_get_flags,
-	.set_flags = ethtool_op_set_flags,
 };
 
 static void enic_free_wq_buf(struct vnic_wq *wq, struct vnic_wq_buf *buf)
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index 873b45efca40..7d2e5ea2deba 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -2205,8 +2205,11 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	bool need_reset = false;
+	int rc;
 
-	ethtool_op_set_flags(netdev, data);
+	rc = ethtool_op_set_flags(netdev, data, ETH_FLAG_LRO | ETH_FLAG_NTUPLE);
+	if (rc)
+		return rc;
 
 	/* if state changes we need to update adapter->flags and reset */
 	if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE) {
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index e345ec8cb473..82b720f29c75 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -1636,6 +1636,11 @@ static void mv643xx_eth_get_ethtool_stats(struct net_device *dev,
 	}
 }
 
+static int mv643xx_eth_set_flags(struct net_device *dev, u32 data)
+{
+	return ethtool_op_set_flags(dev, data, ETH_FLAG_LRO);
+}
+
 static int mv643xx_eth_get_sset_count(struct net_device *dev, int sset)
 {
 	if (sset == ETH_SS_STATS)
@@ -1661,7 +1666,7 @@ static const struct ethtool_ops mv643xx_eth_ethtool_ops = {
 	.get_strings		= mv643xx_eth_get_strings,
 	.get_ethtool_stats	= mv643xx_eth_get_ethtool_stats,
 	.get_flags		= ethtool_op_get_flags,
-	.set_flags		= ethtool_op_set_flags,
+	.set_flags		= mv643xx_eth_set_flags,
 	.get_sset_count		= mv643xx_eth_get_sset_count,
 };
 
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index e0b47cc8a86e..d771d1650d60 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -1730,8 +1730,7 @@ static int myri10ge_set_rx_csum(struct net_device *netdev, u32 csum_enabled)
 	if (csum_enabled)
 		mgp->csum_flag = MXGEFW_FLAGS_CKSUM;
 	else {
-		u32 flags = ethtool_op_get_flags(netdev);
-		err = ethtool_op_set_flags(netdev, (flags & ~ETH_FLAG_LRO));
+		netdev->features &= ~NETIF_F_LRO;
 		mgp->csum_flag = 0;
 
 	}
@@ -1900,6 +1899,11 @@ static u32 myri10ge_get_msglevel(struct net_device *netdev)
 	return mgp->msg_enable;
 }
 
+static int myri10ge_set_flags(struct net_device *netdev, u32 value)
+{
+	return ethtool_op_set_flags(netdev, value, ETH_FLAG_LRO);
+}
+
 static const struct ethtool_ops myri10ge_ethtool_ops = {
 	.get_settings = myri10ge_get_settings,
 	.get_drvinfo = myri10ge_get_drvinfo,
@@ -1920,7 +1924,7 @@ static const struct ethtool_ops myri10ge_ethtool_ops = {
 	.set_msglevel = myri10ge_set_msglevel,
 	.get_msglevel = myri10ge_get_msglevel,
 	.get_flags = ethtool_op_get_flags,
-	.set_flags = ethtool_op_set_flags
+	.set_flags = myri10ge_set_flags
 };
 
 static int myri10ge_allocate_rings(struct myri10ge_slice_state *ss)
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index 63e8e3893bd6..3d523cb7975a 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -7920,14 +7920,7 @@ static int niu_phys_id(struct net_device *dev, u32 data)
 
 static int niu_set_flags(struct net_device *dev, u32 data)
 {
-	if (data & (ETH_FLAG_LRO | ETH_FLAG_NTUPLE))
-		return -EOPNOTSUPP;
-
-	if (data & ETH_FLAG_RXHASH)
-		dev->features |= NETIF_F_RXHASH;
-	else
-		dev->features &= ~NETIF_F_RXHASH;
-	return 0;
+	return ethtool_op_set_flags(dev, data, ETH_FLAG_RXHASH);
 }
 
 static const struct ethtool_ops niu_ethtool_ops = {
diff --git a/drivers/net/sfc/ethtool.c b/drivers/net/sfc/ethtool.c
index 7693cfbf9cf4..23372bf5cd59 100644
--- a/drivers/net/sfc/ethtool.c
+++ b/drivers/net/sfc/ethtool.c
@@ -551,10 +551,7 @@ static int efx_ethtool_set_flags(struct net_device *net_dev, u32 data)
 	struct efx_nic *efx = netdev_priv(net_dev);
 	u32 supported = efx->type->offload_features & ETH_FLAG_RXHASH;
 
-	if (data & ~supported)
-		return -EOPNOTSUPP;
-
-	return ethtool_op_set_flags(net_dev, data);
+	return ethtool_op_set_flags(net_dev, data, supported);
 }
 
 static void efx_ethtool_self_test(struct net_device *net_dev,
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 7985165e84fc..c762c6ac055b 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -4188,17 +4188,13 @@ static int sky2_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom
 static int sky2_set_flags(struct net_device *dev, u32 data)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
+	u32 supported =
+		(sky2->hw->flags & SKY2_HW_RSS_BROKEN) ? 0 : ETH_FLAG_RXHASH;
+	int rc;
 
-	if (data & ~ETH_FLAG_RXHASH)
-		return -EOPNOTSUPP;
-
-	if (data & ETH_FLAG_RXHASH) {
-		if (sky2->hw->flags & SKY2_HW_RSS_BROKEN)
-			return -EINVAL;
-
-		dev->features |= NETIF_F_RXHASH;
-	} else
-		dev->features &= ~NETIF_F_RXHASH;
+	rc = ethtool_op_set_flags(dev, data, supported);
+	if (rc)
+		return rc;
 
 	rx_set_rss(dev);
 
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 2c8af093d8b3..084ddb3c8032 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -457,7 +457,7 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data);
 u32 ethtool_op_get_ufo(struct net_device *dev);
 int ethtool_op_set_ufo(struct net_device *dev, u32 data);
 u32 ethtool_op_get_flags(struct net_device *dev);
-int ethtool_op_set_flags(struct net_device *dev, u32 data);
+int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported);
 void ethtool_ntuple_flush(struct net_device *dev);
 
 /**
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index a0f4964033d2..5d42fae520d9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -144,31 +144,13 @@ u32 ethtool_op_get_flags(struct net_device *dev)
 }
 EXPORT_SYMBOL(ethtool_op_get_flags);
 
-int ethtool_op_set_flags(struct net_device *dev, u32 data)
+int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
 {
-	const struct ethtool_ops *ops = dev->ethtool_ops;
-	unsigned long features = dev->features;
-
-	if (data & ETH_FLAG_LRO)
-		features |= NETIF_F_LRO;
-	else
-		features &= ~NETIF_F_LRO;
-
-	if (data & ETH_FLAG_NTUPLE) {
-		if (!ops->set_rx_ntuple)
-			return -EOPNOTSUPP;
-		features |= NETIF_F_NTUPLE;
-	} else {
-		/* safe to clear regardless */
-		features &= ~NETIF_F_NTUPLE;
-	}
-
-	if (data & ETH_FLAG_RXHASH)
-		features |= NETIF_F_RXHASH;
-	else
-		features &= ~NETIF_F_RXHASH;
+	if (data & ~supported)
+		return -EINVAL;
 
-	dev->features = features;
+	dev->features = ((dev->features & ~flags_dup_features) |
+			 (data & flags_dup_features));
 	return 0;
 }
 EXPORT_SYMBOL(ethtool_op_set_flags);
-- 
cgit v1.2.3-59-g8ed1b


From a5b6ee291e39e285e021cf251dbcf770c83cd74e Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 30 Jun 2010 05:05:23 +0000
Subject: ethtool: Add support for control of RX flow hash indirection

Many NICs use an indirection table to map an RX flow hash value to one
of an arbitrary number of queues (not necessarily a power of 2).  It
can be useful to remove some queues from this indirection table so
that they are only used for flows that are specifically filtered
there.  It may also be useful to weight the mapping to account for
user processes with the same CPU-affinity as the RX interrupts.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 15 ++++++++++
 net/core/ethtool.c      | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 084ddb3c8032..c1be61f3938b 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -384,6 +384,15 @@ struct ethtool_rxnfc {
 	__u32				rule_locs[0];
 };
 
+struct ethtool_rxfh_indir {
+	__u32	cmd;
+	/* On entry, this is the array size of the user buffer.  On
+	 * return from ETHTOOL_GRXFHINDIR, this is the array size of
+	 * the hardware indirection table. */
+	__u32	size;
+	__u32	ring_index[0];	/* ring/queue index for each hash value */
+};
+
 struct ethtool_rx_ntuple_flow_spec {
 	__u32		 flow_type;
 	union {
@@ -576,6 +585,10 @@ struct ethtool_ops {
 	int	(*set_rx_ntuple)(struct net_device *,
 				 struct ethtool_rx_ntuple *);
 	int	(*get_rx_ntuple)(struct net_device *, u32 stringset, void *);
+	int	(*get_rxfh_indir)(struct net_device *,
+				  struct ethtool_rxfh_indir *);
+	int	(*set_rxfh_indir)(struct net_device *,
+				  const struct ethtool_rxfh_indir *);
 };
 #endif /* __KERNEL__ */
 
@@ -637,6 +650,8 @@ struct ethtool_ops {
 #define ETHTOOL_SRXNTUPLE	0x00000035 /* Add an n-tuple filter to device */
 #define ETHTOOL_GRXNTUPLE	0x00000036 /* Get n-tuple filters from device */
 #define ETHTOOL_GSSET_INFO	0x00000037 /* Get string set info */
+#define ETHTOOL_GRXFHINDIR	0x00000038 /* Get RX flow hash indir'n table */
+#define ETHTOOL_SRXFHINDIR	0x00000039 /* Set RX flow hash indir'n table */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 5d42fae520d9..072d1d3796cb 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -358,6 +358,80 @@ err_out:
 	return ret;
 }
 
+static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
+						     void __user *useraddr)
+{
+	struct ethtool_rxfh_indir *indir;
+	u32 table_size;
+	size_t full_size;
+	int ret;
+
+	if (!dev->ethtool_ops->get_rxfh_indir)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&table_size,
+			   useraddr + offsetof(struct ethtool_rxfh_indir, size),
+			   sizeof(table_size)))
+		return -EFAULT;
+
+	if (table_size >
+	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
+		return -ENOMEM;
+	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
+	indir = kmalloc(full_size, GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	indir->cmd = ETHTOOL_GRXFHINDIR;
+	indir->size = table_size;
+	ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
+	if (ret)
+		goto out;
+
+	if (copy_to_user(useraddr, indir, full_size))
+		ret = -EFAULT;
+
+out:
+	kfree(indir);
+	return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
+						     void __user *useraddr)
+{
+	struct ethtool_rxfh_indir *indir;
+	u32 table_size;
+	size_t full_size;
+	int ret;
+
+	if (!dev->ethtool_ops->set_rxfh_indir)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&table_size,
+			   useraddr + offsetof(struct ethtool_rxfh_indir, size),
+			   sizeof(table_size)))
+		return -EFAULT;
+
+	if (table_size >
+	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
+		return -ENOMEM;
+	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
+	indir = kmalloc(full_size, GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	if (copy_from_user(indir, useraddr, full_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = dev->ethtool_ops->set_rxfh_indir(dev, indir);
+
+out:
+	kfree(indir);
+	return ret;
+}
+
 static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
 			struct ethtool_rx_ntuple_flow_spec *spec,
 			struct ethtool_rx_ntuple_flow_spec_container *fsc)
@@ -1526,6 +1600,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GSSET_INFO:
 		rc = ethtool_get_sset_info(dev, useraddr);
 		break;
+	case ETHTOOL_GRXFHINDIR:
+		rc = ethtool_get_rxfh_indir(dev, useraddr);
+		break;
+	case ETHTOOL_SRXFHINDIR:
+		rc = ethtool_set_rxfh_indir(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From a53f4b61a76a7e95139b8e8abba02e9bfe87a58a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 1 Jul 2010 12:45:34 -0700
Subject: Revert "net: Make accesses to ->br_port safe for sparse RCU"

This reverts commit 81bdf5bd7349bd4523538cbd7878f334bc2bfe14, which is
obsoleted by commit f350a0a87374 from the net tree.
---
 include/linux/if_bridge.h           | 3 ---
 net/bridge/br_fdb.c                 | 2 +-
 net/bridge/br_private.h             | 5 -----
 net/bridge/netfilter/ebt_redirect.c | 2 +-
 net/bridge/netfilter/ebt_ulog.c     | 4 ++--
 net/bridge/netfilter/ebtables.c     | 4 ++--
 net/netfilter/nfnetlink_log.c       | 4 ++--
 net/netfilter/nfnetlink_queue.c     | 4 ++--
 8 files changed, 10 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index d001d782922d..938b7e81df95 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -101,9 +101,6 @@ struct __fdb_entry {
 
 #include <linux/netdevice.h>
 
-/* br_handle_frame_hook() needs the following forward declaration. */
-struct net_bridge_port;
-
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
 extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
 					       struct sk_buff *skb);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 09c479e05623..b01dde35a69e 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -244,7 +244,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 		return 0;
 
 	rcu_read_lock();
-	fdb = __br_fdb_get(br_port(dev)->br, addr);
+	fdb = __br_fdb_get(dev->br_port->br, addr);
 	ret = fdb && fdb->dst->dev != dev &&
 		fdb->dst->state == BR_STATE_FORWARDING;
 	rcu_read_unlock();
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 3255188355b5..0f4a74bc6a9b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -268,11 +268,6 @@ static inline int br_is_root_bridge(const struct net_bridge *br)
 	return !memcmp(&br->bridge_id, &br->designated_root, 8);
 }
 
-static inline struct net_bridge_port *br_port(const struct net_device *dev)
-{
-	return rcu_dereference(dev->br_port);
-}
-
 /* br_device.c */
 extern void br_dev_setup(struct net_device *dev);
 extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index a39df0ae0f81..9e19166ba453 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -25,7 +25,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	if (par->hooknum != NF_BR_BROUTING)
 		memcpy(eth_hdr(skb)->h_dest,
-		       br_port(par->in)->br->dev->dev_addr, ETH_ALEN);
+		       par->in->br_port->br->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN);
 	skb->pkt_type = PACKET_HOST;
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 5a4996bbb090..ae3c7cef1484 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -178,7 +178,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 		strcpy(pm->physindev, in->name);
 		/* If in isn't a bridge, then physindev==indev */
 		if (in->br_port)
-			strcpy(pm->indev, br_port(in)->br->dev->name);
+			strcpy(pm->indev, in->br_port->br->dev->name);
 		else
 			strcpy(pm->indev, in->name);
 	} else
@@ -187,7 +187,7 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 	if (out) {
 		/* If out exists, then out is a bridge port */
 		strcpy(pm->physoutdev, out->name);
-		strcpy(pm->outdev, br_port(out)->br->dev->name);
+		strcpy(pm->outdev, out->br_port->br->dev->name);
 	} else
 		pm->outdev[0] = pm->physoutdev[0] = '\0';
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 4c2aab8cbfc7..59ca00e40dec 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -141,10 +141,10 @@ ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h,
 	if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
 		return 1;
 	if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check(
-	   e->logical_in, br_port(in)->br->dev), EBT_ILOGICALIN))
+	   e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN))
 		return 1;
 	if ((!out || !out->br_port) ? 0 : FWINV2(ebt_dev_check(
-	   e->logical_out, br_port(out)->br->dev), EBT_ILOGICALOUT))
+	   e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT))
 		return 1;
 
 	if (e->bitmask & EBT_SOURCEMAC) {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 78957cfa3bdd..fc9a211e629e 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -404,7 +404,7 @@ __build_packet_message(struct nfulnl_instance *inst,
 				     htonl(indev->ifindex));
 			/* this is the bridge group "brX" */
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
-				     htonl(br_port(indev)->br->dev->ifindex));
+				     htonl(indev->br_port->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is bridge group, we need to look for
 			 * physical device (when called from ipv4) */
@@ -431,7 +431,7 @@ __build_packet_message(struct nfulnl_instance *inst,
 				     htonl(outdev->ifindex));
 			/* this is the bridge group "brX" */
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
-				     htonl(br_port(outdev)->br->dev->ifindex));
+				     htonl(outdev->br_port->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is a bridge group, we need to look
 			 * for physical device (when called from ipv4) */
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index c3c17498298e..12e1ab37fcd8 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -297,7 +297,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 				     htonl(indev->ifindex));
 			/* this is the bridge group "brX" */
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
-				     htonl(br_port(indev)->br->dev->ifindex));
+				     htonl(indev->br_port->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is bridge group, we need to look for
 			 * physical device (when called from ipv4) */
@@ -322,7 +322,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 				     htonl(outdev->ifindex));
 			/* this is the bridge group "brX" */
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
-				     htonl(br_port(outdev)->br->dev->ifindex));
+				     htonl(outdev->br_port->br->dev->ifindex));
 		} else {
 			/* Case 2: outdev is bridge group, we need to look for
 			 * physical output device (when called from ipv4) */
-- 
cgit v1.2.3-59-g8ed1b


From ad72cf9885c536e3adae03f8337557ac9dd1e4bb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 2 Jul 2010 10:03:52 +0200
Subject: libata: take advantage of cmwq and remove concurrency limitations

libata has two concurrency related limitations.

a. ata_wq which is used for polling PIO has single thread per CPU.  If
   there are multiple devices doing polling PIO on the same CPU, they
   can't be executed simultaneously.

b. ata_aux_wq which is used for SCSI probing has single thread.  In
   cases where SCSI probing is stalled for extended period of time
   which is possible for ATAPI devices, this will stall all probing.

#a is solved by increasing maximum concurrency of ata_wq.  Please note
that polling PIO might be used under allocation path and thus needs to
be served by a separate wq with a rescuer.

#b is solved by using the default wq instead and achieving exclusion
via per-port mutex.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/ata/libata-core.c | 20 +++++---------------
 drivers/ata/libata-eh.c   |  4 ++--
 drivers/ata/libata-scsi.c | 10 ++++++----
 drivers/ata/libata-sff.c  |  9 +--------
 drivers/ata/libata.h      |  1 -
 include/linux/libata.h    |  1 +
 6 files changed, 15 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index ddf8e4862787..4f78741692dc 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -98,8 +98,6 @@ static unsigned long ata_dev_blacklisted(const struct ata_device *dev);
 
 unsigned int ata_print_id = 1;
 
-struct workqueue_struct *ata_aux_wq;
-
 struct ata_force_param {
 	const char	*name;
 	unsigned int	cbl;
@@ -5611,6 +5609,7 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
 	ap->msg_enable = ATA_MSG_DRV | ATA_MSG_ERR | ATA_MSG_WARN;
 #endif
 
+	mutex_init(&ap->scsi_scan_mutex);
 	INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug);
 	INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan);
 	INIT_LIST_HEAD(&ap->eh_done_q);
@@ -6549,29 +6548,20 @@ static int __init ata_init(void)
 
 	ata_parse_force_param();
 
-	ata_aux_wq = create_singlethread_workqueue("ata_aux");
-	if (!ata_aux_wq)
-		goto fail;
-
 	rc = ata_sff_init();
-	if (rc)
-		goto fail;
+	if (rc) {
+		kfree(ata_force_tbl);
+		return rc;
+	}
 
 	printk(KERN_DEBUG "libata version " DRV_VERSION " loaded.\n");
 	return 0;
-
-fail:
-	kfree(ata_force_tbl);
-	if (ata_aux_wq)
-		destroy_workqueue(ata_aux_wq);
-	return rc;
 }
 
 static void __exit ata_exit(void)
 {
 	ata_sff_exit();
 	kfree(ata_force_tbl);
-	destroy_workqueue(ata_aux_wq);
 }
 
 subsys_initcall(ata_init);
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index f77a67303f8b..4d2af824dd23 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -727,7 +727,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 	if (ap->pflags & ATA_PFLAG_LOADING)
 		ap->pflags &= ~ATA_PFLAG_LOADING;
 	else if (ap->pflags & ATA_PFLAG_SCSI_HOTPLUG)
-		queue_delayed_work(ata_aux_wq, &ap->hotplug_task, 0);
+		schedule_delayed_work(&ap->hotplug_task, 0);
 
 	if (ap->pflags & ATA_PFLAG_RECOVERED)
 		ata_port_printk(ap, KERN_INFO, "EH complete\n");
@@ -2944,7 +2944,7 @@ static int ata_eh_revalidate_and_attach(struct ata_link *link,
 			ehc->i.flags |= ATA_EHI_SETMODE;
 
 			/* schedule the scsi_rescan_device() here */
-			queue_work(ata_aux_wq, &(ap->scsi_rescan_task));
+			schedule_work(&(ap->scsi_rescan_task));
 		} else if (dev->class == ATA_DEV_UNKNOWN &&
 			   ehc->tries[dev->devno] &&
 			   ata_class_enabled(ehc->classes[dev->devno])) {
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a54273d2c3c6..d75c9c479d1a 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3435,7 +3435,7 @@ void ata_scsi_scan_host(struct ata_port *ap, int sync)
 				"                  switching to async\n");
 	}
 
-	queue_delayed_work(ata_aux_wq, &ap->hotplug_task,
+	queue_delayed_work(system_long_wq, &ap->hotplug_task,
 			   round_jiffies_relative(HZ));
 }
 
@@ -3582,6 +3582,7 @@ void ata_scsi_hotplug(struct work_struct *work)
 	}
 
 	DPRINTK("ENTER\n");
+	mutex_lock(&ap->scsi_scan_mutex);
 
 	/* Unplug detached devices.  We cannot use link iterator here
 	 * because PMP links have to be scanned even if PMP is
@@ -3595,6 +3596,7 @@ void ata_scsi_hotplug(struct work_struct *work)
 	/* scan for new ones */
 	ata_scsi_scan_host(ap, 0);
 
+	mutex_unlock(&ap->scsi_scan_mutex);
 	DPRINTK("EXIT\n");
 }
 
@@ -3673,9 +3675,7 @@ static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
  *	@work: Pointer to ATA port to perform scsi_rescan_device()
  *
  *	After ATA pass thru (SAT) commands are executed successfully,
- *	libata need to propagate the changes to SCSI layer.  This
- *	function must be executed from ata_aux_wq such that sdev
- *	attach/detach don't race with rescan.
+ *	libata need to propagate the changes to SCSI layer.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep).
@@ -3688,6 +3688,7 @@ void ata_scsi_dev_rescan(struct work_struct *work)
 	struct ata_device *dev;
 	unsigned long flags;
 
+	mutex_lock(&ap->scsi_scan_mutex);
 	spin_lock_irqsave(ap->lock, flags);
 
 	ata_for_each_link(link, ap, EDGE) {
@@ -3707,6 +3708,7 @@ void ata_scsi_dev_rescan(struct work_struct *work)
 	}
 
 	spin_unlock_irqrestore(ap->lock, flags);
+	mutex_unlock(&ap->scsi_scan_mutex);
 }
 
 /**
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index efa4a18cfb9d..674c1436491f 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -3318,14 +3318,7 @@ void ata_sff_port_init(struct ata_port *ap)
 
 int __init ata_sff_init(void)
 {
-	/*
-	 * FIXME: In UP case, there is only one workqueue thread and if you
-	 * have more than one PIO device, latency is bloody awful, with
-	 * occasional multi-second "hiccups" as one PIO device waits for
-	 * another.  It's an ugly wart that users DO occasionally complain
-	 * about; luckily most users have at most one PIO polled device.
-	 */
-	ata_sff_wq = create_workqueue("ata_sff");
+	ata_sff_wq = alloc_workqueue("ata_sff", WQ_RESCUER, WQ_MAX_ACTIVE);
 	if (!ata_sff_wq)
 		return -ENOMEM;
 
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 4b84ed60324a..9ce1ecc63e39 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -54,7 +54,6 @@ enum {
 };
 
 extern unsigned int ata_print_id;
-extern struct workqueue_struct *ata_aux_wq;
 extern int atapi_passthru16;
 extern int libata_fua;
 extern int libata_noacpi;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index b85f3ff34d7d..f010f18a0f86 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -751,6 +751,7 @@ struct ata_port {
 	struct ata_host		*host;
 	struct device 		*dev;
 
+	struct mutex		scsi_scan_mutex;
 	struct delayed_work	hotplug_task;
 	struct work_struct	scsi_rescan_task;
 
-- 
cgit v1.2.3-59-g8ed1b


From bdbc5dd7de5d07d6c9d3536e598956165a031d4c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 2 Jul 2010 10:03:51 +0200
Subject: workqueue: prepare for WQ_UNBOUND implementation

In preparation of WQ_UNBOUND addition, make the following changes.

* Add WORK_CPU_* constants for pseudo cpu id numbers used (currently
  only WORK_CPU_NONE) and use them instead of NR_CPUS.  This is to
  allow another pseudo cpu id for unbound cpu.

* Reorder WQ_* flags.

* Make workqueue_struct->cpu_wq a union which contains a percpu
  pointer, regular pointer and an unsigned long value and use
  kzalloc/kfree() in UP allocation path.  This will be used to
  implement unbound workqueues which will use only one cwq on SMPs.

* Move alloc_cwqs() allocation after initialization of wq fields, so
  that alloc_cwqs() has access to wq->flags.

* Trivial relocation of wq local variables in freeze functions.

These changes don't cause any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 10 ++++--
 kernel/workqueue.c        | 83 ++++++++++++++++++++++++-----------------------
 2 files changed, 50 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 3f36d37ac5ba..139069a6286c 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -50,6 +50,10 @@ enum {
 	WORK_NR_COLORS		= (1 << WORK_STRUCT_COLOR_BITS) - 1,
 	WORK_NO_COLOR		= WORK_NR_COLORS,
 
+	/* special cpu IDs */
+	WORK_CPU_NONE		= NR_CPUS,
+	WORK_CPU_LAST		= WORK_CPU_NONE,
+
 	/*
 	 * Reserve 6 bits off of cwq pointer w/ debugobjects turned
 	 * off.  This makes cwqs aligned to 64 bytes which isn't too
@@ -60,7 +64,7 @@ enum {
 
 	WORK_STRUCT_FLAG_MASK	= (1UL << WORK_STRUCT_FLAG_BITS) - 1,
 	WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK,
-	WORK_STRUCT_NO_CPU	= NR_CPUS << WORK_STRUCT_FLAG_BITS,
+	WORK_STRUCT_NO_CPU	= WORK_CPU_NONE << WORK_STRUCT_FLAG_BITS,
 
 	/* bit mask for work_busy() return values */
 	WORK_BUSY_PENDING	= 1 << 0,
@@ -227,9 +231,9 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 	clear_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))
 
 enum {
-	WQ_FREEZEABLE		= 1 << 0, /* freeze during suspend */
+	WQ_NON_REENTRANT	= 1 << 0, /* guarantee non-reentrance */
 	WQ_SINGLE_CPU		= 1 << 1, /* only single cpu at a time */
-	WQ_NON_REENTRANT	= 1 << 2, /* guarantee non-reentrance */
+	WQ_FREEZEABLE		= 1 << 2, /* freeze during suspend */
 	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
 	WQ_HIGHPRI		= 1 << 4, /* high priority */
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu instensive workqueue */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2eb9fbddf5c6..a105ddf55f79 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -190,7 +190,11 @@ struct wq_flusher {
  */
 struct workqueue_struct {
 	unsigned int		flags;		/* I: WQ_* flags */
-	struct cpu_workqueue_struct *cpu_wq;	/* I: cwq's */
+	union {
+		struct cpu_workqueue_struct __percpu	*pcpu;
+		struct cpu_workqueue_struct		*single;
+		unsigned long				v;
+	} cpu_wq;				/* I: cwq's */
 	struct list_head	list;		/* W: list of all workqueues */
 
 	struct mutex		flush_mutex;	/* protects wq flushing */
@@ -362,7 +366,11 @@ static atomic_t *get_gcwq_nr_running(unsigned int cpu)
 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 					    struct workqueue_struct *wq)
 {
-	return per_cpu_ptr(wq->cpu_wq, cpu);
+#ifndef CONFIG_SMP
+	return wq->cpu_wq.single;
+#else
+	return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+#endif
 }
 
 static unsigned int work_color_to_flags(int color)
@@ -442,7 +450,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 		return ((struct cpu_workqueue_struct *)data)->gcwq;
 
 	cpu = data >> WORK_STRUCT_FLAG_BITS;
-	if (cpu == NR_CPUS)
+	if (cpu == WORK_CPU_NONE)
 		return NULL;
 
 	BUG_ON(cpu >= nr_cpu_ids);
@@ -846,7 +854,7 @@ static void cwq_unbind_single_cpu(struct cpu_workqueue_struct *cwq)
 	 */
 	if (likely(!(gcwq->flags & GCWQ_FREEZING))) {
 		smp_wmb();	/* paired with cmpxchg() in __queue_work() */
-		wq->single_cpu = NR_CPUS;
+		wq->single_cpu = WORK_CPU_NONE;
 	}
 }
 
@@ -904,7 +912,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 		 */
 	retry:
 		cpu = wq->single_cpu;
-		arbitrate = cpu == NR_CPUS;
+		arbitrate = cpu == WORK_CPU_NONE;
 		if (arbitrate)
 			cpu = req_cpu;
 
@@ -918,7 +926,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 		 * visible on the new cpu after this point.
 		 */
 		if (arbitrate)
-			cmpxchg(&wq->single_cpu, NR_CPUS, cpu);
+			cmpxchg(&wq->single_cpu, WORK_CPU_NONE, cpu);
 
 		if (unlikely(wq->single_cpu != cpu)) {
 			spin_unlock_irqrestore(&gcwq->lock, flags);
@@ -2572,7 +2580,7 @@ int keventd_up(void)
 	return system_wq != NULL;
 }
 
-static struct cpu_workqueue_struct *alloc_cwqs(void)
+static int alloc_cwqs(struct workqueue_struct *wq)
 {
 	/*
 	 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
@@ -2582,40 +2590,36 @@ static struct cpu_workqueue_struct *alloc_cwqs(void)
 	const size_t size = sizeof(struct cpu_workqueue_struct);
 	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
 				   __alignof__(unsigned long long));
-	struct cpu_workqueue_struct *cwqs;
 #ifndef CONFIG_SMP
 	void *ptr;
 
 	/*
-	 * On UP, percpu allocator doesn't honor alignment parameter
-	 * and simply uses arch-dependent default.  Allocate enough
-	 * room to align cwq and put an extra pointer at the end
-	 * pointing back to the originally allocated pointer which
-	 * will be used for free.
-	 *
-	 * FIXME: This really belongs to UP percpu code.  Update UP
-	 * percpu code to honor alignment and remove this ugliness.
+	 * Allocate enough room to align cwq and put an extra pointer
+	 * at the end pointing back to the originally allocated
+	 * pointer which will be used for free.
 	 */
-	ptr = __alloc_percpu(size + align + sizeof(void *), 1);
-	cwqs = PTR_ALIGN(ptr, align);
-	*(void **)per_cpu_ptr(cwqs + 1, 0) = ptr;
+	ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
+	if (ptr) {
+		wq->cpu_wq.single = PTR_ALIGN(ptr, align);
+		*(void **)(wq->cpu_wq.single + 1) = ptr;
+	}
 #else
-	/* On SMP, percpu allocator can do it itself */
-	cwqs = __alloc_percpu(size, align);
+	/* On SMP, percpu allocator can align itself */
+	wq->cpu_wq.pcpu = __alloc_percpu(size, align);
 #endif
 	/* just in case, make sure it's actually aligned */
-	BUG_ON(!IS_ALIGNED((unsigned long)cwqs, align));
-	return cwqs;
+	BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
+	return wq->cpu_wq.v ? 0 : -ENOMEM;
 }
 
-static void free_cwqs(struct cpu_workqueue_struct *cwqs)
+static void free_cwqs(struct workqueue_struct *wq)
 {
 #ifndef CONFIG_SMP
 	/* on UP, the pointer to free is stored right after the cwq */
-	if (cwqs)
-		free_percpu(*(void **)per_cpu_ptr(cwqs + 1, 0));
+	if (wq->cpu_wq.single)
+		kfree(*(void **)(wq->cpu_wq.single + 1));
 #else
-	free_percpu(cwqs);
+	free_percpu(wq->cpu_wq.pcpu);
 #endif
 }
 
@@ -2645,22 +2649,21 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	if (!wq)
 		goto err;
 
-	wq->cpu_wq = alloc_cwqs();
-	if (!wq->cpu_wq)
-		goto err;
-
 	wq->flags = flags;
 	wq->saved_max_active = max_active;
 	mutex_init(&wq->flush_mutex);
 	atomic_set(&wq->nr_cwqs_to_flush, 0);
 	INIT_LIST_HEAD(&wq->flusher_queue);
 	INIT_LIST_HEAD(&wq->flusher_overflow);
-	wq->single_cpu = NR_CPUS;
+	wq->single_cpu = WORK_CPU_NONE;
 
 	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	INIT_LIST_HEAD(&wq->list);
 
+	if (alloc_cwqs(wq) < 0)
+		goto err;
+
 	for_each_possible_cpu(cpu) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		struct global_cwq *gcwq = get_gcwq(cpu);
@@ -2710,7 +2713,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	return wq;
 err:
 	if (wq) {
-		free_cwqs(wq->cpu_wq);
+		free_cwqs(wq);
 		free_cpumask_var(wq->mayday_mask);
 		kfree(wq->rescuer);
 		kfree(wq);
@@ -2755,7 +2758,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 		free_cpumask_var(wq->mayday_mask);
 	}
 
-	free_cwqs(wq->cpu_wq);
+	free_cwqs(wq);
 	kfree(wq);
 }
 EXPORT_SYMBOL_GPL(destroy_workqueue);
@@ -2821,13 +2824,13 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
  * @work: the work of interest
  *
  * RETURNS:
- * CPU number if @work was ever queued.  NR_CPUS otherwise.
+ * CPU number if @work was ever queued.  WORK_CPU_NONE otherwise.
  */
 unsigned int work_cpu(struct work_struct *work)
 {
 	struct global_cwq *gcwq = get_work_gcwq(work);
 
-	return gcwq ? gcwq->cpu : NR_CPUS;
+	return gcwq ? gcwq->cpu : WORK_CPU_NONE;
 }
 EXPORT_SYMBOL_GPL(work_cpu);
 
@@ -3300,7 +3303,6 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
  */
 void freeze_workqueues_begin(void)
 {
-	struct workqueue_struct *wq;
 	unsigned int cpu;
 
 	spin_lock(&workqueue_lock);
@@ -3310,6 +3312,7 @@ void freeze_workqueues_begin(void)
 
 	for_each_possible_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
+		struct workqueue_struct *wq;
 
 		spin_lock_irq(&gcwq->lock);
 
@@ -3344,7 +3347,6 @@ void freeze_workqueues_begin(void)
  */
 bool freeze_workqueues_busy(void)
 {
-	struct workqueue_struct *wq;
 	unsigned int cpu;
 	bool busy = false;
 
@@ -3353,6 +3355,7 @@ bool freeze_workqueues_busy(void)
 	BUG_ON(!workqueue_freezing);
 
 	for_each_possible_cpu(cpu) {
+		struct workqueue_struct *wq;
 		/*
 		 * nr_active is monotonically decreasing.  It's safe
 		 * to peek without lock.
@@ -3386,7 +3389,6 @@ out_unlock:
  */
 void thaw_workqueues(void)
 {
-	struct workqueue_struct *wq;
 	unsigned int cpu;
 
 	spin_lock(&workqueue_lock);
@@ -3396,6 +3398,7 @@ void thaw_workqueues(void)
 
 	for_each_possible_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
+		struct workqueue_struct *wq;
 
 		spin_lock_irq(&gcwq->lock);
 
@@ -3443,7 +3446,7 @@ void __init init_workqueues(void)
 	 * sure cpu number won't overflow into kernel pointer area so
 	 * that they can be distinguished.
 	 */
-	BUILD_BUG_ON(NR_CPUS << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET);
+	BUILD_BUG_ON(WORK_CPU_LAST << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET);
 
 	hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
-- 
cgit v1.2.3-59-g8ed1b


From f34217977d717385a3e9fd7018ac39fade3964c0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 2 Jul 2010 10:03:51 +0200
Subject: workqueue: implement unbound workqueue

This patch implements unbound workqueue which can be specified with
WQ_UNBOUND flag on creation.  An unbound workqueue has the following
properties.

* It uses a dedicated gcwq with a pseudo CPU number WORK_CPU_UNBOUND.
  This gcwq is always online and disassociated.

* Workers are not bound to any CPU and not concurrency managed.  Works
  are dispatched to workers as soon as possible and the only applied
  limitation is @max_active.  IOW, all unbound workqeueues are
  implicitly high priority.

Unbound workqueues can be used as simple execution context provider.
Contexts unbound to any cpu are served as soon as possible.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: David Howells <dhowells@redhat.com>
---
 include/linux/workqueue.h |  15 +++-
 kernel/workqueue.c        | 218 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 173 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 139069a6286c..67ce734747f6 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -51,7 +51,8 @@ enum {
 	WORK_NO_COLOR		= WORK_NR_COLORS,
 
 	/* special cpu IDs */
-	WORK_CPU_NONE		= NR_CPUS,
+	WORK_CPU_UNBOUND	= NR_CPUS,
+	WORK_CPU_NONE		= NR_CPUS + 1,
 	WORK_CPU_LAST		= WORK_CPU_NONE,
 
 	/*
@@ -237,11 +238,17 @@ enum {
 	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
 	WQ_HIGHPRI		= 1 << 4, /* high priority */
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu instensive workqueue */
+	WQ_UNBOUND		= 1 << 6, /* not bound to any cpu */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
+	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
 	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
 };
 
+/* unbound wq's aren't per-cpu, scale max_active according to #cpus */
+#define WQ_UNBOUND_MAX_ACTIVE	\
+	max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU)
+
 /*
  * System-wide workqueues which are always present.
  *
@@ -256,10 +263,16 @@ enum {
  * system_nrt_wq is non-reentrant and guarantees that any given work
  * item is never executed in parallel by multiple CPUs.  Queue
  * flushing might take relatively long.
+ *
+ * system_unbound_wq is unbound workqueue.  Workers are not bound to
+ * any specific CPU, not concurrency managed, and all queued works are
+ * executed immediately as long as max_active limit is not reached and
+ * resources are available.
  */
 extern struct workqueue_struct *system_wq;
 extern struct workqueue_struct *system_long_wq;
 extern struct workqueue_struct *system_nrt_wq;
+extern struct workqueue_struct *system_unbound_wq;
 
 extern struct workqueue_struct *
 __alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a105ddf55f79..4608563cdd63 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -53,9 +53,10 @@ enum {
 	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */
 	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
 	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
+	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
 
 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
-				  WORKER_CPU_INTENSIVE,
+				  WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
 
 	/* gcwq->trustee_state */
 	TRUSTEE_START		= 0,		/* start */
@@ -96,7 +97,7 @@ enum {
  * X: During normal operation, modification requires gcwq->lock and
  *    should be done only from local cpu.  Either disabling preemption
  *    on local cpu or grabbing gcwq->lock is enough for read access.
- *    While trustee is in charge, it's identical to L.
+ *    If GCWQ_DISASSOCIATED is set, it's identical to L.
  *
  * F: wq->flush_mutex protected.
  *
@@ -220,14 +221,52 @@ struct workqueue_struct {
 struct workqueue_struct *system_wq __read_mostly;
 struct workqueue_struct *system_long_wq __read_mostly;
 struct workqueue_struct *system_nrt_wq __read_mostly;
+struct workqueue_struct *system_unbound_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_wq);
 EXPORT_SYMBOL_GPL(system_long_wq);
 EXPORT_SYMBOL_GPL(system_nrt_wq);
+EXPORT_SYMBOL_GPL(system_unbound_wq);
 
 #define for_each_busy_worker(worker, i, pos, gcwq)			\
 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
 		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
 
+static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
+				  unsigned int sw)
+{
+	if (cpu < nr_cpu_ids) {
+		if (sw & 1) {
+			cpu = cpumask_next(cpu, mask);
+			if (cpu < nr_cpu_ids)
+				return cpu;
+		}
+		if (sw & 2)
+			return WORK_CPU_UNBOUND;
+	}
+	return WORK_CPU_NONE;
+}
+
+static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
+				struct workqueue_struct *wq)
+{
+	return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
+}
+
+#define for_each_gcwq_cpu(cpu)						\
+	for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);		\
+	     (cpu) < WORK_CPU_NONE;					\
+	     (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
+
+#define for_each_online_gcwq_cpu(cpu)					\
+	for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);		\
+	     (cpu) < WORK_CPU_NONE;					\
+	     (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
+
+#define for_each_cwq_cpu(cpu, wq)					\
+	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));	\
+	     (cpu) < WORK_CPU_NONE;					\
+	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
@@ -351,26 +390,46 @@ static bool workqueue_freezing;		/* W: have wqs started freezing? */
 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
 static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
 
+/*
+ * Global cpu workqueue and nr_running counter for unbound gcwq.  The
+ * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
+ * workers have WORKER_UNBOUND set.
+ */
+static struct global_cwq unbound_global_cwq;
+static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);	/* always 0 */
+
 static int worker_thread(void *__worker);
 
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
-	return &per_cpu(global_cwq, cpu);
+	if (cpu != WORK_CPU_UNBOUND)
+		return &per_cpu(global_cwq, cpu);
+	else
+		return &unbound_global_cwq;
 }
 
 static atomic_t *get_gcwq_nr_running(unsigned int cpu)
 {
-	return &per_cpu(gcwq_nr_running, cpu);
+	if (cpu != WORK_CPU_UNBOUND)
+		return &per_cpu(gcwq_nr_running, cpu);
+	else
+		return &unbound_gcwq_nr_running;
 }
 
 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 					    struct workqueue_struct *wq)
 {
-#ifndef CONFIG_SMP
-	return wq->cpu_wq.single;
+	if (!(wq->flags & WQ_UNBOUND)) {
+		if (likely(cpu < nr_cpu_ids)) {
+#ifdef CONFIG_SMP
+			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
 #else
-	return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+			return wq->cpu_wq.single;
 #endif
+		}
+	} else if (likely(cpu == WORK_CPU_UNBOUND))
+		return wq->cpu_wq.single;
+	return NULL;
 }
 
 static unsigned int work_color_to_flags(int color)
@@ -453,7 +512,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 	if (cpu == WORK_CPU_NONE)
 		return NULL;
 
-	BUG_ON(cpu >= nr_cpu_ids);
+	BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
 	return get_gcwq(cpu);
 }
 
@@ -869,11 +928,14 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 
 	debug_work_activate(work);
 
+	if (unlikely(cpu == WORK_CPU_UNBOUND))
+		cpu = raw_smp_processor_id();
+
 	/*
 	 * Determine gcwq to use.  SINGLE_CPU is inherently
 	 * NON_REENTRANT, so test it first.
 	 */
-	if (!(wq->flags & WQ_SINGLE_CPU)) {
+	if (!(wq->flags & (WQ_SINGLE_CPU | WQ_UNBOUND))) {
 		struct global_cwq *last_gcwq;
 
 		/*
@@ -900,7 +962,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			}
 		} else
 			spin_lock_irqsave(&gcwq->lock, flags);
-	} else {
+	} else if (!(wq->flags & WQ_UNBOUND)) {
 		unsigned int req_cpu = cpu;
 
 		/*
@@ -932,6 +994,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			spin_unlock_irqrestore(&gcwq->lock, flags);
 			goto retry;
 		}
+	} else {
+		gcwq = get_gcwq(WORK_CPU_UNBOUND);
+		spin_lock_irqsave(&gcwq->lock, flags);
 	}
 
 	/* gcwq determined, get cwq and queue */
@@ -1166,7 +1231,8 @@ static bool worker_maybe_bind_and_lock(struct worker *worker)
 		 * it races with cpu hotunplug operation.  Verify
 		 * against GCWQ_DISASSOCIATED.
 		 */
-		set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
+		if (!(gcwq->flags & GCWQ_DISASSOCIATED))
+			set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
 
 		spin_lock_irq(&gcwq->lock);
 		if (gcwq->flags & GCWQ_DISASSOCIATED)
@@ -1231,8 +1297,9 @@ static struct worker *alloc_worker(void)
  */
 static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 {
-	int id = -1;
+	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
 	struct worker *worker = NULL;
+	int id = -1;
 
 	spin_lock_irq(&gcwq->lock);
 	while (ida_get_new(&gcwq->worker_ida, &id)) {
@@ -1250,8 +1317,12 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 	worker->gcwq = gcwq;
 	worker->id = id;
 
-	worker->task = kthread_create(worker_thread, worker, "kworker/%u:%d",
-				      gcwq->cpu, id);
+	if (!on_unbound_cpu)
+		worker->task = kthread_create(worker_thread, worker,
+					      "kworker/%u:%d", gcwq->cpu, id);
+	else
+		worker->task = kthread_create(worker_thread, worker,
+					      "kworker/u:%d", id);
 	if (IS_ERR(worker->task))
 		goto fail;
 
@@ -1260,10 +1331,13 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 	 * online later on.  Make sure every worker has
 	 * PF_THREAD_BOUND set.
 	 */
-	if (bind)
+	if (bind && !on_unbound_cpu)
 		kthread_bind(worker->task, gcwq->cpu);
-	else
+	else {
 		worker->task->flags |= PF_THREAD_BOUND;
+		if (on_unbound_cpu)
+			worker->flags |= WORKER_UNBOUND;
+	}
 
 	return worker;
 fail:
@@ -1358,12 +1432,17 @@ static bool send_mayday(struct work_struct *work)
 {
 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
 	struct workqueue_struct *wq = cwq->wq;
+	unsigned int cpu;
 
 	if (!(wq->flags & WQ_RESCUER))
 		return false;
 
 	/* mayday mayday mayday */
-	if (!cpumask_test_and_set_cpu(cwq->gcwq->cpu, wq->mayday_mask))
+	cpu = cwq->gcwq->cpu;
+	/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
+	if (cpu == WORK_CPU_UNBOUND)
+		cpu = 0;
+	if (!cpumask_test_and_set_cpu(cpu, wq->mayday_mask))
 		wake_up_process(wq->rescuer->task);
 	return true;
 }
@@ -1882,6 +1961,7 @@ static int rescuer_thread(void *__wq)
 	struct workqueue_struct *wq = __wq;
 	struct worker *rescuer = wq->rescuer;
 	struct list_head *scheduled = &rescuer->scheduled;
+	bool is_unbound = wq->flags & WQ_UNBOUND;
 	unsigned int cpu;
 
 	set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -1891,8 +1971,13 @@ repeat:
 	if (kthread_should_stop())
 		return 0;
 
+	/*
+	 * See whether any cpu is asking for help.  Unbounded
+	 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
+	 */
 	for_each_cpu(cpu, wq->mayday_mask) {
-		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+		unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
+		struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
 		struct global_cwq *gcwq = cwq->gcwq;
 		struct work_struct *work, *n;
 
@@ -2034,7 +2119,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
 		atomic_set(&wq->nr_cwqs_to_flush, 1);
 	}
 
-	for_each_possible_cpu(cpu) {
+	for_each_cwq_cpu(cpu, wq) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		struct global_cwq *gcwq = cwq->gcwq;
 
@@ -2344,7 +2429,7 @@ static void wait_on_work(struct work_struct *work)
 	lock_map_acquire(&work->lockdep_map);
 	lock_map_release(&work->lockdep_map);
 
-	for_each_possible_cpu(cpu)
+	for_each_gcwq_cpu(cpu)
 		wait_on_cpu_work(get_gcwq(cpu), work);
 }
 
@@ -2590,23 +2675,25 @@ static int alloc_cwqs(struct workqueue_struct *wq)
 	const size_t size = sizeof(struct cpu_workqueue_struct);
 	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
 				   __alignof__(unsigned long long));
-#ifndef CONFIG_SMP
-	void *ptr;
 
-	/*
-	 * Allocate enough room to align cwq and put an extra pointer
-	 * at the end pointing back to the originally allocated
-	 * pointer which will be used for free.
-	 */
-	ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
-	if (ptr) {
-		wq->cpu_wq.single = PTR_ALIGN(ptr, align);
-		*(void **)(wq->cpu_wq.single + 1) = ptr;
+	if (CONFIG_SMP && !(wq->flags & WQ_UNBOUND)) {
+		/* on SMP, percpu allocator can align itself */
+		wq->cpu_wq.pcpu = __alloc_percpu(size, align);
+	} else {
+		void *ptr;
+
+		/*
+		 * Allocate enough room to align cwq and put an extra
+		 * pointer at the end pointing back to the originally
+		 * allocated pointer which will be used for free.
+		 */
+		ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
+		if (ptr) {
+			wq->cpu_wq.single = PTR_ALIGN(ptr, align);
+			*(void **)(wq->cpu_wq.single + 1) = ptr;
+		}
 	}
-#else
-	/* On SMP, percpu allocator can align itself */
-	wq->cpu_wq.pcpu = __alloc_percpu(size, align);
-#endif
+
 	/* just in case, make sure it's actually aligned */
 	BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
 	return wq->cpu_wq.v ? 0 : -ENOMEM;
@@ -2614,23 +2701,25 @@ static int alloc_cwqs(struct workqueue_struct *wq)
 
 static void free_cwqs(struct workqueue_struct *wq)
 {
-#ifndef CONFIG_SMP
-	/* on UP, the pointer to free is stored right after the cwq */
-	if (wq->cpu_wq.single)
+	if (CONFIG_SMP && !(wq->flags & WQ_UNBOUND))
+		free_percpu(wq->cpu_wq.pcpu);
+	else if (wq->cpu_wq.single) {
+		/* the pointer to free is stored right after the cwq */
 		kfree(*(void **)(wq->cpu_wq.single + 1));
-#else
-	free_percpu(wq->cpu_wq.pcpu);
-#endif
+	}
 }
 
-static int wq_clamp_max_active(int max_active, const char *name)
+static int wq_clamp_max_active(int max_active, unsigned int flags,
+			       const char *name)
 {
-	if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
+	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
+
+	if (max_active < 1 || max_active > lim)
 		printk(KERN_WARNING "workqueue: max_active %d requested for %s "
 		       "is out of range, clamping between %d and %d\n",
-		       max_active, name, 1, WQ_MAX_ACTIVE);
+		       max_active, name, 1, lim);
 
-	return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
+	return clamp_val(max_active, 1, lim);
 }
 
 struct workqueue_struct *__alloc_workqueue_key(const char *name,
@@ -2642,8 +2731,15 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	struct workqueue_struct *wq;
 	unsigned int cpu;
 
+	/*
+	 * Unbound workqueues aren't concurrency managed and should be
+	 * dispatched to workers immediately.
+	 */
+	if (flags & WQ_UNBOUND)
+		flags |= WQ_HIGHPRI;
+
 	max_active = max_active ?: WQ_DFL_ACTIVE;
-	max_active = wq_clamp_max_active(max_active, name);
+	max_active = wq_clamp_max_active(max_active, flags, name);
 
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
@@ -2664,7 +2760,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	if (alloc_cwqs(wq) < 0)
 		goto err;
 
-	for_each_possible_cpu(cpu) {
+	for_each_cwq_cpu(cpu, wq) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		struct global_cwq *gcwq = get_gcwq(cpu);
 
@@ -2703,7 +2799,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	spin_lock(&workqueue_lock);
 
 	if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
-		for_each_possible_cpu(cpu)
+		for_each_cwq_cpu(cpu, wq)
 			get_cwq(cpu, wq)->max_active = 0;
 
 	list_add(&wq->list, &workqueues);
@@ -2743,7 +2839,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	spin_unlock(&workqueue_lock);
 
 	/* sanity check */
-	for_each_possible_cpu(cpu) {
+	for_each_cwq_cpu(cpu, wq) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		int i;
 
@@ -2777,13 +2873,13 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
 {
 	unsigned int cpu;
 
-	max_active = wq_clamp_max_active(max_active, wq->name);
+	max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
 
 	spin_lock(&workqueue_lock);
 
 	wq->saved_max_active = max_active;
 
-	for_each_possible_cpu(cpu) {
+	for_each_cwq_cpu(cpu, wq) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
 
 		spin_lock_irq(&gcwq->lock);
@@ -3310,7 +3406,7 @@ void freeze_workqueues_begin(void)
 	BUG_ON(workqueue_freezing);
 	workqueue_freezing = true;
 
-	for_each_possible_cpu(cpu) {
+	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
 		struct workqueue_struct *wq;
 
@@ -3322,7 +3418,7 @@ void freeze_workqueues_begin(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (wq->flags & WQ_FREEZEABLE)
+			if (cwq && wq->flags & WQ_FREEZEABLE)
 				cwq->max_active = 0;
 		}
 
@@ -3354,7 +3450,7 @@ bool freeze_workqueues_busy(void)
 
 	BUG_ON(!workqueue_freezing);
 
-	for_each_possible_cpu(cpu) {
+	for_each_gcwq_cpu(cpu) {
 		struct workqueue_struct *wq;
 		/*
 		 * nr_active is monotonically decreasing.  It's safe
@@ -3363,7 +3459,7 @@ bool freeze_workqueues_busy(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (!(wq->flags & WQ_FREEZEABLE))
+			if (!cwq || !(wq->flags & WQ_FREEZEABLE))
 				continue;
 
 			BUG_ON(cwq->nr_active < 0);
@@ -3396,7 +3492,7 @@ void thaw_workqueues(void)
 	if (!workqueue_freezing)
 		goto out_unlock;
 
-	for_each_possible_cpu(cpu) {
+	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
 		struct workqueue_struct *wq;
 
@@ -3408,7 +3504,7 @@ void thaw_workqueues(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (!(wq->flags & WQ_FREEZEABLE))
+			if (!cwq || !(wq->flags & WQ_FREEZEABLE))
 				continue;
 
 			/* restore max_active and repopulate worklist */
@@ -3451,12 +3547,14 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
 	/* initialize gcwqs */
-	for_each_possible_cpu(cpu) {
+	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
 
 		spin_lock_init(&gcwq->lock);
 		INIT_LIST_HEAD(&gcwq->worklist);
 		gcwq->cpu = cpu;
+		if (cpu == WORK_CPU_UNBOUND)
+			gcwq->flags |= GCWQ_DISASSOCIATED;
 
 		INIT_LIST_HEAD(&gcwq->idle_list);
 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
@@ -3476,7 +3574,7 @@ void __init init_workqueues(void)
 	}
 
 	/* create the initial worker */
-	for_each_online_cpu(cpu) {
+	for_each_online_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
 		struct worker *worker;
 
@@ -3490,5 +3588,7 @@ void __init init_workqueues(void)
 	system_wq = alloc_workqueue("events", 0, 0);
 	system_long_wq = alloc_workqueue("events_long", 0, 0);
 	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
+	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
+					    WQ_UNBOUND_MAX_ACTIVE);
 	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
 }
-- 
cgit v1.2.3-59-g8ed1b


From c7fc77f78f16d138ca997ce096a62f46e2e9420a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 2 Jul 2010 10:03:51 +0200
Subject: workqueue: remove WQ_SINGLE_CPU and use WQ_UNBOUND instead

WQ_SINGLE_CPU combined with @max_active of 1 is used to achieve full
ordering among works queued to a workqueue.  The same can be achieved
using WQ_UNBOUND as unbound workqueues always use the gcwq for
WORK_CPU_UNBOUND.  As @max_active is always one and benefits from cpu
locality isn't accessible anyway, serving them with unbound workqueues
should be fine.

Drop WQ_SINGLE_CPU support and use WQ_UNBOUND instead.  Note that most
single thread workqueue users will be converted to use multithread or
non-reentrant instead and only the ones which require strict ordering
will keep using WQ_UNBOUND + @max_active of 1.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |   7 ++--
 kernel/workqueue.c        | 100 +++++++++-------------------------------------
 2 files changed, 21 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 67ce734747f6..d74a529ed13e 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -233,12 +233,11 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 
 enum {
 	WQ_NON_REENTRANT	= 1 << 0, /* guarantee non-reentrance */
-	WQ_SINGLE_CPU		= 1 << 1, /* only single cpu at a time */
+	WQ_UNBOUND		= 1 << 1, /* not bound to any cpu */
 	WQ_FREEZEABLE		= 1 << 2, /* freeze during suspend */
 	WQ_RESCUER		= 1 << 3, /* has an rescue worker */
 	WQ_HIGHPRI		= 1 << 4, /* high priority */
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu instensive workqueue */
-	WQ_UNBOUND		= 1 << 6, /* not bound to any cpu */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
 	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
@@ -300,9 +299,9 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
 #define create_workqueue(name)					\
 	alloc_workqueue((name), WQ_RESCUER, 1)
 #define create_freezeable_workqueue(name)			\
-	alloc_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_CPU | WQ_RESCUER, 1)
+	alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1)
 #define create_singlethread_workqueue(name)			\
-	alloc_workqueue((name), WQ_SINGLE_CPU | WQ_RESCUER, 1)
+	alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1)
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4608563cdd63..20d6237d7498 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -206,8 +206,6 @@ struct workqueue_struct {
 	struct list_head	flusher_queue;	/* F: flush waiters */
 	struct list_head	flusher_overflow; /* F: flush overflow list */
 
-	unsigned long		single_cpu;	/* cpu for single cpu wq */
-
 	cpumask_var_t		mayday_mask;	/* cpus requesting rescue */
 	struct worker		*rescuer;	/* I: rescue worker */
 
@@ -889,34 +887,6 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 		wake_up_worker(gcwq);
 }
 
-/**
- * cwq_unbind_single_cpu - unbind cwq from single cpu workqueue processing
- * @cwq: cwq to unbind
- *
- * Try to unbind @cwq from single cpu workqueue processing.  If
- * @cwq->wq is frozen, unbind is delayed till the workqueue is thawed.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void cwq_unbind_single_cpu(struct cpu_workqueue_struct *cwq)
-{
-	struct workqueue_struct *wq = cwq->wq;
-	struct global_cwq *gcwq = cwq->gcwq;
-
-	BUG_ON(wq->single_cpu != gcwq->cpu);
-	/*
-	 * Unbind from workqueue if @cwq is not frozen.  If frozen,
-	 * thaw_workqueues() will either restart processing on this
-	 * cpu or unbind if empty.  This keeps works queued while
-	 * frozen fully ordered and flushable.
-	 */
-	if (likely(!(gcwq->flags & GCWQ_FREEZING))) {
-		smp_wmb();	/* paired with cmpxchg() in __queue_work() */
-		wq->single_cpu = WORK_CPU_NONE;
-	}
-}
-
 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			 struct work_struct *work)
 {
@@ -924,20 +894,16 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	struct cpu_workqueue_struct *cwq;
 	struct list_head *worklist;
 	unsigned long flags;
-	bool arbitrate;
 
 	debug_work_activate(work);
 
-	if (unlikely(cpu == WORK_CPU_UNBOUND))
-		cpu = raw_smp_processor_id();
-
-	/*
-	 * Determine gcwq to use.  SINGLE_CPU is inherently
-	 * NON_REENTRANT, so test it first.
-	 */
-	if (!(wq->flags & (WQ_SINGLE_CPU | WQ_UNBOUND))) {
+	/* determine gcwq to use */
+	if (!(wq->flags & WQ_UNBOUND)) {
 		struct global_cwq *last_gcwq;
 
+		if (unlikely(cpu == WORK_CPU_UNBOUND))
+			cpu = raw_smp_processor_id();
+
 		/*
 		 * It's multi cpu.  If @wq is non-reentrant and @work
 		 * was previously on a different cpu, it might still
@@ -962,38 +928,6 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 			}
 		} else
 			spin_lock_irqsave(&gcwq->lock, flags);
-	} else if (!(wq->flags & WQ_UNBOUND)) {
-		unsigned int req_cpu = cpu;
-
-		/*
-		 * It's a bit more complex for single cpu workqueues.
-		 * We first need to determine which cpu is going to be
-		 * used.  If no cpu is currently serving this
-		 * workqueue, arbitrate using atomic accesses to
-		 * wq->single_cpu; otherwise, use the current one.
-		 */
-	retry:
-		cpu = wq->single_cpu;
-		arbitrate = cpu == WORK_CPU_NONE;
-		if (arbitrate)
-			cpu = req_cpu;
-
-		gcwq = get_gcwq(cpu);
-		spin_lock_irqsave(&gcwq->lock, flags);
-
-		/*
-		 * The following cmpxchg() is a full barrier paired
-		 * with smp_wmb() in cwq_unbind_single_cpu() and
-		 * guarantees that all changes to wq->st_* fields are
-		 * visible on the new cpu after this point.
-		 */
-		if (arbitrate)
-			cmpxchg(&wq->single_cpu, WORK_CPU_NONE, cpu);
-
-		if (unlikely(wq->single_cpu != cpu)) {
-			spin_unlock_irqrestore(&gcwq->lock, flags);
-			goto retry;
-		}
 	} else {
 		gcwq = get_gcwq(WORK_CPU_UNBOUND);
 		spin_lock_irqsave(&gcwq->lock, flags);
@@ -1105,19 +1039,30 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct work_struct *work = &dwork->work;
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
-		struct global_cwq *gcwq = get_work_gcwq(work);
-		unsigned int lcpu = gcwq ? gcwq->cpu : raw_smp_processor_id();
+		unsigned int lcpu;
 
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
 
 		timer_stats_timer_set_start_info(&dwork->timer);
+
 		/*
 		 * This stores cwq for the moment, for the timer_fn.
 		 * Note that the work's gcwq is preserved to allow
 		 * reentrance detection for delayed works.
 		 */
+		if (!(wq->flags & WQ_UNBOUND)) {
+			struct global_cwq *gcwq = get_work_gcwq(work);
+
+			if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
+				lcpu = gcwq->cpu;
+			else
+				lcpu = raw_smp_processor_id();
+		} else
+			lcpu = WORK_CPU_UNBOUND;
+
 		set_work_cwq(work, get_cwq(lcpu, wq), 0);
+
 		timer->expires = jiffies + delay;
 		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
@@ -1696,9 +1641,6 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
 		/* one down, submit a delayed one */
 		if (cwq->nr_active < cwq->max_active)
 			cwq_activate_first_delayed(cwq);
-	} else if (!cwq->nr_active && cwq->wq->flags & WQ_SINGLE_CPU) {
-		/* this was the last work, unbind from single cpu */
-		cwq_unbind_single_cpu(cwq);
 	}
 
 	/* is flush in progress and are we at the flushing tip? */
@@ -2751,7 +2693,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	atomic_set(&wq->nr_cwqs_to_flush, 0);
 	INIT_LIST_HEAD(&wq->flusher_queue);
 	INIT_LIST_HEAD(&wq->flusher_overflow);
-	wq->single_cpu = WORK_CPU_NONE;
 
 	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
@@ -3513,11 +3454,6 @@ void thaw_workqueues(void)
 			while (!list_empty(&cwq->delayed_works) &&
 			       cwq->nr_active < cwq->max_active)
 				cwq_activate_first_delayed(cwq);
-
-			/* perform delayed unbind from single cpu if empty */
-			if (wq->single_cpu == gcwq->cpu &&
-			    !cwq->nr_active && list_empty(&cwq->delayed_works))
-				cwq_unbind_single_cpu(cwq);
 		}
 
 		wake_up_worker(gcwq);
-- 
cgit v1.2.3-59-g8ed1b


From 0f622bf465e78c390e13c5f4a14d0b3f8fb7c7e5 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 1 Jul 2010 09:01:50 -0700
Subject: Input: ads7846 - do not allow altering platform data

Tested-by: Anatolij Gustschin <agust@denx.de>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/ads7846.c | 35 +++++++++++++++++++----------------
 include/linux/spi/ads7846.h         |  2 +-
 2 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index 69210cb56c54..a3771607ead5 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -878,14 +878,15 @@ static int __devinit setup_pendown(struct spi_device *spi, struct ads7846 *ts)
 
 static int __devinit ads7846_probe(struct spi_device *spi)
 {
-	struct ads7846			*ts;
-	struct ads7846_packet		*packet;
-	struct input_dev		*input_dev;
-	struct ads7846_platform_data	*pdata = spi->dev.platform_data;
-	struct spi_message		*m;
-	struct spi_transfer		*x;
-	int				vref;
-	int				err;
+	struct ads7846 *ts;
+	struct ads7846_packet *packet;
+	struct input_dev *input_dev;
+	const struct ads7846_platform_data *pdata = spi->dev.platform_data;
+	struct spi_message *m;
+	struct spi_transfer *x;
+	unsigned long irq_flags;
+	int vref;
+	int err;
 
 	if (!spi->irq) {
 		dev_dbg(&spi->dev, "no IRQ?\n");
@@ -1174,20 +1175,22 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 		goto err_put_regulator;
 	}
 
-	if (!pdata->irq_flags)
-		pdata->irq_flags = IRQF_TRIGGER_FALLING;
+	irq_flags = pdata->irq_flags ? : IRQF_TRIGGER_FALLING;
 
-	if (request_irq(spi->irq, ads7846_irq, pdata->irq_flags,
-			spi->dev.driver->name, ts)) {
+	err = request_irq(spi->irq, ads7846_irq, irq_flags,
+			  spi->dev.driver->name, ts);
+
+	if (err && !pdata->irq_flags) {
 		dev_info(&spi->dev,
 			"trying pin change workaround on irq %d\n", spi->irq);
 		err = request_irq(spi->irq, ads7846_irq,
 				  IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
 				  spi->dev.driver->name, ts);
-		if (err) {
-			dev_dbg(&spi->dev, "irq %d busy?\n", spi->irq);
-			goto err_disable_regulator;
-		}
+	}
+
+	if (err) {
+		dev_dbg(&spi->dev, "irq %d busy?\n", spi->irq);
+		goto err_disable_regulator;
 	}
 
 	err = ads784x_hwmon_register(spi, ts);
diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index 95d36bfb34bc..92bd0839d5b4 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -48,7 +48,7 @@ struct ads7846_platform_data {
 					 * state if get_pendown_state == NULL
 					 */
 	int	(*get_pendown_state)(void);
-	int	(*filter_init)	(struct ads7846_platform_data *pdata,
+	int	(*filter_init)	(const struct ads7846_platform_data *pdata,
 				 void **filter_data);
 	int	(*filter)	(void *filter_data, int data_idx, int *val);
 	void	(*filter_cleanup)(void *filter_data);
-- 
cgit v1.2.3-59-g8ed1b


From 312e8e8a9e2471b0ada7366497fffb3ff1a40e2c Mon Sep 17 00:00:00 2001
From: Joonyoung Shim <jy0922.shim@samsung.com>
Date: Sun, 4 Jul 2010 01:21:25 -0700
Subject: Input: mcs - Add MCS touchkey driver

This adds support for MELPAS MCS5000/MSC5080 touch key controllers.

Signed-off-by: Joonyoung Shim <jy0922.shim@samsung.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/Kconfig         |  12 ++
 drivers/input/keyboard/Makefile        |   1 +
 drivers/input/keyboard/mcs_touchkey.c  | 239 +++++++++++++++++++++++++++++++++
 drivers/input/touchscreen/mcs5000_ts.c |   6 +-
 include/linux/i2c/mcs.h                |  34 +++++
 include/linux/i2c/mcs5000_ts.h         |  24 ----
 6 files changed, 289 insertions(+), 27 deletions(-)
 create mode 100644 drivers/input/keyboard/mcs_touchkey.c
 create mode 100644 include/linux/i2c/mcs.h
 delete mode 100644 include/linux/i2c/mcs5000_ts.h

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index d8fa5d724c57..c7480934ceeb 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig
@@ -297,6 +297,18 @@ config KEYBOARD_MAX7359
 	  To compile this driver as a module, choose M here: the
 	  module will be called max7359_keypad.
 
+config KEYBOARD_MCS
+	tristate "MELFAS MCS Touchkey"
+	depends on I2C
+	help
+	  Say Y here if you have the MELFAS MCS5000/5080 touchkey controller
+	  chip in your system.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mcs_touchkey.
+
 config KEYBOARD_IMX
 	tristate "IMX keypad support"
 	depends on ARCH_MXC
diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile
index 4596d0c6f922..0a16674ed3d2 100644
--- a/drivers/input/keyboard/Makefile
+++ b/drivers/input/keyboard/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_KEYBOARD_LOCOMO)		+= locomokbd.o
 obj-$(CONFIG_KEYBOARD_MAPLE)		+= maple_keyb.o
 obj-$(CONFIG_KEYBOARD_MATRIX)		+= matrix_keypad.o
 obj-$(CONFIG_KEYBOARD_MAX7359)		+= max7359_keypad.o
+obj-$(CONFIG_KEYBOARD_MCS)		+= mcs_touchkey.o
 obj-$(CONFIG_KEYBOARD_NEWTON)		+= newtonkbd.o
 obj-$(CONFIG_KEYBOARD_OMAP)		+= omap-keypad.o
 obj-$(CONFIG_KEYBOARD_OPENCORES)	+= opencores-kbd.o
diff --git a/drivers/input/keyboard/mcs_touchkey.c b/drivers/input/keyboard/mcs_touchkey.c
new file mode 100644
index 000000000000..63b849d7e90b
--- /dev/null
+++ b/drivers/input/keyboard/mcs_touchkey.c
@@ -0,0 +1,239 @@
+/*
+ * mcs_touchkey.c - Touchkey driver for MELFAS MCS5000/5080 controller
+ *
+ * Copyright (C) 2010 Samsung Electronics Co.Ltd
+ * Author: HeungJun Kim <riverful.kim@samsung.com>
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+#include <linux/i2c/mcs.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+
+/* MCS5000 Touchkey */
+#define MCS5000_TOUCHKEY_STATUS		0x04
+#define MCS5000_TOUCHKEY_STATUS_PRESS	7
+#define MCS5000_TOUCHKEY_FW		0x0a
+#define MCS5000_TOUCHKEY_BASE_VAL	0x61
+
+/* MCS5080 Touchkey */
+#define MCS5080_TOUCHKEY_STATUS		0x00
+#define MCS5080_TOUCHKEY_STATUS_PRESS	3
+#define MCS5080_TOUCHKEY_FW		0x01
+#define MCS5080_TOUCHKEY_BASE_VAL	0x1
+
+enum mcs_touchkey_type {
+	MCS5000_TOUCHKEY,
+	MCS5080_TOUCHKEY,
+};
+
+struct mcs_touchkey_chip {
+	unsigned int status_reg;
+	unsigned int pressbit;
+	unsigned int press_invert;
+	unsigned int baseval;
+};
+
+struct mcs_touchkey_data {
+	struct i2c_client *client;
+	struct input_dev *input_dev;
+	struct mcs_touchkey_chip chip;
+	unsigned int key_code;
+	unsigned int key_val;
+	unsigned short keycodes[];
+};
+
+static irqreturn_t mcs_touchkey_interrupt(int irq, void *dev_id)
+{
+	struct mcs_touchkey_data *data = dev_id;
+	struct mcs_touchkey_chip *chip = &data->chip;
+	struct i2c_client *client = data->client;
+	struct input_dev *input = data->input_dev;
+	unsigned int key_val;
+	unsigned int pressed;
+	int val;
+
+	val = i2c_smbus_read_byte_data(client, chip->status_reg);
+	if (val < 0) {
+		dev_err(&client->dev, "i2c read error [%d]\n", val);
+		goto out;
+	}
+
+	pressed = (val & (1 << chip->pressbit)) >> chip->pressbit;
+	if (chip->press_invert)
+		pressed ^= chip->press_invert;
+
+	/* key_val is 0 when released, so we should use key_val of press. */
+	if (pressed) {
+		key_val = val & (0xff >> (8 - chip->pressbit));
+		if (!key_val)
+			goto out;
+		key_val -= chip->baseval;
+		data->key_code = data->keycodes[key_val];
+		data->key_val = key_val;
+	}
+
+	input_event(input, EV_MSC, MSC_SCAN, data->key_val);
+	input_report_key(input, data->key_code, pressed);
+	input_sync(input);
+
+	dev_dbg(&client->dev, "key %d %d %s\n", data->key_val, data->key_code,
+		pressed ? "pressed" : "released");
+
+ out:
+	return IRQ_HANDLED;
+}
+
+static int __devinit mcs_touchkey_probe(struct i2c_client *client,
+		const struct i2c_device_id *id)
+{
+	const struct mcs_platform_data *pdata;
+	struct mcs_touchkey_data *data;
+	struct input_dev *input_dev;
+	unsigned int fw_reg;
+	int fw_ver;
+	int error;
+	int i;
+
+	pdata = client->dev.platform_data;
+	if (!pdata) {
+		dev_err(&client->dev, "no platform data defined\n");
+		return -EINVAL;
+	}
+
+	data = kzalloc(sizeof(struct mcs_touchkey_data) +
+			sizeof(data->keycodes[0]) * (pdata->key_maxval + 1),
+			GFP_KERNEL);
+	input_dev = input_allocate_device();
+	if (!data || !input_dev) {
+		dev_err(&client->dev, "Failed to allocate memory\n");
+		error = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	data->client = client;
+	data->input_dev = input_dev;
+
+	if (id->driver_data == MCS5000_TOUCHKEY) {
+		data->chip.status_reg = MCS5000_TOUCHKEY_STATUS;
+		data->chip.pressbit = MCS5000_TOUCHKEY_STATUS_PRESS;
+		data->chip.baseval = MCS5000_TOUCHKEY_BASE_VAL;
+		fw_reg = MCS5000_TOUCHKEY_FW;
+	} else {
+		data->chip.status_reg = MCS5080_TOUCHKEY_STATUS;
+		data->chip.pressbit = MCS5080_TOUCHKEY_STATUS_PRESS;
+		data->chip.press_invert = 1;
+		data->chip.baseval = MCS5080_TOUCHKEY_BASE_VAL;
+		fw_reg = MCS5080_TOUCHKEY_FW;
+	}
+
+	fw_ver = i2c_smbus_read_byte_data(client, fw_reg);
+	if (fw_ver < 0) {
+		error = fw_ver;
+		dev_err(&client->dev, "i2c read error[%d]\n", error);
+		goto err_free_mem;
+	}
+	dev_info(&client->dev, "Firmware version: %d\n", fw_ver);
+
+	input_dev->name = "MELPAS MCS Touchkey";
+	input_dev->id.bustype = BUS_I2C;
+	input_dev->dev.parent = &client->dev;
+	input_dev->evbit[0] = BIT_MASK(EV_KEY);
+	if (!pdata->no_autorepeat)
+		input_dev->evbit[0] |= BIT_MASK(EV_REP);
+	input_dev->keycode = data->keycodes;
+	input_dev->keycodesize = sizeof(data->keycodes[0]);
+	input_dev->keycodemax = pdata->key_maxval + 1;
+
+	for (i = 0; i < pdata->keymap_size; i++) {
+		unsigned int val = MCS_KEY_VAL(pdata->keymap[i]);
+		unsigned int code = MCS_KEY_CODE(pdata->keymap[i]);
+
+		data->keycodes[val] = code;
+		__set_bit(code, input_dev->keybit);
+	}
+
+	input_set_capability(input_dev, EV_MSC, MSC_SCAN);
+	input_set_drvdata(input_dev, data);
+
+	if (pdata->cfg_pin)
+		pdata->cfg_pin();
+
+	error = request_threaded_irq(client->irq, NULL, mcs_touchkey_interrupt,
+			IRQF_TRIGGER_FALLING, client->dev.driver->name, data);
+	if (error) {
+		dev_err(&client->dev, "Failed to register interrupt\n");
+		goto err_free_mem;
+	}
+
+	error = input_register_device(input_dev);
+	if (error)
+		goto err_free_irq;
+
+	i2c_set_clientdata(client, data);
+	return 0;
+
+err_free_irq:
+	free_irq(client->irq, data);
+err_free_mem:
+	input_free_device(input_dev);
+	kfree(data);
+	return error;
+}
+
+static int __devexit mcs_touchkey_remove(struct i2c_client *client)
+{
+	struct mcs_touchkey_data *data = i2c_get_clientdata(client);
+
+	free_irq(client->irq, data);
+	input_unregister_device(data->input_dev);
+	kfree(data);
+
+	return 0;
+}
+
+static const struct i2c_device_id mcs_touchkey_id[] = {
+	{ "mcs5000_touchkey", MCS5000_TOUCHKEY },
+	{ "mcs5080_touchkey", MCS5080_TOUCHKEY },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, mcs_touchkey_id);
+
+static struct i2c_driver mcs_touchkey_driver = {
+	.driver = {
+		.name	= "mcs_touchkey",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= mcs_touchkey_probe,
+	.remove		= __devexit_p(mcs_touchkey_remove),
+	.id_table	= mcs_touchkey_id,
+};
+
+static int __init mcs_touchkey_init(void)
+{
+	return i2c_add_driver(&mcs_touchkey_driver);
+}
+
+static void __exit mcs_touchkey_exit(void)
+{
+	i2c_del_driver(&mcs_touchkey_driver);
+}
+
+module_init(mcs_touchkey_init);
+module_exit(mcs_touchkey_exit);
+
+/* Module information */
+MODULE_AUTHOR("Joonyoung Shim <jy0922.shim@samsung.com>");
+MODULE_AUTHOR("HeungJun Kim <riverful.kim@samsung.com>");
+MODULE_DESCRIPTION("Touchkey driver for MELFAS MCS5000/5080 controller");
+MODULE_LICENSE("GPL");
diff --git a/drivers/input/touchscreen/mcs5000_ts.c b/drivers/input/touchscreen/mcs5000_ts.c
index 1fb0c2f06a44..6ee9940aaf5b 100644
--- a/drivers/input/touchscreen/mcs5000_ts.c
+++ b/drivers/input/touchscreen/mcs5000_ts.c
@@ -16,7 +16,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/i2c.h>
-#include <linux/i2c/mcs5000_ts.h>
+#include <linux/i2c/mcs.h>
 #include <linux/interrupt.h>
 #include <linux/input.h>
 #include <linux/irq.h>
@@ -105,7 +105,7 @@ enum mcs5000_ts_read_offset {
 struct mcs5000_ts_data {
 	struct i2c_client *client;
 	struct input_dev *input_dev;
-	const struct mcs5000_ts_platform_data *platform_data;
+	const struct mcs_platform_data *platform_data;
 };
 
 static irqreturn_t mcs5000_ts_interrupt(int irq, void *dev_id)
@@ -164,7 +164,7 @@ static irqreturn_t mcs5000_ts_interrupt(int irq, void *dev_id)
 
 static void mcs5000_ts_phys_init(struct mcs5000_ts_data *data)
 {
-	const struct mcs5000_ts_platform_data *platform_data =
+	const struct mcs_platform_data *platform_data =
 		data->platform_data;
 	struct i2c_client *client = data->client;
 
diff --git a/include/linux/i2c/mcs.h b/include/linux/i2c/mcs.h
new file mode 100644
index 000000000000..725ae7c313ff
--- /dev/null
+++ b/include/linux/i2c/mcs.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2009 - 2010 Samsung Electronics Co.Ltd
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ * Author: HeungJun Kim <riverful.kim@samsung.com>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __LINUX_MCS_H
+#define __LINUX_MCS_H
+
+#define MCS_KEY_MAP(v, c)	((((v) & 0xff) << 16) | ((c) & 0xffff))
+#define MCS_KEY_VAL(v)		(((v) >> 16) & 0xff)
+#define MCS_KEY_CODE(v)		((v) & 0xffff)
+
+struct mcs_platform_data {
+	void (*cfg_pin)(void);
+
+	/* touchscreen */
+	unsigned int x_size;
+	unsigned int y_size;
+
+	/* touchkey */
+	const u32 *keymap;
+	unsigned int keymap_size;
+	unsigned int key_maxval;
+	bool no_autorepeat;
+};
+
+#endif	/* __LINUX_MCS_H */
diff --git a/include/linux/i2c/mcs5000_ts.h b/include/linux/i2c/mcs5000_ts.h
deleted file mode 100644
index 5a117b5ca15e..000000000000
--- a/include/linux/i2c/mcs5000_ts.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * mcs5000_ts.h
- *
- * Copyright (C) 2009 Samsung Electronics Co.Ltd
- * Author: Joonyoung Shim <jy0922.shim@samsung.com>
- *
- *  This program is free software; you can redistribute  it and/or modify it
- *  under  the terms of  the GNU General  Public License as published by the
- *  Free Software Foundation;  either version 2 of the  License, or (at your
- *  option) any later version.
- *
- */
-
-#ifndef __LINUX_MCS5000_TS_H
-#define __LINUX_MCS5000_TS_H
-
-/* platform data for the MELFAS MCS-5000 touchscreen driver */
-struct mcs5000_ts_platform_data {
-	void (*cfg_pin)(void);
-	int x_size;
-	int y_size;
-};
-
-#endif	/* __LINUX_MCS5000_TS_H */
-- 
cgit v1.2.3-59-g8ed1b


From 7adde04a2f5a798f04a556dfb3b69bff388e5dc4 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Wed, 30 Jun 2010 17:57:22 +0800
Subject: slab: fix caller tracking on !CONFIG_DEBUG_SLAB && CONFIG_TRACING

In slab, all __xxx_track_caller is defined on CONFIG_DEBUG_SLAB || CONFIG_TRACING,
thus caller tracking function should be worked for CONFIG_TRACING. But if
CONFIG_DEBUG_SLAB is not set, include/linux/slab.h will define xxx_track_caller to
__xxx() without consideration of CONFIG_TRACING. This will break the caller tracking
behaviour then.

Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Dmitry Monakhov <dmonakhov@openvz.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/slab.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 49d1247cd6d9..59260e21bdf5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -268,7 +268,8 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
  * allocator where we care about the real place the memory allocation
  * request comes from.
  */
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
+	(defined(CONFIG_SLAB) && defined(CONFIG_TRACING))
 extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
 #define kmalloc_track_caller(size, flags) \
 	__kmalloc_track_caller(size, flags, _RET_IP_)
@@ -286,7 +287,8 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
  * standard allocator where we care about the real place the memory
  * allocation request comes from.
  */
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
+	(defined(CONFIG_SLAB) && defined(CONFIG_TRACING))
 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
 #define kmalloc_node_track_caller(size, flags, node) \
 	__kmalloc_node_track_caller(size, flags, node, \
-- 
cgit v1.2.3-59-g8ed1b


From 7db6f5fb65a82af03229eef104dc9899c5eecf33 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 27 Jun 2010 01:02:33 +0000
Subject: vsprintf: Recursive vsnprintf: Add "%pV", struct va_format

Add the ability to print a format and va_list from a structure pointer

Allows __dev_printk to be implemented as a single printk while
minimizing string space duplication.

%pV should not be used without some mechanism to verify the
format and argument use ala __attribute__(format (printf(...))).

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h | 5 +++++
 lib/vsprintf.c         | 9 +++++++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8317ec4b9f3b..01dfc05ef4ac 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -171,6 +171,11 @@ static inline void might_fault(void)
 }
 #endif
 
+struct va_format {
+	const char *fmt;
+	va_list *va;
+};
+
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index b8a2f549ab0e..4ee19d0d3910 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -980,6 +980,11 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
  *             [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15]
  *           little endian output byte order is:
  *             [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15]
+ * - 'V' For a struct va_format which contains a format string * and va_list *,
+ *       call vsnprintf(->format, *->va_list).
+ *       Implements a "recursive vsnprintf".
+ *       Do not use this feature without some mechanism to verify the
+ *       correctness of the format string and va_list arguments.
  *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
@@ -1025,6 +1030,10 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 		break;
 	case 'U':
 		return uuid_string(buf, end, ptr, spec, fmt);
+	case 'V':
+		return buf + vsnprintf(buf, end - buf,
+				       ((struct va_format *)ptr)->fmt,
+				       *(((struct va_format *)ptr)->va));
 	}
 	spec.flags |= SMALL;
 	if (spec.field_width == -1) {
-- 
cgit v1.2.3-59-g8ed1b


From 99bcf217183e02ebae46373896fba7f12d588001 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 27 Jun 2010 01:02:34 +0000
Subject: device.h drivers/base/core.c Convert dev_<level> logging macros to
 functions

Reduces an x86 defconfig text and data ~55k, .6% smaller.

$ size vmlinux*
   text	   data	    bss	    dec	    hex	filename
7205273	 716016	1366288	9287577	 8db799	vmlinux
7258890	 719768	1366288	9344946	 8e97b2	vmlinux.master

Uses %pV and struct va_format
Format arguments are verified before printk

The dev_info macro is converted to _dev_info because there are
existing uses of variables named dev_info in the kernel tree
like drivers/net/pcmcia/pcnet_cs.c

A dev_info macro is created to call _dev_info

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/base/core.c    |  64 ++++++++++++++++++++++++++++
 include/linux/device.h | 112 +++++++++++++++++++++++++++++++++++++------------
 2 files changed, 150 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 9630fbdf4e6c..38bbbd029306 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1819,3 +1819,67 @@ void device_shutdown(void)
 	spin_unlock(&devices_kset->list_lock);
 	async_synchronize_full();
 }
+
+/*
+ * Device logging functions
+ */
+
+#ifdef CONFIG_PRINTK
+
+static int __dev_printk(const char *level, const struct device *dev,
+			struct va_format *vaf)
+{
+	if (!dev)
+		return printk("%s(NULL device *): %pV", level, vaf);
+
+	return printk("%s%s %s: %pV",
+		      level, dev_driver_string(dev), dev_name(dev), vaf);
+}
+
+int dev_printk(const char *level, const struct device *dev,
+	       const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	r = __dev_printk(level, dev, &vaf);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(dev_printk);
+
+#define define_dev_printk_level(func, kern_level)		\
+int func(const struct device *dev, const char *fmt, ...)	\
+{								\
+	struct va_format vaf;					\
+	va_list args;						\
+	int r;							\
+								\
+	va_start(args, fmt);					\
+								\
+	vaf.fmt = fmt;						\
+	vaf.va = &args;						\
+								\
+	r = __dev_printk(kern_level, dev, &vaf);		\
+	va_end(args);						\
+								\
+	return r;						\
+}								\
+EXPORT_SYMBOL(func);
+
+define_dev_printk_level(dev_emerg, KERN_EMERG);
+define_dev_printk_level(dev_alert, KERN_ALERT);
+define_dev_printk_level(dev_crit, KERN_CRIT);
+define_dev_printk_level(dev_err, KERN_ERR);
+define_dev_printk_level(dev_warn, KERN_WARNING);
+define_dev_printk_level(dev_notice, KERN_NOTICE);
+define_dev_printk_level(_dev_info, KERN_INFO);
+
+#endif
diff --git a/include/linux/device.h b/include/linux/device.h
index 0713e10571dd..6a8276f683b6 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -638,43 +638,103 @@ extern void sysdev_shutdown(void);
 
 /* debugging and troubleshooting/diagnostic helpers. */
 extern const char *dev_driver_string(const struct device *dev);
-#define dev_printk(level, dev, format, arg...)	\
-	printk(level "%s %s: " format , dev_driver_string(dev) , \
-	       dev_name(dev) , ## arg)
-
-#define dev_emerg(dev, format, arg...)		\
-	dev_printk(KERN_EMERG , dev , format , ## arg)
-#define dev_alert(dev, format, arg...)		\
-	dev_printk(KERN_ALERT , dev , format , ## arg)
-#define dev_crit(dev, format, arg...)		\
-	dev_printk(KERN_CRIT , dev , format , ## arg)
-#define dev_err(dev, format, arg...)		\
-	dev_printk(KERN_ERR , dev , format , ## arg)
-#define dev_warn(dev, format, arg...)		\
-	dev_printk(KERN_WARNING , dev , format , ## arg)
-#define dev_notice(dev, format, arg...)		\
-	dev_printk(KERN_NOTICE , dev , format , ## arg)
-#define dev_info(dev, format, arg...)		\
-	dev_printk(KERN_INFO , dev , format , ## arg)
+
+
+#ifdef CONFIG_PRINTK
+
+extern int dev_printk(const char *level, const struct device *dev,
+		      const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
+extern int dev_emerg(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int dev_alert(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int dev_crit(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int dev_err(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int dev_warn(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int dev_notice(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int _dev_info(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+
+#else
+
+static inline int dev_printk(const char *level, const struct device *dev,
+		      const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
+static inline int dev_printk(const char *level, const struct device *dev,
+		      const char *fmt, ...)
+	 { return 0; }
+
+static inline int dev_emerg(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+static inline int dev_emerg(const struct device *dev, const char *fmt, ...)
+	{ return 0; }
+static inline int dev_crit(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+static inline int dev_crit(const struct device *dev, const char *fmt, ...)
+	{ return 0; }
+static inline int dev_alert(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+static inline int dev_alert(const struct device *dev, const char *fmt, ...)
+	{ return 0; }
+static inline int dev_err(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+static inline int dev_err(const struct device *dev, const char *fmt, ...)
+	{ return 0; }
+static inline int dev_warn(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+static inline int dev_warn(const struct device *dev, const char *fmt, ...)
+	{ return 0; }
+static inline int dev_notice(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+static inline int dev_notice(const struct device *dev, const char *fmt, ...)
+	{ return 0; }
+static inline int _dev_info(const struct device *dev, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+static inline int _dev_info(const struct device *dev, const char *fmt, ...)
+	{ return 0; }
+
+#endif
+
+/*
+ * Stupid hackaround for existing uses of non-printk uses dev_info
+ *
+ * Note that the definition of dev_info below is actually _dev_info
+ * and a macro is used to avoid redefining dev_info
+ */
+
+#define dev_info(dev, fmt, arg...) _dev_info(dev, fmt, ##arg)
 
 #if defined(DEBUG)
 #define dev_dbg(dev, format, arg...)		\
-	dev_printk(KERN_DEBUG , dev , format , ## arg)
+	dev_printk(KERN_DEBUG, dev, format, ##arg)
 #elif defined(CONFIG_DYNAMIC_DEBUG)
-#define dev_dbg(dev, format, ...) do { \
+#define dev_dbg(dev, format, ...)		     \
+do {						     \
 	dynamic_dev_dbg(dev, format, ##__VA_ARGS__); \
-	} while (0)
+} while (0)
 #else
-#define dev_dbg(dev, format, arg...)		\
-	({ if (0) dev_printk(KERN_DEBUG, dev, format, ##arg); 0; })
+#define dev_dbg(dev, format, arg...)				\
+({								\
+	if (0)							\
+		dev_printk(KERN_DEBUG, dev, format, ##arg);	\
+	0;							\
+})
 #endif
 
 #ifdef VERBOSE_DEBUG
 #define dev_vdbg	dev_dbg
 #else
-
-#define dev_vdbg(dev, format, arg...)		\
-	({ if (0) dev_printk(KERN_DEBUG, dev, format, ##arg); 0; })
+#define dev_vdbg(dev, format, arg...)				\
+({								\
+	if (0)							\
+		dev_printk(KERN_DEBUG, dev, format, ##arg);	\
+	0;							\
+})
 #endif
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 256df2f3879efdb2e9808bdb1b54b16fbb11fa38 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 27 Jun 2010 01:02:35 +0000
Subject: netdevice.h net/core/dev.c: Convert netdev_<level> logging macros to
 functions

Reduces an x86 defconfig text and data ~2k.
text is smaller, data is larger.

$ size vmlinux*
   text	   data	    bss	    dec	    hex	filename
7198862	 720112	1366288	9285262	 8dae8e	vmlinux
7205273	 716016	1366288	9287577	 8db799	vmlinux.device_h

Uses %pV and struct va_format
Format arguments are verified before printk

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 36 +++++++++++++--------------
 net/core/dev.c            | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8fa5e5aa879a..0183901ea475 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2250,25 +2250,23 @@ static inline const char *netdev_name(const struct net_device *dev)
 	return dev->name;
 }
 
-#define netdev_printk(level, netdev, format, args...)		\
-	dev_printk(level, (netdev)->dev.parent,			\
-		   "%s: " format,				\
-		   netdev_name(netdev), ##args)
-
-#define netdev_emerg(dev, format, args...)			\
-	netdev_printk(KERN_EMERG, dev, format, ##args)
-#define netdev_alert(dev, format, args...)			\
-	netdev_printk(KERN_ALERT, dev, format, ##args)
-#define netdev_crit(dev, format, args...)			\
-	netdev_printk(KERN_CRIT, dev, format, ##args)
-#define netdev_err(dev, format, args...)			\
-	netdev_printk(KERN_ERR, dev, format, ##args)
-#define netdev_warn(dev, format, args...)			\
-	netdev_printk(KERN_WARNING, dev, format, ##args)
-#define netdev_notice(dev, format, args...)			\
-	netdev_printk(KERN_NOTICE, dev, format, ##args)
-#define netdev_info(dev, format, args...)			\
-	netdev_printk(KERN_INFO, dev, format, ##args)
+extern int netdev_printk(const char *level, const struct net_device *dev,
+			 const char *format, ...)
+	__attribute__ ((format (printf, 3, 4)));
+extern int netdev_emerg(const struct net_device *dev, const char *format, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int netdev_alert(const struct net_device *dev, const char *format, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int netdev_crit(const struct net_device *dev, const char *format, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int netdev_err(const struct net_device *dev, const char *format, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int netdev_warn(const struct net_device *dev, const char *format, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int netdev_notice(const struct net_device *dev, const char *format, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int netdev_info(const struct net_device *dev, const char *format, ...)
+	__attribute__ ((format (printf, 2, 3)));
 
 #if defined(DEBUG)
 #define netdev_dbg(__dev, format, args...)			\
diff --git a/net/core/dev.c b/net/core/dev.c
index e85cc5fa3c4e..93b8929fa21d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5802,6 +5802,68 @@ char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
 	return buffer;
 }
 
+static int __netdev_printk(const char *level, const struct net_device *dev,
+			   struct va_format *vaf)
+{
+	int r;
+
+	if (dev && dev->dev.parent)
+		r = dev_printk(level, dev->dev.parent, "%s: %pV",
+			       netdev_name(dev), vaf);
+	else if (dev)
+		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
+	else
+		r = printk("%s(NULL net_device): %pV", level, vaf);
+
+	return r;
+}
+
+int netdev_printk(const char *level, const struct net_device *dev,
+		  const char *format, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int r;
+
+	va_start(args, format);
+
+	vaf.fmt = format;
+	vaf.va = &args;
+
+	r = __netdev_printk(level, dev, &vaf);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(netdev_printk);
+
+#define define_netdev_printk_level(func, level)			\
+int func(const struct net_device *dev, const char *fmt, ...)	\
+{								\
+	int r;							\
+	struct va_format vaf;					\
+	va_list args;						\
+								\
+	va_start(args, fmt);					\
+								\
+	vaf.fmt = fmt;						\
+	vaf.va = &args;						\
+								\
+	r = __netdev_printk(level, dev, &vaf);			\
+	va_end(args);						\
+								\
+	return r;						\
+}								\
+EXPORT_SYMBOL(func);
+
+define_netdev_printk_level(netdev_emerg, KERN_EMERG);
+define_netdev_printk_level(netdev_alert, KERN_ALERT);
+define_netdev_printk_level(netdev_crit, KERN_CRIT);
+define_netdev_printk_level(netdev_err, KERN_ERR);
+define_netdev_printk_level(netdev_warn, KERN_WARNING);
+define_netdev_printk_level(netdev_notice, KERN_NOTICE);
+define_netdev_printk_level(netdev_info, KERN_INFO);
+
 static void __net_exit netdev_exit(struct net *net)
 {
 	kfree(net->dev_name_head);
-- 
cgit v1.2.3-59-g8ed1b


From f45f4321d2c977c9eff77e5a5225f3cd2140eb20 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 27 Jun 2010 01:02:36 +0000
Subject: netdevice.h: Change netif_<level> macros to call netdev_<level>
 functions

Reduces text ~300 bytes of text (woohoo!) in an x86 defconfig

$ size vmlinux*
   text	   data	    bss	    dec	    hex	filename
7198526	 720112	1366288	9284926	 8dad3e	vmlinux
7198862	 720112	1366288	9285262	 8dae8e	vmlinux.netdev

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0183901ea475..4d27368674db 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2314,20 +2314,26 @@ do {					  			\
 		netdev_printk(level, (dev), fmt, ##args);	\
 } while (0)
 
+#define netif_level(level, priv, type, dev, fmt, args...)	\
+do {								\
+	if (netif_msg_##type(priv))				\
+		netdev_##level(dev, fmt, ##args);		\
+} while (0)
+
 #define netif_emerg(priv, type, dev, fmt, args...)		\
-	netif_printk(priv, type, KERN_EMERG, dev, fmt, ##args)
+	netif_level(emerg, priv, type, dev, fmt, ##args)
 #define netif_alert(priv, type, dev, fmt, args...)		\
-	netif_printk(priv, type, KERN_ALERT, dev, fmt, ##args)
+	netif_level(alert, priv, type, dev, fmt, ##args)
 #define netif_crit(priv, type, dev, fmt, args...)		\
-	netif_printk(priv, type, KERN_CRIT, dev, fmt, ##args)
+	netif_level(crit, priv, type, dev, fmt, ##args)
 #define netif_err(priv, type, dev, fmt, args...)		\
-	netif_printk(priv, type, KERN_ERR, dev, fmt, ##args)
+	netif_level(err, priv, type, dev, fmt, ##args)
 #define netif_warn(priv, type, dev, fmt, args...)		\
-	netif_printk(priv, type, KERN_WARNING, dev, fmt, ##args)
+	netif_level(warn, priv, type, dev, fmt, ##args)
 #define netif_notice(priv, type, dev, fmt, args...)		\
-	netif_printk(priv, type, KERN_NOTICE, dev, fmt, ##args)
+	netif_level(notice, priv, type, dev, fmt, ##args)
 #define netif_info(priv, type, dev, fmt, args...)		\
-	netif_printk(priv, type, KERN_INFO, (dev), fmt, ##args)
+	netif_level(info, priv, type, dev, fmt, ##args)
 
 #if defined(DEBUG)
 #define netif_dbg(priv, type, dev, format, args...)		\
-- 
cgit v1.2.3-59-g8ed1b


From 7dc2e1134a22dc242175d5321c0c9e97d16eb87b Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:06 -0600
Subject: of/irq: merge irq mapping code

Merge common irq mapping code between PowerPC and Microblaze.

This patch merges of_irq_find_parent(), of_irq_map_raw() and
of_irq_map_one().  The functions are dependent on one another, so all
three are merged in a single patch.  Other than cosmetic difference
(ie. DBG() vs. pr_debug()), the implementations are identical.

of_irq_to_resource() is also merged, but in this case the
implementations are different.  This patch drops the microblaze version
and uses the powerpc implementation unchanged.  The microblaze version
essentially open-coded irq_of_parse_and_map() which it does not need
to do.  Therefore the powerpc version is safe to adopt.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
CC: Michal Simek <monstr@monstr.eu>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h  |  31 ----
 arch/microblaze/kernel/prom_parse.c | 290 ----------------------------------
 arch/powerpc/include/asm/prom.h     |  47 ------
 arch/powerpc/kernel/prom_parse.c    | 266 -------------------------------
 drivers/of/irq.c                    | 301 ++++++++++++++++++++++++++++++++++++
 include/linux/of_irq.h              |  29 ++++
 6 files changed, 330 insertions(+), 634 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 4f34bc5baa83..5fbdfe76fe76 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -89,34 +89,6 @@ struct device_node *of_get_cpu_node(int cpu, unsigned int *thread);
 /* Get the MAC address */
 extern const void *of_get_mac_address(struct device_node *np);
 
-/*
- * OF interrupt mapping
- */
-
-#define OF_IMAP_OLDWORLD_MAC	0x00000001
-#define OF_IMAP_NO_PHANDLE	0x00000002
-
-/**
- * of_irq_map_raw - Low level interrupt tree parsing
- * @parent:	the device interrupt parent
- * @intspec:	interrupt specifier ("interrupts" property of the device)
- * @ointsize:	size of the passed in interrupt specifier
- * @addr:	address specifier (start of "reg" property of the device)
- * @out_irq:	structure of_irq filled by this function
- *
- * Returns 0 on success and a negative number on error
- *
- * This function is a low-level interrupt tree walking function. It
- * can be used to do a partial walk with synthetized reg and interrupts
- * properties, for example when resolving PCI interrupts when no device
- * node exist for the parent.
- *
- */
-
-extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec,
-			u32 ointsize, const u32 *addr,
-			struct of_irq *out_irq);
-
 /**
  * of_irq_map_pci - Resolve the interrupt for a PCI device
  * @pdev:	the device whose interrupt is to be resolved
@@ -131,9 +103,6 @@ extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec,
 struct pci_dev;
 extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
 
-extern int of_irq_to_resource(struct device_node *dev, int index,
-			struct resource *r);
-
 /**
  * of_iomap - Maps the memory mapped IO for a given device_node
  * @device:	the device whose io range will be mapped
diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c
index cba05812ab46..e28968fa34c1 100644
--- a/arch/microblaze/kernel/prom_parse.c
+++ b/arch/microblaze/kernel/prom_parse.c
@@ -644,267 +644,6 @@ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
 	*size = of_read_number(dma_window, cells);
 }
 
-/*
- * Interrupt remapper
- */
-
-static unsigned int of_irq_workarounds;
-static struct device_node *of_irq_dflt_pic;
-
-static struct device_node *of_irq_find_parent(struct device_node *child)
-{
-	struct device_node *p;
-	const phandle *parp;
-
-	if (!of_node_get(child))
-		return NULL;
-
-	do {
-		parp = of_get_property(child, "interrupt-parent", NULL);
-		if (parp == NULL)
-			p = of_get_parent(child);
-		else {
-			if (of_irq_workarounds & OF_IMAP_NO_PHANDLE)
-				p = of_node_get(of_irq_dflt_pic);
-			else
-				p = of_find_node_by_phandle(*parp);
-		}
-		of_node_put(child);
-		child = p;
-	} while (p && of_get_property(p, "#interrupt-cells", NULL) == NULL);
-
-	return p;
-}
-
-int of_irq_map_raw(struct device_node *parent, const u32 *intspec, u32 ointsize,
-		const u32 *addr, struct of_irq *out_irq)
-{
-	struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL;
-	const u32 *tmp, *imap, *imask;
-	u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0;
-	int imaplen, match, i;
-
-	pr_debug("of_irq_map_raw: par=%s,intspec=[0x%08x 0x%08x...],"
-		"ointsize=%d\n",
-		parent->full_name, intspec[0], intspec[1], ointsize);
-
-	ipar = of_node_get(parent);
-
-	/* First get the #interrupt-cells property of the current cursor
-	 * that tells us how to interpret the passed-in intspec. If there
-	 * is none, we are nice and just walk up the tree
-	 */
-	do {
-		tmp = of_get_property(ipar, "#interrupt-cells", NULL);
-		if (tmp != NULL) {
-			intsize = *tmp;
-			break;
-		}
-		tnode = ipar;
-		ipar = of_irq_find_parent(ipar);
-		of_node_put(tnode);
-	} while (ipar);
-	if (ipar == NULL) {
-		pr_debug(" -> no parent found !\n");
-		goto fail;
-	}
-
-	pr_debug("of_irq_map_raw: ipar=%s, size=%d\n",
-			ipar->full_name, intsize);
-
-	if (ointsize != intsize)
-		return -EINVAL;
-
-	/* Look for this #address-cells. We have to implement the old linux
-	 * trick of looking for the parent here as some device-trees rely on it
-	 */
-	old = of_node_get(ipar);
-	do {
-		tmp = of_get_property(old, "#address-cells", NULL);
-		tnode = of_get_parent(old);
-		of_node_put(old);
-		old = tnode;
-	} while (old && tmp == NULL);
-	of_node_put(old);
-	old = NULL;
-	addrsize = (tmp == NULL) ? 2 : *tmp;
-
-	pr_debug(" -> addrsize=%d\n", addrsize);
-
-	/* Now start the actual "proper" walk of the interrupt tree */
-	while (ipar != NULL) {
-		/* Now check if cursor is an interrupt-controller and if it is
-		 * then we are done
-		 */
-		if (of_get_property(ipar, "interrupt-controller", NULL) !=
-				NULL) {
-			pr_debug(" -> got it !\n");
-			memcpy(out_irq->specifier, intspec,
-				intsize * sizeof(u32));
-			out_irq->size = intsize;
-			out_irq->controller = ipar;
-			of_node_put(old);
-			return 0;
-		}
-
-		/* Now look for an interrupt-map */
-		imap = of_get_property(ipar, "interrupt-map", &imaplen);
-		/* No interrupt map, check for an interrupt parent */
-		if (imap == NULL) {
-			pr_debug(" -> no map, getting parent\n");
-			newpar = of_irq_find_parent(ipar);
-			goto skiplevel;
-		}
-		imaplen /= sizeof(u32);
-
-		/* Look for a mask */
-		imask = of_get_property(ipar, "interrupt-map-mask", NULL);
-
-		/* If we were passed no "reg" property and we attempt to parse
-		 * an interrupt-map, then #address-cells must be 0.
-		 * Fail if it's not.
-		 */
-		if (addr == NULL && addrsize != 0) {
-			pr_debug(" -> no reg passed in when needed !\n");
-			goto fail;
-		}
-
-		/* Parse interrupt-map */
-		match = 0;
-		while (imaplen > (addrsize + intsize + 1) && !match) {
-			/* Compare specifiers */
-			match = 1;
-			for (i = 0; i < addrsize && match; ++i) {
-				u32 mask = imask ? imask[i] : 0xffffffffu;
-				match = ((addr[i] ^ imap[i]) & mask) == 0;
-			}
-			for (; i < (addrsize + intsize) && match; ++i) {
-				u32 mask = imask ? imask[i] : 0xffffffffu;
-				match =
-					((intspec[i-addrsize] ^ imap[i])
-						& mask) == 0;
-			}
-			imap += addrsize + intsize;
-			imaplen -= addrsize + intsize;
-
-			pr_debug(" -> match=%d (imaplen=%d)\n", match, imaplen);
-
-			/* Get the interrupt parent */
-			if (of_irq_workarounds & OF_IMAP_NO_PHANDLE)
-				newpar = of_node_get(of_irq_dflt_pic);
-			else
-				newpar =
-					of_find_node_by_phandle((phandle)*imap);
-			imap++;
-			--imaplen;
-
-			/* Check if not found */
-			if (newpar == NULL) {
-				pr_debug(" -> imap parent not found !\n");
-				goto fail;
-			}
-
-			/* Get #interrupt-cells and #address-cells of new
-			 * parent
-			 */
-			tmp = of_get_property(newpar, "#interrupt-cells", NULL);
-			if (tmp == NULL) {
-				pr_debug(" -> parent lacks "
-						"#interrupt-cells!\n");
-				goto fail;
-			}
-			newintsize = *tmp;
-			tmp = of_get_property(newpar, "#address-cells", NULL);
-			newaddrsize = (tmp == NULL) ? 0 : *tmp;
-
-			pr_debug(" -> newintsize=%d, newaddrsize=%d\n",
-				newintsize, newaddrsize);
-
-			/* Check for malformed properties */
-			if (imaplen < (newaddrsize + newintsize))
-				goto fail;
-
-			imap += newaddrsize + newintsize;
-			imaplen -= newaddrsize + newintsize;
-
-			pr_debug(" -> imaplen=%d\n", imaplen);
-		}
-		if (!match)
-			goto fail;
-
-		of_node_put(old);
-		old = of_node_get(newpar);
-		addrsize = newaddrsize;
-		intsize = newintsize;
-		intspec = imap - intsize;
-		addr = intspec - addrsize;
-
-skiplevel:
-		/* Iterate again with new parent */
-		pr_debug(" -> new parent: %s\n",
-				newpar ? newpar->full_name : "<>");
-		of_node_put(ipar);
-		ipar = newpar;
-		newpar = NULL;
-	}
-fail:
-	of_node_put(ipar);
-	of_node_put(old);
-	of_node_put(newpar);
-
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(of_irq_map_raw);
-
-int of_irq_map_one(struct device_node *device,
-			int index, struct of_irq *out_irq)
-{
-	struct device_node *p;
-	const u32 *intspec, *tmp, *addr;
-	u32 intsize, intlen;
-	int res;
-
-	pr_debug("of_irq_map_one: dev=%s, index=%d\n",
-			device->full_name, index);
-
-	/* Get the interrupts property */
-	intspec = of_get_property(device, "interrupts", (int *) &intlen);
-	if (intspec == NULL)
-		return -EINVAL;
-	intlen /= sizeof(u32);
-
-	pr_debug(" intspec=%d intlen=%d\n", *intspec, intlen);
-
-	/* Get the reg property (if any) */
-	addr = of_get_property(device, "reg", NULL);
-
-	/* Look for the interrupt parent. */
-	p = of_irq_find_parent(device);
-	if (p == NULL)
-		return -EINVAL;
-
-	/* Get size of interrupt specifier */
-	tmp = of_get_property(p, "#interrupt-cells", NULL);
-	if (tmp == NULL) {
-		of_node_put(p);
-		return -EINVAL;
-	}
-	intsize = *tmp;
-
-	pr_debug(" intsize=%d intlen=%d\n", intsize, intlen);
-
-	/* Check index */
-	if ((index + 1) * intsize > intlen)
-		return -EINVAL;
-
-	/* Get new specifier and map it */
-	res = of_irq_map_raw(p, intspec + index * intsize, intsize,
-				addr, out_irq);
-	of_node_put(p);
-	return res;
-}
-EXPORT_SYMBOL_GPL(of_irq_map_one);
-
 /**
  * Search the device tree for the best MAC address to use.  'mac-address' is
  * checked first, because that is supposed to contain to "most recent" MAC
@@ -943,35 +682,6 @@ const void *of_get_mac_address(struct device_node *np)
 }
 EXPORT_SYMBOL(of_get_mac_address);
 
-int of_irq_to_resource(struct device_node *dev, int index, struct resource *r)
-{
-	struct of_irq out_irq;
-	int irq;
-	int res;
-
-	res = of_irq_map_one(dev, index, &out_irq);
-
-	/* Get irq for the device */
-	if (res) {
-		pr_debug("IRQ not found... code = %d", res);
-		return NO_IRQ;
-	}
-	/* Assuming single interrupt controller... */
-	irq = out_irq.specifier[0];
-
-	pr_debug("IRQ found = %d", irq);
-
-	/* Only dereference the resource if both the
-	 * resource and the irq are valid. */
-	if (r && irq != NO_IRQ) {
-		r->start = r->end = irq;
-		r->flags = IORESOURCE_IRQ;
-	}
-
-	return irq;
-}
-EXPORT_SYMBOL_GPL(of_irq_to_resource);
-
 void __iomem *of_iomap(struct device_node *np, int index)
 {
 	struct resource res;
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 4486765db6e7..10d5ee556702 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -105,50 +105,6 @@ struct device_node *of_find_next_cache_node(struct device_node *np);
 /* Get the MAC address */
 extern const void *of_get_mac_address(struct device_node *np);
 
-/*
- * OF interrupt mapping
- */
-
-#define OF_IMAP_OLDWORLD_MAC	0x00000001
-#define OF_IMAP_NO_PHANDLE	0x00000002
-
-#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
-/* Workarounds only needed for 32bit powermac machines */
-extern unsigned int of_irq_workarounds;
-extern struct device_node *of_irq_dflt_pic;
-extern int of_irq_map_oldworld(struct device_node *device, int index,
-			       struct of_irq *out_irq);
-#else
-#define of_irq_workarounds (0)
-#define of_irq_dflt_pic (NULL)
-static inline int of_irq_map_oldworld(struct device_node *device, int index,
-				      struct of_irq *out_irq)
-{
-	return -EINVAL;
-}
-#endif
-
-/**
- * of_irq_map_raw - Low level interrupt tree parsing
- * @parent:	the device interrupt parent
- * @intspec:	interrupt specifier ("interrupts" property of the device)
- * @ointsize:   size of the passed in interrupt specifier
- * @addr:	address specifier (start of "reg" property of the device)
- * @out_irq:	structure of_irq filled by this function
- *
- * Returns 0 on success and a negative number on error
- *
- * This function is a low-level interrupt tree walking function. It
- * can be used to do a partial walk with synthetized reg and interrupts
- * properties, for example when resolving PCI interrupts when no device
- * node exist for the parent.
- *
- */
-
-extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec,
-			  u32 ointsize, const u32 *addr,
-			  struct of_irq *out_irq);
-
 /**
  * of_irq_map_pci - Resolve the interrupt for a PCI device
  * @pdev:	the device whose interrupt is to be resolved
@@ -163,9 +119,6 @@ extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec,
 struct pci_dev;
 extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
 
-extern int of_irq_to_resource(struct device_node *dev, int index,
-			struct resource *r);
-
 /**
  * of_iomap - Maps the memory mapped IO for a given device_node
  * @device:	the device whose io range will be mapped
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index dfa6de6572bb..d61a5c5fe690 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -678,257 +678,6 @@ void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
 	*size = of_read_number(dma_window, cells);
 }
 
-/*
- * Interrupt remapper
- */
-
-static struct device_node *of_irq_find_parent(struct device_node *child)
-{
-	struct device_node *p;
-	const phandle *parp;
-
-	if (!of_node_get(child))
-		return NULL;
-
-	do {
-		parp = of_get_property(child, "interrupt-parent", NULL);
-		if (parp == NULL)
-			p = of_get_parent(child);
-		else {
-			if (of_irq_workarounds & OF_IMAP_NO_PHANDLE)
-				p = of_node_get(of_irq_dflt_pic);
-			else
-				p = of_find_node_by_phandle(*parp);
-		}
-		of_node_put(child);
-		child = p;
-	} while (p && of_get_property(p, "#interrupt-cells", NULL) == NULL);
-
-	return p;
-}
-
-int of_irq_map_raw(struct device_node *parent, const u32 *intspec, u32 ointsize,
-		const u32 *addr, struct of_irq *out_irq)
-{
-	struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL;
-	const u32 *tmp, *imap, *imask;
-	u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0;
-	int imaplen, match, i;
-
-	DBG("of_irq_map_raw: par=%s,intspec=[0x%08x 0x%08x...],ointsize=%d\n",
-	    parent->full_name, intspec[0], intspec[1], ointsize);
-
-	ipar = of_node_get(parent);
-
-	/* First get the #interrupt-cells property of the current cursor
-	 * that tells us how to interpret the passed-in intspec. If there
-	 * is none, we are nice and just walk up the tree
-	 */
-	do {
-		tmp = of_get_property(ipar, "#interrupt-cells", NULL);
-		if (tmp != NULL) {
-			intsize = *tmp;
-			break;
-		}
-		tnode = ipar;
-		ipar = of_irq_find_parent(ipar);
-		of_node_put(tnode);
-	} while (ipar);
-	if (ipar == NULL) {
-		DBG(" -> no parent found !\n");
-		goto fail;
-	}
-
-	DBG("of_irq_map_raw: ipar=%s, size=%d\n", ipar->full_name, intsize);
-
-	if (ointsize != intsize)
-		return -EINVAL;
-
-	/* Look for this #address-cells. We have to implement the old linux
-	 * trick of looking for the parent here as some device-trees rely on it
-	 */
-	old = of_node_get(ipar);
-	do {
-		tmp = of_get_property(old, "#address-cells", NULL);
-		tnode = of_get_parent(old);
-		of_node_put(old);
-		old = tnode;
-	} while(old && tmp == NULL);
-	of_node_put(old);
-	old = NULL;
-	addrsize = (tmp == NULL) ? 2 : *tmp;
-
-	DBG(" -> addrsize=%d\n", addrsize);
-
-	/* Now start the actual "proper" walk of the interrupt tree */
-	while (ipar != NULL) {
-		/* Now check if cursor is an interrupt-controller and if it is
-		 * then we are done
-		 */
-		if (of_get_property(ipar, "interrupt-controller", NULL) !=
-				NULL) {
-			DBG(" -> got it !\n");
-			memcpy(out_irq->specifier, intspec,
-			       intsize * sizeof(u32));
-			out_irq->size = intsize;
-			out_irq->controller = ipar;
-			of_node_put(old);
-			return 0;
-		}
-
-		/* Now look for an interrupt-map */
-		imap = of_get_property(ipar, "interrupt-map", &imaplen);
-		/* No interrupt map, check for an interrupt parent */
-		if (imap == NULL) {
-			DBG(" -> no map, getting parent\n");
-			newpar = of_irq_find_parent(ipar);
-			goto skiplevel;
-		}
-		imaplen /= sizeof(u32);
-
-		/* Look for a mask */
-		imask = of_get_property(ipar, "interrupt-map-mask", NULL);
-
-		/* If we were passed no "reg" property and we attempt to parse
-		 * an interrupt-map, then #address-cells must be 0.
-		 * Fail if it's not.
-		 */
-		if (addr == NULL && addrsize != 0) {
-			DBG(" -> no reg passed in when needed !\n");
-			goto fail;
-		}
-
-		/* Parse interrupt-map */
-		match = 0;
-		while (imaplen > (addrsize + intsize + 1) && !match) {
-			/* Compare specifiers */
-			match = 1;
-			for (i = 0; i < addrsize && match; ++i) {
-				u32 mask = imask ? imask[i] : 0xffffffffu;
-				match = ((addr[i] ^ imap[i]) & mask) == 0;
-			}
-			for (; i < (addrsize + intsize) && match; ++i) {
-				u32 mask = imask ? imask[i] : 0xffffffffu;
-				match =
-				   ((intspec[i-addrsize] ^ imap[i]) & mask) == 0;
-			}
-			imap += addrsize + intsize;
-			imaplen -= addrsize + intsize;
-
-			DBG(" -> match=%d (imaplen=%d)\n", match, imaplen);
-
-			/* Get the interrupt parent */
-			if (of_irq_workarounds & OF_IMAP_NO_PHANDLE)
-				newpar = of_node_get(of_irq_dflt_pic);
-			else
-				newpar = of_find_node_by_phandle((phandle)*imap);
-			imap++;
-			--imaplen;
-
-			/* Check if not found */
-			if (newpar == NULL) {
-				DBG(" -> imap parent not found !\n");
-				goto fail;
-			}
-
-			/* Get #interrupt-cells and #address-cells of new
-			 * parent
-			 */
-			tmp = of_get_property(newpar, "#interrupt-cells", NULL);
-			if (tmp == NULL) {
-				DBG(" -> parent lacks #interrupt-cells !\n");
-				goto fail;
-			}
-			newintsize = *tmp;
-			tmp = of_get_property(newpar, "#address-cells", NULL);
-			newaddrsize = (tmp == NULL) ? 0 : *tmp;
-
-			DBG(" -> newintsize=%d, newaddrsize=%d\n",
-			    newintsize, newaddrsize);
-
-			/* Check for malformed properties */
-			if (imaplen < (newaddrsize + newintsize))
-				goto fail;
-
-			imap += newaddrsize + newintsize;
-			imaplen -= newaddrsize + newintsize;
-
-			DBG(" -> imaplen=%d\n", imaplen);
-		}
-		if (!match)
-			goto fail;
-
-		of_node_put(old);
-		old = of_node_get(newpar);
-		addrsize = newaddrsize;
-		intsize = newintsize;
-		intspec = imap - intsize;
-		addr = intspec - addrsize;
-
-	skiplevel:
-		/* Iterate again with new parent */
-		DBG(" -> new parent: %s\n", newpar ? newpar->full_name : "<>");
-		of_node_put(ipar);
-		ipar = newpar;
-		newpar = NULL;
-	}
- fail:
-	of_node_put(ipar);
-	of_node_put(old);
-	of_node_put(newpar);
-
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(of_irq_map_raw);
-
-int of_irq_map_one(struct device_node *device, int index, struct of_irq *out_irq)
-{
-	struct device_node *p;
-	const u32 *intspec, *tmp, *addr;
-	u32 intsize, intlen;
-	int res = -EINVAL;
-
-	DBG("of_irq_map_one: dev=%s, index=%d\n", device->full_name, index);
-
-	/* OldWorld mac stuff is "special", handle out of line */
-	if (of_irq_workarounds & OF_IMAP_OLDWORLD_MAC)
-		return of_irq_map_oldworld(device, index, out_irq);
-
-	/* Get the interrupts property */
-	intspec = of_get_property(device, "interrupts", &intlen);
-	if (intspec == NULL)
-		return -EINVAL;
-	intlen /= sizeof(u32);
-
-	/* Get the reg property (if any) */
-	addr = of_get_property(device, "reg", NULL);
-
-	/* Look for the interrupt parent. */
-	p = of_irq_find_parent(device);
-	if (p == NULL)
-		return -EINVAL;
-
-	/* Get size of interrupt specifier */
-	tmp = of_get_property(p, "#interrupt-cells", NULL);
-	if (tmp == NULL)
-		goto out;
-	intsize = *tmp;
-
-	DBG(" intsize=%d intlen=%d\n", intsize, intlen);
-
-	/* Check index */
-	if ((index + 1) * intsize > intlen)
-		goto out;
-
-	/* Get new specifier and map it */
-	res = of_irq_map_raw(p, intspec + index * intsize, intsize,
-			     addr, out_irq);
-out:
-	of_node_put(p);
-	return res;
-}
-EXPORT_SYMBOL_GPL(of_irq_map_one);
-
 /**
  * Search the device tree for the best MAC address to use.  'mac-address' is
  * checked first, because that is supposed to contain to "most recent" MAC
@@ -967,21 +716,6 @@ const void *of_get_mac_address(struct device_node *np)
 }
 EXPORT_SYMBOL(of_get_mac_address);
 
-int of_irq_to_resource(struct device_node *dev, int index, struct resource *r)
-{
-	int irq = irq_of_parse_and_map(dev, index);
-
-	/* Only dereference the resource if both the
-	 * resource and the irq are valid. */
-	if (r && irq != NO_IRQ) {
-		r->start = r->end = irq;
-		r->flags = IORESOURCE_IRQ;
-	}
-
-	return irq;
-}
-EXPORT_SYMBOL_GPL(of_irq_to_resource);
-
 void __iomem *of_iomap(struct device_node *np, int index)
 {
 	struct resource res;
diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index 9b3397c27096..598454fbdd1e 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -43,3 +43,304 @@ unsigned int irq_of_parse_and_map(struct device_node *dev, int index)
 				     oirq.size);
 }
 EXPORT_SYMBOL_GPL(irq_of_parse_and_map);
+
+/**
+ * of_irq_find_parent - Given a device node, find its interrupt parent node
+ * @child: pointer to device node
+ *
+ * Returns a pointer to the interrupt parent node, or NULL if the interrupt
+ * parent could not be determined.
+ */
+static struct device_node *of_irq_find_parent(struct device_node *child)
+{
+	struct device_node *p;
+	const phandle *parp;
+
+	if (!of_node_get(child))
+		return NULL;
+
+	do {
+		parp = of_get_property(child, "interrupt-parent", NULL);
+		if (parp == NULL)
+			p = of_get_parent(child);
+		else {
+			if (of_irq_workarounds & OF_IMAP_NO_PHANDLE)
+				p = of_node_get(of_irq_dflt_pic);
+			else
+				p = of_find_node_by_phandle(*parp);
+		}
+		of_node_put(child);
+		child = p;
+	} while (p && of_get_property(p, "#interrupt-cells", NULL) == NULL);
+
+	return p;
+}
+
+/**
+ * of_irq_map_raw - Low level interrupt tree parsing
+ * @parent:	the device interrupt parent
+ * @intspec:	interrupt specifier ("interrupts" property of the device)
+ * @ointsize:   size of the passed in interrupt specifier
+ * @addr:	address specifier (start of "reg" property of the device)
+ * @out_irq:	structure of_irq filled by this function
+ *
+ * Returns 0 on success and a negative number on error
+ *
+ * This function is a low-level interrupt tree walking function. It
+ * can be used to do a partial walk with synthetized reg and interrupts
+ * properties, for example when resolving PCI interrupts when no device
+ * node exist for the parent.
+ */
+int of_irq_map_raw(struct device_node *parent, const u32 *intspec, u32 ointsize,
+		const u32 *addr, struct of_irq *out_irq)
+{
+	struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL;
+	const u32 *tmp, *imap, *imask;
+	u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0;
+	int imaplen, match, i;
+
+	pr_debug("of_irq_map_raw: par=%s,intspec=[0x%08x 0x%08x...],ointsize=%d\n",
+	    parent->full_name, intspec[0], intspec[1], ointsize);
+
+	ipar = of_node_get(parent);
+
+	/* First get the #interrupt-cells property of the current cursor
+	 * that tells us how to interpret the passed-in intspec. If there
+	 * is none, we are nice and just walk up the tree
+	 */
+	do {
+		tmp = of_get_property(ipar, "#interrupt-cells", NULL);
+		if (tmp != NULL) {
+			intsize = *tmp;
+			break;
+		}
+		tnode = ipar;
+		ipar = of_irq_find_parent(ipar);
+		of_node_put(tnode);
+	} while (ipar);
+	if (ipar == NULL) {
+		pr_debug(" -> no parent found !\n");
+		goto fail;
+	}
+
+	pr_debug("of_irq_map_raw: ipar=%s, size=%d\n", ipar->full_name, intsize);
+
+	if (ointsize != intsize)
+		return -EINVAL;
+
+	/* Look for this #address-cells. We have to implement the old linux
+	 * trick of looking for the parent here as some device-trees rely on it
+	 */
+	old = of_node_get(ipar);
+	do {
+		tmp = of_get_property(old, "#address-cells", NULL);
+		tnode = of_get_parent(old);
+		of_node_put(old);
+		old = tnode;
+	} while (old && tmp == NULL);
+	of_node_put(old);
+	old = NULL;
+	addrsize = (tmp == NULL) ? 2 : *tmp;
+
+	pr_debug(" -> addrsize=%d\n", addrsize);
+
+	/* Now start the actual "proper" walk of the interrupt tree */
+	while (ipar != NULL) {
+		/* Now check if cursor is an interrupt-controller and if it is
+		 * then we are done
+		 */
+		if (of_get_property(ipar, "interrupt-controller", NULL) !=
+				NULL) {
+			pr_debug(" -> got it !\n");
+			memcpy(out_irq->specifier, intspec,
+			       intsize * sizeof(u32));
+			out_irq->size = intsize;
+			out_irq->controller = ipar;
+			of_node_put(old);
+			return 0;
+		}
+
+		/* Now look for an interrupt-map */
+		imap = of_get_property(ipar, "interrupt-map", &imaplen);
+		/* No interrupt map, check for an interrupt parent */
+		if (imap == NULL) {
+			pr_debug(" -> no map, getting parent\n");
+			newpar = of_irq_find_parent(ipar);
+			goto skiplevel;
+		}
+		imaplen /= sizeof(u32);
+
+		/* Look for a mask */
+		imask = of_get_property(ipar, "interrupt-map-mask", NULL);
+
+		/* If we were passed no "reg" property and we attempt to parse
+		 * an interrupt-map, then #address-cells must be 0.
+		 * Fail if it's not.
+		 */
+		if (addr == NULL && addrsize != 0) {
+			pr_debug(" -> no reg passed in when needed !\n");
+			goto fail;
+		}
+
+		/* Parse interrupt-map */
+		match = 0;
+		while (imaplen > (addrsize + intsize + 1) && !match) {
+			/* Compare specifiers */
+			match = 1;
+			for (i = 0; i < addrsize && match; ++i) {
+				u32 mask = imask ? imask[i] : 0xffffffffu;
+				match = ((addr[i] ^ imap[i]) & mask) == 0;
+			}
+			for (; i < (addrsize + intsize) && match; ++i) {
+				u32 mask = imask ? imask[i] : 0xffffffffu;
+				match =
+				   ((intspec[i-addrsize] ^ imap[i]) & mask) == 0;
+			}
+			imap += addrsize + intsize;
+			imaplen -= addrsize + intsize;
+
+			pr_debug(" -> match=%d (imaplen=%d)\n", match, imaplen);
+
+			/* Get the interrupt parent */
+			if (of_irq_workarounds & OF_IMAP_NO_PHANDLE)
+				newpar = of_node_get(of_irq_dflt_pic);
+			else
+				newpar = of_find_node_by_phandle((phandle)*imap);
+			imap++;
+			--imaplen;
+
+			/* Check if not found */
+			if (newpar == NULL) {
+				pr_debug(" -> imap parent not found !\n");
+				goto fail;
+			}
+
+			/* Get #interrupt-cells and #address-cells of new
+			 * parent
+			 */
+			tmp = of_get_property(newpar, "#interrupt-cells", NULL);
+			if (tmp == NULL) {
+				pr_debug(" -> parent lacks #interrupt-cells!\n");
+				goto fail;
+			}
+			newintsize = *tmp;
+			tmp = of_get_property(newpar, "#address-cells", NULL);
+			newaddrsize = (tmp == NULL) ? 0 : *tmp;
+
+			pr_debug(" -> newintsize=%d, newaddrsize=%d\n",
+			    newintsize, newaddrsize);
+
+			/* Check for malformed properties */
+			if (imaplen < (newaddrsize + newintsize))
+				goto fail;
+
+			imap += newaddrsize + newintsize;
+			imaplen -= newaddrsize + newintsize;
+
+			pr_debug(" -> imaplen=%d\n", imaplen);
+		}
+		if (!match)
+			goto fail;
+
+		of_node_put(old);
+		old = of_node_get(newpar);
+		addrsize = newaddrsize;
+		intsize = newintsize;
+		intspec = imap - intsize;
+		addr = intspec - addrsize;
+
+	skiplevel:
+		/* Iterate again with new parent */
+		pr_debug(" -> new parent: %s\n", newpar ? newpar->full_name : "<>");
+		of_node_put(ipar);
+		ipar = newpar;
+		newpar = NULL;
+	}
+ fail:
+	of_node_put(ipar);
+	of_node_put(old);
+	of_node_put(newpar);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(of_irq_map_raw);
+
+/**
+ * of_irq_map_one - Resolve an interrupt for a device
+ * @device: the device whose interrupt is to be resolved
+ * @index: index of the interrupt to resolve
+ * @out_irq: structure of_irq filled by this function
+ *
+ * This function resolves an interrupt, walking the tree, for a given
+ * device-tree node. It's the high level pendant to of_irq_map_raw().
+ */
+int of_irq_map_one(struct device_node *device, int index, struct of_irq *out_irq)
+{
+	struct device_node *p;
+	const u32 *intspec, *tmp, *addr;
+	u32 intsize, intlen;
+	int res = -EINVAL;
+
+	pr_debug("of_irq_map_one: dev=%s, index=%d\n", device->full_name, index);
+
+	/* OldWorld mac stuff is "special", handle out of line */
+	if (of_irq_workarounds & OF_IMAP_OLDWORLD_MAC)
+		return of_irq_map_oldworld(device, index, out_irq);
+
+	/* Get the interrupts property */
+	intspec = of_get_property(device, "interrupts", &intlen);
+	if (intspec == NULL)
+		return -EINVAL;
+	intlen /= sizeof(u32);
+
+	pr_debug(" intspec=%d intlen=%d\n", *intspec, intlen);
+
+	/* Get the reg property (if any) */
+	addr = of_get_property(device, "reg", NULL);
+
+	/* Look for the interrupt parent. */
+	p = of_irq_find_parent(device);
+	if (p == NULL)
+		return -EINVAL;
+
+	/* Get size of interrupt specifier */
+	tmp = of_get_property(p, "#interrupt-cells", NULL);
+	if (tmp == NULL)
+		goto out;
+	intsize = *tmp;
+
+	pr_debug(" intsize=%d intlen=%d\n", intsize, intlen);
+
+	/* Check index */
+	if ((index + 1) * intsize > intlen)
+		goto out;
+
+	/* Get new specifier and map it */
+	res = of_irq_map_raw(p, intspec + index * intsize, intsize,
+			     addr, out_irq);
+ out:
+	of_node_put(p);
+	return res;
+}
+EXPORT_SYMBOL_GPL(of_irq_map_one);
+
+/**
+ * of_irq_to_resource - Decode a node's IRQ and return it as a resource
+ * @dev: pointer to device tree node
+ * @index: zero-based index of the irq
+ * @r: pointer to resource structure to return result into.
+ */
+int of_irq_to_resource(struct device_node *dev, int index, struct resource *r)
+{
+	int irq = irq_of_parse_and_map(dev, index);
+
+	/* Only dereference the resource if both the
+	 * resource and the irq are valid. */
+	if (r && irq != NO_IRQ) {
+		r->start = r->end = irq;
+		r->flags = IORESOURCE_IRQ;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(of_irq_to_resource);
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index 0e37c05b7dd8..5929781c104d 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -4,6 +4,8 @@
 #if defined(CONFIG_OF)
 struct of_irq;
 #include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
 #include <linux/of.h>
 
 /*
@@ -30,11 +32,38 @@ struct of_irq {
 	u32 specifier[OF_MAX_IRQ_SPEC]; /* Specifier copy */
 };
 
+/*
+ * Workarounds only applied to 32bit powermac machines
+ */
+#define OF_IMAP_OLDWORLD_MAC	0x00000001
+#define OF_IMAP_NO_PHANDLE	0x00000002
+
+#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
+extern unsigned int of_irq_workarounds;
+extern struct device_node *of_irq_dflt_pic;
+extern int of_irq_map_oldworld(struct device_node *device, int index,
+			       struct of_irq *out_irq);
+#else /* CONFIG_PPC32 && CONFIG_PPC_PMAC */
+#define of_irq_workarounds (0)
+#define of_irq_dflt_pic (NULL)
+static inline int of_irq_map_oldworld(struct device_node *device, int index,
+				      struct of_irq *out_irq)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_PPC32 && CONFIG_PPC_PMAC */
+
+
+extern int of_irq_map_raw(struct device_node *parent, const u32 *intspec,
+			  u32 ointsize, const u32 *addr,
+			  struct of_irq *out_irq);
 extern int of_irq_map_one(struct device_node *device, int index,
 			  struct of_irq *out_irq);
 extern unsigned int irq_create_of_mapping(struct device_node *controller,
 					  const u32 *intspec,
 					  unsigned int intsize);
+extern int of_irq_to_resource(struct device_node *dev, int index,
+			      struct resource *r);
 
 #endif /* CONFIG_OF_IRQ */
 #endif /* CONFIG_OF */
-- 
cgit v1.2.3-59-g8ed1b


From 6b884a8d50a6eea2fb3dad7befe748f67193073b Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:09 -0600
Subject: of/address: merge of_iomap()

Merge common code between Microblaze and PowerPC.  This patch creates
new of_address.h and address.c files to containing address translation
and mapping routines.  First routine to be moved it of_iomap()

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Michal Simek <monstr@monstr.eu>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h  | 10 +---------
 arch/microblaze/kernel/prom_parse.c | 11 -----------
 arch/powerpc/include/asm/prom.h     | 10 +---------
 arch/powerpc/kernel/prom_parse.c    | 11 -----------
 drivers/of/Kconfig                  |  4 ++++
 drivers/of/Makefile                 |  1 +
 drivers/of/address.c                | 22 ++++++++++++++++++++++
 include/linux/of_address.h          |  9 +++++++++
 8 files changed, 38 insertions(+), 40 deletions(-)
 create mode 100644 drivers/of/address.c
 create mode 100644 include/linux/of_address.h

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 5fbdfe76fe76..6411c3b3a80f 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -20,6 +20,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_fdt.h>
 #include <linux/proc_fs.h>
@@ -103,15 +104,6 @@ extern const void *of_get_mac_address(struct device_node *np);
 struct pci_dev;
 extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
 
-/**
- * of_iomap - Maps the memory mapped IO for a given device_node
- * @device:	the device whose io range will be mapped
- * @index:	index of the io range
- *
- * Returns a pointer to the mapped memory
- */
-extern void __iomem *of_iomap(struct device_node *device, int index);
-
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_MICROBLAZE_PROM_H */
diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c
index e28968fa34c1..1159ba52ad4a 100644
--- a/arch/microblaze/kernel/prom_parse.c
+++ b/arch/microblaze/kernel/prom_parse.c
@@ -681,14 +681,3 @@ const void *of_get_mac_address(struct device_node *np)
 	return NULL;
 }
 EXPORT_SYMBOL(of_get_mac_address);
-
-void __iomem *of_iomap(struct device_node *np, int index)
-{
-	struct resource res;
-
-	if (of_address_to_resource(np, index, &res))
-		return NULL;
-
-	return ioremap(res.start, 1 + res.end - res.start);
-}
-EXPORT_SYMBOL(of_iomap);
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 10d5ee556702..0abe379c6f33 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -18,6 +18,7 @@
  */
 #include <linux/types.h>
 #include <linux/of_fdt.h>
+#include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/proc_fs.h>
 #include <linux/platform_device.h>
@@ -119,14 +120,5 @@ extern const void *of_get_mac_address(struct device_node *np);
 struct pci_dev;
 extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
 
-/**
- * of_iomap - Maps the memory mapped IO for a given device_node
- * @device:	the device whose io range will be mapped
- * @index:	index of the io range
- *
- * Returns a pointer to the mapped memory
- */
-extern void __iomem *of_iomap(struct device_node *device, int index);
-
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_PROM_H */
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index d61a5c5fe690..1d5d4f6dfef5 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -715,14 +715,3 @@ const void *of_get_mac_address(struct device_node *np)
 	return NULL;
 }
 EXPORT_SYMBOL(of_get_mac_address);
-
-void __iomem *of_iomap(struct device_node *np, int index)
-{
-	struct resource res;
-
-	if (of_address_to_resource(np, index, &res))
-		return NULL;
-
-	return ioremap(res.start, 1 + res.end - res.start);
-}
-EXPORT_SYMBOL(of_iomap);
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index b87495efa16e..097f42aebe90 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -6,6 +6,10 @@ config OF_DYNAMIC
 	def_bool y
 	depends on OF && PPC_OF
 
+config OF_ADDRESS
+	def_bool y
+	depends on OF && !SPARC
+
 config OF_IRQ
 	def_bool y
 	depends on OF && !SPARC
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index 3631a5ea0b47..0052c405463a 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -1,5 +1,6 @@
 obj-y = base.o
 obj-$(CONFIG_OF_FLATTREE) += fdt.o
+obj-$(CONFIG_OF_ADDRESS)  += address.o
 obj-$(CONFIG_OF_IRQ)    += irq.o
 obj-$(CONFIG_OF_DEVICE) += device.o platform.o
 obj-$(CONFIG_OF_GPIO)   += gpio.o
diff --git a/drivers/of/address.c b/drivers/of/address.c
new file mode 100644
index 000000000000..258528d6c4fe
--- /dev/null
+++ b/drivers/of/address.c
@@ -0,0 +1,22 @@
+
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/of_address.h>
+
+/**
+ * of_iomap - Maps the memory mapped IO for a given device_node
+ * @device:	the device whose io range will be mapped
+ * @index:	index of the io range
+ *
+ * Returns a pointer to the mapped memory
+ */
+void __iomem *of_iomap(struct device_node *np, int index)
+{
+	struct resource res;
+
+	if (of_address_to_resource(np, index, &res))
+		return NULL;
+
+	return ioremap(res.start, 1 + res.end - res.start);
+}
+EXPORT_SYMBOL(of_iomap);
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
new file mode 100644
index 000000000000..570831d7e795
--- /dev/null
+++ b/include/linux/of_address.h
@@ -0,0 +1,9 @@
+#ifndef __OF_ADDRESS_H
+#define __OF_ADDRESS_H
+#include <linux/ioport.h>
+#include <linux/of.h>
+
+extern void __iomem *of_iomap(struct device_node *device, int index);
+
+#endif /* __OF_ADDRESS_H */
+
-- 
cgit v1.2.3-59-g8ed1b


From 1f5bef30cf6c66f097ea5dfc580a41924df888d1 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:09 -0600
Subject: of/address: merge of_address_to_resource()

Merge common code between PowerPC and Microblaze.  This patch also
moves the prototype of pci_address_to_pio() out of pci-bridge.h and
into prom.h because the only user of pci_address_to_pio() is
of_address_to_resource().

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Michal Simek <monstr@monstr.eu>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/pci-bridge.h |  5 ----
 arch/microblaze/include/asm/prom.h       | 17 ++++++-----
 arch/microblaze/kernel/prom_parse.c      | 46 +---------------------------
 arch/powerpc/include/asm/pci-bridge.h    |  5 ----
 arch/powerpc/include/asm/prom.h          | 17 ++++++-----
 arch/powerpc/kernel/prom_parse.c         | 47 +----------------------------
 drivers/of/address.c                     | 51 ++++++++++++++++++++++++++++++++
 include/linux/of_address.h               |  5 ++++
 8 files changed, 76 insertions(+), 117 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h
index 0c77cda9f5d8..0c68764ab547 100644
--- a/arch/microblaze/include/asm/pci-bridge.h
+++ b/arch/microblaze/include/asm/pci-bridge.h
@@ -172,13 +172,8 @@ static inline int pci_has_flag(int flag)
 
 extern struct list_head hose_list;
 
-extern unsigned long pci_address_to_pio(phys_addr_t address);
 extern int pcibios_vaddr_is_ioport(void __iomem *address);
 #else
-static inline unsigned long pci_address_to_pio(phys_addr_t address)
-{
-	return (unsigned long)-1;
-}
 static inline int pcibios_vaddr_is_ioport(void __iomem *address)
 {
 	return 0;
diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 6411c3b3a80f..4e94c0706c50 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -65,17 +65,18 @@ extern const u32 *of_get_address(struct device_node *dev, int index,
 extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no,
 			u64 *size, unsigned int *flags);
 
-/* Get an address as a resource. Note that if your address is
- * a PIO address, the conversion will fail if the physical address
- * can't be internally converted to an IO token with
- * pci_address_to_pio(), that is because it's either called to early
- * or it can't be matched to any host bridge IO space
- */
-extern int of_address_to_resource(struct device_node *dev, int index,
-				struct resource *r);
 extern int of_pci_address_to_resource(struct device_node *dev, int bar,
 				struct resource *r);
 
+#ifdef CONFIG_PCI
+extern unsigned long pci_address_to_pio(phys_addr_t address);
+#else
+static inline unsigned long pci_address_to_pio(phys_addr_t address)
+{
+	return (unsigned long)-1;
+}
+#endif	/* CONFIG_PCI */
+
 /* Parse the ibm,dma-window property of an OF node into the busno, phys and
  * size parameters.
  */
diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c
index 1159ba52ad4a..2f9cdd26ca12 100644
--- a/arch/microblaze/kernel/prom_parse.c
+++ b/arch/microblaze/kernel/prom_parse.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/etherdevice.h>
+#include <linux/of_address.h>
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
 
@@ -17,9 +18,6 @@
 			(ns) > 0)
 
 static struct of_bus *of_match_bus(struct device_node *np);
-static int __of_address_to_resource(struct device_node *dev,
-		const u32 *addrp, u64 size, unsigned int flags,
-		struct resource *r);
 
 /* Debug utility */
 #ifdef DEBUG
@@ -576,48 +574,6 @@ const u32 *of_get_address(struct device_node *dev, int index, u64 *size,
 }
 EXPORT_SYMBOL(of_get_address);
 
-static int __of_address_to_resource(struct device_node *dev, const u32 *addrp,
-				u64 size, unsigned int flags,
-				struct resource *r)
-{
-	u64 taddr;
-
-	if ((flags & (IORESOURCE_IO | IORESOURCE_MEM)) == 0)
-		return -EINVAL;
-	taddr = of_translate_address(dev, addrp);
-	if (taddr == OF_BAD_ADDR)
-		return -EINVAL;
-	memset(r, 0, sizeof(struct resource));
-	if (flags & IORESOURCE_IO) {
-		unsigned long port;
-		port = -1; /* pci_address_to_pio(taddr); */
-		if (port == (unsigned long)-1)
-			return -EINVAL;
-		r->start = port;
-		r->end = port + size - 1;
-	} else {
-		r->start = taddr;
-		r->end = taddr + size - 1;
-	}
-	r->flags = flags;
-	r->name = dev->name;
-	return 0;
-}
-
-int of_address_to_resource(struct device_node *dev, int index,
-			struct resource *r)
-{
-	const u32	*addrp;
-	u64		size;
-	unsigned int	flags;
-
-	addrp = of_get_address(dev, index, &size, &flags);
-	if (addrp == NULL)
-		return -EINVAL;
-	return __of_address_to_resource(dev, addrp, size, flags, r);
-}
-EXPORT_SYMBOL_GPL(of_address_to_resource);
-
 void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
 		unsigned long *busno, unsigned long *phys, unsigned long *size)
 {
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 76e1f313a58e..51e9e6f90d12 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -303,13 +303,8 @@ extern void pcibios_free_controller(struct pci_controller *phb);
 extern void pcibios_setup_phb_resources(struct pci_controller *hose);
 
 #ifdef CONFIG_PCI
-extern unsigned long pci_address_to_pio(phys_addr_t address);
 extern int pcibios_vaddr_is_ioport(void __iomem *address);
 #else
-static inline unsigned long pci_address_to_pio(phys_addr_t address)
-{
-	return (unsigned long)-1;
-}
 static inline int pcibios_vaddr_is_ioport(void __iomem *address)
 {
 	return 0;
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 0abe379c6f33..ceace966c51c 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -70,14 +70,6 @@ static inline const u32 *of_get_pci_address(struct device_node *dev,
 }
 #endif /* CONFIG_PCI */
 
-/* Get an address as a resource. Note that if your address is
- * a PIO address, the conversion will fail if the physical address
- * can't be internally converted to an IO token with
- * pci_address_to_pio(), that is because it's either called to early
- * or it can't be matched to any host bridge IO space
- */
-extern int of_address_to_resource(struct device_node *dev, int index,
-				  struct resource *r);
 #ifdef CONFIG_PCI
 extern int of_pci_address_to_resource(struct device_node *dev, int bar,
 				      struct resource *r);
@@ -89,6 +81,15 @@ static inline int of_pci_address_to_resource(struct device_node *dev, int bar,
 }
 #endif /* CONFIG_PCI */
 
+#ifdef CONFIG_PCI
+extern unsigned long pci_address_to_pio(phys_addr_t address);
+#else
+static inline unsigned long pci_address_to_pio(phys_addr_t address)
+{
+	return (unsigned long)-1;
+}
+#endif	/* CONFIG_PCI */
+
 /* Parse the ibm,dma-window property of an OF node into the busno, phys and
  * size parameters.
  */
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index 1d5d4f6dfef5..1dac535de78f 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/etherdevice.h>
+#include <linux/of_address.h>
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
 
@@ -27,10 +28,6 @@
 			(ns) > 0)
 
 static struct of_bus *of_match_bus(struct device_node *np);
-static int __of_address_to_resource(struct device_node *dev,
-		const u32 *addrp, u64 size, unsigned int flags,
-		struct resource *r);
-
 
 /* Debug utility */
 #ifdef DEBUG
@@ -610,48 +607,6 @@ const u32 *of_get_address(struct device_node *dev, int index, u64 *size,
 }
 EXPORT_SYMBOL(of_get_address);
 
-static int __of_address_to_resource(struct device_node *dev, const u32 *addrp,
-				    u64 size, unsigned int flags,
-				    struct resource *r)
-{
-	u64 taddr;
-
-	if ((flags & (IORESOURCE_IO | IORESOURCE_MEM)) == 0)
-		return -EINVAL;
-	taddr = of_translate_address(dev, addrp);
-	if (taddr == OF_BAD_ADDR)
-		return -EINVAL;
-	memset(r, 0, sizeof(struct resource));
-	if (flags & IORESOURCE_IO) {
-		unsigned long port;
-		port = pci_address_to_pio(taddr);
-		if (port == (unsigned long)-1)
-			return -EINVAL;
-		r->start = port;
-		r->end = port + size - 1;
-	} else {
-		r->start = taddr;
-		r->end = taddr + size - 1;
-	}
-	r->flags = flags;
-	r->name = dev->name;
-	return 0;
-}
-
-int of_address_to_resource(struct device_node *dev, int index,
-			   struct resource *r)
-{
-	const u32	*addrp;
-	u64		size;
-	unsigned int	flags;
-
-	addrp = of_get_address(dev, index, &size, &flags);
-	if (addrp == NULL)
-		return -EINVAL;
-	return __of_address_to_resource(dev, addrp, size, flags, r);
-}
-EXPORT_SYMBOL_GPL(of_address_to_resource);
-
 void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
 		unsigned long *busno, unsigned long *phys, unsigned long *size)
 {
diff --git a/drivers/of/address.c b/drivers/of/address.c
index 258528d6c4fe..c3819550f907 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -3,6 +3,57 @@
 #include <linux/ioport.h>
 #include <linux/of_address.h>
 
+int __of_address_to_resource(struct device_node *dev, const u32 *addrp,
+			     u64 size, unsigned int flags,
+			     struct resource *r)
+{
+	u64 taddr;
+
+	if ((flags & (IORESOURCE_IO | IORESOURCE_MEM)) == 0)
+		return -EINVAL;
+	taddr = of_translate_address(dev, addrp);
+	if (taddr == OF_BAD_ADDR)
+		return -EINVAL;
+	memset(r, 0, sizeof(struct resource));
+	if (flags & IORESOURCE_IO) {
+		unsigned long port;
+		port = pci_address_to_pio(taddr);
+		if (port == (unsigned long)-1)
+			return -EINVAL;
+		r->start = port;
+		r->end = port + size - 1;
+	} else {
+		r->start = taddr;
+		r->end = taddr + size - 1;
+	}
+	r->flags = flags;
+	r->name = dev->name;
+	return 0;
+}
+
+/**
+ * of_address_to_resource - Translate device tree address and return as resource
+ *
+ * Note that if your address is a PIO address, the conversion will fail if
+ * the physical address can't be internally converted to an IO token with
+ * pci_address_to_pio(), that is because it's either called to early or it
+ * can't be matched to any host bridge IO space
+ */
+int of_address_to_resource(struct device_node *dev, int index,
+			   struct resource *r)
+{
+	const u32	*addrp;
+	u64		size;
+	unsigned int	flags;
+
+	addrp = of_get_address(dev, index, &size, &flags);
+	if (addrp == NULL)
+		return -EINVAL;
+	return __of_address_to_resource(dev, addrp, size, flags, r);
+}
+EXPORT_SYMBOL_GPL(of_address_to_resource);
+
+
 /**
  * of_iomap - Maps the memory mapped IO for a given device_node
  * @device:	the device whose io range will be mapped
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 570831d7e795..474b794ed9d2 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -3,6 +3,11 @@
 #include <linux/ioport.h>
 #include <linux/of.h>
 
+extern int __of_address_to_resource(struct device_node *dev, const u32 *addrp,
+				    u64 size, unsigned int flags,
+				    struct resource *r);
+extern int of_address_to_resource(struct device_node *dev, int index,
+				  struct resource *r);
 extern void __iomem *of_iomap(struct device_node *device, int index);
 
 #endif /* __OF_ADDRESS_H */
-- 
cgit v1.2.3-59-g8ed1b


From dbbdee94734bf6f1db7af42008a53655e77cab8f Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:10 -0600
Subject: of/address: Merge all of the bus translation code

Microblaze and PowerPC share a large chunk of code for translating
OF device tree data into usable addresses.  Differences between the two
consist of cosmetic differences, and the addition of dma-ranges support
code to powerpc but not microblaze.  This patch moves the powerpc
version into common code and applies many of the cosmetic (non-functional)
changes from the microblaze version.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Michal Simek <monstr@monstr.eu>
CC: Wolfram Sang <w.sang@pengutronix.de>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h  |   4 -
 arch/microblaze/kernel/prom_parse.c | 489 ----------------------------------
 arch/powerpc/include/asm/prom.h     |   4 -
 arch/powerpc/kernel/prom_parse.c    | 515 -----------------------------------
 drivers/of/address.c                | 517 +++++++++++++++++++++++++++++++++++-
 include/linux/of_address.h          |   4 +-
 6 files changed, 515 insertions(+), 1018 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 4e94c0706c50..cb9c3dd9a23b 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -52,10 +52,6 @@ extern void pci_create_OF_bus_map(void);
  * OF address retreival & translation
  */
 
-/* Translate an OF address block into a CPU physical address
- */
-extern u64 of_translate_address(struct device_node *np, const u32 *addr);
-
 /* Extract an address from a device, returns the region size and
  * the address space flags too. The PCI version uses a BAR number
  * instead of an absolute index
diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c
index 2f9cdd26ca12..d33ba17601fa 100644
--- a/arch/microblaze/kernel/prom_parse.c
+++ b/arch/microblaze/kernel/prom_parse.c
@@ -10,213 +10,7 @@
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
 
-#define PRu64	"%llx"
-
-/* Max address size we deal with */
-#define OF_MAX_ADDR_CELLS	4
-#define OF_CHECK_COUNTS(na, ns)	((na) > 0 && (na) <= OF_MAX_ADDR_CELLS && \
-			(ns) > 0)
-
-static struct of_bus *of_match_bus(struct device_node *np);
-
-/* Debug utility */
-#ifdef DEBUG
-static void of_dump_addr(const char *s, const u32 *addr, int na)
-{
-	printk(KERN_INFO "%s", s);
-	while (na--)
-		printk(KERN_INFO " %08x", *(addr++));
-	printk(KERN_INFO "\n");
-}
-#else
-static void of_dump_addr(const char *s, const u32 *addr, int na) { }
-#endif
-
-/* Callbacks for bus specific translators */
-struct of_bus {
-	const char	*name;
-	const char	*addresses;
-	int		(*match)(struct device_node *parent);
-	void		(*count_cells)(struct device_node *child,
-					int *addrc, int *sizec);
-	u64		(*map)(u32 *addr, const u32 *range,
-				int na, int ns, int pna);
-	int		(*translate)(u32 *addr, u64 offset, int na);
-	unsigned int	(*get_flags)(const u32 *addr);
-};
-
-/*
- * Default translator (generic bus)
- */
-
-static void of_bus_default_count_cells(struct device_node *dev,
-					int *addrc, int *sizec)
-{
-	if (addrc)
-		*addrc = of_n_addr_cells(dev);
-	if (sizec)
-		*sizec = of_n_size_cells(dev);
-}
-
-static u64 of_bus_default_map(u32 *addr, const u32 *range,
-		int na, int ns, int pna)
-{
-	u64 cp, s, da;
-
-	cp = of_read_number(range, na);
-	s  = of_read_number(range + na + pna, ns);
-	da = of_read_number(addr, na);
-
-	pr_debug("OF: default map, cp="PRu64", s="PRu64", da="PRu64"\n",
-		cp, s, da);
-
-	if (da < cp || da >= (cp + s))
-		return OF_BAD_ADDR;
-	return da - cp;
-}
-
-static int of_bus_default_translate(u32 *addr, u64 offset, int na)
-{
-	u64 a = of_read_number(addr, na);
-	memset(addr, 0, na * 4);
-	a += offset;
-	if (na > 1)
-		addr[na - 2] = a >> 32;
-	addr[na - 1] = a & 0xffffffffu;
-
-	return 0;
-}
-
-static unsigned int of_bus_default_get_flags(const u32 *addr)
-{
-	return IORESOURCE_MEM;
-}
-
 #ifdef CONFIG_PCI
-/*
- * PCI bus specific translator
- */
-
-static int of_bus_pci_match(struct device_node *np)
-{
-	/* "vci" is for the /chaos bridge on 1st-gen PCI powermacs */
-	return !strcmp(np->type, "pci") || !strcmp(np->type, "vci");
-}
-
-static void of_bus_pci_count_cells(struct device_node *np,
-				int *addrc, int *sizec)
-{
-	if (addrc)
-		*addrc = 3;
-	if (sizec)
-		*sizec = 2;
-}
-
-static u64 of_bus_pci_map(u32 *addr, const u32 *range, int na, int ns, int pna)
-{
-	u64 cp, s, da;
-
-	/* Check address type match */
-	if ((addr[0] ^ range[0]) & 0x03000000)
-		return OF_BAD_ADDR;
-
-	/* Read address values, skipping high cell */
-	cp = of_read_number(range + 1, na - 1);
-	s  = of_read_number(range + na + pna, ns);
-	da = of_read_number(addr + 1, na - 1);
-
-	pr_debug("OF: PCI map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da);
-
-	if (da < cp || da >= (cp + s))
-		return OF_BAD_ADDR;
-	return da - cp;
-}
-
-static int of_bus_pci_translate(u32 *addr, u64 offset, int na)
-{
-	return of_bus_default_translate(addr + 1, offset, na - 1);
-}
-
-static unsigned int of_bus_pci_get_flags(const u32 *addr)
-{
-	unsigned int flags = 0;
-	u32 w = addr[0];
-
-	switch ((w >> 24) & 0x03) {
-	case 0x01:
-		flags |= IORESOURCE_IO;
-		break;
-	case 0x02: /* 32 bits */
-	case 0x03: /* 64 bits */
-		flags |= IORESOURCE_MEM;
-		break;
-	}
-	if (w & 0x40000000)
-		flags |= IORESOURCE_PREFETCH;
-	return flags;
-}
-
-const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size,
-			unsigned int *flags)
-{
-	const u32 *prop;
-	unsigned int psize;
-	struct device_node *parent;
-	struct of_bus *bus;
-	int onesize, i, na, ns;
-
-	/* Get parent & match bus type */
-	parent = of_get_parent(dev);
-	if (parent == NULL)
-		return NULL;
-	bus = of_match_bus(parent);
-	if (strcmp(bus->name, "pci")) {
-		of_node_put(parent);
-		return NULL;
-	}
-	bus->count_cells(dev, &na, &ns);
-	of_node_put(parent);
-	if (!OF_CHECK_COUNTS(na, ns))
-		return NULL;
-
-	/* Get "reg" or "assigned-addresses" property */
-	prop = of_get_property(dev, bus->addresses, &psize);
-	if (prop == NULL)
-		return NULL;
-	psize /= 4;
-
-	onesize = na + ns;
-	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++)
-		if ((prop[0] & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0)) {
-			if (size)
-				*size = of_read_number(prop + na, ns);
-			if (flags)
-				*flags = bus->get_flags(prop);
-			return prop;
-		}
-	return NULL;
-}
-EXPORT_SYMBOL(of_get_pci_address);
-
-int of_pci_address_to_resource(struct device_node *dev, int bar,
-				struct resource *r)
-{
-	const u32	*addrp;
-	u64		size;
-	unsigned int	flags;
-
-	addrp = of_get_pci_address(dev, bar, &size, &flags);
-	if (addrp == NULL)
-		return -EINVAL;
-	return __of_address_to_resource(dev, addrp, size, flags, r);
-}
-EXPORT_SYMBOL_GPL(of_pci_address_to_resource);
-
-static u8 of_irq_pci_swizzle(u8 slot, u8 pin)
-{
-	return (((pin - 1) + slot) % 4) + 1;
-}
-
 int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
 {
 	struct device_node *dn, *ppnode;
@@ -291,289 +85,6 @@ int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
 EXPORT_SYMBOL_GPL(of_irq_map_pci);
 #endif /* CONFIG_PCI */
 
-/*
- * ISA bus specific translator
- */
-
-static int of_bus_isa_match(struct device_node *np)
-{
-	return !strcmp(np->name, "isa");
-}
-
-static void of_bus_isa_count_cells(struct device_node *child,
-				int *addrc, int *sizec)
-{
-	if (addrc)
-		*addrc = 2;
-	if (sizec)
-		*sizec = 1;
-}
-
-static u64 of_bus_isa_map(u32 *addr, const u32 *range, int na, int ns, int pna)
-{
-	u64 cp, s, da;
-
-	/* Check address type match */
-	if ((addr[0] ^ range[0]) & 0x00000001)
-		return OF_BAD_ADDR;
-
-	/* Read address values, skipping high cell */
-	cp = of_read_number(range + 1, na - 1);
-	s  = of_read_number(range + na + pna, ns);
-	da = of_read_number(addr + 1, na - 1);
-
-	pr_debug("OF: ISA map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da);
-
-	if (da < cp || da >= (cp + s))
-		return OF_BAD_ADDR;
-	return da - cp;
-}
-
-static int of_bus_isa_translate(u32 *addr, u64 offset, int na)
-{
-	return of_bus_default_translate(addr + 1, offset, na - 1);
-}
-
-static unsigned int of_bus_isa_get_flags(const u32 *addr)
-{
-	unsigned int flags = 0;
-	u32 w = addr[0];
-
-	if (w & 1)
-		flags |= IORESOURCE_IO;
-	else
-		flags |= IORESOURCE_MEM;
-	return flags;
-}
-
-/*
- * Array of bus specific translators
- */
-
-static struct of_bus of_busses[] = {
-#ifdef CONFIG_PCI
-	/* PCI */
-	{
-		.name = "pci",
-		.addresses = "assigned-addresses",
-		.match = of_bus_pci_match,
-		.count_cells = of_bus_pci_count_cells,
-		.map = of_bus_pci_map,
-		.translate = of_bus_pci_translate,
-		.get_flags = of_bus_pci_get_flags,
-	},
-#endif /* CONFIG_PCI */
-	/* ISA */
-	{
-		.name = "isa",
-		.addresses = "reg",
-		.match = of_bus_isa_match,
-		.count_cells = of_bus_isa_count_cells,
-		.map = of_bus_isa_map,
-		.translate = of_bus_isa_translate,
-		.get_flags = of_bus_isa_get_flags,
-	},
-	/* Default */
-	{
-		.name = "default",
-		.addresses = "reg",
-		.match = NULL,
-		.count_cells = of_bus_default_count_cells,
-		.map = of_bus_default_map,
-		.translate = of_bus_default_translate,
-		.get_flags = of_bus_default_get_flags,
-	},
-};
-
-static struct of_bus *of_match_bus(struct device_node *np)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(of_busses); i++)
-		if (!of_busses[i].match || of_busses[i].match(np))
-			return &of_busses[i];
-	BUG();
-	return NULL;
-}
-
-static int of_translate_one(struct device_node *parent, struct of_bus *bus,
-			struct of_bus *pbus, u32 *addr,
-			int na, int ns, int pna)
-{
-	const u32 *ranges;
-	unsigned int rlen;
-	int rone;
-	u64 offset = OF_BAD_ADDR;
-
-	/* Normally, an absence of a "ranges" property means we are
-	 * crossing a non-translatable boundary, and thus the addresses
-	 * below the current not cannot be converted to CPU physical ones.
-	 * Unfortunately, while this is very clear in the spec, it's not
-	 * what Apple understood, and they do have things like /uni-n or
-	 * /ht nodes with no "ranges" property and a lot of perfectly
-	 * useable mapped devices below them. Thus we treat the absence of
-	 * "ranges" as equivalent to an empty "ranges" property which means
-	 * a 1:1 translation at that level. It's up to the caller not to try
-	 * to translate addresses that aren't supposed to be translated in
-	 * the first place. --BenH.
-	 */
-	ranges = of_get_property(parent, "ranges", (int *) &rlen);
-	if (ranges == NULL || rlen == 0) {
-		offset = of_read_number(addr, na);
-		memset(addr, 0, pna * 4);
-		pr_debug("OF: no ranges, 1:1 translation\n");
-		goto finish;
-	}
-
-	pr_debug("OF: walking ranges...\n");
-
-	/* Now walk through the ranges */
-	rlen /= 4;
-	rone = na + pna + ns;
-	for (; rlen >= rone; rlen -= rone, ranges += rone) {
-		offset = bus->map(addr, ranges, na, ns, pna);
-		if (offset != OF_BAD_ADDR)
-			break;
-	}
-	if (offset == OF_BAD_ADDR) {
-		pr_debug("OF: not found !\n");
-		return 1;
-	}
-	memcpy(addr, ranges + na, 4 * pna);
-
- finish:
-	of_dump_addr("OF: parent translation for:", addr, pna);
-	pr_debug("OF: with offset: "PRu64"\n", offset);
-
-	/* Translate it into parent bus space */
-	return pbus->translate(addr, offset, pna);
-}
-
-/*
- * Translate an address from the device-tree into a CPU physical address,
- * this walks up the tree and applies the various bus mappings on the
- * way.
- *
- * Note: We consider that crossing any level with #size-cells == 0 to mean
- * that translation is impossible (that is we are not dealing with a value
- * that can be mapped to a cpu physical address). This is not really specified
- * that way, but this is traditionally the way IBM at least do things
- */
-u64 of_translate_address(struct device_node *dev, const u32 *in_addr)
-{
-	struct device_node *parent = NULL;
-	struct of_bus *bus, *pbus;
-	u32 addr[OF_MAX_ADDR_CELLS];
-	int na, ns, pna, pns;
-	u64 result = OF_BAD_ADDR;
-
-	pr_debug("OF: ** translation for device %s **\n", dev->full_name);
-
-	/* Increase refcount at current level */
-	of_node_get(dev);
-
-	/* Get parent & match bus type */
-	parent = of_get_parent(dev);
-	if (parent == NULL)
-		goto bail;
-	bus = of_match_bus(parent);
-
-	/* Cound address cells & copy address locally */
-	bus->count_cells(dev, &na, &ns);
-	if (!OF_CHECK_COUNTS(na, ns)) {
-		printk(KERN_ERR "prom_parse: Bad cell count for %s\n",
-			dev->full_name);
-		goto bail;
-	}
-	memcpy(addr, in_addr, na * 4);
-
-	pr_debug("OF: bus is %s (na=%d, ns=%d) on %s\n",
-		bus->name, na, ns, parent->full_name);
-	of_dump_addr("OF: translating address:", addr, na);
-
-	/* Translate */
-	for (;;) {
-		/* Switch to parent bus */
-		of_node_put(dev);
-		dev = parent;
-		parent = of_get_parent(dev);
-
-		/* If root, we have finished */
-		if (parent == NULL) {
-			pr_debug("OF: reached root node\n");
-			result = of_read_number(addr, na);
-			break;
-		}
-
-		/* Get new parent bus and counts */
-		pbus = of_match_bus(parent);
-		pbus->count_cells(dev, &pna, &pns);
-		if (!OF_CHECK_COUNTS(pna, pns)) {
-			printk(KERN_ERR "prom_parse: Bad cell count for %s\n",
-				dev->full_name);
-			break;
-		}
-
-		pr_debug("OF: parent bus is %s (na=%d, ns=%d) on %s\n",
-			pbus->name, pna, pns, parent->full_name);
-
-		/* Apply bus translation */
-		if (of_translate_one(dev, bus, pbus, addr, na, ns, pna))
-			break;
-
-		/* Complete the move up one level */
-		na = pna;
-		ns = pns;
-		bus = pbus;
-
-		of_dump_addr("OF: one level translation:", addr, na);
-	}
- bail:
-	of_node_put(parent);
-	of_node_put(dev);
-
-	return result;
-}
-EXPORT_SYMBOL(of_translate_address);
-
-const u32 *of_get_address(struct device_node *dev, int index, u64 *size,
-			unsigned int *flags)
-{
-	const u32 *prop;
-	unsigned int psize;
-	struct device_node *parent;
-	struct of_bus *bus;
-	int onesize, i, na, ns;
-
-	/* Get parent & match bus type */
-	parent = of_get_parent(dev);
-	if (parent == NULL)
-		return NULL;
-	bus = of_match_bus(parent);
-	bus->count_cells(dev, &na, &ns);
-	of_node_put(parent);
-	if (!OF_CHECK_COUNTS(na, ns))
-		return NULL;
-
-	/* Get "reg" or "assigned-addresses" property */
-	prop = of_get_property(dev, bus->addresses, (int *) &psize);
-	if (prop == NULL)
-		return NULL;
-	psize /= 4;
-
-	onesize = na + ns;
-	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++)
-		if (i == index) {
-			if (size)
-				*size = of_read_number(prop + na, ns);
-			if (flags)
-				*flags = bus->get_flags(prop);
-			return prop;
-		}
-	return NULL;
-}
-EXPORT_SYMBOL(of_get_address);
-
 void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
 		unsigned long *busno, unsigned long *phys, unsigned long *size)
 {
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index ceace966c51c..f864722679e8 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -45,10 +45,6 @@ extern void pci_create_OF_bus_map(void);
  * OF address retreival & translation
  */
 
-/* Translate an OF address block into a CPU physical address
- */
-extern u64 of_translate_address(struct device_node *np, const u32 *addr);
-
 /* Translate a DMA address from device space to CPU space */
 extern u64 of_translate_dma_address(struct device_node *dev,
 				    const u32 *in_addr);
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index 1dac535de78f..88334af038e5 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -10,225 +10,7 @@
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
 
-#ifdef DEBUG
-#define DBG(fmt...) do { printk(fmt); } while(0)
-#else
-#define DBG(fmt...) do { } while(0)
-#endif
-
-#ifdef CONFIG_PPC64
-#define PRu64	"%lx"
-#else
-#define PRu64	"%llx"
-#endif
-
-/* Max address size we deal with */
-#define OF_MAX_ADDR_CELLS	4
-#define OF_CHECK_COUNTS(na, ns)	((na) > 0 && (na) <= OF_MAX_ADDR_CELLS && \
-			(ns) > 0)
-
-static struct of_bus *of_match_bus(struct device_node *np);
-
-/* Debug utility */
-#ifdef DEBUG
-static void of_dump_addr(const char *s, const u32 *addr, int na)
-{
-	printk("%s", s);
-	while(na--)
-		printk(" %08x", *(addr++));
-	printk("\n");
-}
-#else
-static void of_dump_addr(const char *s, const u32 *addr, int na) { }
-#endif
-
-
-/* Callbacks for bus specific translators */
-struct of_bus {
-	const char	*name;
-	const char	*addresses;
-	int		(*match)(struct device_node *parent);
-	void		(*count_cells)(struct device_node *child,
-				       int *addrc, int *sizec);
-	u64		(*map)(u32 *addr, const u32 *range,
-				int na, int ns, int pna);
-	int		(*translate)(u32 *addr, u64 offset, int na);
-	unsigned int	(*get_flags)(const u32 *addr);
-};
-
-
-/*
- * Default translator (generic bus)
- */
-
-static void of_bus_default_count_cells(struct device_node *dev,
-				       int *addrc, int *sizec)
-{
-	if (addrc)
-		*addrc = of_n_addr_cells(dev);
-	if (sizec)
-		*sizec = of_n_size_cells(dev);
-}
-
-static u64 of_bus_default_map(u32 *addr, const u32 *range,
-		int na, int ns, int pna)
-{
-	u64 cp, s, da;
-
-	cp = of_read_number(range, na);
-	s  = of_read_number(range + na + pna, ns);
-	da = of_read_number(addr, na);
-
-	DBG("OF: default map, cp="PRu64", s="PRu64", da="PRu64"\n",
-	    cp, s, da);
-
-	if (da < cp || da >= (cp + s))
-		return OF_BAD_ADDR;
-	return da - cp;
-}
-
-static int of_bus_default_translate(u32 *addr, u64 offset, int na)
-{
-	u64 a = of_read_number(addr, na);
-	memset(addr, 0, na * 4);
-	a += offset;
-	if (na > 1)
-		addr[na - 2] = a >> 32;
-	addr[na - 1] = a & 0xffffffffu;
-
-	return 0;
-}
-
-static unsigned int of_bus_default_get_flags(const u32 *addr)
-{
-	return IORESOURCE_MEM;
-}
-
-
 #ifdef CONFIG_PCI
-/*
- * PCI bus specific translator
- */
-
-static int of_bus_pci_match(struct device_node *np)
-{
-	/* "vci" is for the /chaos bridge on 1st-gen PCI powermacs */
-	return !strcmp(np->type, "pci") || !strcmp(np->type, "vci");
-}
-
-static void of_bus_pci_count_cells(struct device_node *np,
-				   int *addrc, int *sizec)
-{
-	if (addrc)
-		*addrc = 3;
-	if (sizec)
-		*sizec = 2;
-}
-
-static unsigned int of_bus_pci_get_flags(const u32 *addr)
-{
-	unsigned int flags = 0;
-	u32 w = addr[0];
-
-	switch((w >> 24) & 0x03) {
-	case 0x01:
-		flags |= IORESOURCE_IO;
-		break;
-	case 0x02: /* 32 bits */
-	case 0x03: /* 64 bits */
-		flags |= IORESOURCE_MEM;
-		break;
-	}
-	if (w & 0x40000000)
-		flags |= IORESOURCE_PREFETCH;
-	return flags;
-}
-
-static u64 of_bus_pci_map(u32 *addr, const u32 *range, int na, int ns, int pna)
-{
-	u64 cp, s, da;
-	unsigned int af, rf;
-
-	af = of_bus_pci_get_flags(addr);
-	rf = of_bus_pci_get_flags(range);
-
-	/* Check address type match */
-	if ((af ^ rf) & (IORESOURCE_MEM | IORESOURCE_IO))
-		return OF_BAD_ADDR;
-
-	/* Read address values, skipping high cell */
-	cp = of_read_number(range + 1, na - 1);
-	s  = of_read_number(range + na + pna, ns);
-	da = of_read_number(addr + 1, na - 1);
-
-	DBG("OF: PCI map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da);
-
-	if (da < cp || da >= (cp + s))
-		return OF_BAD_ADDR;
-	return da - cp;
-}
-
-static int of_bus_pci_translate(u32 *addr, u64 offset, int na)
-{
-	return of_bus_default_translate(addr + 1, offset, na - 1);
-}
-
-const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size,
-			unsigned int *flags)
-{
-	const u32 *prop;
-	unsigned int psize;
-	struct device_node *parent;
-	struct of_bus *bus;
-	int onesize, i, na, ns;
-
-	/* Get parent & match bus type */
-	parent = of_get_parent(dev);
-	if (parent == NULL)
-		return NULL;
-	bus = of_match_bus(parent);
-	if (strcmp(bus->name, "pci")) {
-		of_node_put(parent);
-		return NULL;
-	}
-	bus->count_cells(dev, &na, &ns);
-	of_node_put(parent);
-	if (!OF_CHECK_COUNTS(na, ns))
-		return NULL;
-
-	/* Get "reg" or "assigned-addresses" property */
-	prop = of_get_property(dev, bus->addresses, &psize);
-	if (prop == NULL)
-		return NULL;
-	psize /= 4;
-
-	onesize = na + ns;
-	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++)
-		if ((prop[0] & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0)) {
-			if (size)
-				*size = of_read_number(prop + na, ns);
-			if (flags)
-				*flags = bus->get_flags(prop);
-			return prop;
-		}
-	return NULL;
-}
-EXPORT_SYMBOL(of_get_pci_address);
-
-int of_pci_address_to_resource(struct device_node *dev, int bar,
-			       struct resource *r)
-{
-	const u32	*addrp;
-	u64		size;
-	unsigned int	flags;
-
-	addrp = of_get_pci_address(dev, bar, &size, &flags);
-	if (addrp == NULL)
-		return -EINVAL;
-	return __of_address_to_resource(dev, addrp, size, flags, r);
-}
-EXPORT_SYMBOL_GPL(of_pci_address_to_resource);
-
 int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
 {
 	struct device_node *dn, *ppnode;
@@ -310,303 +92,6 @@ int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
 EXPORT_SYMBOL_GPL(of_irq_map_pci);
 #endif /* CONFIG_PCI */
 
-/*
- * ISA bus specific translator
- */
-
-static int of_bus_isa_match(struct device_node *np)
-{
-	return !strcmp(np->name, "isa");
-}
-
-static void of_bus_isa_count_cells(struct device_node *child,
-				   int *addrc, int *sizec)
-{
-	if (addrc)
-		*addrc = 2;
-	if (sizec)
-		*sizec = 1;
-}
-
-static u64 of_bus_isa_map(u32 *addr, const u32 *range, int na, int ns, int pna)
-{
-	u64 cp, s, da;
-
-	/* Check address type match */
-	if ((addr[0] ^ range[0]) & 0x00000001)
-		return OF_BAD_ADDR;
-
-	/* Read address values, skipping high cell */
-	cp = of_read_number(range + 1, na - 1);
-	s  = of_read_number(range + na + pna, ns);
-	da = of_read_number(addr + 1, na - 1);
-
-	DBG("OF: ISA map, cp="PRu64", s="PRu64", da="PRu64"\n", cp, s, da);
-
-	if (da < cp || da >= (cp + s))
-		return OF_BAD_ADDR;
-	return da - cp;
-}
-
-static int of_bus_isa_translate(u32 *addr, u64 offset, int na)
-{
-	return of_bus_default_translate(addr + 1, offset, na - 1);
-}
-
-static unsigned int of_bus_isa_get_flags(const u32 *addr)
-{
-	unsigned int flags = 0;
-	u32 w = addr[0];
-
-	if (w & 1)
-		flags |= IORESOURCE_IO;
-	else
-		flags |= IORESOURCE_MEM;
-	return flags;
-}
-
-
-/*
- * Array of bus specific translators
- */
-
-static struct of_bus of_busses[] = {
-#ifdef CONFIG_PCI
-	/* PCI */
-	{
-		.name = "pci",
-		.addresses = "assigned-addresses",
-		.match = of_bus_pci_match,
-		.count_cells = of_bus_pci_count_cells,
-		.map = of_bus_pci_map,
-		.translate = of_bus_pci_translate,
-		.get_flags = of_bus_pci_get_flags,
-	},
-#endif /* CONFIG_PCI */
-	/* ISA */
-	{
-		.name = "isa",
-		.addresses = "reg",
-		.match = of_bus_isa_match,
-		.count_cells = of_bus_isa_count_cells,
-		.map = of_bus_isa_map,
-		.translate = of_bus_isa_translate,
-		.get_flags = of_bus_isa_get_flags,
-	},
-	/* Default */
-	{
-		.name = "default",
-		.addresses = "reg",
-		.match = NULL,
-		.count_cells = of_bus_default_count_cells,
-		.map = of_bus_default_map,
-		.translate = of_bus_default_translate,
-		.get_flags = of_bus_default_get_flags,
-	},
-};
-
-static struct of_bus *of_match_bus(struct device_node *np)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(of_busses); i ++)
-		if (!of_busses[i].match || of_busses[i].match(np))
-			return &of_busses[i];
-	BUG();
-	return NULL;
-}
-
-static int of_translate_one(struct device_node *parent, struct of_bus *bus,
-			    struct of_bus *pbus, u32 *addr,
-			    int na, int ns, int pna, const char *rprop)
-{
-	const u32 *ranges;
-	unsigned int rlen;
-	int rone;
-	u64 offset = OF_BAD_ADDR;
-
-	/* Normally, an absence of a "ranges" property means we are
-	 * crossing a non-translatable boundary, and thus the addresses
-	 * below the current not cannot be converted to CPU physical ones.
-	 * Unfortunately, while this is very clear in the spec, it's not
-	 * what Apple understood, and they do have things like /uni-n or
-	 * /ht nodes with no "ranges" property and a lot of perfectly
-	 * useable mapped devices below them. Thus we treat the absence of
-	 * "ranges" as equivalent to an empty "ranges" property which means
-	 * a 1:1 translation at that level. It's up to the caller not to try
-	 * to translate addresses that aren't supposed to be translated in
-	 * the first place. --BenH.
-	 */
-	ranges = of_get_property(parent, rprop, &rlen);
-	if (ranges == NULL || rlen == 0) {
-		offset = of_read_number(addr, na);
-		memset(addr, 0, pna * 4);
-		DBG("OF: no ranges, 1:1 translation\n");
-		goto finish;
-	}
-
-	DBG("OF: walking ranges...\n");
-
-	/* Now walk through the ranges */
-	rlen /= 4;
-	rone = na + pna + ns;
-	for (; rlen >= rone; rlen -= rone, ranges += rone) {
-		offset = bus->map(addr, ranges, na, ns, pna);
-		if (offset != OF_BAD_ADDR)
-			break;
-	}
-	if (offset == OF_BAD_ADDR) {
-		DBG("OF: not found !\n");
-		return 1;
-	}
-	memcpy(addr, ranges + na, 4 * pna);
-
- finish:
-	of_dump_addr("OF: parent translation for:", addr, pna);
-	DBG("OF: with offset: "PRu64"\n", offset);
-
-	/* Translate it into parent bus space */
-	return pbus->translate(addr, offset, pna);
-}
-
-
-/*
- * Translate an address from the device-tree into a CPU physical address,
- * this walks up the tree and applies the various bus mappings on the
- * way.
- *
- * Note: We consider that crossing any level with #size-cells == 0 to mean
- * that translation is impossible (that is we are not dealing with a value
- * that can be mapped to a cpu physical address). This is not really specified
- * that way, but this is traditionally the way IBM at least do things
- */
-u64 __of_translate_address(struct device_node *dev, const u32 *in_addr,
-			   const char *rprop)
-{
-	struct device_node *parent = NULL;
-	struct of_bus *bus, *pbus;
-	u32 addr[OF_MAX_ADDR_CELLS];
-	int na, ns, pna, pns;
-	u64 result = OF_BAD_ADDR;
-
-	DBG("OF: ** translation for device %s **\n", dev->full_name);
-
-	/* Increase refcount at current level */
-	of_node_get(dev);
-
-	/* Get parent & match bus type */
-	parent = of_get_parent(dev);
-	if (parent == NULL)
-		goto bail;
-	bus = of_match_bus(parent);
-
-	/* Cound address cells & copy address locally */
-	bus->count_cells(dev, &na, &ns);
-	if (!OF_CHECK_COUNTS(na, ns)) {
-		printk(KERN_ERR "prom_parse: Bad cell count for %s\n",
-		       dev->full_name);
-		goto bail;
-	}
-	memcpy(addr, in_addr, na * 4);
-
-	DBG("OF: bus is %s (na=%d, ns=%d) on %s\n",
-	    bus->name, na, ns, parent->full_name);
-	of_dump_addr("OF: translating address:", addr, na);
-
-	/* Translate */
-	for (;;) {
-		/* Switch to parent bus */
-		of_node_put(dev);
-		dev = parent;
-		parent = of_get_parent(dev);
-
-		/* If root, we have finished */
-		if (parent == NULL) {
-			DBG("OF: reached root node\n");
-			result = of_read_number(addr, na);
-			break;
-		}
-
-		/* Get new parent bus and counts */
-		pbus = of_match_bus(parent);
-		pbus->count_cells(dev, &pna, &pns);
-		if (!OF_CHECK_COUNTS(pna, pns)) {
-			printk(KERN_ERR "prom_parse: Bad cell count for %s\n",
-			       dev->full_name);
-			break;
-		}
-
-		DBG("OF: parent bus is %s (na=%d, ns=%d) on %s\n",
-		    pbus->name, pna, pns, parent->full_name);
-
-		/* Apply bus translation */
-		if (of_translate_one(dev, bus, pbus, addr, na, ns, pna, rprop))
-			break;
-
-		/* Complete the move up one level */
-		na = pna;
-		ns = pns;
-		bus = pbus;
-
-		of_dump_addr("OF: one level translation:", addr, na);
-	}
- bail:
-	of_node_put(parent);
-	of_node_put(dev);
-
-	return result;
-}
-
-u64 of_translate_address(struct device_node *dev, const u32 *in_addr)
-{
-	return __of_translate_address(dev, in_addr, "ranges");
-}
-EXPORT_SYMBOL(of_translate_address);
-
-u64 of_translate_dma_address(struct device_node *dev, const u32 *in_addr)
-{
-	return __of_translate_address(dev, in_addr, "dma-ranges");
-}
-EXPORT_SYMBOL(of_translate_dma_address);
-
-const u32 *of_get_address(struct device_node *dev, int index, u64 *size,
-		    unsigned int *flags)
-{
-	const u32 *prop;
-	unsigned int psize;
-	struct device_node *parent;
-	struct of_bus *bus;
-	int onesize, i, na, ns;
-
-	/* Get parent & match bus type */
-	parent = of_get_parent(dev);
-	if (parent == NULL)
-		return NULL;
-	bus = of_match_bus(parent);
-	bus->count_cells(dev, &na, &ns);
-	of_node_put(parent);
-	if (!OF_CHECK_COUNTS(na, ns))
-		return NULL;
-
-	/* Get "reg" or "assigned-addresses" property */
-	prop = of_get_property(dev, bus->addresses, &psize);
-	if (prop == NULL)
-		return NULL;
-	psize /= 4;
-
-	onesize = na + ns;
-	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++)
-		if (i == index) {
-			if (size)
-				*size = of_read_number(prop + na, ns);
-			if (flags)
-				*flags = bus->get_flags(prop);
-			return prop;
-		}
-	return NULL;
-}
-EXPORT_SYMBOL(of_get_address);
-
 void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
 		unsigned long *busno, unsigned long *phys, unsigned long *size)
 {
diff --git a/drivers/of/address.c b/drivers/of/address.c
index c3819550f907..2a905d560c1c 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -1,11 +1,522 @@
 
 #include <linux/io.h>
 #include <linux/ioport.h>
+#include <linux/module.h>
 #include <linux/of_address.h>
+#include <linux/pci_regs.h>
+#include <linux/string.h>
 
-int __of_address_to_resource(struct device_node *dev, const u32 *addrp,
-			     u64 size, unsigned int flags,
-			     struct resource *r)
+/* Max address size we deal with */
+#define OF_MAX_ADDR_CELLS	4
+#define OF_CHECK_COUNTS(na, ns)	((na) > 0 && (na) <= OF_MAX_ADDR_CELLS && \
+			(ns) > 0)
+
+static struct of_bus *of_match_bus(struct device_node *np);
+static int __of_address_to_resource(struct device_node *dev, const u32 *addrp,
+				    u64 size, unsigned int flags,
+				    struct resource *r);
+
+/* Debug utility */
+#ifdef DEBUG
+static void of_dump_addr(const char *s, const u32 *addr, int na)
+{
+	printk(KERN_DEBUG "%s", s);
+	while (na--)
+		printk(" %08x", *(addr++));
+	printk("\n");
+}
+#else
+static void of_dump_addr(const char *s, const u32 *addr, int na) { }
+#endif
+
+/* Callbacks for bus specific translators */
+struct of_bus {
+	const char	*name;
+	const char	*addresses;
+	int		(*match)(struct device_node *parent);
+	void		(*count_cells)(struct device_node *child,
+				       int *addrc, int *sizec);
+	u64		(*map)(u32 *addr, const u32 *range,
+				int na, int ns, int pna);
+	int		(*translate)(u32 *addr, u64 offset, int na);
+	unsigned int	(*get_flags)(const u32 *addr);
+};
+
+/*
+ * Default translator (generic bus)
+ */
+
+static void of_bus_default_count_cells(struct device_node *dev,
+				       int *addrc, int *sizec)
+{
+	if (addrc)
+		*addrc = of_n_addr_cells(dev);
+	if (sizec)
+		*sizec = of_n_size_cells(dev);
+}
+
+static u64 of_bus_default_map(u32 *addr, const u32 *range,
+		int na, int ns, int pna)
+{
+	u64 cp, s, da;
+
+	cp = of_read_number(range, na);
+	s  = of_read_number(range + na + pna, ns);
+	da = of_read_number(addr, na);
+
+	pr_debug("OF: default map, cp=%llx, s=%llx, da=%llx\n",
+		 (unsigned long long)cp, (unsigned long long)s,
+		 (unsigned long long)da);
+
+	if (da < cp || da >= (cp + s))
+		return OF_BAD_ADDR;
+	return da - cp;
+}
+
+static int of_bus_default_translate(u32 *addr, u64 offset, int na)
+{
+	u64 a = of_read_number(addr, na);
+	memset(addr, 0, na * 4);
+	a += offset;
+	if (na > 1)
+		addr[na - 2] = a >> 32;
+	addr[na - 1] = a & 0xffffffffu;
+
+	return 0;
+}
+
+static unsigned int of_bus_default_get_flags(const u32 *addr)
+{
+	return IORESOURCE_MEM;
+}
+
+#ifdef CONFIG_PCI
+/*
+ * PCI bus specific translator
+ */
+
+static int of_bus_pci_match(struct device_node *np)
+{
+	/* "vci" is for the /chaos bridge on 1st-gen PCI powermacs */
+	return !strcmp(np->type, "pci") || !strcmp(np->type, "vci");
+}
+
+static void of_bus_pci_count_cells(struct device_node *np,
+				   int *addrc, int *sizec)
+{
+	if (addrc)
+		*addrc = 3;
+	if (sizec)
+		*sizec = 2;
+}
+
+static unsigned int of_bus_pci_get_flags(const u32 *addr)
+{
+	unsigned int flags = 0;
+	u32 w = addr[0];
+
+	switch((w >> 24) & 0x03) {
+	case 0x01:
+		flags |= IORESOURCE_IO;
+		break;
+	case 0x02: /* 32 bits */
+	case 0x03: /* 64 bits */
+		flags |= IORESOURCE_MEM;
+		break;
+	}
+	if (w & 0x40000000)
+		flags |= IORESOURCE_PREFETCH;
+	return flags;
+}
+
+static u64 of_bus_pci_map(u32 *addr, const u32 *range, int na, int ns, int pna)
+{
+	u64 cp, s, da;
+	unsigned int af, rf;
+
+	af = of_bus_pci_get_flags(addr);
+	rf = of_bus_pci_get_flags(range);
+
+	/* Check address type match */
+	if ((af ^ rf) & (IORESOURCE_MEM | IORESOURCE_IO))
+		return OF_BAD_ADDR;
+
+	/* Read address values, skipping high cell */
+	cp = of_read_number(range + 1, na - 1);
+	s  = of_read_number(range + na + pna, ns);
+	da = of_read_number(addr + 1, na - 1);
+
+	pr_debug("OF: PCI map, cp=%llx, s=%llx, da=%llx\n",
+		 (unsigned long long)cp, (unsigned long long)s,
+		 (unsigned long long)da);
+
+	if (da < cp || da >= (cp + s))
+		return OF_BAD_ADDR;
+	return da - cp;
+}
+
+static int of_bus_pci_translate(u32 *addr, u64 offset, int na)
+{
+	return of_bus_default_translate(addr + 1, offset, na - 1);
+}
+
+const u32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size,
+			unsigned int *flags)
+{
+	const u32 *prop;
+	unsigned int psize;
+	struct device_node *parent;
+	struct of_bus *bus;
+	int onesize, i, na, ns;
+
+	/* Get parent & match bus type */
+	parent = of_get_parent(dev);
+	if (parent == NULL)
+		return NULL;
+	bus = of_match_bus(parent);
+	if (strcmp(bus->name, "pci")) {
+		of_node_put(parent);
+		return NULL;
+	}
+	bus->count_cells(dev, &na, &ns);
+	of_node_put(parent);
+	if (!OF_CHECK_COUNTS(na, ns))
+		return NULL;
+
+	/* Get "reg" or "assigned-addresses" property */
+	prop = of_get_property(dev, bus->addresses, &psize);
+	if (prop == NULL)
+		return NULL;
+	psize /= 4;
+
+	onesize = na + ns;
+	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++)
+		if ((prop[0] & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0)) {
+			if (size)
+				*size = of_read_number(prop + na, ns);
+			if (flags)
+				*flags = bus->get_flags(prop);
+			return prop;
+		}
+	return NULL;
+}
+EXPORT_SYMBOL(of_get_pci_address);
+
+int of_pci_address_to_resource(struct device_node *dev, int bar,
+			       struct resource *r)
+{
+	const u32	*addrp;
+	u64		size;
+	unsigned int	flags;
+
+	addrp = of_get_pci_address(dev, bar, &size, &flags);
+	if (addrp == NULL)
+		return -EINVAL;
+	return __of_address_to_resource(dev, addrp, size, flags, r);
+}
+EXPORT_SYMBOL_GPL(of_pci_address_to_resource);
+#endif /* CONFIG_PCI */
+
+/*
+ * ISA bus specific translator
+ */
+
+static int of_bus_isa_match(struct device_node *np)
+{
+	return !strcmp(np->name, "isa");
+}
+
+static void of_bus_isa_count_cells(struct device_node *child,
+				   int *addrc, int *sizec)
+{
+	if (addrc)
+		*addrc = 2;
+	if (sizec)
+		*sizec = 1;
+}
+
+static u64 of_bus_isa_map(u32 *addr, const u32 *range, int na, int ns, int pna)
+{
+	u64 cp, s, da;
+
+	/* Check address type match */
+	if ((addr[0] ^ range[0]) & 0x00000001)
+		return OF_BAD_ADDR;
+
+	/* Read address values, skipping high cell */
+	cp = of_read_number(range + 1, na - 1);
+	s  = of_read_number(range + na + pna, ns);
+	da = of_read_number(addr + 1, na - 1);
+
+	pr_debug("OF: ISA map, cp=%llx, s=%llx, da=%llx\n",
+		 (unsigned long long)cp, (unsigned long long)s,
+		 (unsigned long long)da);
+
+	if (da < cp || da >= (cp + s))
+		return OF_BAD_ADDR;
+	return da - cp;
+}
+
+static int of_bus_isa_translate(u32 *addr, u64 offset, int na)
+{
+	return of_bus_default_translate(addr + 1, offset, na - 1);
+}
+
+static unsigned int of_bus_isa_get_flags(const u32 *addr)
+{
+	unsigned int flags = 0;
+	u32 w = addr[0];
+
+	if (w & 1)
+		flags |= IORESOURCE_IO;
+	else
+		flags |= IORESOURCE_MEM;
+	return flags;
+}
+
+/*
+ * Array of bus specific translators
+ */
+
+static struct of_bus of_busses[] = {
+#ifdef CONFIG_PCI
+	/* PCI */
+	{
+		.name = "pci",
+		.addresses = "assigned-addresses",
+		.match = of_bus_pci_match,
+		.count_cells = of_bus_pci_count_cells,
+		.map = of_bus_pci_map,
+		.translate = of_bus_pci_translate,
+		.get_flags = of_bus_pci_get_flags,
+	},
+#endif /* CONFIG_PCI */
+	/* ISA */
+	{
+		.name = "isa",
+		.addresses = "reg",
+		.match = of_bus_isa_match,
+		.count_cells = of_bus_isa_count_cells,
+		.map = of_bus_isa_map,
+		.translate = of_bus_isa_translate,
+		.get_flags = of_bus_isa_get_flags,
+	},
+	/* Default */
+	{
+		.name = "default",
+		.addresses = "reg",
+		.match = NULL,
+		.count_cells = of_bus_default_count_cells,
+		.map = of_bus_default_map,
+		.translate = of_bus_default_translate,
+		.get_flags = of_bus_default_get_flags,
+	},
+};
+
+static struct of_bus *of_match_bus(struct device_node *np)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(of_busses); i++)
+		if (!of_busses[i].match || of_busses[i].match(np))
+			return &of_busses[i];
+	BUG();
+	return NULL;
+}
+
+static int of_translate_one(struct device_node *parent, struct of_bus *bus,
+			    struct of_bus *pbus, u32 *addr,
+			    int na, int ns, int pna, const char *rprop)
+{
+	const u32 *ranges;
+	unsigned int rlen;
+	int rone;
+	u64 offset = OF_BAD_ADDR;
+
+	/* Normally, an absence of a "ranges" property means we are
+	 * crossing a non-translatable boundary, and thus the addresses
+	 * below the current not cannot be converted to CPU physical ones.
+	 * Unfortunately, while this is very clear in the spec, it's not
+	 * what Apple understood, and they do have things like /uni-n or
+	 * /ht nodes with no "ranges" property and a lot of perfectly
+	 * useable mapped devices below them. Thus we treat the absence of
+	 * "ranges" as equivalent to an empty "ranges" property which means
+	 * a 1:1 translation at that level. It's up to the caller not to try
+	 * to translate addresses that aren't supposed to be translated in
+	 * the first place. --BenH.
+	 */
+	ranges = of_get_property(parent, rprop, &rlen);
+	if (ranges == NULL || rlen == 0) {
+		offset = of_read_number(addr, na);
+		memset(addr, 0, pna * 4);
+		pr_debug("OF: no ranges, 1:1 translation\n");
+		goto finish;
+	}
+
+	pr_debug("OF: walking ranges...\n");
+
+	/* Now walk through the ranges */
+	rlen /= 4;
+	rone = na + pna + ns;
+	for (; rlen >= rone; rlen -= rone, ranges += rone) {
+		offset = bus->map(addr, ranges, na, ns, pna);
+		if (offset != OF_BAD_ADDR)
+			break;
+	}
+	if (offset == OF_BAD_ADDR) {
+		pr_debug("OF: not found !\n");
+		return 1;
+	}
+	memcpy(addr, ranges + na, 4 * pna);
+
+ finish:
+	of_dump_addr("OF: parent translation for:", addr, pna);
+	pr_debug("OF: with offset: %llx\n", (unsigned long long)offset);
+
+	/* Translate it into parent bus space */
+	return pbus->translate(addr, offset, pna);
+}
+
+/*
+ * Translate an address from the device-tree into a CPU physical address,
+ * this walks up the tree and applies the various bus mappings on the
+ * way.
+ *
+ * Note: We consider that crossing any level with #size-cells == 0 to mean
+ * that translation is impossible (that is we are not dealing with a value
+ * that can be mapped to a cpu physical address). This is not really specified
+ * that way, but this is traditionally the way IBM at least do things
+ */
+u64 __of_translate_address(struct device_node *dev, const u32 *in_addr,
+			   const char *rprop)
+{
+	struct device_node *parent = NULL;
+	struct of_bus *bus, *pbus;
+	u32 addr[OF_MAX_ADDR_CELLS];
+	int na, ns, pna, pns;
+	u64 result = OF_BAD_ADDR;
+
+	pr_debug("OF: ** translation for device %s **\n", dev->full_name);
+
+	/* Increase refcount at current level */
+	of_node_get(dev);
+
+	/* Get parent & match bus type */
+	parent = of_get_parent(dev);
+	if (parent == NULL)
+		goto bail;
+	bus = of_match_bus(parent);
+
+	/* Cound address cells & copy address locally */
+	bus->count_cells(dev, &na, &ns);
+	if (!OF_CHECK_COUNTS(na, ns)) {
+		printk(KERN_ERR "prom_parse: Bad cell count for %s\n",
+		       dev->full_name);
+		goto bail;
+	}
+	memcpy(addr, in_addr, na * 4);
+
+	pr_debug("OF: bus is %s (na=%d, ns=%d) on %s\n",
+	    bus->name, na, ns, parent->full_name);
+	of_dump_addr("OF: translating address:", addr, na);
+
+	/* Translate */
+	for (;;) {
+		/* Switch to parent bus */
+		of_node_put(dev);
+		dev = parent;
+		parent = of_get_parent(dev);
+
+		/* If root, we have finished */
+		if (parent == NULL) {
+			pr_debug("OF: reached root node\n");
+			result = of_read_number(addr, na);
+			break;
+		}
+
+		/* Get new parent bus and counts */
+		pbus = of_match_bus(parent);
+		pbus->count_cells(dev, &pna, &pns);
+		if (!OF_CHECK_COUNTS(pna, pns)) {
+			printk(KERN_ERR "prom_parse: Bad cell count for %s\n",
+			       dev->full_name);
+			break;
+		}
+
+		pr_debug("OF: parent bus is %s (na=%d, ns=%d) on %s\n",
+		    pbus->name, pna, pns, parent->full_name);
+
+		/* Apply bus translation */
+		if (of_translate_one(dev, bus, pbus, addr, na, ns, pna, rprop))
+			break;
+
+		/* Complete the move up one level */
+		na = pna;
+		ns = pns;
+		bus = pbus;
+
+		of_dump_addr("OF: one level translation:", addr, na);
+	}
+ bail:
+	of_node_put(parent);
+	of_node_put(dev);
+
+	return result;
+}
+
+u64 of_translate_address(struct device_node *dev, const u32 *in_addr)
+{
+	return __of_translate_address(dev, in_addr, "ranges");
+}
+EXPORT_SYMBOL(of_translate_address);
+
+u64 of_translate_dma_address(struct device_node *dev, const u32 *in_addr)
+{
+	return __of_translate_address(dev, in_addr, "dma-ranges");
+}
+EXPORT_SYMBOL(of_translate_dma_address);
+
+const u32 *of_get_address(struct device_node *dev, int index, u64 *size,
+		    unsigned int *flags)
+{
+	const u32 *prop;
+	unsigned int psize;
+	struct device_node *parent;
+	struct of_bus *bus;
+	int onesize, i, na, ns;
+
+	/* Get parent & match bus type */
+	parent = of_get_parent(dev);
+	if (parent == NULL)
+		return NULL;
+	bus = of_match_bus(parent);
+	bus->count_cells(dev, &na, &ns);
+	of_node_put(parent);
+	if (!OF_CHECK_COUNTS(na, ns))
+		return NULL;
+
+	/* Get "reg" or "assigned-addresses" property */
+	prop = of_get_property(dev, bus->addresses, &psize);
+	if (prop == NULL)
+		return NULL;
+	psize /= 4;
+
+	onesize = na + ns;
+	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++)
+		if (i == index) {
+			if (size)
+				*size = of_read_number(prop + na, ns);
+			if (flags)
+				*flags = bus->get_flags(prop);
+			return prop;
+		}
+	return NULL;
+}
+EXPORT_SYMBOL(of_get_address);
+
+static int __of_address_to_resource(struct device_node *dev, const u32 *addrp,
+				    u64 size, unsigned int flags,
+				    struct resource *r)
 {
 	u64 taddr;
 
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 474b794ed9d2..cc567df9a00d 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -3,9 +3,7 @@
 #include <linux/ioport.h>
 #include <linux/of.h>
 
-extern int __of_address_to_resource(struct device_node *dev, const u32 *addrp,
-				    u64 size, unsigned int flags,
-				    struct resource *r);
+extern u64 of_translate_address(struct device_node *np, const u32 *addr);
 extern int of_address_to_resource(struct device_node *dev, int index,
 				  struct resource *r);
 extern void __iomem *of_iomap(struct device_node *device, int index);
-- 
cgit v1.2.3-59-g8ed1b


From dd27dcda37f0b1a3b674760fb411abc5c8fe309c Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:12 -0600
Subject: of/device: merge of_device_uevent

Merge common code between powerpc and microblaze

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
CC: Michal Simek <monstr@monstr.eu>
CC: Wolfram Sang <w.sang@pengutronix.de>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: microblaze-uclinux@itee.uq.edu.au
CC: linuxppc-dev@ozlabs.org
---
 arch/microblaze/include/asm/of_device.h |  3 --
 arch/microblaze/kernel/of_device.c      | 48 --------------------------------
 arch/powerpc/include/asm/of_device.h    |  3 --
 arch/powerpc/kernel/of_device.c         | 49 ---------------------------------
 drivers/of/device.c                     | 48 ++++++++++++++++++++++++++++++++
 include/linux/of_device.h               |  4 +++
 6 files changed, 52 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h
index 0a5f3f914b42..58e627dc1412 100644
--- a/arch/microblaze/include/asm/of_device.h
+++ b/arch/microblaze/include/asm/of_device.h
@@ -22,9 +22,6 @@ extern struct of_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
 					 struct device *parent);
 
-extern int of_device_uevent(struct device *dev,
-			    struct kobj_uevent_env *env);
-
 extern void of_device_make_bus_id(struct of_device *dev);
 
 /* This is just here during the transition */
diff --git a/arch/microblaze/kernel/of_device.c b/arch/microblaze/kernel/of_device.c
index b372787886ed..3a367d788451 100644
--- a/arch/microblaze/kernel/of_device.c
+++ b/arch/microblaze/kernel/of_device.c
@@ -62,51 +62,3 @@ struct of_device *of_device_alloc(struct device_node *np,
 	return dev;
 }
 EXPORT_SYMBOL(of_device_alloc);
-
-int of_device_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
-	struct of_device *ofdev;
-	const char *compat;
-	int seen = 0, cplen, sl;
-
-	if (!dev)
-		return -ENODEV;
-
-	ofdev = to_of_device(dev);
-
-	if (add_uevent_var(env, "OF_NAME=%s", ofdev->dev.of_node->name))
-		return -ENOMEM;
-
-	if (add_uevent_var(env, "OF_TYPE=%s", ofdev->dev.of_node->type))
-		return -ENOMEM;
-
-	/* Since the compatible field can contain pretty much anything
-	 * it's not really legal to split it out with commas. We split it
-	 * up using a number of environment variables instead. */
-
-	compat = of_get_property(ofdev->dev.of_node, "compatible", &cplen);
-	while (compat && *compat && cplen > 0) {
-		if (add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat))
-			return -ENOMEM;
-
-		sl = strlen(compat) + 1;
-		compat += sl;
-		cplen -= sl;
-		seen++;
-	}
-
-	if (add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen))
-		return -ENOMEM;
-
-	/* modalias is trickier, we add it in 2 steps */
-	if (add_uevent_var(env, "MODALIAS="))
-		return -ENOMEM;
-	sl = of_device_get_modalias(ofdev, &env->buf[env->buflen-1],
-				    sizeof(env->buf) - env->buflen);
-	if (sl >= (sizeof(env->buf) - env->buflen))
-		return -ENOMEM;
-	env->buflen += sl;
-
-	return 0;
-}
-EXPORT_SYMBOL(of_device_uevent);
diff --git a/arch/powerpc/include/asm/of_device.h b/arch/powerpc/include/asm/of_device.h
index cb36632f953c..5d5103cac641 100644
--- a/arch/powerpc/include/asm/of_device.h
+++ b/arch/powerpc/include/asm/of_device.h
@@ -9,8 +9,5 @@ extern struct of_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
 					 struct device *parent);
 
-extern int of_device_uevent(struct device *dev,
-			    struct kobj_uevent_env *env);
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_OF_DEVICE_H */
diff --git a/arch/powerpc/kernel/of_device.c b/arch/powerpc/kernel/of_device.c
index df78e0236a02..db91a9dbafb5 100644
--- a/arch/powerpc/kernel/of_device.c
+++ b/arch/powerpc/kernel/of_device.c
@@ -82,52 +82,3 @@ struct of_device *of_device_alloc(struct device_node *np,
 	return dev;
 }
 EXPORT_SYMBOL(of_device_alloc);
-
-int of_device_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
-	struct of_device *ofdev;
-	const char *compat;
-	int seen = 0, cplen, sl;
-
-	if (!dev)
-		return -ENODEV;
-
-	ofdev = to_of_device(dev);
-
-	if (add_uevent_var(env, "OF_NAME=%s", ofdev->dev.of_node->name))
-		return -ENOMEM;
-
-	if (add_uevent_var(env, "OF_TYPE=%s", ofdev->dev.of_node->type))
-		return -ENOMEM;
-
-        /* Since the compatible field can contain pretty much anything
-         * it's not really legal to split it out with commas. We split it
-         * up using a number of environment variables instead. */
-
-	compat = of_get_property(ofdev->dev.of_node, "compatible", &cplen);
-	while (compat && *compat && cplen > 0) {
-		if (add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat))
-			return -ENOMEM;
-
-		sl = strlen (compat) + 1;
-		compat += sl;
-		cplen -= sl;
-		seen++;
-	}
-
-	if (add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen))
-		return -ENOMEM;
-
-	/* modalias is trickier, we add it in 2 steps */
-	if (add_uevent_var(env, "MODALIAS="))
-		return -ENOMEM;
-	sl = of_device_get_modalias(ofdev, &env->buf[env->buflen-1],
-				    sizeof(env->buf) - env->buflen);
-	if (sl >= (sizeof(env->buf) - env->buflen))
-		return -ENOMEM;
-	env->buflen += sl;
-
-	return 0;
-}
-EXPORT_SYMBOL(of_device_uevent);
-EXPORT_SYMBOL(of_device_get_modalias);
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 7d18f8e0b013..275cc9cee14c 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -170,3 +170,51 @@ ssize_t of_device_get_modalias(struct of_device *ofdev,
 
 	return tsize;
 }
+
+/**
+ * of_device_uevent - Display OF related uevent information
+ */
+int of_device_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	const char *compat;
+	int seen = 0, cplen, sl;
+
+	if ((!dev) || (!dev->of_node))
+		return -ENODEV;
+
+	if (add_uevent_var(env, "OF_NAME=%s", dev->of_node->name))
+		return -ENOMEM;
+
+	if (add_uevent_var(env, "OF_TYPE=%s", dev->of_node->type))
+		return -ENOMEM;
+
+	/* Since the compatible field can contain pretty much anything
+	 * it's not really legal to split it out with commas. We split it
+	 * up using a number of environment variables instead. */
+
+	compat = of_get_property(dev->of_node, "compatible", &cplen);
+	while (compat && *compat && cplen > 0) {
+		if (add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat))
+			return -ENOMEM;
+
+		sl = strlen(compat) + 1;
+		compat += sl;
+		cplen -= sl;
+		seen++;
+	}
+
+	if (add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen))
+		return -ENOMEM;
+
+	/* modalias is trickier, we add it in 2 steps */
+	if (add_uevent_var(env, "MODALIAS="))
+		return -ENOMEM;
+
+	sl = of_device_get_modalias(to_of_device(dev), &env->buf[env->buflen-1],
+				    sizeof(env->buf) - env->buflen);
+	if (sl >= (sizeof(env->buf) - env->buflen))
+		return -ENOMEM;
+	env->buflen += sl;
+
+	return 0;
+}
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index a3ae5900fc58..da83e734c026 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -44,6 +44,10 @@ static inline void of_device_free(struct of_device *dev)
 
 extern ssize_t of_device_get_modalias(struct of_device *ofdev,
 					char *str, ssize_t len);
+
+extern int of_device_uevent(struct device *dev, struct kobj_uevent_env *env);
+
+
 #endif /* CONFIG_OF_DEVICE */
 
 #endif /* _LINUX_OF_DEVICE_H */
-- 
cgit v1.2.3-59-g8ed1b


From 34a1c1e8c700f7cd849deb21193718a172722f8d Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:13 -0600
Subject: of: Modify of_device_get_modalias to be passed struct device

Now that the of_node pointer is part of struct device,
of_device_get_modalias could be used on any struct device
that has the device node pointer set.  This patch changes
of_device_get_modalias to accept a struct device instead
of a struct of_device.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
CC: Michal Simek <monstr@monstr.eu>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Wolfram Sang <w.sang@pengutronix.de>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: microblaze-uclinux@itee.uq.edu.au
CC: linuxppc-dev@ozlabs.org
---
 arch/microblaze/include/asm/of_device.h |  3 ---
 drivers/macintosh/macio_sysfs.c         |  5 +----
 drivers/of/device.c                     | 16 ++++++----------
 include/linux/of_device.h               |  2 +-
 4 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h
index 58e627dc1412..c9be53487434 100644
--- a/arch/microblaze/include/asm/of_device.h
+++ b/arch/microblaze/include/asm/of_device.h
@@ -15,9 +15,6 @@
 #include <linux/device.h>
 #include <linux/of.h>
 
-extern ssize_t of_device_get_modalias(struct of_device *ofdev,
-					char *str, ssize_t len);
-
 extern struct of_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
 					 struct device *parent);
diff --git a/drivers/macintosh/macio_sysfs.c b/drivers/macintosh/macio_sysfs.c
index 6999ce59fd10..6024038a5b9d 100644
--- a/drivers/macintosh/macio_sysfs.c
+++ b/drivers/macintosh/macio_sysfs.c
@@ -41,10 +41,7 @@ compatible_show (struct device *dev, struct device_attribute *attr, char *buf)
 static ssize_t modalias_show (struct device *dev, struct device_attribute *attr,
 			      char *buf)
 {
-	struct of_device *ofdev = to_of_device(dev);
-	int len;
-
-	len = of_device_get_modalias(ofdev, buf, PAGE_SIZE - 2);
+	int len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2);
 
 	buf[len] = '\n';
 	buf[len+1] = 0;
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 275cc9cee14c..c2a98f5ca80d 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -68,10 +68,7 @@ static ssize_t name_show(struct device *dev,
 static ssize_t modalias_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	struct of_device *ofdev = to_of_device(dev);
-	ssize_t len = 0;
-
-	len = of_device_get_modalias(ofdev, buf, PAGE_SIZE - 2);
+	ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2);
 	buf[len] = '\n';
 	buf[len+1] = 0;
 	return len+1;
@@ -123,19 +120,18 @@ void of_device_unregister(struct of_device *ofdev)
 }
 EXPORT_SYMBOL(of_device_unregister);
 
-ssize_t of_device_get_modalias(struct of_device *ofdev,
-				char *str, ssize_t len)
+ssize_t of_device_get_modalias(struct device *dev, char *str, ssize_t len)
 {
 	const char *compat;
 	int cplen, i;
 	ssize_t tsize, csize, repend;
 
 	/* Name & Type */
-	csize = snprintf(str, len, "of:N%sT%s", ofdev->dev.of_node->name,
-			 ofdev->dev.of_node->type);
+	csize = snprintf(str, len, "of:N%sT%s", dev->of_node->name,
+			 dev->of_node->type);
 
 	/* Get compatible property if any */
-	compat = of_get_property(ofdev->dev.of_node, "compatible", &cplen);
+	compat = of_get_property(dev->of_node, "compatible", &cplen);
 	if (!compat)
 		return csize;
 
@@ -210,7 +206,7 @@ int of_device_uevent(struct device *dev, struct kobj_uevent_env *env)
 	if (add_uevent_var(env, "MODALIAS="))
 		return -ENOMEM;
 
-	sl = of_device_get_modalias(to_of_device(dev), &env->buf[env->buflen-1],
+	sl = of_device_get_modalias(dev, &env->buf[env->buflen-1],
 				    sizeof(env->buf) - env->buflen);
 	if (sl >= (sizeof(env->buf) - env->buflen))
 		return -ENOMEM;
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index da83e734c026..238e92e007e6 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -42,7 +42,7 @@ static inline void of_device_free(struct of_device *dev)
 	of_release_dev(&dev->dev);
 }
 
-extern ssize_t of_device_get_modalias(struct of_device *ofdev,
+extern ssize_t of_device_get_modalias(struct device *dev,
 					char *str, ssize_t len);
 
 extern int of_device_uevent(struct device *dev, struct kobj_uevent_env *env);
-- 
cgit v1.2.3-59-g8ed1b


From 5fd200f3b351183b5489cef69961c60af9cead2f Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:13 -0600
Subject: of/device: Merge of_platform_bus_probe()

Merge common code between PowerPC and microblaze.  This patch merges
the code that scans the tree and registers devices.  The functions
merged are of_platform_bus_probe(), of_platform_bus_create(), and
of_platform_device_create().

This patch also move the of_default_bus_ids[] table out of a Microblaze
header file and makes it non-static.  The device ids table isn't merged
because powerpc and microblaze use different default data.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
CC: Michal Simek <monstr@monstr.eu>
CC: Grant Likely <grant.likely@secretlab.ca>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: microblaze-uclinux@itee.uq.edu.au
CC: linuxppc-dev@ozlabs.org
---
 arch/microblaze/include/asm/of_platform.h |  33 -------
 arch/microblaze/kernel/of_platform.c      | 141 ++++-------------------------
 arch/powerpc/include/asm/of_platform.h    |  11 ---
 arch/powerpc/kernel/of_platform.c         | 131 +--------------------------
 drivers/of/platform.c                     | 143 ++++++++++++++++++++++++++++++
 include/linux/of_platform.h               |  17 ++++
 6 files changed, 179 insertions(+), 297 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/of_platform.h b/arch/microblaze/include/asm/of_platform.h
index 37491276c6ca..625003f70888 100644
--- a/arch/microblaze/include/asm/of_platform.h
+++ b/arch/microblaze/include/asm/of_platform.h
@@ -14,39 +14,6 @@
 /* This is just here during the transition */
 #include <linux/of_platform.h>
 
-/*
- * The list of OF IDs below is used for matching bus types in the
- * system whose devices are to be exposed as of_platform_devices.
- *
- * This is the default list valid for most platforms. This file provides
- * functions who can take an explicit list if necessary though
- *
- * The search is always performed recursively looking for children of
- * the provided device_node and recursively if such a children matches
- * a bus type in the list
- */
-
-static const struct of_device_id of_default_bus_ids[] = {
-	{ .type = "soc", },
-	{ .compatible = "soc", },
-	{ .type = "plb5", },
-	{ .type = "plb4", },
-	{ .type = "opb", },
-	{ .type = "simple", },
-	{},
-};
-
-/* Platform devices and busses creation */
-extern struct of_device *of_platform_device_create(struct device_node *np,
-						const char *bus_id,
-						struct device *parent);
-/* pseudo "matches" value to not do deep probe */
-#define OF_NO_DEEP_PROBE ((struct of_device_id *)-1)
-
-extern int of_platform_bus_probe(struct device_node *root,
-				const struct of_device_id *matches,
-				struct device *parent);
-
 extern struct of_device *of_find_device_by_phandle(phandle ph);
 
 extern void of_instantiate_rtc(void);
diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c
index ccf6f4257f4b..a07abdd6859b 100644
--- a/arch/microblaze/kernel/of_platform.c
+++ b/arch/microblaze/kernel/of_platform.c
@@ -37,132 +37,27 @@ static int __init of_bus_driver_init(void)
 }
 postcore_initcall(of_bus_driver_init);
 
-struct of_device *of_platform_device_create(struct device_node *np,
-					    const char *bus_id,
-					    struct device *parent)
-{
-	struct of_device *dev;
-
-	dev = of_device_alloc(np, bus_id, parent);
-	if (!dev)
-		return NULL;
-
-	dev->archdata.dma_mask = 0xffffffffUL;
-	dev->dev.bus = &of_platform_bus_type;
-
-	/* We do not fill the DMA ops for platform devices by default.
-	 * This is currently the responsibility of the platform code
-	 * to do such, possibly using a device notifier
-	 */
-
-	if (of_device_register(dev) != 0) {
-		of_device_free(dev);
-		return NULL;
-	}
-
-	return dev;
-}
-EXPORT_SYMBOL(of_platform_device_create);
-
-/**
- * of_platform_bus_create - Create an OF device for a bus node and all its
- * children. Optionally recursively instanciate matching busses.
- * @bus: device node of the bus to instanciate
- * @matches: match table, NULL to use the default, OF_NO_DEEP_PROBE to
- * disallow recursive creation of child busses
- */
-static int of_platform_bus_create(const struct device_node *bus,
-				  const struct of_device_id *matches,
-				  struct device *parent)
-{
-	struct device_node *child;
-	struct of_device *dev;
-	int rc = 0;
-
-	for_each_child_of_node(bus, child) {
-		pr_debug("   create child: %s\n", child->full_name);
-		dev = of_platform_device_create(child, NULL, parent);
-		if (dev == NULL)
-			rc = -ENOMEM;
-		else if (!of_match_node(matches, child))
-			continue;
-		if (rc == 0) {
-			pr_debug("   and sub busses\n");
-			rc = of_platform_bus_create(child, matches, &dev->dev);
-		}
-		if (rc) {
-			of_node_put(child);
-			break;
-		}
-	}
-	return rc;
-}
-
-
-/**
- * of_platform_bus_probe - Probe the device-tree for platform busses
- * @root: parent of the first level to probe or NULL for the root of the tree
- * @matches: match table, NULL to use the default
- * @parent: parent to hook devices from, NULL for toplevel
+/*
+ * The list of OF IDs below is used for matching bus types in the
+ * system whose devices are to be exposed as of_platform_devices.
  *
- * Note that children of the provided root are not instanciated as devices
- * unless the specified root itself matches the bus list and is not NULL.
+ * This is the default list valid for most platforms. This file provides
+ * functions who can take an explicit list if necessary though
+ *
+ * The search is always performed recursively looking for children of
+ * the provided device_node and recursively if such a children matches
+ * a bus type in the list
  */
 
-int of_platform_bus_probe(struct device_node *root,
-			  const struct of_device_id *matches,
-			  struct device *parent)
-{
-	struct device_node *child;
-	struct of_device *dev;
-	int rc = 0;
-
-	if (matches == NULL)
-		matches = of_default_bus_ids;
-	if (matches == OF_NO_DEEP_PROBE)
-		return -EINVAL;
-	if (root == NULL)
-		root = of_find_node_by_path("/");
-	else
-		of_node_get(root);
-
-	pr_debug("of_platform_bus_probe()\n");
-	pr_debug(" starting at: %s\n", root->full_name);
-
-	/* Do a self check of bus type, if there's a match, create
-	 * children
-	 */
-	if (of_match_node(matches, root)) {
-		pr_debug(" root match, create all sub devices\n");
-		dev = of_platform_device_create(root, NULL, parent);
-		if (dev == NULL) {
-			rc = -ENOMEM;
-			goto bail;
-		}
-		pr_debug(" create all sub busses\n");
-		rc = of_platform_bus_create(root, matches, &dev->dev);
-		goto bail;
-	}
-	for_each_child_of_node(root, child) {
-		if (!of_match_node(matches, child))
-			continue;
-
-		pr_debug("  match: %s\n", child->full_name);
-		dev = of_platform_device_create(child, NULL, parent);
-		if (dev == NULL)
-			rc = -ENOMEM;
-		else
-			rc = of_platform_bus_create(child, matches, &dev->dev);
-		if (rc) {
-			of_node_put(child);
-			break;
-		}
-	}
- bail:
-	of_node_put(root);
-	return rc;
-}
-EXPORT_SYMBOL(of_platform_bus_probe);
+const struct of_device_id of_default_bus_ids[] = {
+	{ .type = "soc", },
+	{ .compatible = "soc", },
+	{ .type = "plb5", },
+	{ .type = "plb4", },
+	{ .type = "opb", },
+	{ .type = "simple", },
+	{},
+};
 
 static int of_dev_node_match(struct device *dev, void *data)
 {
diff --git a/arch/powerpc/include/asm/of_platform.h b/arch/powerpc/include/asm/of_platform.h
index d4aaa3489440..b37d2dcddb97 100644
--- a/arch/powerpc/include/asm/of_platform.h
+++ b/arch/powerpc/include/asm/of_platform.h
@@ -11,17 +11,6 @@
  *
  */
 
-/* Platform devices and busses creation */
-extern struct of_device *of_platform_device_create(struct device_node *np,
-						   const char *bus_id,
-						   struct device *parent);
-/* pseudo "matches" value to not do deep probe */
-#define OF_NO_DEEP_PROBE ((struct of_device_id *)-1)
-
-extern int of_platform_bus_probe(struct device_node *root,
-				 const struct of_device_id *matches,
-				 struct device *parent);
-
 extern struct of_device *of_find_device_by_phandle(phandle ph);
 
 extern void of_instantiate_rtc(void);
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index 487a98851ba6..0b5cc6d892a7 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -40,7 +40,7 @@
  * a bus type in the list
  */
 
-static const struct of_device_id of_default_bus_ids[] = {
+const struct of_device_id of_default_bus_ids[] = {
 	{ .type = "soc", },
 	{ .compatible = "soc", },
 	{ .type = "spider", },
@@ -64,135 +64,6 @@ static int __init of_bus_driver_init(void)
 
 postcore_initcall(of_bus_driver_init);
 
-struct of_device* of_platform_device_create(struct device_node *np,
-					    const char *bus_id,
-					    struct device *parent)
-{
-	struct of_device *dev;
-
-	dev = of_device_alloc(np, bus_id, parent);
-	if (!dev)
-		return NULL;
-
-	dev->archdata.dma_mask = 0xffffffffUL;
-	dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
-
-	dev->dev.bus = &of_platform_bus_type;
-
-	/* We do not fill the DMA ops for platform devices by default.
-	 * This is currently the responsibility of the platform code
-	 * to do such, possibly using a device notifier
-	 */
-
-	if (of_device_register(dev) != 0) {
-		of_device_free(dev);
-		return NULL;
-	}
-
-	return dev;
-}
-EXPORT_SYMBOL(of_platform_device_create);
-
-
-
-/**
- * of_platform_bus_create - Create an OF device for a bus node and all its
- * children. Optionally recursively instanciate matching busses.
- * @bus: device node of the bus to instanciate
- * @matches: match table, NULL to use the default, OF_NO_DEEP_PROBE to
- * disallow recursive creation of child busses
- */
-static int of_platform_bus_create(const struct device_node *bus,
-				  const struct of_device_id *matches,
-				  struct device *parent)
-{
-	struct device_node *child;
-	struct of_device *dev;
-	int rc = 0;
-
-	for_each_child_of_node(bus, child) {
-		pr_debug("   create child: %s\n", child->full_name);
-		dev = of_platform_device_create(child, NULL, parent);
-		if (dev == NULL)
-			rc = -ENOMEM;
-		else if (!of_match_node(matches, child))
-			continue;
-		if (rc == 0) {
-			pr_debug("   and sub busses\n");
-			rc = of_platform_bus_create(child, matches, &dev->dev);
-		} if (rc) {
-			of_node_put(child);
-			break;
-		}
-	}
-	return rc;
-}
-
-/**
- * of_platform_bus_probe - Probe the device-tree for platform busses
- * @root: parent of the first level to probe or NULL for the root of the tree
- * @matches: match table, NULL to use the default
- * @parent: parent to hook devices from, NULL for toplevel
- *
- * Note that children of the provided root are not instanciated as devices
- * unless the specified root itself matches the bus list and is not NULL.
- */
-
-int of_platform_bus_probe(struct device_node *root,
-			  const struct of_device_id *matches,
-			  struct device *parent)
-{
-	struct device_node *child;
-	struct of_device *dev;
-	int rc = 0;
-
-	if (matches == NULL)
-		matches = of_default_bus_ids;
-	if (matches == OF_NO_DEEP_PROBE)
-		return -EINVAL;
-	if (root == NULL)
-		root = of_find_node_by_path("/");
-	else
-		of_node_get(root);
-
-	pr_debug("of_platform_bus_probe()\n");
-	pr_debug(" starting at: %s\n", root->full_name);
-
-	/* Do a self check of bus type, if there's a match, create
-	 * children
-	 */
-	if (of_match_node(matches, root)) {
-		pr_debug(" root match, create all sub devices\n");
-		dev = of_platform_device_create(root, NULL, parent);
-		if (dev == NULL) {
-			rc = -ENOMEM;
-			goto bail;
-		}
-		pr_debug(" create all sub busses\n");
-		rc = of_platform_bus_create(root, matches, &dev->dev);
-		goto bail;
-	}
-	for_each_child_of_node(root, child) {
-		if (!of_match_node(matches, child))
-			continue;
-
-		pr_debug("  match: %s\n", child->full_name);
-		dev = of_platform_device_create(child, NULL, parent);
-		if (dev == NULL)
-			rc = -ENOMEM;
-		else
-			rc = of_platform_bus_create(child, matches, &dev->dev);
-		if (rc) {
-			of_node_put(child);
-			break;
-		}
-	}
- bail:
-	of_node_put(root);
-	return rc;
-}
-EXPORT_SYMBOL(of_platform_bus_probe);
-
 static int of_dev_node_match(struct device *dev, void *data)
 {
 	return to_of_device(dev)->dev.of_node == data;
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 7dacc1ebe91e..d9c81e93bdd0 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -14,6 +14,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/dma-mapping.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 
@@ -396,3 +397,145 @@ void of_unregister_driver(struct of_platform_driver *drv)
 	driver_unregister(&drv->driver);
 }
 EXPORT_SYMBOL(of_unregister_driver);
+
+#if !defined(CONFIG_SPARC)
+/*
+ * The following routines scan a subtree and registers a device for
+ * each applicable node.
+ *
+ * Note: sparc doesn't use these routines because it has a different
+ * mechanism for creating devices from device tree nodes.
+ */
+
+/**
+ * of_platform_device_create - Alloc, initialize and register an of_device
+ * @np: pointer to node to create device for
+ * @bus_id: name to assign device
+ * @parent: Linux device model parent device.
+ */
+struct of_device *of_platform_device_create(struct device_node *np,
+					    const char *bus_id,
+					    struct device *parent)
+{
+	struct of_device *dev;
+
+	dev = of_device_alloc(np, bus_id, parent);
+	if (!dev)
+		return NULL;
+
+	dev->archdata.dma_mask = 0xffffffffUL;
+	dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
+	dev->dev.bus = &of_platform_bus_type;
+
+	/* We do not fill the DMA ops for platform devices by default.
+	 * This is currently the responsibility of the platform code
+	 * to do such, possibly using a device notifier
+	 */
+
+	if (of_device_register(dev) != 0) {
+		of_device_free(dev);
+		return NULL;
+	}
+
+	return dev;
+}
+EXPORT_SYMBOL(of_platform_device_create);
+
+/**
+ * of_platform_bus_create - Create an OF device for a bus node and all its
+ * children. Optionally recursively instantiate matching busses.
+ * @bus: device node of the bus to instantiate
+ * @matches: match table, NULL to use the default, OF_NO_DEEP_PROBE to
+ * disallow recursive creation of child busses
+ */
+static int of_platform_bus_create(const struct device_node *bus,
+				  const struct of_device_id *matches,
+				  struct device *parent)
+{
+	struct device_node *child;
+	struct of_device *dev;
+	int rc = 0;
+
+	for_each_child_of_node(bus, child) {
+		pr_debug("   create child: %s\n", child->full_name);
+		dev = of_platform_device_create(child, NULL, parent);
+		if (dev == NULL)
+			rc = -ENOMEM;
+		else if (!of_match_node(matches, child))
+			continue;
+		if (rc == 0) {
+			pr_debug("   and sub busses\n");
+			rc = of_platform_bus_create(child, matches, &dev->dev);
+		}
+		if (rc) {
+			of_node_put(child);
+			break;
+		}
+	}
+	return rc;
+}
+
+/**
+ * of_platform_bus_probe - Probe the device-tree for platform busses
+ * @root: parent of the first level to probe or NULL for the root of the tree
+ * @matches: match table, NULL to use the default
+ * @parent: parent to hook devices from, NULL for toplevel
+ *
+ * Note that children of the provided root are not instantiated as devices
+ * unless the specified root itself matches the bus list and is not NULL.
+ */
+int of_platform_bus_probe(struct device_node *root,
+			  const struct of_device_id *matches,
+			  struct device *parent)
+{
+	struct device_node *child;
+	struct of_device *dev;
+	int rc = 0;
+
+	if (matches == NULL)
+		matches = of_default_bus_ids;
+	if (matches == OF_NO_DEEP_PROBE)
+		return -EINVAL;
+	if (root == NULL)
+		root = of_find_node_by_path("/");
+	else
+		of_node_get(root);
+
+	pr_debug("of_platform_bus_probe()\n");
+	pr_debug(" starting at: %s\n", root->full_name);
+
+	/* Do a self check of bus type, if there's a match, create
+	 * children
+	 */
+	if (of_match_node(matches, root)) {
+		pr_debug(" root match, create all sub devices\n");
+		dev = of_platform_device_create(root, NULL, parent);
+		if (dev == NULL) {
+			rc = -ENOMEM;
+			goto bail;
+		}
+		pr_debug(" create all sub busses\n");
+		rc = of_platform_bus_create(root, matches, &dev->dev);
+		goto bail;
+	}
+	for_each_child_of_node(root, child) {
+		if (!of_match_node(matches, child))
+			continue;
+
+		pr_debug("  match: %s\n", child->full_name);
+		dev = of_platform_device_create(child, NULL, parent);
+		if (dev == NULL)
+			rc = -ENOMEM;
+		else
+			rc = of_platform_bus_create(child, matches, &dev->dev);
+		if (rc) {
+			of_node_put(child);
+			break;
+		}
+	}
+ bail:
+	of_node_put(root);
+	return rc;
+}
+EXPORT_SYMBOL(of_platform_bus_probe);
+#endif /* !CONFIG_SPARC */
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 1643d3761eb4..4bbba41396ef 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -25,6 +25,8 @@
  */
 extern struct bus_type of_platform_bus_type;
 
+extern const struct of_device_id of_default_bus_ids[];
+
 /*
  * An of_platform_driver driver is attached to a basic of_device on
  * the "platform bus" (of_platform_bus_type).
@@ -63,6 +65,21 @@ static inline void of_unregister_platform_driver(struct of_platform_driver *drv)
 extern struct of_device *of_find_device_by_node(struct device_node *np);
 
 extern int of_bus_type_init(struct bus_type *bus, const char *name);
+
+#if !defined(CONFIG_SPARC) /* SPARC has its own device registration method */
+/* Platform devices and busses creation */
+extern struct of_device *of_platform_device_create(struct device_node *np,
+						   const char *bus_id,
+						   struct device *parent);
+
+/* pseudo "matches" value to not do deep probe */
+#define OF_NO_DEEP_PROBE ((struct of_device_id *)-1)
+
+extern int of_platform_bus_probe(struct device_node *root,
+				 const struct of_device_id *matches,
+				 struct device *parent);
+#endif /* !CONFIG_SPARC */
+
 #endif /* CONFIG_OF_DEVICE */
 
 #endif	/* _LINUX_OF_PLATFORM_H */
-- 
cgit v1.2.3-59-g8ed1b


From 94c0931983ee9d1cd96c32d52ac64c17464f0bbd Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:14 -0600
Subject: of: Merge of_device_alloc() and of_device_make_bus_id()

This patch merges the common routines of_device_alloc() and
of_device_make_bus_id() from powerpc and microblaze.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
CC: Michal Simek <monstr@monstr.eu>
CC: Grant Likely <grant.likely@secretlab.ca>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: microblaze-uclinux@itee.uq.edu.au
CC: linuxppc-dev@ozlabs.org
CC: devicetree-discuss@lists.ozlabs.org
---
 arch/microblaze/include/asm/of_device.h | 15 ------
 arch/microblaze/kernel/Makefile         |  2 +-
 arch/microblaze/kernel/of_device.c      | 64 ------------------------
 arch/powerpc/include/asm/of_device.h    | 10 ----
 arch/powerpc/kernel/Makefile            |  2 +-
 arch/powerpc/kernel/of_device.c         | 84 -------------------------------
 drivers/of/platform.c                   | 87 +++++++++++++++++++++++++++++++++
 include/linux/of_platform.h             |  3 ++
 8 files changed, 92 insertions(+), 175 deletions(-)
 delete mode 100644 arch/microblaze/kernel/of_device.c
 delete mode 100644 arch/powerpc/kernel/of_device.c

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h
index c9be53487434..47e8d42aee8f 100644
--- a/arch/microblaze/include/asm/of_device.h
+++ b/arch/microblaze/include/asm/of_device.h
@@ -10,19 +10,4 @@
 
 #ifndef _ASM_MICROBLAZE_OF_DEVICE_H
 #define _ASM_MICROBLAZE_OF_DEVICE_H
-#ifdef __KERNEL__
-
-#include <linux/device.h>
-#include <linux/of.h>
-
-extern struct of_device *of_device_alloc(struct device_node *np,
-					 const char *bus_id,
-					 struct device *parent);
-
-extern void of_device_make_bus_id(struct of_device *dev);
-
-/* This is just here during the transition */
-#include <linux/of_device.h>
-
-#endif /* __KERNEL__ */
 #endif /* _ASM_MICROBLAZE_OF_DEVICE_H */
diff --git a/arch/microblaze/kernel/Makefile b/arch/microblaze/kernel/Makefile
index e51bc1520825..727e2cbff9c6 100644
--- a/arch/microblaze/kernel/Makefile
+++ b/arch/microblaze/kernel/Makefile
@@ -15,7 +15,7 @@ endif
 extra-y := head.o vmlinux.lds
 
 obj-y += dma.o exceptions.o \
-	hw_exception_handler.o init_task.o intc.o irq.o of_device.o \
+	hw_exception_handler.o init_task.o intc.o irq.o \
 	of_platform.o process.o prom.o prom_parse.o ptrace.o \
 	setup.o signal.o sys_microblaze.o timer.o traps.o reset.o
 
diff --git a/arch/microblaze/kernel/of_device.c b/arch/microblaze/kernel/of_device.c
deleted file mode 100644
index 3a367d788451..000000000000
--- a/arch/microblaze/kernel/of_device.c
+++ /dev/null
@@ -1,64 +0,0 @@
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/of.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mod_devicetable.h>
-#include <linux/slab.h>
-#include <linux/of_device.h>
-
-#include <linux/errno.h>
-
-void of_device_make_bus_id(struct of_device *dev)
-{
-	static atomic_t bus_no_reg_magic;
-	struct device_node *node = dev->dev.of_node;
-	const u32 *reg;
-	u64 addr;
-	int magic;
-
-	/*
-	 * For MMIO, get the physical address
-	 */
-	reg = of_get_property(node, "reg", NULL);
-	if (reg) {
-		addr = of_translate_address(node, reg);
-		if (addr != OF_BAD_ADDR) {
-			dev_set_name(&dev->dev, "%llx.%s",
-				     (unsigned long long)addr, node->name);
-			return;
-		}
-	}
-
-	/*
-	 * No BusID, use the node name and add a globally incremented
-	 * counter (and pray...)
-	 */
-	magic = atomic_add_return(1, &bus_no_reg_magic);
-	dev_set_name(&dev->dev, "%s.%d", node->name, magic - 1);
-}
-EXPORT_SYMBOL(of_device_make_bus_id);
-
-struct of_device *of_device_alloc(struct device_node *np,
-				  const char *bus_id,
-				  struct device *parent)
-{
-	struct of_device *dev;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return NULL;
-
-	dev->dev.of_node = of_node_get(np);
-	dev->dev.dma_mask = &dev->archdata.dma_mask;
-	dev->dev.parent = parent;
-	dev->dev.release = of_release_dev;
-
-	if (bus_id)
-		dev_set_name(&dev->dev, bus_id);
-	else
-		of_device_make_bus_id(dev);
-
-	return dev;
-}
-EXPORT_SYMBOL(of_device_alloc);
diff --git a/arch/powerpc/include/asm/of_device.h b/arch/powerpc/include/asm/of_device.h
index 5d5103cac641..04f76717f82c 100644
--- a/arch/powerpc/include/asm/of_device.h
+++ b/arch/powerpc/include/asm/of_device.h
@@ -1,13 +1,3 @@
 #ifndef _ASM_POWERPC_OF_DEVICE_H
 #define _ASM_POWERPC_OF_DEVICE_H
-#ifdef __KERNEL__
-
-#include <linux/device.h>
-#include <linux/of.h>
-
-extern struct of_device *of_device_alloc(struct device_node *np,
-					 const char *bus_id,
-					 struct device *parent);
-
-#endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_OF_DEVICE_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 58d0572de6f9..83aa1fd0908f 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o
 obj-$(CONFIG_PPC64)		+= vdso64/
 obj-$(CONFIG_ALTIVEC)		+= vecemu.o
 obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
-obj-$(CONFIG_PPC_OF)		+= of_device.o of_platform.o prom_parse.o
+obj-$(CONFIG_PPC_OF)		+= of_platform.o prom_parse.o
 obj-$(CONFIG_PPC_CLOCK)		+= clock.o
 procfs-y			:= proc_powerpc.o
 obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
diff --git a/arch/powerpc/kernel/of_device.c b/arch/powerpc/kernel/of_device.c
deleted file mode 100644
index db91a9dbafb5..000000000000
--- a/arch/powerpc/kernel/of_device.c
+++ /dev/null
@@ -1,84 +0,0 @@
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/of.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mod_devicetable.h>
-#include <linux/slab.h>
-#include <linux/of_device.h>
-
-#include <asm/errno.h>
-#include <asm/dcr.h>
-
-static void of_device_make_bus_id(struct of_device *dev)
-{
-	static atomic_t bus_no_reg_magic;
-	struct device_node *node = dev->dev.of_node;
-	const u32 *reg;
-	u64 addr;
-	int magic;
-
-	/*
-	 * If it's a DCR based device, use 'd' for native DCRs
-	 * and 'D' for MMIO DCRs.
-	 */
-#ifdef CONFIG_PPC_DCR
-	reg = of_get_property(node, "dcr-reg", NULL);
-	if (reg) {
-#ifdef CONFIG_PPC_DCR_NATIVE
-		dev_set_name(&dev->dev, "d%x.%s", *reg, node->name);
-#else /* CONFIG_PPC_DCR_NATIVE */
-		addr = of_translate_dcr_address(node, *reg, NULL);
-		if (addr != OF_BAD_ADDR) {
-			dev_set_name(&dev->dev, "D%llx.%s",
-				     (unsigned long long)addr, node->name);
-			return;
-		}
-#endif /* !CONFIG_PPC_DCR_NATIVE */
-	}
-#endif /* CONFIG_PPC_DCR */
-
-	/*
-	 * For MMIO, get the physical address
-	 */
-	reg = of_get_property(node, "reg", NULL);
-	if (reg) {
-		addr = of_translate_address(node, reg);
-		if (addr != OF_BAD_ADDR) {
-			dev_set_name(&dev->dev, "%llx.%s",
-				     (unsigned long long)addr, node->name);
-			return;
-		}
-	}
-
-	/*
-	 * No BusID, use the node name and add a globally incremented
-	 * counter (and pray...)
-	 */
-	magic = atomic_add_return(1, &bus_no_reg_magic);
-	dev_set_name(&dev->dev, "%s.%d", node->name, magic - 1);
-}
-
-struct of_device *of_device_alloc(struct device_node *np,
-				  const char *bus_id,
-				  struct device *parent)
-{
-	struct of_device *dev;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return NULL;
-
-	dev->dev.of_node = of_node_get(np);
-	dev->dev.dma_mask = &dev->archdata.dma_mask;
-	dev->dev.parent = parent;
-	dev->dev.release = of_release_dev;
-
-	if (bus_id)
-		dev_set_name(&dev->dev, "%s", bus_id);
-	else
-		of_device_make_bus_id(dev);
-
-	return dev;
-}
-EXPORT_SYMBOL(of_device_alloc);
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index d9c81e93bdd0..ea87a3cf7860 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -407,6 +407,93 @@ EXPORT_SYMBOL(of_unregister_driver);
  * mechanism for creating devices from device tree nodes.
  */
 
+/**
+ * of_device_make_bus_id - Use the device node data to assign a unique name
+ * @dev: pointer to device structure that is linked to a device tree node
+ *
+ * This routine will first try using either the dcr-reg or the reg property
+ * value to derive a unique name.  As a last resort it will use the node
+ * name followed by a unique number.
+ */
+static void of_device_make_bus_id(struct device *dev)
+{
+	static atomic_t bus_no_reg_magic;
+	struct device_node *node = dev->of_node;
+	const u32 *reg;
+	u64 addr;
+	int magic;
+
+#ifdef CONFIG_PPC_DCR
+	/*
+	 * If it's a DCR based device, use 'd' for native DCRs
+	 * and 'D' for MMIO DCRs.
+	 */
+	reg = of_get_property(node, "dcr-reg", NULL);
+	if (reg) {
+#ifdef CONFIG_PPC_DCR_NATIVE
+		dev_set_name(dev, "d%x.%s", *reg, node->name);
+#else /* CONFIG_PPC_DCR_NATIVE */
+		u64 addr = of_translate_dcr_address(node, *reg, NULL);
+		if (addr != OF_BAD_ADDR) {
+			dev_set_name(dev, "D%llx.%s",
+				     (unsigned long long)addr, node->name);
+			return;
+		}
+#endif /* !CONFIG_PPC_DCR_NATIVE */
+	}
+#endif /* CONFIG_PPC_DCR */
+
+	/*
+	 * For MMIO, get the physical address
+	 */
+	reg = of_get_property(node, "reg", NULL);
+	if (reg) {
+		addr = of_translate_address(node, reg);
+		if (addr != OF_BAD_ADDR) {
+			dev_set_name(dev, "%llx.%s",
+				     (unsigned long long)addr, node->name);
+			return;
+		}
+	}
+
+	/*
+	 * No BusID, use the node name and add a globally incremented
+	 * counter (and pray...)
+	 */
+	magic = atomic_add_return(1, &bus_no_reg_magic);
+	dev_set_name(dev, "%s.%d", node->name, magic - 1);
+}
+
+/**
+ * of_device_alloc - Allocate and initialize an of_device
+ * @np: device node to assign to device
+ * @bus_id: Name to assign to the device.  May be null to use default name.
+ * @parent: Parent device.
+ */
+struct of_device *of_device_alloc(struct device_node *np,
+				  const char *bus_id,
+				  struct device *parent)
+{
+	struct of_device *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+
+	dev->dev.of_node = of_node_get(np);
+	dev->dev.dma_mask = &dev->archdata.dma_mask;
+	dev->dev.parent = parent;
+	dev->dev.release = of_release_dev;
+
+	if (bus_id)
+		dev_set_name(&dev->dev, "%s", bus_id);
+	else
+		of_device_make_bus_id(&dev->dev);
+
+	return dev;
+}
+EXPORT_SYMBOL(of_device_alloc);
+
 /**
  * of_platform_device_create - Alloc, initialize and register an of_device
  * @np: pointer to node to create device for
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 4bbba41396ef..a51fd30176aa 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -60,6 +60,9 @@ static inline void of_unregister_platform_driver(struct of_platform_driver *drv)
 	of_unregister_driver(drv);
 }
 
+extern struct of_device *of_device_alloc(struct device_node *np,
+					 const char *bus_id,
+					 struct device *parent);
 #include <asm/of_platform.h>
 
 extern struct of_device *of_find_device_by_node(struct device_node *np);
-- 
cgit v1.2.3-59-g8ed1b


From a19e3da5bc5fc6c10ab73f310bea80f3845b4531 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 8 Jun 2010 07:48:16 -0600
Subject: of/gpio: Kill of_gpio_chip and add members directly to gpio_chip

The OF gpio infrastructure is great for describing GPIO connections within
the device tree.  However, using a GPIO binding still requires changes to
the gpio controller just to add an of_gpio structure.  In most cases, the
gpio controller doesn't actually need any special support and the simple
OF gpio mapping function is more than sufficient.  Additional, the current
scheme of using of_gpio_chip requires a convoluted scheme to maintain
1:1 mappings between of_gpio_chip and gpio_chip instances.

If the struct of_gpio_chip data members were moved into struct gpio_chip,
then it would simplify the processing of OF gpio bindings, and it would
make it trivial to use device tree OF connections on existing gpiolib
controller drivers.

This patch eliminates the of_gpio_chip structure and moves the relevant
fields into struct gpio_chip (conditional on CONFIG_OF_GPIO).  This move
simplifies the existing code and prepares for adding automatic device tree
support to existing drivers.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Cc: Bill Gatliff <bgat@billgatliff.com>
Cc: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Jean Delvare <khali@linux-fr.org>
---
 arch/microblaze/kernel/reset.c                 | 12 +++----
 arch/powerpc/platforms/52xx/mpc52xx_gpio.c     | 32 ++++++++---------
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c      | 29 ++++++++-------
 arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 15 ++++----
 arch/powerpc/platforms/86xx/gef_gpio.c         | 24 ++++++-------
 arch/powerpc/sysdev/cpm1.c                     | 12 +++----
 arch/powerpc/sysdev/cpm_common.c               |  6 ++--
 arch/powerpc/sysdev/mpc8xxx_gpio.c             |  6 ++--
 arch/powerpc/sysdev/ppc4xx_gpio.c              |  6 ++--
 arch/powerpc/sysdev/qe_lib/gpio.c              | 32 ++++++++---------
 arch/powerpc/sysdev/simple_gpio.c              |  6 ++--
 drivers/gpio/xilinx_gpio.c                     | 16 ++++-----
 drivers/of/gpio.c                              | 50 +++++++++++++-------------
 include/asm-generic/gpio.h                     | 12 +++++++
 include/linux/of_gpio.h                        | 29 +++------------
 15 files changed, 129 insertions(+), 158 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/kernel/reset.c b/arch/microblaze/kernel/reset.c
index a1721a33042e..5476d3caf045 100644
--- a/arch/microblaze/kernel/reset.c
+++ b/arch/microblaze/kernel/reset.c
@@ -24,8 +24,8 @@ static int of_reset_gpio_handle(void)
 	int ret; /* variable which stored handle reset gpio pin */
 	struct device_node *root; /* root node */
 	struct device_node *gpio; /* gpio node */
-	struct of_gpio_chip *of_gc = NULL;
-	enum of_gpio_flags flags ;
+	struct gpio_chip *gc;
+	u32 flags;
 	const void *gpio_spec;
 
 	/* find out root node */
@@ -39,19 +39,19 @@ static int of_reset_gpio_handle(void)
 		goto err0;
 	}
 
-	of_gc = gpio->data;
-	if (!of_gc) {
+	gc = gpio->data;
+	if (!gc) {
 		pr_debug("%s: gpio controller %s isn't registered\n",
 			 root->full_name, gpio->full_name);
 		ret = -ENODEV;
 		goto err1;
 	}
 
-	ret = of_gc->xlate(of_gc, root, gpio_spec, &flags);
+	ret = gc->of_xlate(gc, root, gpio_spec, &flags);
 	if (ret < 0)
 		goto err1;
 
-	ret += of_gc->gc.base;
+	ret += gc->base;
 err1:
 	of_node_put(gpio);
 err0:
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpio.c b/arch/powerpc/platforms/52xx/mpc52xx_gpio.c
index ca5305a5bd61..fd0912eeffe2 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpio.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpio.c
@@ -152,21 +152,21 @@ static int __devinit mpc52xx_wkup_gpiochip_probe(struct of_device *ofdev,
 {
 	struct mpc52xx_gpiochip *chip;
 	struct mpc52xx_gpio_wkup __iomem *regs;
-	struct of_gpio_chip *ofchip;
+	struct gpio_chip *gc;
 	int ret;
 
 	chip = kzalloc(sizeof(*chip), GFP_KERNEL);
 	if (!chip)
 		return -ENOMEM;
 
-	ofchip = &chip->mmchip.of_gc;
+	gc = &chip->mmchip.gc;
 
-	ofchip->gpio_cells          = 2;
-	ofchip->gc.ngpio            = 8;
-	ofchip->gc.direction_input  = mpc52xx_wkup_gpio_dir_in;
-	ofchip->gc.direction_output = mpc52xx_wkup_gpio_dir_out;
-	ofchip->gc.get              = mpc52xx_wkup_gpio_get;
-	ofchip->gc.set              = mpc52xx_wkup_gpio_set;
+	gc->of_gpio_n_cells  = 2;
+	gc->ngpio            = 8;
+	gc->direction_input  = mpc52xx_wkup_gpio_dir_in;
+	gc->direction_output = mpc52xx_wkup_gpio_dir_out;
+	gc->get              = mpc52xx_wkup_gpio_get;
+	gc->set              = mpc52xx_wkup_gpio_set;
 
 	ret = of_mm_gpiochip_add(ofdev->dev.of_node, &chip->mmchip);
 	if (ret)
@@ -315,7 +315,7 @@ static int __devinit mpc52xx_simple_gpiochip_probe(struct of_device *ofdev,
 					const struct of_device_id *match)
 {
 	struct mpc52xx_gpiochip *chip;
-	struct of_gpio_chip *ofchip;
+	struct gpio_chip *gc;
 	struct mpc52xx_gpio __iomem *regs;
 	int ret;
 
@@ -323,14 +323,14 @@ static int __devinit mpc52xx_simple_gpiochip_probe(struct of_device *ofdev,
 	if (!chip)
 		return -ENOMEM;
 
-	ofchip = &chip->mmchip.of_gc;
+	gc = &chip->mmchip.gc;
 
-	ofchip->gpio_cells          = 2;
-	ofchip->gc.ngpio            = 32;
-	ofchip->gc.direction_input  = mpc52xx_simple_gpio_dir_in;
-	ofchip->gc.direction_output = mpc52xx_simple_gpio_dir_out;
-	ofchip->gc.get              = mpc52xx_simple_gpio_get;
-	ofchip->gc.set              = mpc52xx_simple_gpio_set;
+	gc->of_gpio_n_cells  = 2;
+	gc->ngpio            = 32;
+	gc->direction_input  = mpc52xx_simple_gpio_dir_in;
+	gc->direction_output = mpc52xx_simple_gpio_dir_out;
+	gc->get              = mpc52xx_simple_gpio_get;
+	gc->set              = mpc52xx_simple_gpio_set;
 
 	ret = of_mm_gpiochip_add(ofdev->dev.of_node, &chip->mmchip);
 	if (ret)
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 46c93578cbf0..3f2ee47f1d01 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -78,7 +78,7 @@ MODULE_LICENSE("GPL");
  * @dev: pointer to device structure
  * @regs: virtual address of GPT registers
  * @lock: spinlock to coordinate between different functions.
- * @of_gc: of_gpio_chip instance structure; used when GPIO is enabled
+ * @gc: gpio_chip instance structure; used when GPIO is enabled
  * @irqhost: Pointer to irq_host instance; used when IRQ mode is supported
  * @wdt_mode: only relevant for gpt0: bit 0 (MPC52xx_GPT_CAN_WDT) indicates
  *   if the gpt may be used as wdt, bit 1 (MPC52xx_GPT_IS_WDT) indicates
@@ -94,7 +94,7 @@ struct mpc52xx_gpt_priv {
 	u8 wdt_mode;
 
 #if defined(CONFIG_GPIOLIB)
-	struct of_gpio_chip of_gc;
+	struct gpio_chip gc;
 #endif
 };
 
@@ -280,7 +280,7 @@ mpc52xx_gpt_irq_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node)
 #if defined(CONFIG_GPIOLIB)
 static inline struct mpc52xx_gpt_priv *gc_to_mpc52xx_gpt(struct gpio_chip *gc)
 {
-	return container_of(to_of_gpio_chip(gc), struct mpc52xx_gpt_priv,of_gc);
+	return container_of(gc, struct mpc52xx_gpt_priv, gc);
 }
 
 static int mpc52xx_gpt_gpio_get(struct gpio_chip *gc, unsigned int gpio)
@@ -336,28 +336,27 @@ mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node)
 	if (!of_find_property(node, "gpio-controller", NULL))
 		return;
 
-	gpt->of_gc.gc.label = kstrdup(node->full_name, GFP_KERNEL);
-	if (!gpt->of_gc.gc.label) {
+	gpt->gc.label = kstrdup(node->full_name, GFP_KERNEL);
+	if (!gpt->gc.label) {
 		dev_err(gpt->dev, "out of memory\n");
 		return;
 	}
 
-	gpt->of_gc.gpio_cells = 2;
-	gpt->of_gc.gc.ngpio = 1;
-	gpt->of_gc.gc.direction_input  = mpc52xx_gpt_gpio_dir_in;
-	gpt->of_gc.gc.direction_output = mpc52xx_gpt_gpio_dir_out;
-	gpt->of_gc.gc.get = mpc52xx_gpt_gpio_get;
-	gpt->of_gc.gc.set = mpc52xx_gpt_gpio_set;
-	gpt->of_gc.gc.base = -1;
-	gpt->of_gc.xlate = of_gpio_simple_xlate;
-	node->data = &gpt->of_gc;
+	gpt->gc.ngpio = 1;
+	gpt->gc.direction_input  = mpc52xx_gpt_gpio_dir_in;
+	gpt->gc.direction_output = mpc52xx_gpt_gpio_dir_out;
+	gpt->gc.get = mpc52xx_gpt_gpio_get;
+	gpt->gc.set = mpc52xx_gpt_gpio_set;
+	gpt->gc.base = -1;
+	gpt->gc.of_gpio_n_cells = 2;
+	gpt->gc.of_xlate = of_gpio_simple_xlate;
 	of_node_get(node);
 
 	/* Setup external pin in GPIO mode */
 	clrsetbits_be32(&gpt->regs->mode, MPC52xx_GPT_MODE_MS_MASK,
 			MPC52xx_GPT_MODE_MS_GPIO);
 
-	rc = gpiochip_add(&gpt->of_gc.gc);
+	rc = gpiochip_add(&gpt->gc);
 	if (rc)
 		dev_err(gpt->dev, "gpiochip_add() failed; rc=%i\n", rc);
 
diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
index d119a7c1c17a..e49f4bd2f991 100644
--- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
+++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
@@ -37,7 +37,7 @@ struct mcu {
 	struct mutex lock;
 	struct device_node *np;
 	struct i2c_client *client;
-	struct of_gpio_chip of_gc;
+	struct gpio_chip gc;
 	u8 reg_ctrl;
 };
 
@@ -56,8 +56,7 @@ static void mcu_power_off(void)
 
 static void mcu_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
 {
-	struct of_gpio_chip *of_gc = to_of_gpio_chip(gc);
-	struct mcu *mcu = container_of(of_gc, struct mcu, of_gc);
+	struct mcu *mcu = container_of(gc, struct mcu, gc);
 	u8 bit = 1 << (4 + gpio);
 
 	mutex_lock(&mcu->lock);
@@ -79,8 +78,7 @@ static int mcu_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
 static int mcu_gpiochip_add(struct mcu *mcu)
 {
 	struct device_node *np;
-	struct of_gpio_chip *of_gc = &mcu->of_gc;
-	struct gpio_chip *gc = &of_gc->gc;
+	struct gpio_chip *gc = &mcu->gc;
 	int ret;
 
 	np = of_find_compatible_node(NULL, NULL, "fsl,mcu-mpc8349emitx");
@@ -94,10 +92,9 @@ static int mcu_gpiochip_add(struct mcu *mcu)
 	gc->base = -1;
 	gc->set = mcu_gpio_set;
 	gc->direction_output = mcu_gpio_dir_out;
-	of_gc->gpio_cells = 2;
-	of_gc->xlate = of_gpio_simple_xlate;
+	gc->of_gpio_n_cells = 2;
+	gc->of_xlate = of_gpio_simple_xlate;
 
-	np->data = of_gc;
 	mcu->np = np;
 
 	/*
@@ -114,7 +111,7 @@ static int mcu_gpiochip_remove(struct mcu *mcu)
 {
 	int ret;
 
-	ret = gpiochip_remove(&mcu->of_gc.gc);
+	ret = gpiochip_remove(&mcu->gc);
 	if (ret)
 		return ret;
 	of_node_put(mcu->np);
diff --git a/arch/powerpc/platforms/86xx/gef_gpio.c b/arch/powerpc/platforms/86xx/gef_gpio.c
index b8cb08dbd89c..4ff7b1e7bbad 100644
--- a/arch/powerpc/platforms/86xx/gef_gpio.c
+++ b/arch/powerpc/platforms/86xx/gef_gpio.c
@@ -118,12 +118,12 @@ static int __init gef_gpio_init(void)
 		}
 
 		/* Setup pointers to chip functions */
-		gef_gpio_chip->of_gc.gpio_cells = 2;
-		gef_gpio_chip->of_gc.gc.ngpio = 19;
-		gef_gpio_chip->of_gc.gc.direction_input = gef_gpio_dir_in;
-		gef_gpio_chip->of_gc.gc.direction_output = gef_gpio_dir_out;
-		gef_gpio_chip->of_gc.gc.get = gef_gpio_get;
-		gef_gpio_chip->of_gc.gc.set = gef_gpio_set;
+		gef_gpio_chip->gc.of_gpio_n_cells = 2;
+		gef_gpio_chip->gc.ngpio = 19;
+		gef_gpio_chip->gc.direction_input = gef_gpio_dir_in;
+		gef_gpio_chip->gc.direction_output = gef_gpio_dir_out;
+		gef_gpio_chip->gc.get = gef_gpio_get;
+		gef_gpio_chip->gc.set = gef_gpio_set;
 
 		/* This function adds a memory mapped GPIO chip */
 		retval = of_mm_gpiochip_add(np, gef_gpio_chip);
@@ -146,12 +146,12 @@ static int __init gef_gpio_init(void)
 		}
 
 		/* Setup pointers to chip functions */
-		gef_gpio_chip->of_gc.gpio_cells = 2;
-		gef_gpio_chip->of_gc.gc.ngpio = 6;
-		gef_gpio_chip->of_gc.gc.direction_input = gef_gpio_dir_in;
-		gef_gpio_chip->of_gc.gc.direction_output = gef_gpio_dir_out;
-		gef_gpio_chip->of_gc.gc.get = gef_gpio_get;
-		gef_gpio_chip->of_gc.gc.set = gef_gpio_set;
+		gef_gpio_chip->gc.of_gpio_n_cells = 2;
+		gef_gpio_chip->gc.ngpio = 6;
+		gef_gpio_chip->gc.direction_input = gef_gpio_dir_in;
+		gef_gpio_chip->gc.direction_output = gef_gpio_dir_out;
+		gef_gpio_chip->gc.get = gef_gpio_get;
+		gef_gpio_chip->gc.set = gef_gpio_set;
 
 		/* This function adds a memory mapped GPIO chip */
 		retval = of_mm_gpiochip_add(np, gef_gpio_chip);
diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c
index 8d103ca6d6ab..d5cf7d4ccf81 100644
--- a/arch/powerpc/sysdev/cpm1.c
+++ b/arch/powerpc/sysdev/cpm1.c
@@ -621,7 +621,6 @@ int cpm1_gpiochip_add16(struct device_node *np)
 {
 	struct cpm1_gpio16_chip *cpm1_gc;
 	struct of_mm_gpio_chip *mm_gc;
-	struct of_gpio_chip *of_gc;
 	struct gpio_chip *gc;
 
 	cpm1_gc = kzalloc(sizeof(*cpm1_gc), GFP_KERNEL);
@@ -631,11 +630,10 @@ int cpm1_gpiochip_add16(struct device_node *np)
 	spin_lock_init(&cpm1_gc->lock);
 
 	mm_gc = &cpm1_gc->mm_gc;
-	of_gc = &mm_gc->of_gc;
-	gc = &of_gc->gc;
+	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = cpm1_gpio16_save_regs;
-	of_gc->gpio_cells = 2;
+	gc->of_gpio_n_cells = 2;
 	gc->ngpio = 16;
 	gc->direction_input = cpm1_gpio16_dir_in;
 	gc->direction_output = cpm1_gpio16_dir_out;
@@ -745,7 +743,6 @@ int cpm1_gpiochip_add32(struct device_node *np)
 {
 	struct cpm1_gpio32_chip *cpm1_gc;
 	struct of_mm_gpio_chip *mm_gc;
-	struct of_gpio_chip *of_gc;
 	struct gpio_chip *gc;
 
 	cpm1_gc = kzalloc(sizeof(*cpm1_gc), GFP_KERNEL);
@@ -755,11 +752,10 @@ int cpm1_gpiochip_add32(struct device_node *np)
 	spin_lock_init(&cpm1_gc->lock);
 
 	mm_gc = &cpm1_gc->mm_gc;
-	of_gc = &mm_gc->of_gc;
-	gc = &of_gc->gc;
+	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = cpm1_gpio32_save_regs;
-	of_gc->gpio_cells = 2;
+	gc->of_gpio_n_cells = 2;
 	gc->ngpio = 32;
 	gc->direction_input = cpm1_gpio32_dir_in;
 	gc->direction_output = cpm1_gpio32_dir_out;
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index 88b9812c854f..67e9b47dcf8e 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -325,7 +325,6 @@ int cpm2_gpiochip_add32(struct device_node *np)
 {
 	struct cpm2_gpio32_chip *cpm2_gc;
 	struct of_mm_gpio_chip *mm_gc;
-	struct of_gpio_chip *of_gc;
 	struct gpio_chip *gc;
 
 	cpm2_gc = kzalloc(sizeof(*cpm2_gc), GFP_KERNEL);
@@ -335,11 +334,10 @@ int cpm2_gpiochip_add32(struct device_node *np)
 	spin_lock_init(&cpm2_gc->lock);
 
 	mm_gc = &cpm2_gc->mm_gc;
-	of_gc = &mm_gc->of_gc;
-	gc = &of_gc->gc;
+	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = cpm2_gpio32_save_regs;
-	of_gc->gpio_cells = 2;
+	gc->of_gpio_n_cells = 2;
 	gc->ngpio = 32;
 	gc->direction_input = cpm2_gpio32_dir_in;
 	gc->direction_output = cpm2_gpio32_dir_out;
diff --git a/arch/powerpc/sysdev/mpc8xxx_gpio.c b/arch/powerpc/sysdev/mpc8xxx_gpio.c
index 83f519655fac..ec8fcd42101e 100644
--- a/arch/powerpc/sysdev/mpc8xxx_gpio.c
+++ b/arch/powerpc/sysdev/mpc8xxx_gpio.c
@@ -257,7 +257,6 @@ static void __init mpc8xxx_add_controller(struct device_node *np)
 {
 	struct mpc8xxx_gpio_chip *mpc8xxx_gc;
 	struct of_mm_gpio_chip *mm_gc;
-	struct of_gpio_chip *of_gc;
 	struct gpio_chip *gc;
 	unsigned hwirq;
 	int ret;
@@ -271,11 +270,10 @@ static void __init mpc8xxx_add_controller(struct device_node *np)
 	spin_lock_init(&mpc8xxx_gc->lock);
 
 	mm_gc = &mpc8xxx_gc->mm_gc;
-	of_gc = &mm_gc->of_gc;
-	gc = &of_gc->gc;
+	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = mpc8xxx_gpio_save_regs;
-	of_gc->gpio_cells = 2;
+	gc->of_gpio_n_cells = 2;
 	gc->ngpio = MPC8XXX_GPIO_PINS;
 	gc->direction_input = mpc8xxx_gpio_dir_in;
 	gc->direction_output = mpc8xxx_gpio_dir_out;
diff --git a/arch/powerpc/sysdev/ppc4xx_gpio.c b/arch/powerpc/sysdev/ppc4xx_gpio.c
index 3812fc366bec..42e7a5eea665 100644
--- a/arch/powerpc/sysdev/ppc4xx_gpio.c
+++ b/arch/powerpc/sysdev/ppc4xx_gpio.c
@@ -181,7 +181,6 @@ static int __init ppc4xx_add_gpiochips(void)
 		int ret;
 		struct ppc4xx_gpio_chip *ppc4xx_gc;
 		struct of_mm_gpio_chip *mm_gc;
-		struct of_gpio_chip *of_gc;
 		struct gpio_chip *gc;
 
 		ppc4xx_gc = kzalloc(sizeof(*ppc4xx_gc), GFP_KERNEL);
@@ -193,10 +192,9 @@ static int __init ppc4xx_add_gpiochips(void)
 		spin_lock_init(&ppc4xx_gc->lock);
 
 		mm_gc = &ppc4xx_gc->mm_gc;
-		of_gc = &mm_gc->of_gc;
-		gc = &of_gc->gc;
+		gc = &mm_gc->gc;
 
-		of_gc->gpio_cells = 2;
+		gc->of_gpio_n_cells = 2;
 		gc->ngpio = 32;
 		gc->direction_input = ppc4xx_gpio_dir_in;
 		gc->direction_output = ppc4xx_gpio_dir_out;
diff --git a/arch/powerpc/sysdev/qe_lib/gpio.c b/arch/powerpc/sysdev/qe_lib/gpio.c
index dc8f8d618074..194478c2f4b4 100644
--- a/arch/powerpc/sysdev/qe_lib/gpio.c
+++ b/arch/powerpc/sysdev/qe_lib/gpio.c
@@ -138,8 +138,8 @@ struct qe_pin {
 struct qe_pin *qe_pin_request(struct device_node *np, int index)
 {
 	struct qe_pin *qe_pin;
-	struct device_node *gc;
-	struct of_gpio_chip *of_gc = NULL;
+	struct device_node *gpio_np;
+	struct gpio_chip *gc;
 	struct of_mm_gpio_chip *mm_gc;
 	struct qe_gpio_chip *qe_gc;
 	int err;
@@ -155,40 +155,40 @@ struct qe_pin *qe_pin_request(struct device_node *np, int index)
 	}
 
 	err = of_parse_phandles_with_args(np, "gpios", "#gpio-cells", index,
-					  &gc, &gpio_spec);
+					  &gpio_np, &gpio_spec);
 	if (err) {
 		pr_debug("%s: can't parse gpios property\n", __func__);
 		goto err0;
 	}
 
-	if (!of_device_is_compatible(gc, "fsl,mpc8323-qe-pario-bank")) {
+	if (!of_device_is_compatible(gpio_np, "fsl,mpc8323-qe-pario-bank")) {
 		pr_debug("%s: tried to get a non-qe pin\n", __func__);
 		err = -EINVAL;
 		goto err1;
 	}
 
-	of_gc = gc->data;
-	if (!of_gc) {
+	gc = gpio_np->data;
+	if (!gc) {
 		pr_debug("%s: gpio controller %s isn't registered\n",
-			 np->full_name, gc->full_name);
+			 np->full_name, gpio_np->full_name);
 		err = -ENODEV;
 		goto err1;
 	}
 
-	gpio_cells = of_get_property(gc, "#gpio-cells", &size);
+	gpio_cells = of_get_property(gpio_np, "#gpio-cells", &size);
 	if (!gpio_cells || size != sizeof(*gpio_cells) ||
-			*gpio_cells != of_gc->gpio_cells) {
+			*gpio_cells != gc->of_gpio_n_cells) {
 		pr_debug("%s: wrong #gpio-cells for %s\n",
-			 np->full_name, gc->full_name);
+			 np->full_name, gpio_np->full_name);
 		err = -EINVAL;
 		goto err1;
 	}
 
-	err = of_gc->xlate(of_gc, np, gpio_spec, NULL);
+	err = gc->of_xlate(gc, np, gpio_spec, NULL);
 	if (err < 0)
 		goto err1;
 
-	mm_gc = to_of_mm_gpio_chip(&of_gc->gc);
+	mm_gc = to_of_mm_gpio_chip(gc);
 	qe_gc = to_qe_gpio_chip(mm_gc);
 
 	spin_lock_irqsave(&qe_gc->lock, flags);
@@ -206,7 +206,7 @@ struct qe_pin *qe_pin_request(struct device_node *np, int index)
 	if (!err)
 		return qe_pin;
 err1:
-	of_node_put(gc);
+	of_node_put(gpio_np);
 err0:
 	kfree(qe_pin);
 	pr_debug("%s failed with status %d\n", __func__, err);
@@ -307,7 +307,6 @@ static int __init qe_add_gpiochips(void)
 		int ret;
 		struct qe_gpio_chip *qe_gc;
 		struct of_mm_gpio_chip *mm_gc;
-		struct of_gpio_chip *of_gc;
 		struct gpio_chip *gc;
 
 		qe_gc = kzalloc(sizeof(*qe_gc), GFP_KERNEL);
@@ -319,11 +318,10 @@ static int __init qe_add_gpiochips(void)
 		spin_lock_init(&qe_gc->lock);
 
 		mm_gc = &qe_gc->mm_gc;
-		of_gc = &mm_gc->of_gc;
-		gc = &of_gc->gc;
+		gc = &mm_gc->gc;
 
 		mm_gc->save_regs = qe_gpio_save_regs;
-		of_gc->gpio_cells = 2;
+		gc->of_gpio_n_cells = 2;
 		gc->ngpio = QE_PIO_PINS;
 		gc->direction_input = qe_gpio_dir_in;
 		gc->direction_output = qe_gpio_dir_out;
diff --git a/arch/powerpc/sysdev/simple_gpio.c b/arch/powerpc/sysdev/simple_gpio.c
index d5fb173e588c..b7559aa0c165 100644
--- a/arch/powerpc/sysdev/simple_gpio.c
+++ b/arch/powerpc/sysdev/simple_gpio.c
@@ -91,7 +91,6 @@ static int __init u8_simple_gpiochip_add(struct device_node *np)
 	int ret;
 	struct u8_gpio_chip *u8_gc;
 	struct of_mm_gpio_chip *mm_gc;
-	struct of_gpio_chip *of_gc;
 	struct gpio_chip *gc;
 
 	u8_gc = kzalloc(sizeof(*u8_gc), GFP_KERNEL);
@@ -101,11 +100,10 @@ static int __init u8_simple_gpiochip_add(struct device_node *np)
 	spin_lock_init(&u8_gc->lock);
 
 	mm_gc = &u8_gc->mm_gc;
-	of_gc = &mm_gc->of_gc;
-	gc = &of_gc->gc;
+	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = u8_gpio_save_regs;
-	of_gc->gpio_cells = 2;
+	gc->of_gpio_n_cells = 2;
 	gc->ngpio = 8;
 	gc->direction_input = u8_gpio_dir_in;
 	gc->direction_output = u8_gpio_dir_out;
diff --git a/drivers/gpio/xilinx_gpio.c b/drivers/gpio/xilinx_gpio.c
index b8fa65b5bfca..2993c40b48e1 100644
--- a/drivers/gpio/xilinx_gpio.c
+++ b/drivers/gpio/xilinx_gpio.c
@@ -161,14 +161,12 @@ static void xgpio_save_regs(struct of_mm_gpio_chip *mm_gc)
 static int __devinit xgpio_of_probe(struct device_node *np)
 {
 	struct xgpio_instance *chip;
-	struct of_gpio_chip *ofchip;
 	int status = 0;
 	const u32 *tree_info;
 
 	chip = kzalloc(sizeof(*chip), GFP_KERNEL);
 	if (!chip)
 		return -ENOMEM;
-	ofchip = &chip->mmchip.of_gc;
 
 	/* Update GPIO state shadow register with default value */
 	tree_info = of_get_property(np, "xlnx,dout-default", NULL);
@@ -182,21 +180,21 @@ static int __devinit xgpio_of_probe(struct device_node *np)
 		chip->gpio_dir = *tree_info;
 
 	/* Check device node and parent device node for device width */
-	ofchip->gc.ngpio = 32; /* By default assume full GPIO controller */
+	chip->mmchip.gc.ngpio = 32; /* By default assume full GPIO controller */
 	tree_info = of_get_property(np, "xlnx,gpio-width", NULL);
 	if (!tree_info)
 		tree_info = of_get_property(np->parent,
 					    "xlnx,gpio-width", NULL);
 	if (tree_info)
-		ofchip->gc.ngpio = *tree_info;
+		chip->mmchip.gc.ngpio = *tree_info;
 
 	spin_lock_init(&chip->gpio_lock);
 
-	ofchip->gpio_cells = 2;
-	ofchip->gc.direction_input = xgpio_dir_in;
-	ofchip->gc.direction_output = xgpio_dir_out;
-	ofchip->gc.get = xgpio_get;
-	ofchip->gc.set = xgpio_set;
+	chip->mmchip.gc.of_gpio_n_cells = 2;
+	chip->mmchip.gc.direction_input = xgpio_dir_in;
+	chip->mmchip.gc.direction_output = xgpio_dir_out;
+	chip->mmchip.gc.get = xgpio_get;
+	chip->mmchip.gc.set = xgpio_set;
 
 	chip->mmchip.save_regs = xgpio_save_regs;
 
diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c
index a1b31a4abae4..fde53a3a45a3 100644
--- a/drivers/of/gpio.c
+++ b/drivers/of/gpio.c
@@ -33,32 +33,32 @@ int of_get_gpio_flags(struct device_node *np, int index,
 		      enum of_gpio_flags *flags)
 {
 	int ret;
-	struct device_node *gc;
-	struct of_gpio_chip *of_gc = NULL;
+	struct device_node *gpio_np;
+	struct gpio_chip *gc;
 	int size;
 	const void *gpio_spec;
 	const __be32 *gpio_cells;
 
 	ret = of_parse_phandles_with_args(np, "gpios", "#gpio-cells", index,
-					  &gc, &gpio_spec);
+					  &gpio_np, &gpio_spec);
 	if (ret) {
 		pr_debug("%s: can't parse gpios property\n", __func__);
 		goto err0;
 	}
 
-	of_gc = gc->data;
-	if (!of_gc) {
+	gc = gpio_np->data;
+	if (!gc) {
 		pr_debug("%s: gpio controller %s isn't registered\n",
-			 np->full_name, gc->full_name);
+			 np->full_name, gpio_np->full_name);
 		ret = -ENODEV;
 		goto err1;
 	}
 
-	gpio_cells = of_get_property(gc, "#gpio-cells", &size);
+	gpio_cells = of_get_property(gpio_np, "#gpio-cells", &size);
 	if (!gpio_cells || size != sizeof(*gpio_cells) ||
-			be32_to_cpup(gpio_cells) != of_gc->gpio_cells) {
+			be32_to_cpup(gpio_cells) != gc->of_gpio_n_cells) {
 		pr_debug("%s: wrong #gpio-cells for %s\n",
-			 np->full_name, gc->full_name);
+			 np->full_name, gpio_np->full_name);
 		ret = -EINVAL;
 		goto err1;
 	}
@@ -67,13 +67,13 @@ int of_get_gpio_flags(struct device_node *np, int index,
 	if (flags)
 		*flags = 0;
 
-	ret = of_gc->xlate(of_gc, np, gpio_spec, flags);
+	ret = gc->of_xlate(gc, np, gpio_spec, flags);
 	if (ret < 0)
 		goto err1;
 
-	ret += of_gc->gc.base;
+	ret += gc->base;
 err1:
-	of_node_put(gc);
+	of_node_put(gpio_np);
 err0:
 	pr_debug("%s exited with status %d\n", __func__, ret);
 	return ret;
@@ -116,7 +116,7 @@ EXPORT_SYMBOL(of_gpio_count);
 
 /**
  * of_gpio_simple_xlate - translate gpio_spec to the GPIO number and flags
- * @of_gc:	pointer to the of_gpio_chip structure
+ * @gc:		pointer to the gpio_chip structure
  * @np:		device node of the GPIO chip
  * @gpio_spec:	gpio specifier as found in the device tree
  * @flags:	a flags pointer to fill in
@@ -125,8 +125,8 @@ EXPORT_SYMBOL(of_gpio_count);
  * gpio chips. This function performs only one sanity check: whether gpio
  * is less than ngpios (that is specified in the gpio_chip).
  */
-int of_gpio_simple_xlate(struct of_gpio_chip *of_gc, struct device_node *np,
-			 const void *gpio_spec, enum of_gpio_flags *flags)
+int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np,
+			 const void *gpio_spec, u32 *flags)
 {
 	const __be32 *gpio = gpio_spec;
 	const u32 n = be32_to_cpup(gpio);
@@ -137,12 +137,12 @@ int of_gpio_simple_xlate(struct of_gpio_chip *of_gc, struct device_node *np,
 	 * number and the flags from a single gpio cell -- this is possible,
 	 * but not recommended).
 	 */
-	if (of_gc->gpio_cells < 2) {
+	if (gc->of_gpio_n_cells < 2) {
 		WARN_ON(1);
 		return -EINVAL;
 	}
 
-	if (n > of_gc->gc.ngpio)
+	if (n > gc->ngpio)
 		return -EINVAL;
 
 	if (flags)
@@ -161,10 +161,8 @@ EXPORT_SYMBOL(of_gpio_simple_xlate);
  *
  * 1) In the gpio_chip structure:
  *    - all the callbacks
- *
- * 2) In the of_gpio_chip structure:
- *    - gpio_cells
- *    - xlate callback (optional)
+ *    - of_gpio_n_cells
+ *    - of_xlate callback (optional)
  *
  * 3) In the of_mm_gpio_chip structure:
  *    - save_regs callback (optional)
@@ -177,8 +175,7 @@ int of_mm_gpiochip_add(struct device_node *np,
 		       struct of_mm_gpio_chip *mm_gc)
 {
 	int ret = -ENOMEM;
-	struct of_gpio_chip *of_gc = &mm_gc->of_gc;
-	struct gpio_chip *gc = &of_gc->gc;
+	struct gpio_chip *gc = &mm_gc->gc;
 
 	gc->label = kstrdup(np->full_name, GFP_KERNEL);
 	if (!gc->label)
@@ -190,13 +187,14 @@ int of_mm_gpiochip_add(struct device_node *np,
 
 	gc->base = -1;
 
-	if (!of_gc->xlate)
-		of_gc->xlate = of_gpio_simple_xlate;
+	if (!gc->of_xlate)
+		gc->of_xlate = of_gpio_simple_xlate;
 
 	if (mm_gc->save_regs)
 		mm_gc->save_regs(mm_gc);
 
-	np->data = of_gc;
+	np->data = &mm_gc->gc;
+	mm_gc->gc.of_node = np;
 
 	ret = gpiochip_add(gc);
 	if (ret)
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 4f3d75e1ad39..af2544ef0b59 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -31,6 +31,7 @@ static inline int gpio_is_valid(int number)
 struct device;
 struct seq_file;
 struct module;
+struct device_node;
 
 /**
  * struct gpio_chip - abstract a GPIO controller
@@ -106,6 +107,17 @@ struct gpio_chip {
 	const char		*const *names;
 	unsigned		can_sleep:1;
 	unsigned		exported:1;
+
+#if defined(CONFIG_OF_GPIO)
+	/*
+	 * If CONFIG_OF is enabled, then all GPIO controllers described in the
+	 * device tree automatically may have an OF translation
+	 */
+	struct device_node *of_node;
+	int of_gpio_n_cells;
+	int (*of_xlate)(struct gpio_chip *gc, struct device_node *np,
+		        const void *gpio_spec, u32 *flags);
+#endif
 };
 
 extern const char *gpiochip_is_requested(struct gpio_chip *chip,
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index fc2472c3c254..460d6810c5eb 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -32,35 +32,18 @@ enum of_gpio_flags {
 
 #ifdef CONFIG_OF_GPIO
 
-/*
- * Generic OF GPIO chip
- */
-struct of_gpio_chip {
-	struct gpio_chip gc;
-	int gpio_cells;
-	int (*xlate)(struct of_gpio_chip *of_gc, struct device_node *np,
-		     const void *gpio_spec, enum of_gpio_flags *flags);
-};
-
-static inline struct of_gpio_chip *to_of_gpio_chip(struct gpio_chip *gc)
-{
-	return container_of(gc, struct of_gpio_chip, gc);
-}
-
 /*
  * OF GPIO chip for memory mapped banks
  */
 struct of_mm_gpio_chip {
-	struct of_gpio_chip of_gc;
+	struct gpio_chip gc;
 	void (*save_regs)(struct of_mm_gpio_chip *mm_gc);
 	void __iomem *regs;
 };
 
 static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc)
 {
-	struct of_gpio_chip *of_gc = to_of_gpio_chip(gc);
-
-	return container_of(of_gc, struct of_mm_gpio_chip, of_gc);
+	return container_of(gc, struct of_mm_gpio_chip, gc);
 }
 
 extern int of_get_gpio_flags(struct device_node *np, int index,
@@ -69,11 +52,9 @@ extern unsigned int of_gpio_count(struct device_node *np);
 
 extern int of_mm_gpiochip_add(struct device_node *np,
 			      struct of_mm_gpio_chip *mm_gc);
-extern int of_gpio_simple_xlate(struct of_gpio_chip *of_gc,
-				struct device_node *np,
-				const void *gpio_spec,
-				enum of_gpio_flags *flags);
-#else
+extern int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np,
+				const void *gpio_spec, u32 *flags);
+#else /* CONFIG_OF_GPIO */
 
 /* Drivers may not strictly depend on the GPIO support, so let them link. */
 static inline int of_get_gpio_flags(struct device_node *np, int index,
-- 
cgit v1.2.3-59-g8ed1b


From 594fa265e084073443390c5b93d5410fd28e9bcd Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:16 -0600
Subject: of/gpio: stop using device_node data pointer to find gpio_chip

Currently the kernel uses the struct device_node.data pointer to resolve
a struct gpio_chip pointer from a device tree node.  However, the .data
member doesn't provide any type checking and there aren't any rules
enforced on what it should be used for.  There's no guarantee that the
data stored in it actually points to an gpio_chip pointer.

Instead of relying on the .data pointer, this patch modifies the code
to add a lookup function which scans through the registered gpio_chips
and returns the gpio_chip that has a pointer to the specified
device_node.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Anton Vorontsov <avorontsov@ru.mvista.com>
CC: Grant Likely <grant.likely@secretlab.ca>
CC: David Brownell <dbrownell@users.sourceforge.net>
CC: Bill Gatliff <bgat@billgatliff.com>
CC: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Jean Delvare <khali@linux-fr.org>
CC: linux-kernel@vger.kernel.org
CC: devicetree-discuss@lists.ozlabs.org
---
 arch/microblaze/kernel/reset.c                 |  2 +-
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c      |  1 +
 arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 23 +++---------------
 arch/powerpc/sysdev/qe_lib/gpio.c              |  2 +-
 drivers/gpio/gpiolib.c                         | 32 ++++++++++++++++++++++++++
 drivers/of/gpio.c                              | 15 +++++++++---
 include/asm-generic/gpio.h                     |  3 +++
 include/linux/of_gpio.h                        |  3 +++
 8 files changed, 56 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/kernel/reset.c b/arch/microblaze/kernel/reset.c
index 5476d3caf045..bd8ccab5ceff 100644
--- a/arch/microblaze/kernel/reset.c
+++ b/arch/microblaze/kernel/reset.c
@@ -39,7 +39,7 @@ static int of_reset_gpio_handle(void)
 		goto err0;
 	}
 
-	gc = gpio->data;
+	gc = of_node_to_gpiochip(gpio);
 	if (!gc) {
 		pr_debug("%s: gpio controller %s isn't registered\n",
 			 root->full_name, gpio->full_name);
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 3f2ee47f1d01..6e82bd27132c 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -350,6 +350,7 @@ mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node)
 	gpt->gc.base = -1;
 	gpt->gc.of_gpio_n_cells = 2;
 	gpt->gc.of_xlate = of_gpio_simple_xlate;
+	gpt->gc.of_node = node;
 	of_node_get(node);
 
 	/* Setup external pin in GPIO mode */
diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
index e49f4bd2f991..f0dbace6185a 100644
--- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
+++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
@@ -35,7 +35,6 @@
 
 struct mcu {
 	struct mutex lock;
-	struct device_node *np;
 	struct i2c_client *client;
 	struct gpio_chip gc;
 	u8 reg_ctrl;
@@ -79,7 +78,6 @@ static int mcu_gpiochip_add(struct mcu *mcu)
 {
 	struct device_node *np;
 	struct gpio_chip *gc = &mcu->gc;
-	int ret;
 
 	np = of_find_compatible_node(NULL, NULL, "fsl,mcu-mpc8349emitx");
 	if (!np)
@@ -94,29 +92,14 @@ static int mcu_gpiochip_add(struct mcu *mcu)
 	gc->direction_output = mcu_gpio_dir_out;
 	gc->of_gpio_n_cells = 2;
 	gc->of_xlate = of_gpio_simple_xlate;
+	gc->of_node = np;
 
-	mcu->np = np;
-
-	/*
-	 * We don't want to lose the node, its ->data and ->full_name...
-	 * So, if succeeded, we don't put the node here.
-	 */
-	ret = gpiochip_add(gc);
-	if (ret)
-		of_node_put(np);
-	return ret;
+	return gpiochip_add(gc);
 }
 
 static int mcu_gpiochip_remove(struct mcu *mcu)
 {
-	int ret;
-
-	ret = gpiochip_remove(&mcu->gc);
-	if (ret)
-		return ret;
-	of_node_put(mcu->np);
-
-	return 0;
+	return gpiochip_remove(&mcu->gc);
 }
 
 static int __devinit mcu_probe(struct i2c_client *client,
diff --git a/arch/powerpc/sysdev/qe_lib/gpio.c b/arch/powerpc/sysdev/qe_lib/gpio.c
index 194478c2f4b4..32e9440010a1 100644
--- a/arch/powerpc/sysdev/qe_lib/gpio.c
+++ b/arch/powerpc/sysdev/qe_lib/gpio.c
@@ -167,7 +167,7 @@ struct qe_pin *qe_pin_request(struct device_node *np, int index)
 		goto err1;
 	}
 
-	gc = gpio_np->data;
+	gc = of_node_to_gpiochip(gpio_np);
 	if (!gc) {
 		pr_debug("%s: gpio controller %s isn't registered\n",
 			 np->full_name, gpio_np->full_name);
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 713ca0e37f23..73fd328f6fe4 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1153,6 +1153,38 @@ int gpiochip_remove(struct gpio_chip *chip)
 }
 EXPORT_SYMBOL_GPL(gpiochip_remove);
 
+/**
+ * gpiochip_find() - iterator for locating a specific gpio_chip
+ * @data: data to pass to match function
+ * @callback: Callback function to check gpio_chip
+ *
+ * Similar to bus_find_device.  It returns a reference to a gpio_chip as
+ * determined by a user supplied @match callback.  The callback should return
+ * 0 if the device doesn't match and non-zero if it does.  If the callback is
+ * non-zero, this function will return to the caller and not iterate over any
+ * more gpio_chips.
+ */
+struct gpio_chip *gpiochip_find(void *data,
+				int (*match)(struct gpio_chip *chip, void *data))
+{
+	struct gpio_chip *chip = NULL;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&gpio_lock, flags);
+	for (i = 0; i < ARCH_NR_GPIOS; i++) {
+		if (!gpio_desc[i].chip)
+			continue;
+
+		if (match(gpio_desc[i].chip, data)) {
+			chip = gpio_desc[i].chip;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&gpio_lock, flags);
+
+	return chip;
+}
 
 /* These "optional" allocation calls help prevent drivers from stomping
  * on each other, and help provide better diagnostics in debugfs.
diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c
index fde53a3a45a3..c8618d3282cf 100644
--- a/drivers/of/gpio.c
+++ b/drivers/of/gpio.c
@@ -46,7 +46,7 @@ int of_get_gpio_flags(struct device_node *np, int index,
 		goto err0;
 	}
 
-	gc = gpio_np->data;
+	gc = of_node_to_gpiochip(gpio_np);
 	if (!gc) {
 		pr_debug("%s: gpio controller %s isn't registered\n",
 			 np->full_name, gpio_np->full_name);
@@ -193,7 +193,6 @@ int of_mm_gpiochip_add(struct device_node *np,
 	if (mm_gc->save_regs)
 		mm_gc->save_regs(mm_gc);
 
-	np->data = &mm_gc->gc;
 	mm_gc->gc.of_node = np;
 
 	ret = gpiochip_add(gc);
@@ -207,7 +206,6 @@ int of_mm_gpiochip_add(struct device_node *np,
 		 np->full_name, gc->base);
 	return 0;
 err2:
-	np->data = NULL;
 	iounmap(mm_gc->regs);
 err1:
 	kfree(gc->label);
@@ -217,3 +215,14 @@ err0:
 	return ret;
 }
 EXPORT_SYMBOL(of_mm_gpiochip_add);
+
+/* Private function for resolving node pointer to gpio_chip */
+static int of_gpiochip_is_match(struct gpio_chip *chip, void *data)
+{
+	return chip->of_node == data;
+}
+
+struct gpio_chip *of_node_to_gpiochip(struct device_node *np)
+{
+	return gpiochip_find(np, of_gpiochip_is_match);
+}
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index af2544ef0b59..c7376bf80b06 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -127,6 +127,9 @@ extern int __must_check gpiochip_reserve(int start, int ngpio);
 /* add/remove chips */
 extern int gpiochip_add(struct gpio_chip *chip);
 extern int __must_check gpiochip_remove(struct gpio_chip *chip);
+extern struct gpio_chip *gpiochip_find(void *data,
+					int (*match)(struct gpio_chip *chip,
+						     void *data));
 
 
 /* Always use the library code for GPIO management calls,
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 460d6810c5eb..1020587efed1 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -54,6 +54,9 @@ extern int of_mm_gpiochip_add(struct device_node *np,
 			      struct of_mm_gpio_chip *mm_gc);
 extern int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np,
 				const void *gpio_spec, u32 *flags);
+
+extern struct gpio_chip *of_node_to_gpiochip(struct device_node *np);
+
 #else /* CONFIG_OF_GPIO */
 
 /* Drivers may not strictly depend on the GPIO support, so let them link. */
-- 
cgit v1.2.3-59-g8ed1b


From 391c970c0dd1100e3b9e1681f7d0f20aac35455a Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 8 Jun 2010 07:48:17 -0600
Subject: of/gpio: add default of_xlate function if device has a node pointer

Implement generic OF gpio hooks and thus make device-enabled GPIO chips
(i.e.  the ones that have gpio_chip->dev specified) automatically attach
to the OpenFirmware subsystem.  Which means that now we can handle I2C and
SPI GPIO chips almost* transparently.

* "Almost" because some chips still require platform data, and for these
  chips OF-glue is still needed, though with this change the glue will
  be much smaller.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Cc: Bill Gatliff <bgat@billgatliff.com>
Cc: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
CC: linux-kernel@vger.kernel.org
CC: devicetree-discuss@lists.ozlabs.org
---
 arch/powerpc/platforms/52xx/mpc52xx_gpio.c     |  2 --
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c      |  3 ---
 arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c |  2 --
 arch/powerpc/sysdev/cpm1.c                     |  2 --
 arch/powerpc/sysdev/cpm_common.c               |  1 -
 arch/powerpc/sysdev/mpc8xxx_gpio.c             |  1 -
 arch/powerpc/sysdev/ppc4xx_gpio.c              |  1 -
 arch/powerpc/sysdev/qe_lib/gpio.c              |  1 -
 arch/powerpc/sysdev/simple_gpio.c              |  1 -
 drivers/gpio/gpiolib.c                         |  5 ++++
 drivers/gpio/xilinx_gpio.c                     |  1 -
 drivers/of/gpio.c                              | 33 +++++++++++++++++++-------
 include/linux/of_gpio.h                        |  7 ++++--
 13 files changed, 34 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpio.c b/arch/powerpc/platforms/52xx/mpc52xx_gpio.c
index fd0912eeffe2..0855e804fc0d 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpio.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpio.c
@@ -161,7 +161,6 @@ static int __devinit mpc52xx_wkup_gpiochip_probe(struct of_device *ofdev,
 
 	gc = &chip->mmchip.gc;
 
-	gc->of_gpio_n_cells  = 2;
 	gc->ngpio            = 8;
 	gc->direction_input  = mpc52xx_wkup_gpio_dir_in;
 	gc->direction_output = mpc52xx_wkup_gpio_dir_out;
@@ -325,7 +324,6 @@ static int __devinit mpc52xx_simple_gpiochip_probe(struct of_device *ofdev,
 
 	gc = &chip->mmchip.gc;
 
-	gc->of_gpio_n_cells  = 2;
 	gc->ngpio            = 32;
 	gc->direction_input  = mpc52xx_simple_gpio_dir_in;
 	gc->direction_output = mpc52xx_simple_gpio_dir_out;
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 6e82bd27132c..5d7d607617cd 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -348,10 +348,7 @@ mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node)
 	gpt->gc.get = mpc52xx_gpt_gpio_get;
 	gpt->gc.set = mpc52xx_gpt_gpio_set;
 	gpt->gc.base = -1;
-	gpt->gc.of_gpio_n_cells = 2;
-	gpt->gc.of_xlate = of_gpio_simple_xlate;
 	gpt->gc.of_node = node;
-	of_node_get(node);
 
 	/* Setup external pin in GPIO mode */
 	clrsetbits_be32(&gpt->regs->mode, MPC52xx_GPT_MODE_MS_MASK,
diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
index f0dbace6185a..59b0ed1a56b1 100644
--- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
+++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
@@ -90,8 +90,6 @@ static int mcu_gpiochip_add(struct mcu *mcu)
 	gc->base = -1;
 	gc->set = mcu_gpio_set;
 	gc->direction_output = mcu_gpio_dir_out;
-	gc->of_gpio_n_cells = 2;
-	gc->of_xlate = of_gpio_simple_xlate;
 	gc->of_node = np;
 
 	return gpiochip_add(gc);
diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c
index d5cf7d4ccf81..00852124ff4a 100644
--- a/arch/powerpc/sysdev/cpm1.c
+++ b/arch/powerpc/sysdev/cpm1.c
@@ -633,7 +633,6 @@ int cpm1_gpiochip_add16(struct device_node *np)
 	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = cpm1_gpio16_save_regs;
-	gc->of_gpio_n_cells = 2;
 	gc->ngpio = 16;
 	gc->direction_input = cpm1_gpio16_dir_in;
 	gc->direction_output = cpm1_gpio16_dir_out;
@@ -755,7 +754,6 @@ int cpm1_gpiochip_add32(struct device_node *np)
 	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = cpm1_gpio32_save_regs;
-	gc->of_gpio_n_cells = 2;
 	gc->ngpio = 32;
 	gc->direction_input = cpm1_gpio32_dir_in;
 	gc->direction_output = cpm1_gpio32_dir_out;
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index 67e9b47dcf8e..2b69aa0315b3 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -337,7 +337,6 @@ int cpm2_gpiochip_add32(struct device_node *np)
 	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = cpm2_gpio32_save_regs;
-	gc->of_gpio_n_cells = 2;
 	gc->ngpio = 32;
 	gc->direction_input = cpm2_gpio32_dir_in;
 	gc->direction_output = cpm2_gpio32_dir_out;
diff --git a/arch/powerpc/sysdev/mpc8xxx_gpio.c b/arch/powerpc/sysdev/mpc8xxx_gpio.c
index ec8fcd42101e..2b69084d0f0c 100644
--- a/arch/powerpc/sysdev/mpc8xxx_gpio.c
+++ b/arch/powerpc/sysdev/mpc8xxx_gpio.c
@@ -273,7 +273,6 @@ static void __init mpc8xxx_add_controller(struct device_node *np)
 	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = mpc8xxx_gpio_save_regs;
-	gc->of_gpio_n_cells = 2;
 	gc->ngpio = MPC8XXX_GPIO_PINS;
 	gc->direction_input = mpc8xxx_gpio_dir_in;
 	gc->direction_output = mpc8xxx_gpio_dir_out;
diff --git a/arch/powerpc/sysdev/ppc4xx_gpio.c b/arch/powerpc/sysdev/ppc4xx_gpio.c
index 42e7a5eea665..fc65ad1b3293 100644
--- a/arch/powerpc/sysdev/ppc4xx_gpio.c
+++ b/arch/powerpc/sysdev/ppc4xx_gpio.c
@@ -194,7 +194,6 @@ static int __init ppc4xx_add_gpiochips(void)
 		mm_gc = &ppc4xx_gc->mm_gc;
 		gc = &mm_gc->gc;
 
-		gc->of_gpio_n_cells = 2;
 		gc->ngpio = 32;
 		gc->direction_input = ppc4xx_gpio_dir_in;
 		gc->direction_output = ppc4xx_gpio_dir_out;
diff --git a/arch/powerpc/sysdev/qe_lib/gpio.c b/arch/powerpc/sysdev/qe_lib/gpio.c
index 32e9440010a1..36bf845df127 100644
--- a/arch/powerpc/sysdev/qe_lib/gpio.c
+++ b/arch/powerpc/sysdev/qe_lib/gpio.c
@@ -321,7 +321,6 @@ static int __init qe_add_gpiochips(void)
 		gc = &mm_gc->gc;
 
 		mm_gc->save_regs = qe_gpio_save_regs;
-		gc->of_gpio_n_cells = 2;
 		gc->ngpio = QE_PIO_PINS;
 		gc->direction_input = qe_gpio_dir_in;
 		gc->direction_output = qe_gpio_dir_out;
diff --git a/arch/powerpc/sysdev/simple_gpio.c b/arch/powerpc/sysdev/simple_gpio.c
index b7559aa0c165..b6defda5ccc9 100644
--- a/arch/powerpc/sysdev/simple_gpio.c
+++ b/arch/powerpc/sysdev/simple_gpio.c
@@ -103,7 +103,6 @@ static int __init u8_simple_gpiochip_add(struct device_node *np)
 	gc = &mm_gc->gc;
 
 	mm_gc->save_regs = u8_gpio_save_regs;
-	gc->of_gpio_n_cells = 2;
 	gc->ngpio = 8;
 	gc->direction_input = u8_gpio_dir_in;
 	gc->direction_output = u8_gpio_dir_out;
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 73fd328f6fe4..83cbc34e3a76 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -8,6 +8,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/gpio.h>
+#include <linux/of_gpio.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
 
@@ -1099,6 +1100,8 @@ int gpiochip_add(struct gpio_chip *chip)
 		}
 	}
 
+	of_gpiochip_add(chip);
+
 unlock:
 	spin_unlock_irqrestore(&gpio_lock, flags);
 
@@ -1133,6 +1136,8 @@ int gpiochip_remove(struct gpio_chip *chip)
 
 	spin_lock_irqsave(&gpio_lock, flags);
 
+	of_gpiochip_remove(chip);
+
 	for (id = chip->base; id < chip->base + chip->ngpio; id++) {
 		if (test_bit(FLAG_REQUESTED, &gpio_desc[id].flags)) {
 			status = -EBUSY;
diff --git a/drivers/gpio/xilinx_gpio.c b/drivers/gpio/xilinx_gpio.c
index 2993c40b48e1..709690995d0d 100644
--- a/drivers/gpio/xilinx_gpio.c
+++ b/drivers/gpio/xilinx_gpio.c
@@ -190,7 +190,6 @@ static int __devinit xgpio_of_probe(struct device_node *np)
 
 	spin_lock_init(&chip->gpio_lock);
 
-	chip->mmchip.gc.of_gpio_n_cells = 2;
 	chip->mmchip.gc.direction_input = xgpio_dir_in;
 	chip->mmchip.gc.direction_output = xgpio_dir_out;
 	chip->mmchip.gc.get = xgpio_get;
diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c
index c8618d3282cf..09f05a178668 100644
--- a/drivers/of/gpio.c
+++ b/drivers/of/gpio.c
@@ -125,8 +125,8 @@ EXPORT_SYMBOL(of_gpio_count);
  * gpio chips. This function performs only one sanity check: whether gpio
  * is less than ngpios (that is specified in the gpio_chip).
  */
-int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np,
-			 const void *gpio_spec, u32 *flags)
+static int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np,
+				const void *gpio_spec, u32 *flags)
 {
 	const __be32 *gpio = gpio_spec;
 	const u32 n = be32_to_cpup(gpio);
@@ -150,7 +150,6 @@ int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np,
 
 	return n;
 }
-EXPORT_SYMBOL(of_gpio_simple_xlate);
 
 /**
  * of_mm_gpiochip_add - Add memory mapped GPIO chip (bank)
@@ -187,9 +186,6 @@ int of_mm_gpiochip_add(struct device_node *np,
 
 	gc->base = -1;
 
-	if (!gc->of_xlate)
-		gc->of_xlate = of_gpio_simple_xlate;
-
 	if (mm_gc->save_regs)
 		mm_gc->save_regs(mm_gc);
 
@@ -199,9 +195,6 @@ int of_mm_gpiochip_add(struct device_node *np,
 	if (ret)
 		goto err2;
 
-	/* We don't want to lose the node and its ->data */
-	of_node_get(np);
-
 	pr_debug("%s: registered as generic GPIO chip, base is %d\n",
 		 np->full_name, gc->base);
 	return 0;
@@ -216,6 +209,28 @@ err0:
 }
 EXPORT_SYMBOL(of_mm_gpiochip_add);
 
+void of_gpiochip_add(struct gpio_chip *chip)
+{
+	if ((!chip->of_node) && (chip->dev))
+		chip->of_node = chip->dev->of_node;
+
+	if (!chip->of_node)
+		return;
+
+	if (!chip->of_xlate) {
+		chip->of_gpio_n_cells = 2;
+		chip->of_xlate = of_gpio_simple_xlate;
+	}
+
+	of_node_get(chip->of_node);
+}
+
+void of_gpiochip_remove(struct gpio_chip *chip)
+{
+	if (chip->of_node)
+		of_node_put(chip->of_node);
+}
+
 /* Private function for resolving node pointer to gpio_chip */
 static int of_gpiochip_is_match(struct gpio_chip *chip, void *data)
 {
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 1020587efed1..6598c04dab01 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -52,9 +52,9 @@ extern unsigned int of_gpio_count(struct device_node *np);
 
 extern int of_mm_gpiochip_add(struct device_node *np,
 			      struct of_mm_gpio_chip *mm_gc);
-extern int of_gpio_simple_xlate(struct gpio_chip *gc, struct device_node *np,
-				const void *gpio_spec, u32 *flags);
 
+extern void of_gpiochip_add(struct gpio_chip *gc);
+extern void of_gpiochip_remove(struct gpio_chip *gc);
 extern struct gpio_chip *of_node_to_gpiochip(struct device_node *np);
 
 #else /* CONFIG_OF_GPIO */
@@ -71,6 +71,9 @@ static inline unsigned int of_gpio_count(struct device_node *np)
 	return 0;
 }
 
+static inline void of_gpiochip_add(struct gpio_chip *gc) { }
+static inline void of_gpiochip_remove(struct gpio_chip *gc) { }
+
 #endif /* CONFIG_OF_GPIO */
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 8cec0e7b4c7c0b76f2b5285f250211ad81c3eafd Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:17 -0600
Subject: of/device: Add OF style matching helper function

Add of_driver_match_device() helper function.  This function can be used
by bus types to determine if a driver works with a device when using OF
style matching.  If CONFIG_OF is unselected, then it is a nop.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
CC: Greg Kroah-Hartman <gregkh@suse.de>
CC: Michal Simek <monstr@monstr.eu>
CC: Grant Likely <grant.likely@secretlab.ca>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: linux-kernel@vger.kernel.org
CC: microblaze-uclinux@itee.uq.edu.au
CC: linuxppc-dev@ozlabs.org
CC: devicetree-discuss@lists.ozlabs.org
---
 drivers/of/device.c       |  2 +-
 include/linux/of_device.h | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/of/device.c b/drivers/of/device.c
index c2a98f5ca80d..5282a202f5a9 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -20,7 +20,7 @@
 const struct of_device_id *of_match_device(const struct of_device_id *matches,
 					   const struct device *dev)
 {
-	if (!dev->of_node)
+	if ((!matches) || (!dev->of_node))
 		return NULL;
 	return of_match_node(matches, dev->of_node);
 }
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 238e92e007e6..91d75fb0c726 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -30,6 +30,17 @@
 extern const struct of_device_id *of_match_device(
 	const struct of_device_id *matches, const struct device *dev);
 
+/**
+ * of_driver_match_device - Tell if a driver's of_match_table matches a device.
+ * @drv: the device_driver structure to test
+ * @dev: the device structure to match against
+ */
+static inline int of_driver_match_device(const struct device *dev,
+					 const struct device_driver *drv)
+{
+	return of_match_device(drv->of_match_table, dev) != NULL;
+}
+
 extern struct of_device *of_dev_get(struct of_device *dev);
 extern void of_dev_put(struct of_device *dev);
 
@@ -48,6 +59,14 @@ extern ssize_t of_device_get_modalias(struct device *dev,
 extern int of_device_uevent(struct device *dev, struct kobj_uevent_env *env);
 
 
+#else /* CONFIG_OF_DEVICE */
+
+static inline int of_driver_match_device(struct device *dev,
+					 struct device_driver *drv)
+{
+	return 0;
+}
+
 #endif /* CONFIG_OF_DEVICE */
 
 #endif /* _LINUX_OF_DEVICE_H */
-- 
cgit v1.2.3-59-g8ed1b


From f9f5a4669f1334a558f102c311debfd008e7c2bc Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Wed, 9 Jun 2010 22:22:17 -0600
Subject: of/device: Move struct of_device define outside of CONFIG_OF_DEVICE
 test

Some code uses of_device even when CONFIG_OF_DEVICE is not set.  This
patch makes of_device valid all the time by moving it outside of the
ifdef CONFIG_OF_DEVICE test.

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
---
 include/linux/of_device.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 91d75fb0c726..7d27f5a878f6 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -1,13 +1,6 @@
 #ifndef _LINUX_OF_DEVICE_H
 #define _LINUX_OF_DEVICE_H
 
-#ifdef CONFIG_OF_DEVICE
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/of.h>
-#include <linux/mod_devicetable.h>
-
-
 /*
  * The of_device *was* a kind of "base class" that was a superset of
  * struct device for use by devices attached to an OF node and probed
@@ -22,7 +15,12 @@
  * from the kernel.
  */
 #define of_device platform_device
+#include <linux/platform_device.h>
 
+#ifdef CONFIG_OF_DEVICE
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/mod_devicetable.h>
 #include <asm/of_device.h>
 
 #define	to_of_device(d) container_of(d, struct of_device, dev)
-- 
cgit v1.2.3-59-g8ed1b


From 9fd049927ccba1c1d0343239b82f28c4e07fb95d Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:18 -0600
Subject: of/i2c: Generalize OF support

This patch cleans up the i2c OF support code to make it selectable by
all architectures and allow for automatic registration of i2c devices.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/i2c/busses/i2c-cpm.c     |  3 ++-
 drivers/i2c/busses/i2c-ibm_iic.c |  3 ++-
 drivers/i2c/busses/i2c-mpc.c     |  3 ++-
 drivers/of/Kconfig               |  2 +-
 drivers/of/of_i2c.c              | 50 +++++++++++++++++++++++-----------------
 include/linux/of_i2c.h           | 13 ++++++++---
 6 files changed, 46 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c
index b02b4533651d..03ae62e69596 100644
--- a/drivers/i2c/busses/i2c-cpm.c
+++ b/drivers/i2c/busses/i2c-cpm.c
@@ -652,6 +652,7 @@ static int __devinit cpm_i2c_probe(struct of_device *ofdev,
 	cpm->adap = cpm_ops;
 	i2c_set_adapdata(&cpm->adap, cpm);
 	cpm->adap.dev.parent = &ofdev->dev;
+	cpm->adap.dev.of_node = of_node_get(ofdev->dev.of_node);
 
 	result = cpm_i2c_setup(cpm);
 	if (result) {
@@ -679,7 +680,7 @@ static int __devinit cpm_i2c_probe(struct of_device *ofdev,
 	/*
 	 * register OF I2C devices
 	 */
-	of_register_i2c_devices(&cpm->adap, ofdev->dev.of_node);
+	of_i2c_register_devices(&cpm->adap);
 
 	return 0;
 out_shut:
diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c
index bf344135647a..d9641210dd3a 100644
--- a/drivers/i2c/busses/i2c-ibm_iic.c
+++ b/drivers/i2c/busses/i2c-ibm_iic.c
@@ -745,6 +745,7 @@ static int __devinit iic_probe(struct of_device *ofdev,
 	/* Register it with i2c layer */
 	adap = &dev->adap;
 	adap->dev.parent = &ofdev->dev;
+	adap->dev.of_node = of_node_get(np);
 	strlcpy(adap->name, "IBM IIC", sizeof(adap->name));
 	i2c_set_adapdata(adap, dev);
 	adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD;
@@ -761,7 +762,7 @@ static int __devinit iic_probe(struct of_device *ofdev,
 		 dev->fast_mode ? "fast (400 kHz)" : "standard (100 kHz)");
 
 	/* Now register all the child nodes */
-	of_register_i2c_devices(adap, np);
+	of_i2c_register_devices(adap);
 
 	return 0;
 
diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c
index df00eb1f11f9..d2e26d290e7a 100644
--- a/drivers/i2c/busses/i2c-mpc.c
+++ b/drivers/i2c/busses/i2c-mpc.c
@@ -600,13 +600,14 @@ static int __devinit fsl_i2c_probe(struct of_device *op,
 	i2c->adap = mpc_ops;
 	i2c_set_adapdata(&i2c->adap, i2c);
 	i2c->adap.dev.parent = &op->dev;
+	i2c->adap.dev.of_node = of_node_get(op->dev.of_node);
 
 	result = i2c_add_adapter(&i2c->adap);
 	if (result < 0) {
 		dev_err(i2c->dev, "failed to add adapter\n");
 		goto fail_add;
 	}
-	of_register_i2c_devices(&i2c->adap, op->dev.of_node);
+	of_i2c_register_devices(&i2c->adap);
 
 	return result;
 
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index 097f42aebe90..80dd6318db68 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -26,7 +26,7 @@ config OF_GPIO
 
 config OF_I2C
 	def_tristate I2C
-	depends on (PPC_OF || MICROBLAZE) && I2C
+	depends on OF && !SPARC && I2C
 	help
 	  OpenFirmware I2C accessors
 
diff --git a/drivers/of/of_i2c.c b/drivers/of/of_i2c.c
index ab6522c8e4fe..0a694debd226 100644
--- a/drivers/of/of_i2c.c
+++ b/drivers/of/of_i2c.c
@@ -14,57 +14,65 @@
 #include <linux/i2c.h>
 #include <linux/of.h>
 #include <linux/of_i2c.h>
+#include <linux/of_irq.h>
 #include <linux/module.h>
 
-void of_register_i2c_devices(struct i2c_adapter *adap,
-			     struct device_node *adap_node)
+void of_i2c_register_devices(struct i2c_adapter *adap)
 {
 	void *result;
 	struct device_node *node;
 
-	for_each_child_of_node(adap_node, node) {
+	/* Only register child devices if the adapter has a node pointer set */
+	if (!adap->dev.of_node)
+		return;
+
+	dev_dbg(&adap->dev, "of_i2c: walking child nodes\n");
+
+	for_each_child_of_node(adap->dev.of_node, node) {
 		struct i2c_board_info info = {};
 		struct dev_archdata dev_ad = {};
 		const __be32 *addr;
 		int len;
 
-		if (of_modalias_node(node, info.type, sizeof(info.type)) < 0)
+		dev_dbg(&adap->dev, "of_i2c: register %s\n", node->full_name);
+
+		if (of_modalias_node(node, info.type, sizeof(info.type)) < 0) {
+			dev_err(&adap->dev, "of_i2c: modalias failure on %s\n",
+				node->full_name);
 			continue;
+		}
 
 		addr = of_get_property(node, "reg", &len);
-		if (!addr || len < sizeof(int) || *addr > (1 << 10) - 1) {
-			printk(KERN_ERR
-			       "of-i2c: invalid i2c device entry\n");
+		if (!addr || (len < sizeof(int))) {
+			dev_err(&adap->dev, "of_i2c: invalid reg on %s\n",
+				node->full_name);
 			continue;
 		}
 
-		info.irq = irq_of_parse_and_map(node, 0);
-
 		info.addr = be32_to_cpup(addr);
+		if (info.addr > (1 << 10) - 1) {
+			dev_err(&adap->dev, "of_i2c: invalid addr=%x on %s\n",
+				info.addr, node->full_name);
+			continue;
+		}
 
-		info.of_node = node;
+		info.irq = irq_of_parse_and_map(node, 0);
+		info.of_node = of_node_get(node);
 		info.archdata = &dev_ad;
 
 		request_module("%s", info.type);
 
 		result = i2c_new_device(adap, &info);
 		if (result == NULL) {
-			printk(KERN_ERR
-			       "of-i2c: Failed to load driver for %s\n",
-			       info.type);
+			dev_err(&adap->dev, "of_i2c: Failure registering %s\n",
+			        node->full_name);
+			of_node_put(node);
 			irq_dispose_mapping(info.irq);
 			continue;
 		}
-
-		/*
-		 * Get the node to not lose the dev_archdata->of_node.
-		 * Currently there is no way to put it back, as well as no
-		 * of_unregister_i2c_devices() call.
-		 */
-		of_node_get(node);
 	}
 }
-EXPORT_SYMBOL(of_register_i2c_devices);
+EXPORT_SYMBOL(of_i2c_register_devices);
 
 static int of_dev_node_match(struct device *dev, void *data)
 {
diff --git a/include/linux/of_i2c.h b/include/linux/of_i2c.h
index 34974b5a76f7..0efe8d465f55 100644
--- a/include/linux/of_i2c.h
+++ b/include/linux/of_i2c.h
@@ -12,12 +12,19 @@
 #ifndef __LINUX_OF_I2C_H
 #define __LINUX_OF_I2C_H
 
+#if defined(CONFIG_OF_I2C) || defined(CONFIG_OF_I2C_MODULE)
 #include <linux/i2c.h>
 
-void of_register_i2c_devices(struct i2c_adapter *adap,
-			     struct device_node *adap_node);
+extern void of_i2c_register_devices(struct i2c_adapter *adap);
 
 /* must call put_device() when done with returned i2c_client device */
-struct i2c_client *of_find_i2c_device_by_node(struct device_node *node);
+extern struct i2c_client *of_find_i2c_device_by_node(struct device_node *node);
+
+#else
+static inline void of_i2c_register_devices(struct i2c_adapter *adap)
+{
+	return;
+}
+#endif /* CONFIG_OF_I2C */
 
 #endif /* __LINUX_OF_I2C_H */
-- 
cgit v1.2.3-59-g8ed1b


From 8eab945c5616fc984e97b922d6a2559be93f39a1 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 1 Jul 2010 18:05:56 +0300
Subject: sunrpc: make the cache cleaner workqueue deferrable

This patch makes the cache_cleaner workqueue deferrable, to prevent
unnecessary system wake-ups, which is very important for embedded
battery-powered devices.

do_cache_clean() is called every 30 seconds at the moment, and often
makes the system wake up from its power-save sleep state. With this
change, when the workqueue uses a deferrable timer, the
do_cache_clean() invocation will be delayed and combined with the
closest "real" wake-up. This improves the power consumption situation.

Note, I tried to create a DECLARE_DELAYED_WORK_DEFERRABLE() helper
macro, similar to DECLARE_DELAYED_WORK(), but failed because of the
way the timer wheel core stores the deferrable flag (it is the
LSBit in the time->base pointer). My attempt to define a static
variable with this bit set ended up with the "initializer element is
not constant" error.

Thus, I have to use run-time initialization, so I created a new
cache_initialize() function which is called once when sunrpc is
being initialized.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 include/linux/sunrpc/cache.h | 1 +
 net/sunrpc/cache.c           | 7 ++++++-
 net/sunrpc/sunrpc_syms.c     | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 6f52b4d7c447..7bf3e84b92f4 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -192,6 +192,7 @@ extern int cache_check(struct cache_detail *detail,
 extern void cache_flush(void);
 extern void cache_purge(struct cache_detail *detail);
 #define NEVER (0x7FFFFFFF)
+extern void __init cache_initialize(void);
 extern int cache_register(struct cache_detail *cd);
 extern void cache_unregister(struct cache_detail *cd);
 
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 58de76c8540c..939d048ef92b 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -320,7 +320,7 @@ static struct cache_detail *current_detail;
 static int current_index;
 
 static void do_cache_clean(struct work_struct *work);
-static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
+static struct delayed_work cache_cleaner;
 
 static void sunrpc_init_cache_detail(struct cache_detail *cd)
 {
@@ -1504,6 +1504,11 @@ static int create_cache_proc_entries(struct cache_detail *cd)
 }
 #endif
 
+void __init cache_initialize(void)
+{
+	INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean);
+}
+
 int cache_register(struct cache_detail *cd)
 {
 	int ret;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index f438347d817b..c52b18489149 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -43,6 +43,7 @@ init_sunrpc(void)
 #ifdef CONFIG_PROC_FS
 	rpc_proc_init();
 #endif
+	cache_initialize();
 	cache_register(&ip_map_cache);
 	cache_register(&unix_gid_cache);
 	svc_init_xprt_sock();	/* svc sock transport */
-- 
cgit v1.2.3-59-g8ed1b


From de5d9bf6541736dc7ad264d2b5cc99bc1b2ad958 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Fri, 2 Jul 2010 13:41:14 -0400
Subject: Move list types from <linux/list.h> to <linux/types.h>.

This allows a list_head (or hlist_head, etc.) to be used from places
that used to be impractical, in particular <asm/processor.h>, which
used to cause include file recursion: <linux/list.h> includes
<linux/prefetch.h>, which always includes <asm/processor.h> for the
prefetch macros, as well as <asm/system.h>, which often includes
<asm/processor.h> directly or indirectly.

This avoids a lot of painful workaround hackery on the tile
architecture, where we use a list_head in the thread_struct to chain
together tasks that are activated on a particular hardwall.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
---
 include/linux/list.h  | 13 +------------
 include/linux/types.h | 12 ++++++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 8392884a2977..bc43e8a0d7f6 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_LIST_H
 #define _LINUX_LIST_H
 
+#include <linux/types.h>
 #include <linux/stddef.h>
 #include <linux/poison.h>
 #include <linux/prefetch.h>
@@ -16,10 +17,6 @@
  * using the generic single-entry routines.
  */
 
-struct list_head {
-	struct list_head *next, *prev;
-};
-
 #define LIST_HEAD_INIT(name) { &(name), &(name) }
 
 #define LIST_HEAD(name) \
@@ -551,14 +548,6 @@ static inline void list_splice_tail_init(struct list_head *list,
  * You lose the ability to access the tail in O(1).
  */
 
-struct hlist_head {
-	struct hlist_node *first;
-};
-
-struct hlist_node {
-	struct hlist_node *next, **pprev;
-};
-
 #define HLIST_HEAD_INIT { .first = NULL }
 #define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
 #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
diff --git a/include/linux/types.h b/include/linux/types.h
index 23d237a075e2..336cc39c46f1 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -197,6 +197,18 @@ typedef struct {
 } atomic64_t;
 #endif
 
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+struct hlist_head {
+	struct hlist_node *first;
+};
+
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
+
 struct ustat {
 	__kernel_daddr_t	f_tfree;
 	__kernel_ino_t		f_tinode;
-- 
cgit v1.2.3-59-g8ed1b


From 28172739f0a276eb8d6ca917b3974c2edb036da3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 7 Jul 2010 14:58:56 -0700
Subject: net: fix 64 bit counters on 32 bit arches

There is a small possibility that a reader gets incorrect values on 32
bit arches. SNMP applications could catch incorrect counters when a
32bit high part is changed by another stats consumer/provider.

One way to solve this is to add a rtnl_link_stats64 param to all
ndo_get_stats64() methods, and also add such a parameter to
dev_get_stats().

Rule is that we are not allowed to use dev->stats64 as a temporary
storage for 64bit stats, but a caller provided area (usually on stack)

Old drivers (only providing get_stats() method) need no changes.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/s390/appldata/appldata_net_sum.c       |  3 +-
 drivers/net/bonding/bond_main.c             | 64 ++++++++++++++---------------
 drivers/net/ixgbe/ixgbe_ethtool.c           |  8 ++--
 drivers/net/loopback.c                      |  4 +-
 drivers/net/macvlan.c                       |  6 +--
 drivers/net/sfc/efx.c                       |  3 +-
 drivers/net/sfc/ethtool.c                   |  3 +-
 drivers/parisc/led.c                        |  3 +-
 drivers/scsi/fcoe/fcoe.c                    |  3 +-
 drivers/staging/batman-adv/hard-interface.c |  3 +-
 drivers/usb/gadget/rndis.c                  |  3 +-
 include/linux/netdevice.h                   | 12 ++++--
 net/8021q/vlan_dev.c                        |  6 +--
 net/8021q/vlanproc.c                        |  3 +-
 net/bridge/br_device.c                      |  4 +-
 net/core/dev.c                              | 25 +++++++----
 net/core/net-sysfs.c                        |  4 +-
 net/core/rtnetlink.c                        |  3 +-
 18 files changed, 89 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c
index 9a9586f4103f..f02e89ce4df1 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -85,7 +85,8 @@ static void appldata_get_net_sum_data(void *data)
 
 	rcu_read_lock();
 	for_each_netdev_rcu(&init_net, dev) {
-		const struct net_device_stats *stats = dev_get_stats(dev);
+		struct rtnl_link_stats64 temp;
+		const struct net_device_stats *stats = dev_get_stats(dev, &temp);
 
 		rx_packets += stats->rx_packets;
 		tx_packets += stats->tx_packets;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a95a41b74b4e..9bb9bfa225b6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3804,51 +3804,49 @@ static int bond_close(struct net_device *bond_dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev)
+static struct rtnl_link_stats64 *bond_get_stats(struct net_device *bond_dev,
+						struct rtnl_link_stats64 *stats)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
-	struct rtnl_link_stats64 *stats = &bond_dev->stats64;
-	struct rtnl_link_stats64 local_stats;
+	struct rtnl_link_stats64 temp;
 	struct slave *slave;
 	int i;
 
-	memset(&local_stats, 0, sizeof(local_stats));
+	memset(stats, 0, sizeof(*stats));
 
 	read_lock_bh(&bond->lock);
 
 	bond_for_each_slave(bond, slave, i) {
 		const struct rtnl_link_stats64 *sstats =
-			dev_get_stats(slave->dev);
-
-		local_stats.rx_packets += sstats->rx_packets;
-		local_stats.rx_bytes += sstats->rx_bytes;
-		local_stats.rx_errors += sstats->rx_errors;
-		local_stats.rx_dropped += sstats->rx_dropped;
-
-		local_stats.tx_packets += sstats->tx_packets;
-		local_stats.tx_bytes += sstats->tx_bytes;
-		local_stats.tx_errors += sstats->tx_errors;
-		local_stats.tx_dropped += sstats->tx_dropped;
-
-		local_stats.multicast += sstats->multicast;
-		local_stats.collisions += sstats->collisions;
-
-		local_stats.rx_length_errors += sstats->rx_length_errors;
-		local_stats.rx_over_errors += sstats->rx_over_errors;
-		local_stats.rx_crc_errors += sstats->rx_crc_errors;
-		local_stats.rx_frame_errors += sstats->rx_frame_errors;
-		local_stats.rx_fifo_errors += sstats->rx_fifo_errors;
-		local_stats.rx_missed_errors += sstats->rx_missed_errors;
-
-		local_stats.tx_aborted_errors += sstats->tx_aborted_errors;
-		local_stats.tx_carrier_errors += sstats->tx_carrier_errors;
-		local_stats.tx_fifo_errors += sstats->tx_fifo_errors;
-		local_stats.tx_heartbeat_errors += sstats->tx_heartbeat_errors;
-		local_stats.tx_window_errors += sstats->tx_window_errors;
+			dev_get_stats(slave->dev, &temp);
+
+		stats->rx_packets += sstats->rx_packets;
+		stats->rx_bytes += sstats->rx_bytes;
+		stats->rx_errors += sstats->rx_errors;
+		stats->rx_dropped += sstats->rx_dropped;
+
+		stats->tx_packets += sstats->tx_packets;
+		stats->tx_bytes += sstats->tx_bytes;
+		stats->tx_errors += sstats->tx_errors;
+		stats->tx_dropped += sstats->tx_dropped;
+
+		stats->multicast += sstats->multicast;
+		stats->collisions += sstats->collisions;
+
+		stats->rx_length_errors += sstats->rx_length_errors;
+		stats->rx_over_errors += sstats->rx_over_errors;
+		stats->rx_crc_errors += sstats->rx_crc_errors;
+		stats->rx_frame_errors += sstats->rx_frame_errors;
+		stats->rx_fifo_errors += sstats->rx_fifo_errors;
+		stats->rx_missed_errors += sstats->rx_missed_errors;
+
+		stats->tx_aborted_errors += sstats->tx_aborted_errors;
+		stats->tx_carrier_errors += sstats->tx_carrier_errors;
+		stats->tx_fifo_errors += sstats->tx_fifo_errors;
+		stats->tx_heartbeat_errors += sstats->tx_heartbeat_errors;
+		stats->tx_window_errors += sstats->tx_window_errors;
 	}
 
-	memcpy(stats, &local_stats, sizeof(struct net_device_stats));
-
 	read_unlock_bh(&bond->lock);
 
 	return stats;
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index b35ef36741ef..da54b38bb480 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -55,7 +55,7 @@ struct ixgbe_stats {
 				offsetof(struct ixgbe_adapter, m)
 #define IXGBE_NETDEV_STAT(m)	NETDEV_STATS, \
 				sizeof(((struct net_device *)0)->m), \
-				offsetof(struct net_device, m)
+				offsetof(struct net_device, m) - offsetof(struct net_device, stats)
 
 static struct ixgbe_stats ixgbe_gstrings_stats[] = {
 	{"rx_packets", IXGBE_NETDEV_STAT(stats.rx_packets)},
@@ -998,16 +998,18 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	u64 *queue_stat;
 	int stat_count = sizeof(struct ixgbe_queue_stats) / sizeof(u64);
+	struct rtnl_link_stats64 temp;
+	const struct rtnl_link_stats64 *net_stats;
 	int j, k;
 	int i;
 	char *p = NULL;
 
 	ixgbe_update_stats(adapter);
-	dev_get_stats(netdev);
+	net_stats = dev_get_stats(netdev, &temp);
 	for (i = 0; i < IXGBE_GLOBAL_STATS_LEN; i++) {
 		switch (ixgbe_gstrings_stats[i].type) {
 		case NETDEV_STATS:
-			p = (char *) netdev +
+			p = (char *) net_stats +
 					ixgbe_gstrings_stats[i].stat_offset;
 			break;
 		case IXGBE_STATS:
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 4dd0510d7a99..9a0996795321 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -98,10 +98,10 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev)
+static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
+						      struct rtnl_link_stats64 *stats)
 {
 	const struct pcpu_lstats __percpu *pcpu_lstats;
-	struct rtnl_link_stats64 *stats = &dev->stats64;
 	u64 bytes = 0;
 	u64 packets = 0;
 	u64 drops = 0;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index e6d626e78515..6112f1498940 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -431,12 +431,12 @@ static void macvlan_uninit(struct net_device *dev)
 	free_percpu(vlan->rx_stats);
 }
 
-static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev)
+static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev,
+							 struct rtnl_link_stats64 *stats)
 {
-	struct rtnl_link_stats64 *stats = &dev->stats64;
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
-	dev_txq_stats_fold(dev, &dev->stats);
+	dev_txq_stats_fold(dev, (struct net_device_stats *)stats);
 
 	if (vlan->rx_stats) {
 		struct macvlan_rx_stats *p, accum = {0};
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 35b3f2922e5c..ba674c5ca29e 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -1533,11 +1533,10 @@ static int efx_net_stop(struct net_device *net_dev)
 }
 
 /* Context: process, dev_base_lock or RTNL held, non-blocking. */
-static struct rtnl_link_stats64 *efx_net_stats(struct net_device *net_dev)
+static struct rtnl_link_stats64 *efx_net_stats(struct net_device *net_dev, struct rtnl_link_stats64 *stats)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_mac_stats *mac_stats = &efx->mac_stats;
-	struct rtnl_link_stats64 *stats = &net_dev->stats64;
 
 	spin_lock_bh(&efx->stats_lock);
 	efx->type->update_stats(efx);
diff --git a/drivers/net/sfc/ethtool.c b/drivers/net/sfc/ethtool.c
index 3b8b0a062749..fd19d6ab97a2 100644
--- a/drivers/net/sfc/ethtool.c
+++ b/drivers/net/sfc/ethtool.c
@@ -469,12 +469,13 @@ static void efx_ethtool_get_stats(struct net_device *net_dev,
 	struct efx_mac_stats *mac_stats = &efx->mac_stats;
 	struct efx_ethtool_stat *stat;
 	struct efx_channel *channel;
+	struct rtnl_link_stats64 temp;
 	int i;
 
 	EFX_BUG_ON_PARANOID(stats->n_stats != EFX_ETHTOOL_NUM_STATS);
 
 	/* Update MAC and NIC statistics */
-	dev_get_stats(net_dev);
+	dev_get_stats(net_dev, &temp);
 
 	/* Fill detailed statistics buffer */
 	for (i = 0; i < EFX_ETHTOOL_NUM_STATS; i++) {
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 188bc8496a26..18dff43b8bd2 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -355,12 +355,13 @@ static __inline__ int led_get_net_activity(void)
 	rcu_read_lock();
 	for_each_netdev_rcu(&init_net, dev) {
 	    const struct net_device_stats *stats;
+	    struct rtnl_link_stats64 temp;
 	    struct in_device *in_dev = __in_dev_get_rcu(dev);
 	    if (!in_dev || !in_dev->ifa_list)
 		continue;
 	    if (ipv4_is_loopback(in_dev->ifa_list->ifa_local))
 		continue;
-	    stats = dev_get_stats(dev);
+	    stats = dev_get_stats(dev, &temp);
 	    rx_total += stats->rx_packets;
 	    tx_total += stats->tx_packets;
 	}
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index 44a07593de56..1a429ed6da9d 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -2653,6 +2653,7 @@ static void fcoe_get_lesb(struct fc_lport *lport,
 	u32 lfc, vlfc, mdac;
 	struct fcoe_dev_stats *devst;
 	struct fcoe_fc_els_lesb *lesb;
+	struct rtnl_link_stats64 temp;
 	struct net_device *netdev = fcoe_netdev(lport);
 
 	lfc = 0;
@@ -2669,7 +2670,7 @@ static void fcoe_get_lesb(struct fc_lport *lport,
 	lesb->lesb_link_fail = htonl(lfc);
 	lesb->lesb_vlink_fail = htonl(vlfc);
 	lesb->lesb_miss_fka = htonl(mdac);
-	lesb->lesb_fcs_error = htonl(dev_get_stats(netdev)->rx_crc_errors);
+	lesb->lesb_fcs_error = htonl(dev_get_stats(netdev, &temp)->rx_crc_errors);
 }
 
 /**
diff --git a/drivers/staging/batman-adv/hard-interface.c b/drivers/staging/batman-adv/hard-interface.c
index 5ede9c255094..96c86c873011 100644
--- a/drivers/staging/batman-adv/hard-interface.c
+++ b/drivers/staging/batman-adv/hard-interface.c
@@ -440,6 +440,7 @@ int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	struct batman_packet *batman_packet;
 	struct batman_if *batman_if;
 	struct net_device_stats *stats;
+	struct rtnl_link_stats64 temp;
 	int ret;
 
 	skb = skb_share_check(skb, GFP_ATOMIC);
@@ -468,7 +469,7 @@ int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	if (batman_if->if_status != IF_ACTIVE)
 		goto err_free;
 
-	stats = (struct net_device_stats *)dev_get_stats(skb->dev);
+	stats = (struct net_device_stats *)dev_get_stats(skb->dev, &temp);
 	if (stats) {
 		stats->rx_packets++;
 		stats->rx_bytes += skb->len;
diff --git a/drivers/usb/gadget/rndis.c b/drivers/usb/gadget/rndis.c
index fb69b01c8f3a..020fa5a25fda 100644
--- a/drivers/usb/gadget/rndis.c
+++ b/drivers/usb/gadget/rndis.c
@@ -171,6 +171,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len,
 	int			i, count;
 	rndis_query_cmplt_type	*resp;
 	struct net_device	*net;
+	struct rtnl_link_stats64 temp;
 	const struct rtnl_link_stats64 *stats;
 
 	if (!r) return -ENOMEM;
@@ -194,7 +195,7 @@ gen_ndis_query_resp (int configNr, u32 OID, u8 *buf, unsigned buf_len,
 	resp->InformationBufferOffset = cpu_to_le32 (16);
 
 	net = rndis_per_dev_params[configNr].dev;
-	stats = dev_get_stats(net);
+	stats = dev_get_stats(net, &temp);
 
 	switch (OID) {
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4d27368674db..60de65316fdb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -666,7 +666,8 @@ struct netdev_rx_queue {
  *	Callback uses when the transmitter has not made any progress
  *	for dev->watchdog ticks.
  *
- * struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev);
+ * struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev
+ *                      struct rtnl_link_stats64 *storage);
  * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
  *	Called when a user wants to get the network device usage
  *	statistics. Drivers must do one of the following:
@@ -733,7 +734,8 @@ struct net_device_ops {
 						   struct neigh_parms *);
 	void			(*ndo_tx_timeout) (struct net_device *dev);
 
-	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev);
+	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
+						     struct rtnl_link_stats64 *storage);
 	struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 
 	void			(*ndo_vlan_rx_register)(struct net_device *dev,
@@ -2139,8 +2141,10 @@ extern void		netdev_features_change(struct net_device *dev);
 /* Load a device via the kmod */
 extern void		dev_load(struct net *net, const char *name);
 extern void		dev_mcast_init(void);
-extern const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev);
-extern void		dev_txq_stats_fold(const struct net_device *dev, struct net_device_stats *stats);
+extern const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+						     struct rtnl_link_stats64 *storage);
+extern void		dev_txq_stats_fold(const struct net_device *dev,
+					   struct net_device_stats *stats);
 
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index c6456cb842fa..7865a4ce5250 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -803,11 +803,9 @@ static u32 vlan_ethtool_get_flags(struct net_device *dev)
 	return dev_ethtool_get_flags(vlan->real_dev);
 }
 
-static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev)
+static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
-	struct rtnl_link_stats64 *stats = &dev->stats64;
-
-	dev_txq_stats_fold(dev, &dev->stats);
+	dev_txq_stats_fold(dev, (struct net_device_stats *)stats);
 
 	if (vlan_dev_info(dev)->vlan_rx_stats) {
 		struct vlan_rx_stats *p, accum = {0};
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index df56f5ce887c..80e280f56686 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -278,6 +278,7 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
 {
 	struct net_device *vlandev = (struct net_device *) seq->private;
 	const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev);
+	struct rtnl_link_stats64 temp;
 	const struct rtnl_link_stats64 *stats;
 	static const char fmt[] = "%30s %12lu\n";
 	static const char fmt64[] = "%30s %12llu\n";
@@ -286,7 +287,7 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
 	if (!is_vlan_dev(vlandev))
 		return 0;
 
-	stats = dev_get_stats(vlandev);
+	stats = dev_get_stats(vlandev, &temp);
 	seq_printf(seq,
 		   "%s  VID: %d	 REORDER_HDR: %i  dev->priv_flags: %hx\n",
 		   vlandev->name, dev_info->vlan_id,
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index edf639e96281..075c435ad22d 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -98,10 +98,10 @@ static int br_dev_stop(struct net_device *dev)
 	return 0;
 }
 
-static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev)
+static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
+						struct rtnl_link_stats64 *stats)
 {
 	struct net_bridge *br = netdev_priv(dev);
-	struct rtnl_link_stats64 *stats = &dev->stats64;
 	struct br_cpu_netstats tmp, sum = { 0 };
 	unsigned int cpu;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 93b8929fa21d..92482d7a87a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3703,7 +3703,8 @@ void dev_seq_stop(struct seq_file *seq, void *v)
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 {
-	const struct rtnl_link_stats64 *stats = dev_get_stats(dev);
+	struct rtnl_link_stats64 temp;
+	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
@@ -5281,23 +5282,29 @@ EXPORT_SYMBOL(dev_txq_stats_fold);
 /**
  *	dev_get_stats	- get network device statistics
  *	@dev: device to get statistics from
+ *	@storage: place to store stats
  *
  *	Get network statistics from device. The device driver may provide
  *	its own method by setting dev->netdev_ops->get_stats64 or
  *	dev->netdev_ops->get_stats; otherwise the internal statistics
  *	structure is used.
  */
-const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev)
+const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+					      struct rtnl_link_stats64 *storage)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
-	if (ops->ndo_get_stats64)
-		return ops->ndo_get_stats64(dev);
-	if (ops->ndo_get_stats)
-		return (struct rtnl_link_stats64 *)ops->ndo_get_stats(dev);
-
-	dev_txq_stats_fold(dev, &dev->stats);
-	return &dev->stats64;
+	if (ops->ndo_get_stats64) {
+		memset(storage, 0, sizeof(*storage));
+		return ops->ndo_get_stats64(dev, storage);
+	}
+	if (ops->ndo_get_stats) {
+		memcpy(storage, ops->ndo_get_stats(dev), sizeof(*storage));
+		return storage;
+	}
+	memcpy(storage, &dev->stats, sizeof(*storage));
+	dev_txq_stats_fold(dev, (struct net_device_stats *)storage);
+	return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ea3bb4c3b87d..914f42b0f039 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -330,7 +330,9 @@ static ssize_t netstat_show(const struct device *d,
 
 	read_lock(&dev_base_lock);
 	if (dev_isalive(dev)) {
-		const struct rtnl_link_stats64 *stats = dev_get_stats(dev);
+		struct rtnl_link_stats64 temp;
+		const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
 		ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset));
 	}
 	read_unlock(&dev_base_lock);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e645778e9b7e..5e773ea2201d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -791,6 +791,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
+	struct rtnl_link_stats64 temp;
 	const struct rtnl_link_stats64 *stats;
 	struct nlattr *attr;
 
@@ -847,7 +848,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (attr == NULL)
 		goto nla_put_failure;
 
-	stats = dev_get_stats(dev);
+	stats = dev_get_stats(dev, &temp);
 	copy_rtnl_link_stats(nla_data(attr), stats);
 
 	attr = nla_reserve(skb, IFLA_STATS64,
-- 
cgit v1.2.3-59-g8ed1b


From 250b2b6dd421c9f8844a867d2ac06e0661e0ad93 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 21 Jun 2010 23:24:35 +0200
Subject: firewire: cdev: fix fw_cdev_event_bus_reset.bm_node_id

Fix an obscure ABI feature that is a bit of a hassle to implement.
However, somebody put it into the ABI, so let's fill in a sensible
value there.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-card.c     | 12 +++++++++---
 drivers/firewire/core-cdev.c     |  2 +-
 drivers/firewire/core-topology.c |  1 +
 include/linux/firewire-cdev.h    |  5 +++++
 include/linux/firewire.h         |  1 +
 5 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 11fc81500f82..6c316cfe70c4 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -239,7 +239,7 @@ static void fw_card_bm_work(struct work_struct *work)
 	struct fw_card *card = container_of(work, struct fw_card, work.work);
 	struct fw_device *root_device;
 	struct fw_node *root_node;
-	int root_id, new_root_id, irm_id, local_id;
+	int root_id, new_root_id, irm_id, bm_id, local_id;
 	int gap_count, generation, grace, rcode;
 	bool do_reset = false;
 	bool root_device_is_running;
@@ -301,9 +301,15 @@ static void fw_card_bm_work(struct work_struct *work)
 			/* Another bus reset, BM work has been rescheduled. */
 			goto out;
 
-		if (rcode == RCODE_COMPLETE &&
-		    card->bm_transaction_data[0] != cpu_to_be32(0x3f)) {
+		bm_id = be32_to_cpu(card->bm_transaction_data[0]);
 
+		spin_lock_irq(&card->lock);
+		if (rcode == RCODE_COMPLETE && generation == card->generation)
+			card->bm_node_id =
+			    bm_id == 0x3f ? local_id : 0xffc0 | bm_id;
+		spin_unlock_irq(&card->lock);
+
+		if (rcode == RCODE_COMPLETE && bm_id != 0x3f) {
 			/* Somebody else is BM.  Only act as IRM. */
 			if (local_id == irm_id)
 				allocate_broadcast_channel(card, generation);
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 9b8df2039155..d8ac0ce2d6bf 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -318,7 +318,7 @@ static void fill_bus_reset_event(struct fw_cdev_event_bus_reset *event,
 	event->generation    = client->device->generation;
 	event->node_id       = client->device->node_id;
 	event->local_node_id = card->local_node->node_id;
-	event->bm_node_id    = 0; /* FIXME: We don't track the BM. */
+	event->bm_node_id    = card->bm_node_id;
 	event->irm_node_id   = card->irm_node->node_id;
 	event->root_node_id  = card->root_node->node_id;
 
diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index 56e908ba43f1..88d5133ae702 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -552,6 +552,7 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation,
 	smp_wmb();
 	card->generation = generation;
 	card->reset_jiffies = jiffies;
+	card->bm_node_id  = 0xffff;
 	card->bm_abdicate = bm_abdicate;
 	fw_schedule_bm_work(card, 0);
 
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 8b9b27373219..d31022b05bd9 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -66,6 +66,10 @@ struct fw_cdev_event_common {
  * This event is sent when the bus the device belongs to goes through a bus
  * reset.  It provides information about the new bus configuration, such as
  * new node ID for this device, new root ID, and others.
+ *
+ * If @bm_node_id is 0xffff right after bus reset it can be reread by an
+ * %FW_CDEV_IOC_GET_INFO ioctl after bus manager selection was finished.
+ * Kernels with ABI version < 4 do not set @bm_node_id.
  */
 struct fw_cdev_event_bus_reset {
 	__u64 closure;
@@ -348,6 +352,7 @@ union fw_cdev_event {
  *  3  (2.6.34)  - made &fw_cdev_get_cycle_timer reliable
  *               - added %FW_CDEV_IOC_GET_CYCLE_TIMER2
  *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2
+ *               - implemented &fw_cdev_event_bus_reset.bm_node_id
  */
 #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */
 
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index e44b502c8341..db30a752a87a 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -119,6 +119,7 @@ struct fw_card {
 	int bm_retries;
 	int bm_generation;
 	__be32 bm_transaction_data[2];
+	int bm_node_id;
 	bool bm_abdicate;
 
 	bool priority_budget_implemented;	/* controller feature */
-- 
cgit v1.2.3-59-g8ed1b


From 7313bb8f3dd6e28bcf9c42adfd54a5cf9a4949e0 Mon Sep 17 00:00:00 2001
From: Karl Hiramoto <karl@hiramoto.org>
Date: Thu, 8 Jul 2010 20:55:30 +0000
Subject: atm: propagate signal changes via notifier

Add notifier chain for changes in atm_dev.

Clients like br2684 will call register_atmdevice_notifier() to be notified of
changes. Drivers will call atm_dev_signal_change() to notify clients like
br2684 of the change.

On DSL and ATM devices it's usefull to have a know if you have a carrier
signal. netdevice LOWER_UP changes can be propagated to userspace via netlink
monitor.

Signed-off-by: Karl Hiramoto <karl@hiramoto.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/atmdev.h | 17 +++++++++++++++++
 net/atm/common.c       | 30 ++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index 817b23705c91..f6481daf6e52 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -431,6 +431,14 @@ struct atm_dev *atm_dev_register(const char *type,const struct atmdev_ops *ops,
     int number,unsigned long *flags); /* number == -1: pick first available */
 struct atm_dev *atm_dev_lookup(int number);
 void atm_dev_deregister(struct atm_dev *dev);
+
+/* atm_dev_signal_change
+ *
+ * Propagate lower layer signal change in atm_dev->signal to netdevice.
+ * The event will be sent via a notifier call chain.
+ */
+void atm_dev_signal_change(struct atm_dev *dev, char signal);
+
 void vcc_insert_socket(struct sock *sk);
 
 
@@ -510,6 +518,15 @@ void register_atm_ioctl(struct atm_ioctl *);
  */
 void deregister_atm_ioctl(struct atm_ioctl *);
 
+
+/* register_atmdevice_notifier - register atm_dev notify events
+ *
+ * Clients like br2684 will register notify events
+ * Currently we notify of signal found/lost
+ */
+int register_atmdevice_notifier(struct notifier_block *nb);
+void unregister_atmdevice_notifier(struct notifier_block *nb);
+
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/net/atm/common.c b/net/atm/common.c
index b43feb1a3995..940404a73b3d 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(vcc_hash);
 DEFINE_RWLOCK(vcc_sklist_lock);
 EXPORT_SYMBOL(vcc_sklist_lock);
 
+static ATOMIC_NOTIFIER_HEAD(atm_dev_notify_chain);
+
 static void __vcc_insert_socket(struct sock *sk)
 {
 	struct atm_vcc *vcc = atm_sk(sk);
@@ -212,6 +214,22 @@ void vcc_release_async(struct atm_vcc *vcc, int reply)
 }
 EXPORT_SYMBOL(vcc_release_async);
 
+void atm_dev_signal_change(struct atm_dev *dev, char signal)
+{
+	pr_debug("%s signal=%d dev=%p number=%d dev->signal=%d\n",
+		__func__, signal, dev, dev->number, dev->signal);
+
+	/* atm driver sending invalid signal */
+	WARN_ON(signal < ATM_PHY_SIG_LOST || signal > ATM_PHY_SIG_FOUND);
+
+	if (dev->signal == signal)
+		return; /* no change */
+
+	dev->signal = signal;
+
+	atomic_notifier_call_chain(&atm_dev_notify_chain, signal, dev);
+}
+EXPORT_SYMBOL(atm_dev_signal_change);
 
 void atm_dev_release_vccs(struct atm_dev *dev)
 {
@@ -781,6 +799,18 @@ int vcc_getsockopt(struct socket *sock, int level, int optname,
 	return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len);
 }
 
+int register_atmdevice_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_atmdevice_notifier);
+
+void unregister_atmdevice_notifier(struct notifier_block *nb)
+{
+	atomic_notifier_chain_unregister(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_atmdevice_notifier);
+
 static int __init atm_init(void)
 {
 	int error;
-- 
cgit v1.2.3-59-g8ed1b


From ffa71f33a820d1ab3f2fc5723819ac60fb76080b Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Fri, 18 Jun 2010 12:22:40 +0900
Subject: x86, ioremap: Fix incorrect physical address handling in PAE mode

Current x86 ioremap() doesn't handle physical address higher than
32-bit properly in X86_32 PAE mode. When physical address higher than
32-bit is passed to ioremap(), higher 32-bits in physical address is
cleared wrongly. Due to this bug, ioremap() can map wrong address to
linear address space.

In my case, 64-bit MMIO region was assigned to a PCI device (ioat
device) on my system. Because of the ioremap()'s bug, wrong physical
address (instead of MMIO region) was mapped to linear address space.
Because of this, loading ioatdma driver caused unexpected behavior
(kernel panic, kernel hangup, ...).

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
LKML-Reference: <4C1AE680.7090408@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/ioremap.c   | 12 +++++-------
 include/linux/io.h      |  4 ++--
 include/linux/vmalloc.h |  2 +-
 lib/ioremap.c           | 10 +++++-----
 mm/vmalloc.c            |  2 +-
 5 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 12e4d2d3c110..754cb4cbce66 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		unsigned long size, unsigned long prot_val, void *caller)
 {
-	unsigned long pfn, offset, vaddr;
-	resource_size_t last_addr;
+	unsigned long offset, vaddr;
+	resource_size_t pfn, last_pfn, last_addr;
 	const resource_size_t unaligned_phys_addr = phys_addr;
 	const unsigned long unaligned_size = size;
 	struct vm_struct *area;
@@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
-	for (pfn = phys_addr >> PAGE_SHIFT;
-				(pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
-				pfn++) {
-
+	last_pfn = last_addr >> PAGE_SHIFT;
+	for (pfn = phys_addr >> PAGE_SHIFT; pfn < last_pfn; pfn++) {
 		int is_ram = page_is_ram(pfn);
 
 		if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
@@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	 * Mappings have to be page-aligned
 	 */
 	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
+	phys_addr &= PHYSICAL_PAGE_MASK;
 	size = PAGE_ALIGN(last_addr+1) - phys_addr;
 
 	retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
diff --git a/include/linux/io.h b/include/linux/io.h
index 6c7f0ba0d5fa..7fd2d2138bf3 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -29,10 +29,10 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
 
 #ifdef CONFIG_MMU
 int ioremap_page_range(unsigned long addr, unsigned long end,
-		       unsigned long phys_addr, pgprot_t prot);
+		       phys_addr_t phys_addr, pgprot_t prot);
 #else
 static inline int ioremap_page_range(unsigned long addr, unsigned long end,
-				     unsigned long phys_addr, pgprot_t prot)
+				     phys_addr_t phys_addr, pgprot_t prot)
 {
 	return 0;
 }
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 227c2a585e4f..de05e96e0a70 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -30,7 +30,7 @@ struct vm_struct {
 	unsigned long		flags;
 	struct page		**pages;
 	unsigned int		nr_pages;
-	unsigned long		phys_addr;
+	phys_addr_t		phys_addr;
 	void			*caller;
 };
 
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 14c6078f17a2..5730ecd3eb66 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -13,10 +13,10 @@
 #include <asm/pgtable.h>
 
 static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
-		unsigned long end, unsigned long phys_addr, pgprot_t prot)
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
 	pte_t *pte;
-	unsigned long pfn;
+	u64 pfn;
 
 	pfn = phys_addr >> PAGE_SHIFT;
 	pte = pte_alloc_kernel(pmd, addr);
@@ -31,7 +31,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 }
 
 static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
-		unsigned long end, unsigned long phys_addr, pgprot_t prot)
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -49,7 +49,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 }
 
 static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
-		unsigned long end, unsigned long phys_addr, pgprot_t prot)
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -67,7 +67,7 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
 }
 
 int ioremap_page_range(unsigned long addr,
-		       unsigned long end, unsigned long phys_addr, pgprot_t prot)
+		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
 	pgd_t *pgd;
 	unsigned long start;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae007462b7f6..b7e314b1009f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2403,7 +2403,7 @@ static int s_show(struct seq_file *m, void *p)
 		seq_printf(m, " pages=%d", v->nr_pages);
 
 	if (v->phys_addr)
-		seq_printf(m, " phys=%lx", v->phys_addr);
+		seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
 
 	if (v->flags & VM_IOREMAP)
 		seq_printf(m, " ioremap");
-- 
cgit v1.2.3-59-g8ed1b


From 3cfde79c6c7c8002375c4a8e5be7f602fbb9675d Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 9 Jul 2010 09:11:52 +0000
Subject: net: Get rid of rtnl_link_stats64 / net_device_stats union

In commit be1f3c2c027cc5ad735df6a45a542ed1db7ec48b "net: Enable 64-bit
net device statistics on 32-bit architectures" I redefined struct
net_device_stats so that it could be used in a union with struct
rtnl_link_stats64, avoiding the need for explicit copying or
conversion between the two.  However, this is unsafe because there is
no locking required and no lock consistently held around calls to
dev_get_stats() and use of the statistics structure it returns.

In commit 28172739f0a276eb8d6ca917b3974c2edb036da3 "net: fix 64 bit
counters on 32 bit arches" Eric Dumazet dealt with that problem by
requiring callers of dev_get_stats() to provide storage for the
result.  This means that the net_device::stats64 field and the padding
in struct net_device_stats are now redundant, so remove them.

Update the comment on net_device_ops::ndo_get_stats64 to reflect its
new usage.

Change dev_txq_stats_fold() to use struct rtnl_link_stats64, since
that is what all its callers are really using and it is no longer
going to be compatible with struct net_device_stats.

Eric Dumazet suggested the separate function for the structure
conversion.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c     |  2 +-
 include/linux/netdevice.h | 70 +++++++++++++++++++----------------------------
 net/8021q/vlan_dev.c      |  2 +-
 net/core/dev.c            | 31 +++++++++++++++++----
 4 files changed, 56 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 6112f1498940..1b28aaec0a5a 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -436,7 +436,7 @@ static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev,
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
-	dev_txq_stats_fold(dev, (struct net_device_stats *)stats);
+	dev_txq_stats_fold(dev, stats);
 
 	if (vlan->rx_stats) {
 		struct macvlan_rx_stats *p, accum = {0};
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8018f6bf3051..17e95e37aed9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -162,42 +162,32 @@ static inline bool dev_xmit_complete(int rc)
 /*
  *	Old network device statistics. Fields are native words
  *	(unsigned long) so they can be read and written atomically.
- *	Each field is padded to 64 bits for compatibility with
- *	rtnl_link_stats64.
  */
 
-#if BITS_PER_LONG == 64
-#define NET_DEVICE_STATS_DEFINE(name)	unsigned long name
-#elif defined(__LITTLE_ENDIAN)
-#define NET_DEVICE_STATS_DEFINE(name)	unsigned long name, pad_ ## name
-#else
-#define NET_DEVICE_STATS_DEFINE(name)	unsigned long pad_ ## name, name
-#endif
-
 struct net_device_stats {
-	NET_DEVICE_STATS_DEFINE(rx_packets);
-	NET_DEVICE_STATS_DEFINE(tx_packets);
-	NET_DEVICE_STATS_DEFINE(rx_bytes);
-	NET_DEVICE_STATS_DEFINE(tx_bytes);
-	NET_DEVICE_STATS_DEFINE(rx_errors);
-	NET_DEVICE_STATS_DEFINE(tx_errors);
-	NET_DEVICE_STATS_DEFINE(rx_dropped);
-	NET_DEVICE_STATS_DEFINE(tx_dropped);
-	NET_DEVICE_STATS_DEFINE(multicast);
-	NET_DEVICE_STATS_DEFINE(collisions);
-	NET_DEVICE_STATS_DEFINE(rx_length_errors);
-	NET_DEVICE_STATS_DEFINE(rx_over_errors);
-	NET_DEVICE_STATS_DEFINE(rx_crc_errors);
-	NET_DEVICE_STATS_DEFINE(rx_frame_errors);
-	NET_DEVICE_STATS_DEFINE(rx_fifo_errors);
-	NET_DEVICE_STATS_DEFINE(rx_missed_errors);
-	NET_DEVICE_STATS_DEFINE(tx_aborted_errors);
-	NET_DEVICE_STATS_DEFINE(tx_carrier_errors);
-	NET_DEVICE_STATS_DEFINE(tx_fifo_errors);
-	NET_DEVICE_STATS_DEFINE(tx_heartbeat_errors);
-	NET_DEVICE_STATS_DEFINE(tx_window_errors);
-	NET_DEVICE_STATS_DEFINE(rx_compressed);
-	NET_DEVICE_STATS_DEFINE(tx_compressed);
+	unsigned long	rx_packets;
+	unsigned long	tx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_bytes;
+	unsigned long	rx_errors;
+	unsigned long	tx_errors;
+	unsigned long	rx_dropped;
+	unsigned long	tx_dropped;
+	unsigned long	multicast;
+	unsigned long	collisions;
+	unsigned long	rx_length_errors;
+	unsigned long	rx_over_errors;
+	unsigned long	rx_crc_errors;
+	unsigned long	rx_frame_errors;
+	unsigned long	rx_fifo_errors;
+	unsigned long	rx_missed_errors;
+	unsigned long	tx_aborted_errors;
+	unsigned long	tx_carrier_errors;
+	unsigned long	tx_fifo_errors;
+	unsigned long	tx_heartbeat_errors;
+	unsigned long	tx_window_errors;
+	unsigned long	rx_compressed;
+	unsigned long	tx_compressed;
 };
 
 #endif  /*  __KERNEL__  */
@@ -666,14 +656,13 @@ struct netdev_rx_queue {
  *	Callback uses when the transmitter has not made any progress
  *	for dev->watchdog ticks.
  *
- * struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev
+ * struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
  *                      struct rtnl_link_stats64 *storage);
  * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
  *	Called when a user wants to get the network device usage
  *	statistics. Drivers must do one of the following:
- *	1. Define @ndo_get_stats64 to update a rtnl_link_stats64 structure
- *	   (which should normally be dev->stats64) and return a ponter to
- *	   it. The structure must not be changed asynchronously.
+ *	1. Define @ndo_get_stats64 to fill in a zero-initialised
+ *	   rtnl_link_stats64 structure passed by the caller.
  *	2. Define @ndo_get_stats to update a net_device_stats structure
  *	   (which should normally be dev->stats) and return a pointer to
  *	   it. The structure may be changed asynchronously only if each
@@ -888,10 +877,7 @@ struct net_device {
 	int			ifindex;
 	int			iflink;
 
-	union {
-		struct rtnl_link_stats64 stats64;
-		struct net_device_stats stats;
-	};
+	struct net_device_stats	stats;
 
 #ifdef CONFIG_WIRELESS_EXT
 	/* List of functions to handle Wireless Extensions (instead of ioctl).
@@ -2147,7 +2133,7 @@ extern void		dev_mcast_init(void);
 extern const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 						     struct rtnl_link_stats64 *storage);
 extern void		dev_txq_stats_fold(const struct net_device *dev,
-					   struct net_device_stats *stats);
+					   struct rtnl_link_stats64 *stats);
 
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index a1b8171cfa7b..7cb285f96b99 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -805,7 +805,7 @@ static u32 vlan_ethtool_get_flags(struct net_device *dev)
 
 static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
-	dev_txq_stats_fold(dev, (struct net_device_stats *)stats);
+	dev_txq_stats_fold(dev, stats);
 
 	if (vlan_dev_info(dev)->vlan_rx_stats) {
 		struct vlan_rx_stats *p, accum = {0};
diff --git a/net/core/dev.c b/net/core/dev.c
index eb4201cf9c8c..79ee26ef5095 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5274,10 +5274,10 @@ void netdev_run_todo(void)
 /**
  *	dev_txq_stats_fold - fold tx_queues stats
  *	@dev: device to get statistics from
- *	@stats: struct net_device_stats to hold results
+ *	@stats: struct rtnl_link_stats64 to hold results
  */
 void dev_txq_stats_fold(const struct net_device *dev,
-			struct net_device_stats *stats)
+			struct rtnl_link_stats64 *stats)
 {
 	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
 	unsigned int i;
@@ -5297,6 +5297,27 @@ void dev_txq_stats_fold(const struct net_device *dev,
 }
 EXPORT_SYMBOL(dev_txq_stats_fold);
 
+/* Convert net_device_stats to rtnl_link_stats64.  They have the same
+ * fields in the same order, with only the type differing.
+ */
+static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+				    const struct net_device_stats *netdev_stats)
+{
+#if BITS_PER_LONG == 64
+        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+        memcpy(stats64, netdev_stats, sizeof(*stats64));
+#else
+	size_t i, n = sizeof(*stats64) / sizeof(u64);
+	const unsigned long *src = (const unsigned long *)netdev_stats;
+	u64 *dst = (u64 *)stats64;
+
+	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
+		     sizeof(*stats64) / sizeof(u64));
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+#endif
+}
+
 /**
  *	dev_get_stats	- get network device statistics
  *	@dev: device to get statistics from
@@ -5317,11 +5338,11 @@ const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 		return ops->ndo_get_stats64(dev, storage);
 	}
 	if (ops->ndo_get_stats) {
-		memcpy(storage, ops->ndo_get_stats(dev), sizeof(*storage));
+		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
 		return storage;
 	}
-	memcpy(storage, &dev->stats, sizeof(*storage));
-	dev_txq_stats_fold(dev, (struct net_device_stats *)storage);
+	netdev_stats_to_stats64(storage, &dev->stats);
+	dev_txq_stats_fold(dev, storage);
 	return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
-- 
cgit v1.2.3-59-g8ed1b


From d77535162e736c47978d5c01469c56e1781dc91b Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 9 Jul 2010 09:12:41 +0000
Subject: net: Document that dev_get_stats() returns the given pointer

Document that dev_get_stats() returns the same stats pointer it was
given.  Remove const qualification from the returned pointer since the
caller may do what it likes with that structure.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 ++--
 net/core/dev.c            | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 17e95e37aed9..c4fedf000541 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2130,8 +2130,8 @@ extern void		netdev_features_change(struct net_device *dev);
 /* Load a device via the kmod */
 extern void		dev_load(struct net *net, const char *name);
 extern void		dev_mcast_init(void);
-extern const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
-						     struct rtnl_link_stats64 *storage);
+extern struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+					       struct rtnl_link_stats64 *storage);
 extern void		dev_txq_stats_fold(const struct net_device *dev,
 					   struct rtnl_link_stats64 *stats);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 79ee26ef5095..e2b9fa2c917e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5323,13 +5323,13 @@ static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
  *	@dev: device to get statistics from
  *	@storage: place to store stats
  *
- *	Get network statistics from device. The device driver may provide
- *	its own method by setting dev->netdev_ops->get_stats64 or
- *	dev->netdev_ops->get_stats; otherwise the internal statistics
- *	structure is used.
+ *	Get network statistics from device. Return @storage.
+ *	The device driver may provide its own method by setting
+ *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
+ *	otherwise the internal statistics structure is used.
  */
-const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
-					      struct rtnl_link_stats64 *storage)
+struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+					struct rtnl_link_stats64 *storage)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
-- 
cgit v1.2.3-59-g8ed1b


From 69c8f52b3897f2faf8510ea7ede8fffabe26c531 Mon Sep 17 00:00:00 2001
From: "Justin P. Mattock" <justinmattock@gmail.com>
Date: Thu, 1 Jul 2010 14:28:27 -0700
Subject: fix #warning about using kernel headers in userpsace

Move the preprocessor #warning message:
warning: #warning Attempt to use kernel headers from user space,
see http://kernelnewbies.org/KernelHeaders
from kernel.h to types.h.

And also fixe the #warning message due to the preprocessor not being able to
read the web address due to it thinking it was the start of a comment.  also
remove the extra #ifndef _KERNEL_ since it's already there.

Signed-off-by: Justin P. Mattock <justinmattock@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/kernel.h | 6 ------
 include/linux/types.h  | 5 ++++-
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8317ec4b9f3b..bd8501a8ca1c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -728,12 +728,6 @@ extern int do_sysinfo(struct sysinfo *info);
 
 #endif /* __KERNEL__ */
 
-#ifndef __EXPORTED_HEADERS__
-#ifndef __KERNEL__
-#warning Attempt to use kernel headers from user space, see http://kernelnewbies.org/KernelHeaders
-#endif /* __KERNEL__ */
-#endif /* __EXPORTED_HEADERS__ */
-
 #define SI_LOAD_SHIFT	16
 struct sysinfo {
 	long uptime;			/* Seconds since boot */
diff --git a/include/linux/types.h b/include/linux/types.h
index 23d237a075e2..331d8baabcf2 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -8,7 +8,10 @@
 
 #define DECLARE_BITMAP(name,bits) \
 	unsigned long name[BITS_TO_LONGS(bits)]
-
+#else
+#ifndef __EXPORTED_HEADERS__
+#warning "Attempt to use kernel headers from user space, see http://kernelnewbies.org/KernelHeaders"
+#endif /* __EXPORTED_HEADERS__ */
 #endif
 
 #include <linux/posix_types.h>
-- 
cgit v1.2.3-59-g8ed1b


From b27d63d8f8d34af57805f56005e217c150187531 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Thu, 1 Jul 2010 20:48:44 +0200
Subject: fix comment typos concerning "sequential"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/media/video/gspca/sunplus.c | 4 ++--
 include/linux/jffs2.h               | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/gspca/sunplus.c b/drivers/media/video/gspca/sunplus.c
index 0c786e00ebcf..d0a133abe767 100644
--- a/drivers/media/video/gspca/sunplus.c
+++ b/drivers/media/video/gspca/sunplus.c
@@ -805,7 +805,7 @@ static int sd_init(struct gspca_dev *gspca_dev)
 			/* Set AE AWB Banding Type 3-> 50Hz 2-> 60Hz */
 			spca504A_acknowledged_command(gspca_dev, 0x24,
 							8, 3, 0x9e, 1);
-			/* Twice sequencial need status 0xff->0x9e->0x9d */
+			/* Twice sequential need status 0xff->0x9e->0x9d */
 			spca504A_acknowledged_command(gspca_dev, 0x24,
 							8, 3, 0x9e, 0);
 
@@ -880,7 +880,7 @@ static int sd_start(struct gspca_dev *gspca_dev)
 			/* Set AE AWB Banding Type 3-> 50Hz 2-> 60Hz */
 			spca504A_acknowledged_command(gspca_dev, 0x24,
 							8, 3, 0x9e, 1);
-			/* Twice sequencial need status 0xff->0x9e->0x9d */
+			/* Twice sequential need status 0xff->0x9e->0x9d */
 			spca504A_acknowledged_command(gspca_dev, 0x24,
 							8, 3, 0x9e, 0);
 			spca504A_acknowledged_command(gspca_dev, 0x24,
diff --git a/include/linux/jffs2.h b/include/linux/jffs2.h
index 0874ab59ffef..edb9231f1898 100644
--- a/include/linux/jffs2.h
+++ b/include/linux/jffs2.h
@@ -185,7 +185,7 @@ struct jffs2_raw_xref
 	jint32_t hdr_crc;
 	jint32_t ino;		/* inode number */
 	jint32_t xid;		/* XATTR identifier number */
-	jint32_t xseqno;	/* xref sequencial number */
+	jint32_t xseqno;	/* xref sequential number */
 	jint32_t node_crc;
 } __attribute__((packed));
 
-- 
cgit v1.2.3-59-g8ed1b


From ab0cfb928a3839e21942a28a86ad88e56ea3b136 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Tue, 6 Jul 2010 18:12:39 +0530
Subject: fscache: fix a trivial typo in the comment

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/fscache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 595ce49288b7..1c0fc3e62f41 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -85,7 +85,7 @@ struct fscache_cookie_def {
 
 	/* get an index key
 	 * - should store the key data in the buffer
-	 * - should return the amount of amount stored
+	 * - should return the amount of data stored
 	 * - not permitted to return an error
 	 * - the netfs data from the cookie being used as the source is
 	 *   presented
-- 
cgit v1.2.3-59-g8ed1b


From 49a3df804bec09b8ee8196f79b81757e95cc6de4 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Tue, 6 Jul 2010 18:29:45 +0530
Subject: fscache: fix missing kerneldoc annotation

.. and make kerneldoc scripts happy.

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/fscache.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 1c0fc3e62f41..ec0dad5ab90f 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -454,6 +454,7 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
  * @cookie: The cookie representing the cache object
  * @mapping: The netfs inode mapping to which the pages will be attached
  * @pages: A list of potential netfs pages to be filled
+ * @nr_pages: Number of pages to be read and/or allocated
  * @end_io_func: The callback to invoke when and if each page is filled
  * @context: An arbitrary piece of data to pass on to end_io_func()
  * @gfp: The conditions under which memory allocation should be made
-- 
cgit v1.2.3-59-g8ed1b


From a1d75f258230b75d46aecdf28b2e732413028863 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 12 Jul 2010 14:41:40 +0200
Subject: fuse: add store request

Userspace filesystem can request data to be stored in the inode's
mapping.  This request is synchronous and has no reply.  If the write
to the fuse device returns an error then the store request was not
fully completed (but may have updated some pages).

If the stored data overflows the current file size, then the size is
extended, similarly to a write(2) on the filesystem.

Pages which have been completely stored are marked uptodate.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c        | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/file.c       |  2 +-
 fs/fuse/fuse_i.h     |  2 ++
 include/linux/fuse.h | 13 +++++++-
 4 files changed, 103 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 7eb80d33c4f3..8e01c865586e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1231,6 +1231,91 @@ err:
 	return err;
 }
 
+static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
+			     struct fuse_copy_state *cs)
+{
+	struct fuse_notify_store_out outarg;
+	struct inode *inode;
+	struct address_space *mapping;
+	u64 nodeid;
+	int err;
+	pgoff_t index;
+	unsigned int offset;
+	unsigned int num;
+	loff_t file_size;
+	loff_t end;
+
+	err = -EINVAL;
+	if (size < sizeof(outarg))
+		goto out_finish;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto out_finish;
+
+	err = -EINVAL;
+	if (size - sizeof(outarg) != outarg.size)
+		goto out_finish;
+
+	nodeid = outarg.nodeid;
+
+	down_read(&fc->killsb);
+
+	err = -ENOENT;
+	if (!fc->sb)
+		goto out_up_killsb;
+
+	inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+	if (!inode)
+		goto out_up_killsb;
+
+	mapping = inode->i_mapping;
+	index = outarg.offset >> PAGE_CACHE_SHIFT;
+	offset = outarg.offset & ~PAGE_CACHE_MASK;
+	file_size = i_size_read(inode);
+	end = outarg.offset + outarg.size;
+	if (end > file_size) {
+		file_size = end;
+		fuse_write_update_size(inode, file_size);
+	}
+
+	num = outarg.size;
+	while (num) {
+		struct page *page;
+		unsigned int this_num;
+
+		err = -ENOMEM;
+		page = find_or_create_page(mapping, index,
+					   mapping_gfp_mask(mapping));
+		if (!page)
+			goto out_iput;
+
+		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+		err = fuse_copy_page(cs, &page, offset, this_num, 0);
+		if (!err && offset == 0 && (num != 0 || file_size == end))
+			SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		if (err)
+			goto out_iput;
+
+		num -= this_num;
+		offset = 0;
+		index++;
+	}
+
+	err = 0;
+
+out_iput:
+	iput(inode);
+out_up_killsb:
+	up_read(&fc->killsb);
+out_finish:
+	fuse_copy_finish(cs);
+	return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -1244,6 +1329,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_INVAL_ENTRY:
 		return fuse_notify_inval_entry(fc, size, cs);
 
+	case FUSE_NOTIFY_STORE:
+		return fuse_notify_store(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ada0adeb3bb5..147c1f71bdb9 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
 	return 0;
 }
 
-static void fuse_write_update_size(struct inode *inode, loff_t pos)
+void fuse_write_update_size(struct inode *inode, loff_t pos)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8f309f04064e..61267d8d527b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -748,4 +748,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
+void fuse_write_update_size(struct inode *inode, loff_t pos);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 88e0eb596919..a90bd49834aa 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -37,6 +37,9 @@
  *
  * 7.14
  *  - add splice support to fuse device
+ *
+ * 7.15
+ *  - add store notify
  */
 
 #ifndef _LINUX_FUSE_H
@@ -68,7 +71,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 14
+#define FUSE_KERNEL_MINOR_VERSION 15
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -260,6 +263,7 @@ enum fuse_notify_code {
 	FUSE_NOTIFY_POLL   = 1,
 	FUSE_NOTIFY_INVAL_INODE = 2,
 	FUSE_NOTIFY_INVAL_ENTRY = 3,
+	FUSE_NOTIFY_STORE = 4,
 	FUSE_NOTIFY_CODE_MAX,
 };
 
@@ -568,4 +572,11 @@ struct fuse_notify_inval_entry_out {
 	__u32	padding;
 };
 
+struct fuse_notify_store_out {
+	__u64	nodeid;
+	__u64	offset;
+	__u32	size;
+	__u32	padding;
+};
+
 #endif /* _LINUX_FUSE_H */
-- 
cgit v1.2.3-59-g8ed1b


From 2d45ba381a74a743eeaa2b06c7c5c0d2bf73ba1a Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 12 Jul 2010 14:41:40 +0200
Subject: fuse: add retrieve request

Userspace filesystem can request data to be retrieved from the inode's
mapping.  This request is synchronous and the retrieved data is queued
as a new request.  If the write to the fuse device returns an error
then the retrieve request was not completed and a reply will not be
sent.

Only present pages are returned in the retrieve reply.  Retrieving
stops when it finds a non-present page and only data prior to that is
returned.

This request doesn't change the dirty state of pages.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c        | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/fuse_i.h     |   1 +
 include/linux/fuse.h |  21 +++++++++
 3 files changed, 152 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8e01c865586e..69ad053ffd78 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc)
 
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-	req->in.h.unique = fuse_get_unique(fc);
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
 	list_add_tail(&req->list, &fc->pending);
@@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
 		req = list_entry(fc->bg_queue.next, struct fuse_req, list);
 		list_del(&req->list);
 		fc->active_background++;
+		req->in.h.unique = fuse_get_unique(fc);
 		queue_request(fc, req);
 	}
 }
@@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 	else if (fc->conn_error)
 		req->out.h.error = -ECONNREFUSED;
 	else {
+		req->in.h.unique = fuse_get_unique(fc);
 		queue_request(fc, req);
 		/* acquire extra reference, since request is still needed
 		   after request_end() */
@@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 }
 EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
+static int fuse_request_send_notify_reply(struct fuse_conn *fc,
+					  struct fuse_req *req, u64 unique)
+{
+	int err = -ENODEV;
+
+	req->isreply = 0;
+	req->in.h.unique = unique;
+	spin_lock(&fc->lock);
+	if (fc->connected) {
+		queue_request(fc, req);
+		err = 0;
+	}
+	spin_unlock(&fc->lock);
+
+	return err;
+}
+
 /*
  * Called under fc->lock
  *
@@ -1316,6 +1334,114 @@ out_finish:
 	return err;
 }
 
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int i;
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		page_cache_release(page);
+	}
+}
+
+static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+			 struct fuse_notify_retrieve_out *outarg)
+{
+	int err;
+	struct address_space *mapping = inode->i_mapping;
+	struct fuse_req *req;
+	pgoff_t index;
+	loff_t file_size;
+	unsigned int num;
+	unsigned int offset;
+	size_t total_len;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	offset = outarg->offset & ~PAGE_CACHE_MASK;
+
+	req->in.h.opcode = FUSE_NOTIFY_REPLY;
+	req->in.h.nodeid = outarg->nodeid;
+	req->in.numargs = 2;
+	req->in.argpages = 1;
+	req->page_offset = offset;
+	req->end = fuse_retrieve_end;
+
+	index = outarg->offset >> PAGE_CACHE_SHIFT;
+	file_size = i_size_read(inode);
+	num = outarg->size;
+	if (outarg->offset > file_size)
+		num = 0;
+	else if (outarg->offset + num > file_size)
+		num = file_size - outarg->offset;
+
+	while (num) {
+		struct page *page;
+		unsigned int this_num;
+
+		page = find_get_page(mapping, index);
+		if (!page)
+			break;
+
+		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+		req->pages[req->num_pages] = page;
+		req->num_pages++;
+
+		num -= this_num;
+		total_len += this_num;
+	}
+	req->misc.retrieve_in.offset = outarg->offset;
+	req->misc.retrieve_in.size = total_len;
+	req->in.args[0].size = sizeof(req->misc.retrieve_in);
+	req->in.args[0].value = &req->misc.retrieve_in;
+	req->in.args[1].size = total_len;
+
+	err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
+	if (err)
+		fuse_retrieve_end(fc, req);
+
+	return err;
+}
+
+static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
+				struct fuse_copy_state *cs)
+{
+	struct fuse_notify_retrieve_out outarg;
+	struct inode *inode;
+	int err;
+
+	err = -EINVAL;
+	if (size != sizeof(outarg))
+		goto copy_finish;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto copy_finish;
+
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (fc->sb) {
+		u64 nodeid = outarg.nodeid;
+
+		inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+		if (inode) {
+			err = fuse_retrieve(fc, inode, &outarg);
+			iput(inode);
+		}
+	}
+	up_read(&fc->killsb);
+
+	return err;
+
+copy_finish:
+	fuse_copy_finish(cs);
+	return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -1332,6 +1458,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_STORE:
 		return fuse_notify_store(fc, size, cs);
 
+	case FUSE_NOTIFY_RETRIEVE:
+		return fuse_notify_retrieve(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 61267d8d527b..57d4a3a0f102 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,7 @@ struct fuse_req {
 			struct fuse_write_in in;
 			struct fuse_write_out out;
 		} write;
+		struct fuse_notify_retrieve_in retrieve_in;
 		struct fuse_lk_in lk_in;
 	} misc;
 
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index a90bd49834aa..c3c578e09833 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -40,6 +40,7 @@
  *
  * 7.15
  *  - add store notify
+ *  - add retrieve notify
  */
 
 #ifndef _LINUX_FUSE_H
@@ -254,6 +255,7 @@ enum fuse_opcode {
 	FUSE_DESTROY       = 38,
 	FUSE_IOCTL         = 39,
 	FUSE_POLL          = 40,
+	FUSE_NOTIFY_REPLY  = 41,
 
 	/* CUSE specific operations */
 	CUSE_INIT          = 4096,
@@ -264,6 +266,7 @@ enum fuse_notify_code {
 	FUSE_NOTIFY_INVAL_INODE = 2,
 	FUSE_NOTIFY_INVAL_ENTRY = 3,
 	FUSE_NOTIFY_STORE = 4,
+	FUSE_NOTIFY_RETRIEVE = 5,
 	FUSE_NOTIFY_CODE_MAX,
 };
 
@@ -579,4 +582,22 @@ struct fuse_notify_store_out {
 	__u32	padding;
 };
 
+struct fuse_notify_retrieve_out {
+	__u64	notify_unique;
+	__u64	nodeid;
+	__u64	offset;
+	__u32	size;
+	__u32	padding;
+};
+
+/* Matches the size of fuse_write_in */
+struct fuse_notify_retrieve_in {
+	__u64	dummy1;
+	__u64	offset;
+	__u32	size;
+	__u32	dummy2;
+	__u64	dummy3;
+	__u64	dummy4;
+};
+
 #endif /* _LINUX_FUSE_H */
-- 
cgit v1.2.3-59-g8ed1b


From 02d37bed188c500ee7afb0a2dc6b65a80704c58e Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 8 Jul 2010 16:09:06 +0200
Subject: firewire: core: integrate software-forced bus resets with bus
 management

Bus resets which are triggered
  - by the kernel drivers after updates of the local nodes' config ROM,
  - by userspace software via ioctl
shall be deferred until after >=2 seconds after the last bus reset.

If multiple modifications of the local nodes' config ROM happen in a row,
only a single bus reset should happen after them.

When the local node's link goes from inactive to active or vice versa,
and at the two occasions of bus resets mentioned above --- and if the
current gap count differs from 63 --- the bus reset should be preceded
by a PHY configuration packet that reaffirms the gap count.  Otherwise a
bus manager would have to reset the bus again right after that.

This is necessary to promote bus stability, e.g. leave grace periods for
allocations and reallocations of isochronous channels and bandwidth,
SBP-2 reconnections etc.; see IEEE 1394 clause 8.2.1.

This change implements all of the above by moving bus reset initiation
into a delayed work (except for bus resets which are triggered by the
bus manager workqueue job and are performed there immediately).  It
comes with a necessary addition to the card driver methods that allows
to get the current gap count from PHY registers.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-card.c        | 68 ++++++++++++++++++++++++++++---------
 drivers/firewire/core-cdev.c        |  3 +-
 drivers/firewire/core-transaction.c | 18 ++++++++--
 drivers/firewire/core.h             |  6 +++-
 drivers/firewire/ohci.c             | 53 +++++++++++++++++++++--------
 include/linux/firewire.h            |  6 ++--
 6 files changed, 116 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 6c316cfe70c4..2bb5c036e806 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -204,6 +204,45 @@ void fw_core_remove_descriptor(struct fw_descriptor *desc)
 }
 EXPORT_SYMBOL(fw_core_remove_descriptor);
 
+static int reset_bus(struct fw_card *card, bool short_reset)
+{
+	int reg = short_reset ? 5 : 1;
+	int bit = short_reset ? PHY_BUS_SHORT_RESET : PHY_BUS_RESET;
+
+	return card->driver->update_phy_reg(card, reg, 0, bit);
+}
+
+void fw_schedule_bus_reset(struct fw_card *card, bool delayed, bool short_reset)
+{
+	/* We don't try hard to sort out requests of long vs. short resets. */
+	card->br_short = short_reset;
+
+	/* Use an arbitrary short delay to combine multiple reset requests. */
+	fw_card_get(card);
+	if (!schedule_delayed_work(&card->br_work,
+				   delayed ? DIV_ROUND_UP(HZ, 100) : 0))
+		fw_card_put(card);
+}
+EXPORT_SYMBOL(fw_schedule_bus_reset);
+
+static void br_work(struct work_struct *work)
+{
+	struct fw_card *card = container_of(work, struct fw_card, br_work.work);
+
+	/* Delay for 2s after last reset per IEEE 1394 clause 8.2.1. */
+	if (card->reset_jiffies != 0 &&
+	    time_is_after_jiffies(card->reset_jiffies + 2 * HZ)) {
+		if (!schedule_delayed_work(&card->br_work, 2 * HZ))
+			fw_card_put(card);
+		return;
+	}
+
+	fw_send_phy_config(card, FW_PHY_CONFIG_NO_NODE_ID, card->generation,
+			   FW_PHY_CONFIG_CURRENT_GAP_COUNT);
+	reset_bus(card, card->br_short);
+	fw_card_put(card);
+}
+
 static void allocate_broadcast_channel(struct fw_card *card, int generation)
 {
 	int channel, bandwidth = 0;
@@ -230,13 +269,13 @@ static const char gap_count_table[] = {
 void fw_schedule_bm_work(struct fw_card *card, unsigned long delay)
 {
 	fw_card_get(card);
-	if (!schedule_delayed_work(&card->work, delay))
+	if (!schedule_delayed_work(&card->bm_work, delay))
 		fw_card_put(card);
 }
 
-static void fw_card_bm_work(struct work_struct *work)
+static void bm_work(struct work_struct *work)
 {
-	struct fw_card *card = container_of(work, struct fw_card, work.work);
+	struct fw_card *card = container_of(work, struct fw_card, bm_work.work);
 	struct fw_device *root_device;
 	struct fw_node *root_node;
 	int root_id, new_root_id, irm_id, bm_id, local_id;
@@ -413,7 +452,7 @@ static void fw_card_bm_work(struct work_struct *work)
 		fw_notify("phy config: card %d, new root=%x, gap_count=%d\n",
 			  card->index, new_root_id, gap_count);
 		fw_send_phy_config(card, new_root_id, generation, gap_count);
-		fw_core_initiate_bus_reset(card, 1);
+		reset_bus(card, true);
 		/* Will allocate broadcast channel after the reset. */
 		goto out;
 	}
@@ -465,7 +504,8 @@ void fw_card_initialize(struct fw_card *card,
 
 	card->local_node = NULL;
 
-	INIT_DELAYED_WORK(&card->work, fw_card_bm_work);
+	INIT_DELAYED_WORK(&card->br_work, br_work);
+	INIT_DELAYED_WORK(&card->bm_work, bm_work);
 }
 EXPORT_SYMBOL(fw_card_initialize);
 
@@ -491,7 +531,6 @@ int fw_card_add(struct fw_card *card,
 }
 EXPORT_SYMBOL(fw_card_add);
 
-
 /*
  * The next few functions implement a dummy driver that is used once a card
  * driver shuts down an fw_card.  This allows the driver to cleanly unload,
@@ -507,6 +546,11 @@ static int dummy_enable(struct fw_card *card,
 	return -1;
 }
 
+static int dummy_read_phy_reg(struct fw_card *card, int address)
+{
+	return -ENODEV;
+}
+
 static int dummy_update_phy_reg(struct fw_card *card, int address,
 				int clear_bits, int set_bits)
 {
@@ -547,6 +591,7 @@ static int dummy_enable_phys_dma(struct fw_card *card,
 
 static const struct fw_card_driver dummy_driver_template = {
 	.enable          = dummy_enable,
+	.read_phy_reg    = dummy_read_phy_reg,
 	.update_phy_reg  = dummy_update_phy_reg,
 	.set_config_rom  = dummy_set_config_rom,
 	.send_request    = dummy_send_request,
@@ -568,7 +613,7 @@ void fw_core_remove_card(struct fw_card *card)
 
 	card->driver->update_phy_reg(card, 4,
 				     PHY_LINK_ACTIVE | PHY_CONTENDER, 0);
-	fw_core_initiate_bus_reset(card, 1);
+	fw_schedule_bus_reset(card, false, true);
 
 	mutex_lock(&card_mutex);
 	list_del_init(&card->link);
@@ -588,12 +633,3 @@ void fw_core_remove_card(struct fw_card *card)
 	WARN_ON(!list_empty(&card->transaction_list));
 }
 EXPORT_SYMBOL(fw_core_remove_card);
-
-int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset)
-{
-	int reg = short_reset ? 5 : 1;
-	int bit = short_reset ? PHY_BUS_SHORT_RESET : PHY_BUS_RESET;
-
-	return card->driver->update_phy_reg(card, reg, 0, bit);
-}
-EXPORT_SYMBOL(fw_core_initiate_bus_reset);
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 7a690c466ce9..ee2e87353102 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -820,8 +820,9 @@ static int ioctl_send_response(struct client *client, union ioctl_arg *arg)
 
 static int ioctl_initiate_bus_reset(struct client *client, union ioctl_arg *arg)
 {
-	return fw_core_initiate_bus_reset(client->device->card,
+	fw_schedule_bus_reset(client->device->card, true,
 			arg->initiate_bus_reset.type == FW_CDEV_SHORT_RESET);
+	return 0;
 }
 
 static void release_descriptor(struct client *client,
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 7813da8a1293..5f5a7852f7ac 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -426,9 +426,21 @@ void fw_send_phy_config(struct fw_card *card,
 			int node_id, int generation, int gap_count)
 {
 	long timeout = DIV_ROUND_UP(HZ, 10);
-	u32 data = PHY_IDENTIFIER(PHY_PACKET_CONFIG) |
-		   PHY_CONFIG_ROOT_ID(node_id) |
-		   PHY_CONFIG_GAP_COUNT(gap_count);
+	u32 data = PHY_IDENTIFIER(PHY_PACKET_CONFIG);
+
+	if (node_id != FW_PHY_CONFIG_NO_NODE_ID)
+		data |= PHY_CONFIG_ROOT_ID(node_id);
+
+	if (gap_count == FW_PHY_CONFIG_CURRENT_GAP_COUNT) {
+		gap_count = card->driver->read_phy_reg(card, 1);
+		if (gap_count < 0)
+			return;
+
+		gap_count &= 63;
+		if (gap_count == 63)
+			return;
+	}
+	data |= PHY_CONFIG_GAP_COUNT(gap_count);
 
 	mutex_lock(&phy_config_mutex);
 
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 3000dd74acfd..ff6c90922001 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -51,6 +51,7 @@ struct fw_card_driver {
 	int (*enable)(struct fw_card *card,
 		      const __be32 *config_rom, size_t length);
 
+	int (*read_phy_reg)(struct fw_card *card, int address);
 	int (*update_phy_reg)(struct fw_card *card, int address,
 			      int clear_bits, int set_bits);
 
@@ -102,8 +103,8 @@ void fw_card_initialize(struct fw_card *card,
 int fw_card_add(struct fw_card *card,
 		u32 max_receive, u32 link_speed, u64 guid);
 void fw_core_remove_card(struct fw_card *card);
-int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset);
 int fw_compute_block_crc(__be32 *block);
+void fw_schedule_bus_reset(struct fw_card *card, bool delayed, bool short_reset);
 void fw_schedule_bm_work(struct fw_card *card, unsigned long delay);
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
@@ -225,6 +226,9 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *packet);
 int fw_get_response_length(struct fw_request *request);
 void fw_fill_response(struct fw_packet *response, u32 *request_header,
 		      int rcode, void *payload, size_t length);
+
+#define FW_PHY_CONFIG_NO_NODE_ID	-1
+#define FW_PHY_CONFIG_CURRENT_GAP_COUNT	-1
 void fw_send_phy_config(struct fw_card *card,
 			int node_id, int generation, int gap_count);
 
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index a4bbf3dadf58..bb6a92bc9e6a 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -34,6 +34,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
 #include <linux/spinlock.h>
@@ -182,6 +183,8 @@ struct fw_ohci {
 	 */
 	spinlock_t lock;
 
+	struct mutex phy_reg_mutex;
+
 	struct ar_context ar_request_ctx;
 	struct ar_context ar_response_ctx;
 	struct context at_request_ctx;
@@ -517,13 +520,10 @@ static int write_phy_reg(const struct fw_ohci *ohci, int addr, u32 val)
 	return -EBUSY;
 }
 
-static int ohci_update_phy_reg(struct fw_card *card, int addr,
-			       int clear_bits, int set_bits)
+static int update_phy_reg(struct fw_ohci *ohci, int addr,
+			  int clear_bits, int set_bits)
 {
-	struct fw_ohci *ohci = fw_ohci(card);
-	int ret;
-
-	ret = read_phy_reg(ohci, addr);
+	int ret = read_phy_reg(ohci, addr);
 	if (ret < 0)
 		return ret;
 
@@ -541,13 +541,38 @@ static int read_paged_phy_reg(struct fw_ohci *ohci, int page, int addr)
 {
 	int ret;
 
-	ret = ohci_update_phy_reg(&ohci->card, 7, PHY_PAGE_SELECT, page << 5);
+	ret = update_phy_reg(ohci, 7, PHY_PAGE_SELECT, page << 5);
 	if (ret < 0)
 		return ret;
 
 	return read_phy_reg(ohci, addr);
 }
 
+static int ohci_read_phy_reg(struct fw_card *card, int addr)
+{
+	struct fw_ohci *ohci = fw_ohci(card);
+	int ret;
+
+	mutex_lock(&ohci->phy_reg_mutex);
+	ret = read_phy_reg(ohci, addr);
+	mutex_unlock(&ohci->phy_reg_mutex);
+
+	return ret;
+}
+
+static int ohci_update_phy_reg(struct fw_card *card, int addr,
+			       int clear_bits, int set_bits)
+{
+	struct fw_ohci *ohci = fw_ohci(card);
+	int ret;
+
+	mutex_lock(&ohci->phy_reg_mutex);
+	ret = update_phy_reg(ohci, addr, clear_bits, set_bits);
+	mutex_unlock(&ohci->phy_reg_mutex);
+
+	return ret;
+}
+
 static int ar_context_add_page(struct ar_context *ctx)
 {
 	struct device *dev = ctx->ohci->card.device;
@@ -1676,7 +1701,7 @@ static int configure_1394a_enhancements(struct fw_ohci *ohci)
 		clear = PHY_ENABLE_ACCEL | PHY_ENABLE_MULTI;
 		set = 0;
 	}
-	ret = ohci_update_phy_reg(&ohci->card, 5, clear, set);
+	ret = update_phy_reg(ohci, 5, clear, set);
 	if (ret < 0)
 		return ret;
 
@@ -1856,12 +1881,8 @@ static int ohci_enable(struct fw_card *card,
 		  OHCI1394_HCControl_BIBimageValid);
 	flush_writes(ohci);
 
-	/*
-	 * We are ready to go, initiate bus reset to finish the
-	 * initialization.
-	 */
-
-	fw_core_initiate_bus_reset(&ohci->card, 1);
+	/* We are ready to go, reset bus to finish initialization. */
+	fw_schedule_bus_reset(&ohci->card, false, true);
 
 	return 0;
 }
@@ -1936,7 +1957,7 @@ static int ohci_set_config_rom(struct fw_card *card,
 	 * takes effect.
 	 */
 	if (ret == 0)
-		fw_core_initiate_bus_reset(&ohci->card, 1);
+		fw_schedule_bus_reset(&ohci->card, true, true);
 	else
 		dma_free_coherent(ohci->card.device, CONFIG_ROM_SIZE,
 				  next_config_rom, next_config_rom_bus);
@@ -2570,6 +2591,7 @@ static int ohci_queue_iso(struct fw_iso_context *base,
 
 static const struct fw_card_driver ohci_driver = {
 	.enable			= ohci_enable,
+	.read_phy_reg		= ohci_read_phy_reg,
 	.update_phy_reg		= ohci_update_phy_reg,
 	.set_config_rom		= ohci_set_config_rom,
 	.send_request		= ohci_send_request,
@@ -2645,6 +2667,7 @@ static int __devinit pci_probe(struct pci_dev *dev,
 	pci_set_drvdata(dev, ohci);
 
 	spin_lock_init(&ohci->lock);
+	mutex_init(&ohci->phy_reg_mutex);
 
 	tasklet_init(&ohci->bus_reset_tasklet,
 		     bus_reset_tasklet, (unsigned long)ohci);
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index db30a752a87a..adc5b55e6e5f 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -114,8 +114,10 @@ struct fw_card {
 
 	struct list_head link;
 
-	/* Work struct for BM duties. */
-	struct delayed_work work;
+	struct delayed_work br_work; /* bus reset job */
+	bool br_short;
+
+	struct delayed_work bm_work; /* bus manager job */
 	int bm_retries;
 	int bm_generation;
 	__be32 bm_transaction_data[2];
-- 
cgit v1.2.3-59-g8ed1b


From 4c879170296174bde05cd1c643dac16594edee77 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 7 Jul 2010 15:30:10 +0200
Subject: padata: Check for valid padata instance on start

This patch introduces the PADATA_INVALID flag which is
checked on padata start. This will be used to mark a padata
instance as invalid, if the padata cpumask does not intersect
with the active cpumask. we change padata_start to return an
error if the PADATA_INVALID is set. Also we adapt the only
padata user, pcrypt to this change.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcrypt.c        | 19 ++++++++++++++-----
 include/linux/padata.h |  3 ++-
 kernel/padata.c        | 18 ++++++++++++++++--
 3 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 247178cb98ec..71ae2b2ae33b 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -385,6 +385,7 @@ static struct crypto_template pcrypt_tmpl = {
 
 static int __init pcrypt_init(void)
 {
+	int err = -ENOMEM;
 	encwq = create_workqueue("pencrypt");
 	if (!encwq)
 		goto err;
@@ -400,14 +401,22 @@ static int __init pcrypt_init(void)
 
 	pcrypt_dec_padata = padata_alloc(cpu_possible_mask, decwq);
 	if (!pcrypt_dec_padata)
-		goto err_free_padata;
+		goto err_free_enc_padata;
 
-	padata_start(pcrypt_enc_padata);
-	padata_start(pcrypt_dec_padata);
+	err = padata_start(pcrypt_enc_padata);
+	if (err)
+		goto err_free_dec_padata;
+
+	err = padata_start(pcrypt_dec_padata);
+	if (err)
+		goto err_free_dec_padata;
 
 	return crypto_register_template(&pcrypt_tmpl);
 
-err_free_padata:
+err_free_dec_padata:
+	padata_free(pcrypt_dec_padata);
+
+err_free_enc_padata:
 	padata_free(pcrypt_enc_padata);
 
 err_destroy_decwq:
@@ -417,7 +426,7 @@ err_destroy_encwq:
 	destroy_workqueue(encwq);
 
 err:
-	return -ENOMEM;
+	return err;
 }
 
 static void __exit pcrypt_exit(void)
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 8d8406246eef..e4c17f9b7c9e 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -126,6 +126,7 @@ struct padata_instance {
 	u8			flags;
 #define	PADATA_INIT		1
 #define	PADATA_RESET		2
+#define	PADATA_INVALID		4
 };
 
 extern struct padata_instance *padata_alloc(const struct cpumask *cpumask,
@@ -138,6 +139,6 @@ extern int padata_set_cpumask(struct padata_instance *pinst,
 			      cpumask_var_t cpumask);
 extern int padata_add_cpu(struct padata_instance *pinst, int cpu);
 extern int padata_remove_cpu(struct padata_instance *pinst, int cpu);
-extern void padata_start(struct padata_instance *pinst);
+extern int padata_start(struct padata_instance *pinst);
 extern void padata_stop(struct padata_instance *pinst);
 #endif
diff --git a/kernel/padata.c b/kernel/padata.c
index ff8de1b71e4e..e7d723a3e31d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -485,6 +485,11 @@ static void padata_flush_queues(struct parallel_data *pd)
 	BUG_ON(atomic_read(&pd->refcnt) != 0);
 }
 
+static void __padata_start(struct padata_instance *pinst)
+{
+	pinst->flags |= PADATA_INIT;
+}
+
 /* Replace the internal control stucture with a new one. */
 static void padata_replace(struct padata_instance *pinst,
 			   struct parallel_data *pd_new)
@@ -619,11 +624,20 @@ EXPORT_SYMBOL(padata_remove_cpu);
  *
  * @pinst: padata instance to start
  */
-void padata_start(struct padata_instance *pinst)
+int padata_start(struct padata_instance *pinst)
 {
+	int err = 0;
+
 	mutex_lock(&pinst->lock);
-	pinst->flags |= PADATA_INIT;
+
+	if (pinst->flags & PADATA_INVALID)
+		err =-EINVAL;
+
+	 __padata_start(pinst);
+
 	mutex_unlock(&pinst->lock);
+
+	return err;
 }
 EXPORT_SYMBOL(padata_start);
 
-- 
cgit v1.2.3-59-g8ed1b


From 5f1a8c1bc724498ff32acbd59ed5263275676b9d Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 7 Jul 2010 15:32:39 +0200
Subject: padata: simplify serialization mechanism

We count the number of processed objects on a percpu basis,
so we need to go through all the percpu reorder queues to calculate
the sequence number of the next object that needs serialization.
This patch changes this to count the number of processed objects
global. So we can calculate the sequence number and the percpu
reorder queue of the next object that needs serialization without
searching through the percpu reorder queues. This avoids some
accesses to memory of foreign cpus.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |  6 ++---
 kernel/padata.c        | 71 ++++++++++++++------------------------------------
 2 files changed, 22 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index e4c17f9b7c9e..8844b851191e 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -67,7 +67,6 @@ struct padata_list {
  * @pwork: work struct for parallelization.
  * @swork: work struct for serialization.
  * @pd: Backpointer to the internal control structure.
- * @num_obj: Number of objects that are processed by this cpu.
  * @cpu_index: Index of the cpu.
  */
 struct padata_queue {
@@ -77,7 +76,6 @@ struct padata_queue {
 	struct work_struct	pwork;
 	struct work_struct	swork;
 	struct parallel_data    *pd;
-	atomic_t		num_obj;
 	int			cpu_index;
 };
 
@@ -93,6 +91,7 @@ struct padata_queue {
  * @max_seq_nr:  Maximal used sequence number.
  * @cpumask: cpumask in use.
  * @lock: Reorder lock.
+ * @processed: Number of already processed objects.
  * @timer: Reorder timer.
  */
 struct parallel_data {
@@ -103,7 +102,8 @@ struct parallel_data {
 	atomic_t                refcnt;
 	unsigned int		max_seq_nr;
 	cpumask_var_t		cpumask;
-	spinlock_t              lock;
+	spinlock_t              lock ____cacheline_aligned;
+	unsigned int            processed;
 	struct timer_list       timer;
 };
 
diff --git a/kernel/padata.c b/kernel/padata.c
index ae8defcf0622..450d67d394b0 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -170,79 +170,47 @@ EXPORT_SYMBOL(padata_do_parallel);
  */
 static struct padata_priv *padata_get_next(struct parallel_data *pd)
 {
-	int cpu, num_cpus, empty, calc_seq_nr;
-	int seq_nr, next_nr, overrun, next_overrun;
+	int cpu, num_cpus;
+	int next_nr, next_index;
 	struct padata_queue *queue, *next_queue;
 	struct padata_priv *padata;
 	struct padata_list *reorder;
 
-	empty = 0;
-	next_nr = -1;
-	next_overrun = 0;
-	next_queue = NULL;
-
 	num_cpus = cpumask_weight(pd->cpumask);
 
-	for_each_cpu(cpu, pd->cpumask) {
-		queue = per_cpu_ptr(pd->queue, cpu);
-		reorder = &queue->reorder;
-
-		/*
-		 * Calculate the seq_nr of the object that should be
-		 * next in this reorder queue.
-		 */
-		overrun = 0;
-		calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
-			       + queue->cpu_index;
-
-		if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
-			calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
-			overrun = 1;
-		}
-
-		if (!list_empty(&reorder->list)) {
-			padata = list_entry(reorder->list.next,
-					    struct padata_priv, list);
-
-			seq_nr  = padata->seq_nr;
-			BUG_ON(calc_seq_nr != seq_nr);
-		} else {
-			seq_nr = calc_seq_nr;
-			empty++;
-		}
-
-		if (next_nr < 0 || seq_nr < next_nr
-		    || (next_overrun && !overrun)) {
-			next_nr = seq_nr;
-			next_overrun = overrun;
-			next_queue = queue;
-		}
+	/*
+	 * Calculate the percpu reorder queue and the sequence
+	 * number of the next object.
+	 */
+	next_nr = pd->processed;
+	next_index = next_nr % num_cpus;
+	cpu = padata_index_to_cpu(pd, next_index);
+	next_queue = per_cpu_ptr(pd->queue, cpu);
+
+	if (unlikely(next_nr > pd->max_seq_nr)) {
+		next_nr = next_nr - pd->max_seq_nr - 1;
+		next_index = next_nr % num_cpus;
+		cpu = padata_index_to_cpu(pd, next_index);
+		next_queue = per_cpu_ptr(pd->queue, cpu);
+		pd->processed = 0;
 	}
 
 	padata = NULL;
 
-	if (empty == num_cpus)
-		goto out;
-
 	reorder = &next_queue->reorder;
 
 	if (!list_empty(&reorder->list)) {
 		padata = list_entry(reorder->list.next,
 				    struct padata_priv, list);
 
-		if (unlikely(next_overrun)) {
-			for_each_cpu(cpu, pd->cpumask) {
-				queue = per_cpu_ptr(pd->queue, cpu);
-				atomic_set(&queue->num_obj, 0);
-			}
-		}
+		BUG_ON(next_nr != padata->seq_nr);
 
 		spin_lock(&reorder->lock);
 		list_del_init(&padata->list);
 		atomic_dec(&pd->reorder_objects);
 		spin_unlock(&reorder->lock);
 
-		atomic_inc(&next_queue->num_obj);
+		pd->processed++;
 
 		goto out;
 	}
@@ -430,7 +398,6 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 
 		INIT_WORK(&queue->pwork, padata_parallel_worker);
 		INIT_WORK(&queue->swork, padata_serial_worker);
-		atomic_set(&queue->num_obj, 0);
 	}
 
 	num_cpus = cpumask_weight(pd->cpumask);
-- 
cgit v1.2.3-59-g8ed1b


From 035ebefc737cce56d3938e9b7eaa5ac0e9c28715 Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Tue, 13 Jul 2010 09:42:26 +0000
Subject: of/sparc: move is_root_node() to of.h

Rename is_root_node() to of_node_is_root() and make it available for
all archs to use, as it's not PROM-specific.

Signed-off-by: Andres Salomon <dilinger@queued.net>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/sparc/kernel/prom.h        | 8 --------
 arch/sparc/kernel/prom_64.c     | 6 +++---
 arch/sparc/kernel/prom_common.c | 2 +-
 include/linux/of.h              | 5 +++++
 4 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/kernel/prom.h b/arch/sparc/kernel/prom.h
index a8591ef2636d..eeb04a782ec8 100644
--- a/arch/sparc/kernel/prom.h
+++ b/arch/sparc/kernel/prom.h
@@ -9,14 +9,6 @@ extern void irq_trans_init(struct device_node *dp);
 
 extern unsigned int prom_unique_id;
 
-static inline int is_root_node(const struct device_node *dp)
-{
-	if (!dp)
-		return 0;
-
-	return (dp->parent == NULL);
-}
-
 extern char *build_path_component(struct device_node *dp);
 extern void of_console_init(void);
 
diff --git a/arch/sparc/kernel/prom_64.c b/arch/sparc/kernel/prom_64.c
index fb06ac2bd38f..0bffafdee350 100644
--- a/arch/sparc/kernel/prom_64.c
+++ b/arch/sparc/kernel/prom_64.c
@@ -21,7 +21,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/lmb.h>
-#include <linux/of_device.h>
+#include <linux/of.h>
 
 #include <asm/prom.h>
 #include <asm/oplib.h>
@@ -81,7 +81,7 @@ static void __init sun4v_path_component(struct device_node *dp, char *tmp_buf)
 		return;
 
 	regs = rprop->value;
-	if (!is_root_node(dp->parent)) {
+	if (!of_node_is_root(dp->parent)) {
 		sprintf(tmp_buf, "%s@%x,%x",
 			dp->name,
 			(unsigned int) (regs->phys_addr >> 32UL),
@@ -121,7 +121,7 @@ static void __init sun4u_path_component(struct device_node *dp, char *tmp_buf)
 		return;
 
 	regs = prop->value;
-	if (!is_root_node(dp->parent)) {
+	if (!of_node_is_root(dp->parent)) {
 		sprintf(tmp_buf, "%s@%x,%x",
 			dp->name,
 			(unsigned int) (regs->phys_addr >> 32UL),
diff --git a/arch/sparc/kernel/prom_common.c b/arch/sparc/kernel/prom_common.c
index 57ac9e28be0c..1f830da2ddf2 100644
--- a/arch/sparc/kernel/prom_common.c
+++ b/arch/sparc/kernel/prom_common.c
@@ -244,7 +244,7 @@ char * __init build_full_name(struct device_node *dp)
 
 	n = prom_early_alloc(len);
 	strcpy(n, dp->parent->full_name);
-	if (!is_root_node(dp->parent)) {
+	if (!of_node_is_root(dp->parent)) {
 		strcpy(n + plen, "/");
 		plen++;
 	}
diff --git a/include/linux/of.h b/include/linux/of.h
index a367e19bb3af..b0756f33249e 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -70,6 +70,11 @@ extern struct device_node *allnodes;
 extern struct device_node *of_chosen;
 extern rwlock_t devtree_lock;
 
+static inline bool of_node_is_root(const struct device_node *node)
+{
+	return node && (node->parent == NULL);
+}
+
 static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
 {
 	return test_bit(flag, &n->_flags);
-- 
cgit v1.2.3-59-g8ed1b


From 4cf51c383d7a8d472a6090a0d19c371d40e823c9 Mon Sep 17 00:00:00 2001
From: Joonyoung Shim <jy0922.shim@samsung.com>
Date: Wed, 14 Jul 2010 21:55:30 -0700
Subject: Input: Add ATMEL QT602240 touchscreen driver

The chip's full name is AT42QT602240 or ATMXT224. This is a capacitive
touchscreen supporting 10-contact multitouch and using I2C interface.

Signed-off-by: Joonyoung Shim <jy0922.shim@samsung.com>
Acked-by: Henrik Rydberg <rydberg@euromail.se>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/Kconfig       |   12 +
 drivers/input/touchscreen/Makefile      |    1 +
 drivers/input/touchscreen/qt602240_ts.c | 1401 +++++++++++++++++++++++++++++++
 include/linux/i2c/qt602240_ts.h         |   38 +
 4 files changed, 1452 insertions(+)
 create mode 100644 drivers/input/touchscreen/qt602240_ts.c
 create mode 100644 include/linux/i2c/qt602240_ts.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index ff18d896ea6d..7bfcfdff6cf8 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -291,6 +291,18 @@ config TOUCHSCREEN_PENMOUNT
 	  To compile this driver as a module, choose M here: the
 	  module will be called penmount.
 
+config TOUCHSCREEN_QT602240
+	tristate "QT602240 I2C Touchscreen"
+	depends on I2C
+	help
+	  Say Y here if you have the AT42QT602240/ATMXT224 I2C touchscreen
+	  connected to your system.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called qt602240_ts.
+
 config TOUCHSCREEN_MIGOR
 	tristate "Renesas MIGO-R touchscreen"
 	depends on SH_MIGOR && I2C
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index 9efdd4424757..779de0d9d413 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_TOUCHSCREEN_HTCPEN)	+= htcpen.o
 obj-$(CONFIG_TOUCHSCREEN_USB_COMPOSITE)	+= usbtouchscreen.o
 obj-$(CONFIG_TOUCHSCREEN_PCAP)		+= pcap_ts.o
 obj-$(CONFIG_TOUCHSCREEN_PENMOUNT)	+= penmount.o
+obj-$(CONFIG_TOUCHSCREEN_QT602240)	+= qt602240_ts.o
 obj-$(CONFIG_TOUCHSCREEN_S3C2410)	+= s3c2410_ts.o
 obj-$(CONFIG_TOUCHSCREEN_TOUCHIT213)	+= touchit213.o
 obj-$(CONFIG_TOUCHSCREEN_TOUCHRIGHT)	+= touchright.o
diff --git a/drivers/input/touchscreen/qt602240_ts.c b/drivers/input/touchscreen/qt602240_ts.c
new file mode 100644
index 000000000000..66b26ad3032a
--- /dev/null
+++ b/drivers/input/touchscreen/qt602240_ts.c
@@ -0,0 +1,1401 @@
+/*
+ * AT42QT602240/ATMXT224 Touchscreen driver
+ *
+ * Copyright (C) 2010 Samsung Electronics Co.Ltd
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/firmware.h>
+#include <linux/i2c.h>
+#include <linux/i2c/qt602240_ts.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+/* Version */
+#define QT602240_VER_20			20
+#define QT602240_VER_21			21
+#define QT602240_VER_22			22
+
+/* Slave addresses */
+#define QT602240_APP_LOW		0x4a
+#define QT602240_APP_HIGH		0x4b
+#define QT602240_BOOT_LOW		0x24
+#define QT602240_BOOT_HIGH		0x25
+
+/* Firmware */
+#define QT602240_FW_NAME		"qt602240.fw"
+
+/* Registers */
+#define QT602240_FAMILY_ID		0x00
+#define QT602240_VARIANT_ID		0x01
+#define QT602240_VERSION		0x02
+#define QT602240_BUILD			0x03
+#define QT602240_MATRIX_X_SIZE		0x04
+#define QT602240_MATRIX_Y_SIZE		0x05
+#define QT602240_OBJECT_NUM		0x06
+#define QT602240_OBJECT_START		0x07
+
+#define QT602240_OBJECT_SIZE		6
+
+/* Object types */
+#define QT602240_DEBUG_DIAGNOSTIC	37
+#define QT602240_GEN_MESSAGE		5
+#define QT602240_GEN_COMMAND		6
+#define QT602240_GEN_POWER		7
+#define QT602240_GEN_ACQUIRE		8
+#define QT602240_TOUCH_MULTI		9
+#define QT602240_TOUCH_KEYARRAY		15
+#define QT602240_TOUCH_PROXIMITY	23
+#define QT602240_PROCI_GRIPFACE		20
+#define QT602240_PROCG_NOISE		22
+#define QT602240_PROCI_ONETOUCH		24
+#define QT602240_PROCI_TWOTOUCH		27
+#define QT602240_SPT_COMMSCONFIG	18	/* firmware ver 21 over */
+#define QT602240_SPT_GPIOPWM		19
+#define QT602240_SPT_SELFTEST		25
+#define QT602240_SPT_CTECONFIG		28
+#define QT602240_SPT_USERDATA		38	/* firmware ver 21 over */
+
+/* QT602240_GEN_COMMAND field */
+#define QT602240_COMMAND_RESET		0
+#define QT602240_COMMAND_BACKUPNV	1
+#define QT602240_COMMAND_CALIBRATE	2
+#define QT602240_COMMAND_REPORTALL	3
+#define QT602240_COMMAND_DIAGNOSTIC	5
+
+/* QT602240_GEN_POWER field */
+#define QT602240_POWER_IDLEACQINT	0
+#define QT602240_POWER_ACTVACQINT	1
+#define QT602240_POWER_ACTV2IDLETO	2
+
+/* QT602240_GEN_ACQUIRE field */
+#define QT602240_ACQUIRE_CHRGTIME	0
+#define QT602240_ACQUIRE_TCHDRIFT	2
+#define QT602240_ACQUIRE_DRIFTST	3
+#define QT602240_ACQUIRE_TCHAUTOCAL	4
+#define QT602240_ACQUIRE_SYNC		5
+#define QT602240_ACQUIRE_ATCHCALST	6
+#define QT602240_ACQUIRE_ATCHCALSTHR	7
+
+/* QT602240_TOUCH_MULTI field */
+#define QT602240_TOUCH_CTRL		0
+#define QT602240_TOUCH_XORIGIN		1
+#define QT602240_TOUCH_YORIGIN		2
+#define QT602240_TOUCH_XSIZE		3
+#define QT602240_TOUCH_YSIZE		4
+#define QT602240_TOUCH_BLEN		6
+#define QT602240_TOUCH_TCHTHR		7
+#define QT602240_TOUCH_TCHDI		8
+#define QT602240_TOUCH_ORIENT		9
+#define QT602240_TOUCH_MOVHYSTI		11
+#define QT602240_TOUCH_MOVHYSTN		12
+#define QT602240_TOUCH_NUMTOUCH		14
+#define QT602240_TOUCH_MRGHYST		15
+#define QT602240_TOUCH_MRGTHR		16
+#define QT602240_TOUCH_AMPHYST		17
+#define QT602240_TOUCH_XRANGE_LSB	18
+#define QT602240_TOUCH_XRANGE_MSB	19
+#define QT602240_TOUCH_YRANGE_LSB	20
+#define QT602240_TOUCH_YRANGE_MSB	21
+#define QT602240_TOUCH_XLOCLIP		22
+#define QT602240_TOUCH_XHICLIP		23
+#define QT602240_TOUCH_YLOCLIP		24
+#define QT602240_TOUCH_YHICLIP		25
+#define QT602240_TOUCH_XEDGECTRL	26
+#define QT602240_TOUCH_XEDGEDIST	27
+#define QT602240_TOUCH_YEDGECTRL	28
+#define QT602240_TOUCH_YEDGEDIST	29
+#define QT602240_TOUCH_JUMPLIMIT	30	/* firmware ver 22 over */
+
+/* QT602240_PROCI_GRIPFACE field */
+#define QT602240_GRIPFACE_CTRL		0
+#define QT602240_GRIPFACE_XLOGRIP	1
+#define QT602240_GRIPFACE_XHIGRIP	2
+#define QT602240_GRIPFACE_YLOGRIP	3
+#define QT602240_GRIPFACE_YHIGRIP	4
+#define QT602240_GRIPFACE_MAXTCHS	5
+#define QT602240_GRIPFACE_SZTHR1	7
+#define QT602240_GRIPFACE_SZTHR2	8
+#define QT602240_GRIPFACE_SHPTHR1	9
+#define QT602240_GRIPFACE_SHPTHR2	10
+#define QT602240_GRIPFACE_SUPEXTTO	11
+
+/* QT602240_PROCI_NOISE field */
+#define QT602240_NOISE_CTRL		0
+#define QT602240_NOISE_OUTFLEN		1
+#define QT602240_NOISE_GCAFUL_LSB	3
+#define QT602240_NOISE_GCAFUL_MSB	4
+#define QT602240_NOISE_GCAFLL_LSB	5
+#define QT602240_NOISE_GCAFLL_MSB	6
+#define QT602240_NOISE_ACTVGCAFVALID	7
+#define QT602240_NOISE_NOISETHR		8
+#define QT602240_NOISE_FREQHOPSCALE	10
+#define QT602240_NOISE_FREQ0		11
+#define QT602240_NOISE_FREQ1		12
+#define QT602240_NOISE_FREQ2		13
+#define QT602240_NOISE_FREQ3		14
+#define QT602240_NOISE_FREQ4		15
+#define QT602240_NOISE_IDLEGCAFVALID	16
+
+/* QT602240_SPT_COMMSCONFIG */
+#define QT602240_COMMS_CTRL		0
+#define QT602240_COMMS_CMD		1
+
+/* QT602240_SPT_CTECONFIG field */
+#define QT602240_CTE_CTRL		0
+#define QT602240_CTE_CMD		1
+#define QT602240_CTE_MODE		2
+#define QT602240_CTE_IDLEGCAFDEPTH	3
+#define QT602240_CTE_ACTVGCAFDEPTH	4
+#define QT602240_CTE_VOLTAGE		5	/* firmware ver 21 over */
+
+#define QT602240_VOLTAGE_DEFAULT	2700000
+#define QT602240_VOLTAGE_STEP		10000
+
+/* Define for QT602240_GEN_COMMAND */
+#define QT602240_BOOT_VALUE		0xa5
+#define QT602240_BACKUP_VALUE		0x55
+#define QT602240_BACKUP_TIME		25	/* msec */
+#define QT602240_RESET_TIME		65	/* msec */
+
+#define QT602240_FWRESET_TIME		175	/* msec */
+
+/* Command to unlock bootloader */
+#define QT602240_UNLOCK_CMD_MSB		0xaa
+#define QT602240_UNLOCK_CMD_LSB		0xdc
+
+/* Bootloader mode status */
+#define QT602240_WAITING_BOOTLOAD_CMD	0xc0	/* valid 7 6 bit only */
+#define QT602240_WAITING_FRAME_DATA	0x80	/* valid 7 6 bit only */
+#define QT602240_FRAME_CRC_CHECK	0x02
+#define QT602240_FRAME_CRC_FAIL		0x03
+#define QT602240_FRAME_CRC_PASS		0x04
+#define QT602240_APP_CRC_FAIL		0x40	/* valid 7 8 bit only */
+#define QT602240_BOOT_STATUS_MASK	0x3f
+
+/* Touch status */
+#define QT602240_SUPPRESS		(1 << 1)
+#define QT602240_AMP			(1 << 2)
+#define QT602240_VECTOR			(1 << 3)
+#define QT602240_MOVE			(1 << 4)
+#define QT602240_RELEASE		(1 << 5)
+#define QT602240_PRESS			(1 << 6)
+#define QT602240_DETECT			(1 << 7)
+
+/* Touchscreen absolute values */
+#define QT602240_MAX_XC			0x3ff
+#define QT602240_MAX_YC			0x3ff
+#define QT602240_MAX_AREA		0xff
+
+#define QT602240_MAX_FINGER		10
+
+/* Initial register values recommended from chip vendor */
+static const u8 init_vals_ver_20[] = {
+	/* QT602240_GEN_COMMAND(6) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_GEN_POWER(7) */
+	0x20, 0xff, 0x32,
+	/* QT602240_GEN_ACQUIRE(8) */
+	0x08, 0x05, 0x05, 0x00, 0x00, 0x00, 0x05, 0x14,
+	/* QT602240_TOUCH_MULTI(9) */
+	0x00, 0x00, 0x00, 0x11, 0x0a, 0x00, 0x00, 0x00, 0x02, 0x00,
+	0x00, 0x01, 0x01, 0x0e, 0x0a, 0x0a, 0x0a, 0x0a, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x64,
+	/* QT602240_TOUCH_KEYARRAY(15) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00,
+	/* QT602240_SPT_GPIOPWM(19) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00,
+	/* QT602240_PROCI_GRIPFACE(20) */
+	0x00, 0x64, 0x64, 0x64, 0x64, 0x00, 0x00, 0x1e, 0x14, 0x04,
+	0x1e, 0x00,
+	/* QT602240_PROCG_NOISE(22) */
+	0x05, 0x00, 0x00, 0x19, 0x00, 0xe7, 0xff, 0x04, 0x32, 0x00,
+	0x01, 0x0a, 0x0f, 0x14, 0x00, 0x00, 0xe8,
+	/* QT602240_TOUCH_PROXIMITY(23) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00,
+	/* QT602240_PROCI_ONETOUCH(24) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_SPT_SELFTEST(25) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00,
+	/* QT602240_PROCI_TWOTOUCH(27) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_SPT_CTECONFIG(28) */
+	0x00, 0x00, 0x00, 0x04, 0x08,
+};
+
+static const u8 init_vals_ver_21[] = {
+	/* QT602240_GEN_COMMAND(6) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_GEN_POWER(7) */
+	0x20, 0xff, 0x32,
+	/* QT602240_GEN_ACQUIRE(8) */
+	0x0a, 0x00, 0x05, 0x00, 0x00, 0x00, 0x09, 0x23,
+	/* QT602240_TOUCH_MULTI(9) */
+	0x00, 0x00, 0x00, 0x13, 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00,
+	0x00, 0x01, 0x01, 0x0e, 0x0a, 0x0a, 0x0a, 0x0a, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_TOUCH_KEYARRAY(15) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00,
+	/* QT602240_SPT_GPIOPWM(19) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_PROCI_GRIPFACE(20) */
+	0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x28, 0x04,
+	0x0f, 0x0a,
+	/* QT602240_PROCG_NOISE(22) */
+	0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x23, 0x00,
+	0x00, 0x05, 0x0f, 0x19, 0x23, 0x2d, 0x03,
+	/* QT602240_TOUCH_PROXIMITY(23) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00,
+	/* QT602240_PROCI_ONETOUCH(24) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_SPT_SELFTEST(25) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00,
+	/* QT602240_PROCI_TWOTOUCH(27) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_SPT_CTECONFIG(28) */
+	0x00, 0x00, 0x00, 0x08, 0x10, 0x00,
+};
+
+static const u8 init_vals_ver_22[] = {
+	/* QT602240_GEN_COMMAND(6) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_GEN_POWER(7) */
+	0x20, 0xff, 0x32,
+	/* QT602240_GEN_ACQUIRE(8) */
+	0x0a, 0x00, 0x05, 0x00, 0x00, 0x00, 0x09, 0x23,
+	/* QT602240_TOUCH_MULTI(9) */
+	0x00, 0x00, 0x00, 0x13, 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00,
+	0x00, 0x01, 0x01, 0x0e, 0x0a, 0x0a, 0x0a, 0x0a, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00,
+	/* QT602240_TOUCH_KEYARRAY(15) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00,
+	/* QT602240_SPT_GPIOPWM(19) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_PROCI_GRIPFACE(20) */
+	0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x28, 0x04,
+	0x0f, 0x0a,
+	/* QT602240_PROCG_NOISE(22) */
+	0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x23, 0x00,
+	0x00, 0x05, 0x0f, 0x19, 0x23, 0x2d, 0x03,
+	/* QT602240_TOUCH_PROXIMITY(23) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_PROCI_ONETOUCH(24) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_SPT_SELFTEST(25) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00,
+	/* QT602240_PROCI_TWOTOUCH(27) */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* QT602240_SPT_CTECONFIG(28) */
+	0x00, 0x00, 0x00, 0x08, 0x10, 0x00,
+};
+
+struct qt602240_info {
+	u8 family_id;
+	u8 variant_id;
+	u8 version;
+	u8 build;
+	u8 matrix_xsize;
+	u8 matrix_ysize;
+	u8 object_num;
+};
+
+struct qt602240_object {
+	u8 type;
+	u16 start_address;
+	u8 size;
+	u8 instances;
+	u8 num_report_ids;
+
+	/* to map object and message */
+	u8 max_reportid;
+};
+
+struct qt602240_message {
+	u8 reportid;
+	u8 message[7];
+	u8 checksum;
+};
+
+struct qt602240_finger {
+	int status;
+	int x;
+	int y;
+	int area;
+};
+
+/* Each client has this additional data */
+struct qt602240_data {
+	struct i2c_client *client;
+	struct input_dev *input_dev;
+	const struct qt602240_platform_data *pdata;
+	struct qt602240_object *object_table;
+	struct qt602240_info info;
+	struct qt602240_finger finger[QT602240_MAX_FINGER];
+	unsigned int irq;
+};
+
+static bool qt602240_object_readable(unsigned int type)
+{
+	switch (type) {
+	case QT602240_GEN_MESSAGE:
+	case QT602240_GEN_COMMAND:
+	case QT602240_GEN_POWER:
+	case QT602240_GEN_ACQUIRE:
+	case QT602240_TOUCH_MULTI:
+	case QT602240_TOUCH_KEYARRAY:
+	case QT602240_TOUCH_PROXIMITY:
+	case QT602240_PROCI_GRIPFACE:
+	case QT602240_PROCG_NOISE:
+	case QT602240_PROCI_ONETOUCH:
+	case QT602240_PROCI_TWOTOUCH:
+	case QT602240_SPT_COMMSCONFIG:
+	case QT602240_SPT_GPIOPWM:
+	case QT602240_SPT_SELFTEST:
+	case QT602240_SPT_CTECONFIG:
+	case QT602240_SPT_USERDATA:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool qt602240_object_writable(unsigned int type)
+{
+	switch (type) {
+	case QT602240_GEN_COMMAND:
+	case QT602240_GEN_POWER:
+	case QT602240_GEN_ACQUIRE:
+	case QT602240_TOUCH_MULTI:
+	case QT602240_TOUCH_KEYARRAY:
+	case QT602240_TOUCH_PROXIMITY:
+	case QT602240_PROCI_GRIPFACE:
+	case QT602240_PROCG_NOISE:
+	case QT602240_PROCI_ONETOUCH:
+	case QT602240_PROCI_TWOTOUCH:
+	case QT602240_SPT_GPIOPWM:
+	case QT602240_SPT_SELFTEST:
+	case QT602240_SPT_CTECONFIG:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void qt602240_dump_message(struct device *dev,
+				  struct qt602240_message *message)
+{
+	dev_dbg(dev, "reportid:\t0x%x\n", message->reportid);
+	dev_dbg(dev, "message1:\t0x%x\n", message->message[0]);
+	dev_dbg(dev, "message2:\t0x%x\n", message->message[1]);
+	dev_dbg(dev, "message3:\t0x%x\n", message->message[2]);
+	dev_dbg(dev, "message4:\t0x%x\n", message->message[3]);
+	dev_dbg(dev, "message5:\t0x%x\n", message->message[4]);
+	dev_dbg(dev, "message6:\t0x%x\n", message->message[5]);
+	dev_dbg(dev, "message7:\t0x%x\n", message->message[6]);
+	dev_dbg(dev, "checksum:\t0x%x\n", message->checksum);
+}
+
+static int qt602240_check_bootloader(struct i2c_client *client,
+				     unsigned int state)
+{
+	u8 val;
+
+recheck:
+	if (i2c_master_recv(client, &val, 1) != 1) {
+		dev_err(&client->dev, "%s: i2c recv failed\n", __func__);
+		return -EIO;
+	}
+
+	switch (state) {
+	case QT602240_WAITING_BOOTLOAD_CMD:
+	case QT602240_WAITING_FRAME_DATA:
+		val &= ~QT602240_BOOT_STATUS_MASK;
+		break;
+	case QT602240_FRAME_CRC_PASS:
+		if (val == QT602240_FRAME_CRC_CHECK)
+			goto recheck;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (val != state) {
+		dev_err(&client->dev, "Unvalid bootloader mode state\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int qt602240_unlock_bootloader(struct i2c_client *client)
+{
+	u8 buf[2];
+
+	buf[0] = QT602240_UNLOCK_CMD_LSB;
+	buf[1] = QT602240_UNLOCK_CMD_MSB;
+
+	if (i2c_master_send(client, buf, 2) != 2) {
+		dev_err(&client->dev, "%s: i2c send failed\n", __func__);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int qt602240_fw_write(struct i2c_client *client,
+			     const u8 *data, unsigned int frame_size)
+{
+	if (i2c_master_send(client, data, frame_size) != frame_size) {
+		dev_err(&client->dev, "%s: i2c send failed\n", __func__);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int __qt602240_read_reg(struct i2c_client *client,
+			       u16 reg, u16 len, void *val)
+{
+	struct i2c_msg xfer[2];
+	u8 buf[2];
+
+	buf[0] = reg & 0xff;
+	buf[1] = (reg >> 8) & 0xff;
+
+	/* Write register */
+	xfer[0].addr = client->addr;
+	xfer[0].flags = 0;
+	xfer[0].len = 2;
+	xfer[0].buf = buf;
+
+	/* Read data */
+	xfer[1].addr = client->addr;
+	xfer[1].flags = I2C_M_RD;
+	xfer[1].len = len;
+	xfer[1].buf = val;
+
+	if (i2c_transfer(client->adapter, xfer, 2) != 2) {
+		dev_err(&client->dev, "%s: i2c transfer failed\n", __func__);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int qt602240_read_reg(struct i2c_client *client, u16 reg, u8 *val)
+{
+	return __qt602240_read_reg(client, reg, 1, val);
+}
+
+static int qt602240_write_reg(struct i2c_client *client, u16 reg, u8 val)
+{
+	u8 buf[3];
+
+	buf[0] = reg & 0xff;
+	buf[1] = (reg >> 8) & 0xff;
+	buf[2] = val;
+
+	if (i2c_master_send(client, buf, 3) != 3) {
+		dev_err(&client->dev, "%s: i2c send failed\n", __func__);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int qt602240_read_object_table(struct i2c_client *client,
+				      u16 reg, u8 *object_buf)
+{
+	return __qt602240_read_reg(client, reg, QT602240_OBJECT_SIZE,
+				   object_buf);
+}
+
+static struct qt602240_object *
+qt602240_get_object(struct qt602240_data *data, u8 type)
+{
+	struct qt602240_object *object;
+	int i;
+
+	for (i = 0; i < data->info.object_num; i++) {
+		object = data->object_table + i;
+		if (object->type == type)
+			return object;
+	}
+
+	dev_err(&data->client->dev, "Invalid object type\n");
+	return NULL;
+}
+
+static int qt602240_read_message(struct qt602240_data *data,
+				 struct qt602240_message *message)
+{
+	struct qt602240_object *object;
+	u16 reg;
+
+	object = qt602240_get_object(data, QT602240_GEN_MESSAGE);
+	if (!object)
+		return -EINVAL;
+
+	reg = object->start_address;
+	return __qt602240_read_reg(data->client, reg,
+			sizeof(struct qt602240_message), message);
+}
+
+static int qt602240_read_object(struct qt602240_data *data,
+				u8 type, u8 offset, u8 *val)
+{
+	struct qt602240_object *object;
+	u16 reg;
+
+	object = qt602240_get_object(data, type);
+	if (!object)
+		return -EINVAL;
+
+	reg = object->start_address;
+	return __qt602240_read_reg(data->client, reg + offset, 1, val);
+}
+
+static int qt602240_write_object(struct qt602240_data *data,
+				 u8 type, u8 offset, u8 val)
+{
+	struct qt602240_object *object;
+	u16 reg;
+
+	object = qt602240_get_object(data, type);
+	if (!object)
+		return -EINVAL;
+
+	reg = object->start_address;
+	return qt602240_write_reg(data->client, reg + offset, val);
+}
+
+static void qt602240_input_report(struct qt602240_data *data, int single_id)
+{
+	struct qt602240_finger *finger = data->finger;
+	struct input_dev *input_dev = data->input_dev;
+	int status = finger[single_id].status;
+	int finger_num = 0;
+	int id;
+
+	for (id = 0; id < QT602240_MAX_FINGER; id++) {
+		if (!finger[id].status)
+			continue;
+
+		input_report_abs(input_dev, ABS_MT_TOUCH_MAJOR,
+				finger[id].status != QT602240_RELEASE ?
+				finger[id].area : 0);
+		input_report_abs(input_dev, ABS_MT_POSITION_X,
+				finger[id].x);
+		input_report_abs(input_dev, ABS_MT_POSITION_Y,
+				finger[id].y);
+		input_mt_sync(input_dev);
+
+		if (finger[id].status == QT602240_RELEASE)
+			finger[id].status = 0;
+		else
+			finger_num++;
+	}
+
+	input_report_key(input_dev, BTN_TOUCH, finger_num > 0);
+
+	if (status != QT602240_RELEASE) {
+		input_report_abs(input_dev, ABS_X, finger[single_id].x);
+		input_report_abs(input_dev, ABS_Y, finger[single_id].y);
+	}
+
+	input_sync(input_dev);
+}
+
+static void qt602240_input_touchevent(struct qt602240_data *data,
+				      struct qt602240_message *message, int id)
+{
+	struct qt602240_finger *finger = data->finger;
+	struct device *dev = &data->client->dev;
+	u8 status = message->message[0];
+	int x;
+	int y;
+	int area;
+
+	/* Check the touch is present on the screen */
+	if (!(status & QT602240_DETECT)) {
+		if (status & QT602240_RELEASE) {
+			dev_dbg(dev, "[%d] released\n", id);
+
+			finger[id].status = QT602240_RELEASE;
+			qt602240_input_report(data, id);
+		}
+		return;
+	}
+
+	/* Check only AMP detection */
+	if (!(status & (QT602240_PRESS | QT602240_MOVE)))
+		return;
+
+	x = (message->message[1] << 2) | ((message->message[3] & ~0x3f) >> 6);
+	y = (message->message[2] << 2) | ((message->message[3] & ~0xf3) >> 2);
+	area = message->message[4];
+
+	dev_dbg(dev, "[%d] %s x: %d, y: %d, area: %d\n", id,
+		status & QT602240_MOVE ? "moved" : "pressed",
+		x, y, area);
+
+	finger[id].status = status & QT602240_MOVE ?
+				QT602240_MOVE : QT602240_PRESS;
+	finger[id].x = x;
+	finger[id].y = y;
+	finger[id].area = area;
+
+	qt602240_input_report(data, id);
+}
+
+static irqreturn_t qt602240_interrupt(int irq, void *dev_id)
+{
+	struct qt602240_data *data = dev_id;
+	struct qt602240_message message;
+	struct qt602240_object *object;
+	struct device *dev = &data->client->dev;
+	int id;
+	u8 reportid;
+	u8 max_reportid;
+	u8 min_reportid;
+
+	do {
+		if (qt602240_read_message(data, &message)) {
+			dev_err(dev, "Failed to read message\n");
+			goto end;
+		}
+
+		reportid = message.reportid;
+
+		/* whether reportid is thing of QT602240_TOUCH_MULTI */
+		object = qt602240_get_object(data, QT602240_TOUCH_MULTI);
+		if (!object)
+			goto end;
+
+		max_reportid = object->max_reportid;
+		min_reportid = max_reportid - object->num_report_ids + 1;
+		id = reportid - min_reportid;
+
+		if (reportid >= min_reportid && reportid <= max_reportid)
+			qt602240_input_touchevent(data, &message, id);
+		else
+			qt602240_dump_message(dev, &message);
+	} while (reportid != 0xff);
+
+end:
+	return IRQ_HANDLED;
+}
+
+static int qt602240_check_reg_init(struct qt602240_data *data)
+{
+	struct qt602240_object *object;
+	struct device *dev = &data->client->dev;
+	int index = 0;
+	int i, j;
+	u8 version = data->info.version;
+	u8 *init_vals;
+
+	switch (version) {
+	case QT602240_VER_20:
+		init_vals = (u8 *)init_vals_ver_20;
+		break;
+	case QT602240_VER_21:
+		init_vals = (u8 *)init_vals_ver_21;
+		break;
+	case QT602240_VER_22:
+		init_vals = (u8 *)init_vals_ver_22;
+		break;
+	default:
+		dev_err(dev, "Firmware version %d doesn't support\n", version);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < data->info.object_num; i++) {
+		object = data->object_table + i;
+
+		if (!qt602240_object_writable(object->type))
+			continue;
+
+		for (j = 0; j < object->size + 1; j++)
+			qt602240_write_object(data, object->type, j,
+					init_vals[index + j]);
+
+		index += object->size + 1;
+	}
+
+	return 0;
+}
+
+static int qt602240_check_matrix_size(struct qt602240_data *data)
+{
+	const struct qt602240_platform_data *pdata = data->pdata;
+	struct device *dev = &data->client->dev;
+	int mode = -1;
+	int error;
+	u8 val;
+
+	dev_dbg(dev, "Number of X lines: %d\n", pdata->x_line);
+	dev_dbg(dev, "Number of Y lines: %d\n", pdata->y_line);
+
+	switch (pdata->x_line) {
+	case 0 ... 15:
+		if (pdata->y_line <= 14)
+			mode = 0;
+		break;
+	case 16:
+		if (pdata->y_line <= 12)
+			mode = 1;
+		if (pdata->y_line == 13 || pdata->y_line == 14)
+			mode = 0;
+		break;
+	case 17:
+		if (pdata->y_line <= 11)
+			mode = 2;
+		if (pdata->y_line == 12 || pdata->y_line == 13)
+			mode = 1;
+		break;
+	case 18:
+		if (pdata->y_line <= 10)
+			mode = 3;
+		if (pdata->y_line == 11 || pdata->y_line == 12)
+			mode = 2;
+		break;
+	case 19:
+		if (pdata->y_line <= 9)
+			mode = 4;
+		if (pdata->y_line == 10 || pdata->y_line == 11)
+			mode = 3;
+		break;
+	case 20:
+		mode = 4;
+	}
+
+	if (mode < 0) {
+		dev_err(dev, "Invalid X/Y lines\n");
+		return -EINVAL;
+	}
+
+	error = qt602240_read_object(data, QT602240_SPT_CTECONFIG,
+				QT602240_CTE_MODE, &val);
+	if (error)
+		return error;
+
+	if (mode == val)
+		return 0;
+
+	/* Change the CTE configuration */
+	qt602240_write_object(data, QT602240_SPT_CTECONFIG,
+			QT602240_CTE_CTRL, 1);
+	qt602240_write_object(data, QT602240_SPT_CTECONFIG,
+			QT602240_CTE_MODE, mode);
+	qt602240_write_object(data, QT602240_SPT_CTECONFIG,
+			QT602240_CTE_CTRL, 0);
+
+	return 0;
+}
+
+static int qt602240_make_highchg(struct qt602240_data *data)
+{
+	struct device *dev = &data->client->dev;
+	int count = 10;
+	int error;
+	u8 val;
+
+	/* Read dummy message to make high CHG pin */
+	do {
+		error = qt602240_read_object(data, QT602240_GEN_MESSAGE, 0, &val);
+		if (error)
+			return error;
+	} while ((val != 0xff) && --count);
+
+	if (!count) {
+		dev_err(dev, "CHG pin isn't cleared\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void qt602240_handle_pdata(struct qt602240_data *data)
+{
+	const struct qt602240_platform_data *pdata = data->pdata;
+	u8 voltage;
+
+	/* Set touchscreen lines */
+	qt602240_write_object(data, QT602240_TOUCH_MULTI, QT602240_TOUCH_XSIZE,
+			pdata->x_line);
+	qt602240_write_object(data, QT602240_TOUCH_MULTI, QT602240_TOUCH_YSIZE,
+			pdata->y_line);
+
+	/* Set touchscreen orient */
+	qt602240_write_object(data, QT602240_TOUCH_MULTI, QT602240_TOUCH_ORIENT,
+			pdata->orient);
+
+	/* Set touchscreen burst length */
+	qt602240_write_object(data, QT602240_TOUCH_MULTI,
+			QT602240_TOUCH_BLEN, pdata->blen);
+
+	/* Set touchscreen threshold */
+	qt602240_write_object(data, QT602240_TOUCH_MULTI,
+			QT602240_TOUCH_TCHTHR, pdata->threshold);
+
+	/* Set touchscreen resolution */
+	qt602240_write_object(data, QT602240_TOUCH_MULTI,
+			QT602240_TOUCH_XRANGE_LSB, (pdata->x_size - 1) & 0xff);
+	qt602240_write_object(data, QT602240_TOUCH_MULTI,
+			QT602240_TOUCH_XRANGE_MSB, (pdata->x_size - 1) >> 8);
+	qt602240_write_object(data, QT602240_TOUCH_MULTI,
+			QT602240_TOUCH_YRANGE_LSB, (pdata->y_size - 1) & 0xff);
+	qt602240_write_object(data, QT602240_TOUCH_MULTI,
+			QT602240_TOUCH_YRANGE_MSB, (pdata->y_size - 1) >> 8);
+
+	/* Set touchscreen voltage */
+	if (data->info.version >= QT602240_VER_21 && pdata->voltage) {
+		if (pdata->voltage < QT602240_VOLTAGE_DEFAULT) {
+			voltage = (QT602240_VOLTAGE_DEFAULT - pdata->voltage) /
+				QT602240_VOLTAGE_STEP;
+			voltage = 0xff - voltage + 1;
+		} else
+			voltage = (pdata->voltage - QT602240_VOLTAGE_DEFAULT) /
+				QT602240_VOLTAGE_STEP;
+
+		qt602240_write_object(data, QT602240_SPT_CTECONFIG,
+				QT602240_CTE_VOLTAGE, voltage);
+	}
+}
+
+static int qt602240_get_info(struct qt602240_data *data)
+{
+	struct i2c_client *client = data->client;
+	struct qt602240_info *info = &data->info;
+	int error;
+	u8 val;
+
+	error = qt602240_read_reg(client, QT602240_FAMILY_ID, &val);
+	if (error)
+		return error;
+	info->family_id = val;
+
+	error = qt602240_read_reg(client, QT602240_VARIANT_ID, &val);
+	if (error)
+		return error;
+	info->variant_id = val;
+
+	error = qt602240_read_reg(client, QT602240_VERSION, &val);
+	if (error)
+		return error;
+	info->version = val;
+
+	error = qt602240_read_reg(client, QT602240_BUILD, &val);
+	if (error)
+		return error;
+	info->build = val;
+
+	error = qt602240_read_reg(client, QT602240_OBJECT_NUM, &val);
+	if (error)
+		return error;
+	info->object_num = val;
+
+	return 0;
+}
+
+static int qt602240_get_object_table(struct qt602240_data *data)
+{
+	int error;
+	int i;
+	u16 reg;
+	u8 reportid = 0;
+	u8 buf[QT602240_OBJECT_SIZE];
+
+	for (i = 0; i < data->info.object_num; i++) {
+		struct qt602240_object *object = data->object_table + i;
+
+		reg = QT602240_OBJECT_START + QT602240_OBJECT_SIZE * i;
+		error = qt602240_read_object_table(data->client, reg, buf);
+		if (error)
+			return error;
+
+		object->type = buf[0];
+		object->start_address = (buf[2] << 8) | buf[1];
+		object->size = buf[3];
+		object->instances = buf[4];
+		object->num_report_ids = buf[5];
+
+		if (object->num_report_ids) {
+			reportid += object->num_report_ids *
+					(object->instances + 1);
+			object->max_reportid = reportid;
+		}
+	}
+
+	return 0;
+}
+
+static int qt602240_initialize(struct qt602240_data *data)
+{
+	struct i2c_client *client = data->client;
+	struct qt602240_info *info = &data->info;
+	int error;
+	u8 val;
+
+	error = qt602240_get_info(data);
+	if (error)
+		return error;
+
+	data->object_table = kcalloc(info->object_num,
+				     sizeof(struct qt602240_data),
+				     GFP_KERNEL);
+	if (!data->object_table) {
+		dev_err(&client->dev, "Failed to allocate memory\n");
+		return -ENOMEM;
+	}
+
+	/* Get object table information */
+	error = qt602240_get_object_table(data);
+	if (error)
+		return error;
+
+	/* Check register init values */
+	error = qt602240_check_reg_init(data);
+	if (error)
+		return error;
+
+	/* Check X/Y matrix size */
+	error = qt602240_check_matrix_size(data);
+	if (error)
+		return error;
+
+	error = qt602240_make_highchg(data);
+	if (error)
+		return error;
+
+	qt602240_handle_pdata(data);
+
+	/* Backup to memory */
+	qt602240_write_object(data, QT602240_GEN_COMMAND,
+			QT602240_COMMAND_BACKUPNV,
+			QT602240_BACKUP_VALUE);
+	msleep(QT602240_BACKUP_TIME);
+
+	/* Soft reset */
+	qt602240_write_object(data, QT602240_GEN_COMMAND,
+			QT602240_COMMAND_RESET, 1);
+	msleep(QT602240_RESET_TIME);
+
+	/* Update matrix size at info struct */
+	error = qt602240_read_reg(client, QT602240_MATRIX_X_SIZE, &val);
+	if (error)
+		return error;
+	info->matrix_xsize = val;
+
+	error = qt602240_read_reg(client, QT602240_MATRIX_Y_SIZE, &val);
+	if (error)
+		return error;
+	info->matrix_ysize = val;
+
+	dev_info(&client->dev,
+			"Family ID: %d Variant ID: %d Version: %d Build: %d\n",
+			info->family_id, info->variant_id, info->version,
+			info->build);
+
+	dev_info(&client->dev,
+			"Matrix X Size: %d Matrix Y Size: %d Object Num: %d\n",
+			info->matrix_xsize, info->matrix_ysize,
+			info->object_num);
+
+	return 0;
+}
+
+static ssize_t qt602240_object_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct qt602240_data *data = dev_get_drvdata(dev);
+	struct qt602240_object *object;
+	int count = 0;
+	int i, j;
+	int error;
+	u8 val;
+
+	for (i = 0; i < data->info.object_num; i++) {
+		object = data->object_table + i;
+
+		count += sprintf(buf + count,
+				"Object Table Element %d(Type %d)\n",
+				i + 1, object->type);
+
+		if (!qt602240_object_readable(object->type)) {
+			count += sprintf(buf + count, "\n");
+			continue;
+		}
+
+		for (j = 0; j < object->size + 1; j++) {
+			error = qt602240_read_object(data,
+						object->type, j, &val);
+			if (error)
+				return error;
+
+			count += sprintf(buf + count,
+					"  Byte %d: 0x%x (%d)\n", j, val, val);
+		}
+
+		count += sprintf(buf + count, "\n");
+	}
+
+	return count;
+}
+
+static int qt602240_load_fw(struct device *dev, const char *fn)
+{
+	struct qt602240_data *data = dev_get_drvdata(dev);
+	struct i2c_client *client = data->client;
+	const struct firmware *fw = NULL;
+	unsigned int frame_size;
+	unsigned int pos = 0;
+	int ret;
+
+	ret = request_firmware(&fw, fn, dev);
+	if (ret) {
+		dev_err(dev, "Unable to open firmware %s\n", fn);
+		return ret;
+	}
+
+	/* Change to the bootloader mode */
+	qt602240_write_object(data, QT602240_GEN_COMMAND,
+			QT602240_COMMAND_RESET, QT602240_BOOT_VALUE);
+	msleep(QT602240_RESET_TIME);
+
+	/* Change to slave address of bootloader */
+	if (client->addr == QT602240_APP_LOW)
+		client->addr = QT602240_BOOT_LOW;
+	else
+		client->addr = QT602240_BOOT_HIGH;
+
+	ret = qt602240_check_bootloader(client, QT602240_WAITING_BOOTLOAD_CMD);
+	if (ret)
+		goto out;
+
+	/* Unlock bootloader */
+	qt602240_unlock_bootloader(client);
+
+	while (pos < fw->size) {
+		ret = qt602240_check_bootloader(client,
+						QT602240_WAITING_FRAME_DATA);
+		if (ret)
+			goto out;
+
+		frame_size = ((*(fw->data + pos) << 8) | *(fw->data + pos + 1));
+
+		/* We should add 2 at frame size as the the firmware data is not
+		 * included the CRC bytes.
+		 */
+		frame_size += 2;
+
+		/* Write one frame to device */
+		qt602240_fw_write(client, fw->data + pos, frame_size);
+
+		ret = qt602240_check_bootloader(client,
+						QT602240_FRAME_CRC_PASS);
+		if (ret)
+			goto out;
+
+		pos += frame_size;
+
+		dev_dbg(dev, "Updated %d bytes / %zd bytes\n", pos, fw->size);
+	}
+
+out:
+	release_firmware(fw);
+
+	/* Change to slave address of application */
+	if (client->addr == QT602240_BOOT_LOW)
+		client->addr = QT602240_APP_LOW;
+	else
+		client->addr = QT602240_APP_HIGH;
+
+	return ret;
+}
+
+static ssize_t qt602240_update_fw_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct qt602240_data *data = dev_get_drvdata(dev);
+	unsigned int version;
+	int error;
+
+	if (sscanf(buf, "%u", &version) != 1) {
+		dev_err(dev, "Invalid values\n");
+		return -EINVAL;
+	}
+
+	if (data->info.version < QT602240_VER_21 || version < QT602240_VER_21) {
+		dev_err(dev, "FW update supported starting with version 21\n");
+		return -EINVAL;
+	}
+
+	disable_irq(data->irq);
+
+	error = qt602240_load_fw(dev, QT602240_FW_NAME);
+	if (error) {
+		dev_err(dev, "The firmware update failed(%d)\n", error);
+		count = error;
+	} else {
+		dev_dbg(dev, "The firmware update succeeded\n");
+
+		/* Wait for reset */
+		msleep(QT602240_FWRESET_TIME);
+
+		kfree(data->object_table);
+		data->object_table = NULL;
+
+		qt602240_initialize(data);
+	}
+
+	enable_irq(data->irq);
+
+	return count;
+}
+
+static DEVICE_ATTR(object, 0444, qt602240_object_show, NULL);
+static DEVICE_ATTR(update_fw, 0664, NULL, qt602240_update_fw_store);
+
+static struct attribute *qt602240_attrs[] = {
+	&dev_attr_object.attr,
+	&dev_attr_update_fw.attr,
+	NULL
+};
+
+static const struct attribute_group qt602240_attr_group = {
+	.attrs = qt602240_attrs,
+};
+
+static void qt602240_start(struct qt602240_data *data)
+{
+	/* Touch enable */
+	qt602240_write_object(data,
+			QT602240_TOUCH_MULTI, QT602240_TOUCH_CTRL, 0x83);
+}
+
+static void qt602240_stop(struct qt602240_data *data)
+{
+	/* Touch disable */
+	qt602240_write_object(data,
+			QT602240_TOUCH_MULTI, QT602240_TOUCH_CTRL, 0);
+}
+
+static int qt602240_input_open(struct input_dev *dev)
+{
+	struct qt602240_data *data = input_get_drvdata(dev);
+
+	qt602240_start(data);
+
+	return 0;
+}
+
+static void qt602240_input_close(struct input_dev *dev)
+{
+	struct qt602240_data *data = input_get_drvdata(dev);
+
+	qt602240_stop(data);
+}
+
+static int __devinit qt602240_probe(struct i2c_client *client,
+		const struct i2c_device_id *id)
+{
+	struct qt602240_data *data;
+	struct input_dev *input_dev;
+	int error;
+
+	if (!client->dev.platform_data)
+		return -EINVAL;
+
+	data = kzalloc(sizeof(struct qt602240_data), GFP_KERNEL);
+	input_dev = input_allocate_device();
+	if (!data || !input_dev) {
+		dev_err(&client->dev, "Failed to allocate memory\n");
+		error = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	input_dev->name = "AT42QT602240/ATMXT224 Touchscreen";
+	input_dev->id.bustype = BUS_I2C;
+	input_dev->dev.parent = &client->dev;
+	input_dev->open = qt602240_input_open;
+	input_dev->close = qt602240_input_close;
+
+	__set_bit(EV_ABS, input_dev->evbit);
+	__set_bit(EV_KEY, input_dev->evbit);
+	__set_bit(BTN_TOUCH, input_dev->keybit);
+
+	/* For single touch */
+	input_set_abs_params(input_dev, ABS_X,
+			     0, QT602240_MAX_XC, 0, 0);
+	input_set_abs_params(input_dev, ABS_Y,
+			     0, QT602240_MAX_YC, 0, 0);
+
+	/* For multi touch */
+	input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR,
+			     0, QT602240_MAX_AREA, 0, 0);
+	input_set_abs_params(input_dev, ABS_MT_POSITION_X,
+			     0, QT602240_MAX_XC, 0, 0);
+	input_set_abs_params(input_dev, ABS_MT_POSITION_Y,
+			     0, QT602240_MAX_YC, 0, 0);
+
+	input_set_drvdata(input_dev, data);
+
+	data->client = client;
+	data->input_dev = input_dev;
+	data->pdata = client->dev.platform_data;
+	data->irq = client->irq;
+
+	i2c_set_clientdata(client, data);
+
+	error = qt602240_initialize(data);
+	if (error)
+		goto err_free_object;
+
+	error = request_threaded_irq(client->irq, NULL, qt602240_interrupt,
+			IRQF_TRIGGER_FALLING, client->dev.driver->name, data);
+	if (error) {
+		dev_err(&client->dev, "Failed to register interrupt\n");
+		goto err_free_object;
+	}
+
+	error = input_register_device(input_dev);
+	if (error)
+		goto err_free_irq;
+
+	error = sysfs_create_group(&client->dev.kobj, &qt602240_attr_group);
+	if (error)
+		goto err_unregister_device;
+
+	return 0;
+
+err_unregister_device:
+	input_unregister_device(input_dev);
+	input_dev = NULL;
+err_free_irq:
+	free_irq(client->irq, data);
+err_free_object:
+	kfree(data->object_table);
+err_free_mem:
+	input_free_device(input_dev);
+	kfree(data);
+	return error;
+}
+
+static int __devexit qt602240_remove(struct i2c_client *client)
+{
+	struct qt602240_data *data = i2c_get_clientdata(client);
+
+	sysfs_remove_group(&client->dev.kobj, &qt602240_attr_group);
+	free_irq(data->irq, data);
+	input_unregister_device(data->input_dev);
+	kfree(data->object_table);
+	kfree(data);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int qt602240_suspend(struct i2c_client *client, pm_message_t mesg)
+{
+	struct qt602240_data *data = i2c_get_clientdata(client);
+	struct input_dev *input_dev = data->input_dev;
+
+	mutex_lock(&input_dev->mutex);
+
+	if (input_dev->users)
+		qt602240_stop(data);
+
+	mutex_unlock(&input_dev->mutex);
+
+	return 0;
+}
+
+static int qt602240_resume(struct i2c_client *client)
+{
+	struct qt602240_data *data = i2c_get_clientdata(client);
+	struct input_dev *input_dev = data->input_dev;
+
+	/* Soft reset */
+	qt602240_write_object(data, QT602240_GEN_COMMAND,
+			QT602240_COMMAND_RESET, 1);
+
+	msleep(QT602240_RESET_TIME);
+
+	mutex_lock(&input_dev->mutex);
+
+	if (input_dev->users)
+		qt602240_start(data);
+
+	mutex_unlock(&input_dev->mutex);
+
+	return 0;
+}
+#else
+#define qt602240_suspend	NULL
+#define qt602240_resume		NULL
+#endif
+
+static const struct i2c_device_id qt602240_id[] = {
+	{ "qt602240_ts", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, qt602240_id);
+
+static struct i2c_driver qt602240_driver = {
+	.driver = {
+		.name	= "qt602240_ts",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= qt602240_probe,
+	.remove		= __devexit_p(qt602240_remove),
+	.suspend	= qt602240_suspend,
+	.resume		= qt602240_resume,
+	.id_table	= qt602240_id,
+};
+
+static int __init qt602240_init(void)
+{
+	return i2c_add_driver(&qt602240_driver);
+}
+
+static void __exit qt602240_exit(void)
+{
+	i2c_del_driver(&qt602240_driver);
+}
+
+module_init(qt602240_init);
+module_exit(qt602240_exit);
+
+/* Module information */
+MODULE_AUTHOR("Joonyoung Shim <jy0922.shim@samsung.com>");
+MODULE_DESCRIPTION("AT42QT602240/ATMXT224 Touchscreen driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/i2c/qt602240_ts.h b/include/linux/i2c/qt602240_ts.h
new file mode 100644
index 000000000000..c5033e101094
--- /dev/null
+++ b/include/linux/i2c/qt602240_ts.h
@@ -0,0 +1,38 @@
+/*
+ * AT42QT602240/ATMXT224 Touchscreen driver
+ *
+ * Copyright (C) 2010 Samsung Electronics Co.Ltd
+ * Author: Joonyoung Shim <jy0922.shim@samsung.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#ifndef __LINUX_QT602240_TS_H
+#define __LINUX_QT602240_TS_H
+
+/* Orient */
+#define QT602240_NORMAL			0x0
+#define QT602240_DIAGONAL		0x1
+#define QT602240_HORIZONTAL_FLIP	0x2
+#define QT602240_ROTATED_90_COUNTER	0x3
+#define QT602240_VERTICAL_FLIP		0x4
+#define QT602240_ROTATED_90		0x5
+#define QT602240_ROTATED_180		0x6
+#define QT602240_DIAGONAL_COUNTER	0x7
+
+/* The platform data for the AT42QT602240/ATMXT224 touchscreen driver */
+struct qt602240_platform_data {
+	unsigned int x_line;
+	unsigned int y_line;
+	unsigned int x_size;
+	unsigned int y_size;
+	unsigned int blen;
+	unsigned int threshold;
+	unsigned int voltage;
+	unsigned char orient;
+};
+
+#endif /* __LINUX_QT602240_TS_H */
-- 
cgit v1.2.3-59-g8ed1b


From cca5cf91c789f3301cc2541a79c323c53be5a8e1 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 15 Jul 2010 11:27:41 +0200
Subject: nfnetlink_log: do not expose NFULNL_COPY_DISABLED to user-space

This patch moves NFULNL_COPY_PACKET definition from
linux/netfilter/nfnetlink_log.h to net/netfilter/nfnetlink_log.h
since this copy mode is only for internal use.

I have also changed the value from 0x03 to 0xff. Thus, we avoid
a gap from user-space that may confuse users if we add new
copy modes in the future.

This change was introduced in:
http://www.spinics.net/lists/netfilter-devel/msg13535.html

Since this change is not included in any stable Linux kernel,
I think it's safe to make this change now. Anyway, this copy
mode does not make any sense from user-space, so this patch
should not break any existing setup.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nfnetlink_log.h | 2 +-
 include/net/netfilter/nfnetlink_log.h   | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h
index 1d0b84aa1d42..ea9b8d380527 100644
--- a/include/linux/netfilter/nfnetlink_log.h
+++ b/include/linux/netfilter/nfnetlink_log.h
@@ -89,7 +89,7 @@ enum nfulnl_attr_config {
 #define NFULNL_COPY_NONE	0x00
 #define NFULNL_COPY_META	0x01
 #define NFULNL_COPY_PACKET	0x02
-#define NFULNL_COPY_DISABLED	0x03
+/* 0xff is reserved, don't use it for new copy modes. */
 
 #define NFULNL_CFG_F_SEQ	0x0001
 #define NFULNL_CFG_F_SEQ_GLOBAL	0x0002
diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h
index b0569ff0775e..e2dec42c2db2 100644
--- a/include/net/netfilter/nfnetlink_log.h
+++ b/include/net/netfilter/nfnetlink_log.h
@@ -10,5 +10,7 @@ nfulnl_log_packet(u_int8_t pf,
 		  const struct nf_loginfo *li_user,
 		  const char *prefix);
 
+#define NFULNL_COPY_DISABLED    0xff
+
 #endif /* _KER_NFNETLINK_LOG_H */
 
-- 
cgit v1.2.3-59-g8ed1b


From edf0e1fb0d0910880881523cfaaabcec06a2c0d5 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 15 Jul 2010 17:20:46 +0200
Subject: netfilter: add CHECKSUM target

This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.

You can use this target to compute and fill in the checksum in
a packet that lacks a checksum.  This is particularly useful,
if you need to work around old applications such as dhcp clients,
that do not work well with checksum offloads, but don't want to
disable checksum offload in your device.

The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html

Typical expected use (helps old dhclient binary running in a VM):
iptables -A POSTROUTING -t mangle -p udp --dport bootpc \
	-j CHECKSUM --checksum-fill

Includes fixes by Jan Engelhardt <jengelh@medozas.de>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/xt_CHECKSUM.h | 18 +++++++++
 net/netfilter/Kconfig                 | 16 ++++++++
 net/netfilter/Makefile                |  1 +
 net/netfilter/xt_CHECKSUM.c           | 70 +++++++++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+)
 create mode 100644 include/linux/netfilter/xt_CHECKSUM.h
 create mode 100644 net/netfilter/xt_CHECKSUM.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_CHECKSUM.h b/include/linux/netfilter/xt_CHECKSUM.h
new file mode 100644
index 000000000000..3b4fb77acef6
--- /dev/null
+++ b/include/linux/netfilter/xt_CHECKSUM.h
@@ -0,0 +1,18 @@
+/* Header file for iptables ipt_CHECKSUM target
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 Red Hat Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This software is distributed under GNU GPL v2, 1991
+*/
+#ifndef _IPT_CHECKSUM_TARGET_H
+#define _IPT_CHECKSUM_TARGET_H
+
+#define XT_CHECKSUM_OP_FILL	0x01	/* fill in checksum in IP header */
+
+struct xt_CHECKSUM_info {
+	__u8 operation;	/* bitset of operations */
+};
+
+#endif /* _IPT_CHECKSUM_TARGET_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index aa2f106347e4..5fb8efa84df3 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -326,6 +326,22 @@ config NETFILTER_XT_CONNMARK
 
 comment "Xtables targets"
 
+config NETFILTER_XT_TARGET_CHECKSUM
+	tristate "CHECKSUM target support"
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+	  table.
+
+	  You can use this target to compute and fill in the checksum in
+	  a packet that lacks a checksum.  This is particularly useful,
+	  if you need to work around old applications such as dhcp clients,
+	  that do not work well with checksum offloads, but don't want to disable
+	  checksum offload in your device.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_CLASSIFY
 	tristate '"CLASSIFY" target support'
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index e28420aac5ef..36ef8e63be1e 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
 
 # targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
new file mode 100644
index 000000000000..0f642ef8cd26
--- /dev/null
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -0,0 +1,70 @@
+/* iptables module for the packet checksum mangling
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * (C) 2010 Red Hat, Inc.
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
+MODULE_DESCRIPTION("Xtables: checksum modification");
+MODULE_ALIAS("ipt_CHECKSUM");
+MODULE_ALIAS("ip6t_CHECKSUM");
+
+static unsigned int
+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		skb_checksum_help(skb);
+
+	return XT_CONTINUE;
+}
+
+static int checksum_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_CHECKSUM_info *einfo = par->targinfo;
+
+	if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
+		pr_info("unsupported CHECKSUM operation %x\n", einfo->operation);
+		return -EINVAL;
+	}
+	if (!einfo->operation) {
+		pr_info("no CHECKSUM operation enabled\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_target checksum_tg_reg __read_mostly = {
+	.name		= "CHECKSUM",
+	.family		= NFPROTO_UNSPEC,
+	.target		= checksum_tg,
+	.targetsize	= sizeof(struct xt_CHECKSUM_info),
+	.table		= "mangle",
+	.checkentry	= checksum_tg_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init checksum_tg_init(void)
+{
+	return xt_register_target(&checksum_tg_reg);
+}
+
+static void __exit checksum_tg_exit(void)
+{
+	xt_unregister_target(&checksum_tg_reg);
+}
+
+module_init(checksum_tg_init);
+module_exit(checksum_tg_exit);
-- 
cgit v1.2.3-59-g8ed1b


From 40d007e7df1dab17bf1ecf91e718218354d963d7 Mon Sep 17 00:00:00 2001
From: Henrik Rydberg <rydberg@euromail.se>
Date: Thu, 15 Jul 2010 23:10:10 -0700
Subject: Input: introduce MT event slots

With the rapidly increasing number of intelligent multi-contact and
multi-user devices, the need to send digested, filtered information
from a set of different sources within the same device is imminent.
This patch adds the concept of slots to the MT protocol. The slots
enumerate a set of identified sources, such that all MT events
can be passed independently and selectively per identified source.

The protocol works like this: Instead of sending a SYN_MT_REPORT
event immediately after the contact data, one sends an ABS_MT_SLOT
event immediately before the contact data. The input core will only
emit events for slots with modified MT events. It is assumed that
the same slot is used for the duration of an initiated contact.

Acked-by: Ping Cheng <pingc@wacom.com>
Acked-by: Chase Douglas <chase.douglas@canonical.com>
Acked-by: Rafi Rubin <rafi@seas.upenn.edu>
Signed-off-by: Henrik Rydberg <rydberg@euromail.se>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/evdev.c |   4 ++
 drivers/input/input.c | 135 ++++++++++++++++++++++++++++++++++----------------
 include/linux/input.h |  33 ++++++++++++
 3 files changed, 129 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index cd323254ca6f..fc5afbd78625 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -686,6 +686,10 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
 								  sizeof(struct input_absinfo))))
 					return -EFAULT;
 
+				/* We can't change number of reserved MT slots */
+				if (t == ABS_MT_SLOT)
+					return -EINVAL;
+
 				/*
 				 * Take event lock to ensure that we are not
 				 * changing device parameters in the middle
diff --git a/drivers/input/input.c b/drivers/input/input.c
index a3d5485154e7..54109c33e36c 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -33,25 +33,6 @@ MODULE_LICENSE("GPL");
 
 #define INPUT_DEVICES	256
 
-/*
- * EV_ABS events which should not be cached are listed here.
- */
-static unsigned int input_abs_bypass_init_data[] __initdata = {
-	ABS_MT_TOUCH_MAJOR,
-	ABS_MT_TOUCH_MINOR,
-	ABS_MT_WIDTH_MAJOR,
-	ABS_MT_WIDTH_MINOR,
-	ABS_MT_ORIENTATION,
-	ABS_MT_POSITION_X,
-	ABS_MT_POSITION_Y,
-	ABS_MT_TOOL_TYPE,
-	ABS_MT_BLOB_ID,
-	ABS_MT_TRACKING_ID,
-	ABS_MT_PRESSURE,
-	0
-};
-static unsigned long input_abs_bypass[BITS_TO_LONGS(ABS_CNT)];
-
 static LIST_HEAD(input_dev_list);
 static LIST_HEAD(input_handler_list);
 
@@ -181,6 +162,56 @@ static void input_stop_autorepeat(struct input_dev *dev)
 #define INPUT_PASS_TO_DEVICE	2
 #define INPUT_PASS_TO_ALL	(INPUT_PASS_TO_HANDLERS | INPUT_PASS_TO_DEVICE)
 
+static int input_handle_abs_event(struct input_dev *dev,
+				  unsigned int code, int *pval)
+{
+	bool is_mt_event;
+	int *pold;
+
+	if (code == ABS_MT_SLOT) {
+		/*
+		 * "Stage" the event; we'll flush it later, when we
+		 * get actiual touch data.
+		 */
+		if (*pval >= 0 && *pval < dev->mtsize)
+			dev->slot = *pval;
+
+		return INPUT_IGNORE_EVENT;
+	}
+
+	is_mt_event = code >= ABS_MT_FIRST && code <= ABS_MT_LAST;
+
+	if (!is_mt_event) {
+		pold = &dev->abs[code];
+	} else if (dev->mt) {
+		struct input_mt_slot *mtslot = &dev->mt[dev->slot];
+		pold = &mtslot->abs[code - ABS_MT_FIRST];
+	} else {
+		/*
+		 * Bypass filtering for multitouch events when
+		 * not employing slots.
+		 */
+		pold = NULL;
+	}
+
+	if (pold) {
+		*pval = input_defuzz_abs_event(*pval, *pold,
+						dev->absfuzz[code]);
+		if (*pold == *pval)
+			return INPUT_IGNORE_EVENT;
+
+		*pold = *pval;
+	}
+
+	/* Flush pending "slot" event */
+	if (is_mt_event && dev->slot != dev->abs[ABS_MT_SLOT]) {
+		dev->abs[ABS_MT_SLOT] = dev->slot;
+		input_pass_event(dev, EV_ABS, ABS_MT_SLOT, dev->slot);
+	}
+
+	return INPUT_PASS_TO_HANDLERS;
+}
+
 static void input_handle_event(struct input_dev *dev,
 			       unsigned int type, unsigned int code, int value)
 {
@@ -233,21 +264,9 @@ static void input_handle_event(struct input_dev *dev,
 		break;
 
 	case EV_ABS:
-		if (is_event_supported(code, dev->absbit, ABS_MAX)) {
-
-			if (test_bit(code, input_abs_bypass)) {
-				disposition = INPUT_PASS_TO_HANDLERS;
-				break;
-			}
+		if (is_event_supported(code, dev->absbit, ABS_MAX))
+			disposition = input_handle_abs_event(dev, code, &value);
 
-			value = input_defuzz_abs_event(value,
-					dev->abs[code], dev->absfuzz[code]);
-
-			if (dev->abs[code] != value) {
-				dev->abs[code] = value;
-				disposition = INPUT_PASS_TO_HANDLERS;
-			}
-		}
 		break;
 
 	case EV_REL:
@@ -1288,6 +1307,7 @@ static void input_dev_release(struct device *device)
 	struct input_dev *dev = to_input_dev(device);
 
 	input_ff_destroy(dev);
+	input_mt_destroy_slots(dev);
 	kfree(dev);
 
 	module_put(THIS_MODULE);
@@ -1536,6 +1556,45 @@ void input_free_device(struct input_dev *dev)
 }
 EXPORT_SYMBOL(input_free_device);
 
+/**
+ * input_mt_create_slots() - create MT input slots
+ * @dev: input device supporting MT events and finger tracking
+ * @num_slots: number of slots used by the device
+ *
+ * This function allocates all necessary memory for MT slot handling
+ * in the input device, and adds ABS_MT_SLOT to the device capabilities.
+ */
+int input_mt_create_slots(struct input_dev *dev, unsigned int num_slots)
+{
+	if (!num_slots)
+		return 0;
+
+	dev->mt = kcalloc(num_slots, sizeof(struct input_mt_slot), GFP_KERNEL);
+	if (!dev->mt)
+		return -ENOMEM;
+
+	dev->mtsize = num_slots;
+	input_set_abs_params(dev, ABS_MT_SLOT, 0, num_slots - 1, 0, 0);
+
+	return 0;
+}
+EXPORT_SYMBOL(input_mt_create_slots);
+
+/**
+ * input_mt_destroy_slots() - frees the MT slots of the input device
+ * @dev: input device with allocated MT slots
+ *
+ * This function is only needed in error path as the input core will
+ * automatically free the MT slots when the device is destroyed.
+ */
+void input_mt_destroy_slots(struct input_dev *dev)
+{
+	kfree(dev->mt);
+	dev->mt = NULL;
+	dev->mtsize = 0;
+}
+EXPORT_SYMBOL(input_mt_destroy_slots);
+
 /**
  * input_set_capability - mark device as capable of a certain event
  * @dev: device that is capable of emitting or accepting event
@@ -1945,20 +2004,10 @@ static const struct file_operations input_fops = {
 	.open = input_open_file,
 };
 
-static void __init input_init_abs_bypass(void)
-{
-	const unsigned int *p;
-
-	for (p = input_abs_bypass_init_data; *p; p++)
-		input_abs_bypass[BIT_WORD(*p)] |= BIT_MASK(*p);
-}
-
 static int __init input_init(void)
 {
 	int err;
 
-	input_init_abs_bypass();
-
 	err = class_register(&input_class);
 	if (err) {
 		printk(KERN_ERR "input: unable to register input_dev class\n");
diff --git a/include/linux/input.h b/include/linux/input.h
index cc524c8b6703..a14de64ed16a 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -691,9 +691,12 @@ struct input_absinfo {
 #define ABS_TILT_X		0x1a
 #define ABS_TILT_Y		0x1b
 #define ABS_TOOL_WIDTH		0x1c
+
 #define ABS_VOLUME		0x20
+
 #define ABS_MISC		0x28
 
+#define ABS_MT_SLOT		0x2f	/* MT slot being modified */
 #define ABS_MT_TOUCH_MAJOR	0x30	/* Major axis of touching ellipse */
 #define ABS_MT_TOUCH_MINOR	0x31	/* Minor axis (omit if circular) */
 #define ABS_MT_WIDTH_MAJOR	0x32	/* Major axis of approaching ellipse */
@@ -706,6 +709,12 @@ struct input_absinfo {
 #define ABS_MT_TRACKING_ID	0x39	/* Unique ID of initiated contact */
 #define ABS_MT_PRESSURE		0x3a	/* Pressure on contact area */
 
+#ifdef __KERNEL__
+/* Implementation details, userspace should not care about these */
+#define ABS_MT_FIRST		ABS_MT_TOUCH_MAJOR
+#define ABS_MT_LAST		ABS_MT_PRESSURE
+#endif
+
 #define ABS_MAX			0x3f
 #define ABS_CNT			(ABS_MAX+1)
 
@@ -1047,6 +1056,14 @@ struct ff_effect {
 #include <linux/timer.h>
 #include <linux/mod_devicetable.h>
 
+/**
+ * struct input_mt_slot - represents the state of an input MT slot
+ * @abs: holds current values of ABS_MT axes for this slot
+ */
+struct input_mt_slot {
+	int abs[ABS_MT_LAST - ABS_MT_FIRST + 1];
+};
+
 /**
  * struct input_dev - represents an input device
  * @name: name of the device
@@ -1085,6 +1102,10 @@ struct ff_effect {
  * @sync: set to 1 when there were no new events since last EV_SYNC
  * @abs: current values for reports from absolute axes
  * @rep: current values for autorepeat parameters (delay, rate)
+ * @mt: pointer to array of struct input_mt_slot holding current values
+ *	of tracked contacts
+ * @mtsize: number of MT slots the device uses
+ * @slot: MT slot currently being transmitted
  * @key: reflects current state of device's keys/buttons
  * @led: reflects current state of device's LEDs
  * @snd: reflects current state of sound effects
@@ -1164,6 +1185,10 @@ struct input_dev {
 	int abs[ABS_CNT];
 	int rep[REP_MAX + 1];
 
+	struct input_mt_slot *mt;
+	int mtsize;
+	int slot;
+
 	unsigned long key[BITS_TO_LONGS(KEY_CNT)];
 	unsigned long led[BITS_TO_LONGS(LED_CNT)];
 	unsigned long snd[BITS_TO_LONGS(SND_CNT)];
@@ -1412,6 +1437,11 @@ static inline void input_mt_sync(struct input_dev *dev)
 	input_event(dev, EV_SYN, SYN_MT_REPORT, 0);
 }
 
+static inline void input_mt_slot(struct input_dev *dev, int slot)
+{
+	input_event(dev, EV_ABS, ABS_MT_SLOT, slot);
+}
+
 void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code);
 
 /**
@@ -1506,5 +1536,8 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file);
 int input_ff_create_memless(struct input_dev *dev, void *data,
 		int (*play_effect)(struct input_dev *, void *, struct ff_effect *));
 
+int input_mt_create_slots(struct input_dev *dev, unsigned int num_slots);
+void input_mt_destroy_slots(struct input_dev *dev);
+
 #endif
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 20da92de8ec3c1d4ba7e5aca322d38b6ce634932 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 15 Jul 2010 23:27:36 -0700
Subject: Input: change input handlers to use bool when possible

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/evdev.c    | 6 +++---
 drivers/input/input.c    | 6 +++---
 drivers/input/joydev.c   | 7 +++----
 drivers/input/mousedev.c | 6 +++---
 include/linux/input.h    | 6 +++---
 5 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index fc5afbd78625..70c0eb52ca96 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -24,7 +24,6 @@
 #include "input-compat.h"
 
 struct evdev {
-	int exist;
 	int open;
 	int minor;
 	struct input_handle handle;
@@ -34,6 +33,7 @@ struct evdev {
 	spinlock_t client_lock; /* protects client_list */
 	struct mutex mutex;
 	struct device dev;
+	bool exist;
 };
 
 struct evdev_client {
@@ -793,7 +793,7 @@ static void evdev_remove_chrdev(struct evdev *evdev)
 static void evdev_mark_dead(struct evdev *evdev)
 {
 	mutex_lock(&evdev->mutex);
-	evdev->exist = 0;
+	evdev->exist = false;
 	mutex_unlock(&evdev->mutex);
 }
 
@@ -842,7 +842,7 @@ static int evdev_connect(struct input_handler *handler, struct input_dev *dev,
 	init_waitqueue_head(&evdev->wait);
 
 	dev_set_name(&evdev->dev, "event%d", minor);
-	evdev->exist = 1;
+	evdev->exist = true;
 	evdev->minor = minor;
 
 	evdev->handle.dev = input_get_device(dev);
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 54109c33e36c..e1243b4b32a5 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -227,12 +227,12 @@ static void input_handle_event(struct input_dev *dev,
 
 		case SYN_REPORT:
 			if (!dev->sync) {
-				dev->sync = 1;
+				dev->sync = true;
 				disposition = INPUT_PASS_TO_HANDLERS;
 			}
 			break;
 		case SYN_MT_REPORT:
-			dev->sync = 0;
+			dev->sync = false;
 			disposition = INPUT_PASS_TO_HANDLERS;
 			break;
 		}
@@ -317,7 +317,7 @@ static void input_handle_event(struct input_dev *dev,
 	}
 
 	if (disposition != INPUT_IGNORE_EVENT && type != EV_SYN)
-		dev->sync = 0;
+		dev->sync = false;
 
 	if ((disposition & INPUT_PASS_TO_DEVICE) && dev->event)
 		dev->event(dev, type, code, value);
diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index 34157bb97ed6..63834585c283 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -37,7 +37,6 @@ MODULE_LICENSE("GPL");
 #define JOYDEV_BUFFER_SIZE	64
 
 struct joydev {
-	int exist;
 	int open;
 	int minor;
 	struct input_handle handle;
@@ -46,6 +45,7 @@ struct joydev {
 	spinlock_t client_lock; /* protects client_list */
 	struct mutex mutex;
 	struct device dev;
+	bool exist;
 
 	struct js_corr corr[ABS_CNT];
 	struct JS_DATA_SAVE_TYPE glue;
@@ -760,7 +760,7 @@ static void joydev_remove_chrdev(struct joydev *joydev)
 static void joydev_mark_dead(struct joydev *joydev)
 {
 	mutex_lock(&joydev->mutex);
-	joydev->exist = 0;
+	joydev->exist = false;
 	mutex_unlock(&joydev->mutex);
 }
 
@@ -817,10 +817,9 @@ static int joydev_connect(struct input_handler *handler, struct input_dev *dev,
 	init_waitqueue_head(&joydev->wait);
 
 	dev_set_name(&joydev->dev, "js%d", minor);
-	joydev->exist = 1;
+	joydev->exist = true;
 	joydev->minor = minor;
 
-	joydev->exist = 1;
 	joydev->handle.dev = input_get_device(dev);
 	joydev->handle.name = dev_name(&joydev->dev);
 	joydev->handle.handler = handler;
diff --git a/drivers/input/mousedev.c b/drivers/input/mousedev.c
index f34b22bce4ff..d7a7a2fce745 100644
--- a/drivers/input/mousedev.c
+++ b/drivers/input/mousedev.c
@@ -57,7 +57,6 @@ struct mousedev_hw_data {
 };
 
 struct mousedev {
-	int exist;
 	int open;
 	int minor;
 	struct input_handle handle;
@@ -66,6 +65,7 @@ struct mousedev {
 	spinlock_t client_lock; /* protects client_list */
 	struct mutex mutex;
 	struct device dev;
+	bool exist;
 
 	struct list_head mixdev_node;
 	int mixdev_open;
@@ -802,7 +802,7 @@ static void mousedev_remove_chrdev(struct mousedev *mousedev)
 static void mousedev_mark_dead(struct mousedev *mousedev)
 {
 	mutex_lock(&mousedev->mutex);
-	mousedev->exist = 0;
+	mousedev->exist = false;
 	mutex_unlock(&mousedev->mutex);
 }
 
@@ -862,7 +862,7 @@ static struct mousedev *mousedev_create(struct input_dev *dev,
 		dev_set_name(&mousedev->dev, "mouse%d", minor);
 
 	mousedev->minor = minor;
-	mousedev->exist = 1;
+	mousedev->exist = true;
 	mousedev->handle.dev = input_get_device(dev);
 	mousedev->handle.name = dev_name(&mousedev->dev);
 	mousedev->handle.handler = handler;
diff --git a/include/linux/input.h b/include/linux/input.h
index a14de64ed16a..339d043ccb53 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1099,7 +1099,6 @@ struct input_mt_slot {
  * @repeat_key: stores key code of the last key pressed; used to implement
  *	software autorepeat
  * @timer: timer for software autorepeat
- * @sync: set to 1 when there were no new events since last EV_SYNC
  * @abs: current values for reports from absolute axes
  * @rep: current values for autorepeat parameters (delay, rate)
  * @mt: pointer to array of struct input_mt_slot holding current values
@@ -1144,6 +1143,7 @@ struct input_mt_slot {
  *	last user closes the device
  * @going_away: marks devices that are in a middle of unregistering and
  *	causes input_open_device*() fail with -ENODEV.
+ * @sync: set to %true when there were no new events since last EV_SYN
  * @dev: driver model's view of this device
  * @h_list: list of input handles associated with the device. When
  *	accessing the list dev->mutex must be held
@@ -1180,8 +1180,6 @@ struct input_dev {
 	unsigned int repeat_key;
 	struct timer_list timer;
 
-	int sync;
-
 	int abs[ABS_CNT];
 	int rep[REP_MAX + 1];
 
@@ -1213,6 +1211,8 @@ struct input_dev {
 	unsigned int users;
 	bool going_away;
 
+	bool sync;
+
 	struct device dev;
 
 	struct list_head	h_list;
-- 
cgit v1.2.3-59-g8ed1b


From 8fd00b4d7014b00448eb33cf0590815304769798 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 26 Aug 2009 18:41:16 +0200
Subject: rlimits: security, add task_struct to setrlimit

Add task_struct to task_setrlimit of security_operations to be able to set
rlimit of task other than current.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Acked-by: Eric Paris <eparis@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
---
 include/linux/security.h | 9 ++++++---
 kernel/sys.c             | 2 +-
 security/capability.c    | 3 ++-
 security/security.c      | 5 +++--
 security/selinux/hooks.c | 7 ++++---
 5 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 0c8819170463..1a3eb5ff4357 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1501,7 +1501,8 @@ struct security_operations {
 	int (*task_setnice) (struct task_struct *p, int nice);
 	int (*task_setioprio) (struct task_struct *p, int ioprio);
 	int (*task_getioprio) (struct task_struct *p);
-	int (*task_setrlimit) (unsigned int resource, struct rlimit *new_rlim);
+	int (*task_setrlimit) (struct task_struct *p, unsigned int resource,
+			struct rlimit *new_rlim);
 	int (*task_setscheduler) (struct task_struct *p, int policy,
 				  struct sched_param *lp);
 	int (*task_getscheduler) (struct task_struct *p);
@@ -1751,7 +1752,8 @@ void security_task_getsecid(struct task_struct *p, u32 *secid);
 int security_task_setnice(struct task_struct *p, int nice);
 int security_task_setioprio(struct task_struct *p, int ioprio);
 int security_task_getioprio(struct task_struct *p);
-int security_task_setrlimit(unsigned int resource, struct rlimit *new_rlim);
+int security_task_setrlimit(struct task_struct *p, unsigned int resource,
+		struct rlimit *new_rlim);
 int security_task_setscheduler(struct task_struct *p,
 				int policy, struct sched_param *lp);
 int security_task_getscheduler(struct task_struct *p);
@@ -2313,7 +2315,8 @@ static inline int security_task_getioprio(struct task_struct *p)
 	return 0;
 }
 
-static inline int security_task_setrlimit(unsigned int resource,
+static inline int security_task_setrlimit(struct task_struct *p,
+					  unsigned int resource,
 					  struct rlimit *new_rlim)
 {
 	return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index e83ddbbaf89d..1ba4522689d4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1290,7 +1290,7 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
 		return -EPERM;
 
-	retval = security_task_setrlimit(resource, &new_rlim);
+	retval = security_task_setrlimit(current, resource, &new_rlim);
 	if (retval)
 		return retval;
 
diff --git a/security/capability.c b/security/capability.c
index 8168e3ecd5bf..7e468263f2de 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -412,7 +412,8 @@ static int cap_task_getioprio(struct task_struct *p)
 	return 0;
 }
 
-static int cap_task_setrlimit(unsigned int resource, struct rlimit *new_rlim)
+static int cap_task_setrlimit(struct task_struct *p, unsigned int resource,
+		struct rlimit *new_rlim)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 351942a4ca0e..aa510609a955 100644
--- a/security/security.c
+++ b/security/security.c
@@ -769,9 +769,10 @@ int security_task_getioprio(struct task_struct *p)
 	return security_ops->task_getioprio(p);
 }
 
-int security_task_setrlimit(unsigned int resource, struct rlimit *new_rlim)
+int security_task_setrlimit(struct task_struct *p, unsigned int resource,
+		struct rlimit *new_rlim)
 {
-	return security_ops->task_setrlimit(resource, new_rlim);
+	return security_ops->task_setrlimit(p, resource, new_rlim);
 }
 
 int security_task_setscheduler(struct task_struct *p,
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5c9f25ba1c95..e3ce6b4127cc 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3371,16 +3371,17 @@ static int selinux_task_getioprio(struct task_struct *p)
 	return current_has_perm(p, PROCESS__GETSCHED);
 }
 
-static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim)
+static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
+		struct rlimit *new_rlim)
 {
-	struct rlimit *old_rlim = current->signal->rlim + resource;
+	struct rlimit *old_rlim = p->signal->rlim + resource;
 
 	/* Control the ability to change the hard limit (whether
 	   lowering or raising it), so that the hard limit can
 	   later be used as a safe reset point for the soft limit
 	   upon context transitions.  See selinux_bprm_committing_creds. */
 	if (old_rlim->rlim_max != new_rlim->rlim_max)
-		return current_has_perm(current, PROCESS__SETRLIMIT);
+		return current_has_perm(p, PROCESS__SETRLIMIT);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 5ab46b345e418747b3a52f0892680c0745c4223c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Fri, 28 Aug 2009 14:05:12 +0200
Subject: rlimits: add task_struct to update_rlimit_cpu

Add task_struct as a parameter to update_rlimit_cpu to be able to set
rlimit_cpu of different task than current.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Acked-by: James Morris <jmorris@namei.org>
---
 include/linux/posix-timers.h | 2 +-
 kernel/posix-cpu-timers.c    | 8 ++++----
 kernel/sys.c                 | 2 +-
 security/selinux/hooks.c     | 3 ++-
 4 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 4f71bf4e628c..3e23844a6990 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -117,6 +117,6 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 
 long clock_nanosleep_restart(struct restart_block *restart_block);
 
-void update_rlimit_cpu(unsigned long rlim_new);
+void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
 
 #endif
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..0513900995ce 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -16,13 +16,13 @@
  * siglock protection since other code may update expiration cache as
  * well.
  */
-void update_rlimit_cpu(unsigned long rlim_new)
+void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
 {
 	cputime_t cputime = secs_to_cputime(rlim_new);
 
-	spin_lock_irq(&current->sighand->siglock);
-	set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
-	spin_unlock_irq(&current->sighand->siglock);
+	spin_lock_irq(&task->sighand->siglock);
+	set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
+	spin_unlock_irq(&task->sighand->siglock);
 }
 
 static int check_clock(const clockid_t which_clock)
diff --git a/kernel/sys.c b/kernel/sys.c
index 1ba4522689d4..f5183b08adfc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1320,7 +1320,7 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 	if (new_rlim.rlim_cur == RLIM_INFINITY)
 		goto out;
 
-	update_rlimit_cpu(new_rlim.rlim_cur);
+	update_rlimit_cpu(current, new_rlim.rlim_cur);
 out:
 	return 0;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e3ce6b4127cc..afb18a9ebba1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2338,7 +2338,8 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
 			initrlim = init_task.signal->rlim + i;
 			rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
 		}
-		update_rlimit_cpu(current->signal->rlim[RLIMIT_CPU].rlim_cur);
+		update_rlimit_cpu(current,
+				current->signal->rlim[RLIMIT_CPU].rlim_cur);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 7855c35da7ba16b389d17710401c4a55a3ea2102 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 26 Aug 2009 23:45:34 +0200
Subject: rlimits: split sys_setrlimit

Create do_setrlimit from sys_setrlimit and declare do_setrlimit
in the resource header. This is the first phase to have generic
do_prlimit which allows to be called from read, write and compat
rlimits code.

The new do_setrlimit also accepts a task pointer to change the limits
of. Currently, it cannot be other than current, but this will change
with locking later.

Also pass tsk->group_leader to security_task_setrlimit to check
whether current is allowed to change rlimits of the process and not
its arbitrary thread because it makes more sense given that rlimit are
per process and not per-thread.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
---
 include/linux/resource.h |  2 ++
 kernel/sys.c             | 40 ++++++++++++++++++++++++----------------
 2 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/resource.h b/include/linux/resource.h
index f1e914eefeab..cf8dc96653ee 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -73,6 +73,8 @@ struct rlimit {
 struct task_struct;
 
 int getrusage(struct task_struct *p, int who, struct rusage __user *ru);
+int do_setrlimit(struct task_struct *tsk, unsigned int resource,
+		struct rlimit *new_rlim);
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/sys.c b/kernel/sys.c
index f2b2d7aa3818..b5b96e30e0d6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1272,42 +1272,41 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
 
 #endif
 
-SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+int do_setrlimit(struct task_struct *tsk, unsigned int resource,
+		struct rlimit *new_rlim)
 {
-	struct rlimit new_rlim, *old_rlim;
+	struct rlimit *old_rlim;
 	int retval;
 
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
-	if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
-		return -EFAULT;
-	if (new_rlim.rlim_cur > new_rlim.rlim_max)
+	if (new_rlim->rlim_cur > new_rlim->rlim_max)
 		return -EINVAL;
-	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
+	if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open)
 		return -EPERM;
 
-	retval = security_task_setrlimit(current, resource, &new_rlim);
+	retval = security_task_setrlimit(tsk->group_leader, resource, new_rlim);
 	if (retval)
 		return retval;
 
-	if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
+	if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
 		/*
 		 * The caller is asking for an immediate RLIMIT_CPU
 		 * expiry.  But we use the zero value to mean "it was
 		 * never set".  So let's cheat and make it one second
 		 * instead
 		 */
-		new_rlim.rlim_cur = 1;
+		new_rlim->rlim_cur = 1;
 	}
 
-	old_rlim = current->signal->rlim + resource;
-	task_lock(current->group_leader);
-	if (new_rlim.rlim_max > old_rlim->rlim_max &&
+	old_rlim = tsk->signal->rlim + resource;
+	task_lock(tsk->group_leader);
+	if (new_rlim->rlim_max > old_rlim->rlim_max &&
 			!capable(CAP_SYS_RESOURCE))
 		retval = -EPERM;
 	else
-		*old_rlim = new_rlim;
-	task_unlock(current->group_leader);
+		*old_rlim = *new_rlim;
+	task_unlock(tsk->group_leader);
 
 	if (retval || resource != RLIMIT_CPU)
 		goto out;
@@ -1318,14 +1317,23 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 	 * very long-standing error, and fixing it now risks breakage of
 	 * applications, so we live with it
 	 */
-	if (new_rlim.rlim_cur == RLIM_INFINITY)
+	if (new_rlim->rlim_cur == RLIM_INFINITY)
 		goto out;
 
-	update_rlimit_cpu(current, new_rlim.rlim_cur);
+	update_rlimit_cpu(tsk, new_rlim->rlim_cur);
 out:
 	return retval;
 }
 
+SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+{
+	struct rlimit new_rlim;
+
+	if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+		return -EFAULT;
+	return do_setrlimit(current, resource, &new_rlim);
+}
+
 /*
  * It would make sense to put struct rusage in the task_struct,
  * except that would make the task_struct be *really big*.  After
-- 
cgit v1.2.3-59-g8ed1b


From 6a1d5e2c85d06da35cdfd93f1a27675bfdc3ad8c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 24 Mar 2010 17:06:58 +0100
Subject: rlimits: add rlimit64 structure

Add a platform independent structure for resource limits to use with
a new prlimit64 syscall. This structure is the same which uses glibc
for 64-bit limits.

Also add corresponding infinity which is a 64-bit full of bit-ones.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
---
 include/linux/resource.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/resource.h b/include/linux/resource.h
index cf8dc96653ee..037aa7e6335d 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -43,6 +43,13 @@ struct rlimit {
 	unsigned long	rlim_max;
 };
 
+#define RLIM64_INFINITY		(~0ULL)
+
+struct rlimit64 {
+	__u64 rlim_cur;
+	__u64 rlim_max;
+};
+
 #define	PRIO_MIN	(-20)
 #define	PRIO_MAX	20
 
-- 
cgit v1.2.3-59-g8ed1b


From 5b41535aac0c07135ff6a4c5c2ae115d1c20c0bc Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 24 Mar 2010 16:11:29 +0100
Subject: rlimits: redo do_setrlimit to more generic do_prlimit

It now allows also reading of limits. I.e. all read and writes will
later use this function.

It takes two parameters, new and old limits which can be both NULL.
If new is non-NULL, the value in it is set to rlimits.
If old is non-NULL, current rlimits are stored there.
If both are non-NULL, old are stored prior to setting the new ones,
atomically.
(Similar to sigaction.)

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
---
 include/linux/resource.h |  4 +--
 kernel/sys.c             | 71 +++++++++++++++++++++++++-----------------------
 2 files changed, 39 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/resource.h b/include/linux/resource.h
index 037aa7e6335d..88d36f9145ba 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -80,8 +80,8 @@ struct rlimit64 {
 struct task_struct;
 
 int getrusage(struct task_struct *p, int who, struct rusage __user *ru);
-int do_setrlimit(struct task_struct *tsk, unsigned int resource,
-		struct rlimit *new_rlim);
+int do_prlimit(struct task_struct *tsk, unsigned int resource,
+		struct rlimit *new_rlim, struct rlimit *old_rlim);
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/sys.c b/kernel/sys.c
index c762eebdebf7..bc7d1be0960e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1273,18 +1273,21 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
 #endif
 
 /* make sure you are allowed to change @tsk limits before calling this */
-int do_setrlimit(struct task_struct *tsk, unsigned int resource,
-		struct rlimit *new_rlim)
+int do_prlimit(struct task_struct *tsk, unsigned int resource,
+		struct rlimit *new_rlim, struct rlimit *old_rlim)
 {
-	struct rlimit *old_rlim;
+	struct rlimit *rlim;
 	int retval = 0;
 
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
-	if (new_rlim->rlim_cur > new_rlim->rlim_max)
-		return -EINVAL;
-	if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open)
-		return -EPERM;
+	if (new_rlim) {
+		if (new_rlim->rlim_cur > new_rlim->rlim_max)
+			return -EINVAL;
+		if (resource == RLIMIT_NOFILE &&
+				new_rlim->rlim_max > sysctl_nr_open)
+			return -EPERM;
+	}
 
 	/* protect tsk->signal and tsk->sighand from disappearing */
 	read_lock(&tasklist_lock);
@@ -1293,42 +1296,42 @@ int do_setrlimit(struct task_struct *tsk, unsigned int resource,
 		goto out;
 	}
 
-	old_rlim = tsk->signal->rlim + resource;
+	rlim = tsk->signal->rlim + resource;
 	task_lock(tsk->group_leader);
-	if (new_rlim->rlim_max > old_rlim->rlim_max &&
-			!capable(CAP_SYS_RESOURCE))
-		retval = -EPERM;
-	if (!retval)
-		retval = security_task_setrlimit(tsk->group_leader, resource,
-				new_rlim);
-
-	if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
-		/*
-		 * The caller is asking for an immediate RLIMIT_CPU
-		 * expiry.  But we use the zero value to mean "it was
-		 * never set".  So let's cheat and make it one second
-		 * instead
-		 */
-		new_rlim->rlim_cur = 1;
+	if (new_rlim) {
+		if (new_rlim->rlim_max > rlim->rlim_max &&
+				!capable(CAP_SYS_RESOURCE))
+			retval = -EPERM;
+		if (!retval)
+			retval = security_task_setrlimit(tsk->group_leader,
+					resource, new_rlim);
+		if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
+			/*
+			 * The caller is asking for an immediate RLIMIT_CPU
+			 * expiry.  But we use the zero value to mean "it was
+			 * never set".  So let's cheat and make it one second
+			 * instead
+			 */
+			new_rlim->rlim_cur = 1;
+		}
+	}
+	if (!retval) {
+		if (old_rlim)
+			*old_rlim = *rlim;
+		if (new_rlim)
+			*rlim = *new_rlim;
 	}
-
-	if (!retval)
-		*old_rlim = *new_rlim;
 	task_unlock(tsk->group_leader);
 
-	if (retval || resource != RLIMIT_CPU)
-		goto out;
-
 	/*
 	 * RLIMIT_CPU handling.   Note that the kernel fails to return an error
 	 * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a
 	 * very long-standing error, and fixing it now risks breakage of
 	 * applications, so we live with it
 	 */
-	if (new_rlim->rlim_cur == RLIM_INFINITY)
-		goto out;
-
-	update_rlimit_cpu(tsk, new_rlim->rlim_cur);
+	 if (!retval && new_rlim && resource == RLIMIT_CPU &&
+			 new_rlim->rlim_cur != RLIM_INFINITY)
+		update_rlimit_cpu(tsk, new_rlim->rlim_cur);
 out:
 	read_unlock(&tasklist_lock);
 	return retval;
@@ -1340,7 +1343,7 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 
 	if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
 		return -EFAULT;
-	return do_setrlimit(current, resource, &new_rlim);
+	return do_prlimit(current, resource, &new_rlim, NULL);
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From c022a0acad534fd5f5d5f17280f6d4d135e74e81 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 4 May 2010 18:03:50 +0200
Subject: rlimits: implement prlimit64 syscall

This patch adds the code to support the sys_prlimit64 syscall which
modifies-and-returns the rlim values of a selected process atomically.
The first parameter, pid, being 0 means current process.

Unlike the current implementation, it is a generic interface,
architecture indepentent so that we needn't handle compat stuff
anymore. In the future, after glibc start to use this we can deprecate
sys_setrlimit and sys_getrlimit in favor to clean up the code finally.

It also adds a possibility of changing limits of other processes. We
check the user's permissions to do that and if it succeeds, the new
limits are propagated online. This is good for large scale
applications such as SAP or databases where administrators need to
change limits time by time (e.g. on crashes increase core size). And
it is unacceptable to restart the service.

For safety, all rlim users now either use accessors or doesn't need
them due to
- locking
- the fact a process was just forked and nobody else knows about it
  yet (and nobody can't thus read/write limits)
hence it is safe to modify limits now.

The limitation is that we currently stay at ulong internal
representation. So the rlim64_is_infinity check is used where value is
compared against ULONG_MAX on 32-bit which is the maximum value there.

And since internally the limits are held in struct rlimit, converters
which are used before and after do_prlimit call in sys_prlimit64 are
introduced.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
---
 include/linux/syscalls.h |  4 +++
 kernel/sys.c             | 94 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7f614ce274a9..a60943be4270 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -35,6 +35,7 @@ struct oldold_utsname;
 struct old_utsname;
 struct pollfd;
 struct rlimit;
+struct rlimit64;
 struct rusage;
 struct sched_param;
 struct sel_arg_struct;
@@ -644,6 +645,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
 #endif
 asmlinkage long sys_setrlimit(unsigned int resource,
 				struct rlimit __user *rlim);
+asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
+				const struct rlimit64 __user *new_rlim,
+				struct rlimit64 __user *old_rlim);
 asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
 asmlinkage long sys_umask(int mask);
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 9da98dd47276..e9ad44489828 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1271,6 +1271,39 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
 
 #endif
 
+static inline bool rlim64_is_infinity(__u64 rlim64)
+{
+#if BITS_PER_LONG < 64
+	return rlim64 >= ULONG_MAX;
+#else
+	return rlim64 == RLIM64_INFINITY;
+#endif
+}
+
+static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
+{
+	if (rlim->rlim_cur == RLIM_INFINITY)
+		rlim64->rlim_cur = RLIM64_INFINITY;
+	else
+		rlim64->rlim_cur = rlim->rlim_cur;
+	if (rlim->rlim_max == RLIM_INFINITY)
+		rlim64->rlim_max = RLIM64_INFINITY;
+	else
+		rlim64->rlim_max = rlim->rlim_max;
+}
+
+static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
+{
+	if (rlim64_is_infinity(rlim64->rlim_cur))
+		rlim->rlim_cur = RLIM_INFINITY;
+	else
+		rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
+	if (rlim64_is_infinity(rlim64->rlim_max))
+		rlim->rlim_max = RLIM_INFINITY;
+	else
+		rlim->rlim_max = (unsigned long)rlim64->rlim_max;
+}
+
 /* make sure you are allowed to change @tsk limits before calling this */
 int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		struct rlimit *new_rlim, struct rlimit *old_rlim)
@@ -1336,6 +1369,67 @@ out:
 	return retval;
 }
 
+/* rcu lock must be held */
+static int check_prlimit_permission(struct task_struct *task)
+{
+	const struct cred *cred = current_cred(), *tcred;
+
+	tcred = __task_cred(task);
+	if ((cred->uid != tcred->euid ||
+	     cred->uid != tcred->suid ||
+	     cred->uid != tcred->uid  ||
+	     cred->gid != tcred->egid ||
+	     cred->gid != tcred->sgid ||
+	     cred->gid != tcred->gid) &&
+	     !capable(CAP_SYS_RESOURCE)) {
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
+		const struct rlimit64 __user *, new_rlim,
+		struct rlimit64 __user *, old_rlim)
+{
+	struct rlimit64 old64, new64;
+	struct rlimit old, new;
+	struct task_struct *tsk;
+	int ret;
+
+	if (new_rlim) {
+		if (copy_from_user(&new64, new_rlim, sizeof(new64)))
+			return -EFAULT;
+		rlim64_to_rlim(&new64, &new);
+	}
+
+	rcu_read_lock();
+	tsk = pid ? find_task_by_vpid(pid) : current;
+	if (!tsk) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	ret = check_prlimit_permission(tsk);
+	if (ret) {
+		rcu_read_unlock();
+		return ret;
+	}
+	get_task_struct(tsk);
+	rcu_read_unlock();
+
+	ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
+			old_rlim ? &old : NULL);
+
+	if (!ret && old_rlim) {
+		rlim_to_rlim64(&old, &old64);
+		if (copy_to_user(old_rlim, &old64, sizeof(old64)))
+			ret = -EFAULT;
+	}
+
+	put_task_struct(tsk);
+	return ret;
+}
+
 SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 {
 	struct rlimit new_rlim;
-- 
cgit v1.2.3-59-g8ed1b


From af537b0a6c650ab6ff7104d8163e96866b31c835 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux-foundation.org>
Date: Fri, 9 Jul 2010 14:07:14 -0500
Subject: slub: Use kmem_cache flags to detect if slab is in debugging mode.

The cacheline with the flags is reachable from the hot paths after the
percpu allocator changes went in. So there is no need anymore to put a
flag into each slab page. Get rid of the SlubDebug flag and use
the flags in kmem_cache instead.

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/page-flags.h |  2 --
 mm/slub.c                  | 33 ++++++++++++---------------------
 2 files changed, 12 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5b59f35dcb8f..6fa317801e1c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -128,7 +128,6 @@ enum pageflags {
 
 	/* SLUB */
 	PG_slub_frozen = PG_active,
-	PG_slub_debug = PG_error,
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -215,7 +214,6 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
 __PAGEFLAG(SlobFree, slob_free)
 
 __PAGEFLAG(SlubFrozen, slub_frozen)
-__PAGEFLAG(SlubDebug, slub_debug)
 
 /*
  * Private page markings that may be used by the filesystem that owns the page
diff --git a/mm/slub.c b/mm/slub.c
index b89a7c99b2fa..9cf5dae7815e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -107,11 +107,17 @@
  * 			the fast path and disables lockless freelists.
  */
 
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+		SLAB_TRACE | SLAB_DEBUG_FREE)
+
+static inline int kmem_cache_debug(struct kmem_cache *s)
+{
 #ifdef CONFIG_SLUB_DEBUG
-#define SLABDEBUG 1
+	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
 #else
-#define SLABDEBUG 0
+	return 0;
 #endif
+}
 
 /*
  * Issues still to be resolved:
@@ -1157,9 +1163,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
-	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
-			SLAB_STORE_USER | SLAB_TRACE))
-		__SetPageSlubDebug(page);
 
 	start = page_address(page);
 
@@ -1186,14 +1189,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 	int order = compound_order(page);
 	int pages = 1 << order;
 
-	if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
+	if (kmem_cache_debug(s)) {
 		void *p;
 
 		slab_pad_check(s, page);
 		for_each_object(p, s, page_address(page),
 						page->objects)
 			check_object(s, page, p, 0);
-		__ClearPageSlubDebug(page);
 	}
 
 	kmemcheck_free_shadow(page, compound_order(page));
@@ -1415,8 +1417,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
 			stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
 		} else {
 			stat(s, DEACTIVATE_FULL);
-			if (SLABDEBUG && PageSlubDebug(page) &&
-						(s->flags & SLAB_STORE_USER))
+			if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
 				add_full(n, page);
 		}
 		slab_unlock(page);
@@ -1624,7 +1625,7 @@ load_freelist:
 	object = c->page->freelist;
 	if (unlikely(!object))
 		goto another_slab;
-	if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
+	if (kmem_cache_debug(s))
 		goto debug;
 
 	c->freelist = get_freepointer(s, object);
@@ -1783,7 +1784,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 	stat(s, FREE_SLOWPATH);
 	slab_lock(page);
 
-	if (unlikely(SLABDEBUG && PageSlubDebug(page)))
+	if (kmem_cache_debug(s))
 		goto debug;
 
 checks_ok:
@@ -3398,16 +3399,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
 	} else
 		printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
 			s->name, page);
-
-	if (s->flags & DEBUG_DEFAULT_FLAGS) {
-		if (!PageSlubDebug(page))
-			printk(KERN_ERR "SLUB %s: SlubDebug not set "
-				"on slab 0x%p\n", s->name, page);
-	} else {
-		if (PageSlubDebug(page))
-			printk(KERN_ERR "SLUB %s: SlubDebug set on "
-				"slab 0x%p\n", s->name, page);
-	}
 }
 
 static int validate_slab_node(struct kmem_cache *s,
-- 
cgit v1.2.3-59-g8ed1b


From 22cb516696304a9b85892b18c483a27d97cfa51b Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 16 Jul 2010 14:08:20 +0200
Subject: netfilter: correct CHECKSUM header and export it

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/Kbuild        | 1 +
 include/linux/netfilter/xt_CHECKSUM.h | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index bb103f43afa0..b93b64dc9fae 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -3,6 +3,7 @@ header-y += nf_conntrack_tuple_common.h
 header-y += nfnetlink_conntrack.h
 header-y += nfnetlink_log.h
 header-y += nfnetlink_queue.h
+header-y += xt_CHECKSUM.h
 header-y += xt_CLASSIFY.h
 header-y += xt_CONNMARK.h
 header-y += xt_CONNSECMARK.h
diff --git a/include/linux/netfilter/xt_CHECKSUM.h b/include/linux/netfilter/xt_CHECKSUM.h
index 3b4fb77acef6..9a2e4661654e 100644
--- a/include/linux/netfilter/xt_CHECKSUM.h
+++ b/include/linux/netfilter/xt_CHECKSUM.h
@@ -6,8 +6,10 @@
  *
  * This software is distributed under GNU GPL v2, 1991
 */
-#ifndef _IPT_CHECKSUM_TARGET_H
-#define _IPT_CHECKSUM_TARGET_H
+#ifndef _XT_CHECKSUM_TARGET_H
+#define _XT_CHECKSUM_TARGET_H
+
+#include <linux/types.h>
 
 #define XT_CHECKSUM_OP_FILL	0x01	/* fill in checksum in IP header */
 
@@ -15,4 +17,4 @@ struct xt_CHECKSUM_info {
 	__u8 operation;	/* bitset of operations */
 };
 
-#endif /* _IPT_CHECKSUM_TARGET_H */
+#endif /* _XT_CHECKSUM_TARGET_H */
-- 
cgit v1.2.3-59-g8ed1b


From 2f495c398edca50ac251c134f1995a2fb3c06cb7 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 21 Jun 2010 13:20:46 +1000
Subject: net/phy/marvell: Expose IDs and flags in a .h and add dns323 LEDs
 setup flag

This moves the various known Marvell PHY IDs to include/linux/marvell_phy.h
along with dev_flags definitions for use by the driver.

I then added a flag that changes the PHY init code to setup the LEDs
config to the values needed to operate a dns323 rev C1 NAS.

I moved the existing "resistance" flag to the .h as well, though I've
been unable to find whoever sets this to convert it to use that constant.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Reviewed-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Nicolas Pitre <nico@fluxnic.net>
---
 drivers/net/phy/marvell.c   | 38 ++++++++++++++++++++------------------
 include/linux/marvell_phy.h | 20 ++++++++++++++++++++
 2 files changed, 40 insertions(+), 18 deletions(-)
 create mode 100644 include/linux/marvell_phy.h

(limited to 'include/linux')

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 78b74e83ce5d..5a1bd5db2a93 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -29,6 +29,7 @@
 #include <linux/mii.h>
 #include <linux/ethtool.h>
 #include <linux/phy.h>
+#include <linux/marvell_phy.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -48,8 +49,6 @@
 #define MII_M1145_RGMII_RX_DELAY	0x0080
 #define MII_M1145_RGMII_TX_DELAY	0x0002
 
-#define M1145_DEV_FLAGS_RESISTANCE	0x00000001
-
 #define MII_M1111_PHY_LED_CONTROL	0x18
 #define MII_M1111_PHY_LED_DIRECT	0x4100
 #define MII_M1111_PHY_LED_COMBINE	0x411c
@@ -350,7 +349,10 @@ static int m88e1118_config_init(struct phy_device *phydev)
 		return err;
 
 	/* Adjust LED Control */
-	err = phy_write(phydev, 0x10, 0x021e);
+	if (phydev->dev_flags & MARVELL_PHY_M1118_DNS323_LEDS)
+		err = phy_write(phydev, 0x10, 0x1100);
+	else
+		err = phy_write(phydev, 0x10, 0x021e);
 	if (err < 0)
 		return err;
 
@@ -398,7 +400,7 @@ static int m88e1145_config_init(struct phy_device *phydev)
 		if (err < 0)
 			return err;
 
-		if (phydev->dev_flags & M1145_DEV_FLAGS_RESISTANCE) {
+		if (phydev->dev_flags & MARVELL_PHY_M1145_FLAGS_RESISTANCE) {
 			err = phy_write(phydev, 0x1d, 0x0012);
 			if (err < 0)
 				return err;
@@ -529,8 +531,8 @@ static int m88e1121_did_interrupt(struct phy_device *phydev)
 
 static struct phy_driver marvell_drivers[] = {
 	{
-		.phy_id = 0x01410c60,
-		.phy_id_mask = 0xfffffff0,
+		.phy_id = MARVELL_PHY_ID_88E1101,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
 		.name = "Marvell 88E1101",
 		.features = PHY_GBIT_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
@@ -541,8 +543,8 @@ static struct phy_driver marvell_drivers[] = {
 		.driver = { .owner = THIS_MODULE },
 	},
 	{
-		.phy_id = 0x01410c90,
-		.phy_id_mask = 0xfffffff0,
+		.phy_id = MARVELL_PHY_ID_88E1112,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
 		.name = "Marvell 88E1112",
 		.features = PHY_GBIT_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
@@ -554,8 +556,8 @@ static struct phy_driver marvell_drivers[] = {
 		.driver = { .owner = THIS_MODULE },
 	},
 	{
-		.phy_id = 0x01410cc0,
-		.phy_id_mask = 0xfffffff0,
+		.phy_id = MARVELL_PHY_ID_88E1111,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
 		.name = "Marvell 88E1111",
 		.features = PHY_GBIT_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
@@ -567,8 +569,8 @@ static struct phy_driver marvell_drivers[] = {
 		.driver = { .owner = THIS_MODULE },
 	},
 	{
-		.phy_id = 0x01410e10,
-		.phy_id_mask = 0xfffffff0,
+		.phy_id = MARVELL_PHY_ID_88E1118,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
 		.name = "Marvell 88E1118",
 		.features = PHY_GBIT_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
@@ -580,8 +582,8 @@ static struct phy_driver marvell_drivers[] = {
 		.driver = {.owner = THIS_MODULE,},
 	},
 	{
-		.phy_id = 0x01410cb0,
-		.phy_id_mask = 0xfffffff0,
+		.phy_id = MARVELL_PHY_ID_88E1121R,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
 		.name = "Marvell 88E1121R",
 		.features = PHY_GBIT_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
@@ -593,8 +595,8 @@ static struct phy_driver marvell_drivers[] = {
 		.driver = { .owner = THIS_MODULE },
 	},
 	{
-		.phy_id = 0x01410cd0,
-		.phy_id_mask = 0xfffffff0,
+		.phy_id = MARVELL_PHY_ID_88E1145,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
 		.name = "Marvell 88E1145",
 		.features = PHY_GBIT_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
@@ -606,8 +608,8 @@ static struct phy_driver marvell_drivers[] = {
 		.driver = { .owner = THIS_MODULE },
 	},
 	{
-		.phy_id = 0x01410e30,
-		.phy_id_mask = 0xfffffff0,
+		.phy_id = MARVELL_PHY_ID_88E1240,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
 		.name = "Marvell 88E1240",
 		.features = PHY_GBIT_FEATURES,
 		.flags = PHY_HAS_INTERRUPT,
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
new file mode 100644
index 000000000000..2ed4fb8bbd51
--- /dev/null
+++ b/include/linux/marvell_phy.h
@@ -0,0 +1,20 @@
+#ifndef _MARVELL_PHY_H
+#define _MARVELL_PHY_H
+
+/* Mask used for ID comparisons */
+#define MARVELL_PHY_ID_MASK		0xfffffff0
+
+/* Known PHY IDs */
+#define MARVELL_PHY_ID_88E1101		0x01410c60
+#define MARVELL_PHY_ID_88E1112		0x01410c90
+#define MARVELL_PHY_ID_88E1111		0x01410cc0
+#define MARVELL_PHY_ID_88E1118		0x01410e10
+#define MARVELL_PHY_ID_88E1121R		0x01410cb0
+#define MARVELL_PHY_ID_88E1145		0x01410cd0
+#define MARVELL_PHY_ID_88E1240		0x01410e30
+
+/* struct phy_device dev_flags definitions */
+#define MARVELL_PHY_M1145_FLAGS_RESISTANCE	0x00000001
+#define MARVELL_PHY_M1118_DNS323_LEDS		0x00000002
+
+#endif /* _MARVELL_PHY_H */
-- 
cgit v1.2.3-59-g8ed1b


From 396e894d289d69bacf5acd983c97cd6e21a14c08 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 9 Jul 2010 15:12:27 +0200
Subject: sched: Revert nohz_ratelimit() for now

Norbert reported that nohz_ratelimit() causes his laptop to burn about
4W (40%) extra. For now back out the change and see if we can adjust
the power management code to make better decisions.

Reported-by: Norbert Preining <preining@logic.at>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    |  6 ------
 kernel/sched.c           | 10 ----------
 kernel/time/tick-sched.c |  2 +-
 3 files changed, 1 insertion(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 747fcaedddb7..6e0bb86de990 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -273,17 +273,11 @@ extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 extern int get_nohz_load_balancer(void);
-extern int nohz_ratelimit(int cpu);
 #else
 static inline int select_nohz_load_balancer(int cpu)
 {
 	return 0;
 }
-
-static inline int nohz_ratelimit(int cpu)
-{
-	return 0;
-}
 #endif
 
 /*
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..63b4a14682fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1232,16 +1232,6 @@ void wake_up_idle_cpu(int cpu)
 		smp_send_reschedule(cpu);
 }
 
-int nohz_ratelimit(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	u64 diff = rq->clock - rq->nohz_stamp;
-
-	rq->nohz_stamp = rq->clock;
-
-	return diff < (NSEC_PER_SEC / HZ) >> 1;
-}
-
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..f898af608171 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	} while (read_seqretry(&xtime_lock, seq));
 
 	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-	    arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
+	    arch_needs_cpu(cpu)) {
 		next_jiffies = last_jiffies + 1;
 		delta_jiffies = 1;
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From 2430d12c94ff2bafcfe4f65edf7ee5f300d2d9c6 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sun, 13 Jun 2010 00:36:52 +0200
Subject: PM: describe kernel policy regarding wakeup defaults (v. 2)

This patch (as1381b) updates a comment describing the kernel's policy
toward enabling wakeup by default.

It also makes device_set_wakeup_capable() actually do something when
CONFIG_PM isn't enabled.  It's not clear this is necessary; however if
it isn't then device_init_wakeup() and device_can_wakeup() should also
be do-nothing routines.  Furthermore, I don't expect this change to
have any noticeable effect -- but if it does then clearly the old
behavior was wrong.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/linux/pm_wakeup.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 22d64c18056c..76aca48722ae 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -29,8 +29,11 @@
 
 #ifdef CONFIG_PM
 
-/* changes to device_may_wakeup take effect on the next pm state change.
- * by default, devices should wakeup if they can.
+/* Changes to device_may_wakeup take effect on the next pm state change.
+ *
+ * By default, most devices should leave wakeup disabled.  The exceptions
+ * are devices that everyone expects to be wakeup sources: keyboards,
+ * power buttons, possibly network interfaces, etc.
  */
 static inline void device_init_wakeup(struct device *dev, bool val)
 {
@@ -59,7 +62,7 @@ static inline bool device_may_wakeup(struct device *dev)
 
 #else /* !CONFIG_PM */
 
-/* For some reason the next two routines work even without CONFIG_PM */
+/* For some reason the following routines work even without CONFIG_PM */
 static inline void device_init_wakeup(struct device *dev, bool val)
 {
 	dev->power.can_wakeup = val;
@@ -67,6 +70,7 @@ static inline void device_init_wakeup(struct device *dev, bool val)
 
 static inline void device_set_wakeup_capable(struct device *dev, bool capable)
 {
+	dev->power.can_wakeup = capable;
 }
 
 static inline bool device_can_wakeup(struct device *dev)
-- 
cgit v1.2.3-59-g8ed1b


From b14e033e17d0ea0ba12668d0d2f371cd31586994 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Tue, 29 Jun 2010 22:49:24 +0200
Subject: PNPACPI: Add support for remote wakeup

This patch (as1354) adds remote-wakeup support to the pnpacpi driver.
The new can_wakeup method also allows other PNP protocol drivers
(pnpbios or iaspnp) to add wakeup support, but I don't know enough
about how they work to actually do it.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Reviewed-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/pnp/core.c         |  3 +++
 drivers/pnp/pnpacpi/core.c | 23 +++++++++++++++++++++++
 include/linux/pnp.h        |  1 +
 3 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pnp/core.c b/drivers/pnp/core.c
index 5dba90995d9e..88b3cde52596 100644
--- a/drivers/pnp/core.c
+++ b/drivers/pnp/core.c
@@ -164,6 +164,9 @@ int __pnp_add_device(struct pnp_dev *dev)
 	list_add_tail(&dev->global_list, &pnp_global);
 	list_add_tail(&dev->protocol_list, &dev->protocol->devices);
 	spin_unlock(&pnp_lock);
+	if (dev->protocol->can_wakeup)
+		device_set_wakeup_capable(&dev->dev,
+				dev->protocol->can_wakeup(dev));
 	return device_register(&dev->dev);
 }
 
diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c
index f7ff628b7d94..dc4e32e031e9 100644
--- a/drivers/pnp/pnpacpi/core.c
+++ b/drivers/pnp/pnpacpi/core.c
@@ -122,17 +122,37 @@ static int pnpacpi_disable_resources(struct pnp_dev *dev)
 }
 
 #ifdef CONFIG_ACPI_SLEEP
+static bool pnpacpi_can_wakeup(struct pnp_dev *dev)
+{
+	struct acpi_device *acpi_dev = dev->data;
+	acpi_handle handle = acpi_dev->handle;
+
+	return acpi_bus_can_wakeup(handle);
+}
+
 static int pnpacpi_suspend(struct pnp_dev *dev, pm_message_t state)
 {
 	struct acpi_device *acpi_dev = dev->data;
 	acpi_handle handle = acpi_dev->handle;
 	int power_state;
 
+	if (device_can_wakeup(&dev->dev)) {
+		int rc = acpi_pm_device_sleep_wake(&dev->dev,
+				device_may_wakeup(&dev->dev));
+
+		if (rc)
+			return rc;
+	}
 	power_state = acpi_pm_device_sleep_state(&dev->dev, NULL);
 	if (power_state < 0)
 		power_state = (state.event == PM_EVENT_ON) ?
 				ACPI_STATE_D0 : ACPI_STATE_D3;
 
+	/* acpi_bus_set_power() often fails (keyboard port can't be
+	 * powered-down?), and in any case, our return value is ignored
+	 * by pnp_bus_suspend().  Hence we don't revert the wakeup
+	 * setting if the set_power fails.
+	 */
 	return acpi_bus_set_power(handle, power_state);
 }
 
@@ -141,6 +161,8 @@ static int pnpacpi_resume(struct pnp_dev *dev)
 	struct acpi_device *acpi_dev = dev->data;
 	acpi_handle handle = acpi_dev->handle;
 
+	if (device_may_wakeup(&dev->dev))
+		acpi_pm_device_sleep_wake(&dev->dev, false);
 	return acpi_bus_set_power(handle, ACPI_STATE_D0);
 }
 #endif
@@ -151,6 +173,7 @@ struct pnp_protocol pnpacpi_protocol = {
 	.set	 = pnpacpi_set_resources,
 	.disable = pnpacpi_disable_resources,
 #ifdef CONFIG_ACPI_SLEEP
+	.can_wakeup = pnpacpi_can_wakeup,
 	.suspend = pnpacpi_suspend,
 	.resume = pnpacpi_resume,
 #endif
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index 7c4193eb0072..1bc1338b817b 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -414,6 +414,7 @@ struct pnp_protocol {
 	int (*disable) (struct pnp_dev *dev);
 
 	/* protocol specific suspend/resume */
+	bool (*can_wakeup) (struct pnp_dev *dev);
 	int (*suspend) (struct pnp_dev * dev, pm_message_t state);
 	int (*resume) (struct pnp_dev * dev);
 
-- 
cgit v1.2.3-59-g8ed1b


From c125e96f044427f38d106fab7bc5e4a5e6a18262 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 5 Jul 2010 22:43:53 +0200
Subject: PM: Make it possible to avoid races between wakeup and system sleep

One of the arguments during the suspend blockers discussion was that
the mainline kernel didn't contain any mechanisms making it possible
to avoid races between wakeup and system suspend.

Generally, there are two problems in that area.  First, if a wakeup
event occurs exactly when /sys/power/state is being written to, it
may be delivered to user space right before the freezer kicks in, so
the user space consumer of the event may not be able to process it
before the system is suspended.  Second, if a wakeup event occurs
after user space has been frozen, it is not generally guaranteed that
the ongoing transition of the system into a sleep state will be
aborted.

To address these issues introduce a new global sysfs attribute,
/sys/power/wakeup_count, associated with a running counter of wakeup
events and three helper functions, pm_stay_awake(), pm_relax(), and
pm_wakeup_event(), that may be used by kernel subsystems to control
the behavior of this attribute and to request the PM core to abort
system transitions into a sleep state already in progress.

The /sys/power/wakeup_count file may be read from or written to by
user space.  Reads will always succeed (unless interrupted by a
signal) and return the current value of the wakeup events counter.
Writes, however, will only succeed if the written number is equal to
the current value of the wakeup events counter.  If a write is
successful, it will cause the kernel to save the current value of the
wakeup events counter and to abort the subsequent system transition
into a sleep state if any wakeup events are reported after the write
has returned.

[The assumption is that before writing to /sys/power/state user space
will first read from /sys/power/wakeup_count.  Next, user space
consumers of wakeup events will have a chance to acknowledge or
veto the upcoming system transition to a sleep state.  Finally, if
the transition is allowed to proceed, /sys/power/wakeup_count will
be written to and if that succeeds, /sys/power/state will be written
to as well.  Still, if any wakeup events are reported to the PM core
by kernel subsystems after that point, the transition will be
aborted.]

Additionally, put a wakeup events counter into struct dev_pm_info and
make these per-device wakeup event counters available via sysfs,
so that it's possible to check the activity of various wakeup event
sources within the kernel.

To illustrate how subsystems can use pm_wakeup_event(), make the
low-level PCI runtime PM wakeup-handling code use it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: markgross <markgross@thegnar.org>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
---
 Documentation/ABI/testing/sysfs-power |  15 +++
 drivers/base/power/Makefile           |   2 +-
 drivers/base/power/main.c             |   1 +
 drivers/base/power/sysfs.c            |  15 +++
 drivers/base/power/wakeup.c           | 229 ++++++++++++++++++++++++++++++++++
 drivers/pci/pci-acpi.c                |   1 +
 drivers/pci/pci.c                     |  20 ++-
 drivers/pci/pci.h                     |   1 +
 drivers/pci/pcie/pme/pcie_pme.c       |   5 +-
 include/linux/pm.h                    |  10 ++
 include/linux/suspend.h               |   7 ++
 kernel/power/hibernate.c              |  20 ++-
 kernel/power/main.c                   |  55 ++++++++
 kernel/power/suspend.c                |   4 +-
 14 files changed, 375 insertions(+), 10 deletions(-)
 create mode 100644 drivers/base/power/wakeup.c

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index d6a801f45b48..2875f1f74a07 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -114,3 +114,18 @@ Description:
 		if this file contains "1", which is the default.  It may be
 		disabled by writing "0" to this file, in which case all devices
 		will be suspended and resumed synchronously.
+
+What:		/sys/power/wakeup_count
+Date:		July 2010
+Contact:	Rafael J. Wysocki <rjw@sisk.pl>
+Description:
+		The /sys/power/wakeup_count file allows user space to put the
+		system into a sleep state while taking into account the
+		concurrent arrival of wakeup events.  Reading from it returns
+		the current number of registered wakeup events and it blocks if
+		some wakeup events are being processed at the time the file is
+		read from.  Writing to it will only succeed if the current
+		number of wakeup events is equal to the written value and, if
+		successful, will make the kernel abort a subsequent transition
+		to a sleep state if any wakeup events are reported after the
+		write has returned.
diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 89de75325cea..cbccf9a3cee4 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_PM)	+= sysfs.o
-obj-$(CONFIG_PM_SLEEP)	+= main.o
+obj-$(CONFIG_PM_SLEEP)	+= main.o wakeup.o
 obj-$(CONFIG_PM_RUNTIME)	+= runtime.o
 obj-$(CONFIG_PM_OPS)	+= generic_ops.o
 obj-$(CONFIG_PM_TRACE_RTC)	+= trace.o
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 941fcb87e52a..5419a49ff135 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -59,6 +59,7 @@ void device_pm_init(struct device *dev)
 {
 	dev->power.status = DPM_ON;
 	init_completion(&dev->power.completion);
+	dev->power.wakeup_count = 0;
 	pm_runtime_init(dev);
 }
 
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index a4c33bc51257..81d344e0e95d 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -73,6 +73,8 @@
  *	device are known to the PM core.  However, for some devices this
  *	attribute is set to "enabled" by bus type code or device drivers and in
  *	that cases it should be safe to leave the default value.
+ *
+ *	wakeup_count - Report the number of wakeup events related to the device
  */
 
 static const char enabled[] = "enabled";
@@ -144,6 +146,16 @@ wake_store(struct device * dev, struct device_attribute *attr,
 
 static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store);
 
+#ifdef CONFIG_PM_SLEEP
+static ssize_t wakeup_count_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", dev->power.wakeup_count);
+}
+
+static DEVICE_ATTR(wakeup_count, 0444, wakeup_count_show, NULL);
+#endif
+
 #ifdef CONFIG_PM_ADVANCED_DEBUG
 #ifdef CONFIG_PM_RUNTIME
 
@@ -230,6 +242,9 @@ static struct attribute * power_attrs[] = {
 	&dev_attr_control.attr,
 #endif
 	&dev_attr_wakeup.attr,
+#ifdef CONFIG_PM_SLEEP
+	&dev_attr_wakeup_count.attr,
+#endif
 #ifdef CONFIG_PM_ADVANCED_DEBUG
 	&dev_attr_async.attr,
 #ifdef CONFIG_PM_RUNTIME
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
new file mode 100644
index 000000000000..25599077c39c
--- /dev/null
+++ b/drivers/base/power/wakeup.c
@@ -0,0 +1,229 @@
+/*
+ * drivers/base/power/wakeup.c - System wakeup events framework
+ *
+ * Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/capability.h>
+#include <linux/suspend.h>
+#include <linux/pm.h>
+
+/*
+ * If set, the suspend/hibernate code will abort transitions to a sleep state
+ * if wakeup events are registered during or immediately before the transition.
+ */
+bool events_check_enabled;
+
+/* The counter of registered wakeup events. */
+static unsigned long event_count;
+/* A preserved old value of event_count. */
+static unsigned long saved_event_count;
+/* The counter of wakeup events being processed. */
+static unsigned long events_in_progress;
+
+static DEFINE_SPINLOCK(events_lock);
+
+/*
+ * The functions below use the observation that each wakeup event starts a
+ * period in which the system should not be suspended.  The moment this period
+ * will end depends on how the wakeup event is going to be processed after being
+ * detected and all of the possible cases can be divided into two distinct
+ * groups.
+ *
+ * First, a wakeup event may be detected by the same functional unit that will
+ * carry out the entire processing of it and possibly will pass it to user space
+ * for further processing.  In that case the functional unit that has detected
+ * the event may later "close" the "no suspend" period associated with it
+ * directly as soon as it has been dealt with.  The pair of pm_stay_awake() and
+ * pm_relax(), balanced with each other, is supposed to be used in such
+ * situations.
+ *
+ * Second, a wakeup event may be detected by one functional unit and processed
+ * by another one.  In that case the unit that has detected it cannot really
+ * "close" the "no suspend" period associated with it, unless it knows in
+ * advance what's going to happen to the event during processing.  This
+ * knowledge, however, may not be available to it, so it can simply specify time
+ * to wait before the system can be suspended and pass it as the second
+ * argument of pm_wakeup_event().
+ */
+
+/**
+ * pm_stay_awake - Notify the PM core that a wakeup event is being processed.
+ * @dev: Device the wakeup event is related to.
+ *
+ * Notify the PM core of a wakeup event (signaled by @dev) by incrementing the
+ * counter of wakeup events being processed.  If @dev is not NULL, the counter
+ * of wakeup events related to @dev is incremented too.
+ *
+ * Call this function after detecting of a wakeup event if pm_relax() is going
+ * to be called directly after processing the event (and possibly passing it to
+ * user space for further processing).
+ *
+ * It is safe to call this function from interrupt context.
+ */
+void pm_stay_awake(struct device *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&events_lock, flags);
+	if (dev)
+		dev->power.wakeup_count++;
+
+	events_in_progress++;
+	spin_unlock_irqrestore(&events_lock, flags);
+}
+
+/**
+ * pm_relax - Notify the PM core that processing of a wakeup event has ended.
+ *
+ * Notify the PM core that a wakeup event has been processed by decrementing
+ * the counter of wakeup events being processed and incrementing the counter
+ * of registered wakeup events.
+ *
+ * Call this function for wakeup events whose processing started with calling
+ * pm_stay_awake().
+ *
+ * It is safe to call it from interrupt context.
+ */
+void pm_relax(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&events_lock, flags);
+	if (events_in_progress) {
+		events_in_progress--;
+		event_count++;
+	}
+	spin_unlock_irqrestore(&events_lock, flags);
+}
+
+/**
+ * pm_wakeup_work_fn - Deferred closing of a wakeup event.
+ *
+ * Execute pm_relax() for a wakeup event detected in the past and free the
+ * work item object used for queuing up the work.
+ */
+static void pm_wakeup_work_fn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+
+	pm_relax();
+	kfree(dwork);
+}
+
+/**
+ * pm_wakeup_event - Notify the PM core of a wakeup event.
+ * @dev: Device the wakeup event is related to.
+ * @msec: Anticipated event processing time (in milliseconds).
+ *
+ * Notify the PM core of a wakeup event (signaled by @dev) that will take
+ * approximately @msec milliseconds to be processed by the kernel.  Increment
+ * the counter of wakeup events being processed and queue up a work item
+ * that will execute pm_relax() for the event after @msec milliseconds.  If @dev
+ * is not NULL, the counter of wakeup events related to @dev is incremented too.
+ *
+ * It is safe to call this function from interrupt context.
+ */
+void pm_wakeup_event(struct device *dev, unsigned int msec)
+{
+	unsigned long flags;
+	struct delayed_work *dwork;
+
+	dwork = msec ? kzalloc(sizeof(*dwork), GFP_ATOMIC) : NULL;
+
+	spin_lock_irqsave(&events_lock, flags);
+	if (dev)
+		dev->power.wakeup_count++;
+
+	if (dwork) {
+		INIT_DELAYED_WORK(dwork, pm_wakeup_work_fn);
+		schedule_delayed_work(dwork, msecs_to_jiffies(msec));
+
+		events_in_progress++;
+	} else {
+		event_count++;
+	}
+	spin_unlock_irqrestore(&events_lock, flags);
+}
+
+/**
+ * pm_check_wakeup_events - Check for new wakeup events.
+ *
+ * Compare the current number of registered wakeup events with its preserved
+ * value from the past to check if new wakeup events have been registered since
+ * the old value was stored.  Check if the current number of wakeup events being
+ * processed is zero.
+ */
+bool pm_check_wakeup_events(void)
+{
+	unsigned long flags;
+	bool ret = true;
+
+	spin_lock_irqsave(&events_lock, flags);
+	if (events_check_enabled) {
+		ret = (event_count == saved_event_count) && !events_in_progress;
+		events_check_enabled = ret;
+	}
+	spin_unlock_irqrestore(&events_lock, flags);
+	return ret;
+}
+
+/**
+ * pm_get_wakeup_count - Read the number of registered wakeup events.
+ * @count: Address to store the value at.
+ *
+ * Store the number of registered wakeup events at the address in @count.  Block
+ * if the current number of wakeup events being processed is nonzero.
+ *
+ * Return false if the wait for the number of wakeup events being processed to
+ * drop down to zero has been interrupted by a signal (and the current number
+ * of wakeup events being processed is still nonzero).  Otherwise return true.
+ */
+bool pm_get_wakeup_count(unsigned long *count)
+{
+	bool ret;
+
+	spin_lock_irq(&events_lock);
+	if (capable(CAP_SYS_ADMIN))
+		events_check_enabled = false;
+
+	while (events_in_progress && !signal_pending(current)) {
+		spin_unlock_irq(&events_lock);
+
+		schedule_timeout_interruptible(msecs_to_jiffies(100));
+
+		spin_lock_irq(&events_lock);
+	}
+	*count = event_count;
+	ret = !events_in_progress;
+	spin_unlock_irq(&events_lock);
+	return ret;
+}
+
+/**
+ * pm_save_wakeup_count - Save the current number of registered wakeup events.
+ * @count: Value to compare with the current number of registered wakeup events.
+ *
+ * If @count is equal to the current number of registered wakeup events and the
+ * current number of wakeup events being processed is zero, store @count as the
+ * old number of registered wakeup events to be used by pm_check_wakeup_events()
+ * and return true.  Otherwise return false.
+ */
+bool pm_save_wakeup_count(unsigned long count)
+{
+	bool ret = false;
+
+	spin_lock_irq(&events_lock);
+	if (count == event_count && !events_in_progress) {
+		saved_event_count = count;
+		events_check_enabled = true;
+		ret = true;
+	}
+	spin_unlock_irq(&events_lock);
+	return ret;
+}
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 2e7a3bf13824..1ab98bbe58dd 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -48,6 +48,7 @@ static void pci_acpi_wake_dev(acpi_handle handle, u32 event, void *context)
 	if (event == ACPI_NOTIFY_DEVICE_WAKE && pci_dev) {
 		pci_check_pme_status(pci_dev);
 		pm_runtime_resume(&pci_dev->dev);
+		pci_wakeup_event(pci_dev);
 		if (pci_dev->subordinate)
 			pci_pme_wakeup_bus(pci_dev->subordinate);
 	}
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 740fb4ea9669..130ed1daf0f8 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1275,6 +1275,22 @@ bool pci_check_pme_status(struct pci_dev *dev)
 	return ret;
 }
 
+/*
+ * Time to wait before the system can be put into a sleep state after reporting
+ * a wakeup event signaled by a PCI device.
+ */
+#define PCI_WAKEUP_COOLDOWN	100
+
+/**
+ * pci_wakeup_event - Report a wakeup event related to a given PCI device.
+ * @dev: Device to report the wakeup event for.
+ */
+void pci_wakeup_event(struct pci_dev *dev)
+{
+	if (device_may_wakeup(&dev->dev))
+		pm_wakeup_event(&dev->dev, PCI_WAKEUP_COOLDOWN);
+}
+
 /**
  * pci_pme_wakeup - Wake up a PCI device if its PME Status bit is set.
  * @dev: Device to handle.
@@ -1285,8 +1301,10 @@ bool pci_check_pme_status(struct pci_dev *dev)
  */
 static int pci_pme_wakeup(struct pci_dev *dev, void *ign)
 {
-	if (pci_check_pme_status(dev))
+	if (pci_check_pme_status(dev)) {
 		pm_request_resume(&dev->dev);
+		pci_wakeup_event(dev);
+	}
 	return 0;
 }
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index f8077b3c8c8c..c8b7fd056ccd 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -56,6 +56,7 @@ extern void pci_update_current_state(struct pci_dev *dev, pci_power_t state);
 extern void pci_disable_enabled_device(struct pci_dev *dev);
 extern bool pci_check_pme_status(struct pci_dev *dev);
 extern int pci_finish_runtime_suspend(struct pci_dev *dev);
+extern void pci_wakeup_event(struct pci_dev *dev);
 extern int __pci_pme_wakeup(struct pci_dev *dev, void *ign);
 extern void pci_pme_wakeup_bus(struct pci_bus *bus);
 extern void pci_pm_init(struct pci_dev *dev);
diff --git a/drivers/pci/pcie/pme/pcie_pme.c b/drivers/pci/pcie/pme/pcie_pme.c
index d672a0a63816..bbdea18693d9 100644
--- a/drivers/pci/pcie/pme/pcie_pme.c
+++ b/drivers/pci/pcie/pme/pcie_pme.c
@@ -154,6 +154,7 @@ static bool pcie_pme_walk_bus(struct pci_bus *bus)
 		/* Skip PCIe devices in case we started from a root port. */
 		if (!pci_is_pcie(dev) && pci_check_pme_status(dev)) {
 			pm_request_resume(&dev->dev);
+			pci_wakeup_event(dev);
 			ret = true;
 		}
 
@@ -254,8 +255,10 @@ static void pcie_pme_handle_request(struct pci_dev *port, u16 req_id)
 	if (found) {
 		/* The device is there, but we have to check its PME status. */
 		found = pci_check_pme_status(dev);
-		if (found)
+		if (found) {
 			pm_request_resume(&dev->dev);
+			pci_wakeup_event(dev);
+		}
 		pci_dev_put(dev);
 	} else if (devfn) {
 		/*
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 8e258c727971..b417fc46f3fc 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -457,6 +457,7 @@ struct dev_pm_info {
 #ifdef CONFIG_PM_SLEEP
 	struct list_head	entry;
 	struct completion	completion;
+	unsigned long		wakeup_count;
 #endif
 #ifdef CONFIG_PM_RUNTIME
 	struct timer_list	suspend_timer;
@@ -552,6 +553,11 @@ extern void __suspend_report_result(const char *function, void *fn, int ret);
 	} while (0)
 
 extern void device_pm_wait_for_dev(struct device *sub, struct device *dev);
+
+/* drivers/base/power/wakeup.c */
+extern void pm_wakeup_event(struct device *dev, unsigned int msec);
+extern void pm_stay_awake(struct device *dev);
+extern void pm_relax(void);
 #else /* !CONFIG_PM_SLEEP */
 
 #define device_pm_lock() do {} while (0)
@@ -565,6 +571,10 @@ static inline int dpm_suspend_start(pm_message_t state)
 #define suspend_report_result(fn, ret)		do {} while (0)
 
 static inline void device_pm_wait_for_dev(struct device *a, struct device *b) {}
+
+static inline void pm_wakeup_event(struct device *dev, unsigned int msec) {}
+static inline void pm_stay_awake(struct device *dev) {}
+static inline void pm_relax(void) {}
 #endif /* !CONFIG_PM_SLEEP */
 
 /* How to reorder dpm_list after device_move() */
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index bc7d6bb4cd8e..bf1bab7b059c 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -286,6 +286,13 @@ extern int unregister_pm_notifier(struct notifier_block *nb);
 		{ .notifier_call = fn, .priority = pri };	\
 	register_pm_notifier(&fn##_nb);			\
 }
+
+/* drivers/base/power/wakeup.c */
+extern bool events_check_enabled;
+
+extern bool pm_check_wakeup_events(void);
+extern bool pm_get_wakeup_count(unsigned long *count);
+extern bool pm_save_wakeup_count(unsigned long count);
 #else /* !CONFIG_PM_SLEEP */
 
 static inline int register_pm_notifier(struct notifier_block *nb)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..f61202916631 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
 		goto Enable_irqs;
 	}
 
-	if (hibernation_test(TEST_CORE))
+	if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
 		goto Power_up;
 
 	in_suspend = 1;
@@ -288,8 +288,10 @@ static int create_image(int platform_mode)
 			error);
 	/* Restore control flow magically appears here */
 	restore_processor_state();
-	if (!in_suspend)
+	if (!in_suspend) {
+		events_check_enabled = false;
 		platform_leave(platform_mode);
+	}
 
  Power_up:
 	sysdev_resume();
@@ -511,14 +513,20 @@ int hibernation_platform_enter(void)
 
 	local_irq_disable();
 	sysdev_suspend(PMSG_HIBERNATE);
+	if (!pm_check_wakeup_events()) {
+		error = -EAGAIN;
+		goto Power_up;
+	}
+
 	hibernation_ops->enter();
 	/* We should never get here */
 	while (1);
 
-	/*
-	 * We don't need to reenable the nonboot CPUs or resume consoles, since
-	 * the system is going to be halted anyway.
-	 */
+ Power_up:
+	sysdev_resume();
+	local_irq_enable();
+	enable_nonboot_cpus();
+
  Platform_finish:
 	hibernation_ops->finish();
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
 
 power_attr(state);
 
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The 'wakeup_count' attribute, along with the functions defined in
+ * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
+ * handled in a non-racy way.
+ *
+ * If a wakeup event occurs when the system is in a sleep state, it simply is
+ * woken up.  In turn, if an event that would wake the system up from a sleep
+ * state occurs when it is undergoing a transition to that sleep state, the
+ * transition should be aborted.  Moreover, if such an event occurs when the
+ * system is in the working state, an attempt to start a transition to the
+ * given sleep state should fail during certain period after the detection of
+ * the event.  Using the 'state' attribute alone is not sufficient to satisfy
+ * these requirements, because a wakeup event may occur exactly when 'state'
+ * is being written to and may be delivered to user space right before it is
+ * frozen, so the event will remain only partially processed until the system is
+ * woken up by another event.  In particular, it won't cause the transition to
+ * a sleep state to be aborted.
+ *
+ * This difficulty may be overcome if user space uses 'wakeup_count' before
+ * writing to 'state'.  It first should read from 'wakeup_count' and store
+ * the read value.  Then, after carrying out its own preparations for the system
+ * transition to a sleep state, it should write the stored value to
+ * 'wakeup_count'.  If that fails, at least one wakeup event has occured since
+ * 'wakeup_count' was read and 'state' should not be written to.  Otherwise, it
+ * is allowed to write to 'state', but the transition will be aborted if there
+ * are any wakeup events detected after 'wakeup_count' was written to.
+ */
+
+static ssize_t wakeup_count_show(struct kobject *kobj,
+				struct kobj_attribute *attr,
+				char *buf)
+{
+	unsigned long val;
+
+	return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
+}
+
+static ssize_t wakeup_count_store(struct kobject *kobj,
+				struct kobj_attribute *attr,
+				const char *buf, size_t n)
+{
+	unsigned long val;
+
+	if (sscanf(buf, "%lu", &val) == 1) {
+		if (pm_save_wakeup_count(val))
+			return n;
+	}
+	return -EINVAL;
+}
+
+power_attr(wakeup_count);
+#endif /* CONFIG_PM_SLEEP */
+
 #ifdef CONFIG_PM_TRACE
 int pm_trace_enabled;
 
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
 #endif
 #ifdef CONFIG_PM_SLEEP
 	&pm_async_attr.attr,
+	&wakeup_count_attr.attr,
 #ifdef CONFIG_PM_DEBUG
 	&pm_test_attr.attr,
 #endif
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f37cb7dd4402..5f8d09f94325 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
 
 	error = sysdev_suspend(PMSG_SUSPEND);
 	if (!error) {
-		if (!suspend_test(TEST_CORE))
+		if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
 			error = suspend_ops->enter(state);
+			events_check_enabled = false;
+		}
 		sysdev_resume();
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 12e4d0cc2e0a776a526c93bb2fcb9267abc6e0b1 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@suse.de>
Date: Thu, 1 Jul 2010 21:46:36 +0200
Subject: plist: Add plist_last

plist is currently used by the scheduler, which only needs to know the
highest item in the list.  This adds plist_last which allows you to
find the lowest.  This is necessary for using plists to implement a
fast search of dynamic ranges in pm_qos which can have both highest
and lowest criteria.

Signed-off-by: James Bottomley <James.Bottomley@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/linux/plist.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index 6898985e7b38..7254eda078e5 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -259,6 +259,23 @@ static inline int plist_node_empty(const struct plist_node *node)
 	container_of(plist_first(head), type, member)
 #endif
 
+/**
+ * plist_last_entry - get the struct for the last entry
+ * @head:	the &struct plist_head pointer
+ * @type:	the type of the struct this is embedded in
+ * @member:	the name of the list_struct within the struct
+ */
+#ifdef CONFIG_DEBUG_PI_LIST
+# define plist_last_entry(head, type, member)	\
+({ \
+	WARN_ON(plist_head_empty(head)); \
+	container_of(plist_last(head), type, member); \
+})
+#else
+# define plist_last_entry(head, type, member)	\
+	container_of(plist_last(head), type, member)
+#endif
+
 /**
  * plist_first - return the first node (and thus, highest priority)
  * @head:	the &struct plist_head pointer
@@ -271,4 +288,16 @@ static inline struct plist_node *plist_first(const struct plist_head *head)
 			  struct plist_node, plist.node_list);
 }
 
+/**
+ * plist_last - return the last node (and thus, lowest priority)
+ * @head:	the &struct plist_head pointer
+ *
+ * Assumes the plist is _not_ empty.
+ */
+static inline struct plist_node *plist_last(const struct plist_head *head)
+{
+	return list_entry(head->node_list.prev,
+			  struct plist_node, plist.node_list);
+}
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 82f682514a5df89ffb3890627eebf0897b7a84ec Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@suse.de>
Date: Mon, 5 Jul 2010 22:53:06 +0200
Subject: pm_qos: Get rid of the allocation in pm_qos_add_request()

All current users of pm_qos_add_request() have the ability to supply
the memory required by the pm_qos routines, so make them do this and
eliminate the kmalloc() with pm_qos_add_request().  This has the
double benefit of making the call never fail and allowing it to be
called from atomic context.

Signed-off-by: James Bottomley <James.Bottomley@suse.de>
Signed-off-by: mark gross <markgross@thegnar.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/net/e1000e/netdev.c            | 17 ++++-----
 drivers/net/igbvf/netdev.c             |  9 ++---
 drivers/net/wireless/ipw2x00/ipw2100.c | 12 +++---
 include/linux/netdevice.h              |  2 +-
 include/linux/pm_qos_params.h          | 13 +++++--
 include/sound/pcm.h                    |  2 +-
 kernel/pm_qos_params.c                 | 67 ++++++++++++++++++++--------------
 sound/core/pcm_native.c                | 13 +++----
 8 files changed, 74 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 57a7e41da69e..9f13b660b801 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -2901,10 +2901,10 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
 			 * dropped transactions.
 			 */
 			pm_qos_update_request(
-				adapter->netdev->pm_qos_req, 55);
+				&adapter->netdev->pm_qos_req, 55);
 		} else {
 			pm_qos_update_request(
-				adapter->netdev->pm_qos_req,
+				&adapter->netdev->pm_qos_req,
 				PM_QOS_DEFAULT_VALUE);
 		}
 	}
@@ -3196,9 +3196,9 @@ int e1000e_up(struct e1000_adapter *adapter)
 
 	/* DMA latency requirement to workaround early-receive/jumbo issue */
 	if (adapter->flags & FLAG_HAS_ERT)
-		adapter->netdev->pm_qos_req =
-			pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
-				       PM_QOS_DEFAULT_VALUE);
+		pm_qos_add_request(&adapter->netdev->pm_qos_req,
+				   PM_QOS_CPU_DMA_LATENCY,
+				   PM_QOS_DEFAULT_VALUE);
 
 	/* hardware has been reset, we need to reload some things */
 	e1000_configure(adapter);
@@ -3263,11 +3263,8 @@ void e1000e_down(struct e1000_adapter *adapter)
 	e1000_clean_tx_ring(adapter);
 	e1000_clean_rx_ring(adapter);
 
-	if (adapter->flags & FLAG_HAS_ERT) {
-		pm_qos_remove_request(
-			      adapter->netdev->pm_qos_req);
-		adapter->netdev->pm_qos_req = NULL;
-	}
+	if (adapter->flags & FLAG_HAS_ERT)
+		pm_qos_remove_request(&adapter->netdev->pm_qos_req);
 
 	/*
 	 * TODO: for power management, we could drop the link and
diff --git a/drivers/net/igbvf/netdev.c b/drivers/net/igbvf/netdev.c
index 5e2b2a8c56c6..add6197d3bcb 100644
--- a/drivers/net/igbvf/netdev.c
+++ b/drivers/net/igbvf/netdev.c
@@ -48,7 +48,7 @@
 #define DRV_VERSION "1.0.0-k0"
 char igbvf_driver_name[] = "igbvf";
 const char igbvf_driver_version[] = DRV_VERSION;
-struct pm_qos_request_list *igbvf_driver_pm_qos_req;
+static struct pm_qos_request_list igbvf_driver_pm_qos_req;
 static const char igbvf_driver_string[] =
 				"Intel(R) Virtual Function Network Driver";
 static const char igbvf_copyright[] = "Copyright (c) 2009 Intel Corporation.";
@@ -2902,8 +2902,8 @@ static int __init igbvf_init_module(void)
 	printk(KERN_INFO "%s\n", igbvf_copyright);
 
 	ret = pci_register_driver(&igbvf_driver);
-	igbvf_driver_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
-	                       PM_QOS_DEFAULT_VALUE);
+	pm_qos_add_request(&igbvf_driver_pm_qos_req, PM_QOS_CPU_DMA_LATENCY,
+			   PM_QOS_DEFAULT_VALUE);
 
 	return ret;
 }
@@ -2918,8 +2918,7 @@ module_init(igbvf_init_module);
 static void __exit igbvf_exit_module(void)
 {
 	pci_unregister_driver(&igbvf_driver);
-	pm_qos_remove_request(igbvf_driver_pm_qos_req);
-	igbvf_driver_pm_qos_req = NULL;
+	pm_qos_remove_request(&igbvf_driver_pm_qos_req);
 }
 module_exit(igbvf_exit_module);
 
diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
index 0bd4dfa59a8a..7f0d98b885bc 100644
--- a/drivers/net/wireless/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/ipw2x00/ipw2100.c
@@ -174,7 +174,7 @@ that only one external action is invoked at a time.
 #define DRV_DESCRIPTION	"Intel(R) PRO/Wireless 2100 Network Driver"
 #define DRV_COPYRIGHT	"Copyright(c) 2003-2006 Intel Corporation"
 
-struct pm_qos_request_list *ipw2100_pm_qos_req;
+struct pm_qos_request_list ipw2100_pm_qos_req;
 
 /* Debugging stuff */
 #ifdef CONFIG_IPW2100_DEBUG
@@ -1741,7 +1741,7 @@ static int ipw2100_up(struct ipw2100_priv *priv, int deferred)
 	/* the ipw2100 hardware really doesn't want power management delays
 	 * longer than 175usec
 	 */
-	pm_qos_update_request(ipw2100_pm_qos_req, 175);
+	pm_qos_update_request(&ipw2100_pm_qos_req, 175);
 
 	/* If the interrupt is enabled, turn it off... */
 	spin_lock_irqsave(&priv->low_lock, flags);
@@ -1889,7 +1889,7 @@ static void ipw2100_down(struct ipw2100_priv *priv)
 	ipw2100_disable_interrupts(priv);
 	spin_unlock_irqrestore(&priv->low_lock, flags);
 
-	pm_qos_update_request(ipw2100_pm_qos_req, PM_QOS_DEFAULT_VALUE);
+	pm_qos_update_request(&ipw2100_pm_qos_req, PM_QOS_DEFAULT_VALUE);
 
 	/* We have to signal any supplicant if we are disassociating */
 	if (associated)
@@ -6669,8 +6669,8 @@ static int __init ipw2100_init(void)
 	if (ret)
 		goto out;
 
-	ipw2100_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
-			PM_QOS_DEFAULT_VALUE);
+	pm_qos_add_request(&ipw2100_pm_qos_req, PM_QOS_CPU_DMA_LATENCY,
+			   PM_QOS_DEFAULT_VALUE);
 #ifdef CONFIG_IPW2100_DEBUG
 	ipw2100_debug_level = debug;
 	ret = driver_create_file(&ipw2100_pci_driver.driver,
@@ -6692,7 +6692,7 @@ static void __exit ipw2100_exit(void)
 			   &driver_attr_debug_level);
 #endif
 	pci_unregister_driver(&ipw2100_pci_driver);
-	pm_qos_remove_request(ipw2100_pm_qos_req);
+	pm_qos_remove_request(&ipw2100_pm_qos_req);
 }
 
 module_init(ipw2100_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b21e4054c12c..2f22119b4b08 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -779,7 +779,7 @@ struct net_device {
 	 */
 	char			name[IFNAMSIZ];
 
-	struct pm_qos_request_list *pm_qos_req;
+	struct pm_qos_request_list pm_qos_req;
 
 	/* device name hash chain */
 	struct hlist_node	name_hlist;
diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h
index 8ba440e5eb7f..77cbddb3784c 100644
--- a/include/linux/pm_qos_params.h
+++ b/include/linux/pm_qos_params.h
@@ -1,8 +1,10 @@
+#ifndef _LINUX_PM_QOS_PARAMS_H
+#define _LINUX_PM_QOS_PARAMS_H
 /* interface for the pm_qos_power infrastructure of the linux kernel.
  *
  * Mark Gross <mgross@linux.intel.com>
  */
-#include <linux/list.h>
+#include <linux/plist.h>
 #include <linux/notifier.h>
 #include <linux/miscdevice.h>
 
@@ -14,9 +16,12 @@
 #define PM_QOS_NUM_CLASSES 4
 #define PM_QOS_DEFAULT_VALUE -1
 
-struct pm_qos_request_list;
+struct pm_qos_request_list {
+	struct plist_node list;
+	int pm_qos_class;
+};
 
-struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value);
+void pm_qos_add_request(struct pm_qos_request_list *l, int pm_qos_class, s32 value);
 void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
 		s32 new_value);
 void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req);
@@ -24,4 +29,6 @@ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req);
 int pm_qos_request(int pm_qos_class);
 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier);
 int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier);
+int pm_qos_request_active(struct pm_qos_request_list *req);
 
+#endif
diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index dd76cdede64d..6e3a29732dc4 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -366,7 +366,7 @@ struct snd_pcm_substream {
 	int number;
 	char name[32];			/* substream name */
 	int stream;			/* stream (direction) */
-	struct pm_qos_request_list *latency_pm_qos_req; /* pm_qos request */
+	struct pm_qos_request_list latency_pm_qos_req; /* pm_qos request */
 	size_t buffer_bytes_max;	/* limit ring buffer size */
 	struct snd_dma_buffer dma_buffer;
 	unsigned int dma_buf_id;
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index db8e51d7f392..996a4dec5f96 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -30,7 +30,6 @@
 /*#define DEBUG*/
 
 #include <linux/pm_qos_params.h>
-#include <linux/plist.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -49,11 +48,6 @@
  * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
  * held, taken with _irqsave.  One lock to rule them all
  */
-struct pm_qos_request_list {
-	struct plist_node list;
-	int pm_qos_class;
-};
-
 enum pm_qos_type {
 	PM_QOS_MAX,		/* return the largest value */
 	PM_QOS_MIN		/* return the smallest value */
@@ -210,6 +204,12 @@ int pm_qos_request(int pm_qos_class)
 }
 EXPORT_SYMBOL_GPL(pm_qos_request);
 
+int pm_qos_request_active(struct pm_qos_request_list *req)
+{
+	return req->pm_qos_class != 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_request_active);
+
 /**
  * pm_qos_add_request - inserts new qos request into the list
  * @pm_qos_class: identifies which list of qos request to us
@@ -221,25 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
  * element as a handle for use in updating and removal.  Call needs to save
  * this handle for later use.
  */
-struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
+void pm_qos_add_request(struct pm_qos_request_list *dep,
+			int pm_qos_class, s32 value)
 {
-	struct pm_qos_request_list *dep;
-
-	dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
-	if (dep) {
-		struct pm_qos_object *o =  pm_qos_array[pm_qos_class];
-		int new_value;
-
-		if (value == PM_QOS_DEFAULT_VALUE)
-			new_value = o->default_value;
-		else
-			new_value = value;
-		plist_node_init(&dep->list, new_value);
-		dep->pm_qos_class = pm_qos_class;
-		update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
-	}
+	struct pm_qos_object *o =  pm_qos_array[pm_qos_class];
+	int new_value;
 
-	return dep;
+	if (pm_qos_request_active(dep)) {
+		WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
+		return;
+	}
+	if (value == PM_QOS_DEFAULT_VALUE)
+		new_value = o->default_value;
+	else
+		new_value = value;
+	plist_node_init(&dep->list, new_value);
+	dep->pm_qos_class = pm_qos_class;
+	update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
 }
 EXPORT_SYMBOL_GPL(pm_qos_add_request);
 
@@ -262,6 +260,11 @@ void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
 	if (!pm_qos_req) /*guard against callers passing in null */
 		return;
 
+	if (!pm_qos_request_active(pm_qos_req)) {
+		WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+		return;
+	}
+
 	o = pm_qos_array[pm_qos_req->pm_qos_class];
 
 	if (new_value == PM_QOS_DEFAULT_VALUE)
@@ -290,9 +293,14 @@ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
 		return;
 		/* silent return to keep pcm code cleaner */
 
+	if (!pm_qos_request_active(pm_qos_req)) {
+		WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+		return;
+	}
+
 	o = pm_qos_array[pm_qos_req->pm_qos_class];
 	update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
-	kfree(pm_qos_req);
+	memset(pm_qos_req, 0, sizeof(*pm_qos_req));
 }
 EXPORT_SYMBOL_GPL(pm_qos_remove_request);
 
@@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
 
 	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
 	if (pm_qos_class >= 0) {
-		filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
-				PM_QOS_DEFAULT_VALUE);
+		struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req));
+		if (!req)
+			return -ENOMEM;
+
+		pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
+		filp->private_data = req;
 
 		if (filp->private_data)
 			return 0;
@@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
 {
 	struct pm_qos_request_list *req;
 
-	req = (struct pm_qos_request_list *)filp->private_data;
+	req = filp->private_data;
 	pm_qos_remove_request(req);
+	kfree(req);
 
 	return 0;
 }
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 303ac04ff6e4..a3b2a6479246 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -451,13 +451,11 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream,
 	snd_pcm_timer_resolution_change(substream);
 	runtime->status->state = SNDRV_PCM_STATE_SETUP;
 
-	if (substream->latency_pm_qos_req) {
-		pm_qos_remove_request(substream->latency_pm_qos_req);
-		substream->latency_pm_qos_req = NULL;
-	}
+	if (pm_qos_request_active(&substream->latency_pm_qos_req))
+		pm_qos_remove_request(&substream->latency_pm_qos_req);
 	if ((usecs = period_to_usecs(runtime)) >= 0)
-		substream->latency_pm_qos_req = pm_qos_add_request(
-					PM_QOS_CPU_DMA_LATENCY, usecs);
+		pm_qos_add_request(&substream->latency_pm_qos_req,
+				   PM_QOS_CPU_DMA_LATENCY, usecs);
 	return 0;
  _error:
 	/* hardware might be unuseable from this time,
@@ -512,8 +510,7 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream)
 	if (substream->ops->hw_free)
 		result = substream->ops->hw_free(substream);
 	runtime->status->state = SNDRV_PCM_STATE_OPEN;
-	pm_qos_remove_request(substream->latency_pm_qos_req);
-	substream->latency_pm_qos_req = NULL;
+	pm_qos_remove_request(&substream->latency_pm_qos_req);
 	return result;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From ce4410116c5debfb0e049f5db4b5cd6211e05b80 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 7 Jul 2010 23:43:45 +0200
Subject: PM / Suspend: Fix ordering of calls in suspend error paths

The ACPI suspend code calls suspend_nvs_free() at a wrong place,
which may lead to a memory leak if there's an error executing
acpi_pm_prepare(), because acpi_pm_finish() will not be called in
that case.  However, the root cause of this problem is the
apparently confusing ordering of calls in suspend error paths that
needs to be fixed.

In addition to that, fix a typo in a label name in suspend.c.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Len Brown <len.brown@intel.com>
---
 include/linux/suspend.h | 10 ++++++----
 kernel/power/suspend.c  |  9 ++++-----
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index bf1bab7b059c..4af270ec2204 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -61,14 +61,15 @@ typedef int __bitwise suspend_state_t;
  *	before device drivers' late suspend callbacks are executed.  It returns
  *	0 on success or a negative error code otherwise, in which case the
  *	system cannot enter the desired sleep state (@prepare_late(), @enter(),
- *	@wake(), and @finish() will not be called in that case).
+ *	and @wake() will not be called in that case).
  *
  * @prepare_late: Finish preparing the platform for entering the system sleep
  *	state indicated by @begin().
  *	@prepare_late is called before disabling nonboot CPUs and after
  *	device drivers' late suspend callbacks have been executed.  It returns
  *	0 on success or a negative error code otherwise, in which case the
- *	system cannot enter the desired sleep state (@enter() and @wake()).
+ *	system cannot enter the desired sleep state (@enter() will not be
+ *	executed).
  *
  * @enter: Enter the system sleep state indicated by @begin() or represented by
  *	the argument if @begin() is not implemented.
@@ -81,14 +82,15 @@ typedef int __bitwise suspend_state_t;
  *	resume callbacks are executed.
  *	This callback is optional, but should be implemented by the platforms
  *	that implement @prepare_late().  If implemented, it is always called
- *	after @enter(), even if @enter() fails.
+ *	after @prepare_late and @enter(), even if one of them fails.
  *
  * @finish: Finish wake-up of the platform.
  *	@finish is called right prior to calling device drivers' regular suspend
  *	callbacks.
  *	This callback is optional, but should be implemented by the platforms
  *	that implement @prepare().  If implemented, it is always called after
- *	@enter() and @wake(), if implemented, even if any of them fails.
+ *	@enter() and @wake(), even if any of them fails.  It is executed after
+ *	a failing @prepare.
  *
  * @end: Called by the PM core right after resuming devices, to indicate to
  *	the platform that the system has returned to the working state or
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5f8d09f94325..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
 	if (suspend_ops->prepare) {
 		error = suspend_ops->prepare();
 		if (error)
-			return error;
+			goto Platform_finish;
 	}
 
 	error = dpm_suspend_noirq(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down\n");
-		goto Platfrom_finish;
+		goto Platform_finish;
 	}
 
 	if (suspend_ops->prepare_late) {
 		error = suspend_ops->prepare_late();
 		if (error)
-			goto Power_up_devices;
+			goto Platform_wake;
 	}
 
 	if (suspend_test(TEST_PLATFORM))
@@ -180,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
 	if (suspend_ops->wake)
 		suspend_ops->wake();
 
- Power_up_devices:
 	dpm_resume_noirq(PMSG_RESUME);
 
- Platfrom_finish:
+ Platform_finish:
 	if (suspend_ops->finish)
 		suspend_ops->finish();
 
-- 
cgit v1.2.3-59-g8ed1b


From 8d4b9d1bfef117862a2889dec4dac227068544c9 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 19 Jul 2010 02:01:06 +0200
Subject: PM / Runtime: Add runtime PM statistics (v3)

In order for PowerTOP to be able to report how well the new runtime PM is
working for the various drivers, the kernel needs to export some basic
statistics in sysfs.

This patch adds two sysfs files in the runtime PM domain that expose the
total time a device has been active, and the time a device has been
suspended.

With this PowerTOP can compute the activity percentage

Active %age = 100 * (delta active) / (delta active + delta suspended)

and present the information to the user.

I've written the PowerTOP code (slated for version 1.12) already, and the
output looks like this:

Runtime Device Power Management statistics
Active  Device name
 10.0%	06:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8101E/RTL8102E PCI Express Fast Ethernet controller

[version 2: fix stat update bugs noticed by Alan Stern]
[version 3: rebase to -next and move the sysfs declaration]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/power/runtime.c | 54 ++++++++++++++++++++++++++++++++++++++------
 drivers/base/power/sysfs.c   | 30 ++++++++++++++++++++++++
 include/linux/pm.h           |  6 +++++
 3 files changed, 83 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index b0ec0e9f27e9..b78c401ffa73 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -123,6 +123,45 @@ int pm_runtime_idle(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(pm_runtime_idle);
 
+
+/**
+ * update_pm_runtime_accounting - Update the time accounting of power states
+ * @dev: Device to update the accounting for
+ *
+ * In order to be able to have time accounting of the various power states
+ * (as used by programs such as PowerTOP to show the effectiveness of runtime
+ * PM), we need to track the time spent in each state.
+ * update_pm_runtime_accounting must be called each time before the
+ * runtime_status field is updated, to account the time in the old state
+ * correctly.
+ */
+void update_pm_runtime_accounting(struct device *dev)
+{
+	unsigned long now = jiffies;
+	int delta;
+
+	delta = now - dev->power.accounting_timestamp;
+
+	if (delta < 0)
+		delta = 0;
+
+	dev->power.accounting_timestamp = now;
+
+	if (dev->power.disable_depth > 0)
+		return;
+
+	if (dev->power.runtime_status == RPM_SUSPENDED)
+		dev->power.suspended_jiffies += delta;
+	else
+		dev->power.active_jiffies += delta;
+}
+
+static void __update_runtime_status(struct device *dev, enum rpm_status status)
+{
+	update_pm_runtime_accounting(dev);
+	dev->power.runtime_status = status;
+}
+
 /**
  * __pm_runtime_suspend - Carry out run-time suspend of given device.
  * @dev: Device to suspend.
@@ -197,7 +236,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq)
 		goto repeat;
 	}
 
-	dev->power.runtime_status = RPM_SUSPENDING;
+	__update_runtime_status(dev, RPM_SUSPENDING);
 	dev->power.deferred_resume = false;
 
 	if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) {
@@ -228,7 +267,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq)
 	}
 
 	if (retval) {
-		dev->power.runtime_status = RPM_ACTIVE;
+		__update_runtime_status(dev, RPM_ACTIVE);
 		if (retval == -EAGAIN || retval == -EBUSY) {
 			if (dev->power.timer_expires == 0)
 				notify = true;
@@ -237,7 +276,7 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq)
 			pm_runtime_cancel_pending(dev);
 		}
 	} else {
-		dev->power.runtime_status = RPM_SUSPENDED;
+		__update_runtime_status(dev, RPM_SUSPENDED);
 		pm_runtime_deactivate_timer(dev);
 
 		if (dev->parent) {
@@ -381,7 +420,7 @@ int __pm_runtime_resume(struct device *dev, bool from_wq)
 		goto repeat;
 	}
 
-	dev->power.runtime_status = RPM_RESUMING;
+	__update_runtime_status(dev, RPM_RESUMING);
 
 	if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) {
 		spin_unlock_irq(&dev->power.lock);
@@ -411,10 +450,10 @@ int __pm_runtime_resume(struct device *dev, bool from_wq)
 	}
 
 	if (retval) {
-		dev->power.runtime_status = RPM_SUSPENDED;
+		__update_runtime_status(dev, RPM_SUSPENDED);
 		pm_runtime_cancel_pending(dev);
 	} else {
-		dev->power.runtime_status = RPM_ACTIVE;
+		__update_runtime_status(dev, RPM_ACTIVE);
 		if (parent)
 			atomic_inc(&parent->power.child_count);
 	}
@@ -848,7 +887,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status)
 	}
 
  out_set:
-	dev->power.runtime_status = status;
+	__update_runtime_status(dev, status);
 	dev->power.runtime_error = 0;
  out:
 	spin_unlock_irqrestore(&dev->power.lock, flags);
@@ -1077,6 +1116,7 @@ void pm_runtime_init(struct device *dev)
 	dev->power.request_pending = false;
 	dev->power.request = RPM_REQ_NONE;
 	dev->power.deferred_resume = false;
+	dev->power.accounting_timestamp = jiffies;
 	INIT_WORK(&dev->power.work, pm_runtime_work);
 
 	dev->power.timer_expires = 0;
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index 1eca50c8e7ca..e56b4388fe61 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -6,6 +6,7 @@
 #include <linux/string.h>
 #include <linux/pm_runtime.h>
 #include <asm/atomic.h>
+#include <linux/jiffies.h>
 #include "power.h"
 
 /*
@@ -111,6 +112,33 @@ static ssize_t control_store(struct device * dev, struct device_attribute *attr,
 
 static DEVICE_ATTR(control, 0644, control_show, control_store);
 
+static ssize_t rtpm_active_time_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	int ret;
+	spin_lock_irq(&dev->power.lock);
+	update_pm_runtime_accounting(dev);
+	ret = sprintf(buf, "%i\n", jiffies_to_msecs(dev->power.active_jiffies));
+	spin_unlock_irq(&dev->power.lock);
+	return ret;
+}
+
+static DEVICE_ATTR(runtime_active_time, 0444, rtpm_active_time_show, NULL);
+
+static ssize_t rtpm_suspended_time_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	int ret;
+	spin_lock_irq(&dev->power.lock);
+	update_pm_runtime_accounting(dev);
+	ret = sprintf(buf, "%i\n",
+		jiffies_to_msecs(dev->power.suspended_jiffies));
+	spin_unlock_irq(&dev->power.lock);
+	return ret;
+}
+
+static DEVICE_ATTR(runtime_suspended_time, 0444, rtpm_suspended_time_show, NULL);
+
 static ssize_t rtpm_status_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
@@ -254,6 +282,8 @@ static struct attribute * power_attrs[] = {
 #ifdef CONFIG_PM_RUNTIME
 	&dev_attr_control.attr,
 	&dev_attr_runtime_status.attr,
+	&dev_attr_runtime_suspended_time.attr,
+	&dev_attr_runtime_active_time.attr,
 #endif
 	&dev_attr_wakeup.attr,
 #ifdef CONFIG_PM_SLEEP
diff --git a/include/linux/pm.h b/include/linux/pm.h
index b417fc46f3fc..52e8c55ff314 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -477,9 +477,15 @@ struct dev_pm_info {
 	enum rpm_request	request;
 	enum rpm_status		runtime_status;
 	int			runtime_error;
+	unsigned long		active_jiffies;
+	unsigned long		suspended_jiffies;
+	unsigned long		accounting_timestamp;
 #endif
 };
 
+extern void update_pm_runtime_accounting(struct device *dev);
+
+
 /*
  * The PM_EVENT_ messages are also used by drivers implementing the legacy
  * suspend framework, based on the ->suspend() and ->resume() callbacks common
-- 
cgit v1.2.3-59-g8ed1b


From 4507a71507d4ff37e9a499c4241b7701ed1feab4 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Sat, 17 Jul 2010 08:48:28 +0000
Subject: net: add driver hook for tx time stamping.

This patch adds a hook for transmit time stamps. The transmit hook
allows a software fallback for transmit time stamps, for MACs
lacking time stamping hardware. Using the hook will still require
adding an inline function call to each MAC driver.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ac74ee085d74..a1b0400c8d86 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1947,6 +1947,27 @@ static inline ktime_t net_invalid_timestamp(void)
 extern void skb_tstamp_tx(struct sk_buff *orig_skb,
 			struct skb_shared_hwtstamps *hwtstamps);
 
+static inline void sw_tx_timestamp(struct sk_buff *skb)
+{
+	union skb_shared_tx *shtx = skb_tx(skb);
+	if (shtx->software && !shtx->in_progress)
+		skb_tstamp_tx(skb, NULL);
+}
+
+/**
+ * skb_tx_timestamp() - Driver hook for transmit timestamping
+ *
+ * Ethernet MAC Drivers should call this function in their hard_xmit()
+ * function as soon as possible after giving the sk_buff to the MAC
+ * hardware, but before freeing the sk_buff.
+ *
+ * @skb: A socket buffer.
+ */
+static inline void skb_tx_timestamp(struct sk_buff *skb)
+{
+	sw_tx_timestamp(skb);
+}
+
 extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
 extern __sum16 __skb_checksum_complete(struct sk_buff *skb);
 
-- 
cgit v1.2.3-59-g8ed1b


From 28b041139e344ecd0f144d6205b004ae354cfa1e Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Sat, 17 Jul 2010 08:48:55 +0000
Subject: net: preserve ifreq parameter when calling generic phy_mii_ioctl().

The phy_mii_ioctl() function unnecessarily throws away the original ifreq.
We need access to the ifreq in order to support PHYs that can perform
hardware time stamping.

Two maverick drivers filter the ioctl commands passed to phy_mii_ioctl().
This is unnecessary since phylib will check the command in any case.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/arm/ixp4xx_eth.c           |  3 ++-
 drivers/net/au1000_eth.c               |  2 +-
 drivers/net/bcm63xx_enet.c             |  2 +-
 drivers/net/cpmac.c                    |  5 +----
 drivers/net/dnet.c                     |  2 +-
 drivers/net/ethoc.c                    |  2 +-
 drivers/net/fec.c                      |  2 +-
 drivers/net/fec_mpc52xx.c              |  2 +-
 drivers/net/fs_enet/fs_enet-main.c     |  3 +--
 drivers/net/gianfar.c                  |  2 +-
 drivers/net/macb.c                     |  2 +-
 drivers/net/mv643xx_eth.c              |  2 +-
 drivers/net/octeon/octeon_mgmt.c       |  2 +-
 drivers/net/phy/phy.c                  |  3 ++-
 drivers/net/sb1250-mac.c               |  2 +-
 drivers/net/sh_eth.c                   |  2 +-
 drivers/net/smsc911x.c                 |  2 +-
 drivers/net/smsc9420.c                 |  2 +-
 drivers/net/stmmac/stmmac_main.c       | 22 ++++++++--------------
 drivers/net/tc35815.c                  |  2 +-
 drivers/net/tg3.c                      |  2 +-
 drivers/net/ucc_geth.c                 |  2 +-
 drivers/staging/octeon/ethernet-mdio.c |  2 +-
 include/linux/phy.h                    |  2 +-
 net/dsa/slave.c                        |  3 +--
 25 files changed, 34 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/arm/ixp4xx_eth.c b/drivers/net/arm/ixp4xx_eth.c
index ee2f8425dbe7..4f1cc7164ad9 100644
--- a/drivers/net/arm/ixp4xx_eth.c
+++ b/drivers/net/arm/ixp4xx_eth.c
@@ -782,7 +782,8 @@ static int eth_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
 
 	if (!netif_running(dev))
 		return -EINVAL;
-	return phy_mii_ioctl(port->phydev, if_mii(req), cmd);
+
+	return phy_mii_ioctl(port->phydev, req, cmd);
 }
 
 /* ethtool support */
diff --git a/drivers/net/au1000_eth.c b/drivers/net/au1000_eth.c
index ece6128bef14..386d4feec652 100644
--- a/drivers/net/au1000_eth.c
+++ b/drivers/net/au1000_eth.c
@@ -978,7 +978,7 @@ static int au1000_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!aup->phy_dev)
 		return -EINVAL; /* PHY not controllable */
 
-	return phy_mii_ioctl(aup->phy_dev, if_mii(rq), cmd);
+	return phy_mii_ioctl(aup->phy_dev, rq, cmd);
 }
 
 static const struct net_device_ops au1000_netdev_ops = {
diff --git a/drivers/net/bcm63xx_enet.c b/drivers/net/bcm63xx_enet.c
index faf5add894d7..0d2c5da08937 100644
--- a/drivers/net/bcm63xx_enet.c
+++ b/drivers/net/bcm63xx_enet.c
@@ -1496,7 +1496,7 @@ static int bcm_enet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (priv->has_phy) {
 		if (!priv->phydev)
 			return -ENODEV;
-		return phy_mii_ioctl(priv->phydev, if_mii(rq), cmd);
+		return phy_mii_ioctl(priv->phydev, rq, cmd);
 	} else {
 		struct mii_if_info mii;
 
diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c
index 38de1a4f825f..e1f6156b3710 100644
--- a/drivers/net/cpmac.c
+++ b/drivers/net/cpmac.c
@@ -846,11 +846,8 @@ static int cpmac_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return -EINVAL;
 	if (!priv->phy)
 		return -EINVAL;
-	if ((cmd == SIOCGMIIPHY) || (cmd == SIOCGMIIREG) ||
-	    (cmd == SIOCSMIIREG))
-		return phy_mii_ioctl(priv->phy, if_mii(ifr), cmd);
 
-	return -EOPNOTSUPP;
+	return phy_mii_ioctl(priv->phy, ifr, cmd);
 }
 
 static int cpmac_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
diff --git a/drivers/net/dnet.c b/drivers/net/dnet.c
index 8b0f50bbf3e5..4ea7141f525d 100644
--- a/drivers/net/dnet.c
+++ b/drivers/net/dnet.c
@@ -797,7 +797,7 @@ static int dnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!phydev)
 		return -ENODEV;
 
-	return phy_mii_ioctl(phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(phydev, rq, cmd);
 }
 
 static void dnet_get_drvinfo(struct net_device *dev,
diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c
index 5bb6bb74c40e..38c282e6565b 100644
--- a/drivers/net/ethoc.c
+++ b/drivers/net/ethoc.c
@@ -730,7 +730,7 @@ static int ethoc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		phy = priv->phy;
 	}
 
-	return phy_mii_ioctl(phy, mdio, cmd);
+	return phy_mii_ioctl(phy, ifr, cmd);
 }
 
 static int ethoc_config(struct net_device *dev, struct ifmap *map)
diff --git a/drivers/net/fec.c b/drivers/net/fec.c
index 937f1b4a3483..391a553a3add 100644
--- a/drivers/net/fec.c
+++ b/drivers/net/fec.c
@@ -825,7 +825,7 @@ static int fec_enet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!phydev)
 		return -ENODEV;
 
-	return phy_mii_ioctl(phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(phydev, rq, cmd);
 }
 
 static void fec_enet_free_buffers(struct net_device *dev)
diff --git a/drivers/net/fec_mpc52xx.c b/drivers/net/fec_mpc52xx.c
index 5f8346369b80..d1a5b17b2a95 100644
--- a/drivers/net/fec_mpc52xx.c
+++ b/drivers/net/fec_mpc52xx.c
@@ -826,7 +826,7 @@ static int mpc52xx_fec_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!priv->phydev)
 		return -ENOTSUPP;
 
-	return phy_mii_ioctl(priv->phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(priv->phydev, rq, cmd);
 }
 
 static const struct net_device_ops mpc52xx_fec_netdev_ops = {
diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c
index 309a0eaddd81..f08cff9020bd 100644
--- a/drivers/net/fs_enet/fs_enet-main.c
+++ b/drivers/net/fs_enet/fs_enet-main.c
@@ -963,12 +963,11 @@ static const struct ethtool_ops fs_ethtool_ops = {
 static int fs_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	struct fs_enet_private *fep = netdev_priv(dev);
-	struct mii_ioctl_data *mii = (struct mii_ioctl_data *)&rq->ifr_data;
 
 	if (!netif_running(dev))
 		return -EINVAL;
 
-	return phy_mii_ioctl(fep->phydev, mii, cmd);
+	return phy_mii_ioctl(fep->phydev, rq, cmd);
 }
 
 extern int fs_mii_connect(struct net_device *dev);
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 746a776a1653..27f02970d898 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -847,7 +847,7 @@ static int gfar_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!priv->phydev)
 		return -ENODEV;
 
-	return phy_mii_ioctl(priv->phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(priv->phydev, rq, cmd);
 }
 
 static unsigned int reverse_bitmap(unsigned int bit_map, unsigned int max_qs)
diff --git a/drivers/net/macb.c b/drivers/net/macb.c
index 40797fbdca9f..ff2f158ab0b9 100644
--- a/drivers/net/macb.c
+++ b/drivers/net/macb.c
@@ -1082,7 +1082,7 @@ static int macb_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!phydev)
 		return -ENODEV;
 
-	return phy_mii_ioctl(phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(phydev, rq, cmd);
 }
 
 static const struct net_device_ops macb_netdev_ops = {
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index af075af20e0c..2fcdb1e1b99d 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -2461,7 +2461,7 @@ static int mv643xx_eth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	struct mv643xx_eth_private *mp = netdev_priv(dev);
 
 	if (mp->phy != NULL)
-		return phy_mii_ioctl(mp->phy, if_mii(ifr), cmd);
+		return phy_mii_ioctl(mp->phy, ifr, cmd);
 
 	return -EOPNOTSUPP;
 }
diff --git a/drivers/net/octeon/octeon_mgmt.c b/drivers/net/octeon/octeon_mgmt.c
index f4a0f08e14e1..b264f0f45605 100644
--- a/drivers/net/octeon/octeon_mgmt.c
+++ b/drivers/net/octeon/octeon_mgmt.c
@@ -620,7 +620,7 @@ static int octeon_mgmt_ioctl(struct net_device *netdev,
 	if (!p->phydev)
 		return -EINVAL;
 
-	return phy_mii_ioctl(p->phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(p->phydev, rq, cmd);
 }
 
 static void octeon_mgmt_adjust_link(struct net_device *netdev)
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 64be4664ccab..bd88d818f082 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -309,8 +309,9 @@ EXPORT_SYMBOL(phy_ethtool_gset);
  * current state.  Use at own risk.
  */
 int phy_mii_ioctl(struct phy_device *phydev,
-		struct mii_ioctl_data *mii_data, int cmd)
+		struct ifreq *ifr, int cmd)
 {
+	struct mii_ioctl_data *mii_data = if_mii(ifr);
 	u16 val = mii_data->val_in;
 
 	switch (cmd) {
diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c
index 79eee3062083..8e6bd45b9f31 100644
--- a/drivers/net/sb1250-mac.c
+++ b/drivers/net/sb1250-mac.c
@@ -2532,7 +2532,7 @@ static int sbmac_mii_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!netif_running(dev) || !sc->phy_dev)
 		return -EINVAL;
 
-	return phy_mii_ioctl(sc->phy_dev, if_mii(rq), cmd);
+	return phy_mii_ioctl(sc->phy_dev, rq, cmd);
 }
 
 static int sbmac_close(struct net_device *dev)
diff --git a/drivers/net/sh_eth.c b/drivers/net/sh_eth.c
index 7ac814d932b1..32f2deaa38bb 100644
--- a/drivers/net/sh_eth.c
+++ b/drivers/net/sh_eth.c
@@ -1284,7 +1284,7 @@ static int sh_eth_do_ioctl(struct net_device *ndev, struct ifreq *rq,
 	if (!phydev)
 		return -ENODEV;
 
-	return phy_mii_ioctl(phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(phydev, rq, cmd);
 }
 
 #if defined(SH_ETH_HAS_TSU)
diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index cc559741b0fa..56dc2ff75ee3 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -1538,7 +1538,7 @@ static int smsc911x_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	if (!netif_running(dev) || !pdata->phy_dev)
 		return -EINVAL;
 
-	return phy_mii_ioctl(pdata->phy_dev, if_mii(ifr), cmd);
+	return phy_mii_ioctl(pdata->phy_dev, ifr, cmd);
 }
 
 static int
diff --git a/drivers/net/smsc9420.c b/drivers/net/smsc9420.c
index 6cdee6a15f9f..b09ee1c319e8 100644
--- a/drivers/net/smsc9420.c
+++ b/drivers/net/smsc9420.c
@@ -245,7 +245,7 @@ static int smsc9420_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	if (!netif_running(dev) || !pd->phy_dev)
 		return -EINVAL;
 
-	return phy_mii_ioctl(pd->phy_dev, if_mii(ifr), cmd);
+	return phy_mii_ioctl(pd->phy_dev, ifr, cmd);
 }
 
 static int smsc9420_ethtool_get_settings(struct net_device *dev,
diff --git a/drivers/net/stmmac/stmmac_main.c b/drivers/net/stmmac/stmmac_main.c
index a31d580f306d..acf061686940 100644
--- a/drivers/net/stmmac/stmmac_main.c
+++ b/drivers/net/stmmac/stmmac_main.c
@@ -1437,24 +1437,18 @@ static void stmmac_poll_controller(struct net_device *dev)
 static int stmmac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
-	int ret = -EOPNOTSUPP;
+	int ret;
 
 	if (!netif_running(dev))
 		return -EINVAL;
 
-	switch (cmd) {
-	case SIOCGMIIPHY:
-	case SIOCGMIIREG:
-	case SIOCSMIIREG:
-		if (!priv->phydev)
-			return -EINVAL;
-
-		spin_lock(&priv->lock);
-		ret = phy_mii_ioctl(priv->phydev, if_mii(rq), cmd);
-		spin_unlock(&priv->lock);
-	default:
-		break;
-	}
+	if (!priv->phydev)
+		return -EINVAL;
+
+	spin_lock(&priv->lock);
+	ret = phy_mii_ioctl(priv->phydev, rq, cmd);
+	spin_unlock(&priv->lock);
+
 	return ret;
 }
 
diff --git a/drivers/net/tc35815.c b/drivers/net/tc35815.c
index be08b75dbc15..99e423a5b9f1 100644
--- a/drivers/net/tc35815.c
+++ b/drivers/net/tc35815.c
@@ -2066,7 +2066,7 @@ static int tc35815_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		return -EINVAL;
 	if (!lp->phy_dev)
 		return -ENODEV;
-	return phy_mii_ioctl(lp->phy_dev, if_mii(rq), cmd);
+	return phy_mii_ioctl(lp->phy_dev, rq, cmd);
 }
 
 static void tc35815_chip_reset(struct net_device *dev)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 5769e1507d2f..b26a57782939 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -10932,7 +10932,7 @@ static int tg3_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (!(tp->tg3_flags3 & TG3_FLG3_PHY_CONNECTED))
 			return -EAGAIN;
 		phydev = tp->mdio_bus->phy_map[TG3_PHY_MII_ADDR];
-		return phy_mii_ioctl(phydev, data, cmd);
+		return phy_mii_ioctl(phydev, ifr, cmd);
 	}
 
 	switch (cmd) {
diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
index dc32a62e611c..e17dd7430915 100644
--- a/drivers/net/ucc_geth.c
+++ b/drivers/net/ucc_geth.c
@@ -3714,7 +3714,7 @@ static int ucc_geth_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!ugeth->phydev)
 		return -ENODEV;
 
-	return phy_mii_ioctl(ugeth->phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(ugeth->phydev, rq, cmd);
 }
 
 static const struct net_device_ops ucc_geth_netdev_ops = {
diff --git a/drivers/staging/octeon/ethernet-mdio.c b/drivers/staging/octeon/ethernet-mdio.c
index 7e0be8d00dc3..10a82ef30215 100644
--- a/drivers/staging/octeon/ethernet-mdio.c
+++ b/drivers/staging/octeon/ethernet-mdio.c
@@ -113,7 +113,7 @@ int cvm_oct_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!priv->phydev)
 		return -EINVAL;
 
-	return phy_mii_ioctl(priv->phydev, if_mii(rq), cmd);
+	return phy_mii_ioctl(priv->phydev, rq, cmd);
 }
 
 static void cvm_oct_adjust_link(struct net_device *dev)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 987e111f7b11..d63736a84002 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -498,7 +498,7 @@ void phy_stop_machine(struct phy_device *phydev);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 int phy_mii_ioctl(struct phy_device *phydev,
-		struct mii_ioctl_data *mii_data, int cmd);
+		struct ifreq *ifr, int cmd);
 int phy_start_interrupts(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
 struct phy_device* phy_device_create(struct mii_bus *bus, int addr, int phy_id);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8fdca56bb08f..64ca2a6fa0d4 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -164,10 +164,9 @@ out:
 static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct mii_ioctl_data *mii_data = if_mii(ifr);
 
 	if (p->phy != NULL)
-		return phy_mii_ioctl(p->phy, mii_data, cmd);
+		return phy_mii_ioctl(p->phy, ifr, cmd);
 
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 15f0127d1d189fda3294b7823e3e654afca54055 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Sat, 17 Jul 2010 08:49:17 +0000
Subject: net: added a BPF to help drivers detect PTP packets.

Certain kinds of hardware time stamping units in both MACs and PHYs have
the limitation that they can only time stamp PTP packets. Drivers for such
hardware are left with the task of correctly matching skbs to time stamps.
This patch adds a BPF that drivers can use to classify PTP packets when
needed.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptp_classify.h | 126 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 include/linux/ptp_classify.h

(limited to 'include/linux')

diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h
new file mode 100644
index 000000000000..943a85ab0020
--- /dev/null
+++ b/include/linux/ptp_classify.h
@@ -0,0 +1,126 @@
+/*
+ * PTP 1588 support
+ *
+ * This file implements a BPF that recognizes PTP event messages.
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _PTP_CLASSIFY_H_
+#define _PTP_CLASSIFY_H_
+
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/filter.h>
+#ifdef __KERNEL__
+#include <linux/in.h>
+#else
+#include <netinet/in.h>
+#endif
+
+#define PTP_CLASS_NONE  0x00 /* not a PTP event message */
+#define PTP_CLASS_V1    0x01 /* protocol version 1 */
+#define PTP_CLASS_V2    0x02 /* protocol version 2 */
+#define PTP_CLASS_VMASK 0x0f /* max protocol version is 15 */
+#define PTP_CLASS_IPV4  0x10 /* event in an IPV4 UDP packet */
+#define PTP_CLASS_IPV6  0x20 /* event in an IPV6 UDP packet */
+#define PTP_CLASS_L2    0x30 /* event in a L2 packet */
+#define PTP_CLASS_VLAN  0x40 /* event in a VLAN tagged L2 packet */
+#define PTP_CLASS_PMASK 0xf0 /* mask for the packet type field */
+
+#define PTP_CLASS_V1_IPV4 (PTP_CLASS_V1 | PTP_CLASS_IPV4)
+#define PTP_CLASS_V1_IPV6 (PTP_CLASS_V1 | PTP_CLASS_IPV6) /*probably DNE*/
+#define PTP_CLASS_V2_IPV4 (PTP_CLASS_V2 | PTP_CLASS_IPV4)
+#define PTP_CLASS_V2_IPV6 (PTP_CLASS_V2 | PTP_CLASS_IPV6)
+#define PTP_CLASS_V2_L2   (PTP_CLASS_V2 | PTP_CLASS_L2)
+#define PTP_CLASS_V2_VLAN (PTP_CLASS_V2 | PTP_CLASS_VLAN)
+
+#define PTP_EV_PORT 319
+
+#define OFF_ETYPE	12
+#define OFF_IHL		14
+#define OFF_FRAG	20
+#define OFF_PROTO4	23
+#define OFF_NEXT	6
+#define OFF_UDP_DST	2
+
+#define IP6_HLEN	40
+#define UDP_HLEN	8
+
+#define RELOFF_DST4	(ETH_HLEN + OFF_UDP_DST)
+#define OFF_DST6	(ETH_HLEN + IP6_HLEN + OFF_UDP_DST)
+#define OFF_PTP6	(ETH_HLEN + IP6_HLEN + UDP_HLEN)
+
+#define OP_AND	(BPF_ALU | BPF_AND  | BPF_K)
+#define OP_JEQ	(BPF_JMP | BPF_JEQ  | BPF_K)
+#define OP_JSET	(BPF_JMP | BPF_JSET | BPF_K)
+#define OP_LDB	(BPF_LD  | BPF_B    | BPF_ABS)
+#define OP_LDH	(BPF_LD  | BPF_H    | BPF_ABS)
+#define OP_LDHI	(BPF_LD  | BPF_H    | BPF_IND)
+#define OP_LDX	(BPF_LDX | BPF_B    | BPF_MSH)
+#define OP_OR	(BPF_ALU | BPF_OR   | BPF_K)
+#define OP_RETA	(BPF_RET | BPF_A)
+#define OP_RETK	(BPF_RET | BPF_K)
+
+static inline int ptp_filter_init(struct sock_filter *f, int len)
+{
+	if (OP_LDH == f[0].code)
+		return sk_chk_filter(f, len);
+	else
+		return 0;
+}
+
+#define PTP_FILTER \
+	{OP_LDH,	0,   0, OFF_ETYPE		}, /*              */ \
+	{OP_JEQ,	0,  12, ETH_P_IP		}, /* f goto L20   */ \
+	{OP_LDB,	0,   0, OFF_PROTO4		}, /*              */ \
+	{OP_JEQ,	0,   9, IPPROTO_UDP		}, /* f goto L10   */ \
+	{OP_LDH,	0,   0, OFF_FRAG		}, /*              */ \
+	{OP_JSET,	7,   0, 0x1fff			}, /* t goto L11   */ \
+	{OP_LDX,	0,   0, OFF_IHL			}, /*              */ \
+	{OP_LDHI,	0,   0, RELOFF_DST4		}, /*              */ \
+	{OP_JEQ,	0,   4, PTP_EV_PORT		}, /* f goto L12   */ \
+	{OP_LDHI,	0,   0, ETH_HLEN + UDP_HLEN	}, /*              */ \
+	{OP_AND,	0,   0, PTP_CLASS_VMASK		}, /*              */ \
+	{OP_OR,		0,   0, PTP_CLASS_IPV4		}, /*              */ \
+	{OP_RETA,	0,   0, 0			}, /*              */ \
+/*L1x*/	{OP_RETK,	0,   0, PTP_CLASS_NONE		}, /*              */ \
+/*L20*/	{OP_JEQ,	0,   9, ETH_P_IPV6		}, /* f goto L40   */ \
+	{OP_LDB,	0,   0, ETH_HLEN + OFF_NEXT	}, /*              */ \
+	{OP_JEQ,	0,   6, IPPROTO_UDP		}, /* f goto L30   */ \
+	{OP_LDH,	0,   0, OFF_DST6		}, /*              */ \
+	{OP_JEQ,	0,   4, PTP_EV_PORT		}, /* f goto L31   */ \
+	{OP_LDH,	0,   0, OFF_PTP6		}, /*              */ \
+	{OP_AND,	0,   0, PTP_CLASS_VMASK		}, /*              */ \
+	{OP_OR,		0,   0, PTP_CLASS_IPV6		}, /*              */ \
+	{OP_RETA,	0,   0, 0			}, /*              */ \
+/*L3x*/	{OP_RETK,	0,   0, PTP_CLASS_NONE		}, /*              */ \
+/*L40*/	{OP_JEQ,	0,   6, ETH_P_8021Q		}, /* f goto L50   */ \
+	{OP_LDH,	0,   0, OFF_ETYPE + 4		}, /*              */ \
+	{OP_JEQ,	0,   9, ETH_P_1588		}, /* f goto L60   */ \
+	{OP_LDH,	0,   0, ETH_HLEN + VLAN_HLEN	}, /*              */ \
+	{OP_AND,	0,   0, PTP_CLASS_VMASK		}, /*              */ \
+	{OP_OR,		0,   0, PTP_CLASS_VLAN		}, /*              */ \
+	{OP_RETA,	0,   0, 0			}, /*              */ \
+/*L50*/	{OP_JEQ,	0,   4, ETH_P_1588		}, /* f goto L61   */ \
+	{OP_LDH,	0,   0, ETH_HLEN		}, /*              */ \
+	{OP_AND,	0,   0, PTP_CLASS_VMASK		}, /*              */ \
+	{OP_OR,		0,   0, PTP_CLASS_L2		}, /*              */ \
+	{OP_RETA,	0,   0, 0			}, /*              */ \
+/*L6x*/	{OP_RETK,	0,   0, PTP_CLASS_NONE		},
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From c1f19b51d1d87f3e3bb7e6648f43f7d57ed2da6b Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Sat, 17 Jul 2010 08:49:36 +0000
Subject: net: support time stamping in phy devices.

This patch adds a new networking option to allow hardware time stamps
from PHY devices. When enabled, likely candidates among incoming and
outgoing network packets are offered to the PHY driver for possible
time stamping. When accepted by the PHY driver, incoming packets are
deferred for later delivery by the driver.

The patch also adds phylib driver methods for the SIOCSHWTSTAMP ioctl
and callbacks for transmit and receive time stamping. Drivers may
optionally implement these functions.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        |   5 ++
 drivers/net/phy/phy_device.c |   2 +
 include/linux/netdevice.h    |   4 ++
 include/linux/phy.h          |  22 ++++++++
 include/linux/skbuff.h       |  31 +++++++++++
 net/Kconfig                  |  10 ++++
 net/core/Makefile            |   2 +-
 net/core/dev.c               |   3 ++
 net/core/timestamping.c      | 126 +++++++++++++++++++++++++++++++++++++++++++
 net/socket.c                 |   4 ++
 10 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 net/core/timestamping.c

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index bd88d818f082..5130db8f5c4e 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -361,6 +361,11 @@ int phy_mii_ioctl(struct phy_device *phydev,
 		}
 		break;
 
+	case SIOCSHWTSTAMP:
+		if (phydev->drv->hwtstamp)
+			return phydev->drv->hwtstamp(phydev, ifr);
+		/* fall through */
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 1a99bb244106..c0761197c07e 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -460,6 +460,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 	}
 
 	phydev->attached_dev = dev;
+	dev->phydev = phydev;
 
 	phydev->dev_flags = flags;
 
@@ -513,6 +514,7 @@ EXPORT_SYMBOL(phy_attach);
  */
 void phy_detach(struct phy_device *phydev)
 {
+	phydev->attached_dev->phydev = NULL;
 	phydev->attached_dev = NULL;
 
 	/* If the device had no specific driver before (i.e. - it
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c4fedf000541..fdc3f2992230 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -54,6 +54,7 @@
 
 struct vlan_group;
 struct netpoll_info;
+struct phy_device;
 /* 802.11 specific */
 struct wireless_dev;
 					/* source back-compat hooks */
@@ -1065,6 +1066,9 @@ struct net_device {
 #endif
 	/* n-tuple filter list attached to this device */
 	struct ethtool_rx_ntuple_list ethtool_ntuple_list;
+
+	/* phy device may attach itself for hardware timestamping */
+	struct phy_device *phydev;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d63736a84002..6b0a782c6224 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -234,6 +234,8 @@ enum phy_state {
 	PHY_RESUMING
 };
 
+struct sk_buff;
+
 /* phy_device: An instance of a PHY
  *
  * drv: Pointer to the driver for this PHY instance
@@ -402,6 +404,26 @@ struct phy_driver {
 	/* Clears up any memory if needed */
 	void (*remove)(struct phy_device *phydev);
 
+	/* Handles SIOCSHWTSTAMP ioctl for hardware time stamping. */
+	int  (*hwtstamp)(struct phy_device *phydev, struct ifreq *ifr);
+
+	/*
+	 * Requests a Rx timestamp for 'skb'. If the skb is accepted,
+	 * the phy driver promises to deliver it using netif_rx() as
+	 * soon as a timestamp becomes available. One of the
+	 * PTP_CLASS_ values is passed in 'type'. The function must
+	 * return true if the skb is accepted for delivery.
+	 */
+	bool (*rxtstamp)(struct phy_device *dev, struct sk_buff *skb, int type);
+
+	/*
+	 * Requests a Tx timestamp for 'skb'. The phy driver promises
+	 * to deliver it to the socket's error queue as soon as a
+	 * timestamp becomes available. One of the PTP_CLASS_ values
+	 * is passed in 'type'.
+	 */
+	void (*txtstamp)(struct phy_device *dev, struct sk_buff *skb, int type);
+
 	struct device_driver driver;
 };
 #define to_phy_driver(d) container_of(d, struct phy_driver, driver)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a1b0400c8d86..f5aa87e1e0c8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1933,6 +1933,36 @@ static inline ktime_t net_invalid_timestamp(void)
 	return ktime_set(0, 0);
 }
 
+extern void skb_timestamping_init(void);
+
+#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
+
+extern void skb_clone_tx_timestamp(struct sk_buff *skb);
+extern bool skb_defer_rx_timestamp(struct sk_buff *skb);
+
+#else /* CONFIG_NETWORK_PHY_TIMESTAMPING */
+
+static inline void skb_clone_tx_timestamp(struct sk_buff *skb)
+{
+}
+
+static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
+{
+	return false;
+}
+
+#endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */
+
+/**
+ * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
+ *
+ * @skb: clone of the the original outgoing packet
+ * @hwtstamps: hardware time stamps
+ *
+ */
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+			       struct skb_shared_hwtstamps *hwtstamps);
+
 /**
  * skb_tstamp_tx - queue clone of skb with send time stamps
  * @orig_skb:	the original outgoing packet
@@ -1965,6 +1995,7 @@ static inline void sw_tx_timestamp(struct sk_buff *skb)
  */
 static inline void skb_tx_timestamp(struct sk_buff *skb)
 {
+	skb_clone_tx_timestamp(skb);
 	sw_tx_timestamp(skb);
 }
 
diff --git a/net/Kconfig b/net/Kconfig
index 0d68b40fc0e6..b3250944cde9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -86,6 +86,16 @@ config NETWORK_SECMARK
 	  to nfmark, but designated for security purposes.
 	  If you are unsure how to answer this question, answer N.
 
+config NETWORK_PHY_TIMESTAMPING
+	bool "Timestamping in PHY devices"
+	depends on EXPERIMENTAL
+	help
+	  This allows timestamping of network packets by PHYs with
+	  hardware timestamping capabilities. This option adds some
+	  overhead in the transmit and receive paths.
+
+	  If you are unsure how to answer this question, answer N.
+
 menuconfig NETFILTER
 	bool "Network packet filtering framework (Netfilter)"
 	---help---
diff --git a/net/core/Makefile b/net/core/Makefile
index 51c3eec850ef..8a04dd22cf77 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -18,4 +18,4 @@ obj-$(CONFIG_NET_DMA) += user_dma.o
 obj-$(CONFIG_FIB_RULES) += fib_rules.o
 obj-$(CONFIG_TRACEPOINTS) += net-traces.o
 obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
-
+obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
diff --git a/net/core/dev.c b/net/core/dev.c
index e2b9fa2c917e..1c002c7ef5d5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2957,6 +2957,9 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
+	if (skb_defer_rx_timestamp(skb))
+		return NET_RX_SUCCESS;
+
 #ifdef CONFIG_RPS
 	{
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
new file mode 100644
index 000000000000..0ae6c22da85b
--- /dev/null
+++ b/net/core/timestamping.c
@@ -0,0 +1,126 @@
+/*
+ * PTP 1588 clock support - support for timestamping in PHY devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/errqueue.h>
+#include <linux/phy.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+
+static struct sock_filter ptp_filter[] = {
+	PTP_FILTER
+};
+
+static unsigned int classify(struct sk_buff *skb)
+{
+	if (likely(skb->dev &&
+		   skb->dev->phydev &&
+		   skb->dev->phydev->drv))
+		return sk_run_filter(skb, ptp_filter, ARRAY_SIZE(ptp_filter));
+	else
+		return PTP_CLASS_NONE;
+}
+
+void skb_clone_tx_timestamp(struct sk_buff *skb)
+{
+	struct phy_device *phydev;
+	struct sk_buff *clone;
+	struct sock *sk = skb->sk;
+	unsigned int type;
+
+	if (!sk)
+		return;
+
+	type = classify(skb);
+
+	switch (type) {
+	case PTP_CLASS_V1_IPV4:
+	case PTP_CLASS_V1_IPV6:
+	case PTP_CLASS_V2_IPV4:
+	case PTP_CLASS_V2_IPV6:
+	case PTP_CLASS_V2_L2:
+	case PTP_CLASS_V2_VLAN:
+		phydev = skb->dev->phydev;
+		if (likely(phydev->drv->txtstamp)) {
+			clone = skb_clone(skb, GFP_ATOMIC);
+			if (!clone)
+				return;
+			clone->sk = sk;
+			phydev->drv->txtstamp(phydev, clone, type);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+			       struct skb_shared_hwtstamps *hwtstamps)
+{
+	struct sock *sk = skb->sk;
+	struct sock_exterr_skb *serr;
+	int err;
+
+	if (!hwtstamps)
+		return;
+
+	*skb_hwtstamps(skb) = *hwtstamps;
+	serr = SKB_EXT_ERR(skb);
+	memset(serr, 0, sizeof(*serr));
+	serr->ee.ee_errno = ENOMSG;
+	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+	skb->sk = NULL;
+	err = sock_queue_err_skb(sk, skb);
+	if (err)
+		kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+bool skb_defer_rx_timestamp(struct sk_buff *skb)
+{
+	struct phy_device *phydev;
+	unsigned int type;
+
+	skb_push(skb, ETH_HLEN);
+
+	type = classify(skb);
+
+	skb_pull(skb, ETH_HLEN);
+
+	switch (type) {
+	case PTP_CLASS_V1_IPV4:
+	case PTP_CLASS_V1_IPV6:
+	case PTP_CLASS_V2_IPV4:
+	case PTP_CLASS_V2_IPV6:
+	case PTP_CLASS_V2_L2:
+	case PTP_CLASS_V2_VLAN:
+		phydev = skb->dev->phydev;
+		if (likely(phydev->drv->rxtstamp))
+			return phydev->drv->rxtstamp(phydev, skb, type);
+		break;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+void __init skb_timestamping_init(void)
+{
+	BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter)));
+}
diff --git a/net/socket.c b/net/socket.c
index 6fe484122a44..2270b941bcc7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2394,6 +2394,10 @@ static int __init sock_init(void)
 	netfilter_init();
 #endif
 
+#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
+	skb_timestamping_init();
+#endif
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From e15bacbebb9dcc95f148f28dfc83a6d5e48b60b8 Mon Sep 17 00:00:00 2001
From: Dan Kruchinin <dkruchinin@acm.org>
Date: Wed, 14 Jul 2010 14:31:57 +0400
Subject: padata: Make two separate cpumasks

The aim of this patch is to make two separate cpumasks
for padata parallel and serial workers respectively.
It allows user to make more thin and sophisticated configurations
of padata framework. For example user may bind parallel and serial workers to non-intersecting
CPU groups to gain better performance. Also each padata instance has notifiers chain for its
cpumasks now. If either parallel or serial or both masks were changed all
interested subsystems will get notification about that. It's especially useful
if padata user uses algorithm for callback CPU selection according to serial cpumask.

Signed-off-by: Dan Kruchinin <dkruchinin@acm.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcrypt.c        | 191 ++++++++++++++------
 include/linux/padata.h | 116 ++++++++----
 kernel/padata.c        | 471 ++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 564 insertions(+), 214 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 6036b6de9079..c9662e25595e 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -24,12 +24,38 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/notifier.h>
 #include <crypto/pcrypt.h>
 
-static struct padata_instance *pcrypt_enc_padata;
-static struct padata_instance *pcrypt_dec_padata;
-static struct workqueue_struct *encwq;
-static struct workqueue_struct *decwq;
+struct pcrypt_instance {
+	struct padata_instance *pinst;
+	struct workqueue_struct *wq;
+
+	/*
+	 * Cpumask for callback CPUs. It should be
+	 * equal to serial cpumask of corresponding padata instance,
+	 * so it is updated when padata notifies us about serial
+	 * cpumask change.
+	 *
+	 * cb_cpumask is protected by RCU. This fact prevents us from
+	 * using cpumask_var_t directly because the actual type of
+	 * cpumsak_var_t depends on kernel configuration(particularly on
+	 * CONFIG_CPUMASK_OFFSTACK macro). Depending on the configuration
+	 * cpumask_var_t may be either a pointer to the struct cpumask
+	 * or a variable allocated on the stack. Thus we can not safely use
+	 * cpumask_var_t with RCU operations such as rcu_assign_pointer or
+	 * rcu_dereference. So cpumask_var_t is wrapped with struct
+	 * pcrypt_cpumask which makes possible to use it with RCU.
+	 */
+	struct pcrypt_cpumask {
+		cpumask_var_t mask;
+	} *cb_cpumask;
+	struct notifier_block nblock;
+};
+
+static struct pcrypt_instance pencrypt;
+static struct pcrypt_instance pdecrypt;
+
 
 struct pcrypt_instance_ctx {
 	struct crypto_spawn spawn;
@@ -42,25 +68,29 @@ struct pcrypt_aead_ctx {
 };
 
 static int pcrypt_do_parallel(struct padata_priv *padata, unsigned int *cb_cpu,
-			      struct padata_instance *pinst)
+			      struct pcrypt_instance *pcrypt)
 {
 	unsigned int cpu_index, cpu, i;
+	struct pcrypt_cpumask *cpumask;
 
 	cpu = *cb_cpu;
 
-	if (cpumask_test_cpu(cpu, cpu_active_mask))
+	rcu_read_lock_bh();
+	cpumask = rcu_dereference(pcrypt->cb_cpumask);
+	if (cpumask_test_cpu(cpu, cpumask->mask))
 			goto out;
 
-	cpu_index = cpu % cpumask_weight(cpu_active_mask);
+	cpu_index = cpu % cpumask_weight(cpumask->mask);
 
-	cpu = cpumask_first(cpu_active_mask);
+	cpu = cpumask_first(cpumask->mask);
 	for (i = 0; i < cpu_index; i++)
-		cpu = cpumask_next(cpu, cpu_active_mask);
+		cpu = cpumask_next(cpu, cpumask->mask);
 
 	*cb_cpu = cpu;
 
 out:
-	return padata_do_parallel(pinst, padata, cpu);
+	rcu_read_unlock_bh();
+	return padata_do_parallel(pcrypt->pinst, padata, cpu);
 }
 
 static int pcrypt_aead_setkey(struct crypto_aead *parent,
@@ -142,7 +172,7 @@ static int pcrypt_aead_encrypt(struct aead_request *req)
 			       req->cryptlen, req->iv);
 	aead_request_set_assoc(creq, req->assoc, req->assoclen);
 
-	err = pcrypt_do_parallel(padata, &ctx->cb_cpu, pcrypt_enc_padata);
+	err = pcrypt_do_parallel(padata, &ctx->cb_cpu, &pencrypt);
 	if (!err)
 		return -EINPROGRESS;
 
@@ -184,7 +214,7 @@ static int pcrypt_aead_decrypt(struct aead_request *req)
 			       req->cryptlen, req->iv);
 	aead_request_set_assoc(creq, req->assoc, req->assoclen);
 
-	err = pcrypt_do_parallel(padata, &ctx->cb_cpu, pcrypt_dec_padata);
+	err = pcrypt_do_parallel(padata, &ctx->cb_cpu, &pdecrypt);
 	if (!err)
 		return -EINPROGRESS;
 
@@ -228,7 +258,7 @@ static int pcrypt_aead_givencrypt(struct aead_givcrypt_request *req)
 	aead_givcrypt_set_assoc(creq, areq->assoc, areq->assoclen);
 	aead_givcrypt_set_giv(creq, req->giv, req->seq);
 
-	err = pcrypt_do_parallel(padata, &ctx->cb_cpu, pcrypt_enc_padata);
+	err = pcrypt_do_parallel(padata, &ctx->cb_cpu, &pencrypt);
 	if (!err)
 		return -EINPROGRESS;
 
@@ -370,6 +400,88 @@ static void pcrypt_free(struct crypto_instance *inst)
 	kfree(inst);
 }
 
+static int pcrypt_cpumask_change_notify(struct notifier_block *self,
+					unsigned long val, void *data)
+{
+	struct pcrypt_instance *pcrypt;
+	struct pcrypt_cpumask *new_mask, *old_mask;
+
+	if (!(val & PADATA_CPU_SERIAL))
+		return 0;
+
+	pcrypt = container_of(self, struct pcrypt_instance, nblock);
+	new_mask = kmalloc(sizeof(*new_mask), GFP_KERNEL);
+	if (!new_mask)
+		return -ENOMEM;
+	if (!alloc_cpumask_var(&new_mask->mask, GFP_KERNEL)) {
+		kfree(new_mask);
+		return -ENOMEM;
+	}
+
+	old_mask = pcrypt->cb_cpumask;
+
+	padata_get_cpumask(pcrypt->pinst, PADATA_CPU_SERIAL, new_mask->mask);
+	rcu_assign_pointer(pcrypt->cb_cpumask, new_mask);
+	synchronize_rcu_bh();
+
+	free_cpumask_var(old_mask->mask);
+	kfree(old_mask);
+	return 0;
+}
+
+static int __pcrypt_init_instance(struct pcrypt_instance *pcrypt,
+				  const char *name)
+{
+	int ret = -ENOMEM;
+	struct pcrypt_cpumask *mask;
+
+	pcrypt->wq = create_workqueue(name);
+	if (!pcrypt->wq)
+		goto err;
+
+	pcrypt->pinst = padata_alloc(pcrypt->wq);
+	if (!pcrypt->pinst)
+		goto err_destroy_workqueue;
+
+	mask = kmalloc(sizeof(*mask), GFP_KERNEL);
+	if (!mask)
+		goto err_free_padata;
+	if (!alloc_cpumask_var(&mask->mask, GFP_KERNEL)) {
+		kfree(mask);
+		goto err_free_padata;
+	}
+
+	padata_get_cpumask(pcrypt->pinst, PADATA_CPU_SERIAL, mask->mask);
+	rcu_assign_pointer(pcrypt->cb_cpumask, mask);
+
+	pcrypt->nblock.notifier_call = pcrypt_cpumask_change_notify;
+	ret = padata_register_cpumask_notifier(pcrypt->pinst, &pcrypt->nblock);
+	if (ret)
+		goto err_free_cpumask;
+
+	return ret;
+err_free_cpumask:
+	free_cpumask_var(mask->mask);
+	kfree(mask);
+err_free_padata:
+	padata_free(pcrypt->pinst);
+err_destroy_workqueue:
+	destroy_workqueue(pcrypt->wq);
+err:
+	return ret;
+}
+
+static void __pcrypt_deinit_instance(struct pcrypt_instance *pcrypt)
+{
+	free_cpumask_var(pcrypt->cb_cpumask->mask);
+	kfree(pcrypt->cb_cpumask);
+
+	padata_stop(pcrypt->pinst);
+	padata_unregister_cpumask_notifier(pcrypt->pinst, &pcrypt->nblock);
+	destroy_workqueue(pcrypt->wq);
+	padata_free(pcrypt->pinst);
+}
+
 static struct crypto_template pcrypt_tmpl = {
 	.name = "pcrypt",
 	.alloc = pcrypt_alloc,
@@ -379,60 +491,31 @@ static struct crypto_template pcrypt_tmpl = {
 
 static int __init pcrypt_init(void)
 {
-	int err = -ENOMEM;
-	encwq = create_workqueue("pencrypt");
-	if (!encwq)
-		goto err;
-
-	decwq = create_workqueue("pdecrypt");
-	if (!decwq)
-		goto err_destroy_encwq;
-
-
-	pcrypt_enc_padata = padata_alloc(cpu_possible_mask, encwq);
-	if (!pcrypt_enc_padata)
-		goto err_destroy_decwq;
-
-	pcrypt_dec_padata = padata_alloc(cpu_possible_mask, decwq);
-	if (!pcrypt_dec_padata)
-		goto err_free_enc_padata;
+	int err;
 
-	err = padata_start(pcrypt_enc_padata);
+	err = __pcrypt_init_instance(&pencrypt, "pencrypt");
 	if (err)
-		goto err_free_dec_padata;
+		goto err;
 
-	err = padata_start(pcrypt_dec_padata);
+	err = __pcrypt_init_instance(&pdecrypt, "pdecrypt");
 	if (err)
-		goto err_free_dec_padata;
-
-	return crypto_register_template(&pcrypt_tmpl);
-
-err_free_dec_padata:
-	padata_free(pcrypt_dec_padata);
+		goto err_deinit_pencrypt;
 
-err_free_enc_padata:
-	padata_free(pcrypt_enc_padata);
+	padata_start(pencrypt.pinst);
+	padata_start(pdecrypt.pinst);
 
-err_destroy_decwq:
-	destroy_workqueue(decwq);
-
-err_destroy_encwq:
-	destroy_workqueue(encwq);
+	return crypto_register_template(&pcrypt_tmpl);
 
+err_deinit_pencrypt:
+	__pcrypt_deinit_instance(&pencrypt);
 err:
 	return err;
 }
 
 static void __exit pcrypt_exit(void)
 {
-	padata_stop(pcrypt_enc_padata);
-	padata_stop(pcrypt_dec_padata);
-
-	destroy_workqueue(encwq);
-	destroy_workqueue(decwq);
-
-	padata_free(pcrypt_enc_padata);
-	padata_free(pcrypt_dec_padata);
+	__pcrypt_deinit_instance(&pencrypt);
+	__pcrypt_deinit_instance(&pdecrypt);
 
 	crypto_unregister_template(&pcrypt_tmpl);
 }
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 8844b851191e..621e7736690c 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -25,6 +25,10 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/timer.h>
+#include <linux/notifier.h>
+
+#define PADATA_CPU_SERIAL   0x01
+#define PADATA_CPU_PARALLEL 0x02
 
 /**
  * struct padata_priv -  Embedded to the users data structure.
@@ -59,7 +63,20 @@ struct padata_list {
 };
 
 /**
- * struct padata_queue - The percpu padata queues.
+* struct padata_serial_queue - The percpu padata serial queue
+*
+* @serial: List to wait for serialization after reordering.
+* @work: work struct for serialization.
+* @pd: Backpointer to the internal control structure.
+*/
+struct padata_serial_queue {
+       struct padata_list    serial;
+       struct work_struct    work;
+       struct parallel_data *pd;
+};
+
+/**
+ * struct padata_parallel_queue - The percpu padata parallel queue
  *
  * @parallel: List to wait for parallelization.
  * @reorder: List to wait for reordering after parallel processing.
@@ -67,44 +84,52 @@ struct padata_list {
  * @pwork: work struct for parallelization.
  * @swork: work struct for serialization.
  * @pd: Backpointer to the internal control structure.
+ * @work: work struct for parallelization.
+ * @num_obj: Number of objects that are processed by this cpu.
  * @cpu_index: Index of the cpu.
  */
-struct padata_queue {
-	struct padata_list	parallel;
-	struct padata_list	reorder;
-	struct padata_list	serial;
-	struct work_struct	pwork;
-	struct work_struct	swork;
-	struct parallel_data    *pd;
-	int			cpu_index;
+struct padata_parallel_queue {
+       struct padata_list    parallel;
+       struct padata_list    reorder;
+       struct parallel_data *pd;
+       struct work_struct    work;
+       atomic_t              num_obj;
+       int                   cpu_index;
 };
 
+
 /**
  * struct parallel_data - Internal control structure, covers everything
  * that depends on the cpumask in use.
  *
  * @pinst: padata instance.
- * @queue: percpu padata queues.
+ * @pqueue: percpu padata queues used for parallelization.
+ * @squeue: percpu padata queues used for serialuzation.
  * @seq_nr: The sequence number that will be attached to the next object.
  * @reorder_objects: Number of objects waiting in the reorder queues.
  * @refcnt: Number of objects holding a reference on this parallel_data.
  * @max_seq_nr:  Maximal used sequence number.
- * @cpumask: cpumask in use.
+ * @cpumask: Contains two cpumasks: pcpu and cbcpu for
+ *           parallel and serial workers respectively.
  * @lock: Reorder lock.
  * @processed: Number of already processed objects.
  * @timer: Reorder timer.
  */
 struct parallel_data {
-	struct padata_instance	*pinst;
-	struct padata_queue	*queue;
-	atomic_t		seq_nr;
-	atomic_t		reorder_objects;
-	atomic_t                refcnt;
-	unsigned int		max_seq_nr;
-	cpumask_var_t		cpumask;
-	spinlock_t              lock ____cacheline_aligned;
-	unsigned int            processed;
-	struct timer_list       timer;
+	struct padata_instance		*pinst;
+	struct padata_parallel_queue	*pqueue;
+	struct padata_serial_queue	*squeue;
+	atomic_t			 seq_nr;
+	atomic_t			 reorder_objects;
+	atomic_t			 refcnt;
+	unsigned int			 max_seq_nr;
+	struct {
+		cpumask_var_t		 pcpu;
+		cpumask_var_t		 cbcpu;
+	} cpumask;
+	spinlock_t                       lock ____cacheline_aligned;
+	unsigned int			 processed;
+	struct timer_list		 timer;
 };
 
 /**
@@ -113,32 +138,51 @@ struct parallel_data {
  * @cpu_notifier: cpu hotplug notifier.
  * @wq: The workqueue in use.
  * @pd: The internal control structure.
- * @cpumask: User supplied cpumask.
+ * @cpumask: User supplied cpumask. Contains two cpumasks: pcpu and
+ *           cbcpu for parallel and serial works respectivly.
+ * @cpumask_change_notifier: Notifiers chain for user-defined notify
+ *            callbacks that will be called when either @pcpu or @cbcpu
+ *             or both cpumasks change.
  * @lock: padata instance lock.
  * @flags: padata flags.
  */
 struct padata_instance {
-	struct notifier_block   cpu_notifier;
-	struct workqueue_struct *wq;
-	struct parallel_data	*pd;
-	cpumask_var_t           cpumask;
-	struct mutex		lock;
-	u8			flags;
-#define	PADATA_INIT		1
-#define	PADATA_RESET		2
-#define	PADATA_INVALID		4
+	struct notifier_block		 cpu_notifier;
+	struct workqueue_struct		*wq;
+	struct parallel_data		*pd;
+	struct {
+		cpumask_var_t		 pcpu;
+		cpumask_var_t		 cbcpu;
+	} cpumask;
+	struct blocking_notifier_head	 cpumask_change_notifier;
+	struct mutex			 lock;
+	u8				 flags;
+#define	PADATA_INIT	1
+#define	PADATA_RESET	2
+#define	PADATA_INVALID	4
 };
 
-extern struct padata_instance *padata_alloc(const struct cpumask *cpumask,
-					    struct workqueue_struct *wq);
+extern struct padata_instance *padata_alloc(struct workqueue_struct *wq);
+extern struct padata_instance *__padata_alloc(struct workqueue_struct *wq,
+					      const struct cpumask *pcpumask,
+					      const struct cpumask *cbcpumask);
 extern void padata_free(struct padata_instance *pinst);
 extern int padata_do_parallel(struct padata_instance *pinst,
 			      struct padata_priv *padata, int cb_cpu);
 extern void padata_do_serial(struct padata_priv *padata);
-extern int padata_set_cpumask(struct padata_instance *pinst,
+extern int padata_get_cpumask(struct padata_instance *pinst,
+			      int cpumask_type, struct cpumask *out_mask);
+extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 			      cpumask_var_t cpumask);
-extern int padata_add_cpu(struct padata_instance *pinst, int cpu);
-extern int padata_remove_cpu(struct padata_instance *pinst, int cpu);
+extern int __padata_set_cpumasks(struct padata_instance *pinst,
+				 cpumask_var_t pcpumask,
+				 cpumask_var_t cbcpumask);
+extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
+extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
 extern int padata_start(struct padata_instance *pinst);
 extern void padata_stop(struct padata_instance *pinst);
+extern int padata_register_cpumask_notifier(struct padata_instance *pinst,
+					    struct notifier_block *nblock);
+extern int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
+					      struct notifier_block *nblock);
 #endif
diff --git a/kernel/padata.c b/kernel/padata.c
index 450d67d394b0..84d0ca9dac9c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -35,9 +35,9 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 {
 	int cpu, target_cpu;
 
-	target_cpu = cpumask_first(pd->cpumask);
+	target_cpu = cpumask_first(pd->cpumask.pcpu);
 	for (cpu = 0; cpu < cpu_index; cpu++)
-		target_cpu = cpumask_next(target_cpu, pd->cpumask);
+		target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
 
 	return target_cpu;
 }
@@ -53,26 +53,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
 	 * Hash the sequence numbers to the cpus by taking
 	 * seq_nr mod. number of cpus in use.
 	 */
-	cpu_index =  padata->seq_nr % cpumask_weight(pd->cpumask);
+	cpu_index =  padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
 
 	return padata_index_to_cpu(pd, cpu_index);
 }
 
-static void padata_parallel_worker(struct work_struct *work)
+static void padata_parallel_worker(struct work_struct *parallel_work)
 {
-	struct padata_queue *queue;
+	struct padata_parallel_queue *pqueue;
 	struct parallel_data *pd;
 	struct padata_instance *pinst;
 	LIST_HEAD(local_list);
 
 	local_bh_disable();
-	queue = container_of(work, struct padata_queue, pwork);
-	pd = queue->pd;
+	pqueue = container_of(parallel_work,
+			      struct padata_parallel_queue, work);
+	pd = pqueue->pd;
 	pinst = pd->pinst;
 
-	spin_lock(&queue->parallel.lock);
-	list_replace_init(&queue->parallel.list, &local_list);
-	spin_unlock(&queue->parallel.lock);
+	spin_lock(&pqueue->parallel.lock);
+	list_replace_init(&pqueue->parallel.list, &local_list);
+	spin_unlock(&pqueue->parallel.lock);
 
 	while (!list_empty(&local_list)) {
 		struct padata_priv *padata;
@@ -94,7 +95,7 @@ static void padata_parallel_worker(struct work_struct *work)
  * @pinst: padata instance
  * @padata: object to be parallelized
  * @cb_cpu: cpu the serialization callback function will run on,
- *          must be in the cpumask of padata.
+ *          must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
  *
  * The parallelization callback function will run with BHs off.
  * Note: Every object which is parallelized by padata_do_parallel
@@ -104,7 +105,7 @@ int padata_do_parallel(struct padata_instance *pinst,
 		       struct padata_priv *padata, int cb_cpu)
 {
 	int target_cpu, err;
-	struct padata_queue *queue;
+	struct padata_parallel_queue *queue;
 	struct parallel_data *pd;
 
 	rcu_read_lock_bh();
@@ -115,7 +116,7 @@ int padata_do_parallel(struct padata_instance *pinst,
 	if (!(pinst->flags & PADATA_INIT))
 		goto out;
 
-	if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
+	if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
 		goto out;
 
 	err =  -EBUSY;
@@ -136,13 +137,13 @@ int padata_do_parallel(struct padata_instance *pinst,
 	padata->seq_nr = atomic_inc_return(&pd->seq_nr);
 
 	target_cpu = padata_cpu_hash(padata);
-	queue = per_cpu_ptr(pd->queue, target_cpu);
+	queue = per_cpu_ptr(pd->pqueue, target_cpu);
 
 	spin_lock(&queue->parallel.lock);
 	list_add_tail(&padata->list, &queue->parallel.list);
 	spin_unlock(&queue->parallel.lock);
 
-	queue_work_on(target_cpu, pinst->wq, &queue->pwork);
+	queue_work_on(target_cpu, pinst->wq, &queue->work);
 
 out:
 	rcu_read_unlock_bh();
@@ -172,11 +173,11 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
 {
 	int cpu, num_cpus;
 	int next_nr, next_index;
-	struct padata_queue *queue, *next_queue;
+	struct padata_parallel_queue *queue, *next_queue;
 	struct padata_priv *padata;
 	struct padata_list *reorder;
 
-	num_cpus = cpumask_weight(pd->cpumask);
+	num_cpus = cpumask_weight(pd->cpumask.pcpu);
 
 	/*
 	 * Calculate the percpu reorder queue and the sequence
@@ -185,13 +186,13 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
 	next_nr = pd->processed;
 	next_index = next_nr % num_cpus;
 	cpu = padata_index_to_cpu(pd, next_index);
-	next_queue = per_cpu_ptr(pd->queue, cpu);
+	next_queue = per_cpu_ptr(pd->pqueue, cpu);
 
 	if (unlikely(next_nr > pd->max_seq_nr)) {
 		next_nr = next_nr - pd->max_seq_nr - 1;
 		next_index = next_nr % num_cpus;
 		cpu = padata_index_to_cpu(pd, next_index);
-		next_queue = per_cpu_ptr(pd->queue, cpu);
+		next_queue = per_cpu_ptr(pd->pqueue, cpu);
 		pd->processed = 0;
 	}
 
@@ -215,7 +216,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
 		goto out;
 	}
 
-	queue = per_cpu_ptr(pd->queue, smp_processor_id());
+	queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
 	if (queue->cpu_index == next_queue->cpu_index) {
 		padata = ERR_PTR(-ENODATA);
 		goto out;
@@ -229,7 +230,7 @@ out:
 static void padata_reorder(struct parallel_data *pd)
 {
 	struct padata_priv *padata;
-	struct padata_queue *queue;
+	struct padata_serial_queue *squeue;
 	struct padata_instance *pinst = pd->pinst;
 
 	/*
@@ -268,13 +269,13 @@ static void padata_reorder(struct parallel_data *pd)
 			return;
 		}
 
-		queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
+		squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
 
-		spin_lock(&queue->serial.lock);
-		list_add_tail(&padata->list, &queue->serial.list);
-		spin_unlock(&queue->serial.lock);
+		spin_lock(&squeue->serial.lock);
+		list_add_tail(&padata->list, &squeue->serial.list);
+		spin_unlock(&squeue->serial.lock);
 
-		queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
+		queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
 	}
 
 	spin_unlock_bh(&pd->lock);
@@ -300,19 +301,19 @@ static void padata_reorder_timer(unsigned long arg)
 	padata_reorder(pd);
 }
 
-static void padata_serial_worker(struct work_struct *work)
+static void padata_serial_worker(struct work_struct *serial_work)
 {
-	struct padata_queue *queue;
+	struct padata_serial_queue *squeue;
 	struct parallel_data *pd;
 	LIST_HEAD(local_list);
 
 	local_bh_disable();
-	queue = container_of(work, struct padata_queue, swork);
-	pd = queue->pd;
+	squeue = container_of(serial_work, struct padata_serial_queue, work);
+	pd = squeue->pd;
 
-	spin_lock(&queue->serial.lock);
-	list_replace_init(&queue->serial.list, &local_list);
-	spin_unlock(&queue->serial.lock);
+	spin_lock(&squeue->serial.lock);
+	list_replace_init(&squeue->serial.list, &local_list);
+	spin_unlock(&squeue->serial.lock);
 
 	while (!list_empty(&local_list)) {
 		struct padata_priv *padata;
@@ -339,18 +340,18 @@ static void padata_serial_worker(struct work_struct *work)
 void padata_do_serial(struct padata_priv *padata)
 {
 	int cpu;
-	struct padata_queue *queue;
+	struct padata_parallel_queue *pqueue;
 	struct parallel_data *pd;
 
 	pd = padata->pd;
 
 	cpu = get_cpu();
-	queue = per_cpu_ptr(pd->queue, cpu);
+	pqueue = per_cpu_ptr(pd->pqueue, cpu);
 
-	spin_lock(&queue->reorder.lock);
+	spin_lock(&pqueue->reorder.lock);
 	atomic_inc(&pd->reorder_objects);
-	list_add_tail(&padata->list, &queue->reorder.list);
-	spin_unlock(&queue->reorder.lock);
+	list_add_tail(&padata->list, &pqueue->reorder.list);
+	spin_unlock(&pqueue->reorder.lock);
 
 	put_cpu();
 
@@ -358,51 +359,88 @@ void padata_do_serial(struct padata_priv *padata)
 }
 EXPORT_SYMBOL(padata_do_serial);
 
-/* Allocate and initialize the internal cpumask dependend resources. */
-static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
-					     const struct cpumask *cpumask)
+static int padata_setup_cpumasks(struct parallel_data *pd,
+				 const struct cpumask *pcpumask,
+				 const struct cpumask *cbcpumask)
 {
-	int cpu, cpu_index, num_cpus;
-	struct padata_queue *queue;
-	struct parallel_data *pd;
+	if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
+		return -ENOMEM;
 
-	cpu_index = 0;
+	cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
+	if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
+		free_cpumask_var(pd->cpumask.cbcpu);
+		return -ENOMEM;
+	}
 
-	pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
-	if (!pd)
-		goto err;
+	cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
+	return 0;
+}
 
-	pd->queue = alloc_percpu(struct padata_queue);
-	if (!pd->queue)
-		goto err_free_pd;
+static void __padata_list_init(struct padata_list *pd_list)
+{
+	INIT_LIST_HEAD(&pd_list->list);
+	spin_lock_init(&pd_list->lock);
+}
 
-	if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
-		goto err_free_queue;
+/* Initialize all percpu queues used by serial workers */
+static void padata_init_squeues(struct parallel_data *pd)
+{
+	int cpu;
+	struct padata_serial_queue *squeue;
 
-	cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
+	for_each_cpu(cpu, pd->cpumask.cbcpu) {
+		squeue = per_cpu_ptr(pd->squeue, cpu);
+		squeue->pd = pd;
+		__padata_list_init(&squeue->serial);
+		INIT_WORK(&squeue->work, padata_serial_worker);
+	}
+}
 
-	for_each_cpu(cpu, pd->cpumask) {
-		queue = per_cpu_ptr(pd->queue, cpu);
+/* Initialize all percpu queues used by parallel workers */
+static void padata_init_pqueues(struct parallel_data *pd)
+{
+	int cpu_index, num_cpus, cpu;
+	struct padata_parallel_queue *pqueue;
 
-		queue->pd = pd;
+	cpu_index = 0;
+	for_each_cpu(cpu, pd->cpumask.pcpu) {
+		pqueue = per_cpu_ptr(pd->pqueue, cpu);
+		pqueue->pd = pd;
+		pqueue->cpu_index = cpu_index;
+
+		__padata_list_init(&pqueue->reorder);
+		__padata_list_init(&pqueue->parallel);
+		INIT_WORK(&pqueue->work, padata_parallel_worker);
+		atomic_set(&pqueue->num_obj, 0);
+	}
 
-		queue->cpu_index = cpu_index;
-		cpu_index++;
+	num_cpus = cpumask_weight(pd->cpumask.pcpu);
+	pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
+}
 
-		INIT_LIST_HEAD(&queue->reorder.list);
-		INIT_LIST_HEAD(&queue->parallel.list);
-		INIT_LIST_HEAD(&queue->serial.list);
-		spin_lock_init(&queue->reorder.lock);
-		spin_lock_init(&queue->parallel.lock);
-		spin_lock_init(&queue->serial.lock);
+/* Allocate and initialize the internal cpumask dependend resources. */
+static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
+					     const struct cpumask *pcpumask,
+					     const struct cpumask *cbcpumask)
+{
+	struct parallel_data *pd;
 
-		INIT_WORK(&queue->pwork, padata_parallel_worker);
-		INIT_WORK(&queue->swork, padata_serial_worker);
-	}
+	pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
+	if (!pd)
+		goto err;
 
-	num_cpus = cpumask_weight(pd->cpumask);
-	pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
+	pd->pqueue = alloc_percpu(struct padata_parallel_queue);
+	if (!pd->pqueue)
+		goto err_free_pd;
+
+	pd->squeue = alloc_percpu(struct padata_serial_queue);
+	if (!pd->squeue)
+		goto err_free_pqueue;
+	if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
+		goto err_free_squeue;
 
+	padata_init_pqueues(pd);
+	padata_init_squeues(pd);
 	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
 	atomic_set(&pd->seq_nr, -1);
 	atomic_set(&pd->reorder_objects, 0);
@@ -412,8 +450,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 
 	return pd;
 
-err_free_queue:
-	free_percpu(pd->queue);
+err_free_squeue:
+	free_percpu(pd->squeue);
+err_free_pqueue:
+	free_percpu(pd->pqueue);
 err_free_pd:
 	kfree(pd);
 err:
@@ -422,8 +462,10 @@ err:
 
 static void padata_free_pd(struct parallel_data *pd)
 {
-	free_cpumask_var(pd->cpumask);
-	free_percpu(pd->queue);
+	free_cpumask_var(pd->cpumask.pcpu);
+	free_cpumask_var(pd->cpumask.cbcpu);
+	free_percpu(pd->pqueue);
+	free_percpu(pd->squeue);
 	kfree(pd);
 }
 
@@ -431,11 +473,12 @@ static void padata_free_pd(struct parallel_data *pd)
 static void padata_flush_queues(struct parallel_data *pd)
 {
 	int cpu;
-	struct padata_queue *queue;
+	struct padata_parallel_queue *pqueue;
+	struct padata_serial_queue *squeue;
 
-	for_each_cpu(cpu, pd->cpumask) {
-		queue = per_cpu_ptr(pd->queue, cpu);
-		flush_work(&queue->pwork);
+	for_each_cpu(cpu, pd->cpumask.pcpu) {
+		pqueue = per_cpu_ptr(pd->pqueue, cpu);
+		flush_work(&pqueue->work);
 	}
 
 	del_timer_sync(&pd->timer);
@@ -443,9 +486,9 @@ static void padata_flush_queues(struct parallel_data *pd)
 	if (atomic_read(&pd->reorder_objects))
 		padata_reorder(pd);
 
-	for_each_cpu(cpu, pd->cpumask) {
-		queue = per_cpu_ptr(pd->queue, cpu);
-		flush_work(&queue->swork);
+	for_each_cpu(cpu, pd->cpumask.cbcpu) {
+		squeue = per_cpu_ptr(pd->squeue, cpu);
+		flush_work(&squeue->work);
 	}
 
 	BUG_ON(atomic_read(&pd->refcnt) != 0);
@@ -475,21 +518,63 @@ static void padata_replace(struct padata_instance *pinst,
 			   struct parallel_data *pd_new)
 {
 	struct parallel_data *pd_old = pinst->pd;
+	int notification_mask = 0;
 
 	pinst->flags |= PADATA_RESET;
 
 	rcu_assign_pointer(pinst->pd, pd_new);
 
 	synchronize_rcu();
+	if (!pd_old)
+		goto out;
 
-	if (pd_old) {
-		padata_flush_queues(pd_old);
-		padata_free_pd(pd_old);
-	}
+	padata_flush_queues(pd_old);
+	if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
+		notification_mask |= PADATA_CPU_PARALLEL;
+	if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
+		notification_mask |= PADATA_CPU_SERIAL;
+
+	padata_free_pd(pd_old);
+	if (notification_mask)
+		blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
+					     notification_mask, pinst);
 
+out:
 	pinst->flags &= ~PADATA_RESET;
 }
 
+/**
+ * padata_register_cpumask_notifier - Registers a notifier that will be called
+ *                             if either pcpu or cbcpu or both cpumasks change.
+ *
+ * @pinst: A poineter to padata instance
+ * @nblock: A pointer to notifier block.
+ */
+int padata_register_cpumask_notifier(struct padata_instance *pinst,
+				     struct notifier_block *nblock)
+{
+	return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
+						nblock);
+}
+EXPORT_SYMBOL(padata_register_cpumask_notifier);
+
+/**
+ * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
+ *        registered earlier  using padata_register_cpumask_notifier
+ *
+ * @pinst: A pointer to data instance.
+ * @nlock: A pointer to notifier block.
+ */
+int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
+				       struct notifier_block *nblock)
+{
+	return blocking_notifier_chain_unregister(
+		&pinst->cpumask_change_notifier,
+		nblock);
+}
+EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
+
+
 /* If cpumask contains no active cpu, we mark the instance as invalid. */
 static bool padata_validate_cpumask(struct padata_instance *pinst,
 				    const struct cpumask *cpumask)
@@ -504,13 +589,82 @@ static bool padata_validate_cpumask(struct padata_instance *pinst,
 }
 
 /**
- * padata_set_cpumask - set the cpumask that padata should use
+ * padata_get_cpumask: Fetch serial or parallel cpumask from the
+ *                     given padata instance and copy it to @out_mask
+ *
+ * @pinst: A pointer to padata instance
+ * @cpumask_type: Specifies which cpumask will be copied.
+ *                Possible values are PADATA_CPU_SERIAL *or* PADATA_CPU_PARALLEL
+ *                corresponding to serial and parallel cpumask respectively.
+ * @out_mask: A pointer to cpumask structure where selected
+ *            cpumask will be copied.
+ */
+int padata_get_cpumask(struct padata_instance *pinst,
+		       int cpumask_type, struct cpumask *out_mask)
+{
+	struct parallel_data *pd;
+	int ret = 0;
+
+	rcu_read_lock_bh();
+	pd = rcu_dereference(pinst->pd);
+	switch (cpumask_type) {
+	case PADATA_CPU_SERIAL:
+		cpumask_copy(out_mask, pd->cpumask.cbcpu);
+		break;
+	case PADATA_CPU_PARALLEL:
+		cpumask_copy(out_mask, pd->cpumask.pcpu);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	rcu_read_unlock_bh();
+	return ret;
+}
+EXPORT_SYMBOL(padata_get_cpumask);
+
+/**
+ * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
+ *                     equivalent to @cpumask.
  *
  * @pinst: padata instance
+ * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
+ *                to parallel and serial cpumasks respectively.
  * @cpumask: the cpumask to use
  */
-int padata_set_cpumask(struct padata_instance *pinst,
-			cpumask_var_t cpumask)
+int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
+		       cpumask_var_t cpumask)
+{
+	struct cpumask *serial_mask, *parallel_mask;
+
+	switch (cpumask_type) {
+	case PADATA_CPU_PARALLEL:
+		serial_mask = pinst->cpumask.cbcpu;
+		parallel_mask = cpumask;
+		break;
+	case PADATA_CPU_SERIAL:
+		parallel_mask = pinst->cpumask.pcpu;
+		serial_mask = cpumask;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
+}
+EXPORT_SYMBOL(padata_set_cpumask);
+
+/**
+ * __padata_set_cpumasks - Set both parallel and serial cpumasks. The first
+ *                         one is used by parallel workers and the second one
+ *                         by the wokers doing serialization.
+ *
+ * @pinst: padata instance
+ * @pcpumask: the cpumask to use for parallel workers
+ * @cbcpumask: the cpumsak to use for serial workers
+ */
+int __padata_set_cpumasks(struct padata_instance *pinst,
+			  cpumask_var_t pcpumask, cpumask_var_t cbcpumask)
 {
 	int valid;
 	int err = 0;
@@ -518,7 +672,13 @@ int padata_set_cpumask(struct padata_instance *pinst,
 
 	mutex_lock(&pinst->lock);
 
-	valid = padata_validate_cpumask(pinst, cpumask);
+	valid = padata_validate_cpumask(pinst, pcpumask);
+	if (!valid) {
+		__padata_stop(pinst);
+		goto out_replace;
+	}
+
+	valid = padata_validate_cpumask(pinst, cbcpumask);
 	if (!valid) {
 		__padata_stop(pinst);
 		goto out_replace;
@@ -526,14 +686,15 @@ int padata_set_cpumask(struct padata_instance *pinst,
 
 	get_online_cpus();
 
-	pd = padata_alloc_pd(pinst, cpumask);
+	pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
 	if (!pd) {
 		err = -ENOMEM;
 		goto out;
 	}
 
 out_replace:
-	cpumask_copy(pinst->cpumask, cpumask);
+	cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+	cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
 
 	padata_replace(pinst, pd);
 
@@ -546,41 +707,57 @@ out:
 	mutex_unlock(&pinst->lock);
 
 	return err;
+
 }
-EXPORT_SYMBOL(padata_set_cpumask);
+EXPORT_SYMBOL(__padata_set_cpumasks);
 
 static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
 {
 	struct parallel_data *pd;
 
 	if (cpumask_test_cpu(cpu, cpu_active_mask)) {
-		pd = padata_alloc_pd(pinst, pinst->cpumask);
+		pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
+				     pinst->cpumask.cbcpu);
 		if (!pd)
 			return -ENOMEM;
 
 		padata_replace(pinst, pd);
 
-		if (padata_validate_cpumask(pinst, pinst->cpumask))
+		if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
+		    padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
 			__padata_start(pinst);
 	}
 
 	return 0;
 }
 
-/**
- * padata_add_cpu - add a cpu to the padata cpumask
+ /**
+ * padata_add_cpu - add a cpu to one or both(parallel and serial)
+ *                  padata cpumasks.
  *
  * @pinst: padata instance
  * @cpu: cpu to add
+ * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
+ *        The @mask may be any combination of the following flags:
+ *          PADATA_CPU_SERIAL   - serial cpumask
+ *          PADATA_CPU_PARALLEL - parallel cpumask
  */
-int padata_add_cpu(struct padata_instance *pinst, int cpu)
+
+int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
 {
 	int err;
 
+	if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
+		return -EINVAL;
+
 	mutex_lock(&pinst->lock);
 
 	get_online_cpus();
-	cpumask_set_cpu(cpu, pinst->cpumask);
+	if (mask & PADATA_CPU_SERIAL)
+		cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
+	if (mask & PADATA_CPU_PARALLEL)
+		cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
+
 	err = __padata_add_cpu(pinst, cpu);
 	put_online_cpus();
 
@@ -596,13 +773,15 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
 
 	if (cpumask_test_cpu(cpu, cpu_online_mask)) {
 
-		if (!padata_validate_cpumask(pinst, pinst->cpumask)) {
+		if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
+		    !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) {
 			__padata_stop(pinst);
 			padata_replace(pinst, pd);
 			goto out;
 		}
 
-		pd = padata_alloc_pd(pinst, pinst->cpumask);
+		pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
+				     pinst->cpumask.cbcpu);
 		if (!pd)
 			return -ENOMEM;
 
@@ -613,20 +792,32 @@ out:
 	return 0;
 }
 
-/**
- * padata_remove_cpu - remove a cpu from the padata cpumask
+ /**
+ * padata_remove_cpu - remove a cpu from the one or both(serial and paralell)
+ *                     padata cpumasks.
  *
  * @pinst: padata instance
  * @cpu: cpu to remove
+ * @mask: bitmask specifying from which cpumask @cpu should be removed
+ *        The @mask may be any combination of the following flags:
+ *          PADATA_CPU_SERIAL   - serial cpumask
+ *          PADATA_CPU_PARALLEL - parallel cpumask
  */
-int padata_remove_cpu(struct padata_instance *pinst, int cpu)
+int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
 {
 	int err;
 
+	if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
+		return -EINVAL;
+
 	mutex_lock(&pinst->lock);
 
 	get_online_cpus();
-	cpumask_clear_cpu(cpu, pinst->cpumask);
+	if (mask & PADATA_CPU_SERIAL)
+		cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
+	if (mask & PADATA_CPU_PARALLEL)
+		cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
+
 	err = __padata_remove_cpu(pinst, cpu);
 	put_online_cpus();
 
@@ -672,6 +863,14 @@ void padata_stop(struct padata_instance *pinst)
 EXPORT_SYMBOL(padata_stop);
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
+{
+	return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
+		cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
+}
+
+
 static int padata_cpu_callback(struct notifier_block *nfb,
 			       unsigned long action, void *hcpu)
 {
@@ -684,7 +883,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+		if (!pinst_has_cpu(pinst, cpu))
 			break;
 		mutex_lock(&pinst->lock);
 		err = __padata_add_cpu(pinst, cpu);
@@ -695,7 +894,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
 
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
-		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+		if (!pinst_has_cpu(pinst, cpu))
 			break;
 		mutex_lock(&pinst->lock);
 		err = __padata_remove_cpu(pinst, cpu);
@@ -706,7 +905,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
 
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
-		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+		if (!pinst_has_cpu(pinst, cpu))
 			break;
 		mutex_lock(&pinst->lock);
 		__padata_remove_cpu(pinst, cpu);
@@ -714,7 +913,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
 
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+		if (!pinst_has_cpu(pinst, cpu))
 			break;
 		mutex_lock(&pinst->lock);
 		__padata_add_cpu(pinst, cpu);
@@ -726,13 +925,29 @@ static int padata_cpu_callback(struct notifier_block *nfb,
 #endif
 
 /**
- * padata_alloc - allocate and initialize a padata instance
+ * padata_alloc - Allocate and initialize padata instance.
+ *                Use default cpumask(cpu_possible_mask)
+ *                for serial and parallel workes.
+ *
+ * @wq: workqueue to use for the allocated padata instance
+ */
+struct padata_instance *padata_alloc(struct workqueue_struct *wq)
+{
+	return __padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+}
+EXPORT_SYMBOL(padata_alloc);
+
+/**
+ * __padata_alloc - allocate and initialize a padata instance
+ *                  and specify cpumasks for serial and parallel workers.
  *
- * @cpumask: cpumask that padata uses for parallelization
  * @wq: workqueue to use for the allocated padata instance
+ * @pcpumask: cpumask that will be used for padata parallelization
+ * @cbcpumask: cpumask that will be used for padata serialization
  */
-struct padata_instance *padata_alloc(const struct cpumask *cpumask,
-				     struct workqueue_struct *wq)
+struct padata_instance *__padata_alloc(struct workqueue_struct *wq,
+				       const struct cpumask *pcpumask,
+				       const struct cpumask *cbcpumask)
 {
 	struct padata_instance *pinst;
 	struct parallel_data *pd = NULL;
@@ -742,21 +957,26 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
 		goto err;
 
 	get_online_cpus();
-
-	if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+		goto err_free_inst;
+	if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
+		free_cpumask_var(pinst->cpumask.pcpu);
 		goto err_free_inst;
-
-	if (padata_validate_cpumask(pinst, cpumask)) {
-		pd = padata_alloc_pd(pinst, cpumask);
-		if (!pd)
-			goto err_free_mask;
 	}
+	if (!padata_validate_cpumask(pinst, pcpumask) ||
+	    !padata_validate_cpumask(pinst, cbcpumask))
+		goto err_free_masks;
+
+	pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
+	if (!pd)
+		goto err_free_masks;
 
 	rcu_assign_pointer(pinst->pd, pd);
 
 	pinst->wq = wq;
 
-	cpumask_copy(pinst->cpumask, cpumask);
+	cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+	cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
 
 	pinst->flags = 0;
 
@@ -768,19 +988,21 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
 
 	put_online_cpus();
 
+	BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
 	mutex_init(&pinst->lock);
 
 	return pinst;
 
-err_free_mask:
-	free_cpumask_var(pinst->cpumask);
+err_free_masks:
+	free_cpumask_var(pinst->cpumask.pcpu);
+	free_cpumask_var(pinst->cpumask.cbcpu);
 err_free_inst:
 	kfree(pinst);
 	put_online_cpus();
 err:
 	return NULL;
 }
-EXPORT_SYMBOL(padata_alloc);
+EXPORT_SYMBOL(__padata_alloc);
 
 /**
  * padata_free - free a padata instance
@@ -795,7 +1017,8 @@ void padata_free(struct padata_instance *pinst)
 
 	padata_stop(pinst);
 	padata_free_pd(pinst->pd);
-	free_cpumask_var(pinst->cpumask);
+	free_cpumask_var(pinst->cpumask.pcpu);
+	free_cpumask_var(pinst->cpumask.cbcpu);
 	kfree(pinst);
 }
 EXPORT_SYMBOL(padata_free);
-- 
cgit v1.2.3-59-g8ed1b


From 5e017dc3f8bc9e4a28983666e6bc00114a2018bb Mon Sep 17 00:00:00 2001
From: Dan Kruchinin <dkruchinin@acm.org>
Date: Wed, 14 Jul 2010 14:33:08 +0400
Subject: padata: Added sysfs primitives to padata subsystem

Added sysfs primitives to padata subsystem. Now API user may
embedded kobject each padata instance contains into any sysfs
hierarchy. For now padata sysfs interface provides only
two objects:
    serial_cpumask   [RW] - cpumask for serial workers
    parallel_cpumask [RW] - cpumask for parallel workers

Signed-off-by: Dan Kruchinin <dkruchinin@acm.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |   5 +-
 kernel/padata.c        | 155 ++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 150 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 621e7736690c..293ad46ffced 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -26,6 +26,7 @@
 #include <linux/list.h>
 #include <linux/timer.h>
 #include <linux/notifier.h>
+#include <linux/kobject.h>
 
 #define PADATA_CPU_SERIAL   0x01
 #define PADATA_CPU_PARALLEL 0x02
@@ -142,7 +143,8 @@ struct parallel_data {
  *           cbcpu for parallel and serial works respectivly.
  * @cpumask_change_notifier: Notifiers chain for user-defined notify
  *            callbacks that will be called when either @pcpu or @cbcpu
- *             or both cpumasks change.
+ *            or both cpumasks change.
+ * @kobj: padata instance kernel object.
  * @lock: padata instance lock.
  * @flags: padata flags.
  */
@@ -155,6 +157,7 @@ struct padata_instance {
 		cpumask_var_t		 cbcpu;
 	} cpumask;
 	struct blocking_notifier_head	 cpumask_change_notifier;
+	struct kobject                   kobj;
 	struct mutex			 lock;
 	u8				 flags;
 #define	PADATA_INIT	1
diff --git a/kernel/padata.c b/kernel/padata.c
index 84d0ca9dac9c..526f9ea2fcc8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,6 +26,7 @@
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/sysfs.h>
 #include <linux/rcupdate.h>
 
 #define MAX_SEQ_NR (INT_MAX - NR_CPUS)
@@ -924,6 +925,149 @@ static int padata_cpu_callback(struct notifier_block *nfb,
 }
 #endif
 
+static void __padata_free(struct padata_instance *pinst)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+	unregister_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+
+	padata_stop(pinst);
+	padata_free_pd(pinst->pd);
+	free_cpumask_var(pinst->cpumask.pcpu);
+	free_cpumask_var(pinst->cpumask.cbcpu);
+	kfree(pinst);
+}
+
+#define kobj2pinst(_kobj)					\
+	container_of(_kobj, struct padata_instance, kobj)
+#define attr2pentry(_attr)					\
+	container_of(_attr, struct padata_sysfs_entry, attr)
+
+static void padata_sysfs_release(struct kobject *kobj)
+{
+	struct padata_instance *pinst = kobj2pinst(kobj);
+	__padata_free(pinst);
+}
+
+struct padata_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
+	ssize_t (*store)(struct padata_instance *, struct attribute *,
+			 const char *, size_t);
+};
+
+static ssize_t show_cpumask(struct padata_instance *pinst,
+			    struct attribute *attr,  char *buf)
+{
+	struct cpumask *cpumask;
+	ssize_t len;
+
+	mutex_lock(&pinst->lock);
+	if (!strcmp(attr->name, "serial_cpumask"))
+		cpumask = pinst->cpumask.cbcpu;
+	else
+		cpumask = pinst->cpumask.pcpu;
+
+	len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
+			       nr_cpu_ids);
+	if (PAGE_SIZE - len < 2)
+		len = -EINVAL;
+	else
+		len += sprintf(buf + len, "\n");
+
+	mutex_unlock(&pinst->lock);
+	return len;
+}
+
+static ssize_t store_cpumask(struct padata_instance *pinst,
+			     struct attribute *attr,
+			     const char *buf, size_t count)
+{
+	cpumask_var_t new_cpumask;
+	ssize_t ret;
+	int mask_type;
+
+	if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
+			   nr_cpumask_bits);
+	if (ret < 0)
+		goto out;
+
+	mask_type = !strcmp(attr->name, "serial_cpumask") ?
+		PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
+	ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
+	if (!ret)
+		ret = count;
+
+out:
+	free_cpumask_var(new_cpumask);
+	return ret;
+}
+
+#define PADATA_ATTR_RW(_name, _show_name, _store_name)		\
+	static struct padata_sysfs_entry _name##_attr =		\
+		__ATTR(_name, 0644, _show_name, _store_name)
+#define PADATA_ATTR_RO(_name, _show_name)		\
+	static struct padata_sysfs_entry _name##_attr = \
+		__ATTR(_name, 0400, _show_name, NULL)
+
+PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
+PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
+
+/*
+ * Padata sysfs provides the following objects:
+ * serial_cpumask   [RW] - cpumask for serial workers
+ * parallel_cpumask [RW] - cpumask for parallel workers
+ */
+static struct attribute *padata_default_attrs[] = {
+	&serial_cpumask_attr.attr,
+	&parallel_cpumask_attr.attr,
+	NULL,
+};
+
+static ssize_t padata_sysfs_show(struct kobject *kobj,
+				 struct attribute *attr, char *buf)
+{
+	struct padata_instance *pinst;
+	struct padata_sysfs_entry *pentry;
+	ssize_t ret = -EIO;
+
+	pinst = kobj2pinst(kobj);
+	pentry = attr2pentry(attr);
+	if (pentry->show)
+		ret = pentry->show(pinst, attr, buf);
+
+	return ret;
+}
+
+static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct padata_instance *pinst;
+	struct padata_sysfs_entry *pentry;
+	ssize_t ret = -EIO;
+
+	pinst = kobj2pinst(kobj);
+	pentry = attr2pentry(attr);
+	if (pentry->show)
+		ret = pentry->store(pinst, attr, buf, count);
+
+	return ret;
+}
+
+static const struct sysfs_ops padata_sysfs_ops = {
+	.show = padata_sysfs_show,
+	.store = padata_sysfs_store,
+};
+
+static struct kobj_type padata_attr_type = {
+	.sysfs_ops = &padata_sysfs_ops,
+	.default_attrs = padata_default_attrs,
+	.release = padata_sysfs_release,
+};
+
 /**
  * padata_alloc - Allocate and initialize padata instance.
  *                Use default cpumask(cpu_possible_mask)
@@ -989,6 +1133,7 @@ struct padata_instance *__padata_alloc(struct workqueue_struct *wq,
 	put_online_cpus();
 
 	BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
+	kobject_init(&pinst->kobj, &padata_attr_type);
 	mutex_init(&pinst->lock);
 
 	return pinst;
@@ -1011,14 +1156,6 @@ EXPORT_SYMBOL(__padata_alloc);
  */
 void padata_free(struct padata_instance *pinst)
 {
-#ifdef CONFIG_HOTPLUG_CPU
-	unregister_hotcpu_notifier(&pinst->cpu_notifier);
-#endif
-
-	padata_stop(pinst);
-	padata_free_pd(pinst->pd);
-	free_cpumask_var(pinst->cpumask.pcpu);
-	free_cpumask_var(pinst->cpumask.cbcpu);
-	kfree(pinst);
+	kobject_put(&pinst->kobj);
 }
 EXPORT_SYMBOL(padata_free);
-- 
cgit v1.2.3-59-g8ed1b


From 7a2e3659b6ffbee4e742cd6f6a60359ad9148720 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave.bueso@gmail.com>
Date: Tue, 13 Jul 2010 05:53:44 -0400
Subject: reiserfs: typo comment fix

Fix trivial typo in code comment (change adn for and), also change comment
style for proper coding style.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/reiserfs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 3b603f474186..ba394163dea1 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -360,7 +360,7 @@ int is_reiserfs_jr(struct reiserfs_super_block *rs);
 /* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */
 #define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024)
 
-// reiserfs internal error code (used by search_by_key adn fix_nodes))
+/* reiserfs internal error code (used by search_by_key and fix_nodes)) */
 #define CARRY_ON      0
 #define REPEAT_SEARCH -1
 #define IO_ERROR      -2
-- 
cgit v1.2.3-59-g8ed1b


From 3a343ee4509c982552b35fbc99d3213f3bb1acde Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Mon, 12 Jul 2010 19:28:27 +0200
Subject: HID: add HID_QUIRK_HIDINPUT_FORCE

For devices with exotic HID report descriptors, it might be necessary to
make the HID core force the registration of an input device. Make that
possible by introducing a new quirk type.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c | 2 ++
 include/linux/hid.h    | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 866e54ec5fb2..7ccee899b59e 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1157,6 +1157,8 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask)
 
 	if (hdev->quirks & HID_QUIRK_HIDDEV_FORCE)
 		connect_mask |= (HID_CONNECT_HIDDEV_FORCE | HID_CONNECT_HIDDEV);
+	if (hdev->quirks & HID_QUIRK_HIDINPUT_FORCE)
+		connect_mask |= HID_CONNECT_HIDINPUT_FORCE;
 	if (hdev->bus != BUS_USB)
 		connect_mask &= ~HID_CONNECT_HIDDEV;
 	if (hid_hiddev(hdev))
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 895001f7f4b2..42a0f1d11365 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -311,6 +311,7 @@ struct hid_item {
 #define HID_QUIRK_HIDDEV_FORCE			0x00000010
 #define HID_QUIRK_BADPAD			0x00000020
 #define HID_QUIRK_MULTI_INPUT			0x00000040
+#define HID_QUIRK_HIDINPUT_FORCE		0x00000080
 #define HID_QUIRK_SKIP_OUTPUT_REPORTS		0x00010000
 #define HID_QUIRK_FULLSPEED_INTERVAL		0x10000000
 #define HID_QUIRK_NO_INIT_REPORTS		0x20000000
-- 
cgit v1.2.3-59-g8ed1b


From 323f99cbc35c52a65dea9d072b3ecf1e662240d2 Mon Sep 17 00:00:00 2001
From: Tom Lyon <pugs@cisco.com>
Date: Fri, 2 Jul 2010 16:56:14 -0400
Subject: iommu-api: Extension to check for interrupt remapping

This patch allows IOMMU users to determine whether the
hardware and software support safe, isolated interrupt
remapping.  Not all Intel IOMMUs have the hardware, and the
software for AMD is not there yet.

Signed-off-by: Tom Lyon <pugs@cisco.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/intel-iommu.c | 2 ++
 include/linux/iommu.h     | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index c9171be74564..6a5af18faf68 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3698,6 +3698,8 @@ static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
 
 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
 		return dmar_domain->iommu_snooping;
+	if (cap == IOMMU_CAP_INTR_REMAP)
+		return intr_remapping_enabled;
 
 	return 0;
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index be22ad83689c..0a2ba4098996 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -30,6 +30,7 @@ struct iommu_domain {
 };
 
 #define IOMMU_CAP_CACHE_COHERENCY	0x1
+#define IOMMU_CAP_INTR_REMAP		0x2	/* isolates device intrs */
 
 struct iommu_ops {
 	int (*domain_init)(struct iommu_domain *domain);
-- 
cgit v1.2.3-59-g8ed1b


From bd27290a593f80cb99e95287cb29c72c0d57608b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 19 Jul 2010 09:35:40 -0700
Subject: net: 64bit stats for netdev_queue

Since struct netdev_queue tx_bytes/tx_packets/tx_dropped are already
protected by _xmit_lock, its easy to convert these fields to u64 instead
of unsigned long.
This completes 64bit stats for devices using them (vlan, macvlan, ...)

Strictly, we could avoid the locking in dev_txq_stats_fold() on 64bit
arches, but its slow path and we prefer keep it simple.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 6 +++---
 net/core/dev.c            | 4 +++-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fdc3f2992230..b6262898ece0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -501,9 +501,9 @@ struct netdev_queue {
 	 * please use this field instead of dev->trans_start
 	 */
 	unsigned long		trans_start;
-	unsigned long		tx_bytes;
-	unsigned long		tx_packets;
-	unsigned long		tx_dropped;
+	u64			tx_bytes;
+	u64			tx_packets;
+	u64			tx_dropped;
 } ____cacheline_aligned_in_smp;
 
 #ifdef CONFIG_RPS
diff --git a/net/core/dev.c b/net/core/dev.c
index 1c002c7ef5d5..9de75cdade56 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5282,15 +5282,17 @@ void netdev_run_todo(void)
 void dev_txq_stats_fold(const struct net_device *dev,
 			struct rtnl_link_stats64 *stats)
 {
-	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
+	u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
 	unsigned int i;
 	struct netdev_queue *txq;
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		txq = netdev_get_tx_queue(dev, i);
+		spin_lock_bh(&txq->_xmit_lock);
 		tx_bytes   += txq->tx_bytes;
 		tx_packets += txq->tx_packets;
 		tx_dropped += txq->tx_dropped;
+		spin_unlock_bh(&txq->_xmit_lock);
 	}
 	if (tx_bytes || tx_packets || tx_dropped) {
 		stats->tx_bytes   = tx_bytes;
-- 
cgit v1.2.3-59-g8ed1b


From e7c38157c61649e66f853d7b9f109119b8361448 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 19 Jul 2010 22:01:26 -0700
Subject: ipv6: Make IP6CB(skb)->nhoff 16-bit.

Even with jumbograms I cannot see any way in which we would need
to records a larger than 65535 valued next-header offset.

The maximum extension header length is (256 << 3) == 2048.
There are only a handful of extension headers specified which
we'd even accept (say 5 or 6), therefore the largest next-header
offset we'd ever have to contend with is something less than
say 16k.

Therefore make it a u16 instead of a u32.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 940e21595351..ab9e9e89e407 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -246,7 +246,7 @@ struct inet6_skb_parm {
 	__u16			srcrt;
 	__u16			dst1;
 	__u16			lastopt;
-	__u32			nhoff;
+	__u16			nhoff;
 	__u16			flags;
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	__u16			dsthao;
-- 
cgit v1.2.3-59-g8ed1b


From eb7beb5c09af75494234ea6acd09d0a647cf7338 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 16 Jul 2010 00:50:03 +0200
Subject: tracing: Remove special traces

Special traces type was only used by sysprof. Lets remove it now
that sysprof ftrace plugin has been dropped.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Soeren Sandmann <sandmann@daimi.au.dk>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/kernel.h        |  5 ----
 include/linux/sched.h         | 12 --------
 kernel/trace/trace.c          | 55 ------------------------------------
 kernel/trace/trace.h          |  7 -----
 kernel/trace/trace_entries.h  | 17 -----------
 kernel/trace/trace_output.c   | 66 -------------------------------------------
 kernel/trace/trace_selftest.c |  1 -
 7 files changed, 163 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8317ec4b9f3b..adee958b5989 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -508,9 +508,6 @@ extern void tracing_start(void);
 extern void tracing_stop(void);
 extern void ftrace_off_permanent(void);
 
-extern void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
-
 static inline void __attribute__ ((format (printf, 1, 2)))
 ____trace_printk_check_format(const char *fmt, ...)
 {
@@ -586,8 +583,6 @@ __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
 
 extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
 #else
-static inline void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
 trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 747fcaedddb7..f751ea9dcb7b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2434,18 +2434,6 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_TRACING
-extern void
-__trace_special(void *__tr, void *__data,
-		unsigned long arg1, unsigned long arg2, unsigned long arg3);
-#else
-static inline void
-__trace_special(void *__tr, void *__data,
-		unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-}
-#endif
-
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 78a49e67f7db..d9a4aa02c384 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1331,61 +1331,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
 
 #endif /* CONFIG_STACKTRACE */
 
-static void
-ftrace_trace_special(void *__tr,
-		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
-		     int pc)
-{
-	struct ftrace_event_call *call = &event_special;
-	struct ring_buffer_event *event;
-	struct trace_array *tr = __tr;
-	struct ring_buffer *buffer = tr->buffer;
-	struct special_entry *entry;
-
-	event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
-					  sizeof(*entry), 0, pc);
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	entry->arg1			= arg1;
-	entry->arg2			= arg2;
-	entry->arg3			= arg3;
-
-	if (!filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(buffer, event, 0, pc);
-}
-
-void
-__trace_special(void *__tr, void *__data,
-		unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-	ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
-}
-
-void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	int cpu;
-	int pc;
-
-	if (tracing_disabled)
-		return;
-
-	pc = preempt_count();
-	local_irq_save(flags);
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-
-	if (likely(atomic_inc_return(&data->disabled) == 1))
-		ftrace_trace_special(tr, arg1, arg2, arg3, pc);
-
-	atomic_dec(&data->disabled);
-	local_irq_restore(flags);
-}
-
 /**
  * trace_vbprintk - write binary msg to tracing buffer
  *
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2114b4c1150f..638a5887e2ec 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,7 +22,6 @@ enum trace_type {
 	TRACE_STACK,
 	TRACE_PRINT,
 	TRACE_BPRINT,
-	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
 	TRACE_BRANCH,
@@ -189,7 +188,6 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
-		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
@@ -332,11 +330,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
 				struct task_struct *wakee,
 				struct task_struct *cur,
 				unsigned long flags, int pc);
-void trace_special(struct trace_array *tr,
-		   struct trace_array_cpu *data,
-		   unsigned long arg1,
-		   unsigned long arg2,
-		   unsigned long arg3, int pc);
 void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 84128371f254..e3dfecaf13e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -150,23 +150,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
 		)
 );
 
-/*
- * Special (free-form) trace entry:
- */
-FTRACE_ENTRY(special, special_entry,
-
-	TRACE_SPECIAL,
-
-	F_STRUCT(
-		__field(	unsigned long,	arg1	)
-		__field(	unsigned long,	arg2	)
-		__field(	unsigned long,	arg3	)
-	),
-
-	F_printk("(%08lx) (%08lx) (%08lx)",
-		 __entry->arg1, __entry->arg2, __entry->arg3)
-);
-
 /*
  * Stack-trace entry:
  */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..a46197b80b7f 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1069,65 +1069,6 @@ static struct trace_event trace_wake_event = {
 	.funcs		= &trace_wake_funcs,
 };
 
-/* TRACE_SPECIAL */
-static enum print_line_t trace_special_print(struct trace_iterator *iter,
-					     int flags, struct trace_event *event)
-{
-	struct special_entry *field;
-
-	trace_assign_type(field, iter->ent);
-
-	if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
-			      field->arg1,
-			      field->arg2,
-			      field->arg3))
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t trace_special_hex(struct trace_iterator *iter,
-					   int flags, struct trace_event *event)
-{
-	struct special_entry *field;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(field, iter->ent);
-
-	SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
-	SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
-	SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t trace_special_bin(struct trace_iterator *iter,
-					   int flags, struct trace_event *event)
-{
-	struct special_entry *field;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(field, iter->ent);
-
-	SEQ_PUT_FIELD_RET(s, field->arg1);
-	SEQ_PUT_FIELD_RET(s, field->arg2);
-	SEQ_PUT_FIELD_RET(s, field->arg3);
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static struct trace_event_functions trace_special_funcs = {
-	.trace		= trace_special_print,
-	.raw		= trace_special_print,
-	.hex		= trace_special_hex,
-	.binary		= trace_special_bin,
-};
-
-static struct trace_event trace_special_event = {
-	.type		= TRACE_SPECIAL,
-	.funcs		= &trace_special_funcs,
-};
-
 /* TRACE_STACK */
 
 static enum print_line_t trace_stack_print(struct trace_iterator *iter,
@@ -1161,9 +1102,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 static struct trace_event_functions trace_stack_funcs = {
 	.trace		= trace_stack_print,
-	.raw		= trace_special_print,
-	.hex		= trace_special_hex,
-	.binary		= trace_special_bin,
 };
 
 static struct trace_event trace_stack_event = {
@@ -1194,9 +1132,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 static struct trace_event_functions trace_user_stack_funcs = {
 	.trace		= trace_user_stack_print,
-	.raw		= trace_special_print,
-	.hex		= trace_special_hex,
-	.binary		= trace_special_bin,
 };
 
 static struct trace_event trace_user_stack_event = {
@@ -1314,7 +1249,6 @@ static struct trace_event *events[] __initdata = {
 	&trace_fn_event,
 	&trace_ctx_event,
 	&trace_wake_event,
-	&trace_special_event,
 	&trace_stack_event,
 	&trace_user_stack_event,
 	&trace_bprint_event,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 6ed05ee6cbc7..155a415b3209 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,7 +13,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_WAKE:
 	case TRACE_STACK:
 	case TRACE_PRINT:
-	case TRACE_SPECIAL:
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
-- 
cgit v1.2.3-59-g8ed1b


From e870e9a1240bcef1157ffaaf71dac63362e71904 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Jul 2010 11:07:32 +0800
Subject: tracing: Allow to disable cmdline recording

We found that even enabling a single trace event that will rarely be
triggered can add big overhead to context switch.

(lmbench context switch test)
 -------------------------------------------------
 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K
 ctxsw  ctxsw  ctxsw ctxsw  ctxsw   ctxsw   ctxsw
------ ------ ------ ------ ------ ------- -------
  2.19   2.3   2.21   2.56   2.13     2.54    2.07
  2.39   2.51  2.35   2.75   2.27     2.81    2.24

The overhead is 6% ~ 11%.

It's because when a trace event is enabled 3 tracepoints (sched_switch,
sched_wakeup, sched_wakeup_new) will be activated to map pid to cmdname.

We'd like to avoid this overhead, so add a trace option '(no)record-cmd'
to allow to disable cmdline recording.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4C2D57F4.2050204@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  7 +++++--
 kernel/trace/trace.c         |  6 +++++-
 kernel/trace/trace.h         |  3 +++
 kernel/trace/trace_events.c  | 30 ++++++++++++++++++++++++++++--
 4 files changed, 41 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 01df7ca4ead7..2b7b1395b4d5 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -152,11 +152,13 @@ extern int ftrace_event_reg(struct ftrace_event_call *event,
 enum {
 	TRACE_EVENT_FL_ENABLED_BIT,
 	TRACE_EVENT_FL_FILTERED_BIT,
+	TRACE_EVENT_FL_RECORDED_CMD_BIT,
 };
 
 enum {
-	TRACE_EVENT_FL_ENABLED	= (1 << TRACE_EVENT_FL_ENABLED_BIT),
-	TRACE_EVENT_FL_FILTERED	= (1 << TRACE_EVENT_FL_FILTERED_BIT),
+	TRACE_EVENT_FL_ENABLED		= (1 << TRACE_EVENT_FL_ENABLED_BIT),
+	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
+	TRACE_EVENT_FL_RECORDED_CMD	= (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT),
 };
 
 struct ftrace_event_call {
@@ -174,6 +176,7 @@ struct ftrace_event_call {
 	 * 32 bit flags:
 	 *   bit 1:		enabled
 	 *   bit 2:		filter_active
+	 *   bit 3:		enabled cmd record
 	 *
 	 * Changes to flags must hold the event_mutex.
 	 *
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8683dec6946b..af9042977c08 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -344,7 +344,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
-	TRACE_ITER_GRAPH_TIME;
+	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
 
 static int trace_stop_count;
 static DEFINE_SPINLOCK(tracing_start_lock);
@@ -428,6 +428,7 @@ static const char *trace_options[] = {
 	"latency-format",
 	"sleep-time",
 	"graph-time",
+	"record-cmd",
 	NULL
 };
 
@@ -2561,6 +2562,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
 		trace_flags |= mask;
 	else
 		trace_flags &= ~mask;
+
+	if (mask == TRACE_ITER_RECORD_CMD)
+		trace_event_enable_cmd_record(enabled);
 }
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 84d3f123e86f..7778f067fc8b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -591,6 +591,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_LATENCY_FMT		= 0x20000,
 	TRACE_ITER_SLEEP_TIME		= 0x40000,
 	TRACE_ITER_GRAPH_TIME		= 0x80000,
+	TRACE_ITER_RECORD_CMD		= 0x100000,
 };
 
 /*
@@ -723,6 +724,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
+extern void trace_event_enable_cmd_record(bool enable);
+
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e8e6043f4d29..09b4fa6e4d3b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -170,6 +170,26 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
 }
 EXPORT_SYMBOL_GPL(ftrace_event_reg);
 
+void trace_event_enable_cmd_record(bool enable)
+{
+	struct ftrace_event_call *call;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (!(call->flags & TRACE_EVENT_FL_ENABLED))
+			continue;
+
+		if (enable) {
+			tracing_start_cmdline_record();
+			call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+		} else {
+			tracing_stop_cmdline_record();
+			call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+		}
+	}
+	mutex_unlock(&event_mutex);
+}
+
 static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 {
@@ -179,13 +199,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 	case 0:
 		if (call->flags & TRACE_EVENT_FL_ENABLED) {
 			call->flags &= ~TRACE_EVENT_FL_ENABLED;
-			tracing_stop_cmdline_record();
+			if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
+				tracing_stop_cmdline_record();
+				call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+			}
 			call->class->reg(call, TRACE_REG_UNREGISTER);
 		}
 		break;
 	case 1:
 		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
-			tracing_start_cmdline_record();
+			if (trace_flags & TRACE_ITER_RECORD_CMD) {
+				tracing_start_cmdline_record();
+				call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+			}
 			ret = call->class->reg(call, TRACE_REG_REGISTER);
 			if (ret) {
 				tracing_stop_cmdline_record();
-- 
cgit v1.2.3-59-g8ed1b


From bc289ae98b75d93228d24f521ef02a076e506e94 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 3 Jun 2010 18:26:24 +0800
Subject: tracing: Reduce latency and remove percpu trace_seq

__print_flags() and __print_symbolic() use percpu trace_seq:

1) Its memory is allocated at compile time, it wastes memory if we don't use tracing.
2) It is percpu data and it wastes more memory for multi-cpus system.
3) It disables preemption when it executes its core routine
   "trace_seq_printf(s, "%s: ", #call);" and introduces latency.

So we move this trace_seq to struct trace_iterator.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4C078350.7090106@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  5 +++--
 include/trace/ftrace.h       | 12 +++---------
 kernel/trace/trace_output.c  |  3 ---
 3 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 2b7b1395b4d5..02b8b24f8f51 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -11,8 +11,6 @@ struct trace_array;
 struct tracer;
 struct dentry;
 
-DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq);
-
 struct trace_print_flags {
 	unsigned long		mask;
 	const char		*name;
@@ -58,6 +56,9 @@ struct trace_iterator {
 	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
 	unsigned long		iter_flags;
 
+	/* trace_seq for __print_flags() and __print_symbolic() etc. */
+	struct trace_seq	tmp_seq;
+
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
 	struct trace_entry	*ent;
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 55c1fd1bbc3d..fb783d94fc54 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -145,7 +145,7 @@
  *	struct trace_seq *s = &iter->seq;
  *	struct ftrace_raw_<call> *field; <-- defined in stage 1
  *	struct trace_entry *entry;
- *	struct trace_seq *p;
+ *	struct trace_seq *p = &iter->tmp_seq;
  *	int ret;
  *
  *	entry = iter->ent;
@@ -157,12 +157,10 @@
  *
  *	field = (typeof(field))entry;
  *
- *	p = &get_cpu_var(ftrace_event_seq);
  *	trace_seq_init(p);
  *	ret = trace_seq_printf(s, "%s: ", <call>);
  *	if (ret)
  *		ret = trace_seq_printf(s, <TP_printk> "\n");
- *	put_cpu();
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
  *
@@ -216,7 +214,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
 	struct trace_seq *s = &iter->seq;				\
 	struct ftrace_raw_##call *field;				\
 	struct trace_entry *entry;					\
-	struct trace_seq *p;						\
+	struct trace_seq *p = &iter->tmp_seq;				\
 	int ret;							\
 									\
 	event = container_of(trace_event, struct ftrace_event_call,	\
@@ -231,12 +229,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
 									\
 	field = (typeof(field))entry;					\
 									\
-	p = &get_cpu_var(ftrace_event_seq);				\
 	trace_seq_init(p);						\
 	ret = trace_seq_printf(s, "%s: ", event->name);			\
 	if (ret)							\
 		ret = trace_seq_printf(s, print);			\
-	put_cpu();							\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
@@ -255,7 +251,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
 	struct trace_seq *s = &iter->seq;				\
 	struct ftrace_raw_##template *field;				\
 	struct trace_entry *entry;					\
-	struct trace_seq *p;						\
+	struct trace_seq *p = &iter->tmp_seq;				\
 	int ret;							\
 									\
 	entry = iter->ent;						\
@@ -267,12 +263,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
 									\
 	field = (typeof(field))entry;					\
 									\
-	p = &get_cpu_var(ftrace_event_seq);				\
 	trace_seq_init(p);						\
 	ret = trace_seq_printf(s, "%s: ", #call);			\
 	if (ret)							\
 		ret = trace_seq_printf(s, print);			\
-	put_cpu();							\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 57c1b4596470..1ba64d3cc567 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,9 +16,6 @@
 
 DECLARE_RWSEM(trace_event_mutex);
 
-DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
-EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
-
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
-- 
cgit v1.2.3-59-g8ed1b


From ade7ce31c22e961dfbe1a6d57fd362c90c187cbd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 10:56:01 +0200
Subject: quota: Clean up the namespace in dqblk_xfs.h

Almost all identifiers use the FS_* namespace, so rename the missing few
XFS_* ones to FS_* as well.  Without this some people might get upset
about having too many XFS names in generic code.

Acked-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/gfs2/quota.c                 | 10 +++++-----
 fs/quota/dquot.c                |  2 +-
 fs/xfs/linux-2.6/xfs_quotaops.c | 10 +++++-----
 fs/xfs/quota/xfs_qm_syscalls.c  | 32 ++++++++++++++++----------------
 include/linux/dqblk_xfs.h       | 24 ++++++++++++------------
 5 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index b256d6f24288..ce345f8c69c2 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1455,10 +1455,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 
 	switch (sdp->sd_args.ar_quota) {
 	case GFS2_QUOTA_ON:
-		fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
 		/*FALLTHRU*/
 	case GFS2_QUOTA_ACCOUNT:
-		fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
 		break;
 	case GFS2_QUOTA_OFF:
 		break;
@@ -1504,7 +1504,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
 
 	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
 	fdq->d_version = FS_DQUOT_VERSION;
-	fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+	fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
 	fdq->d_id = id;
 	fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
 	fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
@@ -1539,12 +1539,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 	switch(type) {
 	case USRQUOTA:
 		type = QUOTA_USER;
-		if (fdq->d_flags != XFS_USER_QUOTA)
+		if (fdq->d_flags != FS_USER_QUOTA)
 			return -EINVAL;
 		break;
 	case GRPQUOTA:
 		type = QUOTA_GROUP;
-		if (fdq->d_flags != XFS_GROUP_QUOTA)
+		if (fdq->d_flags != FS_GROUP_QUOTA)
 			return -EINVAL;
 		break;
 	default:
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a5974c49a78b..2857fd67ff33 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2281,7 +2281,7 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 	memset(di, 0, sizeof(*di));
 	di->d_version = FS_DQUOT_VERSION;
 	di->d_flags = dquot->dq_type == USRQUOTA ?
-			XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+			FS_USER_QUOTA : FS_GROUP_QUOTA;
 	di->d_id = dquot->dq_id;
 
 	spin_lock(&dq_data_lock);
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 067cafbfc635..b9ba7536f4b4 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -69,15 +69,15 @@ xfs_fs_set_xstate(
 	if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
 
-	if (uflags & XFS_QUOTA_UDQ_ACCT)
+	if (uflags & FS_QUOTA_UDQ_ACCT)
 		flags |= XFS_UQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_PDQ_ACCT)
+	if (uflags & FS_QUOTA_PDQ_ACCT)
 		flags |= XFS_PQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_GDQ_ACCT)
+	if (uflags & FS_QUOTA_GDQ_ACCT)
 		flags |= XFS_GQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_UDQ_ENFD)
+	if (uflags & FS_QUOTA_UDQ_ENFD)
 		flags |= XFS_UQUOTA_ENFD;
-	if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+	if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
 		flags |= XFS_OQUOTA_ENFD;
 
 	switch (op) {
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index b4487764e923..41b04b96975b 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -786,9 +786,9 @@ xfs_qm_export_dquot(
 	}
 
 #ifdef DEBUG
-	if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) ||
+	if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
 	     (XFS_IS_OQUOTA_ENFORCED(mp) &&
-			(dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) &&
+			(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
 	    dst->d_id != 0) {
 		if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
 		    (dst->d_blk_softlimit > 0)) {
@@ -809,17 +809,17 @@ xfs_qm_export_qtype_flags(
 	/*
 	 * Can't be more than one, or none.
 	 */
-	ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) !=
-		(XFS_PROJ_QUOTA | XFS_USER_QUOTA));
-	ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) !=
-		(XFS_PROJ_QUOTA | XFS_GROUP_QUOTA));
-	ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) !=
-		(XFS_USER_QUOTA | XFS_GROUP_QUOTA));
-	ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0);
+	ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
+		(FS_PROJ_QUOTA | FS_USER_QUOTA));
+	ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
+		(FS_PROJ_QUOTA | FS_GROUP_QUOTA));
+	ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
+		(FS_USER_QUOTA | FS_GROUP_QUOTA));
+	ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
 
 	return (flags & XFS_DQ_USER) ?
-		XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
-			XFS_PROJ_QUOTA : XFS_GROUP_QUOTA;
+		FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
+			FS_PROJ_QUOTA : FS_GROUP_QUOTA;
 }
 
 STATIC uint
@@ -830,16 +830,16 @@ xfs_qm_export_flags(
 
 	uflags = 0;
 	if (flags & XFS_UQUOTA_ACCT)
-		uflags |= XFS_QUOTA_UDQ_ACCT;
+		uflags |= FS_QUOTA_UDQ_ACCT;
 	if (flags & XFS_PQUOTA_ACCT)
-		uflags |= XFS_QUOTA_PDQ_ACCT;
+		uflags |= FS_QUOTA_PDQ_ACCT;
 	if (flags & XFS_GQUOTA_ACCT)
-		uflags |= XFS_QUOTA_GDQ_ACCT;
+		uflags |= FS_QUOTA_GDQ_ACCT;
 	if (flags & XFS_UQUOTA_ENFD)
-		uflags |= XFS_QUOTA_UDQ_ENFD;
+		uflags |= FS_QUOTA_UDQ_ENFD;
 	if (flags & (XFS_OQUOTA_ENFD)) {
 		uflags |= (flags & XFS_GQUOTA_ACCT) ?
-			XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD;
+			FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
 	}
 	return (uflags);
 }
diff --git a/include/linux/dqblk_xfs.h b/include/linux/dqblk_xfs.h
index 4389ae72024e..86552807aed9 100644
--- a/include/linux/dqblk_xfs.h
+++ b/include/linux/dqblk_xfs.h
@@ -49,7 +49,7 @@
 #define FS_DQUOT_VERSION	1	/* fs_disk_quota.d_version */
 typedef struct fs_disk_quota {
 	__s8		d_version;	/* version of this structure */
-	__s8		d_flags;	/* XFS_{USER,PROJ,GROUP}_QUOTA */
+	__s8		d_flags;	/* FS_{USER,PROJ,GROUP}_QUOTA */
 	__u16		d_fieldmask;	/* field specifier */
 	__u32		d_id;		/* user, project, or group ID */
 	__u64		d_blk_hardlimit;/* absolute limit on disk blks */
@@ -119,18 +119,18 @@ typedef struct fs_disk_quota {
 #define FS_DQ_ACCT_MASK		(FS_DQ_BCOUNT | FS_DQ_ICOUNT | FS_DQ_RTBCOUNT)
 
 /*
- * Various flags related to quotactl(2).  Only relevant to XFS filesystems.
+ * Various flags related to quotactl(2).
  */
-#define XFS_QUOTA_UDQ_ACCT	(1<<0)  /* user quota accounting */
-#define XFS_QUOTA_UDQ_ENFD	(1<<1)  /* user quota limits enforcement */
-#define XFS_QUOTA_GDQ_ACCT	(1<<2)  /* group quota accounting */
-#define XFS_QUOTA_GDQ_ENFD	(1<<3)  /* group quota limits enforcement */
-#define XFS_QUOTA_PDQ_ACCT	(1<<4)  /* project quota accounting */
-#define XFS_QUOTA_PDQ_ENFD	(1<<5)  /* project quota limits enforcement */
+#define FS_QUOTA_UDQ_ACCT	(1<<0)  /* user quota accounting */
+#define FS_QUOTA_UDQ_ENFD	(1<<1)  /* user quota limits enforcement */
+#define FS_QUOTA_GDQ_ACCT	(1<<2)  /* group quota accounting */
+#define FS_QUOTA_GDQ_ENFD	(1<<3)  /* group quota limits enforcement */
+#define FS_QUOTA_PDQ_ACCT	(1<<4)  /* project quota accounting */
+#define FS_QUOTA_PDQ_ENFD	(1<<5)  /* project quota limits enforcement */
 
-#define XFS_USER_QUOTA		(1<<0)	/* user quota type */
-#define XFS_PROJ_QUOTA		(1<<1)	/* project quota type */
-#define XFS_GROUP_QUOTA		(1<<2)	/* group quota type */
+#define FS_USER_QUOTA		(1<<0)	/* user quota type */
+#define FS_PROJ_QUOTA		(1<<1)	/* project quota type */
+#define FS_GROUP_QUOTA		(1<<2)	/* group quota type */
 
 /*
  * fs_quota_stat is the struct returned in Q_XGETQSTAT for a given file system.
@@ -151,7 +151,7 @@ typedef struct fs_qfilestat {
 
 typedef struct fs_quota_stat {
 	__s8		qs_version;	/* version number for future changes */
-	__u16		qs_flags;	/* XFS_QUOTA_{U,P,G}DQ_{ACCT,ENFD} */
+	__u16		qs_flags;	/* FS_QUOTA_{U,P,G}DQ_{ACCT,ENFD} */
 	__s8		qs_pad;		/* unused */
 	fs_qfilestat_t	qs_uquota;	/* user quota storage information */
 	fs_qfilestat_t	qs_gquota;	/* group quota storage information */
-- 
cgit v1.2.3-59-g8ed1b


From 189eef59e70e3e56edf726864629f310d114eefb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 10:56:29 +0200
Subject: quota: clean up quota active checks

The various quota operations check for any quota beeing active on
a superblock, and the inode not having the noquota flag.

Merge these two checks into a dquot_active check and move that
into dquot.c as that's the only place where it's needed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c         | 23 ++++++++++++++++-------
 include/linux/quotaops.h | 10 ----------
 2 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 2857fd67ff33..2eebf72d07c8 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1315,6 +1315,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
 	return QUOTA_NL_NOWARN;
 }
 
+static int dquot_active(const struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (IS_NOQUOTA(inode))
+		return 0;
+	return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
+}
+
 /*
  * Initialize quota pointers in inode
  *
@@ -1334,7 +1343,7 @@ static void __dquot_initialize(struct inode *inode, int type)
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+	if (!dquot_active(inode))
 		return;
 
 	/* First get references to structures we might need. */
@@ -1518,7 +1527,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 	 * First test before acquiring mutex - solves deadlocks when we
 	 * re-enter the quota code and are already holding the mutex
 	 */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+	if (!dquot_active(inode)) {
 		inode_incr_space(inode, number, reserve);
 		goto out;
 	}
@@ -1570,7 +1579,7 @@ int dquot_alloc_inode(const struct inode *inode)
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+	if (!dquot_active(inode))
 		return 0;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1607,7 +1616,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
 	int cnt;
 
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+	if (!dquot_active(inode)) {
 		inode_claim_rsv_space(inode, number);
 		return 0;
 	}
@@ -1640,7 +1649,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+	if (!dquot_active(inode)) {
 		inode_decr_space(inode, number, reserve);
 		return;
 	}
@@ -1678,7 +1687,7 @@ void dquot_free_inode(const struct inode *inode)
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+	if (!dquot_active(inode))
 		return;
 
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1801,7 +1810,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	struct super_block *sb = inode->i_sb;
 	int ret;
 
-	if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
+	if (!dquot_active(inode))
 		return 0;
 
 	if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index aa36793b48bd..126193c1a5ce 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -145,11 +145,6 @@ static inline bool sb_has_quota_active(struct super_block *sb, int type)
 	       !sb_has_quota_suspended(sb, type);
 }
 
-static inline unsigned sb_any_quota_active(struct super_block *sb)
-{
-	return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
-}
-
 /*
  * Operations supported for diskquotas.
  */
@@ -194,11 +189,6 @@ static inline int sb_has_quota_active(struct super_block *sb, int type)
 	return 0;
 }
 
-static inline int sb_any_quota_active(struct super_block *sb)
-{
-	return 0;
-}
-
 static inline void dquot_initialize(struct inode *inode)
 {
 }
-- 
cgit v1.2.3-59-g8ed1b


From 4c4d3901225518ed1a4c938ba15ba09842a00770 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 7 Jun 2010 10:20:39 +0200
Subject: ext3: remove vestiges of nobh support

The nobh option was only supported for writeback mode, but given that all
write paths (except mmapped writed) actually create buffer heads, it
effectively was a no-op already.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/inode.c         | 16 +---------------
 fs/ext3/super.c         | 17 ++++-------------
 include/linux/ext3_fs.h |  1 -
 3 files changed, 5 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 735f0190ec2a..a786db403efc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1625,10 +1625,7 @@ static int ext3_writeback_writepage(struct page *page,
 		goto out_fail;
 	}
 
-	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
-		ret = nobh_writepage(page, ext3_get_block, wbc);
-	else
-		ret = block_write_full_page(page, ext3_get_block, wbc);
+	ret = block_write_full_page(page, ext3_get_block, wbc);
 
 	err = ext3_journal_stop(handle);
 	if (!ret)
@@ -1922,17 +1919,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 	length = blocksize - (offset & (blocksize - 1));
 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
-	/*
-	 * For "nobh" option,  we can only work if we don't need to
-	 * read-in the page - otherwise we create buffers to do the IO.
-	 */
-	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
-	     ext3_should_writeback_data(inode) && PageUptodate(page)) {
-		zero_user(page, offset, length);
-		set_page_dirty(page);
-		goto unlock;
-	}
-
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6c953bb255e7..9650a956fd0e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -661,9 +661,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	 */
 	seq_puts(seq, ",barrier=");
 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
-	if (test_opt(sb, NOBH))
-		seq_puts(seq, ",nobh");
-
 	seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");
@@ -1255,10 +1252,12 @@ set_qf_format:
 			*n_blocks_count = option;
 			break;
 		case Opt_nobh:
-			set_opt(sbi->s_mount_opt, NOBH);
+			ext3_msg(sb, KERN_WARNING,
+				"warning: ignoring deprecated nobh option");
 			break;
 		case Opt_bh:
-			clear_opt(sbi->s_mount_opt, NOBH);
+			ext3_msg(sb, KERN_WARNING,
+				"warning: ignoring deprecated bh option");
 			break;
 		default:
 			ext3_msg(sb, KERN_ERR,
@@ -2001,14 +2000,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		break;
 	}
 
-	if (test_opt(sb, NOBH)) {
-		if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
-			ext3_msg(sb, KERN_WARNING,
-				"warning: ignoring nobh option - "
-				"it is supported only with writeback mode");
-			clear_opt(sbi->s_mount_opt, NOBH);
-		}
-	}
 	/*
 	 * The journal_load will have done any necessary log recovery,
 	 * so we can safely mount the rest of the filesystem now.
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 7fc62d4550b2..3d3a9915dde2 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -400,7 +400,6 @@ struct ext3_inode {
 #define EXT3_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
 #define EXT3_MOUNT_RESERVATION		0x10000	/* Preallocation */
 #define EXT3_MOUNT_BARRIER		0x20000 /* Use block barriers */
-#define EXT3_MOUNT_NOBH			0x40000 /* No bufferheads */
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
-- 
cgit v1.2.3-59-g8ed1b


From fb5ffb0e160c93c3fe08ab83845eb9a2768af812 Mon Sep 17 00:00:00 2001
From: Jiaying Zhang <jiayingz@google.com>
Date: Tue, 20 Jul 2010 16:54:43 +0200
Subject: quota: Change quota error message to print out disk and function name

The current quota error message doesn't always print the disk name, so
it is hard to identify the "bad" disk when quota error happens.

This patch changes the standardized quota error message to print out disk name
and function name. It also uses a combination of cpp macro and inline function
to provide better type checking and to lower the text size of the message.

[Jan Kara: Export __quota_error]

Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c         | 39 +++++++++++++++-------
 fs/quota/quota_tree.c    | 85 ++++++++++++++++++++++++------------------------
 fs/quota/quota_tree.h    |  6 ----
 fs/quota/quota_v1.c      |  3 +-
 fs/quota/quota_v2.c      | 11 +++----
 include/linux/quotaops.h |  6 ++++
 6 files changed, 80 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 2eebf72d07c8..b171221000fa 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -132,6 +132,22 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 
+void __quota_error(struct super_block *sb, const char *func,
+		  const char *fmt, ...)
+{
+	va_list args;
+
+	if (printk_ratelimit()) {
+		va_start(args, fmt);
+		printk(KERN_ERR "Quota error (device %s): %s: ",
+		       sb->s_id, func);
+		vprintk(fmt, args);
+		printk("\n");
+		va_end(args);
+	}
+}
+EXPORT_SYMBOL(__quota_error);
+
 #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
 static char *quotatypes[] = INITQFNAMES;
 #endif
@@ -705,11 +721,8 @@ void dqput(struct dquot *dquot)
 		return;
 #ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
-		printk("VFS: dqput: trying to free free dquot\n");
-		printk("VFS: device %s, dquot of %s %d\n",
-			dquot->dq_sb->s_id,
-			quotatypes[dquot->dq_type],
-			dquot->dq_id);
+		quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
+			    quotatypes[dquot->dq_type], dquot->dq_id);
 		BUG();
 	}
 #endif
@@ -732,9 +745,9 @@ we_slept:
 		/* Commit dquot before releasing */
 		ret = dquot->dq_sb->dq_op->write_dquot(dquot);
 		if (ret < 0) {
-			printk(KERN_ERR "VFS: cannot write quota structure on "
-				"device %s (error %d). Quota may get out of "
-				"sync!\n", dquot->dq_sb->s_id, ret);
+			quota_error(dquot->dq_sb, "Can't write quota structure"
+				    " (error %d). Quota may get out of sync!",
+				    ret);
 			/*
 			 * We clear dirty bit anyway, so that we avoid
 			 * infinite loop here
@@ -914,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 #ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
-		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
-			" was turned on thus quota information is probably "
-			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
+		quota_error(sb, "Writes happened before quota was turned on "
+			"thus quota information is probably inconsistent. "
+			"Please run quotacheck(8)");
 	}
 #endif
 }
@@ -947,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 		if (dqput_blocks(dquot)) {
 #ifdef CONFIG_QUOTA_DEBUG
 			if (atomic_read(&dquot->dq_count) != 1)
-				printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
+				quota_error(inode->i_sb, "Adding dquot with "
+					    "dq_count %d to dispose list",
+					    atomic_read(&dquot->dq_count));
 #endif
 			spin_lock(&dq_list_lock);
 			/* As dquot must have currently users it can't be on
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 24f03407eeb5..9e48874eabcc 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -65,8 +65,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 	ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
 	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
 	if (ret != info->dqi_usable_bs) {
-		q_warn(KERN_WARNING "VFS: dquota write failed on "
-			"dev %s\n", sb->s_id);
+		quota_error(sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -EIO;
 	}
@@ -160,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
 	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
 	/* No matter whether write succeeds block is out of list */
 	if (write_blk(info, blk, buf) < 0)
-		q_warn(KERN_ERR
-		       "VFS: Can't write block (%u) with free entries.\n",
-		       blk);
+		quota_error(info->dqi_sb, "Can't write block (%u) "
+			    "with free entries", blk);
 	return 0;
 out_buf:
 	kfree(tmpbuf);
@@ -252,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
 		*err = remove_free_dqentry(info, buf, blk);
 		if (*err < 0) {
-			q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
-			       "remove block (%u) from entry free list.\n",
-			       blk);
+			quota_error(dquot->dq_sb, "Can't remove block (%u) "
+				    "from entry free list", blk);
 			goto out_buf;
 		}
 	}
@@ -268,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	}
 #ifdef __QUOTA_QT_PARANOIA
 	if (i == qtree_dqstr_in_blk(info)) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
-				"but it shouldn't.\n");
+		quota_error(dquot->dq_sb, "Data block full but it shouldn't");
 		*err = -EIO;
 		goto out_buf;
 	}
 #endif
 	*err = write_blk(info, blk, buf);
 	if (*err < 0) {
-		q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
-				"data block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't write quota data block %u",
+			    blk);
 		goto out_buf;
 	}
 	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
@@ -311,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	} else {
 		ret = read_blk(info, *treeblk, buf);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Can't read tree quota block "
-					"%u.\n", *treeblk);
+			quota_error(dquot->dq_sb, "Can't read tree quota "
+				    "block %u", *treeblk);
 			goto out_buf;
 		}
 	}
@@ -323,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	if (depth == info->dqi_qtree_depth - 1) {
 #ifdef __QUOTA_QT_PARANOIA
 		if (newblk) {
-			printk(KERN_ERR "VFS: Inserting already present quota "
-					"entry (block %u).\n",
-			       le32_to_cpu(ref[get_index(info,
+			quota_error(dquot->dq_sb, "Inserting already present "
+				    "quota entry (block %u)",
+				    le32_to_cpu(ref[get_index(info,
 						dquot->dq_id, depth)]));
 			ret = -EIO;
 			goto out_buf;
@@ -373,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	if (!dquot->dq_off) {
 		ret = dq_insert_tree(info, dquot);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Error %zd occurred while "
-					"creating quota.\n", ret);
+			quota_error(sb, "Error %zd occurred while creating "
+				    "quota", ret);
 			kfree(ddquot);
 			return ret;
 		}
@@ -385,8 +381,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
 				    dquot->dq_off);
 	if (ret != info->dqi_entry_size) {
-		q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
-		       sb->s_id);
+		quota_error(sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -ENOSPC;
 	} else {
@@ -410,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	if (!buf)
 		return -ENOMEM;
 	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
-		q_warn(KERN_ERR "VFS: Quota structure has offset to other "
-		  "block (%u) than it should (%u).\n", blk,
-		  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		quota_error(dquot->dq_sb, "Quota structure has offset to "
+			"other block (%u) than it should (%u)", blk,
+			(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
 		goto out_buf;
 	}
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota data block %u",
+			    blk);
 		goto out_buf;
 	}
 	dh = (struct qt_disk_dqdbheader *)buf;
@@ -427,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		if (ret >= 0)
 			ret = put_free_dqblk(info, buf, blk);
 		if (ret < 0) {
-			q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
-			  "to free list.\n", blk);
+			quota_error(dquot->dq_sb, "Can't move quota data block "
+				    "(%u) to free list", blk);
 			goto out_buf;
 		}
 	} else {
@@ -440,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			/* Insert will write block itself */
 			ret = insert_free_dqentry(info, buf, blk);
 			if (ret < 0) {
-				q_warn(KERN_ERR "VFS: Can't insert quota data "
-				       "block (%u) to free entry list.\n", blk);
+				quota_error(dquot->dq_sb, "Can't insert quota "
+				    "data block (%u) to free entry list", blk);
 				goto out_buf;
 			}
 		} else {
 			ret = write_blk(info, blk, buf);
 			if (ret < 0) {
-				q_warn(KERN_ERR "VFS: Can't write quota data "
-				  "block %u\n", blk);
+				quota_error(dquot->dq_sb, "Can't write quota "
+					    "data block %u", blk);
 				goto out_buf;
 			}
 		}
@@ -472,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		return -ENOMEM;
 	ret = read_blk(info, *blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+		quota_error(dquot->dq_sb, "Can't read quota data "
+			    "block %u", blk);
 		goto out_buf;
 	}
 	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -496,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		} else {
 			ret = write_blk(info, *blk, buf);
 			if (ret < 0)
-				q_warn(KERN_ERR "VFS: Can't write quota tree "
-				  "block %u.\n", *blk);
+				quota_error(dquot->dq_sb, "Can't write quota "
+					    "tree block %u", blk);
 		}
 	}
 out_buf:
@@ -529,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		return -ENOMEM;
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota tree "
+			    "block %u", blk);
 		goto out_buf;
 	}
 	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -539,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		ddquot += info->dqi_entry_size;
 	}
 	if (i == qtree_dqstr_in_blk(info)) {
-		q_warn(KERN_ERR "VFS: Quota for id %u referenced "
-		  "but not present.\n", dquot->dq_id);
+		quota_error(dquot->dq_sb, "Quota for id %u referenced "
+			    "but not present", dquot->dq_id);
 		ret = -EIO;
 		goto out_buf;
 	} else {
@@ -564,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 		return -ENOMEM;
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		quota_error(dquot->dq_sb, "Can't read quota tree block %u",
+			    blk);
 		goto out_buf;
 	}
 	ret = 0;
@@ -598,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 #ifdef __QUOTA_QT_PARANOIA
 	/* Invalidated quota? */
 	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
-		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+		quota_error(sb, "Quota invalidated while reading!");
 		return -EIO;
 	}
 #endif
@@ -607,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 		offset = find_dqentry(info, dquot);
 		if (offset <= 0) {	/* Entry not present? */
 			if (offset < 0)
-				q_warn(KERN_ERR "VFS: Can't read quota "
-				  "structure for id %u.\n", dquot->dq_id);
+				quota_error(sb, "Can't read quota structure "
+					    "for id %u", dquot->dq_id);
 			dquot->dq_off = 0;
 			set_bit(DQ_FAKE_B, &dquot->dq_flags);
 			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -625,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	if (ret != info->dqi_entry_size) {
 		if (ret >= 0)
 			ret = -EIO;
-		q_warn(KERN_ERR "VFS: Error while reading quota "
-				"structure for id %u.\n", dquot->dq_id);
+		quota_error(sb, "Error while reading quota structure for id %u",
+			    dquot->dq_id);
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
 		kfree(ddquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index ccc3e71fb1d8..a1ab8db81a51 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,10 +22,4 @@ struct qt_disk_dqdbheader {
 
 #define QT_TREEOFF	1		/* Offset of tree in file in blocks */
 
-#define q_warn(fmt, args...) \
-do { \
-	if (printk_ratelimit()) \
-		printk(fmt, ## args); \
-} while(0)
-
 #endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 4af344c5852a..34b37a67bb16 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
 			(char *)&dqblk, sizeof(struct v1_disk_dqblk),
 			v1_dqoff(dquot->dq_id));
 	if (ret != sizeof(struct v1_disk_dqblk)) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
-			dquot->dq_sb->s_id);
+		quota_error(dquot->dq_sb, "dquota write failed");
 		if (ret >= 0)
 			ret = -EIO;
 		goto out;
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 135206af1458..65444d29406b 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type,
 	size = sb->s_op->quota_read(sb, type, (char *)dqhead,
 				    sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader)) {
-		q_warn(KERN_WARNING "quota_v2: Failed header read:"
-		       " expected=%zd got=%zd\n",
-			sizeof(struct v2_disk_dqheader), size);
+		quota_error(sb, "Failed header read: expected=%zd got=%zd",
+			    sizeof(struct v2_disk_dqheader), size);
 		return 0;
 	}
 	return 1;
@@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
-			sb->s_id);
+		quota_error(sb, "Can't read info structure");
 		return -1;
 	}
 	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
@@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
-			sb->s_id);
+		quota_error(sb, "Can't write info structure");
 		return -1;
 	}
 	return 0;
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 126193c1a5ce..4881b49b1a9a 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -28,6 +28,12 @@ static inline bool is_quota_modification(struct inode *inode, struct iattr *ia)
 
 #if defined(CONFIG_QUOTA)
 
+#define quota_error(sb, fmt, args...) \
+	__quota_error((sb), __func__, fmt , ## args)
+
+extern void __quota_error(struct super_block *sb, const char *func,
+			 const char *fmt, ...);
+
 /*
  * declaration of quota_function calls in kernel.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 9849ed4d72251d273524efb8b70be0be9aecb1df Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 20 Jul 2010 03:13:35 -0400
Subject: tracing/documentation: Document dynamic ftracer internals

Add more details to the dynamic function tracing design implementation.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
LKML-Reference: <1279610015-10250-1-git-send-email-vapier@gentoo.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/trace/ftrace-design.txt | 153 ++++++++++++++++++++++++++++++++--
 include/linux/ftrace.h                |   5 ++
 2 files changed, 153 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index f1f81afee8a0..dc52bd442c92 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -13,6 +13,9 @@ Note that this focuses on architecture implementation details only.  If you
 want more explanation of a feature in terms of common code, review the common
 ftrace.txt file.
 
+Ideally, everyone who wishes to retain performance while supporting tracing in
+their kernel should make it all the way to dynamic ftrace support.
+
 
 Prerequisites
 -------------
@@ -215,7 +218,7 @@ An arch may pass in a unique value (frame pointer) to both the entering and
 exiting of a function.  On exit, the value is compared and if it does not
 match, then it will panic the kernel.  This is largely a sanity check for bad
 code generation with gcc.  If gcc for your port sanely updates the frame
-pointer under different opitmization levels, then ignore this option.
+pointer under different optimization levels, then ignore this option.
 
 However, adding support for it isn't terribly difficult.  In your assembly code
 that calls prepare_ftrace_return(), pass the frame pointer as the 3rd argument.
@@ -234,7 +237,7 @@ If you can't trace NMI functions, then skip this option.
 
 
 HAVE_SYSCALL_TRACEPOINTS
----------------------
+------------------------
 
 You need very few things to get the syscalls tracing in an arch.
 
@@ -250,12 +253,152 @@ You need very few things to get the syscalls tracing in an arch.
 HAVE_FTRACE_MCOUNT_RECORD
 -------------------------
 
-See scripts/recordmcount.pl for more info.
+See scripts/recordmcount.pl for more info.  Just fill in the arch-specific
+details for how to locate the addresses of mcount call sites via objdump.
+This option doesn't make much sense without also implementing dynamic ftrace.
 
+
+HAVE_DYNAMIC_FTRACE
+-------------------
+
+You will first need HAVE_FTRACE_MCOUNT_RECORD and HAVE_FUNCTION_TRACER, so
+scroll your reader back up if you got over eager.
+
+Once those are out of the way, you will need to implement:
+	- asm/ftrace.h:
+		- MCOUNT_ADDR
+		- ftrace_call_adjust()
+		- struct dyn_arch_ftrace{}
+	- asm code:
+		- mcount() (new stub)
+		- ftrace_caller()
+		- ftrace_call()
+		- ftrace_stub()
+	- C code:
+		- ftrace_dyn_arch_init()
+		- ftrace_make_nop()
+		- ftrace_make_call()
+		- ftrace_update_ftrace_func()
+
+First you will need to fill out some arch details in your asm/ftrace.h.
+
+Define MCOUNT_ADDR as the address of your mcount symbol similar to:
+	#define MCOUNT_ADDR ((unsigned long)mcount)
+Since no one else will have a decl for that function, you will need to:
+	extern void mcount(void);
+
+You will also need the helper function ftrace_call_adjust().  Most people
+will be able to stub it out like so:
+	static inline unsigned long ftrace_call_adjust(unsigned long addr)
+	{
+		return addr;
+	}
 <details to be filled>
 
+Lastly you will need the custom dyn_arch_ftrace structure.  If you need
+some extra state when runtime patching arbitrary call sites, this is the
+place.  For now though, create an empty struct:
+	struct dyn_arch_ftrace {
+		/* No extra data needed */
+	};
+
+With the header out of the way, we can fill out the assembly code.  While we
+did already create a mcount() function earlier, dynamic ftrace only wants a
+stub function.  This is because the mcount() will only be used during boot
+and then all references to it will be patched out never to return.  Instead,
+the guts of the old mcount() will be used to create a new ftrace_caller()
+function.  Because the two are hard to merge, it will most likely be a lot
+easier to have two separate definitions split up by #ifdefs.  Same goes for
+the ftrace_stub() as that will now be inlined in ftrace_caller().
+
+Before we get confused anymore, let's check out some pseudo code so you can
+implement your own stuff in assembly:
 
-HAVE_DYNAMIC_FTRACE
----------------------
+void mcount(void)
+{
+	return;
+}
+
+void ftrace_caller(void)
+{
+	/* implement HAVE_FUNCTION_TRACE_MCOUNT_TEST if you desire */
+
+	/* save all state needed by the ABI (see paragraph above) */
+
+	unsigned long frompc = ...;
+	unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
+
+ftrace_call:
+	ftrace_stub(frompc, selfpc);
+
+	/* restore all state needed by the ABI */
+
+ftrace_stub:
+	return;
+}
+
+This might look a little odd at first, but keep in mind that we will be runtime
+patching multiple things.  First, only functions that we actually want to trace
+will be patched to call ftrace_caller().  Second, since we only have one tracer
+active at a time, we will patch the ftrace_caller() function itself to call the
+specific tracer in question.  That is the point of the ftrace_call label.
+
+With that in mind, let's move on to the C code that will actually be doing the
+runtime patching.  You'll need a little knowledge of your arch's opcodes in
+order to make it through the next section.
+
+Every arch has an init callback function.  If you need to do something early on
+to initialize some state, this is the time to do that.  Otherwise, this simple
+function below should be sufficient for most people:
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+	/* return value is done indirectly via data */
+	*(unsigned long *)data = 0;
+
+	return 0;
+}
+
+There are two functions that are used to do runtime patching of arbitrary
+functions.  The first is used to turn the mcount call site into a nop (which
+is what helps us retain runtime performance when not tracing).  The second is
+used to turn the mcount call site into a call to an arbitrary location (but
+typically that is ftracer_caller()).  See the general function definition in
+linux/ftrace.h for the functions:
+	ftrace_make_nop()
+	ftrace_make_call()
+The rec->ip value is the address of the mcount call site that was collected
+by the scripts/recordmcount.pl during build time.
+
+The last function is used to do runtime patching of the active tracer.  This
+will be modifying the assembly code at the location of the ftrace_call symbol
+inside of the ftrace_caller() function.  So you should have sufficient padding
+at that location to support the new function calls you'll be inserting.  Some
+people will be using a "call" type instruction while others will be using a
+"branch" type instruction.  Specifically, the function is:
+	ftrace_update_ftrace_func()
+
+
+HAVE_DYNAMIC_FTRACE + HAVE_FUNCTION_GRAPH_TRACER
+------------------------------------------------
+
+The function grapher needs a few tweaks in order to work with dynamic ftrace.
+Basically, you will need to:
+	- update:
+		- ftrace_caller()
+		- ftrace_graph_call()
+		- ftrace_graph_caller()
+	- implement:
+		- ftrace_enable_ftrace_graph_caller()
+		- ftrace_disable_ftrace_graph_caller()
 
 <details to be filled>
+Quick notes:
+	- add a nop stub after the ftrace_call location named ftrace_graph_call;
+	  stub needs to be large enough to support a call to ftrace_graph_caller()
+	- update ftrace_graph_caller() to work with being called by the new
+	  ftrace_caller() since some semantics may have changed
+	- ftrace_enable_ftrace_graph_caller() will runtime patch the
+	  ftrace_graph_call location with a call to ftrace_graph_caller()
+	- ftrace_disable_ftrace_graph_caller() will runtime patch the
+	  ftrace_graph_call location with nops
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 41e46330d9be..dcd6a7c3a435 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1,3 +1,8 @@
+/*
+ * Ftrace header.  For implementation details beyond the random comments
+ * scattered below, see: Documentation/trace/ftrace-design.txt
+ */
+
 #ifndef _LINUX_FTRACE_H
 #define _LINUX_FTRACE_H
 
-- 
cgit v1.2.3-59-g8ed1b


From e955cead031177b083fbf18d04a03c06e330a439 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Wed, 29 Jul 2009 10:20:10 +0200
Subject: CAN: Add Flexcan CAN controller driver

This core is found on some Freescale SoCs and also some Coldfire
SoCs. Support for Coldfire is missing though at the moment as
they have an older revision of the core which does not have RX FIFO
support.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Acked-by: Wolfgang Grandegger <wg@grandegger.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/Kconfig              |    9 +
 drivers/net/can/Makefile             |    1 +
 drivers/net/can/flexcan.c            | 1030 ++++++++++++++++++++++++++++++++++
 include/linux/can/platform/flexcan.h |   20 +
 4 files changed, 1060 insertions(+)
 create mode 100644 drivers/net/can/flexcan.c
 create mode 100644 include/linux/can/platform/flexcan.h

(limited to 'include/linux')

diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig
index 2c5227c02fa0..9d9e45394433 100644
--- a/drivers/net/can/Kconfig
+++ b/drivers/net/can/Kconfig
@@ -73,6 +73,15 @@ config CAN_JANZ_ICAN3
 	  This driver can also be built as a module. If so, the module will be
 	  called janz-ican3.ko.
 
+config HAVE_CAN_FLEXCAN
+	bool
+
+config CAN_FLEXCAN
+	tristate "Support for Freescale FLEXCAN based chips"
+	depends on CAN_DEV && HAVE_CAN_FLEXCAN
+	---help---
+	  Say Y here if you want to support for Freescale FlexCAN.
+
 source "drivers/net/can/mscan/Kconfig"
 
 source "drivers/net/can/sja1000/Kconfig"
diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
index 9047cd066fea..00575373bbd0 100644
--- a/drivers/net/can/Makefile
+++ b/drivers/net/can/Makefile
@@ -16,5 +16,6 @@ obj-$(CONFIG_CAN_TI_HECC)	+= ti_hecc.o
 obj-$(CONFIG_CAN_MCP251X)	+= mcp251x.o
 obj-$(CONFIG_CAN_BFIN)		+= bfin_can.o
 obj-$(CONFIG_CAN_JANZ_ICAN3)	+= janz-ican3.o
+obj-$(CONFIG_CAN_FLEXCAN)	+= flexcan.o
 
 ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG
diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
new file mode 100644
index 000000000000..ef443a090ba7
--- /dev/null
+++ b/drivers/net/can/flexcan.c
@@ -0,0 +1,1030 @@
+/*
+ * flexcan.c - FLEXCAN CAN controller driver
+ *
+ * Copyright (c) 2005-2006 Varma Electronics Oy
+ * Copyright (c) 2009 Sascha Hauer, Pengutronix
+ * Copyright (c) 2010 Marc Kleine-Budde, Pengutronix
+ *
+ * Based on code originally by Andrey Volkov <avolkov@varma-el.com>
+ *
+ * LICENCE:
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/netdevice.h>
+#include <linux/can.h>
+#include <linux/can/dev.h>
+#include <linux/can/error.h>
+#include <linux/can/platform/flexcan.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include <mach/clock.h>
+
+#define DRV_NAME			"flexcan"
+
+/* 8 for RX fifo and 2 error handling */
+#define FLEXCAN_NAPI_WEIGHT		(8 + 2)
+
+/* FLEXCAN module configuration register (CANMCR) bits */
+#define FLEXCAN_MCR_MDIS		BIT(31)
+#define FLEXCAN_MCR_FRZ			BIT(30)
+#define FLEXCAN_MCR_FEN			BIT(29)
+#define FLEXCAN_MCR_HALT		BIT(28)
+#define FLEXCAN_MCR_NOT_RDY		BIT(27)
+#define FLEXCAN_MCR_WAK_MSK		BIT(26)
+#define FLEXCAN_MCR_SOFTRST		BIT(25)
+#define FLEXCAN_MCR_FRZ_ACK		BIT(24)
+#define FLEXCAN_MCR_SUPV		BIT(23)
+#define FLEXCAN_MCR_SLF_WAK		BIT(22)
+#define FLEXCAN_MCR_WRN_EN		BIT(21)
+#define FLEXCAN_MCR_LPM_ACK		BIT(20)
+#define FLEXCAN_MCR_WAK_SRC		BIT(19)
+#define FLEXCAN_MCR_DOZE		BIT(18)
+#define FLEXCAN_MCR_SRX_DIS		BIT(17)
+#define FLEXCAN_MCR_BCC			BIT(16)
+#define FLEXCAN_MCR_LPRIO_EN		BIT(13)
+#define FLEXCAN_MCR_AEN			BIT(12)
+#define FLEXCAN_MCR_MAXMB(x)		((x) & 0xf)
+#define FLEXCAN_MCR_IDAM_A		(0 << 8)
+#define FLEXCAN_MCR_IDAM_B		(1 << 8)
+#define FLEXCAN_MCR_IDAM_C		(2 << 8)
+#define FLEXCAN_MCR_IDAM_D		(3 << 8)
+
+/* FLEXCAN control register (CANCTRL) bits */
+#define FLEXCAN_CTRL_PRESDIV(x)		(((x) & 0xff) << 24)
+#define FLEXCAN_CTRL_RJW(x)		(((x) & 0x03) << 22)
+#define FLEXCAN_CTRL_PSEG1(x)		(((x) & 0x07) << 19)
+#define FLEXCAN_CTRL_PSEG2(x)		(((x) & 0x07) << 16)
+#define FLEXCAN_CTRL_BOFF_MSK		BIT(15)
+#define FLEXCAN_CTRL_ERR_MSK		BIT(14)
+#define FLEXCAN_CTRL_CLK_SRC		BIT(13)
+#define FLEXCAN_CTRL_LPB		BIT(12)
+#define FLEXCAN_CTRL_TWRN_MSK		BIT(11)
+#define FLEXCAN_CTRL_RWRN_MSK		BIT(10)
+#define FLEXCAN_CTRL_SMP		BIT(7)
+#define FLEXCAN_CTRL_BOFF_REC		BIT(6)
+#define FLEXCAN_CTRL_TSYN		BIT(5)
+#define FLEXCAN_CTRL_LBUF		BIT(4)
+#define FLEXCAN_CTRL_LOM		BIT(3)
+#define FLEXCAN_CTRL_PROPSEG(x)		((x) & 0x07)
+#define FLEXCAN_CTRL_ERR_BUS		(FLEXCAN_CTRL_ERR_MSK)
+#define FLEXCAN_CTRL_ERR_STATE \
+	(FLEXCAN_CTRL_TWRN_MSK | FLEXCAN_CTRL_RWRN_MSK | \
+	 FLEXCAN_CTRL_BOFF_MSK)
+#define FLEXCAN_CTRL_ERR_ALL \
+	(FLEXCAN_CTRL_ERR_BUS | FLEXCAN_CTRL_ERR_STATE)
+
+/* FLEXCAN error and status register (ESR) bits */
+#define FLEXCAN_ESR_TWRN_INT		BIT(17)
+#define FLEXCAN_ESR_RWRN_INT		BIT(16)
+#define FLEXCAN_ESR_BIT1_ERR		BIT(15)
+#define FLEXCAN_ESR_BIT0_ERR		BIT(14)
+#define FLEXCAN_ESR_ACK_ERR		BIT(13)
+#define FLEXCAN_ESR_CRC_ERR		BIT(12)
+#define FLEXCAN_ESR_FRM_ERR		BIT(11)
+#define FLEXCAN_ESR_STF_ERR		BIT(10)
+#define FLEXCAN_ESR_TX_WRN		BIT(9)
+#define FLEXCAN_ESR_RX_WRN		BIT(8)
+#define FLEXCAN_ESR_IDLE		BIT(7)
+#define FLEXCAN_ESR_TXRX		BIT(6)
+#define FLEXCAN_EST_FLT_CONF_SHIFT	(4)
+#define FLEXCAN_ESR_FLT_CONF_MASK	(0x3 << FLEXCAN_EST_FLT_CONF_SHIFT)
+#define FLEXCAN_ESR_FLT_CONF_ACTIVE	(0x0 << FLEXCAN_EST_FLT_CONF_SHIFT)
+#define FLEXCAN_ESR_FLT_CONF_PASSIVE	(0x1 << FLEXCAN_EST_FLT_CONF_SHIFT)
+#define FLEXCAN_ESR_BOFF_INT		BIT(2)
+#define FLEXCAN_ESR_ERR_INT		BIT(1)
+#define FLEXCAN_ESR_WAK_INT		BIT(0)
+#define FLEXCAN_ESR_ERR_BUS \
+	(FLEXCAN_ESR_BIT1_ERR | FLEXCAN_ESR_BIT0_ERR | \
+	 FLEXCAN_ESR_ACK_ERR | FLEXCAN_ESR_CRC_ERR | \
+	 FLEXCAN_ESR_FRM_ERR | FLEXCAN_ESR_STF_ERR)
+#define FLEXCAN_ESR_ERR_STATE \
+	(FLEXCAN_ESR_TWRN_INT | FLEXCAN_ESR_RWRN_INT | FLEXCAN_ESR_BOFF_INT)
+#define FLEXCAN_ESR_ERR_ALL \
+	(FLEXCAN_ESR_ERR_BUS | FLEXCAN_ESR_ERR_STATE)
+
+/* FLEXCAN interrupt flag register (IFLAG) bits */
+#define FLEXCAN_TX_BUF_ID		8
+#define FLEXCAN_IFLAG_BUF(x)		BIT(x)
+#define FLEXCAN_IFLAG_RX_FIFO_OVERFLOW	BIT(7)
+#define FLEXCAN_IFLAG_RX_FIFO_WARN	BIT(6)
+#define FLEXCAN_IFLAG_RX_FIFO_AVAILABLE	BIT(5)
+#define FLEXCAN_IFLAG_DEFAULT \
+	(FLEXCAN_IFLAG_RX_FIFO_OVERFLOW | FLEXCAN_IFLAG_RX_FIFO_AVAILABLE | \
+	 FLEXCAN_IFLAG_BUF(FLEXCAN_TX_BUF_ID))
+
+/* FLEXCAN message buffers */
+#define FLEXCAN_MB_CNT_CODE(x)		(((x) & 0xf) << 24)
+#define FLEXCAN_MB_CNT_SRR		BIT(22)
+#define FLEXCAN_MB_CNT_IDE		BIT(21)
+#define FLEXCAN_MB_CNT_RTR		BIT(20)
+#define FLEXCAN_MB_CNT_LENGTH(x)	(((x) & 0xf) << 16)
+#define FLEXCAN_MB_CNT_TIMESTAMP(x)	((x) & 0xffff)
+
+#define FLEXCAN_MB_CODE_MASK		(0xf0ffffff)
+
+/* Structure of the message buffer */
+struct flexcan_mb {
+	u32 can_ctrl;
+	u32 can_id;
+	u32 data[2];
+};
+
+/* Structure of the hardware registers */
+struct flexcan_regs {
+	u32 mcr;		/* 0x00 */
+	u32 ctrl;		/* 0x04 */
+	u32 timer;		/* 0x08 */
+	u32 _reserved1;		/* 0x0c */
+	u32 rxgmask;		/* 0x10 */
+	u32 rx14mask;		/* 0x14 */
+	u32 rx15mask;		/* 0x18 */
+	u32 ecr;		/* 0x1c */
+	u32 esr;		/* 0x20 */
+	u32 imask2;		/* 0x24 */
+	u32 imask1;		/* 0x28 */
+	u32 iflag2;		/* 0x2c */
+	u32 iflag1;		/* 0x30 */
+	u32 _reserved2[19];
+	struct flexcan_mb cantxfg[64];
+};
+
+struct flexcan_priv {
+	struct can_priv can;
+	struct net_device *dev;
+	struct napi_struct napi;
+
+	void __iomem *base;
+	u32 reg_esr;
+	u32 reg_ctrl_default;
+
+	struct clk *clk;
+	struct flexcan_platform_data *pdata;
+};
+
+static struct can_bittiming_const flexcan_bittiming_const = {
+	.name = DRV_NAME,
+	.tseg1_min = 4,
+	.tseg1_max = 16,
+	.tseg2_min = 2,
+	.tseg2_max = 8,
+	.sjw_max = 4,
+	.brp_min = 1,
+	.brp_max = 256,
+	.brp_inc = 1,
+};
+
+/*
+ * Swtich transceiver on or off
+ */
+static void flexcan_transceiver_switch(const struct flexcan_priv *priv, int on)
+{
+	if (priv->pdata && priv->pdata->transceiver_switch)
+		priv->pdata->transceiver_switch(on);
+}
+
+static inline int flexcan_has_and_handle_berr(const struct flexcan_priv *priv,
+					      u32 reg_esr)
+{
+	return (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) &&
+		(reg_esr & FLEXCAN_ESR_ERR_BUS);
+}
+
+static inline void flexcan_chip_enable(struct flexcan_priv *priv)
+{
+	struct flexcan_regs __iomem *regs = priv->base;
+	u32 reg;
+
+	reg = readl(&regs->mcr);
+	reg &= ~FLEXCAN_MCR_MDIS;
+	writel(reg, &regs->mcr);
+
+	udelay(10);
+}
+
+static inline void flexcan_chip_disable(struct flexcan_priv *priv)
+{
+	struct flexcan_regs __iomem *regs = priv->base;
+	u32 reg;
+
+	reg = readl(&regs->mcr);
+	reg |= FLEXCAN_MCR_MDIS;
+	writel(reg, &regs->mcr);
+}
+
+static int flexcan_get_berr_counter(const struct net_device *dev,
+				    struct can_berr_counter *bec)
+{
+	const struct flexcan_priv *priv = netdev_priv(dev);
+	struct flexcan_regs __iomem *regs = priv->base;
+	u32 reg = readl(&regs->ecr);
+
+	bec->txerr = (reg >> 0) & 0xff;
+	bec->rxerr = (reg >> 8) & 0xff;
+
+	return 0;
+}
+
+static int flexcan_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct flexcan_priv *priv = netdev_priv(dev);
+	struct net_device_stats *stats = &dev->stats;
+	struct flexcan_regs __iomem *regs = priv->base;
+	struct can_frame *cf = (struct can_frame *)skb->data;
+	u32 can_id;
+	u32 ctrl = FLEXCAN_MB_CNT_CODE(0xc) | (cf->can_dlc << 16);
+
+	if (can_dropped_invalid_skb(dev, skb))
+		return NETDEV_TX_OK;
+
+	netif_stop_queue(dev);
+
+	if (cf->can_id & CAN_EFF_FLAG) {
+		can_id = cf->can_id & CAN_EFF_MASK;
+		ctrl |= FLEXCAN_MB_CNT_IDE | FLEXCAN_MB_CNT_SRR;
+	} else {
+		can_id = (cf->can_id & CAN_SFF_MASK) << 18;
+	}
+
+	if (cf->can_id & CAN_RTR_FLAG)
+		ctrl |= FLEXCAN_MB_CNT_RTR;
+
+	if (cf->can_dlc > 0) {
+		u32 data = be32_to_cpup((__be32 *)&cf->data[0]);
+		writel(data, &regs->cantxfg[FLEXCAN_TX_BUF_ID].data[0]);
+	}
+	if (cf->can_dlc > 3) {
+		u32 data = be32_to_cpup((__be32 *)&cf->data[4]);
+		writel(data, &regs->cantxfg[FLEXCAN_TX_BUF_ID].data[1]);
+	}
+
+	writel(can_id, &regs->cantxfg[FLEXCAN_TX_BUF_ID].can_id);
+	writel(ctrl, &regs->cantxfg[FLEXCAN_TX_BUF_ID].can_ctrl);
+
+	kfree_skb(skb);
+
+	/* tx_packets is incremented in flexcan_irq */
+	stats->tx_bytes += cf->can_dlc;
+
+	return NETDEV_TX_OK;
+}
+
+static void do_bus_err(struct net_device *dev,
+		       struct can_frame *cf, u32 reg_esr)
+{
+	struct flexcan_priv *priv = netdev_priv(dev);
+	int rx_errors = 0, tx_errors = 0;
+
+	cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
+
+	if (reg_esr & FLEXCAN_ESR_BIT1_ERR) {
+		dev_dbg(dev->dev.parent, "BIT1_ERR irq\n");
+		cf->data[2] |= CAN_ERR_PROT_BIT1;
+		tx_errors = 1;
+	}
+	if (reg_esr & FLEXCAN_ESR_BIT0_ERR) {
+		dev_dbg(dev->dev.parent, "BIT0_ERR irq\n");
+		cf->data[2] |= CAN_ERR_PROT_BIT0;
+		tx_errors = 1;
+	}
+	if (reg_esr & FLEXCAN_ESR_ACK_ERR) {
+		dev_dbg(dev->dev.parent, "ACK_ERR irq\n");
+		cf->can_id |= CAN_ERR_ACK;
+		cf->data[3] |= CAN_ERR_PROT_LOC_ACK;
+		tx_errors = 1;
+	}
+	if (reg_esr & FLEXCAN_ESR_CRC_ERR) {
+		dev_dbg(dev->dev.parent, "CRC_ERR irq\n");
+		cf->data[2] |= CAN_ERR_PROT_BIT;
+		cf->data[3] |= CAN_ERR_PROT_LOC_CRC_SEQ;
+		rx_errors = 1;
+	}
+	if (reg_esr & FLEXCAN_ESR_FRM_ERR) {
+		dev_dbg(dev->dev.parent, "FRM_ERR irq\n");
+		cf->data[2] |= CAN_ERR_PROT_FORM;
+		rx_errors = 1;
+	}
+	if (reg_esr & FLEXCAN_ESR_STF_ERR) {
+		dev_dbg(dev->dev.parent, "STF_ERR irq\n");
+		cf->data[2] |= CAN_ERR_PROT_STUFF;
+		rx_errors = 1;
+	}
+
+	priv->can.can_stats.bus_error++;
+	if (rx_errors)
+		dev->stats.rx_errors++;
+	if (tx_errors)
+		dev->stats.tx_errors++;
+}
+
+static int flexcan_poll_bus_err(struct net_device *dev, u32 reg_esr)
+{
+	struct sk_buff *skb;
+	struct can_frame *cf;
+
+	skb = alloc_can_err_skb(dev, &cf);
+	if (unlikely(!skb))
+		return 0;
+
+	do_bus_err(dev, cf, reg_esr);
+	netif_receive_skb(skb);
+
+	dev->stats.rx_packets++;
+	dev->stats.rx_bytes += cf->can_dlc;
+
+	return 1;
+}
+
+static void do_state(struct net_device *dev,
+		     struct can_frame *cf, enum can_state new_state)
+{
+	struct flexcan_priv *priv = netdev_priv(dev);
+	struct can_berr_counter bec;
+
+	flexcan_get_berr_counter(dev, &bec);
+
+	switch (priv->can.state) {
+	case CAN_STATE_ERROR_ACTIVE:
+		/*
+		 * from: ERROR_ACTIVE
+		 * to  : ERROR_WARNING, ERROR_PASSIVE, BUS_OFF
+		 * =>  : there was a warning int
+		 */
+		if (new_state >= CAN_STATE_ERROR_WARNING &&
+		    new_state <= CAN_STATE_BUS_OFF) {
+			dev_dbg(dev->dev.parent, "Error Warning IRQ\n");
+			priv->can.can_stats.error_warning++;
+
+			cf->can_id |= CAN_ERR_CRTL;
+			cf->data[1] = (bec.txerr > bec.rxerr) ?
+				CAN_ERR_CRTL_TX_WARNING :
+				CAN_ERR_CRTL_RX_WARNING;
+		}
+	case CAN_STATE_ERROR_WARNING:	/* fallthrough */
+		/*
+		 * from: ERROR_ACTIVE, ERROR_WARNING
+		 * to  : ERROR_PASSIVE, BUS_OFF
+		 * =>  : error passive int
+		 */
+		if (new_state >= CAN_STATE_ERROR_PASSIVE &&
+		    new_state <= CAN_STATE_BUS_OFF) {
+			dev_dbg(dev->dev.parent, "Error Passive IRQ\n");
+			priv->can.can_stats.error_passive++;
+
+			cf->can_id |= CAN_ERR_CRTL;
+			cf->data[1] = (bec.txerr > bec.rxerr) ?
+				CAN_ERR_CRTL_TX_PASSIVE :
+				CAN_ERR_CRTL_RX_PASSIVE;
+		}
+		break;
+	case CAN_STATE_BUS_OFF:
+		dev_err(dev->dev.parent,
+			"BUG! hardware recovered automatically from BUS_OFF\n");
+		break;
+	default:
+		break;
+	}
+
+	/* process state changes depending on the new state */
+	switch (new_state) {
+	case CAN_STATE_ERROR_ACTIVE:
+		dev_dbg(dev->dev.parent, "Error Active\n");
+		cf->can_id |= CAN_ERR_PROT;
+		cf->data[2] = CAN_ERR_PROT_ACTIVE;
+		break;
+	case CAN_STATE_BUS_OFF:
+		cf->can_id |= CAN_ERR_BUSOFF;
+		can_bus_off(dev);
+		break;
+	default:
+		break;
+	}
+}
+
+static int flexcan_poll_state(struct net_device *dev, u32 reg_esr)
+{
+	struct flexcan_priv *priv = netdev_priv(dev);
+	struct sk_buff *skb;
+	struct can_frame *cf;
+	enum can_state new_state;
+	int flt;
+
+	flt = reg_esr & FLEXCAN_ESR_FLT_CONF_MASK;
+	if (likely(flt == FLEXCAN_ESR_FLT_CONF_ACTIVE)) {
+		if (likely(!(reg_esr & (FLEXCAN_ESR_TX_WRN |
+					FLEXCAN_ESR_RX_WRN))))
+			new_state = CAN_STATE_ERROR_ACTIVE;
+		else
+			new_state = CAN_STATE_ERROR_WARNING;
+	} else if (unlikely(flt == FLEXCAN_ESR_FLT_CONF_PASSIVE))
+		new_state = CAN_STATE_ERROR_PASSIVE;
+	else
+		new_state = CAN_STATE_BUS_OFF;
+
+	/* state hasn't changed */
+	if (likely(new_state == priv->can.state))
+		return 0;
+
+	skb = alloc_can_err_skb(dev, &cf);
+	if (unlikely(!skb))
+		return 0;
+
+	do_state(dev, cf, new_state);
+	priv->can.state = new_state;
+	netif_receive_skb(skb);
+
+	dev->stats.rx_packets++;
+	dev->stats.rx_bytes += cf->can_dlc;
+
+	return 1;
+}
+
+static void flexcan_read_fifo(const struct net_device *dev,
+			      struct can_frame *cf)
+{
+	const struct flexcan_priv *priv = netdev_priv(dev);
+	struct flexcan_regs __iomem *regs = priv->base;
+	struct flexcan_mb __iomem *mb = &regs->cantxfg[0];
+	u32 reg_ctrl, reg_id;
+
+	reg_ctrl = readl(&mb->can_ctrl);
+	reg_id = readl(&mb->can_id);
+	if (reg_ctrl & FLEXCAN_MB_CNT_IDE)
+		cf->can_id = ((reg_id >> 0) & CAN_EFF_MASK) | CAN_EFF_FLAG;
+	else
+		cf->can_id = (reg_id >> 18) & CAN_SFF_MASK;
+
+	if (reg_ctrl & FLEXCAN_MB_CNT_RTR)
+		cf->can_id |= CAN_RTR_FLAG;
+	cf->can_dlc = get_can_dlc((reg_ctrl >> 16) & 0xf);
+
+	*(__be32 *)(cf->data + 0) = cpu_to_be32(readl(&mb->data[0]));
+	*(__be32 *)(cf->data + 4) = cpu_to_be32(readl(&mb->data[1]));
+
+	/* mark as read */
+	writel(FLEXCAN_IFLAG_RX_FIFO_AVAILABLE, &regs->iflag1);
+	readl(&regs->timer);
+}
+
+static int flexcan_read_frame(struct net_device *dev)
+{
+	struct net_device_stats *stats = &dev->stats;
+	struct can_frame *cf;
+	struct sk_buff *skb;
+
+	skb = alloc_can_skb(dev, &cf);
+	if (unlikely(!skb)) {
+		stats->rx_dropped++;
+		return 0;
+	}
+
+	flexcan_read_fifo(dev, cf);
+	netif_receive_skb(skb);
+
+	stats->rx_packets++;
+	stats->rx_bytes += cf->can_dlc;
+
+	return 1;
+}
+
+static int flexcan_poll(struct napi_struct *napi, int quota)
+{
+	struct net_device *dev = napi->dev;
+	const struct flexcan_priv *priv = netdev_priv(dev);
+	struct flexcan_regs __iomem *regs = priv->base;
+	u32 reg_iflag1, reg_esr;
+	int work_done = 0;
+
+	/*
+	 * The error bits are cleared on read,
+	 * use saved value from irq handler.
+	 */
+	reg_esr = readl(&regs->esr) | priv->reg_esr;
+
+	/* handle state changes */
+	work_done += flexcan_poll_state(dev, reg_esr);
+
+	/* handle RX-FIFO */
+	reg_iflag1 = readl(&regs->iflag1);
+	while (reg_iflag1 & FLEXCAN_IFLAG_RX_FIFO_AVAILABLE &&
+	       work_done < quota) {
+		work_done += flexcan_read_frame(dev);
+		reg_iflag1 = readl(&regs->iflag1);
+	}
+
+	/* report bus errors */
+	if (flexcan_has_and_handle_berr(priv, reg_esr) && work_done < quota)
+		work_done += flexcan_poll_bus_err(dev, reg_esr);
+
+	if (work_done < quota) {
+		napi_complete(napi);
+		/* enable IRQs */
+		writel(FLEXCAN_IFLAG_DEFAULT, &regs->imask1);
+		writel(priv->reg_ctrl_default, &regs->ctrl);
+	}
+
+	return work_done;
+}
+
+static irqreturn_t flexcan_irq(int irq, void *dev_id)
+{
+	struct net_device *dev = dev_id;
+	struct net_device_stats *stats = &dev->stats;
+	struct flexcan_priv *priv = netdev_priv(dev);
+	struct flexcan_regs __iomem *regs = priv->base;
+	u32 reg_iflag1, reg_esr;
+
+	reg_iflag1 = readl(&regs->iflag1);
+	reg_esr = readl(&regs->esr);
+	writel(FLEXCAN_ESR_ERR_INT, &regs->esr);	/* ACK err IRQ */
+
+	/*
+	 * schedule NAPI in case of:
+	 * - rx IRQ
+	 * - state change IRQ
+	 * - bus error IRQ and bus error reporting is activated
+	 */
+	if ((reg_iflag1 & FLEXCAN_IFLAG_RX_FIFO_AVAILABLE) ||
+	    (reg_esr & FLEXCAN_ESR_ERR_STATE) ||
+	    flexcan_has_and_handle_berr(priv, reg_esr)) {
+		/*
+		 * The error bits are cleared on read,
+		 * save them for later use.
+		 */
+		priv->reg_esr = reg_esr & FLEXCAN_ESR_ERR_BUS;
+		writel(FLEXCAN_IFLAG_DEFAULT & ~FLEXCAN_IFLAG_RX_FIFO_AVAILABLE,
+		       &regs->imask1);
+		writel(priv->reg_ctrl_default & ~FLEXCAN_CTRL_ERR_ALL,
+		       &regs->ctrl);
+		napi_schedule(&priv->napi);
+	}
+
+	/* FIFO overflow */
+	if (reg_iflag1 & FLEXCAN_IFLAG_RX_FIFO_OVERFLOW) {
+		writel(FLEXCAN_IFLAG_RX_FIFO_OVERFLOW, &regs->iflag1);
+		dev->stats.rx_over_errors++;
+		dev->stats.rx_errors++;
+	}
+
+	/* transmission complete interrupt */
+	if (reg_iflag1 & (1 << FLEXCAN_TX_BUF_ID)) {
+		/* tx_bytes is incremented in flexcan_start_xmit */
+		stats->tx_packets++;
+		writel((1 << FLEXCAN_TX_BUF_ID), &regs->iflag1);
+		netif_wake_queue(dev);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void flexcan_set_bittiming(struct net_device *dev)
+{
+	const struct flexcan_priv *priv = netdev_priv(dev);
+	const struct can_bittiming *bt = &priv->can.bittiming;
+	struct flexcan_regs __iomem *regs = priv->base;
+	u32 reg;
+
+	reg = readl(&regs->ctrl);
+	reg &= ~(FLEXCAN_CTRL_PRESDIV(0xff) |
+		 FLEXCAN_CTRL_RJW(0x3) |
+		 FLEXCAN_CTRL_PSEG1(0x7) |
+		 FLEXCAN_CTRL_PSEG2(0x7) |
+		 FLEXCAN_CTRL_PROPSEG(0x7) |
+		 FLEXCAN_CTRL_LPB |
+		 FLEXCAN_CTRL_SMP |
+		 FLEXCAN_CTRL_LOM);
+
+	reg |= FLEXCAN_CTRL_PRESDIV(bt->brp - 1) |
+		FLEXCAN_CTRL_PSEG1(bt->phase_seg1 - 1) |
+		FLEXCAN_CTRL_PSEG2(bt->phase_seg2 - 1) |
+		FLEXCAN_CTRL_RJW(bt->sjw - 1) |
+		FLEXCAN_CTRL_PROPSEG(bt->prop_seg - 1);
+
+	if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK)
+		reg |= FLEXCAN_CTRL_LPB;
+	if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY)
+		reg |= FLEXCAN_CTRL_LOM;
+	if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES)
+		reg |= FLEXCAN_CTRL_SMP;
+
+	dev_info(dev->dev.parent, "writing ctrl=0x%08x\n", reg);
+	writel(reg, &regs->ctrl);
+
+	/* print chip status */
+	dev_dbg(dev->dev.parent, "%s: mcr=0x%08x ctrl=0x%08x\n", __func__,
+		readl(&regs->mcr), readl(&regs->ctrl));
+}
+
+/*
+ * flexcan_chip_start
+ *
+ * this functions is entered with clocks enabled
+ *
+ */
+static int flexcan_chip_start(struct net_device *dev)
+{
+	struct flexcan_priv *priv = netdev_priv(dev);
+	struct flexcan_regs __iomem *regs = priv->base;
+	unsigned int i;
+	int err;
+	u32 reg_mcr, reg_ctrl;
+
+	/* enable module */
+	flexcan_chip_enable(priv);
+
+	/* soft reset */
+	writel(FLEXCAN_MCR_SOFTRST, &regs->mcr);
+	udelay(10);
+
+	reg_mcr = readl(&regs->mcr);
+	if (reg_mcr & FLEXCAN_MCR_SOFTRST) {
+		dev_err(dev->dev.parent,
+			"Failed to softreset can module (mcr=0x%08x)\n",
+			reg_mcr);
+		err = -ENODEV;
+		goto out;
+	}
+
+	flexcan_set_bittiming(dev);
+
+	/*
+	 * MCR
+	 *
+	 * enable freeze
+	 * enable fifo
+	 * halt now
+	 * only supervisor access
+	 * enable warning int
+	 * choose format C
+	 *
+	 */
+	reg_mcr = readl(&regs->mcr);
+	reg_mcr |= FLEXCAN_MCR_FRZ | FLEXCAN_MCR_FEN | FLEXCAN_MCR_HALT |
+		FLEXCAN_MCR_SUPV | FLEXCAN_MCR_WRN_EN |
+		FLEXCAN_MCR_IDAM_C;
+	dev_dbg(dev->dev.parent, "%s: writing mcr=0x%08x", __func__, reg_mcr);
+	writel(reg_mcr, &regs->mcr);
+
+	/*
+	 * CTRL
+	 *
+	 * disable timer sync feature
+	 *
+	 * disable auto busoff recovery
+	 * transmit lowest buffer first
+	 *
+	 * enable tx and rx warning interrupt
+	 * enable bus off interrupt
+	 * (== FLEXCAN_CTRL_ERR_STATE)
+	 *
+	 * _note_: we enable the "error interrupt"
+	 * (FLEXCAN_CTRL_ERR_MSK), too. Otherwise we don't get any
+	 * warning or bus passive interrupts.
+	 */
+	reg_ctrl = readl(&regs->ctrl);
+	reg_ctrl &= ~FLEXCAN_CTRL_TSYN;
+	reg_ctrl |= FLEXCAN_CTRL_BOFF_REC | FLEXCAN_CTRL_LBUF |
+		FLEXCAN_CTRL_ERR_STATE | FLEXCAN_CTRL_ERR_MSK;
+
+	/* save for later use */
+	priv->reg_ctrl_default = reg_ctrl;
+	dev_dbg(dev->dev.parent, "%s: writing ctrl=0x%08x", __func__, reg_ctrl);
+	writel(reg_ctrl, &regs->ctrl);
+
+	for (i = 0; i < ARRAY_SIZE(regs->cantxfg); i++) {
+		writel(0, &regs->cantxfg[i].can_ctrl);
+		writel(0, &regs->cantxfg[i].can_id);
+		writel(0, &regs->cantxfg[i].data[0]);
+		writel(0, &regs->cantxfg[i].data[1]);
+
+		/* put MB into rx queue */
+		writel(FLEXCAN_MB_CNT_CODE(0x4), &regs->cantxfg[i].can_ctrl);
+	}
+
+	/* acceptance mask/acceptance code (accept everything) */
+	writel(0x0, &regs->rxgmask);
+	writel(0x0, &regs->rx14mask);
+	writel(0x0, &regs->rx15mask);
+
+	flexcan_transceiver_switch(priv, 1);
+
+	/* synchronize with the can bus */
+	reg_mcr = readl(&regs->mcr);
+	reg_mcr &= ~FLEXCAN_MCR_HALT;
+	writel(reg_mcr, &regs->mcr);
+
+	priv->can.state = CAN_STATE_ERROR_ACTIVE;
+
+	/* enable FIFO interrupts */
+	writel(FLEXCAN_IFLAG_DEFAULT, &regs->imask1);
+
+	/* print chip status */
+	dev_dbg(dev->dev.parent, "%s: reading mcr=0x%08x ctrl=0x%08x\n",
+		__func__, readl(&regs->mcr), readl(&regs->ctrl));
+
+	return 0;
+
+ out:
+	flexcan_chip_disable(priv);
+	return err;
+}
+
+/*
+ * flexcan_chip_stop
+ *
+ * this functions is entered with clocks enabled
+ *
+ */
+static void flexcan_chip_stop(struct net_device *dev)
+{
+	struct flexcan_priv *priv = netdev_priv(dev);
+	struct flexcan_regs __iomem *regs = priv->base;
+	u32 reg;
+
+	/* Disable all interrupts */
+	writel(0, &regs->imask1);
+
+	/* Disable + halt module */
+	reg = readl(&regs->mcr);
+	reg |= FLEXCAN_MCR_MDIS | FLEXCAN_MCR_HALT;
+	writel(reg, &regs->mcr);
+
+	flexcan_transceiver_switch(priv, 0);
+	priv->can.state = CAN_STATE_STOPPED;
+
+	return;
+}
+
+static int flexcan_open(struct net_device *dev)
+{
+	struct flexcan_priv *priv = netdev_priv(dev);
+	int err;
+
+	clk_enable(priv->clk);
+
+	err = open_candev(dev);
+	if (err)
+		goto out;
+
+	err = request_irq(dev->irq, flexcan_irq, IRQF_SHARED, dev->name, dev);
+	if (err)
+		goto out_close;
+
+	/* start chip and queuing */
+	err = flexcan_chip_start(dev);
+	if (err)
+		goto out_close;
+	napi_enable(&priv->napi);
+	netif_start_queue(dev);
+
+	return 0;
+
+ out_close:
+	close_candev(dev);
+ out:
+	clk_disable(priv->clk);
+
+	return err;
+}
+
+static int flexcan_close(struct net_device *dev)
+{
+	struct flexcan_priv *priv = netdev_priv(dev);
+
+	netif_stop_queue(dev);
+	napi_disable(&priv->napi);
+	flexcan_chip_stop(dev);
+
+	free_irq(dev->irq, dev);
+	clk_disable(priv->clk);
+
+	close_candev(dev);
+
+	return 0;
+}
+
+static int flexcan_set_mode(struct net_device *dev, enum can_mode mode)
+{
+	int err;
+
+	switch (mode) {
+	case CAN_MODE_START:
+		err = flexcan_chip_start(dev);
+		if (err)
+			return err;
+
+		netif_wake_queue(dev);
+		break;
+
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static const struct net_device_ops flexcan_netdev_ops = {
+	.ndo_open	= flexcan_open,
+	.ndo_stop	= flexcan_close,
+	.ndo_start_xmit	= flexcan_start_xmit,
+};
+
+static int __devinit register_flexcandev(struct net_device *dev)
+{
+	struct flexcan_priv *priv = netdev_priv(dev);
+	struct flexcan_regs __iomem *regs = priv->base;
+	u32 reg, err;
+
+	clk_enable(priv->clk);
+
+	/* select "bus clock", chip must be disabled */
+	flexcan_chip_disable(priv);
+	reg = readl(&regs->ctrl);
+	reg |= FLEXCAN_CTRL_CLK_SRC;
+	writel(reg, &regs->ctrl);
+
+	flexcan_chip_enable(priv);
+
+	/* set freeze, halt and activate FIFO, restrict register access */
+	reg = readl(&regs->mcr);
+	reg |= FLEXCAN_MCR_FRZ | FLEXCAN_MCR_HALT |
+		FLEXCAN_MCR_FEN | FLEXCAN_MCR_SUPV;
+	writel(reg, &regs->mcr);
+
+	/*
+	 * Currently we only support newer versions of this core
+	 * featuring a RX FIFO. Older cores found on some Coldfire
+	 * derivates are not yet supported.
+	 */
+	reg = readl(&regs->mcr);
+	if (!(reg & FLEXCAN_MCR_FEN)) {
+		dev_err(dev->dev.parent,
+			"Could not enable RX FIFO, unsupported core\n");
+		err = -ENODEV;
+		goto out;
+	}
+
+	err = register_candev(dev);
+
+ out:
+	/* disable core and turn off clocks */
+	flexcan_chip_disable(priv);
+	clk_disable(priv->clk);
+
+	return err;
+}
+
+static void __devexit unregister_flexcandev(struct net_device *dev)
+{
+	unregister_candev(dev);
+}
+
+static int __devinit flexcan_probe(struct platform_device *pdev)
+{
+	struct net_device *dev;
+	struct flexcan_priv *priv;
+	struct resource *mem;
+	struct clk *clk;
+	void __iomem *base;
+	resource_size_t mem_size;
+	int err, irq;
+
+	clk = clk_get(&pdev->dev, NULL);
+	if (IS_ERR(clk)) {
+		dev_err(&pdev->dev, "no clock defined\n");
+		err = PTR_ERR(clk);
+		goto failed_clock;
+	}
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	irq = platform_get_irq(pdev, 0);
+	if (!mem || irq <= 0) {
+		err = -ENODEV;
+		goto failed_get;
+	}
+
+	mem_size = resource_size(mem);
+	if (!request_mem_region(mem->start, mem_size, pdev->name)) {
+		err = -EBUSY;
+		goto failed_req;
+	}
+
+	base = ioremap(mem->start, mem_size);
+	if (!base) {
+		err = -ENOMEM;
+		goto failed_map;
+	}
+
+	dev = alloc_candev(sizeof(struct flexcan_priv), 0);
+	if (!dev) {
+		err = -ENOMEM;
+		goto failed_alloc;
+	}
+
+	dev->netdev_ops = &flexcan_netdev_ops;
+	dev->irq = irq;
+	dev->flags |= IFF_ECHO; /* we support local echo in hardware */
+
+	priv = netdev_priv(dev);
+	priv->can.clock.freq = clk_get_rate(clk);
+	priv->can.bittiming_const = &flexcan_bittiming_const;
+	priv->can.do_set_mode = flexcan_set_mode;
+	priv->can.do_get_berr_counter = flexcan_get_berr_counter;
+	priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK |
+		CAN_CTRLMODE_LISTENONLY	| CAN_CTRLMODE_3_SAMPLES |
+		CAN_CTRLMODE_BERR_REPORTING;
+	priv->base = base;
+	priv->dev = dev;
+	priv->clk = clk;
+	priv->pdata = pdev->dev.platform_data;
+
+	netif_napi_add(dev, &priv->napi, flexcan_poll, FLEXCAN_NAPI_WEIGHT);
+
+	dev_set_drvdata(&pdev->dev, dev);
+	SET_NETDEV_DEV(dev, &pdev->dev);
+
+	err = register_flexcandev(dev);
+	if (err) {
+		dev_err(&pdev->dev, "registering netdev failed\n");
+		goto failed_register;
+	}
+
+	dev_info(&pdev->dev, "device registered (reg_base=%p, irq=%d)\n",
+		 priv->base, dev->irq);
+
+	return 0;
+
+ failed_register:
+	free_candev(dev);
+ failed_alloc:
+	iounmap(base);
+ failed_map:
+	release_mem_region(mem->start, mem_size);
+ failed_req:
+	clk_put(clk);
+ failed_get:
+ failed_clock:
+	return err;
+}
+
+static int __devexit flexcan_remove(struct platform_device *pdev)
+{
+	struct net_device *dev = platform_get_drvdata(pdev);
+	struct flexcan_priv *priv = netdev_priv(dev);
+	struct resource *mem;
+
+	unregister_flexcandev(dev);
+	platform_set_drvdata(pdev, NULL);
+	free_candev(dev);
+	iounmap(priv->base);
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	release_mem_region(mem->start, resource_size(mem));
+
+	clk_put(priv->clk);
+
+	return 0;
+}
+
+static struct platform_driver flexcan_driver = {
+	.driver.name = DRV_NAME,
+	.probe = flexcan_probe,
+	.remove = __devexit_p(flexcan_remove),
+};
+
+static int __init flexcan_init(void)
+{
+	pr_info("%s netdevice driver\n", DRV_NAME);
+	return platform_driver_register(&flexcan_driver);
+}
+
+static void __exit flexcan_exit(void)
+{
+	platform_driver_unregister(&flexcan_driver);
+	pr_info("%s: driver removed\n", DRV_NAME);
+}
+
+module_init(flexcan_init);
+module_exit(flexcan_exit);
+
+MODULE_AUTHOR("Sascha Hauer <kernel@pengutronix.de>, "
+	      "Marc Kleine-Budde <kernel@pengutronix.de>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("CAN port driver for flexcan based chip");
diff --git a/include/linux/can/platform/flexcan.h b/include/linux/can/platform/flexcan.h
new file mode 100644
index 000000000000..72b713ab57e9
--- /dev/null
+++ b/include/linux/can/platform/flexcan.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2010 Marc Kleine-Budde <kernel@pengutronix.de>
+ *
+ * This file is released under the GPLv2
+ *
+ */
+
+#ifndef __CAN_PLATFORM_FLEXCAN_H
+#define __CAN_PLATFORM_FLEXCAN_H
+
+/**
+ * struct flexcan_platform_data - flex CAN controller platform data
+ * @transceiver_enable:         - called to power on/off the transceiver
+ *
+ */
+struct flexcan_platform_data {
+	void (*transceiver_switch)(int enable);
+};
+
+#endif /* __CAN_PLATFORM_FLEXCAN_H */
-- 
cgit v1.2.3-59-g8ed1b


From e120153ddf8620fd0a194d301e9c5a8b28483bb5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 22 Jul 2010 14:14:25 +0200
Subject: workqueue: fix how cpu number is stored in work->data

Once a work starts execution, its data contains the cpu number it was
on instead of pointing to cwq.  This is added by commit 7a22ad75
(workqueue: carry cpu number in work data once execution starts) to
reliably determine the work was last on even if the workqueue itself
was destroyed inbetween.

Whether data points to a cwq or contains a cpu number was
distinguished by comparing the value against PAGE_OFFSET.  The
assumption was that a cpu number should be below PAGE_OFFSET while a
pointer to cwq should be above it.  However, on architectures which
use separate address spaces for user and kernel spaces, this doesn't
hold as PAGE_OFFSET is zero.

Fix it by using an explicit flag, WORK_STRUCT_CWQ, to mark what the
data field contains.  If the flag is set, it's pointing to a cwq;
otherwise, it contains a cpu number.

Reported on s390 and microblaze during linux-next testing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Sachin Sant <sachinp@in.ibm.com>
Reported-by: Michal Simek <michal.simek@petalogix.com>
Reported-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Tested-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Tested-by: Michal Simek <monstr@monstr.eu>
---
 include/linux/workqueue.h | 14 ++++++++------
 kernel/workqueue.c        | 36 +++++++++++++-----------------------
 2 files changed, 21 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index d74a529ed13e..5f76001c4e6d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -25,17 +25,19 @@ typedef void (*work_func_t)(struct work_struct *work);
 
 enum {
 	WORK_STRUCT_PENDING_BIT	= 0,	/* work item is pending execution */
-	WORK_STRUCT_LINKED_BIT	= 1,	/* next work is linked to this one */
+	WORK_STRUCT_CWQ_BIT	= 1,	/* data points to cwq */
+	WORK_STRUCT_LINKED_BIT	= 2,	/* next work is linked to this one */
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
-	WORK_STRUCT_STATIC_BIT	= 2,	/* static initializer (debugobjects) */
-	WORK_STRUCT_COLOR_SHIFT	= 3,	/* color for workqueue flushing */
+	WORK_STRUCT_STATIC_BIT	= 3,	/* static initializer (debugobjects) */
+	WORK_STRUCT_COLOR_SHIFT	= 4,	/* color for workqueue flushing */
 #else
-	WORK_STRUCT_COLOR_SHIFT	= 2,	/* color for workqueue flushing */
+	WORK_STRUCT_COLOR_SHIFT	= 3,	/* color for workqueue flushing */
 #endif
 
 	WORK_STRUCT_COLOR_BITS	= 4,
 
 	WORK_STRUCT_PENDING	= 1 << WORK_STRUCT_PENDING_BIT,
+	WORK_STRUCT_CWQ		= 1 << WORK_STRUCT_CWQ_BIT,
 	WORK_STRUCT_LINKED	= 1 << WORK_STRUCT_LINKED_BIT,
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 	WORK_STRUCT_STATIC	= 1 << WORK_STRUCT_STATIC_BIT,
@@ -56,8 +58,8 @@ enum {
 	WORK_CPU_LAST		= WORK_CPU_NONE,
 
 	/*
-	 * Reserve 6 bits off of cwq pointer w/ debugobjects turned
-	 * off.  This makes cwqs aligned to 64 bytes which isn't too
+	 * Reserve 7 bits off of cwq pointer w/ debugobjects turned
+	 * off.  This makes cwqs aligned to 128 bytes which isn't too
 	 * excessive while allowing 15 workqueue flush colors.
 	 */
 	WORK_STRUCT_FLAG_BITS	= WORK_STRUCT_COLOR_SHIFT +
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c11edc9c9365..e5cb7faac58e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -468,10 +468,9 @@ static int work_next_color(int color)
 }
 
 /*
- * Work data points to the cwq while a work is on queue.  Once
- * execution starts, it points to the cpu the work was last on.  This
- * can be distinguished by comparing the data value against
- * PAGE_OFFSET.
+ * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
+ * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
+ * cleared and the work data contains the cpu number it was last on.
  *
  * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
  * cwq, cpu or clear work->data.  These functions should only be
@@ -494,7 +493,7 @@ static void set_work_cwq(struct work_struct *work,
 			 unsigned long extra_flags)
 {
 	set_work_data(work, (unsigned long)cwq,
-		      WORK_STRUCT_PENDING | extra_flags);
+		      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
 }
 
 static void set_work_cpu(struct work_struct *work, unsigned int cpu)
@@ -507,25 +506,24 @@ static void clear_work_data(struct work_struct *work)
 	set_work_data(work, WORK_STRUCT_NO_CPU, 0);
 }
 
-static inline unsigned long get_work_data(struct work_struct *work)
-{
-	return atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK;
-}
-
 static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
 {
-	unsigned long data = get_work_data(work);
+	unsigned long data = atomic_long_read(&work->data);
 
-	return data >= PAGE_OFFSET ? (void *)data : NULL;
+	if (data & WORK_STRUCT_CWQ)
+		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+	else
+		return NULL;
 }
 
 static struct global_cwq *get_work_gcwq(struct work_struct *work)
 {
-	unsigned long data = get_work_data(work);
+	unsigned long data = atomic_long_read(&work->data);
 	unsigned int cpu;
 
-	if (data >= PAGE_OFFSET)
-		return ((struct cpu_workqueue_struct *)data)->gcwq;
+	if (data & WORK_STRUCT_CWQ)
+		return ((struct cpu_workqueue_struct *)
+			(data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
 
 	cpu = data >> WORK_STRUCT_FLAG_BITS;
 	if (cpu == WORK_CPU_NONE)
@@ -3501,14 +3499,6 @@ void __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
-	/*
-	 * The pointer part of work->data is either pointing to the
-	 * cwq or contains the cpu number the work ran last on.  Make
-	 * sure cpu number won't overflow into kernel pointer area so
-	 * that they can be distinguished.
-	 */
-	BUILD_BUG_ON(WORK_CPU_LAST << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET);
-
 	hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
 	/* initialize gcwqs */
-- 
cgit v1.2.3-59-g8ed1b


From 963bfeeeec913d135c15dc400f2f86cb62655d81 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 20 Jul 2010 22:03:14 +0000
Subject: net: RTA_MARK addition

Add a new rt attribute, RTA_MARK, and use it in
rt_fill_info()/inet_rtm_getroute() to support following commands :

ip route get 192.168.20.110 mark NUMBER
ip route get 192.168.20.108 from 192.168.20.110 iif eth1 mark NUMBER
ip route list cache [192.168.20.110] mark NUMBER

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 1 +
 net/ipv4/route.c          | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index fbc8cb0d48c3..58d44491880f 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -282,6 +282,7 @@ enum rtattr_type_t {
 	RTA_SESSION, /* no longer used */
 	RTA_MP_ALGO, /* no longer used */
 	RTA_TABLE,
+	RTA_MARK,
 	__RTA_MAX
 };
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 562ce92de2a6..3f56b6e6c6aa 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2878,6 +2878,9 @@ static int rt_fill_info(struct net *net,
 	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
 		goto nla_put_failure;
 
+	if (rt->fl.mark)
+		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
+
 	error = rt->dst.error;
 	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
 	if (rt->peer) {
@@ -2933,6 +2936,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	__be32 src = 0;
 	u32 iif;
 	int err;
+	int mark;
 	struct sk_buff *skb;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
@@ -2960,6 +2964,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
 	if (iif) {
 		struct net_device *dev;
@@ -2972,6 +2977,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 
 		skb->protocol	= htons(ETH_P_IP);
 		skb->dev	= dev;
+		skb->mark	= mark;
 		local_bh_disable();
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 		local_bh_enable();
@@ -2989,6 +2995,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 				},
 			},
 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+			.mark = mark,
 		};
 		err = ip_route_output_key(net, &rt, &fl);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 8b8edefa2fffbff97f9eec8b70e78ae23abad1a0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 20 Jul 2010 22:09:01 +0200
Subject: fscache: convert object to use workqueue instead of slow-work

Make fscache object state transition callbacks use workqueue instead
of slow-work.  New dedicated unbound CPU workqueue fscache_object_wq
is created.  get/put callbacks are renamed and modified to take
@object and called directly from the enqueue wrapper and the work
function.  While at it, make all open coded instances of get/put to
use fscache_get/put_object().

* Unbound workqueue is used.

* work_busy() output is printed instead of slow-work flags in object
  debugging outputs.  They mean basically the same thing bit-for-bit.

* sysctl fscache.object_max_active added to control concurrency.  The
  default value is nr_cpus clamped between 4 and
  WQ_UNBOUND_MAX_ACTIVE.

* slow_work_sleep_till_thread_needed() is replaced with fscache
  private implementation fscache_object_sleep_till_congested() which
  waits on fscache_object_wq congestion.

* debugfs support is dropped for now.  Tracing API based debug
  facility is planned to be added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Howells <dhowells@redhat.com>
---
 Documentation/filesystems/caching/fscache.txt |  10 +--
 fs/cachefiles/namei.c                         |  13 ++--
 fs/fscache/internal.h                         |   7 ++
 fs/fscache/main.c                             |  76 ++++++++++++++++++
 fs/fscache/object-list.c                      |  11 ++-
 fs/fscache/object.c                           | 106 +++++++++++++-------------
 include/linux/fscache-cache.h                 |   9 ++-
 7 files changed, 158 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index a91e2e2095b0..770267af5b3e 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -343,8 +343,8 @@ This will look something like:
 	[root@andromeda ~]# head /proc/fs/fscache/objects
 	OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA       OBJECT_KEY, AUX_DATA
 	======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
-	   17e4b        2 ACTV     0   0   0   0  0     0 7b  4 0 8 | NFS.fh           DT  0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
-	   1693a        2 ACTV     0   0   0   0  0     0 7b  4 0 8 | NFS.fh           DT  0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
+	   17e4b        2 ACTV     0   0   0   0  0     0 7b  4 0 0 | NFS.fh           DT  0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
+	   1693a        2 ACTV     0   0   0   0  0     0 7b  4 0 0 | NFS.fh           DT  0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
 
 where the first set of columns before the '|' describe the object:
 
@@ -362,7 +362,7 @@ where the first set of columns before the '|' describe the object:
 	EM	Object's event mask
 	EV	Events raised on this object
 	F	Object flags
-	S	Object slow-work work item flags
+	S	Object work item busy state mask (1:pending 2:running)
 
 and the second set of columns describe the object's cookie, if present:
 
@@ -395,8 +395,8 @@ and the following paired letters:
 	w	Show objects that don't have pending writes
 	R	Show objects that have outstanding reads
 	r	Show objects that don't have outstanding reads
-	S	Show objects that have slow work queued
-	s	Show objects that don't have slow work queued
+	S	Show objects that have work queued
+	s	Show objects that don't have work queued
 
 If neither side of a letter pair is given, then both are implied.  For example:
 
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a7840bf42c..42c7fafc8bfe 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
 
 	printk(KERN_ERR "%sobject: OBJ%x\n",
 	       prefix, object->fscache.debug_id);
-	printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
+	printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
 	       prefix, fscache_object_states[object->fscache.state],
-	       object->fscache.flags, object->fscache.work.flags,
+	       object->fscache.flags, work_busy(&object->fscache.work),
 	       object->fscache.events,
 	       object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
 	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -212,7 +212,7 @@ wait_for_old_object:
 
 		/* if the object we're waiting for is queued for processing,
 		 * then just put ourselves on the queue behind it */
-		if (slow_work_is_queued(&xobject->fscache.work)) {
+		if (work_pending(&xobject->fscache.work)) {
 			_debug("queue OBJ%x behind OBJ%x immediately",
 			       object->fscache.debug_id,
 			       xobject->fscache.debug_id);
@@ -220,8 +220,7 @@ wait_for_old_object:
 		}
 
 		/* otherwise we sleep until either the object we're waiting for
-		 * is done, or the slow-work facility wants the thread back to
-		 * do other work */
+		 * is done, or the fscache_object is congested */
 		wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
 		init_wait(&wait);
 		requeue = false;
@@ -229,8 +228,8 @@ wait_for_old_object:
 			prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
 			if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
 				break;
-			requeue = slow_work_sleep_till_thread_needed(
-				&object->fscache.work, &timeout);
+
+			requeue = fscache_object_sleep_till_congested(&timeout);
 		} while (timeout > 0 && !requeue);
 		finish_wait(wq, &wait);
 
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index edd7434ab6e5..6e0b5fb25231 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -82,6 +82,13 @@ extern unsigned fscache_defer_lookup;
 extern unsigned fscache_defer_create;
 extern unsigned fscache_debug;
 extern struct kobject *fscache_root;
+extern struct workqueue_struct *fscache_object_wq;
+DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+static inline bool fscache_object_congested(void)
+{
+	return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
+}
 
 extern int fscache_wait_bit(void *);
 extern int fscache_wait_bit_interruptible(void *);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index add6bdb53f04..bb8d4c35c7a2 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/completion.h>
 #include <linux/slab.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 MODULE_DESCRIPTION("FS Cache Manager");
@@ -40,22 +41,89 @@ MODULE_PARM_DESC(fscache_debug,
 		 "FS-Cache debugging mask");
 
 struct kobject *fscache_root;
+struct workqueue_struct *fscache_object_wq;
+
+DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+/* these values serve as lower bounds, will be adjusted in fscache_init() */
+static unsigned fscache_object_max_active = 4;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fscache_sysctl_header;
+
+static int fscache_max_active_sysctl(struct ctl_table *table, int write,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos)
+{
+	struct workqueue_struct **wqp = table->extra1;
+	unsigned int *datap = table->data;
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret == 0)
+		workqueue_set_max_active(*wqp, *datap);
+	return ret;
+}
+
+ctl_table fscache_sysctls[] = {
+	{
+		.procname	= "object_max_active",
+		.data		= &fscache_object_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_object_wq,
+	},
+	{}
+};
+
+ctl_table fscache_sysctls_root[] = {
+	{
+		.procname	= "fscache",
+		.mode		= 0555,
+		.child		= fscache_sysctls,
+	},
+	{}
+};
+#endif
 
 /*
  * initialise the fs caching module
  */
 static int __init fscache_init(void)
 {
+	unsigned int nr_cpus = num_possible_cpus();
+	unsigned int cpu;
 	int ret;
 
 	ret = slow_work_register_user(THIS_MODULE);
 	if (ret < 0)
 		goto error_slow_work;
 
+	fscache_object_max_active =
+		clamp_val(nr_cpus,
+			  fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
+					    fscache_object_max_active);
+	if (!fscache_object_wq)
+		goto error_object_wq;
+
+	for_each_possible_cpu(cpu)
+		init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
+
 	ret = fscache_proc_init();
 	if (ret < 0)
 		goto error_proc;
 
+#ifdef CONFIG_SYSCTL
+	ret = -ENOMEM;
+	fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
+	if (!fscache_sysctl_header)
+		goto error_sysctl;
+#endif
+
 	fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
 					       sizeof(struct fscache_cookie),
 					       0,
@@ -78,8 +146,14 @@ static int __init fscache_init(void)
 error_kobj:
 	kmem_cache_destroy(fscache_cookie_jar);
 error_cookie_jar:
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+error_sysctl:
+#endif
 	fscache_proc_cleanup();
 error_proc:
+	destroy_workqueue(fscache_object_wq);
+error_object_wq:
 	slow_work_unregister_user(THIS_MODULE);
 error_slow_work:
 	return ret;
@@ -96,7 +170,9 @@ static void __exit fscache_exit(void)
 
 	kobject_put(fscache_root);
 	kmem_cache_destroy(fscache_cookie_jar);
+	unregister_sysctl_table(fscache_sysctl_header);
 	fscache_proc_cleanup();
+	destroy_workqueue(fscache_object_wq);
 	slow_work_unregister_user(THIS_MODULE);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 4a8eb31c5338..ebe29c581380 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -34,8 +34,8 @@ struct fscache_objlist_data {
 #define FSCACHE_OBJLIST_CONFIG_NOREADS	0x00000200	/* show objects without active reads */
 #define FSCACHE_OBJLIST_CONFIG_EVENTS	0x00000400	/* show objects with events */
 #define FSCACHE_OBJLIST_CONFIG_NOEVENTS	0x00000800	/* show objects without no events */
-#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with slow work */
-#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without slow work */
+#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without work */
 
 	u8		buf[512];	/* key and aux data buffer */
 };
@@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		       READS, NOREADS);
 		FILTER(obj->events & obj->event_mask,
 		       EVENTS, NOEVENTS);
-		FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
-		       WORK, NOWORK);
+		FILTER(work_busy(&obj->work), WORK, NOWORK);
 	}
 
 	seq_printf(m,
-		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
 		   obj->debug_id,
 		   obj->parent ? obj->parent->debug_id : -1,
 		   fscache_object_states_short[obj->state],
@@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
 		   obj->events,
 		   obj->flags,
-		   obj->work.flags);
+		   work_busy(&obj->work));
 
 	no_cookie = true;
 	keylen = auxlen = 0;
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 0b589a9b4ffc..b6b897c550ac 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,7 +14,6 @@
 
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
-#include <linux/seq_file.h>
 #include "internal.h"
 
 const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 	[FSCACHE_OBJECT_DEAD]		= "DEAD",
 };
 
-static void fscache_object_slow_work_put_ref(struct slow_work *);
-static int  fscache_object_slow_work_get_ref(struct slow_work *);
-static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
-#endif
+static int  fscache_get_object(struct fscache_object *);
+static void fscache_put_object(struct fscache_object *);
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
@@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *);
 static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 
-const struct slow_work_ops fscache_object_slow_work_ops = {
-	.owner		= THIS_MODULE,
-	.get_ref	= fscache_object_slow_work_get_ref,
-	.put_ref	= fscache_object_slow_work_put_ref,
-	.execute	= fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= fscache_object_slow_work_desc,
-#endif
-};
-EXPORT_SYMBOL(fscache_object_slow_work_ops);
-
 /*
  * we need to notify the parent when an op completes that we had outstanding
  * upon it
@@ -345,7 +329,7 @@ unsupported_event:
 /*
  * execute an object
  */
-static void fscache_object_slow_work_execute(struct slow_work *work)
+void fscache_object_work_func(struct work_struct *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
@@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 	if (object->events & object->event_mask)
 		fscache_enqueue_object(object);
 	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+	fscache_put_object(object);
 }
-
-/*
- * describe an object for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *work,
-					  struct seq_file *m)
-{
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
-
-	seq_printf(m, "FSC: OBJ%x: %s",
-		   object->debug_id,
-		   fscache_object_states_short[object->state]);
-}
-#endif
+EXPORT_SYMBOL(fscache_object_work_func);
 
 /*
  * initialise an object
@@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object)
 	_enter("");
 	ASSERT(object->cookie != NULL);
 	ASSERT(object->cookie->parent != NULL);
-	ASSERT(list_empty(&object->work.link));
 
 	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
 			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
@@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object)
 		object->parent = NULL;
 	}
 
-	/* this just shifts the object release to the slow work processor */
-	fscache_stat(&fscache_n_cop_put_object);
-	object->cache->ops->put_object(object);
-	fscache_stat_d(&fscache_n_cop_put_object);
+	/* this just shifts the object release to the work processor */
+	fscache_put_object(object);
 
 	_leave("");
 }
@@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache,
 }
 
 /*
- * allow the slow work item processor to get a ref on an object
+ * get a ref on an object
  */
-static int fscache_object_slow_work_get_ref(struct slow_work *work)
+static int fscache_get_object(struct fscache_object *object)
 {
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
 	int ret;
 
 	fscache_stat(&fscache_n_cop_grab_object);
@@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
 }
 
 /*
- * allow the slow work item processor to discard a ref on a work item
+ * discard a ref on a work item
  */
-static void fscache_object_slow_work_put_ref(struct slow_work *work)
+static void fscache_put_object(struct fscache_object *object)
 {
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
-
 	fscache_stat(&fscache_n_cop_put_object);
 	object->cache->ops->put_object(object);
 	fscache_stat_d(&fscache_n_cop_put_object);
@@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object)
 {
 	_enter("{OBJ%x}", object->debug_id);
 
-	slow_work_enqueue(&object->work);
+	if (fscache_get_object(object) >= 0) {
+		wait_queue_head_t *cong_wq =
+			&get_cpu_var(fscache_object_cong_wait);
+
+		if (queue_work(fscache_object_wq, &object->work)) {
+			if (fscache_object_congested())
+				wake_up(cong_wq);
+		} else
+			fscache_put_object(object);
+
+		put_cpu_var(fscache_object_cong_wait);
+	}
+}
+
+/**
+ * fscache_object_sleep_till_congested - Sleep until object wq is congested
+ * @timoutp: Scheduler sleep timeout
+ *
+ * Allow an object handler to sleep until the object workqueue is congested.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * %true is returned if the object wq is congested, %false otherwise.
+ */
+bool fscache_object_sleep_till_congested(signed long *timeoutp)
+{
+	wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
+	DEFINE_WAIT(wait);
+
+	if (fscache_object_congested())
+		return true;
+
+	add_wait_queue_exclusive(cong_wq, &wait);
+	if (!fscache_object_congested())
+		*timeoutp = schedule_timeout(*timeoutp);
+	finish_wait(cong_wq, &wait);
+
+	return fscache_object_congested();
 }
+EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 
 /*
  * enqueue the dependents of an object for metadata-type processing
@@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
 
 		/* sort onto appropriate lists */
 		fscache_enqueue_object(dep);
-		fscache_stat(&fscache_n_cop_put_object);
-		dep->cache->ops->put_object(dep);
-		fscache_stat_d(&fscache_n_cop_put_object);
+		fscache_put_object(dep);
 
 		if (!list_empty(&object->dependents))
 			cond_resched_lock(&object->lock);
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index c57db27ac861..27c8df503152 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -21,6 +21,7 @@
 #include <linux/fscache.h>
 #include <linux/sched.h>
 #include <linux/slow-work.h>
+#include <linux/workqueue.h>
 
 #define NR_MAXCACHES BITS_PER_LONG
 
@@ -389,7 +390,7 @@ struct fscache_object {
 	struct fscache_cache	*cache;		/* cache that supplied this object */
 	struct fscache_cookie	*cookie;	/* netfs's file/index object */
 	struct fscache_object	*parent;	/* parent object */
-	struct slow_work	work;		/* attention scheduling record */
+	struct work_struct	work;		/* attention scheduling record */
 	struct list_head	dependents;	/* FIFO of dependent objects */
 	struct list_head	dep_link;	/* link in parent's dependents list */
 	struct list_head	pending_ops;	/* unstarted operations on this object */
@@ -411,7 +412,7 @@ extern const char *fscache_object_states[];
 	(test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) &&	\
 	 (obj)->state >= FSCACHE_OBJECT_DYING)
 
-extern const struct slow_work_ops fscache_object_slow_work_ops;
+extern void fscache_object_work_func(struct work_struct *work);
 
 /**
  * fscache_object_init - Initialise a cache object description
@@ -433,7 +434,7 @@ void fscache_object_init(struct fscache_object *object,
 	spin_lock_init(&object->lock);
 	INIT_LIST_HEAD(&object->cache_link);
 	INIT_HLIST_NODE(&object->cookie_link);
-	vslow_work_init(&object->work, &fscache_object_slow_work_ops);
+	INIT_WORK(&object->work, fscache_object_work_func);
 	INIT_LIST_HEAD(&object->dependents);
 	INIT_LIST_HEAD(&object->dep_link);
 	INIT_LIST_HEAD(&object->pending_ops);
@@ -534,6 +535,8 @@ extern void fscache_io_error(struct fscache_cache *cache);
 extern void fscache_mark_pages_cached(struct fscache_retrieval *op,
 				      struct pagevec *pagevec);
 
+extern bool fscache_object_sleep_till_congested(signed long *timeoutp);
+
 extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
 					       const void *data,
 					       uint16_t datalen);
-- 
cgit v1.2.3-59-g8ed1b


From 8af7c12436803291c90295259db23d371a7ad9cc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 20 Jul 2010 22:09:01 +0200
Subject: fscache: convert operation to use workqueue instead of slow-work

Make fscache operation to use only workqueue instead of combination of
workqueue and slow-work.  FSCACHE_OP_SLOW is dropped and
FSCACHE_OP_FAST is renamed to FSCACHE_OP_ASYNC and uses newly added
fscache_op_wq workqueue to execute op->processor().
fscache_operation_init_slow() is dropped and fscache_operation_init()
now takes @processor argument directly.

* Unbound workqueue is used.

* fscache_retrieval_work() is no longer necessary as OP_ASYNC now does
  the equivalent thing.

* sysctl fscache.operation_max_active added to control concurrency.
  The default value is nr_cpus clamped between 2 and
  WQ_UNBOUND_MAX_ACTIVE.

* debugfs support is dropped for now.  Tracing API based debug
  facility is planned to be added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Howells <dhowells@redhat.com>
---
 fs/cachefiles/rdwr.c          |  4 +--
 fs/fscache/internal.h         |  1 +
 fs/fscache/main.c             | 23 +++++++++++++++
 fs/fscache/operation.c        | 67 ++++++-------------------------------------
 fs/fscache/page.c             | 36 ++++++-----------------
 include/linux/fscache-cache.h | 37 +++++++-----------------
 6 files changed, 53 insertions(+), 115 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 0f0d41fbb03f..0e3c0924cc3a 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
 	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
-	op->op.flags |= FSCACHE_OP_FAST;
+	op->op.flags |= FSCACHE_OP_ASYNC;
 	op->op.processor = cachefiles_read_copier;
 
 	pagevec_init(&pagevec, 0);
@@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
 	pagevec_init(&pagevec, 0);
 
 	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
-	op->op.flags |= FSCACHE_OP_FAST;
+	op->op.flags |= FSCACHE_OP_ASYNC;
 	op->op.processor = cachefiles_read_copier;
 
 	INIT_LIST_HEAD(&backpages);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 6e0b5fb25231..6a026441c5a6 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -83,6 +83,7 @@ extern unsigned fscache_defer_create;
 extern unsigned fscache_debug;
 extern struct kobject *fscache_root;
 extern struct workqueue_struct *fscache_object_wq;
+extern struct workqueue_struct *fscache_op_wq;
 DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
 
 static inline bool fscache_object_congested(void)
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index bb8d4c35c7a2..44d13ddab2cc 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -42,11 +42,13 @@ MODULE_PARM_DESC(fscache_debug,
 
 struct kobject *fscache_root;
 struct workqueue_struct *fscache_object_wq;
+struct workqueue_struct *fscache_op_wq;
 
 DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
 
 /* these values serve as lower bounds, will be adjusted in fscache_init() */
 static unsigned fscache_object_max_active = 4;
+static unsigned fscache_op_max_active = 2;
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table_header *fscache_sysctl_header;
@@ -74,6 +76,14 @@ ctl_table fscache_sysctls[] = {
 		.proc_handler	= fscache_max_active_sysctl,
 		.extra1		= &fscache_object_wq,
 	},
+	{
+		.procname	= "operation_max_active",
+		.data		= &fscache_op_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_op_wq,
+	},
 	{}
 };
 
@@ -110,6 +120,16 @@ static int __init fscache_init(void)
 	if (!fscache_object_wq)
 		goto error_object_wq;
 
+	fscache_op_max_active =
+		clamp_val(fscache_object_max_active / 2,
+			  fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
+					fscache_op_max_active);
+	if (!fscache_op_wq)
+		goto error_op_wq;
+
 	for_each_possible_cpu(cpu)
 		init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
 
@@ -152,6 +172,8 @@ error_sysctl:
 #endif
 	fscache_proc_cleanup();
 error_proc:
+	destroy_workqueue(fscache_op_wq);
+error_op_wq:
 	destroy_workqueue(fscache_object_wq);
 error_object_wq:
 	slow_work_unregister_user(THIS_MODULE);
@@ -172,6 +194,7 @@ static void __exit fscache_exit(void)
 	kmem_cache_destroy(fscache_cookie_jar);
 	unregister_sysctl_table(fscache_sysctl_header);
 	fscache_proc_cleanup();
+	destroy_workqueue(fscache_op_wq);
 	destroy_workqueue(fscache_object_wq);
 	slow_work_unregister_user(THIS_MODULE);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index f17cecafae44..b9f34eaede09 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op)
 
 	fscache_stat(&fscache_n_op_enqueue);
 	switch (op->flags & FSCACHE_OP_TYPE) {
-	case FSCACHE_OP_FAST:
-		_debug("queue fast");
+	case FSCACHE_OP_ASYNC:
+		_debug("queue async");
 		atomic_inc(&op->usage);
-		if (!schedule_work(&op->fast_work))
+		if (!queue_work(fscache_op_wq, &op->work))
 			fscache_put_operation(op);
 		break;
-	case FSCACHE_OP_SLOW:
-		_debug("queue slow");
-		slow_work_enqueue(&op->slow_work);
-		break;
 	case FSCACHE_OP_MYTHREAD:
 		_debug("queue for caller's attention");
 		break;
@@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work)
 }
 
 /*
- * allow the slow work item processor to get a ref on an operation
- */
-static int fscache_op_get_ref(struct slow_work *work)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	atomic_inc(&op->usage);
-	return 0;
-}
-
-/*
- * allow the slow work item processor to discard a ref on an operation
- */
-static void fscache_op_put_ref(struct slow_work *work)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	fscache_put_operation(op);
-}
-
-/*
- * execute an operation using the slow thread pool to provide processing context
- * - the caller holds a ref to this object, so we don't need to hold one
+ * execute an operation using fs_op_wq to provide processing context -
+ * the caller holds a ref to this object, so we don't need to hold one
  */
-static void fscache_op_execute(struct slow_work *work)
+void fscache_op_work_func(struct work_struct *work)
 {
 	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
+		container_of(work, struct fscache_operation, work);
 	unsigned long start;
 
 	_enter("{OBJ%x OP%x,%d}",
@@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work)
 	start = jiffies;
 	op->processor(op);
 	fscache_hist(fscache_ops_histogram, start);
+	fscache_put_operation(op);
 
 	_leave("");
 }
-
-/*
- * describe an operation for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
-		   op->object->debug_id, op->debug_id,
-		   op->name, op->state, op->flags);
-}
-#endif
-
-const struct slow_work_ops fscache_op_slow_work_ops = {
-	.owner		= THIS_MODULE,
-	.get_ref	= fscache_op_get_ref,
-	.put_ref	= fscache_op_put_ref,
-	.execute	= fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= fscache_op_desc,
-#endif
-};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 723b889fd219..41c441c2058d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
 
 page_busy:
 	/* we might want to wait here, but that could deadlock the allocator as
-	 * the slow-work threads writing to the cache may all end up sleeping
+	 * the work threads writing to the cache may all end up sleeping
 	 * on memory allocation */
 	fscache_stat(&fscache_n_store_vmscan_busy);
 	return false;
@@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 		return -ENOMEM;
 	}
 
-	fscache_operation_init(op, NULL);
-	fscache_operation_init_slow(op, fscache_attr_changed_op);
-	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+	fscache_operation_init(op, fscache_attr_changed_op, NULL);
+	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
 	fscache_set_op_name(op, "Attr");
 
 	spin_lock(&cookie->lock);
@@ -217,24 +216,6 @@ nobufs:
 }
 EXPORT_SYMBOL(__fscache_attr_changed);
 
-/*
- * handle secondary execution given to a retrieval op on behalf of the
- * cache
- */
-static void fscache_retrieval_work(struct work_struct *work)
-{
-	struct fscache_retrieval *op =
-		container_of(work, struct fscache_retrieval, op.fast_work);
-	unsigned long start;
-
-	_enter("{OP%x}", op->op.debug_id);
-
-	start = jiffies;
-	op->op.processor(&op->op);
-	fscache_hist(fscache_ops_histogram, start);
-	fscache_put_operation(&op->op);
-}
-
 /*
  * release a retrieval op reference
  */
@@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 		return NULL;
 	}
 
-	fscache_operation_init(&op->op, fscache_release_retrieval_op);
+	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
 	op->op.flags	= FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
 	op->mapping	= mapping;
 	op->end_io_func	= end_io_func;
 	op->context	= context;
 	op->start_time	= jiffies;
-	INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
 	INIT_LIST_HEAD(&op->to_do);
 	fscache_set_op_name(&op->op, "Retr");
 	return op;
@@ -795,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(&op->op, fscache_release_write_op);
-	fscache_operation_init_slow(&op->op, fscache_write_op);
-	op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+	fscache_operation_init(&op->op, fscache_write_op,
+			       fscache_release_write_op);
+	op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
 	fscache_set_op_name(&op->op, "Write1");
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
@@ -852,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	fscache_stat(&fscache_n_store_ops);
 	fscache_stat(&fscache_n_stores_ok);
 
-	/* the slow work queue now carries its own ref on the object */
+	/* the work queue now carries its own ref on the object */
 	fscache_put_operation(&op->op);
 	_leave(" = 0");
 	return 0;
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 27c8df503152..17ed9c1dbfbe 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -77,18 +77,14 @@ typedef void (*fscache_operation_release_t)(struct fscache_operation *op);
 typedef void (*fscache_operation_processor_t)(struct fscache_operation *op);
 
 struct fscache_operation {
-	union {
-		struct work_struct fast_work;	/* record for fast ops */
-		struct slow_work slow_work;	/* record for (very) slow ops */
-	};
+	struct work_struct	work;		/* record for async ops */
 	struct list_head	pend_link;	/* link in object->pending_ops */
 	struct fscache_object	*object;	/* object to be operated upon */
 
 	unsigned long		flags;
 #define FSCACHE_OP_TYPE		0x000f	/* operation type */
-#define FSCACHE_OP_FAST		0x0001	/* - fast op, processor may not sleep for disk */
-#define FSCACHE_OP_SLOW		0x0002	/* - (very) slow op, processor may sleep for disk */
-#define FSCACHE_OP_MYTHREAD	0x0003	/* - processing is done be issuing thread, not pool */
+#define FSCACHE_OP_ASYNC	0x0001	/* - async op, processor may sleep for disk */
+#define FSCACHE_OP_MYTHREAD	0x0002	/* - processing is done be issuing thread, not pool */
 #define FSCACHE_OP_WAITING	4	/* cleared when op is woken */
 #define FSCACHE_OP_EXCLUSIVE	5	/* exclusive op, other ops must wait */
 #define FSCACHE_OP_DEAD		6	/* op is now dead */
@@ -106,7 +102,8 @@ struct fscache_operation {
 	/* operation releaser */
 	fscache_operation_release_t release;
 
-#ifdef CONFIG_SLOW_WORK_DEBUG
+#ifdef CONFIG_WORKQUEUE_DEBUGFS
+	struct work_struct put_work;	/* work to delay operation put */
 	const char *name;		/* operation name */
 	const char *state;		/* operation state */
 #define fscache_set_op_name(OP, N)	do { (OP)->name  = (N); } while(0)
@@ -118,7 +115,7 @@ struct fscache_operation {
 };
 
 extern atomic_t fscache_op_debug_id;
-extern const struct slow_work_ops fscache_op_slow_work_ops;
+extern void fscache_op_work_func(struct work_struct *work);
 
 extern void fscache_enqueue_operation(struct fscache_operation *);
 extern void fscache_put_operation(struct fscache_operation *);
@@ -129,33 +126,21 @@ extern void fscache_put_operation(struct fscache_operation *);
  * @release: The release function to assign
  *
  * Do basic initialisation of an operation.  The caller must still set flags,
- * object, either fast_work or slow_work if necessary, and processor if needed.
+ * object and processor if needed.
  */
 static inline void fscache_operation_init(struct fscache_operation *op,
-					  fscache_operation_release_t release)
+					fscache_operation_processor_t processor,
+					fscache_operation_release_t release)
 {
+	INIT_WORK(&op->work, fscache_op_work_func);
 	atomic_set(&op->usage, 1);
 	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+	op->processor = processor;
 	op->release = release;
 	INIT_LIST_HEAD(&op->pend_link);
 	fscache_set_op_state(op, "Init");
 }
 
-/**
- * fscache_operation_init_slow - Do additional initialisation of a slow op
- * @op: The operation to initialise
- * @processor: The processor function to assign
- *
- * Do additional initialisation of an operation as required for slow work.
- */
-static inline
-void fscache_operation_init_slow(struct fscache_operation *op,
-				 fscache_operation_processor_t processor)
-{
-	op->processor = processor;
-	slow_work_init(&op->slow_work, &fscache_op_slow_work_ops);
-}
-
 /*
  * data read operation
  */
-- 
cgit v1.2.3-59-g8ed1b


From d098adfb7d281258173a43151483e52e21761021 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 20 Jul 2010 22:09:01 +0200
Subject: fscache: drop references to slow-work

fscache no longer uses slow-work.  Drop references to it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Howells <dhowells@redhat.com>
---
 fs/fscache/Kconfig            | 1 -
 fs/fscache/main.c             | 7 -------
 include/linux/fscache-cache.h | 1 -
 3 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index cc94bb9563f2..3f6dfa989881 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
 
 config FSCACHE
 	tristate "General filesystem local caching manager"
-	select SLOW_WORK
 	help
 	  This option enables a generic filesystem caching manager that can be
 	  used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 44d13ddab2cc..500936d9fff2 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -106,10 +106,6 @@ static int __init fscache_init(void)
 	unsigned int cpu;
 	int ret;
 
-	ret = slow_work_register_user(THIS_MODULE);
-	if (ret < 0)
-		goto error_slow_work;
-
 	fscache_object_max_active =
 		clamp_val(nr_cpus,
 			  fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
@@ -176,8 +172,6 @@ error_proc:
 error_op_wq:
 	destroy_workqueue(fscache_object_wq);
 error_object_wq:
-	slow_work_unregister_user(THIS_MODULE);
-error_slow_work:
 	return ret;
 }
 
@@ -196,7 +190,6 @@ static void __exit fscache_exit(void)
 	fscache_proc_cleanup();
 	destroy_workqueue(fscache_op_wq);
 	destroy_workqueue(fscache_object_wq);
-	slow_work_unregister_user(THIS_MODULE);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
 
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 17ed9c1dbfbe..b8581c09d19f 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -20,7 +20,6 @@
 
 #include <linux/fscache.h>
 #include <linux/sched.h>
-#include <linux/slow-work.h>
 #include <linux/workqueue.h>
 
 #define NR_MAXCACHES BITS_PER_LONG
-- 
cgit v1.2.3-59-g8ed1b


From 183d03cc4ff39e0f0d952c09aa96d0abfd6e0c3c Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Mon, 17 May 2010 17:08:21 +0100
Subject: xen: Xen PCI platform device driver.

Add the xen pci platform device driver that is responsible
for initializing the grant table and xenbus in PV on HVM mode.
Few changes to xenbus and grant table are necessary to allow the delayed
initialization in HVM mode.
Grant table needs few additional modifications to work in HVM mode.

The Xen PCI platform device raises an irq every time an event has been
delivered to us. However these interrupts are only delivered to vcpu 0.
The Xen PCI platform interrupt handler calls xen_hvm_evtchn_do_upcall
that is a little wrapper around __xen_evtchn_do_upcall, the traditional
Xen upcall handler, the very same used with traditional PV guests.

When running on HVM the event channel upcall is never called while in
progress because it is a normal Linux irq handler (and we cannot switch
the irq chip wholesale to the Xen PV ones as we are running QEMU and
might have passed in PCI devices), therefore we cannot be sure that
evtchn_upcall_pending is 0 when returning.
For this reason if evtchn_upcall_pending is set by Xen we need to loop
again on the event channels set pending otherwise we might loose some
event channel deliveries.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/Kconfig                 |   9 ++
 drivers/xen/Makefile                |   3 +-
 drivers/xen/events.c                |   8 +-
 drivers/xen/grant-table.c           |  77 +++++++++++++--
 drivers/xen/manage.c                |   1 +
 drivers/xen/platform-pci.c          | 181 ++++++++++++++++++++++++++++++++++++
 drivers/xen/xenbus/xenbus_probe.c   |  22 ++++-
 include/linux/pci_ids.h             |   3 +
 include/xen/grant_table.h           |   4 +
 include/xen/interface/grant_table.h |   1 +
 10 files changed, 291 insertions(+), 18 deletions(-)
 create mode 100644 drivers/xen/platform-pci.c

(limited to 'include/linux')

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index fad3df2c1276..8f84b108b491 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -62,4 +62,13 @@ config XEN_SYS_HYPERVISOR
 	 virtual environment, /sys/hypervisor will still be present,
 	 but will have no xen contents.
 
+config XEN_PLATFORM_PCI
+	tristate "xen platform pci device driver"
+	depends on XEN
+	default m
+	help
+	  Driver for the Xen PCI Platform device: it is responsible for
+	  initializing xenbus and grant_table when running in a Xen HVM
+	  domain. As a consequence this driver is required to run any Xen PV
+	  frontend on Xen HVM.
 endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 7c284342f30f..e392fb776af3 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,4 +9,5 @@ obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
 obj-$(CONFIG_XEN_BALLOON)	+= balloon.o
 obj-$(CONFIG_XEN_DEV_EVTCHN)	+= evtchn.o
 obj-$(CONFIG_XENFS)		+= xenfs/
-obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
\ No newline at end of file
+obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
+obj-$(CONFIG_XEN_PLATFORM_PCI)	+= platform-pci.o
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index d659480125f0..7c64473c9f3f 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -665,7 +665,7 @@ static void __xen_evtchn_do_upcall(void)
 
 		count = __get_cpu_var(xed_nesting_count);
 		__get_cpu_var(xed_nesting_count) = 0;
-	} while(count != 1);
+	} while (count != 1 || vcpu_info->evtchn_upcall_pending);
 
 out:
 
@@ -689,6 +689,7 @@ void xen_hvm_evtchn_do_upcall(void)
 {
 	__xen_evtchn_do_upcall();
 }
+EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
 
 /* Rebind a new event channel to an existing irq. */
 void rebind_evtchn_irq(int evtchn, int irq)
@@ -725,7 +726,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 	struct evtchn_bind_vcpu bind_vcpu;
 	int evtchn = evtchn_from_irq(irq);
 
-	if (!VALID_EVTCHN(evtchn))
+	/* events delivered via platform PCI interrupts are always
+	 * routed to vcpu 0 */
+	if (!VALID_EVTCHN(evtchn) ||
+		(xen_hvm_domain() && !xen_have_vector_callback))
 		return -1;
 
 	/* Send future instances of this interrupt to other vcpu. */
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index f66db3b91d61..6c4531816496 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -37,11 +37,13 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/io.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/page.h>
 #include <xen/grant_table.h>
+#include <xen/interface/memory.h>
 #include <asm/xen/hypercall.h>
 
 #include <asm/pgtable.h>
@@ -59,6 +61,8 @@ static unsigned int boot_max_nr_grant_frames;
 static int gnttab_free_count;
 static grant_ref_t gnttab_free_head;
 static DEFINE_SPINLOCK(gnttab_list_lock);
+unsigned long xen_hvm_resume_frames;
+EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
 
 static struct grant_entry *shared;
 
@@ -433,7 +437,7 @@ static unsigned int __max_nr_grant_frames(void)
 	return query.max_nr_frames;
 }
 
-static inline unsigned int max_nr_grant_frames(void)
+unsigned int gnttab_max_grant_frames(void)
 {
 	unsigned int xen_max = __max_nr_grant_frames();
 
@@ -441,6 +445,7 @@ static inline unsigned int max_nr_grant_frames(void)
 		return boot_max_nr_grant_frames;
 	return xen_max;
 }
+EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
 
 static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 {
@@ -449,6 +454,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 	unsigned int nr_gframes = end_idx + 1;
 	int rc;
 
+	if (xen_hvm_domain()) {
+		struct xen_add_to_physmap xatp;
+		unsigned int i = end_idx;
+		rc = 0;
+		/*
+		 * Loop backwards, so that the first hypercall has the largest
+		 * index, ensuring that the table will grow only once.
+		 */
+		do {
+			xatp.domid = DOMID_SELF;
+			xatp.idx = i;
+			xatp.space = XENMAPSPACE_grant_table;
+			xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i;
+			rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
+			if (rc != 0) {
+				printk(KERN_WARNING
+						"grant table add_to_physmap failed, err=%d\n", rc);
+				break;
+			}
+		} while (i-- > start_idx);
+
+		return rc;
+	}
+
 	frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
 	if (!frames)
 		return -ENOMEM;
@@ -465,7 +494,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 
 	BUG_ON(rc || setup.status);
 
-	rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
+	rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
 				    &shared);
 	BUG_ON(rc);
 
@@ -476,9 +505,27 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 
 int gnttab_resume(void)
 {
-	if (max_nr_grant_frames() < nr_grant_frames)
+	unsigned int max_nr_gframes;
+
+	max_nr_gframes = gnttab_max_grant_frames();
+	if (max_nr_gframes < nr_grant_frames)
 		return -ENOSYS;
-	return gnttab_map(0, nr_grant_frames - 1);
+
+	if (xen_pv_domain())
+		return gnttab_map(0, nr_grant_frames - 1);
+
+	if (!shared) {
+		shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
+		if (shared == NULL) {
+			printk(KERN_WARNING
+					"Failed to ioremap gnttab share frames!");
+			return -ENOMEM;
+		}
+	}
+
+	gnttab_map(0, nr_grant_frames - 1);
+
+	return 0;
 }
 
 int gnttab_suspend(void)
@@ -495,7 +542,7 @@ static int gnttab_expand(unsigned int req_entries)
 	cur = nr_grant_frames;
 	extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
 		 GREFS_PER_GRANT_FRAME);
-	if (cur + extra > max_nr_grant_frames())
+	if (cur + extra > gnttab_max_grant_frames())
 		return -ENOSPC;
 
 	rc = gnttab_map(cur, cur + extra - 1);
@@ -505,15 +552,12 @@ static int gnttab_expand(unsigned int req_entries)
 	return rc;
 }
 
-static int __devinit gnttab_init(void)
+int gnttab_init(void)
 {
 	int i;
 	unsigned int max_nr_glist_frames, nr_glist_frames;
 	unsigned int nr_init_grefs;
 
-	if (!xen_domain())
-		return -ENODEV;
-
 	nr_grant_frames = 1;
 	boot_max_nr_grant_frames = __max_nr_grant_frames();
 
@@ -556,5 +600,18 @@ static int __devinit gnttab_init(void)
 	kfree(gnttab_list);
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(gnttab_init);
+
+static int __devinit __gnttab_init(void)
+{
+	/* Delay grant-table initialization in the PV on HVM case */
+	if (xen_hvm_domain())
+		return 0;
+
+	if (!xen_pv_domain())
+		return -ENODEV;
+
+	return gnttab_init();
+}
 
-core_initcall(gnttab_init);
+core_initcall(__gnttab_init);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 07e857b0de13..af9c5594d315 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -264,5 +264,6 @@ static int __init setup_shutdown_event(void)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
 
 subsys_initcall(setup_shutdown_event);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
new file mode 100644
index 000000000000..a0ee5d06f715
--- /dev/null
+++ b/drivers/xen/platform-pci.c
@@ -0,0 +1,181 @@
+/******************************************************************************
+ * platform-pci.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2005, Intel Corporation.
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/hvm.h>
+
+#define DRV_NAME    "xen-platform-pci"
+
+MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com");
+MODULE_DESCRIPTION("Xen platform PCI device");
+MODULE_LICENSE("GPL");
+
+static unsigned long platform_mmio;
+static unsigned long platform_mmio_alloc;
+static unsigned long platform_mmiolen;
+
+unsigned long alloc_xen_mmio(unsigned long len)
+{
+	unsigned long addr;
+
+	addr = platform_mmio + platform_mmio_alloc;
+	platform_mmio_alloc += len;
+	BUG_ON(platform_mmio_alloc > platform_mmiolen);
+
+	return addr;
+}
+
+static uint64_t get_callback_via(struct pci_dev *pdev)
+{
+	u8 pin;
+	int irq;
+
+	irq = pdev->irq;
+	if (irq < 16)
+		return irq; /* ISA IRQ */
+
+	pin = pdev->pin;
+
+	/* We don't know the GSI. Specify the PCI INTx line instead. */
+	return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */
+		((uint64_t)pci_domain_nr(pdev->bus) << 32) |
+		((uint64_t)pdev->bus->number << 16) |
+		((uint64_t)(pdev->devfn & 0xff) << 8) |
+		((uint64_t)(pin - 1) & 3);
+}
+
+static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
+{
+	xen_hvm_evtchn_do_upcall();
+	return IRQ_HANDLED;
+}
+
+static int xen_allocate_irq(struct pci_dev *pdev)
+{
+	return request_irq(pdev->irq, do_hvm_evtchn_intr,
+			IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
+			"xen-platform-pci", pdev);
+}
+
+static int __devinit platform_pci_init(struct pci_dev *pdev,
+				       const struct pci_device_id *ent)
+{
+	int i, ret;
+	long ioaddr, iolen;
+	long mmio_addr, mmio_len;
+	uint64_t callback_via;
+	unsigned int max_nr_gframes;
+
+	i = pci_enable_device(pdev);
+	if (i)
+		return i;
+
+	ioaddr = pci_resource_start(pdev, 0);
+	iolen = pci_resource_len(pdev, 0);
+
+	mmio_addr = pci_resource_start(pdev, 1);
+	mmio_len = pci_resource_len(pdev, 1);
+
+	if (mmio_addr == 0 || ioaddr == 0) {
+		dev_err(&pdev->dev, "no resources found\n");
+		ret = -ENOENT;
+		goto pci_out;
+	}
+
+	if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) {
+		dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n",
+		       mmio_addr, mmio_len);
+		ret = -EBUSY;
+		goto pci_out;
+	}
+
+	if (request_region(ioaddr, iolen, DRV_NAME) == NULL) {
+		dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n",
+		       iolen, ioaddr);
+		ret = -EBUSY;
+		goto mem_out;
+	}
+
+	platform_mmio = mmio_addr;
+	platform_mmiolen = mmio_len;
+
+	if (!xen_have_vector_callback) {
+		ret = xen_allocate_irq(pdev);
+		if (ret) {
+			dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
+			goto out;
+		}
+		callback_via = get_callback_via(pdev);
+		ret = xen_set_callback_via(callback_via);
+		if (ret) {
+			dev_warn(&pdev->dev, "Unable to set the evtchn callback "
+					 "err=%d\n", ret);
+			goto out;
+		}
+	}
+
+	max_nr_gframes = gnttab_max_grant_frames();
+	xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+	ret = gnttab_init();
+	if (ret)
+		goto out;
+	xenbus_probe(NULL);
+	return 0;
+
+out:
+	release_region(ioaddr, iolen);
+mem_out:
+	release_mem_region(mmio_addr, mmio_len);
+pci_out:
+	pci_disable_device(pdev);
+	return ret;
+}
+
+static struct pci_device_id platform_pci_tbl[] __devinitdata = {
+	{PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{0,}
+};
+
+MODULE_DEVICE_TABLE(pci, platform_pci_tbl);
+
+static struct pci_driver platform_driver = {
+	.name =           DRV_NAME,
+	.probe =          platform_pci_init,
+	.id_table =       platform_pci_tbl,
+};
+
+static int __init platform_pci_module_init(void)
+{
+	return pci_register_driver(&platform_driver);
+}
+
+module_init(platform_pci_module_init);
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index d96fa75b45ec..a9e83c438cbb 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -781,8 +781,23 @@ void xenbus_probe(struct work_struct *unused)
 	/* Notify others that xenstore is up */
 	blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
 }
+EXPORT_SYMBOL_GPL(xenbus_probe);
 
-static int __init xenbus_probe_init(void)
+static int __init xenbus_probe_initcall(void)
+{
+	if (!xen_domain())
+		return -ENODEV;
+
+	if (xen_initial_domain() || xen_hvm_domain())
+		return 0;
+
+	xenbus_probe(NULL);
+	return 0;
+}
+
+device_initcall(xenbus_probe_initcall);
+
+static int __init xenbus_init(void)
 {
 	int err = 0;
 
@@ -834,9 +849,6 @@ static int __init xenbus_probe_init(void)
 		goto out_unreg_back;
 	}
 
-	if (!xen_initial_domain())
-		xenbus_probe(NULL);
-
 #ifdef CONFIG_XEN_COMPAT_XENFS
 	/*
 	 * Create xenfs mountpoint in /proc for compatibility with
@@ -857,7 +869,7 @@ static int __init xenbus_probe_init(void)
 	return err;
 }
 
-postcore_initcall(xenbus_probe_init);
+postcore_initcall(xenbus_init);
 
 MODULE_LICENSE("GPL");
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3bedcc149c84..cca2526f28d7 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2772,3 +2772,6 @@
 #define PCI_DEVICE_ID_RME_DIGI32	0x9896
 #define PCI_DEVICE_ID_RME_DIGI32_PRO	0x9897
 #define PCI_DEVICE_ID_RME_DIGI32_8	0x9898
+
+#define PCI_VENDOR_ID_XEN		0x5853
+#define PCI_DEVICE_ID_XEN_PLATFORM	0x0001
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index a40f1cd91be1..9a731706a016 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -51,6 +51,7 @@ struct gnttab_free_callback {
 	u16 count;
 };
 
+int gnttab_init(void);
 int gnttab_suspend(void);
 int gnttab_resume(void);
 
@@ -112,6 +113,9 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 void arch_gnttab_unmap_shared(struct grant_entry *shared,
 			      unsigned long nr_gframes);
 
+extern unsigned long xen_hvm_resume_frames;
+unsigned int gnttab_max_grant_frames(void);
+
 #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
 
 #endif /* __ASM_GNTTAB_H__ */
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
index 39da93c21de0..39e571796e32 100644
--- a/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@ -28,6 +28,7 @@
 #ifndef __XEN_PUBLIC_GRANT_TABLE_H__
 #define __XEN_PUBLIC_GRANT_TABLE_H__
 
+#include <xen/interface/xen.h>
 
 /***********************************
  * GRANT TABLE REPRESENTATION
-- 
cgit v1.2.3-59-g8ed1b


From 2f1b7cd29fa4917f19d2624afc773d941684c5df Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 22 Jul 2010 03:22:18 +0900
Subject: nilfs2: clarify byte offset in super block format

This inserts comments indicating hexadecimal offset in declaration of
nilfs_super_block structure so that people can know offset of its
fields without counting from the head.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 include/linux/nilfs2_fs.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index 8c2c6116e788..cc3465e5be38 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -160,7 +160,7 @@ struct nilfs_super_root {
  * struct nilfs_super_block - structure of super block on disk
  */
 struct nilfs_super_block {
-	__le32	s_rev_level;		/* Revision level */
+/*00*/	__le32	s_rev_level;		/* Revision level */
 	__le16	s_minor_rev_level;	/* minor revision level */
 	__le16	s_magic;		/* Magic signature */
 
@@ -169,47 +169,47 @@ struct nilfs_super_block {
 					   is excluded. */
 	__le16  s_flags;		/* flags */
 	__le32  s_crc_seed;		/* Seed value of CRC calculation */
-	__le32	s_sum;			/* Check sum of super block */
+/*10*/	__le32	s_sum;			/* Check sum of super block */
 
 	__le32	s_log_block_size;	/* Block size represented as follows
 					   blocksize =
 					       1 << (s_log_block_size + 10) */
 	__le64  s_nsegments;		/* Number of segments in filesystem */
-	__le64  s_dev_size;		/* block device size in bytes */
+/*20*/	__le64  s_dev_size;		/* block device size in bytes */
 	__le64	s_first_data_block;	/* 1st seg disk block number */
-	__le32  s_blocks_per_segment;   /* number of blocks per full segment */
+/*30*/	__le32  s_blocks_per_segment;   /* number of blocks per full segment */
 	__le32	s_r_segments_percentage; /* Reserved segments percentage */
 
 	__le64  s_last_cno;		/* Last checkpoint number */
-	__le64  s_last_pseg;		/* disk block addr pseg written last */
+/*40*/	__le64  s_last_pseg;		/* disk block addr pseg written last */
 	__le64  s_last_seq;             /* seq. number of seg written last */
-	__le64	s_free_blocks_count;	/* Free blocks count */
+/*50*/	__le64	s_free_blocks_count;	/* Free blocks count */
 
 	__le64	s_ctime;		/* Creation time (execution time of
 					   newfs) */
-	__le64	s_mtime;		/* Mount time */
+/*60*/	__le64	s_mtime;		/* Mount time */
 	__le64	s_wtime;		/* Write time */
-	__le16	s_mnt_count;		/* Mount count */
+/*70*/	__le16	s_mnt_count;		/* Mount count */
 	__le16	s_max_mnt_count;	/* Maximal mount count */
 	__le16	s_state;		/* File system state */
 	__le16	s_errors;		/* Behaviour when detecting errors */
 	__le64	s_lastcheck;		/* time of last check */
 
-	__le32	s_checkinterval;	/* max. time between checks */
+/*80*/	__le32	s_checkinterval;	/* max. time between checks */
 	__le32	s_creator_os;		/* OS */
 	__le16	s_def_resuid;		/* Default uid for reserved blocks */
 	__le16	s_def_resgid;		/* Default gid for reserved blocks */
 	__le32	s_first_ino;		/* First non-reserved inode */
 
-	__le16  s_inode_size;		/* Size of an inode */
+/*90*/	__le16  s_inode_size;		/* Size of an inode */
 	__le16  s_dat_entry_size;       /* Size of a dat entry */
 	__le16  s_checkpoint_size;      /* Size of a checkpoint */
 	__le16	s_segment_usage_size;	/* Size of a segment usage */
 
-	__u8	s_uuid[16];		/* 128-bit uuid for volume */
-	char	s_volume_name[80];	/* volume name */
+/*98*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
+/*A8*/	char	s_volume_name[80];	/* volume name */
 
-	__le32  s_c_interval;           /* Commit interval of segment */
+/*F8*/	__le32  s_c_interval;           /* Commit interval of segment */
 	__le32  s_c_block_max;          /* Threshold of data amount for
 					   the segment construction */
 	__u32	s_reserved[192];	/* padding to the end of the block */
-- 
cgit v1.2.3-59-g8ed1b


From 1a80a1763fb760b3a84a28df87515f7cdc07a4f4 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 22 Jul 2010 03:22:19 +0900
Subject: nilfs2: add feature set fields to super block

This adds three new fields to nilfs_super_block structure, compatible
feature set, readonly-compatible feature set, and incompatible feature
set in order to prepare for future disk format modifications.

The role of these fields conforms to those of ext3 or other
filesystems.  Most important flags are the incompatible feature set;
it is used to refuse to mount the filesystem which sets an
incompatible feature the kernel doesn't know about.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 include/linux/nilfs2_fs.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index cc3465e5be38..7dd4cd494490 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -212,7 +212,10 @@ struct nilfs_super_block {
 /*F8*/	__le32  s_c_interval;           /* Commit interval of segment */
 	__le32  s_c_block_max;          /* Threshold of data amount for
 					   the segment construction */
-	__u32	s_reserved[192];	/* padding to the end of the block */
+/*100*/	__le64  s_feature_compat;	/* Compatible feature set */
+	__le64  s_feature_compat_ro;	/* Read-only compatible feature set */
+	__le64  s_feature_incompat;	/* Incompatible feature set */
+	__u32	s_reserved[186];	/* padding to the end of the block */
 };
 
 /*
@@ -227,6 +230,16 @@ struct nilfs_super_block {
 #define NILFS_CURRENT_REV	2	/* current major revision */
 #define NILFS_MINOR_REV		0	/* minor revision */
 
+/*
+ * Feature set definitions
+ *
+ * If there is a bit set in the incompatible feature set that the kernel
+ * doesn't know about, it should refuse to mount the filesystem.
+ */
+#define NILFS_FEATURE_COMPAT_SUPP	0ULL
+#define NILFS_FEATURE_COMPAT_RO_SUPP	0ULL
+#define NILFS_FEATURE_INCOMPAT_SUPP	0ULL
+
 /*
  * Bytes count of super_block for CRC-calculation
  */
-- 
cgit v1.2.3-59-g8ed1b


From 9c3e1c39679144c250dda95098333ecb5f1f407a Mon Sep 17 00:00:00 2001
From: Hannes Eder <heder@google.com>
Date: Fri, 23 Jul 2010 12:42:58 +0200
Subject: netfilter: xt_ipvs (netfilter matcher for IPVS)

This implements the kernel-space side of the netfilter matcher xt_ipvs.

[ minor fixes by Simon Horman <horms@verge.net.au> ]
Signed-off-by: Hannes Eder <heder@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
[ Patrick: added xt_ipvs.h to Kbuild ]
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/Kbuild    |   1 +
 include/linux/netfilter/xt_ipvs.h |  27 ++++++
 net/netfilter/Kconfig             |  10 ++
 net/netfilter/Makefile            |   1 +
 net/netfilter/ipvs/ip_vs_proto.c  |   1 +
 net/netfilter/xt_ipvs.c           | 189 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 229 insertions(+)
 create mode 100644 include/linux/netfilter/xt_ipvs.h
 create mode 100644 net/netfilter/xt_ipvs.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index b93b64dc9fae..0cb62c857187 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -31,6 +31,7 @@ header-y += xt_dscp.h
 header-y += xt_esp.h
 header-y += xt_hashlimit.h
 header-y += xt_iprange.h
+header-y += xt_ipvs.h
 header-y += xt_helper.h
 header-y += xt_length.h
 header-y += xt_limit.h
diff --git a/include/linux/netfilter/xt_ipvs.h b/include/linux/netfilter/xt_ipvs.h
new file mode 100644
index 000000000000..1167aeb7a347
--- /dev/null
+++ b/include/linux/netfilter/xt_ipvs.h
@@ -0,0 +1,27 @@
+#ifndef _XT_IPVS_H
+#define _XT_IPVS_H
+
+enum {
+	XT_IPVS_IPVS_PROPERTY =	1 << 0, /* all other options imply this one */
+	XT_IPVS_PROTO =		1 << 1,
+	XT_IPVS_VADDR =		1 << 2,
+	XT_IPVS_VPORT =		1 << 3,
+	XT_IPVS_DIR =		1 << 4,
+	XT_IPVS_METHOD =	1 << 5,
+	XT_IPVS_VPORTCTL =	1 << 6,
+	XT_IPVS_MASK =		(1 << 7) - 1,
+	XT_IPVS_ONCE_MASK =	XT_IPVS_MASK & ~XT_IPVS_IPVS_PROPERTY
+};
+
+struct xt_ipvs_mtinfo {
+	union nf_inet_addr	vaddr, vmask;
+	__be16			vport;
+	__u8			l4proto;
+	__u8			fwd_method;
+	__be16			vportctl;
+
+	__u8			invert;
+	__u8			bitmask;
+};
+
+#endif /* _XT_IPVS_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 5fb8efa84df3..551b58419df9 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -742,6 +742,16 @@ config NETFILTER_XT_MATCH_IPRANGE
 
 	If unsure, say M.
 
+config NETFILTER_XT_MATCH_IPVS
+	tristate '"ipvs" match support'
+	depends on IP_VS
+	depends on NETFILTER_ADVANCED
+	depends on NF_CONNTRACK
+	help
+	  This option allows you to match against IPVS properties of a packet.
+
+	  If unsure, say N.
+
 config NETFILTER_XT_MATCH_LENGTH
 	tristate '"length" match support'
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 36ef8e63be1e..4366c79a6683 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -77,6 +77,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 2d3d5e4b35f8..027f654799fe 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -98,6 +98,7 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
 
 	return NULL;
 }
+EXPORT_SYMBOL(ip_vs_proto_get);
 
 
 /*
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
new file mode 100644
index 000000000000..7a4d66db95ae
--- /dev/null
+++ b/net/netfilter/xt_ipvs.c
@@ -0,0 +1,189 @@
+/*
+ *	xt_ipvs - kernel module to match IPVS connection properties
+ *
+ *	Author: Hannes Eder <heder@google.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#endif
+#include <linux/ip_vs.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_ipvs.h>
+#include <net/netfilter/nf_conntrack.h>
+
+#include <net/ip_vs.h>
+
+MODULE_AUTHOR("Hannes Eder <heder@google.com>");
+MODULE_DESCRIPTION("Xtables: match IPVS connection properties");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ipvs");
+MODULE_ALIAS("ip6t_ipvs");
+
+/* borrowed from xt_conntrack */
+static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr,
+			    const union nf_inet_addr *uaddr,
+			    const union nf_inet_addr *umask,
+			    unsigned int l3proto)
+{
+	if (l3proto == NFPROTO_IPV4)
+		return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0;
+#ifdef CONFIG_IP_VS_IPV6
+	else if (l3proto == NFPROTO_IPV6)
+		return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6,
+		       &uaddr->in6) == 0;
+#endif
+	else
+		return false;
+}
+
+static bool
+ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_ipvs_mtinfo *data = par->matchinfo;
+	/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
+	const u_int8_t family = par->family;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn *cp;
+	bool match = true;
+
+	if (data->bitmask == XT_IPVS_IPVS_PROPERTY) {
+		match = skb->ipvs_property ^
+			!!(data->invert & XT_IPVS_IPVS_PROPERTY);
+		goto out;
+	}
+
+	/* other flags than XT_IPVS_IPVS_PROPERTY are set */
+	if (!skb->ipvs_property) {
+		match = false;
+		goto out;
+	}
+
+	ip_vs_fill_iphdr(family, skb_network_header(skb), &iph);
+
+	if (data->bitmask & XT_IPVS_PROTO)
+		if ((iph.protocol == data->l4proto) ^
+		    !(data->invert & XT_IPVS_PROTO)) {
+			match = false;
+			goto out;
+		}
+
+	pp = ip_vs_proto_get(iph.protocol);
+	if (unlikely(!pp)) {
+		match = false;
+		goto out;
+	}
+
+	/*
+	 * Check if the packet belongs to an existing entry
+	 */
+	cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */);
+	if (unlikely(cp == NULL)) {
+		match = false;
+		goto out;
+	}
+
+	/*
+	 * We found a connection, i.e. ct != 0, make sure to call
+	 * __ip_vs_conn_put before returning.  In our case jump to out_put_con.
+	 */
+
+	if (data->bitmask & XT_IPVS_VPORT)
+		if ((cp->vport == data->vport) ^
+		    !(data->invert & XT_IPVS_VPORT)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_VPORTCTL)
+		if ((cp->control != NULL &&
+		     cp->control->vport == data->vportctl) ^
+		    !(data->invert & XT_IPVS_VPORTCTL)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_DIR) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct == NULL || nf_ct_is_untracked(ct)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+		if ((ctinfo >= IP_CT_IS_REPLY) ^
+		    !!(data->invert & XT_IPVS_DIR)) {
+			match = false;
+			goto out_put_cp;
+		}
+	}
+
+	if (data->bitmask & XT_IPVS_METHOD)
+		if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^
+		    !(data->invert & XT_IPVS_METHOD)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_VADDR) {
+		if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr,
+				    &data->vmask, family) ^
+		    !(data->invert & XT_IPVS_VADDR)) {
+			match = false;
+			goto out_put_cp;
+		}
+	}
+
+out_put_cp:
+	__ip_vs_conn_put(cp);
+out:
+	pr_debug("match=%d\n", match);
+	return match;
+}
+
+static int ipvs_mt_check(const struct xt_mtchk_param *par)
+{
+	if (par->family != NFPROTO_IPV4
+#ifdef CONFIG_IP_VS_IPV6
+	    && par->family != NFPROTO_IPV6
+#endif
+		) {
+		pr_info("protocol family %u not supported\n", par->family);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match xt_ipvs_mt_reg __read_mostly = {
+	.name       = "ipvs",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = ipvs_mt,
+	.checkentry = ipvs_mt_check,
+	.matchsize  = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+	.me         = THIS_MODULE,
+};
+
+static int __init ipvs_mt_init(void)
+{
+	return xt_register_match(&xt_ipvs_mt_reg);
+}
+
+static void __exit ipvs_mt_exit(void)
+{
+	xt_unregister_match(&xt_ipvs_mt_reg);
+}
+
+module_init(ipvs_mt_init);
+module_exit(ipvs_mt_exit);
-- 
cgit v1.2.3-59-g8ed1b


From 43d2932d88e4ab776dd388c20b003ebd5e1d1f1f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 21 Jul 2010 14:22:21 +0200
Subject: quota: Use mark_inode_dirty_sync instead of mark_inode_dirty

Quota code never touches file data. It just modifies i_blocks + i_bytes
of inodes and inode flags of quota files. So use mark_inode_dirty_sync
instead of mark_inode_dirty.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c         |  2 +-
 include/linux/quotaops.h | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index b171221000fa..a7023bcfae4f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1992,7 +1992,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
 				truncate_inode_pages(&toputinode[cnt]->i_data,
 						     0);
 				mutex_unlock(&toputinode[cnt]->i_mutex);
-				mark_inode_dirty(toputinode[cnt]);
+				mark_inode_dirty_sync(toputinode[cnt]);
 			}
 			mutex_unlock(&dqopt->dqonoff_mutex);
 		}
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 4881b49b1a9a..d50ba858cfe0 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -266,7 +266,7 @@ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
 static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr)
 {
 	__dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL);
-	mark_inode_dirty(inode);
+	mark_inode_dirty_sync(inode);
 }
 
 static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
@@ -275,7 +275,7 @@ static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
 
 	ret = dquot_alloc_space_nodirty(inode, nr);
 	if (!ret)
-		mark_inode_dirty(inode);
+		mark_inode_dirty_sync(inode);
 	return ret;
 }
 
@@ -305,7 +305,7 @@ static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr)
 
 	ret = dquot_prealloc_block_nodirty(inode, nr);
 	if (!ret)
-		mark_inode_dirty(inode);
+		mark_inode_dirty_sync(inode);
 	return ret;
 }
 
@@ -321,7 +321,7 @@ static inline int dquot_claim_block(struct inode *inode, qsize_t nr)
 
 	ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits);
 	if (!ret)
-		mark_inode_dirty(inode);
+		mark_inode_dirty_sync(inode);
 	return ret;
 }
 
@@ -333,7 +333,7 @@ static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr)
 static inline void dquot_free_space(struct inode *inode, qsize_t nr)
 {
 	dquot_free_space_nodirty(inode, nr);
-	mark_inode_dirty(inode);
+	mark_inode_dirty_sync(inode);
 }
 
 static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr)
-- 
cgit v1.2.3-59-g8ed1b


From e8648a1fdb54da1f683784b36a17aa65ea56e931 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 23 Jul 2010 12:59:36 +0200
Subject: netfilter: add xt_cpu match

In some situations a CPU match permits a better spreading of
connections, or select targets only for a given cpu.

With Remote Packet Steering or multiqueue NIC and appropriate IRQ
affinities, we can distribute trafic on available cpus, per session.
(all RX packets for a given flow is handled by a given cpu)

Some legacy applications being not SMP friendly, one way to scale a
server is to run multiple copies of them.

Instead of randomly choosing an instance, we can use the cpu number as a
key so that softirq handler for a whole instance is running on a single
cpu, maximizing cache effects in TCP/UDP stacks.

Using NAT for example, a four ways machine might run four copies of
server application, using a separate listening port for each instance,
but still presenting an unique external port :

iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 0 \
        -j REDIRECT --to-port 8080

iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 1 \
        -j REDIRECT --to-port 8081

iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 2 \
        -j REDIRECT --to-port 8082

iptables -t nat -A PREROUTING -p tcp --dport 80 -m cpu --cpu 3 \
        -j REDIRECT --to-port 8083

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/Kbuild   |  3 +-
 include/linux/netfilter/xt_cpu.h | 11 +++++++
 net/netfilter/Kconfig            |  9 ++++++
 net/netfilter/Makefile           |  1 +
 net/netfilter/xt_cpu.c           | 63 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/netfilter/xt_cpu.h
 create mode 100644 net/netfilter/xt_cpu.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index 0cb62c857187..edeeabdc1500 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -20,12 +20,13 @@ header-y += xt_TCPMSS.h
 header-y += xt_TCPOPTSTRIP.h
 header-y += xt_TEE.h
 header-y += xt_TPROXY.h
+header-y += xt_cluster.h
 header-y += xt_comment.h
 header-y += xt_connbytes.h
 header-y += xt_connlimit.h
 header-y += xt_connmark.h
 header-y += xt_conntrack.h
-header-y += xt_cluster.h
+header-y += xt_cpu.h
 header-y += xt_dccp.h
 header-y += xt_dscp.h
 header-y += xt_esp.h
diff --git a/include/linux/netfilter/xt_cpu.h b/include/linux/netfilter/xt_cpu.h
new file mode 100644
index 000000000000..93c7f11d8f42
--- /dev/null
+++ b/include/linux/netfilter/xt_cpu.h
@@ -0,0 +1,11 @@
+#ifndef _XT_CPU_H
+#define _XT_CPU_H
+
+#include <linux/types.h>
+
+struct xt_cpu_info {
+	__u32	cpu;
+	__u32	invert;
+};
+
+#endif /*_XT_CPU_H*/
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 551b58419df9..43288259f4a1 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -663,6 +663,15 @@ config NETFILTER_XT_MATCH_CONNTRACK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_CPU
+	tristate '"cpu" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  CPU matching allows you to match packets based on the CPU
+	  currently handling the packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_DCCP
 	tristate '"dccp" protocol match support'
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 4366c79a6683..441050f31111 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
new file mode 100644
index 000000000000..b39db8a5cbae
--- /dev/null
+++ b/net/netfilter/xt_cpu.c
@@ -0,0 +1,63 @@
+/* Kernel module to match running CPU */
+
+/*
+ * Might be used to distribute connections on several daemons, if
+ * RPS (Remote Packet Steering) is enabled or NIC is multiqueue capable,
+ * each RX queue IRQ affined to one CPU (1:1 mapping)
+ *
+ */
+
+/* (C) 2010 Eric Dumazet
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_cpu.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
+MODULE_DESCRIPTION("Xtables: CPU match");
+
+static int cpu_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_cpu_info *info = par->matchinfo;
+
+	if (info->invert & ~1)
+		return -EINVAL;
+	return 0;
+}
+
+static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_cpu_info *info = par->matchinfo;
+
+	return (info->cpu == smp_processor_id()) ^ info->invert;
+}
+
+static struct xt_match cpu_mt_reg __read_mostly = {
+	.name       = "cpu",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = cpu_mt_check,
+	.match      = cpu_mt,
+	.matchsize  = sizeof(struct xt_cpu_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init cpu_mt_init(void)
+{
+	return xt_register_match(&cpu_mt_reg);
+}
+
+static void __exit cpu_mt_exit(void)
+{
+	xt_unregister_match(&cpu_mt_reg);
+}
+
+module_init(cpu_mt_init);
+module_exit(cpu_mt_exit);
-- 
cgit v1.2.3-59-g8ed1b


From 181a51f6e040d0ac006d6adaf4a031ffa440f41c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 20 Jul 2010 22:09:02 +0200
Subject: slow-work: kill it

slow-work doesn't have any user left.  Kill it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  322 -------------
 include/linux/slow-work.h   |  163 -------
 init/Kconfig                |   24 -
 kernel/Makefile             |    2 -
 kernel/slow-work-debugfs.c  |  227 ---------
 kernel/slow-work.c          | 1068 -------------------------------------------
 kernel/slow-work.h          |   72 ---
 kernel/sysctl.c             |    8 -
 8 files changed, 1886 deletions(-)
 delete mode 100644 Documentation/slow-work.txt
 delete mode 100644 include/linux/slow-work.h
 delete mode 100644 kernel/slow-work-debugfs.c
 delete mode 100644 kernel/slow-work.c
 delete mode 100644 kernel/slow-work.h

(limited to 'include/linux')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
deleted file mode 100644
index 9dbf4470c7e1..000000000000
--- a/Documentation/slow-work.txt
+++ /dev/null
@@ -1,322 +0,0 @@
-		     ====================================
-		     SLOW WORK ITEM EXECUTION THREAD POOL
-		     ====================================
-
-By: David Howells <dhowells@redhat.com>
-
-The slow work item execution thread pool is a pool of threads for performing
-things that take a relatively long time, such as making mkdir calls.
-Typically, when processing something, these items will spend a lot of time
-blocking a thread on I/O, thus making that thread unavailable for doing other
-work.
-
-The standard workqueue model is unsuitable for this class of work item as that
-limits the owner to a single thread or a single thread per CPU.  For some
-tasks, however, more threads - or fewer - are required.
-
-There is just one pool per system.  It contains no threads unless something
-wants to use it - and that something must register its interest first.  When
-the pool is active, the number of threads it contains is dynamic, varying
-between a maximum and minimum setting, depending on the load.
-
-
-====================
-CLASSES OF WORK ITEM
-====================
-
-This pool support two classes of work items:
-
- (*) Slow work items.
-
- (*) Very slow work items.
-
-The former are expected to finish much quicker than the latter.
-
-An operation of the very slow class may do a batch combination of several
-lookups, mkdirs, and a create for instance.
-
-An operation of the ordinarily slow class may, for example, write stuff or
-expand files, provided the time taken to do so isn't too long.
-
-Operations of both types may sleep during execution, thus tying up the thread
-loaned to it.
-
-A further class of work item is available, based on the slow work item class:
-
- (*) Delayed slow work items.
-
-These are slow work items that have a timer to defer queueing of the item for
-a while.
-
-
-THREAD-TO-CLASS ALLOCATION
---------------------------
-
-Not all the threads in the pool are available to work on very slow work items.
-The number will be between one and one fewer than the number of active threads.
-This is configurable (see the "Pool Configuration" section).
-
-All the threads are available to work on ordinarily slow work items, but a
-percentage of the threads will prefer to work on very slow work items.
-
-The configuration ensures that at least one thread will be available to work on
-very slow work items, and at least one thread will be available that won't work
-on very slow work items at all.
-
-
-=====================
-USING SLOW WORK ITEMS
-=====================
-
-Firstly, a module or subsystem wanting to make use of slow work items must
-register its interest:
-
-	 int ret = slow_work_register_user(struct module *module);
-
-This will return 0 if successful, or a -ve error upon failure.  The module
-pointer should be the module interested in using this facility (almost
-certainly THIS_MODULE).
-
-
-Slow work items may then be set up by:
-
- (1) Declaring a slow_work struct type variable:
-
-	#include <linux/slow-work.h>
-
-	struct slow_work myitem;
-
- (2) Declaring the operations to be used for this item:
-
-	struct slow_work_ops myitem_ops = {
-		.get_ref = myitem_get_ref,
-		.put_ref = myitem_put_ref,
-		.execute = myitem_execute,
-	};
-
-     [*] For a description of the ops, see section "Item Operations".
-
- (3) Initialising the item:
-
-	slow_work_init(&myitem, &myitem_ops);
-
-     or:
-
-	delayed_slow_work_init(&myitem, &myitem_ops);
-
-     or:
-
-	vslow_work_init(&myitem, &myitem_ops);
-
-     depending on its class.
-
-A suitably set up work item can then be enqueued for processing:
-
-	int ret = slow_work_enqueue(&myitem);
-
-This will return a -ve error if the thread pool is unable to gain a reference
-on the item, 0 otherwise, or (for delayed work):
-
-	int ret = delayed_slow_work_enqueue(&myitem, my_jiffy_delay);
-
-
-The items are reference counted, so there ought to be no need for a flush
-operation.  But as the reference counting is optional, means to cancel
-existing work items are also included:
-
-	cancel_slow_work(&myitem);
-	cancel_delayed_slow_work(&myitem);
-
-can be used to cancel pending work.  The above cancel function waits for
-existing work to have been executed (or prevent execution of them, depending
-on timing).
-
-
-When all a module's slow work items have been processed, and the
-module has no further interest in the facility, it should unregister its
-interest:
-
-	slow_work_unregister_user(struct module *module);
-
-The module pointer is used to wait for all outstanding work items for that
-module before completing the unregistration.  This prevents the put_ref() code
-from being taken away before it completes.  module should almost certainly be
-THIS_MODULE.
-
-
-================
-HELPER FUNCTIONS
-================
-
-The slow-work facility provides a function by which it can be determined
-whether or not an item is queued for later execution:
-
-	bool queued = slow_work_is_queued(struct slow_work *work);
-
-If it returns false, then the item is not on the queue (it may be executing
-with a requeue pending).  This can be used to work out whether an item on which
-another depends is on the queue, thus allowing a dependent item to be queued
-after it.
-
-If the above shows an item on which another depends not to be queued, then the
-owner of the dependent item might need to wait.  However, to avoid locking up
-the threads unnecessarily be sleeping in them, it can make sense under some
-circumstances to return the work item to the queue, thus deferring it until
-some other items have had a chance to make use of the yielded thread.
-
-To yield a thread and defer an item, the work function should simply enqueue
-the work item again and return.  However, this doesn't work if there's nothing
-actually on the queue, as the thread just vacated will jump straight back into
-the item's work function, thus busy waiting on a CPU.
-
-Instead, the item should use the thread to wait for the dependency to go away,
-but rather than using schedule() or schedule_timeout() to sleep, it should use
-the following function:
-
-	bool requeue = slow_work_sleep_till_thread_needed(
-			struct slow_work *work,
-			signed long *_timeout);
-
-This will add a second wait and then sleep, such that it will be woken up if
-either something appears on the queue that could usefully make use of the
-thread - and behind which this item can be queued, or if the event the caller
-set up to wait for happens.  True will be returned if something else appeared
-on the queue and this work function should perhaps return, of false if
-something else woke it up.  The timeout is as for schedule_timeout().
-
-For example:
-
-	wq = bit_waitqueue(&my_flags, MY_BIT);
-	init_wait(&wait);
-	requeue = false;
-	do {
-		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
-		if (!test_bit(MY_BIT, &my_flags))
-			break;
-		requeue = slow_work_sleep_till_thread_needed(&my_work,
-							     &timeout);
-	} while (timeout > 0 && !requeue);
-	finish_wait(wq, &wait);
-	if (!test_bit(MY_BIT, &my_flags)
-		goto do_my_thing;
-	if (requeue)
-		return; // to slow_work
-
-
-===============
-ITEM OPERATIONS
-===============
-
-Each work item requires a table of operations of type struct slow_work_ops.
-Only ->execute() is required; the getting and putting of a reference and the
-describing of an item are all optional.
-
- (*) Get a reference on an item:
-
-	int (*get_ref)(struct slow_work *work);
-
-     This allows the thread pool to attempt to pin an item by getting a
-     reference on it.  This function should return 0 if the reference was
-     granted, or a -ve error otherwise.  If an error is returned,
-     slow_work_enqueue() will fail.
-
-     The reference is held whilst the item is queued and whilst it is being
-     executed.  The item may then be requeued with the same reference held, or
-     the reference will be released.
-
- (*) Release a reference on an item:
-
-	void (*put_ref)(struct slow_work *work);
-
-     This allows the thread pool to unpin an item by releasing the reference on
-     it.  The thread pool will not touch the item again once this has been
-     called.
-
- (*) Execute an item:
-
-	void (*execute)(struct slow_work *work);
-
-     This should perform the work required of the item.  It may sleep, it may
-     perform disk I/O and it may wait for locks.
-
- (*) View an item through /proc:
-
-	void (*desc)(struct slow_work *work, struct seq_file *m);
-
-     If supplied, this should print to 'm' a small string describing the work
-     the item is to do.  This should be no more than about 40 characters, and
-     shouldn't include a newline character.
-
-     See the 'Viewing executing and queued items' section below.
-
-
-==================
-POOL CONFIGURATION
-==================
-
-The slow-work thread pool has a number of configurables:
-
- (*) /proc/sys/kernel/slow-work/min-threads
-
-     The minimum number of threads that should be in the pool whilst it is in
-     use.  This may be anywhere between 2 and max-threads.
-
- (*) /proc/sys/kernel/slow-work/max-threads
-
-     The maximum number of threads that should in the pool.  This may be
-     anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater.
-
- (*) /proc/sys/kernel/slow-work/vslow-percentage
-
-     The percentage of active threads in the pool that may be used to execute
-     very slow work items.  This may be between 1 and 99.  The resultant number
-     is bounded to between 1 and one fewer than the number of active threads.
-     This ensures there is always at least one thread that can process very
-     slow work items, and always at least one thread that won't.
-
-
-==================================
-VIEWING EXECUTING AND QUEUED ITEMS
-==================================
-
-If CONFIG_SLOW_WORK_DEBUG is enabled, a debugfs file is made available:
-
-	/sys/kernel/debug/slow_work/runqueue
-
-through which the list of work items being executed and the queues of items to
-be executed may be viewed.  The owner of a work item is given the chance to
-add some information of its own.
-
-The contents look something like the following:
-
-    THR PID   ITEM ADDR        FL MARK  DESC
-    === ===== ================ == ===== ==========
-      0  3005 ffff880023f52348  a 952ms FSC: OBJ17d3: LOOK
-      1  3006 ffff880024e33668  2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
-      2  3165 ffff8800296dd180  a 424ms FSC: OBJ17e4: LOOK
-      3  4089 ffff8800262c8d78  a 212ms FSC: OBJ17ea: CRTN
-      4  4090 ffff88002792bed8  2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
-      5  4092 ffff88002a0ef308  2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
-      6  4094 ffff88002abaf4b8  2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
-      7  4095 ffff88002bb188e0  a 388ms FSC: OBJ17e9: CRTN
-    vsq     - ffff880023d99668  1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
-    vsq     - ffff8800295d1740  1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
-    vsq     - ffff880025ba3308  1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
-    vsq     - ffff880024ec83e0  1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
-    vsq     - ffff880026618e00  1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
-    vsq     - ffff880025a2a4b8  1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
-    vsq     - ffff880023cbe6d8  9 212ms FSC: OBJ17eb: LOOK
-    vsq     - ffff880024d37590  9 212ms FSC: OBJ17ec: LOOK
-    vsq     - ffff880027746cb0  9 212ms FSC: OBJ17ed: LOOK
-    vsq     - ffff880024d37ae8  9 212ms FSC: OBJ17ee: LOOK
-    vsq     - ffff880024d37cb0  9 212ms FSC: OBJ17ef: LOOK
-    vsq     - ffff880025036550  9 212ms FSC: OBJ17f0: LOOK
-    vsq     - ffff8800250368e0  9 212ms FSC: OBJ17f1: LOOK
-    vsq     - ffff880025036aa8  9 212ms FSC: OBJ17f2: LOOK
-
-In the 'THR' column, executing items show the thread they're occupying and
-queued threads indicate which queue they're on.  'PID' shows the process ID of
-a slow-work thread that's executing something.  'FL' shows the work item flags.
-'MARK' indicates how long since an item was queued or began executing.  Lastly,
-the 'DESC' column permits the owner of an item to give some information.
-
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
deleted file mode 100644
index 13337bf6c3f5..000000000000
--- a/include/linux/slow-work.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- *
- * See Documentation/slow-work.txt
- */
-
-#ifndef _LINUX_SLOW_WORK_H
-#define _LINUX_SLOW_WORK_H
-
-#ifdef CONFIG_SLOW_WORK
-
-#include <linux/sysctl.h>
-#include <linux/timer.h>
-
-struct slow_work;
-#ifdef CONFIG_SLOW_WORK_DEBUG
-struct seq_file;
-#endif
-
-/*
- * The operations used to support slow work items
- */
-struct slow_work_ops {
-	/* owner */
-	struct module *owner;
-
-	/* get a ref on a work item
-	 * - return 0 if successful, -ve if not
-	 */
-	int (*get_ref)(struct slow_work *work);
-
-	/* discard a ref to a work item */
-	void (*put_ref)(struct slow_work *work);
-
-	/* execute a work item */
-	void (*execute)(struct slow_work *work);
-
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	/* describe a work item for debugfs */
-	void (*desc)(struct slow_work *work, struct seq_file *m);
-#endif
-};
-
-/*
- * A slow work item
- * - A reference is held on the parent object by the thread pool when it is
- *   queued
- */
-struct slow_work {
-	struct module		*owner;	/* the owning module */
-	unsigned long		flags;
-#define SLOW_WORK_PENDING	0	/* item pending (further) execution */
-#define SLOW_WORK_EXECUTING	1	/* item currently executing */
-#define SLOW_WORK_ENQ_DEFERRED	2	/* item enqueue deferred */
-#define SLOW_WORK_VERY_SLOW	3	/* item is very slow */
-#define SLOW_WORK_CANCELLING	4	/* item is being cancelled, don't enqueue */
-#define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
-	const struct slow_work_ops *ops; /* operations table for this item */
-	struct list_head	link;	/* link in queue */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	struct timespec		mark;	/* jiffies at which queued or exec begun */
-#endif
-};
-
-struct delayed_slow_work {
-	struct slow_work	work;
-	struct timer_list	timer;
-};
-
-/**
- * slow_work_init - Initialise a slow work item
- * @work: The work item to initialise
- * @ops: The operations to use to handle the slow work item
- *
- * Initialise a slow work item.
- */
-static inline void slow_work_init(struct slow_work *work,
-				  const struct slow_work_ops *ops)
-{
-	work->flags = 0;
-	work->ops = ops;
-	INIT_LIST_HEAD(&work->link);
-}
-
-/**
- * slow_work_init - Initialise a delayed slow work item
- * @work: The work item to initialise
- * @ops: The operations to use to handle the slow work item
- *
- * Initialise a delayed slow work item.
- */
-static inline void delayed_slow_work_init(struct delayed_slow_work *dwork,
-					  const struct slow_work_ops *ops)
-{
-	init_timer(&dwork->timer);
-	slow_work_init(&dwork->work, ops);
-}
-
-/**
- * vslow_work_init - Initialise a very slow work item
- * @work: The work item to initialise
- * @ops: The operations to use to handle the slow work item
- *
- * Initialise a very slow work item.  This item will be restricted such that
- * only a certain number of the pool threads will be able to execute items of
- * this type.
- */
-static inline void vslow_work_init(struct slow_work *work,
-				   const struct slow_work_ops *ops)
-{
-	work->flags = 1 << SLOW_WORK_VERY_SLOW;
-	work->ops = ops;
-	INIT_LIST_HEAD(&work->link);
-}
-
-/**
- * slow_work_is_queued - Determine if a slow work item is on the work queue
- * work: The work item to test
- *
- * Determine if the specified slow-work item is on the work queue.  This
- * returns true if it is actually on the queue.
- *
- * If the item is executing and has been marked for requeue when execution
- * finishes, then false will be returned.
- *
- * Anyone wishing to wait for completion of execution can wait on the
- * SLOW_WORK_EXECUTING bit.
- */
-static inline bool slow_work_is_queued(struct slow_work *work)
-{
-	unsigned long flags = work->flags;
-	return flags & SLOW_WORK_PENDING && !(flags & SLOW_WORK_EXECUTING);
-}
-
-extern int slow_work_enqueue(struct slow_work *work);
-extern void slow_work_cancel(struct slow_work *work);
-extern int slow_work_register_user(struct module *owner);
-extern void slow_work_unregister_user(struct module *owner);
-
-extern int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
-				     unsigned long delay);
-
-static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork)
-{
-	slow_work_cancel(&dwork->work);
-}
-
-extern bool slow_work_sleep_till_thread_needed(struct slow_work *work,
-					       signed long *_timeout);
-
-#ifdef CONFIG_SYSCTL
-extern ctl_table slow_work_sysctls[];
-#endif
-
-#endif /* CONFIG_SLOW_WORK */
-#endif /* _LINUX_SLOW_WORK_H */
diff --git a/init/Kconfig b/init/Kconfig
index 5cff9a980c39..cb64c5889e02 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1143,30 +1143,6 @@ config TRACEPOINTS
 
 source "arch/Kconfig"
 
-config SLOW_WORK
-	default n
-	bool
-	help
-	  The slow work thread pool provides a number of dynamically allocated
-	  threads that can be used by the kernel to perform operations that
-	  take a relatively long time.
-
-	  An example of this would be CacheFiles doing a path lookup followed
-	  by a series of mkdirs and a create call, all of which have to touch
-	  disk.
-
-	  See Documentation/slow-work.txt.
-
-config SLOW_WORK_DEBUG
-	bool "Slow work debugging through debugfs"
-	default n
-	depends on SLOW_WORK && DEBUG_FS
-	help
-	  Display the contents of the slow work run queue through debugfs,
-	  including items currently executing.
-
-	  See Documentation/slow-work.txt.
-
 endmenu		# General setup
 
 config HAVE_GENERIC_DMA_COHERENT
diff --git a/kernel/Makefile b/kernel/Makefile
index 057472fbc272..2484ac39b2e2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -99,8 +99,6 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
-obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Slow work debugging
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include "slow-work.h"
-
-#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
-#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
-#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
-
-void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
-{
-	seq_puts(m, "Slow-work: New thread");
-}
-
-/*
- * Render the time mark field on a work item into a 5-char time with units plus
- * a space
- */
-static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
-{
-	struct timespec now, diff;
-
-	now = CURRENT_TIME;
-	diff = timespec_sub(now, work->mark);
-
-	if (diff.tv_sec < 0)
-		seq_puts(m, "  -ve ");
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
-		seq_printf(m, "%3luns ", diff.tv_nsec);
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
-		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
-		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
-	else if (diff.tv_sec <= 1)
-		seq_puts(m, "   1s ");
-	else if (diff.tv_sec < 60)
-		seq_printf(m, "%4lus ", diff.tv_sec);
-	else if (diff.tv_sec < 60 * 60)
-		seq_printf(m, "%4lum ", diff.tv_sec / 60);
-	else if (diff.tv_sec < 60 * 60 * 24)
-		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
-	else
-		seq_puts(m, "exces ");
-}
-
-/*
- * Describe a slow work item for debugfs
- */
-static int slow_work_runqueue_show(struct seq_file *m, void *v)
-{
-	struct slow_work *work;
-	struct list_head *p = v;
-	unsigned long id;
-
-	switch ((unsigned long) v) {
-	case 1:
-		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
-		return 0;
-	case 2:
-		seq_puts(m, "=== ===== ================ == ===== ==========\n");
-		return 0;
-
-	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
-		id = (unsigned long) v - 3;
-
-		read_lock(&slow_work_execs_lock);
-		work = slow_work_execs[id];
-		if (work) {
-			smp_read_barrier_depends();
-
-			seq_printf(m, "%3lu %5d %16p %2lx ",
-				   id, slow_work_pids[id], work, work->flags);
-			slow_work_print_mark(m, work);
-
-			if (work->ops->desc)
-				work->ops->desc(work, m);
-			seq_putc(m, '\n');
-		}
-		read_unlock(&slow_work_execs_lock);
-		return 0;
-
-	default:
-		work = list_entry(p, struct slow_work, link);
-		seq_printf(m, "%3s     - %16p %2lx ",
-			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
-			   work, work->flags);
-		slow_work_print_mark(m, work);
-
-		if (work->ops->desc)
-			work->ops->desc(work, m);
-		seq_putc(m, '\n');
-		return 0;
-	}
-}
-
-/*
- * map the iterator to a work item
- */
-static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
-{
-	struct list_head *p;
-	unsigned long count, id;
-
-	switch (*_pos >> ITERATOR_SHIFT) {
-	case 0x0:
-		if (*_pos == 0)
-			*_pos = 1;
-		if (*_pos < 3)
-			return (void *)(unsigned long) *_pos;
-		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
-			for (id = *_pos - 3;
-			     id < SLOW_WORK_THREAD_LIMIT;
-			     id++, (*_pos)++)
-				if (slow_work_execs[id])
-					return (void *)(unsigned long) *_pos;
-		*_pos = 0x1UL << ITERATOR_SHIFT;
-
-	case 0x1:
-		count = *_pos & ITERATOR_COUNTER;
-		list_for_each(p, &slow_work_queue) {
-			if (count == 0)
-				return p;
-			count--;
-		}
-		*_pos = 0x2UL << ITERATOR_SHIFT;
-
-	case 0x2:
-		count = *_pos & ITERATOR_COUNTER;
-		list_for_each(p, &vslow_work_queue) {
-			if (count == 0)
-				return p;
-			count--;
-		}
-		*_pos = 0x3UL << ITERATOR_SHIFT;
-
-	default:
-		return NULL;
-	}
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
-{
-	spin_lock_irq(&slow_work_queue_lock);
-	return slow_work_runqueue_index(m, _pos);
-}
-
-/*
- * move to the next line
- */
-static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
-{
-	struct list_head *p = v;
-	unsigned long selector = *_pos >> ITERATOR_SHIFT;
-
-	(*_pos)++;
-	switch (selector) {
-	case 0x0:
-		return slow_work_runqueue_index(m, _pos);
-
-	case 0x1:
-		if (*_pos >> ITERATOR_SHIFT == 0x1) {
-			p = p->next;
-			if (p != &slow_work_queue)
-				return p;
-		}
-		*_pos = 0x2UL << ITERATOR_SHIFT;
-		p = &vslow_work_queue;
-
-	case 0x2:
-		if (*_pos >> ITERATOR_SHIFT == 0x2) {
-			p = p->next;
-			if (p != &vslow_work_queue)
-				return p;
-		}
-		*_pos = 0x3UL << ITERATOR_SHIFT;
-
-	default:
-		return NULL;
-	}
-}
-
-/*
- * clean up after reading
- */
-static void slow_work_runqueue_stop(struct seq_file *m, void *v)
-{
-	spin_unlock_irq(&slow_work_queue_lock);
-}
-
-static const struct seq_operations slow_work_runqueue_ops = {
-	.start		= slow_work_runqueue_start,
-	.stop		= slow_work_runqueue_stop,
-	.next		= slow_work_runqueue_next,
-	.show		= slow_work_runqueue_show,
-};
-
-/*
- * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
- */
-static int slow_work_runqueue_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &slow_work_runqueue_ops);
-}
-
-const struct file_operations slow_work_runqueue_fops = {
-	.owner		= THIS_MODULE,
-	.open		= slow_work_runqueue_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
-/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- *
- * See Documentation/slow-work.txt
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-#include <linux/wait.h>
-#include <linux/debugfs.h>
-#include "slow-work.h"
-
-static void slow_work_cull_timeout(unsigned long);
-static void slow_work_oom_timeout(unsigned long);
-
-#ifdef CONFIG_SYSCTL
-static int slow_work_min_threads_sysctl(struct ctl_table *, int,
-					void __user *, size_t *, loff_t *);
-
-static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
-					void __user *, size_t *, loff_t *);
-#endif
-
-/*
- * The pool of threads has at least min threads in it as long as someone is
- * using the facility, and may have as many as max.
- *
- * A portion of the pool may be processing very slow operations.
- */
-static unsigned slow_work_min_threads = 2;
-static unsigned slow_work_max_threads = 4;
-static unsigned vslow_work_proportion = 50; /* % of threads that may process
-					     * very slow work */
-
-#ifdef CONFIG_SYSCTL
-static const int slow_work_min_min_threads = 2;
-static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
-static const int slow_work_min_vslow = 1;
-static const int slow_work_max_vslow = 99;
-
-ctl_table slow_work_sysctls[] = {
-	{
-		.procname	= "min-threads",
-		.data		= &slow_work_min_threads,
-		.maxlen		= sizeof(unsigned),
-		.mode		= 0644,
-		.proc_handler	= slow_work_min_threads_sysctl,
-		.extra1		= (void *) &slow_work_min_min_threads,
-		.extra2		= &slow_work_max_threads,
-	},
-	{
-		.procname	= "max-threads",
-		.data		= &slow_work_max_threads,
-		.maxlen		= sizeof(unsigned),
-		.mode		= 0644,
-		.proc_handler	= slow_work_max_threads_sysctl,
-		.extra1		= &slow_work_min_threads,
-		.extra2		= (void *) &slow_work_max_max_threads,
-	},
-	{
-		.procname	= "vslow-percentage",
-		.data		= &vslow_work_proportion,
-		.maxlen		= sizeof(unsigned),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= (void *) &slow_work_min_vslow,
-		.extra2		= (void *) &slow_work_max_vslow,
-	},
-	{}
-};
-#endif
-
-/*
- * The active state of the thread pool
- */
-static atomic_t slow_work_thread_count;
-static atomic_t vslow_work_executing_count;
-
-static bool slow_work_may_not_start_new_thread;
-static bool slow_work_cull; /* cull a thread due to lack of activity */
-static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
-static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
-static struct slow_work slow_work_new_thread; /* new thread starter */
-
-/*
- * slow work ID allocation (use slow_work_queue_lock)
- */
-static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
-
-/*
- * Unregistration tracking to prevent put_ref() from disappearing during module
- * unload
- */
-#ifdef CONFIG_MODULES
-static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
-static struct module *slow_work_unreg_module;
-static struct slow_work *slow_work_unreg_work_item;
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
-static DEFINE_MUTEX(slow_work_unreg_sync_lock);
-
-static void slow_work_set_thread_processing(int id, struct slow_work *work)
-{
-	if (work)
-		slow_work_thread_processing[id] = work->owner;
-}
-static void slow_work_done_thread_processing(int id, struct slow_work *work)
-{
-	struct module *module = slow_work_thread_processing[id];
-
-	slow_work_thread_processing[id] = NULL;
-	smp_mb();
-	if (slow_work_unreg_work_item == work ||
-	    slow_work_unreg_module == module)
-		wake_up_all(&slow_work_unreg_wq);
-}
-static void slow_work_clear_thread_processing(int id)
-{
-	slow_work_thread_processing[id] = NULL;
-}
-#else
-static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
-static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
-static void slow_work_clear_thread_processing(int id) {}
-#endif
-
-/*
- * Data for tracking currently executing items for indication through /proc
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
-pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
-DEFINE_RWLOCK(slow_work_execs_lock);
-#endif
-
-/*
- * The queues of work items and the lock governing access to them.  These are
- * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
- * as the number of threads bears no relation to the number of CPUs.
- *
- * There are two queues of work items: one for slow work items, and one for
- * very slow work items.
- */
-LIST_HEAD(slow_work_queue);
-LIST_HEAD(vslow_work_queue);
-DEFINE_SPINLOCK(slow_work_queue_lock);
-
-/*
- * The following are two wait queues that get pinged when a work item is placed
- * on an empty queue.  These allow work items that are hogging a thread by
- * sleeping in a way that could be deferred to yield their thread and enqueue
- * themselves.
- */
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
-static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
-
-/*
- * The thread controls.  A variable used to signal to the threads that they
- * should exit when the queue is empty, a waitqueue used by the threads to wait
- * for signals, and a completion set by the last thread to exit.
- */
-static bool slow_work_threads_should_exit;
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
-static DECLARE_COMPLETION(slow_work_last_thread_exited);
-
-/*
- * The number of users of the thread pool and its lock.  Whilst this is zero we
- * have no threads hanging around, and when this reaches zero, we wait for all
- * active or queued work items to complete and kill all the threads we do have.
- */
-static int slow_work_user_count;
-static DEFINE_MUTEX(slow_work_user_lock);
-
-static inline int slow_work_get_ref(struct slow_work *work)
-{
-	if (work->ops->get_ref)
-		return work->ops->get_ref(work);
-
-	return 0;
-}
-
-static inline void slow_work_put_ref(struct slow_work *work)
-{
-	if (work->ops->put_ref)
-		work->ops->put_ref(work);
-}
-
-/*
- * Calculate the maximum number of active threads in the pool that are
- * permitted to process very slow work items.
- *
- * The answer is rounded up to at least 1, but may not equal or exceed the
- * maximum number of the threads in the pool.  This means we always have at
- * least one thread that can process slow work items, and we always have at
- * least one thread that won't get tied up doing so.
- */
-static unsigned slow_work_calc_vsmax(void)
-{
-	unsigned vsmax;
-
-	vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
-	vsmax /= 100;
-	vsmax = max(vsmax, 1U);
-	return min(vsmax, slow_work_max_threads - 1);
-}
-
-/*
- * Attempt to execute stuff queued on a slow thread.  Return true if we managed
- * it, false if there was nothing to do.
- */
-static noinline bool slow_work_execute(int id)
-{
-	struct slow_work *work = NULL;
-	unsigned vsmax;
-	bool very_slow;
-
-	vsmax = slow_work_calc_vsmax();
-
-	/* see if we can schedule a new thread to be started if we're not
-	 * keeping up with the work */
-	if (!waitqueue_active(&slow_work_thread_wq) &&
-	    (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
-	    atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
-	    !slow_work_may_not_start_new_thread)
-		slow_work_enqueue(&slow_work_new_thread);
-
-	/* find something to execute */
-	spin_lock_irq(&slow_work_queue_lock);
-	if (!list_empty(&vslow_work_queue) &&
-	    atomic_read(&vslow_work_executing_count) < vsmax) {
-		work = list_entry(vslow_work_queue.next,
-				  struct slow_work, link);
-		if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
-			BUG();
-		list_del_init(&work->link);
-		atomic_inc(&vslow_work_executing_count);
-		very_slow = true;
-	} else if (!list_empty(&slow_work_queue)) {
-		work = list_entry(slow_work_queue.next,
-				  struct slow_work, link);
-		if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
-			BUG();
-		list_del_init(&work->link);
-		very_slow = false;
-	} else {
-		very_slow = false; /* avoid the compiler warning */
-	}
-
-	slow_work_set_thread_processing(id, work);
-	if (work) {
-		slow_work_mark_time(work);
-		slow_work_begin_exec(id, work);
-	}
-
-	spin_unlock_irq(&slow_work_queue_lock);
-
-	if (!work)
-		return false;
-
-	if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
-		BUG();
-
-	/* don't execute if the work is in the process of being cancelled */
-	if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
-		work->ops->execute(work);
-
-	if (very_slow)
-		atomic_dec(&vslow_work_executing_count);
-	clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
-
-	/* wake up anyone waiting for this work to be complete */
-	wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
-
-	slow_work_end_exec(id, work);
-
-	/* if someone tried to enqueue the item whilst we were executing it,
-	 * then it'll be left unenqueued to avoid multiple threads trying to
-	 * execute it simultaneously
-	 *
-	 * there is, however, a race between us testing the pending flag and
-	 * getting the spinlock, and between the enqueuer setting the pending
-	 * flag and getting the spinlock, so we use a deferral bit to tell us
-	 * if the enqueuer got there first
-	 */
-	if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
-		spin_lock_irq(&slow_work_queue_lock);
-
-		if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
-		    test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
-			goto auto_requeue;
-
-		spin_unlock_irq(&slow_work_queue_lock);
-	}
-
-	/* sort out the race between module unloading and put_ref() */
-	slow_work_put_ref(work);
-	slow_work_done_thread_processing(id, work);
-
-	return true;
-
-auto_requeue:
-	/* we must complete the enqueue operation
-	 * - we transfer our ref on the item back to the appropriate queue
-	 * - don't wake another thread up as we're awake already
-	 */
-	slow_work_mark_time(work);
-	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
-		list_add_tail(&work->link, &vslow_work_queue);
-	else
-		list_add_tail(&work->link, &slow_work_queue);
-	spin_unlock_irq(&slow_work_queue_lock);
-	slow_work_clear_thread_processing(id);
-	return true;
-}
-
-/**
- * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
- * work: The work item under execution that wants to sleep
- * _timeout: Scheduler sleep timeout
- *
- * Allow a requeueable work item to sleep on a slow-work processor thread until
- * that thread is needed to do some other work or the sleep is interrupted by
- * some other event.
- *
- * The caller must set up a wake up event before calling this and must have set
- * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
- * condition before calling this function as no test is made here.
- *
- * False is returned if there is nothing on the queue; true is returned if the
- * work item should be requeued
- */
-bool slow_work_sleep_till_thread_needed(struct slow_work *work,
-					signed long *_timeout)
-{
-	wait_queue_head_t *wfo_wq;
-	struct list_head *queue;
-
-	DEFINE_WAIT(wait);
-
-	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
-		wfo_wq = &vslow_work_queue_waits_for_occupation;
-		queue = &vslow_work_queue;
-	} else {
-		wfo_wq = &slow_work_queue_waits_for_occupation;
-		queue = &slow_work_queue;
-	}
-
-	if (!list_empty(queue))
-		return true;
-
-	add_wait_queue_exclusive(wfo_wq, &wait);
-	if (list_empty(queue))
-		*_timeout = schedule_timeout(*_timeout);
-	finish_wait(wfo_wq, &wait);
-
-	return !list_empty(queue);
-}
-EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
-
-/**
- * slow_work_enqueue - Schedule a slow work item for processing
- * @work: The work item to queue
- *
- * Schedule a slow work item for processing.  If the item is already undergoing
- * execution, this guarantees not to re-enter the execution routine until the
- * first execution finishes.
- *
- * The item is pinned by this function as it retains a reference to it, managed
- * through the item operations.  The item is unpinned once it has been
- * executed.
- *
- * An item may hog the thread that is running it for a relatively large amount
- * of time, sufficient, for example, to perform several lookup, mkdir, create
- * and setxattr operations.  It may sleep on I/O and may sleep to obtain locks.
- *
- * Conversely, if a number of items are awaiting processing, it may take some
- * time before any given item is given attention.  The number of threads in the
- * pool may be increased to deal with demand, but only up to a limit.
- *
- * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
- * the very slow queue, from which only a portion of the threads will be
- * allowed to pick items to execute.  This ensures that very slow items won't
- * overly block ones that are just ordinarily slow.
- *
- * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
- * attempted queued)
- */
-int slow_work_enqueue(struct slow_work *work)
-{
-	wait_queue_head_t *wfo_wq;
-	struct list_head *queue;
-	unsigned long flags;
-	int ret;
-
-	if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
-		return -ECANCELED;
-
-	BUG_ON(slow_work_user_count <= 0);
-	BUG_ON(!work);
-	BUG_ON(!work->ops);
-
-	/* when honouring an enqueue request, we only promise that we will run
-	 * the work function in the future; we do not promise to run it once
-	 * per enqueue request
-	 *
-	 * we use the PENDING bit to merge together repeat requests without
-	 * having to disable IRQs and take the spinlock, whilst still
-	 * maintaining our promise
-	 */
-	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
-		if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
-			wfo_wq = &vslow_work_queue_waits_for_occupation;
-			queue = &vslow_work_queue;
-		} else {
-			wfo_wq = &slow_work_queue_waits_for_occupation;
-			queue = &slow_work_queue;
-		}
-
-		spin_lock_irqsave(&slow_work_queue_lock, flags);
-
-		if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
-			goto cancelled;
-
-		/* we promise that we will not attempt to execute the work
-		 * function in more than one thread simultaneously
-		 *
-		 * this, however, leaves us with a problem if we're asked to
-		 * enqueue the work whilst someone is executing the work
-		 * function as simply queueing the work immediately means that
-		 * another thread may try executing it whilst it is already
-		 * under execution
-		 *
-		 * to deal with this, we set the ENQ_DEFERRED bit instead of
-		 * enqueueing, and the thread currently executing the work
-		 * function will enqueue the work item when the work function
-		 * returns and it has cleared the EXECUTING bit
-		 */
-		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
-			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
-		} else {
-			ret = slow_work_get_ref(work);
-			if (ret < 0)
-				goto failed;
-			slow_work_mark_time(work);
-			list_add_tail(&work->link, queue);
-			wake_up(&slow_work_thread_wq);
-
-			/* if someone who could be requeued is sleeping on a
-			 * thread, then ask them to yield their thread */
-			if (work->link.prev == queue)
-				wake_up(wfo_wq);
-		}
-
-		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	}
-	return 0;
-
-cancelled:
-	ret = -ECANCELED;
-failed:
-	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	return ret;
-}
-EXPORT_SYMBOL(slow_work_enqueue);
-
-static int slow_work_wait(void *word)
-{
-	schedule();
-	return 0;
-}
-
-/**
- * slow_work_cancel - Cancel a slow work item
- * @work: The work item to cancel
- *
- * This function will cancel a previously enqueued work item. If we cannot
- * cancel the work item, it is guarenteed to have run when this function
- * returns.
- */
-void slow_work_cancel(struct slow_work *work)
-{
-	bool wait = true, put = false;
-
-	set_bit(SLOW_WORK_CANCELLING, &work->flags);
-	smp_mb();
-
-	/* if the work item is a delayed work item with an active timer, we
-	 * need to wait for the timer to finish _before_ getting the spinlock,
-	 * lest we deadlock against the timer routine
-	 *
-	 * the timer routine will leave DELAYED set if it notices the
-	 * CANCELLING flag in time
-	 */
-	if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
-		struct delayed_slow_work *dwork =
-			container_of(work, struct delayed_slow_work, work);
-		del_timer_sync(&dwork->timer);
-	}
-
-	spin_lock_irq(&slow_work_queue_lock);
-
-	if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
-		/* the timer routine aborted or never happened, so we are left
-		 * holding the timer's reference on the item and should just
-		 * drop the pending flag and wait for any ongoing execution to
-		 * finish */
-		struct delayed_slow_work *dwork =
-			container_of(work, struct delayed_slow_work, work);
-
-		BUG_ON(timer_pending(&dwork->timer));
-		BUG_ON(!list_empty(&work->link));
-
-		clear_bit(SLOW_WORK_DELAYED, &work->flags);
-		put = true;
-		clear_bit(SLOW_WORK_PENDING, &work->flags);
-
-	} else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
-		   !list_empty(&work->link)) {
-		/* the link in the pending queue holds a reference on the item
-		 * that we will need to release */
-		list_del_init(&work->link);
-		wait = false;
-		put = true;
-		clear_bit(SLOW_WORK_PENDING, &work->flags);
-
-	} else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
-		/* the executor is holding our only reference on the item, so
-		 * we merely need to wait for it to finish executing */
-		clear_bit(SLOW_WORK_PENDING, &work->flags);
-	}
-
-	spin_unlock_irq(&slow_work_queue_lock);
-
-	/* the EXECUTING flag is set by the executor whilst the spinlock is set
-	 * and before the item is dequeued - so assuming the above doesn't
-	 * actually dequeue it, simply waiting for the EXECUTING flag to be
-	 * released here should be sufficient */
-	if (wait)
-		wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
-			    TASK_UNINTERRUPTIBLE);
-
-	clear_bit(SLOW_WORK_CANCELLING, &work->flags);
-	if (put)
-		slow_work_put_ref(work);
-}
-EXPORT_SYMBOL(slow_work_cancel);
-
-/*
- * Handle expiry of the delay timer, indicating that a delayed slow work item
- * should now be queued if not cancelled
- */
-static void delayed_slow_work_timer(unsigned long data)
-{
-	wait_queue_head_t *wfo_wq;
-	struct list_head *queue;
-	struct slow_work *work = (struct slow_work *) data;
-	unsigned long flags;
-	bool queued = false, put = false, first = false;
-
-	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
-		wfo_wq = &vslow_work_queue_waits_for_occupation;
-		queue = &vslow_work_queue;
-	} else {
-		wfo_wq = &slow_work_queue_waits_for_occupation;
-		queue = &slow_work_queue;
-	}
-
-	spin_lock_irqsave(&slow_work_queue_lock, flags);
-	if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
-		clear_bit(SLOW_WORK_DELAYED, &work->flags);
-
-		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
-			/* we discard the reference the timer was holding in
-			 * favour of the one the executor holds */
-			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
-			put = true;
-		} else {
-			slow_work_mark_time(work);
-			list_add_tail(&work->link, queue);
-			queued = true;
-			if (work->link.prev == queue)
-				first = true;
-		}
-	}
-
-	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	if (put)
-		slow_work_put_ref(work);
-	if (first)
-		wake_up(wfo_wq);
-	if (queued)
-		wake_up(&slow_work_thread_wq);
-}
-
-/**
- * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
- * @dwork: The delayed work item to queue
- * @delay: When to start executing the work, in jiffies from now
- *
- * This is similar to slow_work_enqueue(), but it adds a delay before the work
- * is actually queued for processing.
- *
- * The item can have delayed processing requested on it whilst it is being
- * executed.  The delay will begin immediately, and if it expires before the
- * item finishes executing, the item will be placed back on the queue when it
- * has done executing.
- */
-int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
-			      unsigned long delay)
-{
-	struct slow_work *work = &dwork->work;
-	unsigned long flags;
-	int ret;
-
-	if (delay == 0)
-		return slow_work_enqueue(&dwork->work);
-
-	BUG_ON(slow_work_user_count <= 0);
-	BUG_ON(!work);
-	BUG_ON(!work->ops);
-
-	if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
-		return -ECANCELED;
-
-	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
-		spin_lock_irqsave(&slow_work_queue_lock, flags);
-
-		if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
-			goto cancelled;
-
-		/* the timer holds a reference whilst it is pending */
-		ret = slow_work_get_ref(work);
-		if (ret < 0)
-			goto cant_get_ref;
-
-		if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
-			BUG();
-		dwork->timer.expires = jiffies + delay;
-		dwork->timer.data = (unsigned long) work;
-		dwork->timer.function = delayed_slow_work_timer;
-		add_timer(&dwork->timer);
-
-		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	}
-
-	return 0;
-
-cancelled:
-	ret = -ECANCELED;
-cant_get_ref:
-	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	return ret;
-}
-EXPORT_SYMBOL(delayed_slow_work_enqueue);
-
-/*
- * Schedule a cull of the thread pool at some time in the near future
- */
-static void slow_work_schedule_cull(void)
-{
-	mod_timer(&slow_work_cull_timer,
-		  round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
-}
-
-/*
- * Worker thread culling algorithm
- */
-static bool slow_work_cull_thread(void)
-{
-	unsigned long flags;
-	bool do_cull = false;
-
-	spin_lock_irqsave(&slow_work_queue_lock, flags);
-
-	if (slow_work_cull) {
-		slow_work_cull = false;
-
-		if (list_empty(&slow_work_queue) &&
-		    list_empty(&vslow_work_queue) &&
-		    atomic_read(&slow_work_thread_count) >
-		    slow_work_min_threads) {
-			slow_work_schedule_cull();
-			do_cull = true;
-		}
-	}
-
-	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	return do_cull;
-}
-
-/*
- * Determine if there is slow work available for dispatch
- */
-static inline bool slow_work_available(int vsmax)
-{
-	return !list_empty(&slow_work_queue) ||
-		(!list_empty(&vslow_work_queue) &&
-		 atomic_read(&vslow_work_executing_count) < vsmax);
-}
-
-/*
- * Worker thread dispatcher
- */
-static int slow_work_thread(void *_data)
-{
-	int vsmax, id;
-
-	DEFINE_WAIT(wait);
-
-	set_freezable();
-	set_user_nice(current, -5);
-
-	/* allocate ourselves an ID */
-	spin_lock_irq(&slow_work_queue_lock);
-	id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
-	BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
-	__set_bit(id, slow_work_ids);
-	slow_work_set_thread_pid(id, current->pid);
-	spin_unlock_irq(&slow_work_queue_lock);
-
-	sprintf(current->comm, "kslowd%03u", id);
-
-	for (;;) {
-		vsmax = vslow_work_proportion;
-		vsmax *= atomic_read(&slow_work_thread_count);
-		vsmax /= 100;
-
-		prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
-					  TASK_INTERRUPTIBLE);
-		if (!freezing(current) &&
-		    !slow_work_threads_should_exit &&
-		    !slow_work_available(vsmax) &&
-		    !slow_work_cull)
-			schedule();
-		finish_wait(&slow_work_thread_wq, &wait);
-
-		try_to_freeze();
-
-		vsmax = vslow_work_proportion;
-		vsmax *= atomic_read(&slow_work_thread_count);
-		vsmax /= 100;
-
-		if (slow_work_available(vsmax) && slow_work_execute(id)) {
-			cond_resched();
-			if (list_empty(&slow_work_queue) &&
-			    list_empty(&vslow_work_queue) &&
-			    atomic_read(&slow_work_thread_count) >
-			    slow_work_min_threads)
-				slow_work_schedule_cull();
-			continue;
-		}
-
-		if (slow_work_threads_should_exit)
-			break;
-
-		if (slow_work_cull && slow_work_cull_thread())
-			break;
-	}
-
-	spin_lock_irq(&slow_work_queue_lock);
-	slow_work_set_thread_pid(id, 0);
-	__clear_bit(id, slow_work_ids);
-	spin_unlock_irq(&slow_work_queue_lock);
-
-	if (atomic_dec_and_test(&slow_work_thread_count))
-		complete_and_exit(&slow_work_last_thread_exited, 0);
-	return 0;
-}
-
-/*
- * Handle thread cull timer expiration
- */
-static void slow_work_cull_timeout(unsigned long data)
-{
-	slow_work_cull = true;
-	wake_up(&slow_work_thread_wq);
-}
-
-/*
- * Start a new slow work thread
- */
-static void slow_work_new_thread_execute(struct slow_work *work)
-{
-	struct task_struct *p;
-
-	if (slow_work_threads_should_exit)
-		return;
-
-	if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
-		return;
-
-	if (!mutex_trylock(&slow_work_user_lock))
-		return;
-
-	slow_work_may_not_start_new_thread = true;
-	atomic_inc(&slow_work_thread_count);
-	p = kthread_run(slow_work_thread, NULL, "kslowd");
-	if (IS_ERR(p)) {
-		printk(KERN_DEBUG "Slow work thread pool: OOM\n");
-		if (atomic_dec_and_test(&slow_work_thread_count))
-			BUG(); /* we're running on a slow work thread... */
-		mod_timer(&slow_work_oom_timer,
-			  round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
-	} else {
-		/* ratelimit the starting of new threads */
-		mod_timer(&slow_work_oom_timer, jiffies + 1);
-	}
-
-	mutex_unlock(&slow_work_user_lock);
-}
-
-static const struct slow_work_ops slow_work_new_thread_ops = {
-	.owner		= THIS_MODULE,
-	.execute	= slow_work_new_thread_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= slow_work_new_thread_desc,
-#endif
-};
-
-/*
- * post-OOM new thread start suppression expiration
- */
-static void slow_work_oom_timeout(unsigned long data)
-{
-	slow_work_may_not_start_new_thread = false;
-}
-
-#ifdef CONFIG_SYSCTL
-/*
- * Handle adjustment of the minimum number of threads
- */
-static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
-					void __user *buffer,
-					size_t *lenp, loff_t *ppos)
-{
-	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	int n;
-
-	if (ret == 0) {
-		mutex_lock(&slow_work_user_lock);
-		if (slow_work_user_count > 0) {
-			/* see if we need to start or stop threads */
-			n = atomic_read(&slow_work_thread_count) -
-				slow_work_min_threads;
-
-			if (n < 0 && !slow_work_may_not_start_new_thread)
-				slow_work_enqueue(&slow_work_new_thread);
-			else if (n > 0)
-				slow_work_schedule_cull();
-		}
-		mutex_unlock(&slow_work_user_lock);
-	}
-
-	return ret;
-}
-
-/*
- * Handle adjustment of the maximum number of threads
- */
-static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
-					void __user *buffer,
-					size_t *lenp, loff_t *ppos)
-{
-	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	int n;
-
-	if (ret == 0) {
-		mutex_lock(&slow_work_user_lock);
-		if (slow_work_user_count > 0) {
-			/* see if we need to stop threads */
-			n = slow_work_max_threads -
-				atomic_read(&slow_work_thread_count);
-
-			if (n < 0)
-				slow_work_schedule_cull();
-		}
-		mutex_unlock(&slow_work_user_lock);
-	}
-
-	return ret;
-}
-#endif /* CONFIG_SYSCTL */
-
-/**
- * slow_work_register_user - Register a user of the facility
- * @module: The module about to make use of the facility
- *
- * Register a user of the facility, starting up the initial threads if there
- * aren't any other users at this point.  This will return 0 if successful, or
- * an error if not.
- */
-int slow_work_register_user(struct module *module)
-{
-	struct task_struct *p;
-	int loop;
-
-	mutex_lock(&slow_work_user_lock);
-
-	if (slow_work_user_count == 0) {
-		printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
-		init_completion(&slow_work_last_thread_exited);
-
-		slow_work_threads_should_exit = false;
-		slow_work_init(&slow_work_new_thread,
-			       &slow_work_new_thread_ops);
-		slow_work_may_not_start_new_thread = false;
-		slow_work_cull = false;
-
-		/* start the minimum number of threads */
-		for (loop = 0; loop < slow_work_min_threads; loop++) {
-			atomic_inc(&slow_work_thread_count);
-			p = kthread_run(slow_work_thread, NULL, "kslowd");
-			if (IS_ERR(p))
-				goto error;
-		}
-		printk(KERN_NOTICE "Slow work thread pool: Ready\n");
-	}
-
-	slow_work_user_count++;
-	mutex_unlock(&slow_work_user_lock);
-	return 0;
-
-error:
-	if (atomic_dec_and_test(&slow_work_thread_count))
-		complete(&slow_work_last_thread_exited);
-	if (loop > 0) {
-		printk(KERN_ERR "Slow work thread pool:"
-		       " Aborting startup on ENOMEM\n");
-		slow_work_threads_should_exit = true;
-		wake_up_all(&slow_work_thread_wq);
-		wait_for_completion(&slow_work_last_thread_exited);
-		printk(KERN_ERR "Slow work thread pool: Aborted\n");
-	}
-	mutex_unlock(&slow_work_user_lock);
-	return PTR_ERR(p);
-}
-EXPORT_SYMBOL(slow_work_register_user);
-
-/*
- * wait for all outstanding items from the calling module to complete
- * - note that more items may be queued whilst we're waiting
- */
-static void slow_work_wait_for_items(struct module *module)
-{
-#ifdef CONFIG_MODULES
-	DECLARE_WAITQUEUE(myself, current);
-	struct slow_work *work;
-	int loop;
-
-	mutex_lock(&slow_work_unreg_sync_lock);
-	add_wait_queue(&slow_work_unreg_wq, &myself);
-
-	for (;;) {
-		spin_lock_irq(&slow_work_queue_lock);
-
-		/* first of all, we wait for the last queued item in each list
-		 * to be processed */
-		list_for_each_entry_reverse(work, &vslow_work_queue, link) {
-			if (work->owner == module) {
-				set_current_state(TASK_UNINTERRUPTIBLE);
-				slow_work_unreg_work_item = work;
-				goto do_wait;
-			}
-		}
-		list_for_each_entry_reverse(work, &slow_work_queue, link) {
-			if (work->owner == module) {
-				set_current_state(TASK_UNINTERRUPTIBLE);
-				slow_work_unreg_work_item = work;
-				goto do_wait;
-			}
-		}
-
-		/* then we wait for the items being processed to finish */
-		slow_work_unreg_module = module;
-		smp_mb();
-		for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
-			if (slow_work_thread_processing[loop] == module)
-				goto do_wait;
-		}
-		spin_unlock_irq(&slow_work_queue_lock);
-		break; /* okay, we're done */
-
-	do_wait:
-		spin_unlock_irq(&slow_work_queue_lock);
-		schedule();
-		slow_work_unreg_work_item = NULL;
-		slow_work_unreg_module = NULL;
-	}
-
-	remove_wait_queue(&slow_work_unreg_wq, &myself);
-	mutex_unlock(&slow_work_unreg_sync_lock);
-#endif /* CONFIG_MODULES */
-}
-
-/**
- * slow_work_unregister_user - Unregister a user of the facility
- * @module: The module whose items should be cleared
- *
- * Unregister a user of the facility, killing all the threads if this was the
- * last one.
- *
- * This waits for all the work items belonging to the nominated module to go
- * away before proceeding.
- */
-void slow_work_unregister_user(struct module *module)
-{
-	/* first of all, wait for all outstanding items from the calling module
-	 * to complete */
-	if (module)
-		slow_work_wait_for_items(module);
-
-	/* then we can actually go about shutting down the facility if need
-	 * be */
-	mutex_lock(&slow_work_user_lock);
-
-	BUG_ON(slow_work_user_count <= 0);
-
-	slow_work_user_count--;
-	if (slow_work_user_count == 0) {
-		printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
-		slow_work_threads_should_exit = true;
-		del_timer_sync(&slow_work_cull_timer);
-		del_timer_sync(&slow_work_oom_timer);
-		wake_up_all(&slow_work_thread_wq);
-		wait_for_completion(&slow_work_last_thread_exited);
-		printk(KERN_NOTICE "Slow work thread pool:"
-		       " Shut down complete\n");
-	}
-
-	mutex_unlock(&slow_work_user_lock);
-}
-EXPORT_SYMBOL(slow_work_unregister_user);
-
-/*
- * Initialise the slow work facility
- */
-static int __init init_slow_work(void)
-{
-	unsigned nr_cpus = num_possible_cpus();
-
-	if (slow_work_max_threads < nr_cpus)
-		slow_work_max_threads = nr_cpus;
-#ifdef CONFIG_SYSCTL
-	if (slow_work_max_max_threads < nr_cpus * 2)
-		slow_work_max_max_threads = nr_cpus * 2;
-#endif
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	{
-		struct dentry *dbdir;
-
-		dbdir = debugfs_create_dir("slow_work", NULL);
-		if (dbdir && !IS_ERR(dbdir))
-			debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
-					    NULL, &slow_work_runqueue_fops);
-	}
-#endif
-	return 0;
-}
-
-subsys_initcall(init_slow_work);
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Slow work private definitions
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
-					 * things to do */
-#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
-					 * OOM */
-
-#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
-
-/*
- * slow-work.c
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-extern struct slow_work *slow_work_execs[];
-extern pid_t slow_work_pids[];
-extern rwlock_t slow_work_execs_lock;
-#endif
-
-extern struct list_head slow_work_queue;
-extern struct list_head vslow_work_queue;
-extern spinlock_t slow_work_queue_lock;
-
-/*
- * slow-work-debugfs.c
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-extern const struct file_operations slow_work_runqueue_fops;
-
-extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
-#endif
-
-/*
- * Helper functions
- */
-static inline void slow_work_set_thread_pid(int id, pid_t pid)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	slow_work_pids[id] = pid;
-#endif
-}
-
-static inline void slow_work_mark_time(struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	work->mark = CURRENT_TIME;
-#endif
-}
-
-static inline void slow_work_begin_exec(int id, struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	slow_work_execs[id] = work;
-#endif
-}
-
-static inline void slow_work_end_exec(int id, struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	write_lock(&slow_work_execs_lock);
-	slow_work_execs[id] = NULL;
-	write_unlock(&slow_work_execs_lock);
-#endif
-}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d24f761f4876..5821365b9605 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,7 +50,6 @@
 #include <linux/acpi.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
-#include <linux/slow-work.h>
 #include <linux/perf_event.h>
 #include <linux/kprobes.h>
 #include <linux/pipe_fs_i.h>
@@ -906,13 +905,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_SLOW_WORK
-	{
-		.procname	= "slow-work",
-		.mode		= 0555,
-		.child		= slow_work_sysctls,
-	},
-#endif
 #ifdef CONFIG_PERF_EVENTS
 	{
 		.procname	= "perf_event_paranoid",
-- 
cgit v1.2.3-59-g8ed1b


From 18d0cdfd1a4cc9028c0ef80f94538b31541f8fe5 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 18 Jul 2010 12:44:01 +0200
Subject: firewire: normalize status values in packet callbacks

core-transaction.c transmit_complete_callback() and close_transaction()
expect packet callback status to be an ACK or RCODE, and ACKs get
translated to RCODEs for transaction callbacks.

An old comment on the packet callback API (been there from the initial
submission of the stack) and the dummy_driver implementation of
send_request/send_response deviated from this as they also included
-ERRNO in the range of status values.

Let's narrow status values down to ACK and RCODE to prevent surprises.
RCODE_CANCELLED is chosen as the dummy_driver's RCODE as its meaning of
"transaction timed out" comes closest to what happens when a transaction
coincides with card removal.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-card.c        | 4 ++--
 drivers/firewire/core-transaction.c | 5 ++++-
 include/linux/firewire.h            | 8 ++++----
 3 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 2bb5c036e806..0c312c4bb4bd 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -570,12 +570,12 @@ static int dummy_set_config_rom(struct fw_card *card,
 
 static void dummy_send_request(struct fw_card *card, struct fw_packet *packet)
 {
-	packet->callback(packet, card, -ENODEV);
+	packet->callback(packet, card, RCODE_CANCELLED);
 }
 
 static void dummy_send_response(struct fw_card *card, struct fw_packet *packet)
 {
-	packet->callback(packet, card, -ENODEV);
+	packet->callback(packet, card, RCODE_CANCELLED);
 }
 
 static int dummy_cancel_packet(struct fw_card *card, struct fw_packet *packet)
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 5f5a7852f7ac..e2e4dc624fb6 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -310,7 +310,10 @@ static int allocate_tlabel(struct fw_card *card)
  * After the transaction is completed successfully or unsuccessfully, the
  * @callback will be called.  Among its parameters is the response code which
  * is either one of the rcodes per IEEE 1394 or, in case of internal errors,
- * the firewire-core specific %RCODE_SEND_ERROR.
+ * the firewire-core specific %RCODE_SEND_ERROR.  The other firewire-core
+ * specific rcodes (%RCODE_CANCELLED, %RCODE_BUSY, %RCODE_GENERATION,
+ * %RCODE_NO_ACK) denote transaction timeout, busy responder, stale request
+ * generation, or missing ACK respectively.
  *
  * Note some timing corner cases:  fw_send_request() may complete much earlier
  * than when the request packet actually hits the wire.  On the other hand,
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index adc5b55e6e5f..0c38b8e97722 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -285,10 +285,10 @@ struct fw_packet {
 	u32 timestamp;
 
 	/*
-	 * This callback is called when the packet transmission has
-	 * completed; for successful transmission, the status code is
-	 * the ack received from the destination, otherwise it's a
-	 * negative errno: ENOMEM, ESTALE, ETIMEDOUT, ENODEV, EIO.
+	 * This callback is called when the packet transmission has completed.
+	 * For successful transmission, the status code is the ack received
+	 * from the destination.  Otherwise it is one of the juju-specific
+	 * rcodes:  RCODE_SEND_ERROR, _CANCELLED, _BUSY, _GENERATION, _NO_ACK.
 	 * The callback can be called from tasklet context and thus
 	 * must never block.
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From d505e6e87127d4dbdaa5d91561eed810c180ca23 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sat, 17 Jul 2010 21:36:02 +0200
Subject: firewire: cdev: some clarifications to the API documentation

Response events:
  - are generated on more occasions than their documentation claimed.

CSR allocation:
  - An already occupied CSR can be determined from errno==EBUSY.

Bus resets:
  - Note that FW_CDEV_IOC_INITIATE_BUS_RESET is nonblocking and that the
    client is not required to observe a grace period since kernels
    2.6.36+ will enforce it now (commit 02d37bed).

  - The possible values of fw_cdev_initiate_bus_reset.type are listed in
    the kerneldoc comment already.

  - Clarify that an application that uses FW_CDEV_IOC_ADD_DESCRIPTOR and
    FW_CDEV_IOC_REMOVE_DESCRIPTOR does not have to issue a bus reset.

Isochronous I/O contexts:
  - At most one can be created per open file descriptor.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 include/linux/firewire-cdev.h | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index d31022b05bd9..fde9568151d5 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -84,8 +84,9 @@ struct fw_cdev_event_bus_reset {
 
 /**
  * struct fw_cdev_event_response - Sent when a response packet was received
- * @closure:	See &fw_cdev_event_common;
- *		set by %FW_CDEV_IOC_SEND_REQUEST ioctl
+ * @closure:	See &fw_cdev_event_common; set by %FW_CDEV_IOC_SEND_REQUEST
+ *		or %FW_CDEV_IOC_SEND_BROADCAST_REQUEST
+ *		or %FW_CDEV_IOC_SEND_STREAM_PACKET ioctl
  * @type:	See &fw_cdev_event_common; always %FW_CDEV_EVENT_RESPONSE
  * @rcode:	Response code returned by the remote node
  * @length:	Data length, i.e. the response's payload size in bytes
@@ -95,6 +96,11 @@ struct fw_cdev_event_bus_reset {
  * sent by %FW_CDEV_IOC_SEND_REQUEST ioctl.  The payload data for responses
  * carrying data (read and lock responses) follows immediately and can be
  * accessed through the @data field.
+ *
+ * The event is also generated after conclusions of transactions that do not
+ * involve response packets.  This includes unified write transactions,
+ * broadcast write transactions, and transmission of asynchronous stream
+ * packets.  @rcode indicates success or failure of such transmissions.
  */
 struct fw_cdev_event_response {
 	__u64 closure;
@@ -447,7 +453,9 @@ struct fw_cdev_send_response {
  * range to be used for later deallocation of the range.
  *
  * The address range is allocated on all local nodes.  The address allocation
- * is exclusive except for the FCP command and response registers.
+ * is exclusive except for the FCP command and response registers.  If an
+ * exclusive address region is already in use, the ioctl fails with errno set
+ * to %EBUSY.
  */
 struct fw_cdev_allocate {
 	__u64 offset;
@@ -475,9 +483,14 @@ struct fw_cdev_deallocate {
  * Initiate a bus reset for the bus this device is on.  The bus reset can be
  * either the original (long) bus reset or the arbitrated (short) bus reset
  * introduced in 1394a-2000.
+ *
+ * The ioctl returns immediately.  A subsequent &fw_cdev_event_bus_reset
+ * indicates when the reset actually happened.  Since ABI v4, this may be
+ * considerably later than the ioctl because the kernel ensures a grace period
+ * between subsequent bus resets as per IEEE 1394 bus management specification.
  */
 struct fw_cdev_initiate_bus_reset {
-	__u32 type;	/* FW_CDEV_SHORT_RESET or FW_CDEV_LONG_RESET */
+	__u32 type;
 };
 
 /**
@@ -501,9 +514,10 @@ struct fw_cdev_initiate_bus_reset {
  *
  * @immediate, @key, and @data array elements are CPU-endian quadlets.
  *
- * If successful, the kernel adds the descriptor and writes back a handle to the
- * kernel-side object to be used for later removal of the descriptor block and
- * immediate key.
+ * If successful, the kernel adds the descriptor and writes back a @handle to
+ * the kernel-side object to be used for later removal of the descriptor block
+ * and immediate key.  The kernel will also generate a bus reset to signal the
+ * change of the configuration ROM to other nodes.
  *
  * This ioctl affects the configuration ROMs of all local nodes.
  * The ioctl only succeeds on device files which represent a local node.
@@ -522,7 +536,8 @@ struct fw_cdev_add_descriptor {
  *		descriptor was added
  *
  * Remove a descriptor block and accompanying immediate key from the local
- * nodes' configuration ROMs.
+ * nodes' configuration ROMs.  The kernel will also generate a bus reset to
+ * signal the change of the configuration ROM to other nodes.
  */
 struct fw_cdev_remove_descriptor {
 	__u32 handle;
@@ -554,6 +569,8 @@ struct fw_cdev_remove_descriptor {
  *
  * Note that the effect of a @header_size > 4 depends on
  * &fw_cdev_get_info.version, as documented at &fw_cdev_event_iso_interrupt.
+ *
+ * No more than one iso context can be created per fd.
  */
 struct fw_cdev_create_iso_context {
 	__u32 type;
-- 
cgit v1.2.3-59-g8ed1b


From 850bb6f23b93c04ce1e4509a87fa607dc17d97c1 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Fri, 16 Jul 2010 22:25:14 +0200
Subject: firewire: cdev: add PHY packet transmission

Add an FW_CDEV_IOC_SEND_PHY_PACKET ioctl() for /dev/fw* which can be
used to implement bus management related functionality in userspace.

This is also half of the functionality (the transmit part) that is
needed to support a userspace implementation of a VersaPHY transaction
layer.

Safety considerations:

  - PHY packets are generally broadcasts and may have interesting
    effects on PHYs and the bus, e.g. make asynchronous arbitration
    impossible due to too low gap count.  Hence some kind of elevated
    privileges should be required of a process to be able to send
    PHY packets.  This implementation assumes that a process that is
    allowed to open the /dev/fw* of a local node does have this
    privilege.

    There was an inconclusive discussion about introducing POSIX
    capabilities as a means to check for user privileges for these
    kinds of operations.

  - The kernel does not check integrity of the supplied packet data.
    That would be far too much code, considering the many kinds of
    PHY packets.  A process which got the privilege to send these
    packets is trusted to do it correctly.

Just like with the other "send packet" ioctls, a non-blocking API is
chosen; i.e. the ioctl may return even before AT DMA started.  After
transmission, an event for poll()/read() is enqueued.  Most users are
going to need a blocking API, but a blocking userspace wrapper is easy
to implement, and the second of the two existing libraw1394 calls
raw1394_phy_packet_write() and raw1394_start_phy_packet_write() can be
better supported that way.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-cdev.c  | 64 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/firewire-cdev.h | 44 ++++++++++++++++++++++++++++-
 2 files changed, 107 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index acf4fa1f3f8c..f95719926487 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -194,6 +194,13 @@ struct iso_resource_event {
 	struct fw_cdev_event_iso_resource iso_resource;
 };
 
+struct outbound_phy_packet_event {
+	struct event event;
+	struct client *client;
+	struct fw_packet p;
+	struct fw_cdev_event_phy_packet phy_packet;
+};
+
 static inline void __user *u64_to_uptr(__u64 value)
 {
 	return (void __user *)(unsigned long)value;
@@ -396,6 +403,7 @@ union ioctl_arg {
 	struct fw_cdev_allocate_iso_resource	allocate_iso_resource;
 	struct fw_cdev_send_stream_packet	send_stream_packet;
 	struct fw_cdev_get_cycle_timer2		get_cycle_timer2;
+	struct fw_cdev_send_phy_packet		send_phy_packet;
 };
 
 static int ioctl_get_info(struct client *client, union ioctl_arg *arg)
@@ -1384,6 +1392,61 @@ static int ioctl_send_stream_packet(struct client *client, union ioctl_arg *arg)
 	return init_request(client, &request, dest, a->speed);
 }
 
+static void outbound_phy_packet_callback(struct fw_packet *packet,
+					 struct fw_card *card, int status)
+{
+	struct outbound_phy_packet_event *e =
+		container_of(packet, struct outbound_phy_packet_event, p);
+
+	switch (status) {
+	/* expected: */
+	case ACK_COMPLETE:	e->phy_packet.rcode = RCODE_COMPLETE;	break;
+	/* should never happen with PHY packets: */
+	case ACK_PENDING:	e->phy_packet.rcode = RCODE_COMPLETE;	break;
+	case ACK_BUSY_X:
+	case ACK_BUSY_A:
+	case ACK_BUSY_B:	e->phy_packet.rcode = RCODE_BUSY;	break;
+	case ACK_DATA_ERROR:	e->phy_packet.rcode = RCODE_DATA_ERROR;	break;
+	case ACK_TYPE_ERROR:	e->phy_packet.rcode = RCODE_TYPE_ERROR;	break;
+	/* stale generation; cancelled; on certain controllers: no ack */
+	default:		e->phy_packet.rcode = status;		break;
+	}
+
+	queue_event(e->client, &e->event,
+		    &e->phy_packet, sizeof(e->phy_packet), NULL, 0);
+	client_put(e->client);
+}
+
+static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg)
+{
+	struct fw_cdev_send_phy_packet *a = &arg->send_phy_packet;
+	struct fw_card *card = client->device->card;
+	struct outbound_phy_packet_event *e;
+
+	/* Access policy: Allow this ioctl only on local nodes' device files. */
+	if (!client->device->is_local)
+		return -ENOSYS;
+
+	e = kzalloc(sizeof(*e), GFP_KERNEL);
+	if (e == NULL)
+		return -ENOMEM;
+
+	client_get(client);
+	e->client		= client;
+	e->p.speed		= SCODE_100;
+	e->p.generation		= a->generation;
+	e->p.header[0]		= a->data[0];
+	e->p.header[1]		= a->data[1];
+	e->p.header_length	= 8;
+	e->p.callback		= outbound_phy_packet_callback;
+	e->phy_packet.closure	= a->closure;
+	e->phy_packet.type	= FW_CDEV_EVENT_PHY_PACKET_SENT;
+
+	card->driver->send_request(card, &e->p);
+
+	return 0;
+}
+
 static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = {
 	[0x00] = ioctl_get_info,
 	[0x01] = ioctl_send_request,
@@ -1406,6 +1469,7 @@ static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = {
 	[0x12] = ioctl_send_broadcast_request,
 	[0x13] = ioctl_send_stream_packet,
 	[0x14] = ioctl_get_cycle_timer2,
+	[0x15] = ioctl_send_phy_packet,
 };
 
 static int dispatch_ioctl(struct client *client,
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index fde9568151d5..5bc051b9a013 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -34,6 +34,7 @@
 
 /* available since kernel version 2.6.36 */
 #define FW_CDEV_EVENT_REQUEST2			0x06
+#define FW_CDEV_EVENT_PHY_PACKET_SENT		0x07
 
 /**
  * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types
@@ -283,6 +284,19 @@ struct fw_cdev_event_iso_resource {
 	__s32 bandwidth;
 };
 
+/**
+ * struct fw_cdev_event_phy_packet - A PHY packet was transmitted
+ * @closure:	See &fw_cdev_event_common;
+ *		set by %FW_CDEV_IOC_SEND_PHY_PACKET ioctl
+ * @type:	%FW_CDEV_EVENT_PHY_PACKET_SENT
+ * @rcode:	%RCODE_..., indicates success or failure of transmission
+ */
+struct fw_cdev_event_phy_packet {
+	__u64 closure;
+	__u32 type;
+	__u32 rcode;
+};
+
 /**
  * union fw_cdev_event - Convenience union of fw_cdev_event_ types
  * @common:        Valid for all types
@@ -294,6 +308,7 @@ struct fw_cdev_event_iso_resource {
  * @iso_resource:  Valid if @common.type ==
  *				%FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or
  *				%FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED
+ * @phy_packet:    Valid if @common.type == %FW_CDEV_EVENT_PHY_PACKET_SENT
  *
  * Convenience union for userspace use.  Events could be read(2) into an
  * appropriately aligned char buffer and then cast to this union for further
@@ -311,6 +326,7 @@ union fw_cdev_event {
 	struct fw_cdev_event_request2		request2;     /* added in 2.6.36 */
 	struct fw_cdev_event_iso_interrupt	iso_interrupt;
 	struct fw_cdev_event_iso_resource	iso_resource; /* added in 2.6.30 */
+	struct fw_cdev_event_phy_packet		phy_packet;   /* added in 2.6.36 */
 };
 
 /* available since kernel version 2.6.22 */
@@ -342,6 +358,9 @@ union fw_cdev_event {
 /* available since kernel version 2.6.34 */
 #define FW_CDEV_IOC_GET_CYCLE_TIMER2   _IOWR('#', 0x14, struct fw_cdev_get_cycle_timer2)
 
+/* available since kernel version 2.6.36 */
+#define FW_CDEV_IOC_SEND_PHY_PACKET    _IOWR('#', 0x15, struct fw_cdev_send_phy_packet)
+
 /*
  * ABI version history
  *  1  (2.6.22)  - initial version
@@ -357,8 +376,9 @@ union fw_cdev_event {
  *               - shared use and auto-response for FCP registers
  *  3  (2.6.34)  - made &fw_cdev_get_cycle_timer reliable
  *               - added %FW_CDEV_IOC_GET_CYCLE_TIMER2
- *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2
+ *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_SENT
  *               - implemented &fw_cdev_event_bus_reset.bm_node_id
+ *               - added %FW_CDEV_IOC_SEND_PHY_PACKET
  */
 #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */
 
@@ -808,4 +828,26 @@ struct fw_cdev_send_stream_packet {
 	__u32 speed;
 };
 
+/**
+ * struct fw_cdev_send_phy_packet - send a PHY packet
+ * @closure:	Passed back to userspace in the PHY-packet-sent event
+ * @data:	First and second quadlet of the PHY packet
+ * @generation:	The bus generation where packet is valid
+ *
+ * The %FW_CDEV_IOC_SEND_PHY_PACKET ioctl sends a PHY packet to all nodes
+ * on the same card as this device.  After transmission, an
+ * %FW_CDEV_EVENT_PHY_PACKET_SENT event is generated.
+ *
+ * The payload @data[] shall be specified in host byte order.  Usually,
+ * @data[1] needs to be the bitwise inverse of @data[0].  VersaPHY packets
+ * are an exception to this rule.
+ *
+ * The ioctl is only permitted on device files which represent a local node.
+ */
+struct fw_cdev_send_phy_packet {
+	__u64 closure;
+	__u32 data[2];
+	__u32 generation;
+};
+
 #endif /* _LINUX_FIREWIRE_CDEV_H */
-- 
cgit v1.2.3-59-g8ed1b


From bf54e1462b9192fdef7ea9e2bc44fdc16a4b87bc Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Fri, 16 Jul 2010 22:25:51 +0200
Subject: firewire: cdev: add PHY packet reception

Add an FW_CDEV_IOC_RECEIVE_PHY_PACKETS ioctl() and
FW_CDEV_EVENT_PHY_PACKET_RECEIVED poll()/read() event for /dev/fw*.
This can be used to get information from remote PHYs by remote access
PHY packets.

This is also the 2nd half of the functionality (the receive part) to
support a userspace implementation of a VersaPHY transaction layer.

Safety considerations:

  - PHY packets are generally broadcasts, hence some kind of elevated
    privileges should be required of a process to be able to listen in
    on PHY packets.  This implementation assumes that a process that is
    allowed to open the /dev/fw* of a local node does have this
    privilege.

    There was an inconclusive discussion about introducing POSIX
    capabilities as a means to check for user privileges for these
    kinds of operations.

Other limitations:

  - PHY packet reception may be switched on by ioctl() but cannot be
    switched off again.  It would be trivial to provide an off switch,
    but this is not worth the code.  The client should simply close()
    the fd then, or just ignore further events.

  - For sake of simplicity of API and kernel-side implementation, no
    filter per packet content is provided.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-card.c        |  1 +
 drivers/firewire/core-cdev.c        | 73 ++++++++++++++++++++++++++++++++++---
 drivers/firewire/core-transaction.c |  5 +++
 drivers/firewire/core.h             |  2 +
 drivers/firewire/ohci.c             |  3 +-
 include/linux/firewire-cdev.h       | 39 ++++++++++++++++----
 include/linux/firewire.h            |  3 +-
 7 files changed, 111 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 0c312c4bb4bd..6d1cfae6aad4 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -500,6 +500,7 @@ void fw_card_initialize(struct fw_card *card,
 	kref_init(&card->kref);
 	init_completion(&card->done);
 	INIT_LIST_HEAD(&card->transaction_list);
+	INIT_LIST_HEAD(&card->phy_receiver_list);
 	spin_lock_init(&card->lock);
 
 	card->local_node = NULL;
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index f95719926487..0425dd5dfcd3 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -69,6 +69,9 @@ struct client {
 	struct fw_iso_buffer buffer;
 	unsigned long vm_start;
 
+	struct list_head phy_receiver_link;
+	u64 phy_receiver_closure;
+
 	struct list_head link;
 	struct kref kref;
 };
@@ -201,6 +204,11 @@ struct outbound_phy_packet_event {
 	struct fw_cdev_event_phy_packet phy_packet;
 };
 
+struct inbound_phy_packet_event {
+	struct event event;
+	struct fw_cdev_event_phy_packet phy_packet;
+};
+
 static inline void __user *u64_to_uptr(__u64 value)
 {
 	return (void __user *)(unsigned long)value;
@@ -236,6 +244,7 @@ static int fw_device_op_open(struct inode *inode, struct file *file)
 	idr_init(&client->resource_idr);
 	INIT_LIST_HEAD(&client->event_list);
 	init_waitqueue_head(&client->wait);
+	INIT_LIST_HEAD(&client->phy_receiver_link);
 	kref_init(&client->kref);
 
 	file->private_data = client;
@@ -357,7 +366,7 @@ static void queue_bus_reset_event(struct client *client)
 
 	e = kzalloc(sizeof(*e), GFP_KERNEL);
 	if (e == NULL) {
-		fw_notify("Out of memory when allocating bus reset event\n");
+		fw_notify("Out of memory when allocating event\n");
 		return;
 	}
 
@@ -404,6 +413,7 @@ union ioctl_arg {
 	struct fw_cdev_send_stream_packet	send_stream_packet;
 	struct fw_cdev_get_cycle_timer2		get_cycle_timer2;
 	struct fw_cdev_send_phy_packet		send_phy_packet;
+	struct fw_cdev_receive_phy_packets	receive_phy_packets;
 };
 
 static int ioctl_get_info(struct client *client, union ioctl_arg *arg)
@@ -671,9 +681,10 @@ static void handle_request(struct fw_card *card, struct fw_request *request,
 
 	r = kmalloc(sizeof(*r), GFP_ATOMIC);
 	e = kmalloc(sizeof(*e), GFP_ATOMIC);
-	if (r == NULL || e == NULL)
+	if (r == NULL || e == NULL) {
+		fw_notify("Out of memory when allocating event\n");
 		goto failed;
-
+	}
 	r->card    = card;
 	r->request = request;
 	r->data    = payload;
@@ -902,9 +913,10 @@ static void iso_callback(struct fw_iso_context *context, u32 cycle,
 	struct iso_interrupt_event *e;
 
 	e = kmalloc(sizeof(*e) + header_length, GFP_ATOMIC);
-	if (e == NULL)
+	if (e == NULL) {
+		fw_notify("Out of memory when allocating event\n");
 		return;
-
+	}
 	e->interrupt.type      = FW_CDEV_EVENT_ISO_INTERRUPT;
 	e->interrupt.closure   = client->iso_closure;
 	e->interrupt.cycle     = cycle;
@@ -1447,6 +1459,52 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg)
 	return 0;
 }
 
+static int ioctl_receive_phy_packets(struct client *client, union ioctl_arg *arg)
+{
+	struct fw_cdev_receive_phy_packets *a = &arg->receive_phy_packets;
+	struct fw_card *card = client->device->card;
+
+	/* Access policy: Allow this ioctl only on local nodes' device files. */
+	if (!client->device->is_local)
+		return -ENOSYS;
+
+	spin_lock_irq(&card->lock);
+
+	list_move_tail(&client->phy_receiver_link, &card->phy_receiver_list);
+	client->phy_receiver_closure = a->closure;
+
+	spin_unlock_irq(&card->lock);
+
+	return 0;
+}
+
+void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p)
+{
+	struct client *client;
+	struct inbound_phy_packet_event *e;
+	unsigned long flags;
+
+	spin_lock_irqsave(&card->lock, flags);
+
+	list_for_each_entry(client, &card->phy_receiver_list, phy_receiver_link) {
+		e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC);
+		if (e == NULL) {
+			fw_notify("Out of memory when allocating event\n");
+			break;
+		}
+		e->phy_packet.closure	= client->phy_receiver_closure;
+		e->phy_packet.type	= FW_CDEV_EVENT_PHY_PACKET_RECEIVED;
+		e->phy_packet.rcode	= RCODE_COMPLETE;
+		e->phy_packet.length	= 8;
+		e->phy_packet.data[0]	= p->header[1];
+		e->phy_packet.data[1]	= p->header[2];
+		queue_event(client, &e->event,
+			    &e->phy_packet, sizeof(e->phy_packet) + 8, NULL, 0);
+	}
+
+	spin_unlock_irqrestore(&card->lock, flags);
+}
+
 static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = {
 	[0x00] = ioctl_get_info,
 	[0x01] = ioctl_send_request,
@@ -1470,6 +1528,7 @@ static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = {
 	[0x13] = ioctl_send_stream_packet,
 	[0x14] = ioctl_get_cycle_timer2,
 	[0x15] = ioctl_send_phy_packet,
+	[0x16] = ioctl_receive_phy_packets,
 };
 
 static int dispatch_ioctl(struct client *client,
@@ -1577,6 +1636,10 @@ static int fw_device_op_release(struct inode *inode, struct file *file)
 	struct client *client = file->private_data;
 	struct event *event, *next_event;
 
+	spin_lock_irq(&client->device->card->lock);
+	list_del(&client->phy_receiver_link);
+	spin_unlock_irq(&client->device->card->lock);
+
 	mutex_lock(&client->device->client_list_mutex);
 	list_del(&client->link);
 	mutex_unlock(&client->device->client_list_mutex);
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index e2e4dc624fb6..6f225cacbc3d 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -883,6 +883,11 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p)
 	if (p->ack != ACK_PENDING && p->ack != ACK_COMPLETE)
 		return;
 
+	if (TCODE_IS_LINK_INTERNAL(HEADER_GET_TCODE(p->header[0]))) {
+		fw_cdev_handle_phy_packet(card, p);
+		return;
+	}
+
 	request = allocate_request(card, p);
 	if (request == NULL) {
 		/* FIXME: send statically allocated busy packet. */
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index ff6c90922001..3102b6b63438 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -128,6 +128,7 @@ extern const struct file_operations fw_device_ops;
 
 void fw_device_cdev_update(struct fw_device *device);
 void fw_device_cdev_remove(struct fw_device *device);
+void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p);
 
 
 /* -device */
@@ -214,6 +215,7 @@ static inline bool is_next_generation(int new_generation, int old_generation)
 
 #define TCODE_IS_READ_REQUEST(tcode)	(((tcode) & ~1) == 4)
 #define TCODE_IS_BLOCK_PACKET(tcode)	(((tcode) &  1) != 0)
+#define TCODE_IS_LINK_INTERNAL(tcode)	((tcode) == 0xe)
 #define TCODE_IS_REQUEST(tcode)		(((tcode) &  2) == 0)
 #define TCODE_IS_RESPONSE(tcode)	(((tcode) &  2) != 0)
 #define TCODE_HAS_REQUEST_DATA(tcode)	(((tcode) & 12) != 4)
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index bb6a92bc9e6a..08afccc66333 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -1759,10 +1759,9 @@ static int ohci_enable(struct fw_card *card,
 		  OHCI1394_HCControl_noByteSwapData);
 
 	reg_write(ohci, OHCI1394_SelfIDBuffer, ohci->self_id_bus);
-	reg_write(ohci, OHCI1394_LinkControlClear,
-		  OHCI1394_LinkControl_rcvPhyPkt);
 	reg_write(ohci, OHCI1394_LinkControlSet,
 		  OHCI1394_LinkControl_rcvSelfID |
+		  OHCI1394_LinkControl_rcvPhyPkt |
 		  OHCI1394_LinkControl_cycleTimerEnable |
 		  OHCI1394_LinkControl_cycleMaster);
 
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 5bc051b9a013..b87409160794 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -35,6 +35,7 @@
 /* available since kernel version 2.6.36 */
 #define FW_CDEV_EVENT_REQUEST2			0x06
 #define FW_CDEV_EVENT_PHY_PACKET_SENT		0x07
+#define FW_CDEV_EVENT_PHY_PACKET_RECEIVED	0x08
 
 /**
  * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types
@@ -285,16 +286,24 @@ struct fw_cdev_event_iso_resource {
 };
 
 /**
- * struct fw_cdev_event_phy_packet - A PHY packet was transmitted
- * @closure:	See &fw_cdev_event_common;
- *		set by %FW_CDEV_IOC_SEND_PHY_PACKET ioctl
- * @type:	%FW_CDEV_EVENT_PHY_PACKET_SENT
+ * struct fw_cdev_event_phy_packet - A PHY packet was transmitted or received
+ * @closure:	See &fw_cdev_event_common; set by %FW_CDEV_IOC_SEND_PHY_PACKET
+ *		or %FW_CDEV_IOC_RECEIVE_PHY_PACKETS ioctl
+ * @type:	%FW_CDEV_EVENT_PHY_PACKET_SENT or %..._RECEIVED
  * @rcode:	%RCODE_..., indicates success or failure of transmission
+ * @length:	Data length in bytes
+ * @data:	Incoming data
+ *
+ * If @type is %FW_CDEV_EVENT_PHY_PACKET_SENT, @length is 0 and @data empty.
+ * If @type is %FW_CDEV_EVENT_PHY_PACKET_RECEIVED, @length is 8 and @data
+ * consists of the two PHY packet quadlets, in host byte order.
  */
 struct fw_cdev_event_phy_packet {
 	__u64 closure;
 	__u32 type;
 	__u32 rcode;
+	__u32 length;
+	__u32 data[0];
 };
 
 /**
@@ -308,7 +317,9 @@ struct fw_cdev_event_phy_packet {
  * @iso_resource:  Valid if @common.type ==
  *				%FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or
  *				%FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED
- * @phy_packet:    Valid if @common.type == %FW_CDEV_EVENT_PHY_PACKET_SENT
+ * @phy_packet:    Valid if @common.type ==
+ *				%FW_CDEV_EVENT_PHY_PACKET_SENT or
+ *				%FW_CDEV_EVENT_PHY_PACKET_RECEIVED
  *
  * Convenience union for userspace use.  Events could be read(2) into an
  * appropriately aligned char buffer and then cast to this union for further
@@ -360,6 +371,7 @@ union fw_cdev_event {
 
 /* available since kernel version 2.6.36 */
 #define FW_CDEV_IOC_SEND_PHY_PACKET    _IOWR('#', 0x15, struct fw_cdev_send_phy_packet)
+#define FW_CDEV_IOC_RECEIVE_PHY_PACKETS _IOW('#', 0x16, struct fw_cdev_receive_phy_packets)
 
 /*
  * ABI version history
@@ -376,9 +388,9 @@ union fw_cdev_event {
  *               - shared use and auto-response for FCP registers
  *  3  (2.6.34)  - made &fw_cdev_get_cycle_timer reliable
  *               - added %FW_CDEV_IOC_GET_CYCLE_TIMER2
- *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_SENT
+ *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_*
  *               - implemented &fw_cdev_event_bus_reset.bm_node_id
- *               - added %FW_CDEV_IOC_SEND_PHY_PACKET
+ *               - added %FW_CDEV_IOC_SEND_PHY_PACKET, _RECEIVE_PHY_PACKETS
  */
 #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */
 
@@ -850,4 +862,17 @@ struct fw_cdev_send_phy_packet {
 	__u32 generation;
 };
 
+/**
+ * struct fw_cdev_receive_phy_packets - start reception of PHY packets
+ * @closure: Passed back to userspace in phy packet events
+ *
+ * This ioctl activates issuing of %FW_CDEV_EVENT_PHY_PACKET_RECEIVED due to
+ * incoming PHY packets from any node on the same bus as the device.
+ *
+ * The ioctl is only permitted on device files which represent a local node.
+ */
+struct fw_cdev_receive_phy_packets {
+	__u64 closure;
+};
+
 #endif /* _LINUX_FIREWIRE_CDEV_H */
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 0c38b8e97722..d974aa4a24c9 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -111,9 +111,10 @@ struct fw_card {
 	bool beta_repeaters_present;
 
 	int index;
-
 	struct list_head link;
 
+	struct list_head phy_receiver_list;
+
 	struct delayed_work br_work; /* bus reset job */
 	bool br_short;
 
-- 
cgit v1.2.3-59-g8ed1b


From cc550216ae9a2993ef3973464714dc1a39ab1f86 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 18 Jul 2010 13:00:50 +0200
Subject: firewire: cdev: add PHY pinging

This extends the FW_CDEV_IOC_SEND_PHY_PACKET ioctl() for /dev/fw* to be
useful for ping time measurements.  One application for it would be gap
count optimization in userspace that is based on ping times rather than
hop count.  (The latter is implemented in firewire-core itself but is
not applicable to beta PHYs that act as repeater.)

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-cdev.c  | 9 ++++++---
 drivers/firewire/core.h       | 5 +++++
 drivers/firewire/ohci.c       | 3 +++
 include/linux/firewire-cdev.h | 5 ++++-
 4 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 0425dd5dfcd3..31863cf8b6c4 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -1423,9 +1423,10 @@ static void outbound_phy_packet_callback(struct fw_packet *packet,
 	/* stale generation; cancelled; on certain controllers: no ack */
 	default:		e->phy_packet.rcode = status;		break;
 	}
+	e->phy_packet.data[0] = packet->timestamp;
 
-	queue_event(e->client, &e->event,
-		    &e->phy_packet, sizeof(e->phy_packet), NULL, 0);
+	queue_event(e->client, &e->event, &e->phy_packet,
+		    sizeof(e->phy_packet) + e->phy_packet.length, NULL, 0);
 	client_put(e->client);
 }
 
@@ -1439,7 +1440,7 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg)
 	if (!client->device->is_local)
 		return -ENOSYS;
 
-	e = kzalloc(sizeof(*e), GFP_KERNEL);
+	e = kzalloc(sizeof(*e) + 4, GFP_KERNEL);
 	if (e == NULL)
 		return -ENOMEM;
 
@@ -1453,6 +1454,8 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg)
 	e->p.callback		= outbound_phy_packet_callback;
 	e->phy_packet.closure	= a->closure;
 	e->phy_packet.type	= FW_CDEV_EVENT_PHY_PACKET_SENT;
+	if (is_ping_packet(a->data))
+			e->phy_packet.length = 4;
 
 	card->driver->send_request(card, &e->p);
 
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 3102b6b63438..28621e44b111 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -234,4 +234,9 @@ void fw_fill_response(struct fw_packet *response, u32 *request_header,
 void fw_send_phy_config(struct fw_card *card,
 			int node_id, int generation, int gap_count);
 
+static inline bool is_ping_packet(u32 *data)
+{
+	return (data[0] & 0xc0ffffff) == 0 && ~data[0] == data[1];
+}
+
 #endif /* _FIREWIRE_CORE_H */
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 08afccc66333..5f6bb2c53808 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -1068,6 +1068,9 @@ static int at_context_queue_packet(struct context *ctx,
 		header[1] = cpu_to_le32(packet->header[0]);
 		header[2] = cpu_to_le32(packet->header[1]);
 		d[0].req_count = cpu_to_le16(12);
+
+		if (is_ping_packet(packet->header))
+			d[0].control |= cpu_to_le16(DESCRIPTOR_PING);
 		break;
 
 	case 4:
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index b87409160794..da0fec7e8dc0 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -294,7 +294,10 @@ struct fw_cdev_event_iso_resource {
  * @length:	Data length in bytes
  * @data:	Incoming data
  *
- * If @type is %FW_CDEV_EVENT_PHY_PACKET_SENT, @length is 0 and @data empty.
+ * If @type is %FW_CDEV_EVENT_PHY_PACKET_SENT, @length is 0 and @data empty,
+ * except in case of a ping packet:  Then, @length is 4, and @data[0] is the
+ * ping time in 49.152MHz clocks if @rcode is %RCODE_COMPLETE.
+ *
  * If @type is %FW_CDEV_EVENT_PHY_PACKET_RECEIVED, @length is 8 and @data
  * consists of the two PHY packet quadlets, in host byte order.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 8e2b2b46ea4ca5ef790dddf78b360ed736a62d7c Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Fri, 23 Jul 2010 13:05:39 +0200
Subject: firewire: cdev: improve FW_CDEV_IOC_ALLOCATE

In both the ieee1394 stack and the firewire stack, the core treats
kernelspace drivers better than userspace drivers when it comes to
CSR address range allocation:  The former may request a register to be
placed automatically at a free spot anywhere inside a specified address
range.  The latter may only request a register at a fixed offset.

Hence, userspace drivers which do not require a fixed offset potentially
need to implement a retry loop with incremented offset in each retry
until the kernel does not fail allocation with EBUSY.  This awkward
procedure is not fundamentally necessary as the core already provides a
superior allocation API to kernelspace drivers.

Therefore change the ioctl() ABI by addition of a region_end member in
the existing struct fw_cdev_allocate.  Userspace and kernelspace APIs
work the same way now.

There is a small cost to pay by clients though:  If client source code
is required to compile with older kernel headers too, then any use of
the new member fw_cdev_allocate.region_end needs to be enclosed by
#ifdef/#endif directives.  However, any client program that seriously
wants to use address range allocations will require a kernel of cdev ABI
version >= 4 at runtime and a linux/firewire-cdev.h header of >= 4
anyway.  This is because v4 brings FW_CDEV_EVENT_REQUEST2.  The only
client program in which build-time compatibility with struct
fw_cdev_allocate as found in older kernel headers makes sense is
libraw1394.

(libraw1394 uses the older broken FW_CDEV_EVENT_REQUEST to implement a
makeshift, incorrect transaction responder that does at least work
somewhat in many simple scenarios, relying on guesswork by libraw1394
and by libraw1394 based applications.  Plus, address range allocation
and transaction responder is only one of many features that libraw1394
needs to provide, and these other features need to work with kernel and
kernel-headers as old as possible.  Any new linux/firewire-cdev.h based
client that implements a transaction responder should never attempt to
do it like libraw1394;  instead it should make a header and kernel of v4
or later a hard requirement.)

While we are at it, update the struct fw_cdev_allocate documentation to
better reflect the recent fw_cdev_event_request2 ABI addition.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-cdev.c  | 12 +++++++++---
 include/linux/firewire-cdev.h | 29 ++++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 31863cf8b6c4..f40098dec14b 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -50,8 +50,9 @@
 /*
  * ABI version history is documented in linux/firewire-cdev.h.
  */
-#define FW_CDEV_KERNEL_VERSION		4
-#define FW_CDEV_VERSION_EVENT_REQUEST2	4
+#define FW_CDEV_KERNEL_VERSION			4
+#define FW_CDEV_VERSION_EVENT_REQUEST2		4
+#define FW_CDEV_VERSION_ALLOCATE_REGION_END	4
 
 struct client {
 	u32 version;
@@ -773,7 +774,11 @@ static int ioctl_allocate(struct client *client, union ioctl_arg *arg)
 		return -ENOMEM;
 
 	region.start = a->offset;
-	region.end   = a->offset + a->length;
+	if (client->version < FW_CDEV_VERSION_ALLOCATE_REGION_END)
+		region.end = a->offset + a->length;
+	else
+		region.end = a->region_end;
+
 	r->handler.length           = a->length;
 	r->handler.address_callback = handle_request;
 	r->handler.callback_data    = r;
@@ -785,6 +790,7 @@ static int ioctl_allocate(struct client *client, union ioctl_arg *arg)
 		kfree(r);
 		return ret;
 	}
+	a->offset = r->handler.offset;
 
 	r->resource.release = release_address_handler;
 	ret = add_client_resource(client, &r->resource, GFP_KERNEL);
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index da0fec7e8dc0..14831119ff71 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -394,6 +394,7 @@ union fw_cdev_event {
  *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_*
  *               - implemented &fw_cdev_event_bus_reset.bm_node_id
  *               - added %FW_CDEV_IOC_SEND_PHY_PACKET, _RECEIVE_PHY_PACKETS
+ *               - added &fw_cdev_allocate.region_end
  */
 #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */
 
@@ -473,17 +474,21 @@ struct fw_cdev_send_response {
 };
 
 /**
- * struct fw_cdev_allocate - Allocate a CSR address range
+ * struct fw_cdev_allocate - Allocate a CSR in an address range
  * @offset:	Start offset of the address range
  * @closure:	To be passed back to userspace in request events
- * @length:	Length of the address range, in bytes
+ * @length:	Length of the CSR, in bytes
  * @handle:	Handle to the allocation, written by the kernel
+ * @region_end:	First address above the address range (added in ABI v4, 2.6.36)
  *
  * Allocate an address range in the 48-bit address space on the local node
  * (the controller).  This allows userspace to listen for requests with an
- * offset within that address range.  When the kernel receives a request
- * within the range, an &fw_cdev_event_request event will be written back.
- * The @closure field is passed back to userspace in the response event.
+ * offset within that address range.  Every time when the kernel receives a
+ * request within the range, an &fw_cdev_event_request2 event will be emitted.
+ * (If the kernel or the client implements ABI version <= 3, an
+ * &fw_cdev_event_request will be generated instead.)
+ *
+ * The @closure field is passed back to userspace in these request events.
  * The @handle field is an out parameter, returning a handle to the allocated
  * range to be used for later deallocation of the range.
  *
@@ -491,12 +496,26 @@ struct fw_cdev_send_response {
  * is exclusive except for the FCP command and response registers.  If an
  * exclusive address region is already in use, the ioctl fails with errno set
  * to %EBUSY.
+ *
+ * If kernel and client implement ABI version >= 4, the kernel looks up a free
+ * spot of size @length inside [@offset..@region_end) and, if found, writes
+ * the start address of the new CSR back in @offset.  I.e. @offset is an
+ * in and out parameter.  If this automatic placement of a CSR in a bigger
+ * address range is not desired, the client simply needs to set @region_end
+ * = @offset + @length.
+ *
+ * If the kernel or the client implements ABI version <= 3, @region_end is
+ * ignored and effectively assumed to be @offset + @length.
+ *
+ * @region_end is only present in a kernel header >= 2.6.36.  If necessary,
+ * this can for example be tested by #ifdef FW_CDEV_EVENT_REQUEST2.
  */
 struct fw_cdev_allocate {
 	__u64 offset;
 	__u64 closure;
 	__u32 length;
 	__u32 handle;
+	__u64 region_end;	/* available since kernel version 2.6.36 */
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 49daf6a22622d4e1619aeaad5f9f0472bf89daff Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Fri, 23 Jul 2010 14:07:47 +0200
Subject: xt_quota: report initial quota value instead of current value to
 userspace

We should copy the initial value to userspace for iptables-save and
to allow removal of specific quota rules.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/xt_quota.h | 2 +-
 net/netfilter/xt_quota.c           | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h
index 8dc89dfc1361..b0d28c659ab7 100644
--- a/include/linux/netfilter/xt_quota.h
+++ b/include/linux/netfilter/xt_quota.h
@@ -11,9 +11,9 @@ struct xt_quota_priv;
 struct xt_quota_info {
 	u_int32_t		flags;
 	u_int32_t		pad;
+	aligned_u64		quota;
 
 	/* Used internally by the kernel */
-	aligned_u64		quota;
 	struct xt_quota_priv	*master;
 };
 
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 304b1fda1a0d..70eb2b4984dd 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -36,8 +36,6 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		/* we do not allow even small packets from now on */
 		priv->quota = 0;
 	}
-	/* Copy quota back to matchinfo so that iptables can display it */
-	q->quota = priv->quota;
 	spin_unlock_bh(&priv->lock);
 
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 22b8f15c2f7130bb0386f548428df2ffd4e81903 Mon Sep 17 00:00:00 2001
From: Patrick Pannuto <ppannuto@codeaurora.org>
Date: Mon, 19 Jul 2010 15:09:26 -0700
Subject: timer: Added usleep[_range] timer

usleep[_range] are finer precision implementations of msleep
and are designed to be drop-in replacements for udelay where
a precise sleep / busy-wait is unnecessary. They also allow
an easy interface to specify slack when a precise (ish)
wakeup is unnecessary to help minimize wakeups

Signed-off-by: Patrick Pannuto <ppannuto@codeaurora.org>
Cc: akinobu.mita@gmail.com
Cc: sboyd@codeaurora.org
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <4C44CDD2.1070708@codeaurora.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/delay.h |  6 ++++++
 kernel/timer.c        | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index fd832c6d419e..0e303d1aacd8 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -45,6 +45,12 @@ extern unsigned long lpj_fine;
 void calibrate_delay(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
+void usleep_range(unsigned long min, unsigned long max);
+
+static inline void usleep(unsigned long usecs)
+{
+	usleep_range(usecs, usecs);
+}
 
 static inline void ssleep(unsigned int seconds)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index ce98685cd1cb..f110f241ab66 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1755,3 +1755,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
 }
 
 EXPORT_SYMBOL(msleep_interruptible);
+
+static int __sched do_usleep_range(unsigned long min, unsigned long max)
+{
+	ktime_t kmin;
+	unsigned long delta;
+
+	kmin = ktime_set(0, min * NSEC_PER_USEC);
+	delta = max - min;
+	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+}
+
+/**
+ * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ */
+void usleep_range(unsigned long min, unsigned long max)
+{
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	do_usleep_range(min, max);
+}
+EXPORT_SYMBOL(usleep_range);
-- 
cgit v1.2.3-59-g8ed1b


From 808be4b22f47886d2279852ada3d186fc20909bc Mon Sep 17 00:00:00 2001
From: Vasily Khoruzhick <anarsoul@gmail.com>
Date: Sat, 17 Jul 2010 13:57:03 +0300
Subject: Add s3c-adc-battery driver

s3c-adc-battery is driver for monitoring and charging battery on
iPAQ H1930/H1940/RX1950.

It depends on s3c-adc driver to get battery voltage and current.

Signed-off-by: Vasily Khoruzhick <anarsoul@gmail.com>
Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
---
 drivers/power/Kconfig           |   6 +
 drivers/power/Makefile          |   1 +
 drivers/power/s3c_adc_battery.c | 431 ++++++++++++++++++++++++++++++++++++++++
 include/linux/s3c_adc_battery.h |  36 ++++
 4 files changed, 474 insertions(+)
 create mode 100644 drivers/power/s3c_adc_battery.c
 create mode 100644 include/linux/s3c_adc_battery.h

(limited to 'include/linux')

diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index 6eae8211197f..d1d3c7006eac 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -136,6 +136,12 @@ config BATTERY_Z2
 	help
 	  Say Y to include support for the battery on the Zipit Z2.
 
+config BATTERY_S3C_ADC
+	tristate "Battery driver for Samsung ADC based monitoring"
+	depends on S3C_ADC
+	help
+	  Say Y here to enable support for iPAQ h1930/h1940/rx1950 battery
+
 config CHARGER_PCF50633
 	tristate "NXP PCF50633 MBC"
 	depends on MFD_PCF50633
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index 2b9a81d41b4d..705675ae9f7c 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -33,5 +33,6 @@ obj-$(CONFIG_BATTERY_BQ27x00)	+= bq27x00_battery.o
 obj-$(CONFIG_BATTERY_DA9030)	+= da9030_battery.o
 obj-$(CONFIG_BATTERY_MAX17040)	+= max17040_battery.o
 obj-$(CONFIG_BATTERY_Z2)	+= z2_battery.o
+obj-$(CONFIG_BATTERY_S3C_ADC)	+= s3c_adc_battery.o
 obj-$(CONFIG_CHARGER_PCF50633)	+= pcf50633-charger.o
 obj-$(CONFIG_BATTERY_INTEL_MID)	+= intel_mid_battery.o
diff --git a/drivers/power/s3c_adc_battery.c b/drivers/power/s3c_adc_battery.c
new file mode 100644
index 000000000000..fe16b482e912
--- /dev/null
+++ b/drivers/power/s3c_adc_battery.c
@@ -0,0 +1,431 @@
+/*
+ *	iPAQ h1930/h1940/rx1950 battery controler driver
+ *	Copyright (c) Vasily Khoruzhick
+ *	Based on h1940_battery.c by Arnaud Patard
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive for
+ * more details.
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/power_supply.h>
+#include <linux/leds.h>
+#include <linux/gpio.h>
+#include <linux/err.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/s3c_adc_battery.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <plat/adc.h>
+
+#define BAT_POLL_INTERVAL		10000 /* ms */
+#define JITTER_DELAY			500 /* ms */
+
+struct s3c_adc_bat {
+	struct power_supply		psy;
+	struct s3c_adc_client		*client;
+	struct s3c_adc_bat_pdata	*pdata;
+	int				volt_value;
+	int				cur_value;
+	unsigned int			timestamp;
+	int				level;
+	int				status;
+	int				cable_plugged:1;
+};
+
+static struct delayed_work bat_work;
+
+static void s3c_adc_bat_ext_power_changed(struct power_supply *psy)
+{
+	schedule_delayed_work(&bat_work,
+		msecs_to_jiffies(JITTER_DELAY));
+}
+
+static enum power_supply_property s3c_adc_backup_bat_props[] = {
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_VOLTAGE_MIN,
+	POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
+};
+
+static int s3c_adc_backup_bat_get_property(struct power_supply *psy,
+				enum power_supply_property psp,
+				union power_supply_propval *val)
+{
+	struct s3c_adc_bat *bat = container_of(psy, struct s3c_adc_bat, psy);
+
+	if (!bat) {
+		dev_err(psy->dev, "%s: no battery infos ?!\n", __func__);
+		return -EINVAL;
+	}
+
+	if (bat->volt_value < 0 ||
+		jiffies_to_msecs(jiffies - bat->timestamp) >
+			BAT_POLL_INTERVAL) {
+		bat->volt_value = s3c_adc_read(bat->client,
+			bat->pdata->backup_volt_channel);
+		bat->volt_value *= bat->pdata->backup_volt_mult;
+		bat->timestamp = jiffies;
+	}
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		val->intval = bat->volt_value;
+		return 0;
+	case POWER_SUPPLY_PROP_VOLTAGE_MIN:
+		val->intval = bat->pdata->backup_volt_min;
+		return 0;
+	case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
+		val->intval = bat->pdata->backup_volt_max;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct s3c_adc_bat backup_bat = {
+	.psy = {
+		.name		= "backup-battery",
+		.type		= POWER_SUPPLY_TYPE_BATTERY,
+		.properties	= s3c_adc_backup_bat_props,
+		.num_properties = ARRAY_SIZE(s3c_adc_backup_bat_props),
+		.get_property	= s3c_adc_backup_bat_get_property,
+		.use_for_apm	= 1,
+	},
+};
+
+static enum power_supply_property s3c_adc_main_bat_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+};
+
+static int calc_full_volt(int volt_val, int cur_val, int impedance)
+{
+	return volt_val + cur_val * impedance / 1000;
+}
+
+static int s3c_adc_bat_get_property(struct power_supply *psy,
+				    enum power_supply_property psp,
+				    union power_supply_propval *val)
+{
+	struct s3c_adc_bat *bat = container_of(psy, struct s3c_adc_bat, psy);
+
+	int new_level;
+	int full_volt;
+	const struct s3c_adc_bat_thresh *lut = bat->pdata->lut_noac;
+	unsigned int lut_size = bat->pdata->lut_noac_cnt;
+
+	if (!bat) {
+		dev_err(psy->dev, "no battery infos ?!\n");
+		return -EINVAL;
+	}
+
+	if (bat->volt_value < 0 || bat->cur_value < 0 ||
+		jiffies_to_msecs(jiffies - bat->timestamp) >
+			BAT_POLL_INTERVAL) {
+		bat->volt_value = s3c_adc_read(bat->client,
+			bat->pdata->volt_channel) * bat->pdata->volt_mult;
+		bat->cur_value = s3c_adc_read(bat->client,
+			bat->pdata->current_channel) * bat->pdata->current_mult;
+		bat->timestamp = jiffies;
+	}
+
+	if (bat->cable_plugged &&
+		((bat->pdata->gpio_charge_finished < 0) ||
+		!gpio_get_value(bat->pdata->gpio_charge_finished))) {
+		lut = bat->pdata->lut_acin;
+		lut_size = bat->pdata->lut_acin_cnt;
+	}
+
+	new_level = 100000;
+	full_volt = calc_full_volt((bat->volt_value / 1000),
+		(bat->cur_value / 1000), bat->pdata->internal_impedance);
+
+	if (full_volt < calc_full_volt(lut->volt, lut->cur,
+		bat->pdata->internal_impedance)) {
+		lut_size--;
+		while (lut_size--) {
+			int lut_volt1;
+			int lut_volt2;
+
+			lut_volt1 = calc_full_volt(lut[0].volt, lut[0].cur,
+				bat->pdata->internal_impedance);
+			lut_volt2 = calc_full_volt(lut[1].volt, lut[1].cur,
+				bat->pdata->internal_impedance);
+			if (full_volt < lut_volt1 && full_volt >= lut_volt2) {
+				new_level = (lut[1].level +
+					(lut[0].level - lut[1].level) *
+					(full_volt - lut_volt2) /
+					(lut_volt1 - lut_volt2)) * 1000;
+				break;
+			}
+			new_level = lut[1].level * 1000;
+			lut++;
+		}
+	}
+
+	bat->level = new_level;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_STATUS:
+		if (bat->pdata->gpio_charge_finished < 0)
+			val->intval = bat->level == 100000 ?
+				POWER_SUPPLY_STATUS_FULL : bat->status;
+		else
+			val->intval = bat->status;
+		return 0;
+	case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN:
+		val->intval = 100000;
+		return 0;
+	case POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN:
+		val->intval = 0;
+		return 0;
+	case POWER_SUPPLY_PROP_CHARGE_NOW:
+		val->intval = bat->level;
+		return 0;
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		val->intval = bat->volt_value;
+		return 0;
+	case POWER_SUPPLY_PROP_CURRENT_NOW:
+		val->intval = bat->cur_value;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct s3c_adc_bat main_bat = {
+	.psy = {
+		.name			= "main-battery",
+		.type			= POWER_SUPPLY_TYPE_BATTERY,
+		.properties		= s3c_adc_main_bat_props,
+		.num_properties		= ARRAY_SIZE(s3c_adc_main_bat_props),
+		.get_property		= s3c_adc_bat_get_property,
+		.external_power_changed = s3c_adc_bat_ext_power_changed,
+		.use_for_apm		= 1,
+	},
+};
+
+static void s3c_adc_bat_work(struct work_struct *work)
+{
+	struct s3c_adc_bat *bat = &main_bat;
+	int is_charged;
+	int is_plugged;
+	static int was_plugged;
+
+	is_plugged = power_supply_am_i_supplied(&bat->psy);
+	bat->cable_plugged = is_plugged;
+	if (is_plugged != was_plugged) {
+		was_plugged = is_plugged;
+		if (is_plugged) {
+			if (bat->pdata->enable_charger)
+				bat->pdata->enable_charger();
+			bat->status = POWER_SUPPLY_STATUS_CHARGING;
+		} else {
+			if (bat->pdata->disable_charger)
+				bat->pdata->disable_charger();
+			bat->status = POWER_SUPPLY_STATUS_DISCHARGING;
+		}
+	} else {
+		if ((bat->pdata->gpio_charge_finished >= 0) && is_plugged) {
+			is_charged = gpio_get_value(
+				main_bat.pdata->gpio_charge_finished);
+			if (is_charged) {
+				if (bat->pdata->disable_charger)
+					bat->pdata->disable_charger();
+				bat->status = POWER_SUPPLY_STATUS_FULL;
+			} else {
+				if (bat->pdata->enable_charger)
+					bat->pdata->enable_charger();
+				bat->status = POWER_SUPPLY_STATUS_CHARGING;
+			}
+		}
+	}
+
+	power_supply_changed(&bat->psy);
+}
+
+static irqreturn_t s3c_adc_bat_charged(int irq, void *dev_id)
+{
+	schedule_delayed_work(&bat_work,
+		msecs_to_jiffies(JITTER_DELAY));
+	return IRQ_HANDLED;
+}
+
+static int __init s3c_adc_bat_probe(struct platform_device *pdev)
+{
+	struct s3c_adc_client	*client;
+	struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data;
+	int ret;
+
+	client = s3c_adc_register(pdev, NULL, NULL, 0);
+	if (IS_ERR(client)) {
+		dev_err(&pdev->dev, "cannot register adc\n");
+		return PTR_ERR(client);
+	}
+
+	platform_set_drvdata(pdev, client);
+
+	main_bat.client = client;
+	main_bat.pdata = pdata;
+	main_bat.volt_value = -1;
+	main_bat.cur_value = -1;
+	main_bat.cable_plugged = 0;
+	main_bat.status = POWER_SUPPLY_STATUS_DISCHARGING;
+
+	ret = power_supply_register(&pdev->dev, &main_bat.psy);
+	if (ret)
+		goto err_reg_main;
+	if (pdata->backup_volt_mult) {
+		backup_bat.client = client;
+		backup_bat.pdata = pdev->dev.platform_data;
+		backup_bat.volt_value = -1;
+		ret = power_supply_register(&pdev->dev, &backup_bat.psy);
+		if (ret)
+			goto err_reg_backup;
+	}
+
+	INIT_DELAYED_WORK(&bat_work, s3c_adc_bat_work);
+
+	if (pdata->gpio_charge_finished >= 0) {
+		ret = gpio_request(pdata->gpio_charge_finished, "charged");
+		if (ret)
+			goto err_gpio;
+
+		ret = request_irq(gpio_to_irq(pdata->gpio_charge_finished),
+				s3c_adc_bat_charged,
+				IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
+				"battery charged", NULL);
+		if (ret)
+			goto err_irq;
+	}
+
+	if (pdata->init) {
+		ret = pdata->init();
+		if (ret)
+			goto err_platform;
+	}
+
+	dev_info(&pdev->dev, "successfully loaded\n");
+	device_init_wakeup(&pdev->dev, 1);
+
+	/* Schedule timer to check current status */
+	schedule_delayed_work(&bat_work,
+		msecs_to_jiffies(JITTER_DELAY));
+
+	return 0;
+
+err_platform:
+	if (pdata->gpio_charge_finished >= 0)
+		free_irq(gpio_to_irq(pdata->gpio_charge_finished), NULL);
+err_irq:
+	if (pdata->gpio_charge_finished >= 0)
+		gpio_free(pdata->gpio_charge_finished);
+err_gpio:
+	if (pdata->backup_volt_mult)
+		power_supply_unregister(&backup_bat.psy);
+err_reg_backup:
+	power_supply_unregister(&main_bat.psy);
+err_reg_main:
+	return ret;
+}
+
+static int s3c_adc_bat_remove(struct platform_device *pdev)
+{
+	struct s3c_adc_client *client = platform_get_drvdata(pdev);
+	struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data;
+
+	power_supply_unregister(&main_bat.psy);
+	if (pdata->backup_volt_mult)
+		power_supply_unregister(&backup_bat.psy);
+
+	s3c_adc_release(client);
+
+	if (pdata->gpio_charge_finished >= 0) {
+		free_irq(gpio_to_irq(pdata->gpio_charge_finished), NULL);
+		gpio_free(pdata->gpio_charge_finished);
+	}
+
+	cancel_delayed_work(&bat_work);
+
+	if (pdata->exit)
+		pdata->exit();
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int s3c_adc_bat_suspend(struct platform_device *pdev,
+	pm_message_t state)
+{
+	struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data;
+
+	if (pdata->gpio_charge_finished >= 0) {
+		if (device_may_wakeup(&pdev->dev))
+			enable_irq_wake(
+				gpio_to_irq(pdata->gpio_charge_finished));
+		else {
+			disable_irq(gpio_to_irq(pdata->gpio_charge_finished));
+			main_bat.pdata->disable_charger();
+		}
+	}
+
+	return 0;
+}
+
+static int s3c_adc_bat_resume(struct platform_device *pdev)
+{
+	struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data;
+
+	if (pdata->gpio_charge_finished >= 0) {
+		if (device_may_wakeup(&pdev->dev))
+			disable_irq_wake(
+				gpio_to_irq(pdata->gpio_charge_finished));
+		else
+			enable_irq(gpio_to_irq(pdata->gpio_charge_finished));
+	}
+
+	/* Schedule timer to check current status */
+	schedule_delayed_work(&bat_work,
+		msecs_to_jiffies(JITTER_DELAY));
+
+	return 0;
+}
+#else
+#define s3c_adc_battery_suspend NULL
+#define s3c_adc_battery_resume NULL
+#endif
+
+static struct platform_driver s3c_adc_bat_driver = {
+	.driver		= {
+		.name	= "s3c-adc-battery",
+	},
+	.probe		= s3c_adc_bat_probe,
+	.remove		= s3c_adc_bat_remove,
+	.suspend	= s3c_adc_bat_suspend,
+	.resume		= s3c_adc_bat_resume,
+};
+
+static int __init s3c_adc_bat_init(void)
+{
+	return platform_driver_register(&s3c_adc_bat_driver);
+}
+module_init(s3c_adc_bat_init);
+
+static void __exit s3c_adc_bat_exit(void)
+{
+	platform_driver_unregister(&s3c_adc_bat_driver);
+}
+module_exit(s3c_adc_bat_exit);
+
+MODULE_AUTHOR("Vasily Khoruzhick <anarsoul@gmail.com>");
+MODULE_DESCRIPTION("iPAQ H1930/H1940/RX1950 battery controler driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/s3c_adc_battery.h b/include/linux/s3c_adc_battery.h
new file mode 100644
index 000000000000..dbce22faa660
--- /dev/null
+++ b/include/linux/s3c_adc_battery.h
@@ -0,0 +1,36 @@
+#ifndef _S3C_ADC_BATTERY_H
+#define _S3C_ADC_BATTERY_H
+
+struct s3c_adc_bat_thresh {
+	int volt; /* mV */
+	int cur; /* mA */
+	int level; /* percent */
+};
+
+struct s3c_adc_bat_pdata {
+	int (*init)(void);
+	void (*exit)(void);
+	void (*enable_charger)(void);
+	void (*disable_charger)(void);
+
+	int gpio_charge_finished;
+
+	const struct s3c_adc_bat_thresh *lut_noac;
+	unsigned int lut_noac_cnt;
+	const struct s3c_adc_bat_thresh *lut_acin;
+	unsigned int lut_acin_cnt;
+
+	const unsigned int volt_channel;
+	const unsigned int current_channel;
+	const unsigned int backup_volt_channel;
+
+	const unsigned int volt_mult;
+	const unsigned int current_mult;
+	const unsigned int backup_volt_mult;
+	const unsigned int internal_impedance;
+
+	const unsigned int backup_volt_max;
+	const unsigned int backup_volt_min;
+};
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From eca3930163ba8884060ce9d9ff5ef0d9b7c7b00f Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 8 Jun 2010 07:48:21 -0600
Subject: of: Merge of_platform_bus_type with platform_bus_type

of_platform_bus was being used in the same manner as the platform_bus.
The only difference being that of_platform_bus devices are generated
from data in the device tree, and platform_bus devices are usually
statically allocated in platform code.  Having them separate causes
the problem of device drivers having to be registered twice if it
was possible for the same device to appear on either bus.

This patch removes of_platform_bus_type and registers all of_platform
bus devices and drivers on the platform bus instead.  A previous patch
made the of_device structure an alias for the platform_device structure,
and a shim is used to adapt of_platform_drivers to the platform bus.

After all of of_platform_bus drivers are converted to be normal platform
drivers, the shim code can be removed.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
---
 arch/microblaze/kernel/of_platform.c     | 11 ------
 arch/microblaze/kernel/setup.c           |  6 ---
 arch/powerpc/kernel/dma-swiotlb.c        |  8 ----
 arch/powerpc/kernel/of_platform.c        | 12 ------
 arch/powerpc/kernel/setup-common.c       |  7 ----
 arch/powerpc/platforms/cell/beat_iommu.c |  2 +-
 arch/powerpc/platforms/cell/iommu.c      |  2 +-
 arch/powerpc/sysdev/mv64x60_dev.c        |  7 +---
 arch/sparc/kernel/of_device_32.c         | 21 +++-------
 arch/sparc/kernel/of_device_64.c         | 21 +++-------
 arch/sparc/kernel/of_device_common.c     |  3 --
 drivers/base/platform.c                  |  6 +++
 drivers/of/device.c                      |  5 +++
 drivers/of/platform.c                    | 67 ++++++++++++++++++++++++++++++--
 include/linux/of_device.h                |  6 +++
 include/linux/of_platform.h              | 21 ++++------
 16 files changed, 102 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c
index da79edf45420..fb2866104331 100644
--- a/arch/microblaze/kernel/of_platform.c
+++ b/arch/microblaze/kernel/of_platform.c
@@ -26,17 +26,6 @@
 #include <linux/topology.h>
 #include <asm/atomic.h>
 
-struct bus_type of_platform_bus_type = {
-       .uevent	= of_device_uevent,
-};
-EXPORT_SYMBOL(of_platform_bus_type);
-
-static int __init of_bus_driver_init(void)
-{
-	return of_bus_type_init(&of_platform_bus_type, "of_platform");
-}
-postcore_initcall(of_bus_driver_init);
-
 /*
  * The list of OF IDs below is used for matching bus types in the
  * system whose devices are to be exposed as of_platform_devices.
diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c
index 17c98dbcec88..f5f768842354 100644
--- a/arch/microblaze/kernel/setup.c
+++ b/arch/microblaze/kernel/setup.c
@@ -213,15 +213,9 @@ static struct notifier_block dflt_plat_bus_notifier = {
 	.priority = INT_MAX,
 };
 
-static struct notifier_block dflt_of_bus_notifier = {
-	.notifier_call = dflt_bus_notify,
-	.priority = INT_MAX,
-};
-
 static int __init setup_bus_notifier(void)
 {
 	bus_register_notifier(&platform_bus_type, &dflt_plat_bus_notifier);
-	bus_register_notifier(&of_platform_bus_type, &dflt_of_bus_notifier);
 
 	return 0;
 }
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 02f724f36753..4295e0b94b2d 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -82,17 +82,9 @@ static struct notifier_block ppc_swiotlb_plat_bus_notifier = {
 	.priority = 0,
 };
 
-static struct notifier_block ppc_swiotlb_of_bus_notifier = {
-	.notifier_call = ppc_swiotlb_bus_notify,
-	.priority = 0,
-};
-
 int __init swiotlb_setup_bus_notifier(void)
 {
 	bus_register_notifier(&platform_bus_type,
 			      &ppc_swiotlb_plat_bus_notifier);
-	bus_register_notifier(&of_platform_bus_type,
-			      &ppc_swiotlb_of_bus_notifier);
-
 	return 0;
 }
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index 4e0a2f7c1dd3..d3497cd81e8a 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -52,18 +52,6 @@ const struct of_device_id of_default_bus_ids[] = {
 	{},
 };
 
-struct bus_type of_platform_bus_type = {
-       .uevent	= of_device_uevent,
-};
-EXPORT_SYMBOL(of_platform_bus_type);
-
-static int __init of_bus_driver_init(void)
-{
-	return of_bus_type_init(&of_platform_bus_type, "of_platform");
-}
-
-postcore_initcall(of_bus_driver_init);
-
 static int of_dev_node_match(struct device *dev, void *data)
 {
 	return to_of_device(dev)->dev.of_node == data;
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index b7e6c7e193ae..d1a5304b3ddd 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -701,16 +701,9 @@ static struct notifier_block ppc_dflt_plat_bus_notifier = {
 	.priority = INT_MAX,
 };
 
-static struct notifier_block ppc_dflt_of_bus_notifier = {
-	.notifier_call = ppc_dflt_bus_notify,
-	.priority = INT_MAX,
-};
-
 static int __init setup_bus_notifier(void)
 {
 	bus_register_notifier(&platform_bus_type, &ppc_dflt_plat_bus_notifier);
-	bus_register_notifier(&of_platform_bus_type, &ppc_dflt_of_bus_notifier);
-
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/cell/beat_iommu.c b/arch/powerpc/platforms/cell/beat_iommu.c
index 39d361c5c6d2..beec405eb6f8 100644
--- a/arch/powerpc/platforms/cell/beat_iommu.c
+++ b/arch/powerpc/platforms/cell/beat_iommu.c
@@ -108,7 +108,7 @@ static int __init celleb_init_iommu(void)
 	celleb_init_direct_mapping();
 	set_pci_dma_ops(&dma_direct_ops);
 	ppc_md.pci_dma_dev_setup = celleb_pci_dma_dev_setup;
-	bus_register_notifier(&of_platform_bus_type, &celleb_of_bus_notifier);
+	bus_register_notifier(&platform_bus_type, &celleb_of_bus_notifier);
 
 	return 0;
 }
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 3712900471ba..58b13ce3847e 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -1204,7 +1204,7 @@ static int __init cell_iommu_init(void)
 	/* Register callbacks on OF platform device addition/removal
 	 * to handle linking them to the right DMA operations
 	 */
-	bus_register_notifier(&of_platform_bus_type, &cell_of_bus_notifier);
+	bus_register_notifier(&platform_bus_type, &cell_of_bus_notifier);
 
 	return 0;
 }
diff --git a/arch/powerpc/sysdev/mv64x60_dev.c b/arch/powerpc/sysdev/mv64x60_dev.c
index 31acd3b1718b..1398bc454999 100644
--- a/arch/powerpc/sysdev/mv64x60_dev.c
+++ b/arch/powerpc/sysdev/mv64x60_dev.c
@@ -20,12 +20,7 @@
 
 #include <asm/prom.h>
 
-/*
- * These functions provide the necessary setup for the mv64x60 drivers.
- * These drivers are unusual in that they work on both the MIPS and PowerPC
- * architectures.  Because of that, the drivers do not support the normal
- * PowerPC of_platform_bus_type.  They support platform_bus_type instead.
- */
+/* These functions provide the necessary setup for the mv64x60 drivers. */
 
 static struct of_device_id __initdata of_mv64x60_devices[] = {
 	{ .compatible = "marvell,mv64306-devctrl", },
diff --git a/arch/sparc/kernel/of_device_32.c b/arch/sparc/kernel/of_device_32.c
index 331de91ad2bc..75fc9d5cd7e6 100644
--- a/arch/sparc/kernel/of_device_32.c
+++ b/arch/sparc/kernel/of_device_32.c
@@ -424,7 +424,7 @@ build_resources:
 	build_device_resources(op, parent);
 
 	op->dev.parent = parent;
-	op->dev.bus = &of_platform_bus_type;
+	op->dev.bus = &platform_bus_type;
 	if (!parent)
 		dev_set_name(&op->dev, "root");
 	else
@@ -452,30 +452,19 @@ static void __init scan_tree(struct device_node *dp, struct device *parent)
 	}
 }
 
-static void __init scan_of_devices(void)
+static int __init scan_of_devices(void)
 {
 	struct device_node *root = of_find_node_by_path("/");
 	struct of_device *parent;
 
 	parent = scan_one_device(root, NULL);
 	if (!parent)
-		return;
+		return 0;
 
 	scan_tree(root->child, &parent->dev);
+	return 0;
 }
-
-static int __init of_bus_driver_init(void)
-{
-	int err;
-
-	err = of_bus_type_init(&of_platform_bus_type, "of");
-	if (!err)
-		scan_of_devices();
-
-	return err;
-}
-
-postcore_initcall(of_bus_driver_init);
+postcore_initcall(scan_of_devices);
 
 static int __init of_debug(char *str)
 {
diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c
index 5e8cbb942d3d..9743d1d9fa03 100644
--- a/arch/sparc/kernel/of_device_64.c
+++ b/arch/sparc/kernel/of_device_64.c
@@ -667,7 +667,7 @@ static struct of_device * __init scan_one_device(struct device_node *dp,
 		op->archdata.irqs[i] = build_one_device_irq(op, parent, op->archdata.irqs[i]);
 
 	op->dev.parent = parent;
-	op->dev.bus = &of_platform_bus_type;
+	op->dev.bus = &platform_bus_type;
 	if (!parent)
 		dev_set_name(&op->dev, "root");
 	else
@@ -695,30 +695,19 @@ static void __init scan_tree(struct device_node *dp, struct device *parent)
 	}
 }
 
-static void __init scan_of_devices(void)
+static int __init scan_of_devices(void)
 {
 	struct device_node *root = of_find_node_by_path("/");
 	struct of_device *parent;
 
 	parent = scan_one_device(root, NULL);
 	if (!parent)
-		return;
+		return 0;
 
 	scan_tree(root->child, &parent->dev);
+	return 0;
 }
-
-static int __init of_bus_driver_init(void)
-{
-	int err;
-
-	err = of_bus_type_init(&of_platform_bus_type, "of");
-	if (!err)
-		scan_of_devices();
-
-	return err;
-}
-
-postcore_initcall(of_bus_driver_init);
+postcore_initcall(scan_of_devices);
 
 static int __init of_debug(char *str)
 {
diff --git a/arch/sparc/kernel/of_device_common.c b/arch/sparc/kernel/of_device_common.c
index 016c947d4cae..01f380c7995c 100644
--- a/arch/sparc/kernel/of_device_common.c
+++ b/arch/sparc/kernel/of_device_common.c
@@ -64,9 +64,6 @@ void of_propagate_archdata(struct of_device *bus)
 	}
 }
 
-struct bus_type of_platform_bus_type;
-EXPORT_SYMBOL(of_platform_bus_type);
-
 static void get_cells(struct device_node *dp, int *addrc, int *sizec)
 {
 	if (addrc)
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index fac3633c7223..f699fabf403b 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -636,6 +636,12 @@ static struct device_attribute platform_dev_attrs[] = {
 static int platform_uevent(struct device *dev, struct kobj_uevent_env *env)
 {
 	struct platform_device	*pdev = to_platform_device(dev);
+	int rc;
+
+	/* Some devices have extra OF data and an OF-style MODALIAS */
+	rc = of_device_uevent(dev,env);
+	if (rc != -ENODEV)
+		return rc;
 
 	add_uevent_var(env, "MODALIAS=%s%s", PLATFORM_MODULE_PREFIX,
 		(pdev->id_entry) ? pdev->id_entry->name : pdev->name);
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 5282a202f5a9..12a44b493511 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -104,6 +104,11 @@ int of_device_register(struct of_device *ofdev)
 
 	device_initialize(&ofdev->dev);
 
+	/* name and id have to be set so that the platform bus doesn't get
+	 * confused on matching */
+	ofdev->name = dev_name(&ofdev->dev);
+	ofdev->id = -1;
+
 	/* device_add will assume that this device is on the same node as
 	 * the parent. If there is no parent defined, set the node
 	 * explicitly */
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 9d3d932bcb6f..712dfd866df0 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -20,6 +20,54 @@
 #include <linux/of_device.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
+#include <linux/platform_device.h>
+
+static int platform_driver_probe_shim(struct platform_device *pdev)
+{
+	struct platform_driver *pdrv;
+	struct of_platform_driver *ofpdrv;
+	const struct of_device_id *match;
+
+	pdrv = container_of(pdev->dev.driver, struct platform_driver, driver);
+	ofpdrv = container_of(pdrv, struct of_platform_driver, platform_driver);
+	match = of_match_device(pdev->dev.driver->of_match_table, &pdev->dev);
+	return ofpdrv->probe(pdev, match);
+}
+
+static void platform_driver_shutdown_shim(struct platform_device *pdev)
+{
+	struct platform_driver *pdrv;
+	struct of_platform_driver *ofpdrv;
+
+	pdrv = container_of(pdev->dev.driver, struct platform_driver, driver);
+	ofpdrv = container_of(pdrv, struct of_platform_driver, platform_driver);
+	ofpdrv->shutdown(pdev);
+}
+
+/**
+ * of_register_platform_driver
+ */
+int of_register_platform_driver(struct of_platform_driver *drv)
+{
+	/* setup of_platform_driver to platform_driver adaptors */
+	drv->platform_driver.driver = drv->driver;
+	if (drv->probe)
+		drv->platform_driver.probe = platform_driver_probe_shim;
+	drv->platform_driver.remove = drv->remove;
+	if (drv->shutdown)
+		drv->platform_driver.shutdown = platform_driver_shutdown_shim;
+	drv->platform_driver.suspend = drv->suspend;
+	drv->platform_driver.resume = drv->resume;
+
+	return platform_driver_register(&drv->platform_driver);
+}
+EXPORT_SYMBOL(of_register_platform_driver);
+
+void of_unregister_platform_driver(struct of_platform_driver *drv)
+{
+	platform_driver_unregister(&drv->platform_driver);
+}
+EXPORT_SYMBOL(of_unregister_platform_driver);
 
 #if defined(CONFIG_PPC_DCR)
 #include <asm/dcr.h>
@@ -392,16 +440,29 @@ int of_bus_type_init(struct bus_type *bus, const char *name)
 
 int of_register_driver(struct of_platform_driver *drv, struct bus_type *bus)
 {
-	drv->driver.bus = bus;
+	/*
+	 * Temporary: of_platform_bus used to be distinct from the platform
+	 * bus.  It isn't anymore, and so drivers on the platform bus need
+	 * to be registered in a special way.
+	 *
+	 * After all of_platform_bus_type drivers are converted to
+	 * platform_drivers, this exception can be removed.
+	 */
+	if (bus == &platform_bus_type)
+		return of_register_platform_driver(drv);
 
 	/* register with core */
+	drv->driver.bus = bus;
 	return driver_register(&drv->driver);
 }
 EXPORT_SYMBOL(of_register_driver);
 
 void of_unregister_driver(struct of_platform_driver *drv)
 {
-	driver_unregister(&drv->driver);
+	if (drv->driver.bus == &platform_bus_type)
+		of_unregister_platform_driver(drv);
+	else
+		driver_unregister(&drv->driver);
 }
 EXPORT_SYMBOL(of_unregister_driver);
 
@@ -548,7 +609,7 @@ struct of_device *of_platform_device_create(struct device_node *np,
 	dev->archdata.dma_mask = 0xffffffffUL;
 #endif
 	dev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
-	dev->dev.bus = &of_platform_bus_type;
+	dev->dev.bus = &platform_bus_type;
 
 	/* We do not fill the DMA ops for platform devices by default.
 	 * This is currently the responsibility of the platform code
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 7d27f5a878f6..8cd1fe7864e3 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -65,6 +65,12 @@ static inline int of_driver_match_device(struct device *dev,
 	return 0;
 }
 
+static inline int of_device_uevent(struct device *dev,
+				   struct kobj_uevent_env *env)
+{
+	return -ENODEV;
+}
+
 #endif /* CONFIG_OF_DEVICE */
 
 #endif /* _LINUX_OF_DEVICE_H */
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index a51fd30176aa..133ecf31a60f 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -17,19 +17,19 @@
 #include <linux/mod_devicetable.h>
 #include <linux/pm.h>
 #include <linux/of_device.h>
+#include <linux/platform_device.h>
 
 /*
- * The of_platform_bus_type is a bus type used by drivers that do not
- * attach to a macio or similar bus but still use OF probing
- * mechanism
+ * of_platform_bus_type isn't it's own bus anymore.  It's now just an alias
+ * for the platform bus.
  */
-extern struct bus_type of_platform_bus_type;
+#define of_platform_bus_type platform_bus_type
 
 extern const struct of_device_id of_default_bus_ids[];
 
 /*
  * An of_platform_driver driver is attached to a basic of_device on
- * the "platform bus" (of_platform_bus_type).
+ * the "platform bus" (platform_bus_type).
  */
 struct of_platform_driver
 {
@@ -42,6 +42,7 @@ struct of_platform_driver
 	int	(*shutdown)(struct of_device* dev);
 
 	struct device_driver	driver;
+	struct platform_driver	platform_driver;
 };
 #define	to_of_platform_driver(drv) \
 	container_of(drv,struct of_platform_driver, driver)
@@ -51,14 +52,8 @@ extern int of_register_driver(struct of_platform_driver *drv,
 extern void of_unregister_driver(struct of_platform_driver *drv);
 
 /* Platform drivers register/unregister */
-static inline int of_register_platform_driver(struct of_platform_driver *drv)
-{
-	return of_register_driver(drv, &of_platform_bus_type);
-}
-static inline void of_unregister_platform_driver(struct of_platform_driver *drv)
-{
-	of_unregister_driver(drv);
-}
+extern int of_register_platform_driver(struct of_platform_driver *drv);
+extern void of_unregister_platform_driver(struct of_platform_driver *drv);
 
 extern struct of_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
-- 
cgit v1.2.3-59-g8ed1b


From 1ab1d63a85cee2545272f63a7644e9f855cb65d0 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 24 Jun 2010 15:14:37 -0600
Subject: of/platform: remove all of_bus_type and of_platform_bus_type
 references

Both of_bus_type and of_platform_bus_type are just #define aliases
for the platform bus.  This patch removes all references to them and
switches to the of_register_platform_driver()/of_unregister_platform_driver()
API for registering.

Subsequent patches will convert each user of of_register_platform_driver()
into plain platform_drivers without the of_platform_driver shim.  At which
point the of_register_platform_driver()/of_unregister_platform_driver()
functions can be removed.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
---
 arch/microblaze/kernel/of_platform.c |  3 +--
 arch/powerpc/kernel/of_platform.c    |  3 +--
 arch/sparc/include/asm/of_platform.h |  2 --
 arch/sparc/include/asm/parport.h     |  4 +---
 arch/sparc/kernel/apc.c              |  2 +-
 arch/sparc/kernel/auxio_64.c         |  2 +-
 arch/sparc/kernel/central.c          |  4 ++--
 arch/sparc/kernel/chmc.c             |  4 ++--
 arch/sparc/kernel/of_device_common.c |  2 +-
 arch/sparc/kernel/pci_fire.c         |  2 +-
 arch/sparc/kernel/pci_psycho.c       |  2 +-
 arch/sparc/kernel/pci_sabre.c        |  2 +-
 arch/sparc/kernel/pci_schizo.c       |  2 +-
 arch/sparc/kernel/pci_sun4v.c        |  2 +-
 arch/sparc/kernel/pmc.c              |  2 +-
 arch/sparc/kernel/power.c            |  2 +-
 arch/sparc/kernel/time_32.c          |  2 +-
 arch/sparc/kernel/time_64.c          |  6 +++---
 drivers/atm/fore200e.c               |  6 +++---
 drivers/char/hw_random/n2-drv.c      |  4 ++--
 drivers/crypto/n2_core.c             | 10 +++++-----
 drivers/hwmon/ultra45_env.c          |  4 ++--
 drivers/input/misc/sparcspkr.c       | 12 +++++-------
 drivers/input/serio/i8042-sparcio.h  |  5 ++---
 drivers/mtd/maps/sun_uflash.c        |  4 ++--
 drivers/net/ibm_newemac/core.c       |  4 ++--
 drivers/net/myri_sbus.c              |  4 ++--
 drivers/net/niu.c                    |  6 +++---
 drivers/net/sunbmac.c                |  4 ++--
 drivers/net/sunhme.c                 |  4 ++--
 drivers/net/sunlance.c               |  4 ++--
 drivers/net/sunqe.c                  |  4 ++--
 drivers/parport/parport_sunbpp.c     |  4 ++--
 drivers/sbus/char/bbc_i2c.c          |  4 ++--
 drivers/sbus/char/display7seg.c      |  4 ++--
 drivers/sbus/char/envctrl.c          |  4 ++--
 drivers/sbus/char/flash.c            |  4 ++--
 drivers/sbus/char/uctrl.c            |  4 ++--
 drivers/scsi/qlogicpti.c             |  4 ++--
 drivers/scsi/sun_esp.c               |  4 ++--
 drivers/serial/sunhv.c               |  4 ++--
 drivers/serial/sunsab.c              |  4 ++--
 drivers/serial/sunsu.c               |  2 +-
 drivers/serial/sunzilog.c            |  6 +++---
 drivers/video/bw2.c                  |  4 ++--
 drivers/video/cg14.c                 |  4 ++--
 drivers/video/cg3.c                  |  4 ++--
 drivers/video/cg6.c                  |  4 ++--
 drivers/video/ffb.c                  |  4 ++--
 drivers/video/leo.c                  |  4 ++--
 drivers/video/p9100.c                |  4 ++--
 drivers/video/sunxvr1000.c           |  4 ++--
 drivers/video/tcx.c                  |  4 ++--
 drivers/watchdog/cpwd.c              |  4 ++--
 drivers/watchdog/riowd.c             |  4 ++--
 include/linux/of_platform.h          |  6 ------
 sound/sparc/amd7930.c                |  4 ++--
 sound/sparc/cs4231.c                 |  4 ++--
 sound/sparc/dbri.c                   |  4 ++--
 59 files changed, 109 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c
index fb2866104331..80c9c493cb25 100644
--- a/arch/microblaze/kernel/of_platform.c
+++ b/arch/microblaze/kernel/of_platform.c
@@ -57,8 +57,7 @@ struct of_device *of_find_device_by_node(struct device_node *np)
 {
 	struct device *dev;
 
-	dev = bus_find_device(&of_platform_bus_type,
-			      NULL, np, of_dev_node_match);
+	dev = bus_find_device(&platform_bus_type, NULL, np, of_dev_node_match);
 	if (dev)
 		return to_of_device(dev);
 	return NULL;
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index d3497cd81e8a..b093d4b1f09c 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -61,8 +61,7 @@ struct of_device *of_find_device_by_node(struct device_node *np)
 {
 	struct device *dev;
 
-	dev = bus_find_device(&of_platform_bus_type,
-			      NULL, np, of_dev_node_match);
+	dev = bus_find_device(&platform_bus_type, NULL, np, of_dev_node_match);
 	if (dev)
 		return to_of_device(dev);
 	return NULL;
diff --git a/arch/sparc/include/asm/of_platform.h b/arch/sparc/include/asm/of_platform.h
index 90da99059f83..26540ddfc511 100644
--- a/arch/sparc/include/asm/of_platform.h
+++ b/arch/sparc/include/asm/of_platform.h
@@ -13,6 +13,4 @@
  *
  */
 
-#define of_bus_type	of_platform_bus_type	/* for compatibility */
-
 #endif
diff --git a/arch/sparc/include/asm/parport.h b/arch/sparc/include/asm/parport.h
index 0c34a8792fc7..4891fbce1114 100644
--- a/arch/sparc/include/asm/parport.h
+++ b/arch/sparc/include/asm/parport.h
@@ -243,9 +243,7 @@ static struct of_platform_driver ecpp_driver = {
 
 static int parport_pc_find_nonpci_ports(int autoirq, int autodma)
 {
-	of_register_driver(&ecpp_driver, &of_bus_type);
-
-	return 0;
+	return of_register_platform_driver(&ecpp_driver);
 }
 
 #endif /* !(_ASM_SPARC64_PARPORT_H */
diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c
index b27476caa133..c471251cd3f1 100644
--- a/arch/sparc/kernel/apc.c
+++ b/arch/sparc/kernel/apc.c
@@ -184,7 +184,7 @@ static struct of_platform_driver apc_driver = {
 
 static int __init apc_init(void)
 {
-	return of_register_driver(&apc_driver, &of_bus_type);
+	return of_register_platform_driver(&apc_driver);
 }
 
 /* This driver is not critical to the boot process
diff --git a/arch/sparc/kernel/auxio_64.c b/arch/sparc/kernel/auxio_64.c
index ddc84128b3c2..46ba58a8510f 100644
--- a/arch/sparc/kernel/auxio_64.c
+++ b/arch/sparc/kernel/auxio_64.c
@@ -142,7 +142,7 @@ static struct of_platform_driver auxio_driver = {
 
 static int __init auxio_init(void)
 {
-	return of_register_driver(&auxio_driver, &of_platform_bus_type);
+	return of_register_platform_driver(&auxio_driver);
 }
 
 /* Must be after subsys_initcall() so that busses are probed.  Must
diff --git a/arch/sparc/kernel/central.c b/arch/sparc/kernel/central.c
index 434335f65823..b6080c39ed4b 100644
--- a/arch/sparc/kernel/central.c
+++ b/arch/sparc/kernel/central.c
@@ -265,8 +265,8 @@ static struct of_platform_driver fhc_driver = {
 
 static int __init sunfire_init(void)
 {
-	(void) of_register_driver(&fhc_driver, &of_platform_bus_type);
-	(void) of_register_driver(&clock_board_driver, &of_platform_bus_type);
+	(void) of_register_platform_driver(&fhc_driver);
+	(void) of_register_platform_driver(&clock_board_driver);
 	return 0;
 }
 
diff --git a/arch/sparc/kernel/chmc.c b/arch/sparc/kernel/chmc.c
index 870cb65b3f21..04bb7df9f716 100644
--- a/arch/sparc/kernel/chmc.c
+++ b/arch/sparc/kernel/chmc.c
@@ -848,7 +848,7 @@ static int __init us3mc_init(void)
 	ret = register_dimm_printer(us3mc_dimm_printer);
 
 	if (!ret) {
-		ret = of_register_driver(&us3mc_driver, &of_bus_type);
+		ret = of_register_platform_driver(&us3mc_driver);
 		if (ret)
 			unregister_dimm_printer(us3mc_dimm_printer);
 	}
@@ -859,7 +859,7 @@ static void __exit us3mc_cleanup(void)
 {
 	if (us3mc_platform()) {
 		unregister_dimm_printer(us3mc_dimm_printer);
-		of_unregister_driver(&us3mc_driver);
+		of_unregister_platform_driver(&us3mc_driver);
 	}
 }
 
diff --git a/arch/sparc/kernel/of_device_common.c b/arch/sparc/kernel/of_device_common.c
index 01f380c7995c..2a5c639e4c3f 100644
--- a/arch/sparc/kernel/of_device_common.c
+++ b/arch/sparc/kernel/of_device_common.c
@@ -21,7 +21,7 @@ static int node_match(struct device *dev, void *data)
 
 struct of_device *of_find_device_by_node(struct device_node *dp)
 {
-	struct device *dev = bus_find_device(&of_platform_bus_type, NULL,
+	struct device *dev = bus_find_device(&platform_bus_type, NULL,
 					     dp, node_match);
 
 	if (dev)
diff --git a/arch/sparc/kernel/pci_fire.c b/arch/sparc/kernel/pci_fire.c
index 51cfa09e392a..885f10b742e2 100644
--- a/arch/sparc/kernel/pci_fire.c
+++ b/arch/sparc/kernel/pci_fire.c
@@ -518,7 +518,7 @@ static struct of_platform_driver fire_driver = {
 
 static int __init fire_init(void)
 {
-	return of_register_driver(&fire_driver, &of_bus_type);
+	return of_register_platform_driver(&fire_driver);
 }
 
 subsys_initcall(fire_init);
diff --git a/arch/sparc/kernel/pci_psycho.c b/arch/sparc/kernel/pci_psycho.c
index 93011e6e7ddc..71550a7aacd8 100644
--- a/arch/sparc/kernel/pci_psycho.c
+++ b/arch/sparc/kernel/pci_psycho.c
@@ -612,7 +612,7 @@ static struct of_platform_driver psycho_driver = {
 
 static int __init psycho_init(void)
 {
-	return of_register_driver(&psycho_driver, &of_bus_type);
+	return of_register_platform_driver(&psycho_driver);
 }
 
 subsys_initcall(psycho_init);
diff --git a/arch/sparc/kernel/pci_sabre.c b/arch/sparc/kernel/pci_sabre.c
index 99c6dba7d4fd..2d7bf30552dd 100644
--- a/arch/sparc/kernel/pci_sabre.c
+++ b/arch/sparc/kernel/pci_sabre.c
@@ -606,7 +606,7 @@ static struct of_platform_driver sabre_driver = {
 
 static int __init sabre_init(void)
 {
-	return of_register_driver(&sabre_driver, &of_bus_type);
+	return of_register_platform_driver(&sabre_driver);
 }
 
 subsys_initcall(sabre_init);
diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c
index 9041dae7aaca..04f29c46bfa0 100644
--- a/arch/sparc/kernel/pci_schizo.c
+++ b/arch/sparc/kernel/pci_schizo.c
@@ -1501,7 +1501,7 @@ static struct of_platform_driver schizo_driver = {
 
 static int __init schizo_init(void)
 {
-	return of_register_driver(&schizo_driver, &of_bus_type);
+	return of_register_platform_driver(&schizo_driver);
 }
 
 subsys_initcall(schizo_init);
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index a24af6f7e17f..18ee8b6f4036 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -1019,7 +1019,7 @@ static struct of_platform_driver pci_sun4v_driver = {
 
 static int __init pci_sun4v_init(void)
 {
-	return of_register_driver(&pci_sun4v_driver, &of_bus_type);
+	return of_register_platform_driver(&pci_sun4v_driver);
 }
 
 subsys_initcall(pci_sun4v_init);
diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c
index 9589d8b9b0c1..a4c73edc8975 100644
--- a/arch/sparc/kernel/pmc.c
+++ b/arch/sparc/kernel/pmc.c
@@ -89,7 +89,7 @@ static struct of_platform_driver pmc_driver = {
 
 static int __init pmc_init(void)
 {
-	return of_register_driver(&pmc_driver, &of_bus_type);
+	return of_register_platform_driver(&pmc_driver);
 }
 
 /* This driver is not critical to the boot process
diff --git a/arch/sparc/kernel/power.c b/arch/sparc/kernel/power.c
index 1cfee577f6b5..abc194ed5a78 100644
--- a/arch/sparc/kernel/power.c
+++ b/arch/sparc/kernel/power.c
@@ -70,7 +70,7 @@ static struct of_platform_driver power_driver = {
 
 static int __init power_init(void)
 {
-	return of_register_driver(&power_driver, &of_platform_bus_type);
+	return of_register_platform_driver(&power_driver);
 }
 
 device_initcall(power_init);
diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c
index e404b063be2c..5dc20216952e 100644
--- a/arch/sparc/kernel/time_32.c
+++ b/arch/sparc/kernel/time_32.c
@@ -189,7 +189,7 @@ static struct of_platform_driver clock_driver = {
 /* Probe for the mostek real time clock chip. */
 static int __init clock_init(void)
 {
-	return of_register_driver(&clock_driver, &of_platform_bus_type);
+	return of_register_platform_driver(&clock_driver);
 }
 /* Must be after subsys_initcall() so that busses are probed.  Must
  * be before device_initcall() because things like the RTC driver
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 21e9fcae0668..2423b336a717 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -586,9 +586,9 @@ static int __init clock_init(void)
 	if (tlb_type == hypervisor)
 		return platform_device_register(&rtc_sun4v_device);
 
-	(void) of_register_driver(&rtc_driver, &of_platform_bus_type);
-	(void) of_register_driver(&mostek_driver, &of_platform_bus_type);
-	(void) of_register_driver(&bq4802_driver, &of_platform_bus_type);
+	(void) of_register_platform_driver(&rtc_driver);
+	(void) of_register_platform_driver(&mostek_driver);
+	(void) of_register_platform_driver(&bq4802_driver);
 
 	return 0;
 }
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 38df87b198d5..b7385e077717 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -2795,7 +2795,7 @@ static int __init fore200e_module_init(void)
 	printk(FORE200E "FORE Systems 200E-series ATM driver - version " FORE200E_VERSION "\n");
 
 #ifdef CONFIG_SBUS
-	err = of_register_driver(&fore200e_sba_driver, &of_bus_type);
+	err = of_register_platform_driver(&fore200e_sba_driver);
 	if (err)
 		return err;
 #endif
@@ -2806,7 +2806,7 @@ static int __init fore200e_module_init(void)
 
 #ifdef CONFIG_SBUS
 	if (err)
-		of_unregister_driver(&fore200e_sba_driver);
+		of_unregister_platform_driver(&fore200e_sba_driver);
 #endif
 
 	return err;
@@ -2818,7 +2818,7 @@ static void __exit fore200e_module_cleanup(void)
 	pci_unregister_driver(&fore200e_pca_driver);
 #endif
 #ifdef CONFIG_SBUS
-	of_unregister_driver(&fore200e_sba_driver);
+	of_unregister_platform_driver(&fore200e_sba_driver);
 #endif
 }
 
diff --git a/drivers/char/hw_random/n2-drv.c b/drivers/char/hw_random/n2-drv.c
index 0f9cbf1aaf15..d8a4ca87987d 100644
--- a/drivers/char/hw_random/n2-drv.c
+++ b/drivers/char/hw_random/n2-drv.c
@@ -762,12 +762,12 @@ static struct of_platform_driver n2rng_driver = {
 
 static int __init n2rng_init(void)
 {
-	return of_register_driver(&n2rng_driver, &of_bus_type);
+	return of_register_platform_driver(&n2rng_driver);
 }
 
 static void __exit n2rng_exit(void)
 {
-	of_unregister_driver(&n2rng_driver);
+	of_unregister_platform_driver(&n2rng_driver);
 }
 
 module_init(n2rng_init);
diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c
index 23163fda5035..34ac8ac8aaec 100644
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -2070,20 +2070,20 @@ static struct of_platform_driver n2_mau_driver = {
 
 static int __init n2_init(void)
 {
-	int err = of_register_driver(&n2_crypto_driver, &of_bus_type);
+	int err = of_register_platform_driver(&n2_crypto_driver);
 
 	if (!err) {
-		err = of_register_driver(&n2_mau_driver, &of_bus_type);
+		err = of_register_platform_driver(&n2_mau_driver);
 		if (err)
-			of_unregister_driver(&n2_crypto_driver);
+			of_unregister_platform_driver(&n2_crypto_driver);
 	}
 	return err;
 }
 
 static void __exit n2_exit(void)
 {
-	of_unregister_driver(&n2_mau_driver);
-	of_unregister_driver(&n2_crypto_driver);
+	of_unregister_platform_driver(&n2_mau_driver);
+	of_unregister_platform_driver(&n2_crypto_driver);
 }
 
 module_init(n2_init);
diff --git a/drivers/hwmon/ultra45_env.c b/drivers/hwmon/ultra45_env.c
index 5da5942cf970..89643261ccdb 100644
--- a/drivers/hwmon/ultra45_env.c
+++ b/drivers/hwmon/ultra45_env.c
@@ -311,12 +311,12 @@ static struct of_platform_driver env_driver = {
 
 static int __init env_init(void)
 {
-	return of_register_driver(&env_driver, &of_bus_type);
+	return of_register_platform_driver(&env_driver);
 }
 
 static void __exit env_exit(void)
 {
-	of_unregister_driver(&env_driver);
+	of_unregister_platform_driver(&env_driver);
 }
 
 module_init(env_init);
diff --git a/drivers/input/misc/sparcspkr.c b/drivers/input/misc/sparcspkr.c
index 1dacae4b43f0..f3bb92e9755f 100644
--- a/drivers/input/misc/sparcspkr.c
+++ b/drivers/input/misc/sparcspkr.c
@@ -353,14 +353,12 @@ static struct of_platform_driver grover_beep_driver = {
 
 static int __init sparcspkr_init(void)
 {
-	int err = of_register_driver(&bbc_beep_driver,
-				     &of_platform_bus_type);
+	int err = of_register_platform_driver(&bbc_beep_driver);
 
 	if (!err) {
-		err = of_register_driver(&grover_beep_driver,
-					 &of_platform_bus_type);
+		err = of_register_platform_driver(&grover_beep_driver);
 		if (err)
-			of_unregister_driver(&bbc_beep_driver);
+			of_unregister_platform_driver(&bbc_beep_driver);
 	}
 
 	return err;
@@ -368,8 +366,8 @@ static int __init sparcspkr_init(void)
 
 static void __exit sparcspkr_exit(void)
 {
-	of_unregister_driver(&bbc_beep_driver);
-	of_unregister_driver(&grover_beep_driver);
+	of_unregister_platform_driver(&bbc_beep_driver);
+	of_unregister_platform_driver(&grover_beep_driver);
 }
 
 module_init(sparcspkr_init);
diff --git a/drivers/input/serio/i8042-sparcio.h b/drivers/input/serio/i8042-sparcio.h
index c7d50ff43fca..cb2a24b94746 100644
--- a/drivers/input/serio/i8042-sparcio.h
+++ b/drivers/input/serio/i8042-sparcio.h
@@ -116,8 +116,7 @@ static int __init i8042_platform_init(void)
 		if (!kbd_iobase)
 			return -ENODEV;
 	} else {
-		int err = of_register_driver(&sparc_i8042_driver,
-					     &of_bus_type);
+		int err = of_register_platform_driver(&sparc_i8042_driver);
 		if (err)
 			return err;
 
@@ -141,7 +140,7 @@ static inline void i8042_platform_exit(void)
 	struct device_node *root = of_find_node_by_path("/");
 
 	if (strcmp(root->name, "SUNW,JavaStation-1"))
-		of_unregister_driver(&sparc_i8042_driver);
+		of_unregister_platform_driver(&sparc_i8042_driver);
 }
 
 #else /* !CONFIG_PCI */
diff --git a/drivers/mtd/maps/sun_uflash.c b/drivers/mtd/maps/sun_uflash.c
index 0391c2527bd7..8984236a8d0a 100644
--- a/drivers/mtd/maps/sun_uflash.c
+++ b/drivers/mtd/maps/sun_uflash.c
@@ -160,12 +160,12 @@ static struct of_platform_driver uflash_driver = {
 
 static int __init uflash_init(void)
 {
-	return of_register_driver(&uflash_driver, &of_bus_type);
+	return of_register_platform_driver(&uflash_driver);
 }
 
 static void __exit uflash_exit(void)
 {
-	of_unregister_driver(&uflash_driver);
+	of_unregister_platform_driver(&uflash_driver);
 }
 
 module_init(uflash_init);
diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index b150c102ca5a..f10476fb2623 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -2339,11 +2339,11 @@ static int __devinit emac_wait_deps(struct emac_instance *dev)
 		deps[EMAC_DEP_MDIO_IDX].phandle = dev->mdio_ph;
 	if (dev->blist && dev->blist > emac_boot_list)
 		deps[EMAC_DEP_PREV_IDX].phandle = 0xffffffffu;
-	bus_register_notifier(&of_platform_bus_type, &emac_of_bus_notifier);
+	bus_register_notifier(&platform_bus_type, &emac_of_bus_notifier);
 	wait_event_timeout(emac_probe_wait,
 			   emac_check_deps(dev, deps),
 			   EMAC_PROBE_DEP_TIMEOUT);
-	bus_unregister_notifier(&of_platform_bus_type, &emac_of_bus_notifier);
+	bus_unregister_notifier(&platform_bus_type, &emac_of_bus_notifier);
 	err = emac_check_deps(dev, deps) ? 0 : -ENODEV;
 	for (i = 0; i < EMAC_DEP_COUNT; i++) {
 		if (deps[i].node)
diff --git a/drivers/net/myri_sbus.c b/drivers/net/myri_sbus.c
index 370d3c17f24c..04e552aa14ec 100644
--- a/drivers/net/myri_sbus.c
+++ b/drivers/net/myri_sbus.c
@@ -1172,12 +1172,12 @@ static struct of_platform_driver myri_sbus_driver = {
 
 static int __init myri_sbus_init(void)
 {
-	return of_register_driver(&myri_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&myri_sbus_driver);
 }
 
 static void __exit myri_sbus_exit(void)
 {
-	of_unregister_driver(&myri_sbus_driver);
+	of_unregister_platform_driver(&myri_sbus_driver);
 }
 
 module_init(myri_sbus_init);
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index f6ecf6180f72..8cb2b30eccae 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -10251,14 +10251,14 @@ static int __init niu_init(void)
 	niu_debug = netif_msg_init(debug, NIU_MSG_DEFAULT);
 
 #ifdef CONFIG_SPARC64
-	err = of_register_driver(&niu_of_driver, &of_bus_type);
+	err = of_register_platform_driver(&niu_of_driver);
 #endif
 
 	if (!err) {
 		err = pci_register_driver(&niu_pci_driver);
 #ifdef CONFIG_SPARC64
 		if (err)
-			of_unregister_driver(&niu_of_driver);
+			of_unregister_platform_driver(&niu_of_driver);
 #endif
 	}
 
@@ -10269,7 +10269,7 @@ static void __exit niu_exit(void)
 {
 	pci_unregister_driver(&niu_pci_driver);
 #ifdef CONFIG_SPARC64
-	of_unregister_driver(&niu_of_driver);
+	of_unregister_platform_driver(&niu_of_driver);
 #endif
 }
 
diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c
index 0b10d24de051..09c071bd6ad4 100644
--- a/drivers/net/sunbmac.c
+++ b/drivers/net/sunbmac.c
@@ -1301,12 +1301,12 @@ static struct of_platform_driver bigmac_sbus_driver = {
 
 static int __init bigmac_init(void)
 {
-	return of_register_driver(&bigmac_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&bigmac_sbus_driver);
 }
 
 static void __exit bigmac_exit(void)
 {
-	of_unregister_driver(&bigmac_sbus_driver);
+	of_unregister_platform_driver(&bigmac_sbus_driver);
 }
 
 module_init(bigmac_init);
diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c
index 0a63ebef86a0..eec443f64079 100644
--- a/drivers/net/sunhme.c
+++ b/drivers/net/sunhme.c
@@ -3304,7 +3304,7 @@ static int __init happy_meal_sbus_init(void)
 {
 	int err;
 
-	err = of_register_driver(&hme_sbus_driver, &of_bus_type);
+	err = of_register_platform_driver(&hme_sbus_driver);
 	if (!err)
 		err = quattro_sbus_register_irqs();
 
@@ -3313,7 +3313,7 @@ static int __init happy_meal_sbus_init(void)
 
 static void happy_meal_sbus_exit(void)
 {
-	of_unregister_driver(&hme_sbus_driver);
+	of_unregister_platform_driver(&hme_sbus_driver);
 	quattro_sbus_free_irqs();
 
 	while (qfe_sbus_list) {
diff --git a/drivers/net/sunlance.c b/drivers/net/sunlance.c
index c6bfdad6c0c8..ee364fa75634 100644
--- a/drivers/net/sunlance.c
+++ b/drivers/net/sunlance.c
@@ -1558,12 +1558,12 @@ static struct of_platform_driver sunlance_sbus_driver = {
 /* Find all the lance cards on the system and initialize them */
 static int __init sparc_lance_init(void)
 {
-	return of_register_driver(&sunlance_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&sunlance_sbus_driver);
 }
 
 static void __exit sparc_lance_exit(void)
 {
-	of_unregister_driver(&sunlance_sbus_driver);
+	of_unregister_platform_driver(&sunlance_sbus_driver);
 }
 
 module_init(sparc_lance_init);
diff --git a/drivers/net/sunqe.c b/drivers/net/sunqe.c
index 446517487084..5f84a5dadedd 100644
--- a/drivers/net/sunqe.c
+++ b/drivers/net/sunqe.c
@@ -988,12 +988,12 @@ static struct of_platform_driver qec_sbus_driver = {
 
 static int __init qec_init(void)
 {
-	return of_register_driver(&qec_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&qec_sbus_driver);
 }
 
 static void __exit qec_exit(void)
 {
-	of_unregister_driver(&qec_sbus_driver);
+	of_unregister_platform_driver(&qec_sbus_driver);
 
 	while (root_qec_dev) {
 		struct sunqec *next = root_qec_dev->next_module;
diff --git a/drivers/parport/parport_sunbpp.c b/drivers/parport/parport_sunbpp.c
index 3cdfe96e8999..210a6441a066 100644
--- a/drivers/parport/parport_sunbpp.c
+++ b/drivers/parport/parport_sunbpp.c
@@ -393,12 +393,12 @@ static struct of_platform_driver bpp_sbus_driver = {
 
 static int __init parport_sunbpp_init(void)
 {
-	return of_register_driver(&bpp_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&bpp_sbus_driver);
 }
 
 static void __exit parport_sunbpp_exit(void)
 {
-	of_unregister_driver(&bpp_sbus_driver);
+	of_unregister_platform_driver(&bpp_sbus_driver);
 }
 
 MODULE_AUTHOR("Derrick J Brashear");
diff --git a/drivers/sbus/char/bbc_i2c.c b/drivers/sbus/char/bbc_i2c.c
index 40d7a1fc69af..3e89c313e98d 100644
--- a/drivers/sbus/char/bbc_i2c.c
+++ b/drivers/sbus/char/bbc_i2c.c
@@ -425,12 +425,12 @@ static struct of_platform_driver bbc_i2c_driver = {
 
 static int __init bbc_i2c_init(void)
 {
-	return of_register_driver(&bbc_i2c_driver, &of_bus_type);
+	return of_register_platform_driver(&bbc_i2c_driver);
 }
 
 static void __exit bbc_i2c_exit(void)
 {
-	of_unregister_driver(&bbc_i2c_driver);
+	of_unregister_platform_driver(&bbc_i2c_driver);
 }
 
 module_init(bbc_i2c_init);
diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c
index 7baf1b644039..8fd362e7fa65 100644
--- a/drivers/sbus/char/display7seg.c
+++ b/drivers/sbus/char/display7seg.c
@@ -277,12 +277,12 @@ static struct of_platform_driver d7s_driver = {
 
 static int __init d7s_init(void)
 {
-	return of_register_driver(&d7s_driver, &of_bus_type);
+	return of_register_platform_driver(&d7s_driver);
 }
 
 static void __exit d7s_exit(void)
 {
-	of_unregister_driver(&d7s_driver);
+	of_unregister_platform_driver(&d7s_driver);
 }
 
 module_init(d7s_init);
diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c
index c8166ecf5276..2c76f700265b 100644
--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -1142,12 +1142,12 @@ static struct of_platform_driver envctrl_driver = {
 
 static int __init envctrl_init(void)
 {
-	return of_register_driver(&envctrl_driver, &of_bus_type);
+	return of_register_platform_driver(&envctrl_driver);
 }
 
 static void __exit envctrl_exit(void)
 {
-	of_unregister_driver(&envctrl_driver);
+	of_unregister_platform_driver(&envctrl_driver);
 }
 
 module_init(envctrl_init);
diff --git a/drivers/sbus/char/flash.c b/drivers/sbus/char/flash.c
index 368d66294d83..d79f386c3486 100644
--- a/drivers/sbus/char/flash.c
+++ b/drivers/sbus/char/flash.c
@@ -218,12 +218,12 @@ static struct of_platform_driver flash_driver = {
 
 static int __init flash_init(void)
 {
-	return of_register_driver(&flash_driver, &of_bus_type);
+	return of_register_platform_driver(&flash_driver);
 }
 
 static void __exit flash_cleanup(void)
 {
-	of_unregister_driver(&flash_driver);
+	of_unregister_platform_driver(&flash_driver);
 }
 
 module_init(flash_init);
diff --git a/drivers/sbus/char/uctrl.c b/drivers/sbus/char/uctrl.c
index b8b40e9eca79..57f0612bb013 100644
--- a/drivers/sbus/char/uctrl.c
+++ b/drivers/sbus/char/uctrl.c
@@ -437,12 +437,12 @@ static struct of_platform_driver uctrl_driver = {
 
 static int __init uctrl_init(void)
 {
-	return of_register_driver(&uctrl_driver, &of_bus_type);
+	return of_register_platform_driver(&uctrl_driver);
 }
 
 static void __exit uctrl_exit(void)
 {
-	of_unregister_driver(&uctrl_driver);
+	of_unregister_platform_driver(&uctrl_driver);
 }
 
 module_init(uctrl_init);
diff --git a/drivers/scsi/qlogicpti.c b/drivers/scsi/qlogicpti.c
index 3f5b5411e6bc..53d7ed0dc169 100644
--- a/drivers/scsi/qlogicpti.c
+++ b/drivers/scsi/qlogicpti.c
@@ -1467,12 +1467,12 @@ static struct of_platform_driver qpti_sbus_driver = {
 
 static int __init qpti_init(void)
 {
-	return of_register_driver(&qpti_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&qpti_sbus_driver);
 }
 
 static void __exit qpti_exit(void)
 {
-	of_unregister_driver(&qpti_sbus_driver);
+	of_unregister_platform_driver(&qpti_sbus_driver);
 }
 
 MODULE_DESCRIPTION("QlogicISP SBUS driver");
diff --git a/drivers/scsi/sun_esp.c b/drivers/scsi/sun_esp.c
index ddc221acd14c..89ba6fe02f80 100644
--- a/drivers/scsi/sun_esp.c
+++ b/drivers/scsi/sun_esp.c
@@ -644,12 +644,12 @@ static struct of_platform_driver esp_sbus_driver = {
 
 static int __init sunesp_init(void)
 {
-	return of_register_driver(&esp_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&esp_sbus_driver);
 }
 
 static void __exit sunesp_exit(void)
 {
-	of_unregister_driver(&esp_sbus_driver);
+	of_unregister_platform_driver(&esp_sbus_driver);
 }
 
 MODULE_DESCRIPTION("Sun ESP SCSI driver");
diff --git a/drivers/serial/sunhv.c b/drivers/serial/sunhv.c
index 36e244867dd8..a779e22d213e 100644
--- a/drivers/serial/sunhv.c
+++ b/drivers/serial/sunhv.c
@@ -644,12 +644,12 @@ static int __init sunhv_init(void)
 	if (tlb_type != hypervisor)
 		return -ENODEV;
 
-	return of_register_driver(&hv_driver, &of_bus_type);
+	return of_register_platform_driver(&hv_driver);
 }
 
 static void __exit sunhv_exit(void)
 {
-	of_unregister_driver(&hv_driver);
+	of_unregister_platform_driver(&hv_driver);
 }
 
 module_init(sunhv_init);
diff --git a/drivers/serial/sunsab.c b/drivers/serial/sunsab.c
index 0a7dd6841ff8..9845fb1cfb1f 100644
--- a/drivers/serial/sunsab.c
+++ b/drivers/serial/sunsab.c
@@ -1130,12 +1130,12 @@ static int __init sunsab_init(void)
 		}
 	}
 
-	return of_register_driver(&sab_driver, &of_bus_type);
+	return of_register_platform_driver(&sab_driver);
 }
 
 static void __exit sunsab_exit(void)
 {
-	of_unregister_driver(&sab_driver);
+	of_unregister_platform_driver(&sab_driver);
 	if (sunsab_reg.nr) {
 		sunserial_unregister_minors(&sunsab_reg, sunsab_reg.nr);
 	}
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 5deafc8180b5..3cdf74822db5 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -1586,7 +1586,7 @@ static int __init sunsu_init(void)
 			return err;
 	}
 
-	err = of_register_driver(&su_driver, &of_bus_type);
+	err = of_register_platform_driver(&su_driver);
 	if (err && num_uart)
 		sunserial_unregister_minors(&sunsu_reg, num_uart);
 
diff --git a/drivers/serial/sunzilog.c b/drivers/serial/sunzilog.c
index fcbe20d48039..d1e6bcb59546 100644
--- a/drivers/serial/sunzilog.c
+++ b/drivers/serial/sunzilog.c
@@ -1576,7 +1576,7 @@ static int __init sunzilog_init(void)
 			goto out_free_tables;
 	}
 
-	err = of_register_driver(&zs_driver, &of_bus_type);
+	err = of_register_platform_driver(&zs_driver);
 	if (err)
 		goto out_unregister_uart;
 
@@ -1604,7 +1604,7 @@ out:
 	return err;
 
 out_unregister_driver:
-	of_unregister_driver(&zs_driver);
+	of_unregister_platform_driver(&zs_driver);
 
 out_unregister_uart:
 	if (num_sunzilog) {
@@ -1619,7 +1619,7 @@ out_free_tables:
 
 static void __exit sunzilog_exit(void)
 {
-	of_unregister_driver(&zs_driver);
+	of_unregister_platform_driver(&zs_driver);
 
 	if (zilog_irq != -1) {
 		struct uart_sunzilog_port *up = sunzilog_irq_chain;
diff --git a/drivers/video/bw2.c b/drivers/video/bw2.c
index 09f1b9b462f4..c7796637bafd 100644
--- a/drivers/video/bw2.c
+++ b/drivers/video/bw2.c
@@ -390,12 +390,12 @@ static int __init bw2_init(void)
 	if (fb_get_options("bw2fb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&bw2_driver, &of_bus_type);
+	return of_register_platform_driver(&bw2_driver);
 }
 
 static void __exit bw2_exit(void)
 {
-	of_unregister_driver(&bw2_driver);
+	of_unregister_platform_driver(&bw2_driver);
 }
 
 module_init(bw2_init);
diff --git a/drivers/video/cg14.c b/drivers/video/cg14.c
index e5dc2241194f..d09fde8beb69 100644
--- a/drivers/video/cg14.c
+++ b/drivers/video/cg14.c
@@ -610,12 +610,12 @@ static int __init cg14_init(void)
 	if (fb_get_options("cg14fb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&cg14_driver, &of_bus_type);
+	return of_register_platform_driver(&cg14_driver);
 }
 
 static void __exit cg14_exit(void)
 {
-	of_unregister_driver(&cg14_driver);
+	of_unregister_platform_driver(&cg14_driver);
 }
 
 module_init(cg14_init);
diff --git a/drivers/video/cg3.c b/drivers/video/cg3.c
index 558d73a948a0..64aa29809fb9 100644
--- a/drivers/video/cg3.c
+++ b/drivers/video/cg3.c
@@ -477,12 +477,12 @@ static int __init cg3_init(void)
 	if (fb_get_options("cg3fb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&cg3_driver, &of_bus_type);
+	return of_register_platform_driver(&cg3_driver);
 }
 
 static void __exit cg3_exit(void)
 {
-	of_unregister_driver(&cg3_driver);
+	of_unregister_platform_driver(&cg3_driver);
 }
 
 module_init(cg3_init);
diff --git a/drivers/video/cg6.c b/drivers/video/cg6.c
index 480d761a27a8..2389a719dcc7 100644
--- a/drivers/video/cg6.c
+++ b/drivers/video/cg6.c
@@ -870,12 +870,12 @@ static int __init cg6_init(void)
 	if (fb_get_options("cg6fb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&cg6_driver, &of_bus_type);
+	return of_register_platform_driver(&cg6_driver);
 }
 
 static void __exit cg6_exit(void)
 {
-	of_unregister_driver(&cg6_driver);
+	of_unregister_platform_driver(&cg6_driver);
 }
 
 module_init(cg6_init);
diff --git a/drivers/video/ffb.c b/drivers/video/ffb.c
index 95c0227f47fc..f6ecfab296d3 100644
--- a/drivers/video/ffb.c
+++ b/drivers/video/ffb.c
@@ -1067,12 +1067,12 @@ static int __init ffb_init(void)
 	if (fb_get_options("ffb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&ffb_driver, &of_bus_type);
+	return of_register_platform_driver(&ffb_driver);
 }
 
 static void __exit ffb_exit(void)
 {
-	of_unregister_driver(&ffb_driver);
+	of_unregister_platform_driver(&ffb_driver);
 }
 
 module_init(ffb_init);
diff --git a/drivers/video/leo.c b/drivers/video/leo.c
index 9e8bf7d5e249..ad677637ffbb 100644
--- a/drivers/video/leo.c
+++ b/drivers/video/leo.c
@@ -677,12 +677,12 @@ static int __init leo_init(void)
 	if (fb_get_options("leofb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&leo_driver, &of_bus_type);
+	return of_register_platform_driver(&leo_driver);
 }
 
 static void __exit leo_exit(void)
 {
-	of_unregister_driver(&leo_driver);
+	of_unregister_platform_driver(&leo_driver);
 }
 
 module_init(leo_init);
diff --git a/drivers/video/p9100.c b/drivers/video/p9100.c
index 6552751e81aa..688b055abab2 100644
--- a/drivers/video/p9100.c
+++ b/drivers/video/p9100.c
@@ -367,12 +367,12 @@ static int __init p9100_init(void)
 	if (fb_get_options("p9100fb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&p9100_driver, &of_bus_type);
+	return of_register_platform_driver(&p9100_driver);
 }
 
 static void __exit p9100_exit(void)
 {
-	of_unregister_driver(&p9100_driver);
+	of_unregister_platform_driver(&p9100_driver);
 }
 
 module_init(p9100_init);
diff --git a/drivers/video/sunxvr1000.c b/drivers/video/sunxvr1000.c
index 489b44e8db81..7288934c0d49 100644
--- a/drivers/video/sunxvr1000.c
+++ b/drivers/video/sunxvr1000.c
@@ -213,12 +213,12 @@ static int __init gfb_init(void)
 	if (fb_get_options("gfb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&gfb_driver, &of_bus_type);
+	return of_register_platform_driver(&gfb_driver);
 }
 
 static void __exit gfb_exit(void)
 {
-	of_unregister_driver(&gfb_driver);
+	of_unregister_platform_driver(&gfb_driver);
 }
 
 module_init(gfb_init);
diff --git a/drivers/video/tcx.c b/drivers/video/tcx.c
index cc039b33d2d8..f375e0db6776 100644
--- a/drivers/video/tcx.c
+++ b/drivers/video/tcx.c
@@ -526,12 +526,12 @@ static int __init tcx_init(void)
 	if (fb_get_options("tcxfb", NULL))
 		return -ENODEV;
 
-	return of_register_driver(&tcx_driver, &of_bus_type);
+	return of_register_platform_driver(&tcx_driver);
 }
 
 static void __exit tcx_exit(void)
 {
-	of_unregister_driver(&tcx_driver);
+	of_unregister_platform_driver(&tcx_driver);
 }
 
 module_init(tcx_init);
diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c
index 8c03fd71693e..30a2512fd52e 100644
--- a/drivers/watchdog/cpwd.c
+++ b/drivers/watchdog/cpwd.c
@@ -688,12 +688,12 @@ static struct of_platform_driver cpwd_driver = {
 
 static int __init cpwd_init(void)
 {
-	return of_register_driver(&cpwd_driver, &of_bus_type);
+	return of_register_platform_driver(&cpwd_driver);
 }
 
 static void __exit cpwd_exit(void)
 {
-	of_unregister_driver(&cpwd_driver);
+	of_unregister_platform_driver(&cpwd_driver);
 }
 
 module_init(cpwd_init);
diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c
index 5dceeddc8859..4082b4ace1fc 100644
--- a/drivers/watchdog/riowd.c
+++ b/drivers/watchdog/riowd.c
@@ -250,12 +250,12 @@ static struct of_platform_driver riowd_driver = {
 
 static int __init riowd_init(void)
 {
-	return of_register_driver(&riowd_driver, &of_bus_type);
+	return of_register_platform_driver(&riowd_driver);
 }
 
 static void __exit riowd_exit(void)
 {
-	of_unregister_driver(&riowd_driver);
+	of_unregister_platform_driver(&riowd_driver);
 }
 
 module_init(riowd_init);
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 133ecf31a60f..429513ae8f6e 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -19,12 +19,6 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 
-/*
- * of_platform_bus_type isn't it's own bus anymore.  It's now just an alias
- * for the platform bus.
- */
-#define of_platform_bus_type platform_bus_type
-
 extern const struct of_device_id of_default_bus_ids[];
 
 /*
diff --git a/sound/sparc/amd7930.c b/sound/sparc/amd7930.c
index 43c63d441087..9eb1a4e0363b 100644
--- a/sound/sparc/amd7930.c
+++ b/sound/sparc/amd7930.c
@@ -1075,7 +1075,7 @@ static struct of_platform_driver amd7930_sbus_driver = {
 
 static int __init amd7930_init(void)
 {
-	return of_register_driver(&amd7930_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&amd7930_sbus_driver);
 }
 
 static void __exit amd7930_exit(void)
@@ -1092,7 +1092,7 @@ static void __exit amd7930_exit(void)
 
 	amd7930_list = NULL;
 
-	of_unregister_driver(&amd7930_sbus_driver);
+	of_unregister_platform_driver(&amd7930_sbus_driver);
 }
 
 module_init(amd7930_init);
diff --git a/sound/sparc/cs4231.c b/sound/sparc/cs4231.c
index f7f05c246303..68570ee2c9bb 100644
--- a/sound/sparc/cs4231.c
+++ b/sound/sparc/cs4231.c
@@ -2120,12 +2120,12 @@ static struct of_platform_driver cs4231_driver = {
 
 static int __init cs4231_init(void)
 {
-	return of_register_driver(&cs4231_driver, &of_bus_type);
+	return of_register_platform_driver(&cs4231_driver);
 }
 
 static void __exit cs4231_exit(void)
 {
-	of_unregister_driver(&cs4231_driver);
+	of_unregister_platform_driver(&cs4231_driver);
 }
 
 module_init(cs4231_init);
diff --git a/sound/sparc/dbri.c b/sound/sparc/dbri.c
index 491ce71c84b6..c421901c48d0 100644
--- a/sound/sparc/dbri.c
+++ b/sound/sparc/dbri.c
@@ -2699,12 +2699,12 @@ static struct of_platform_driver dbri_sbus_driver = {
 /* Probe for the dbri chip and then attach the driver. */
 static int __init dbri_init(void)
 {
-	return of_register_driver(&dbri_sbus_driver, &of_bus_type);
+	return of_register_platform_driver(&dbri_sbus_driver);
 }
 
 static void __exit dbri_exit(void)
 {
-	of_unregister_driver(&dbri_sbus_driver);
+	of_unregister_platform_driver(&dbri_sbus_driver);
 }
 
 module_init(dbri_init);
-- 
cgit v1.2.3-59-g8ed1b


From 129ac799ad627b1e08382739f9e8cd75d7477fa3 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 29 Jun 2010 09:26:53 -0700
Subject: of: remove asm/of_platform.h

Only thing left in it is of_instantiate_rtc() which can be moved to
asm/prom.h on PowerPC and is unused in microblaze.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
---
 arch/microblaze/include/asm/of_platform.h | 19 -------------------
 arch/powerpc/include/asm/of_platform.h    | 16 ----------------
 arch/powerpc/include/asm/prom.h           |  2 ++
 arch/sparc/include/asm/of_platform.h      | 16 ----------------
 include/linux/of_platform.h               |  2 --
 5 files changed, 2 insertions(+), 53 deletions(-)
 delete mode 100644 arch/microblaze/include/asm/of_platform.h
 delete mode 100644 arch/powerpc/include/asm/of_platform.h
 delete mode 100644 arch/sparc/include/asm/of_platform.h

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/of_platform.h b/arch/microblaze/include/asm/of_platform.h
deleted file mode 100644
index 353d8f651e30..000000000000
--- a/arch/microblaze/include/asm/of_platform.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
- *			<benh@kernel.crashing.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _ASM_MICROBLAZE_OF_PLATFORM_H
-#define _ASM_MICROBLAZE_OF_PLATFORM_H
-
-/* This is just here during the transition */
-#include <linux/of_platform.h>
-
-extern void of_instantiate_rtc(void);
-
-#endif /* _ASM_MICROBLAZE_OF_PLATFORM_H */
diff --git a/arch/powerpc/include/asm/of_platform.h b/arch/powerpc/include/asm/of_platform.h
deleted file mode 100644
index d506aa61db83..000000000000
--- a/arch/powerpc/include/asm/of_platform.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _ASM_POWERPC_OF_PLATFORM_H
-#define _ASM_POWERPC_OF_PLATFORM_H
-/*
- *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
- *			 <benh@kernel.crashing.org>
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-extern void of_instantiate_rtc(void);
-
-#endif	/* _ASM_POWERPC_OF_PLATFORM_H */
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index f864722679e8..da7dd634e7ce 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -117,5 +117,7 @@ extern const void *of_get_mac_address(struct device_node *np);
 struct pci_dev;
 extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
 
+extern void of_instantiate_rtc(void);
+
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_PROM_H */
diff --git a/arch/sparc/include/asm/of_platform.h b/arch/sparc/include/asm/of_platform.h
deleted file mode 100644
index 26540ddfc511..000000000000
--- a/arch/sparc/include/asm/of_platform.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef ___ASM_SPARC_OF_PLATFORM_H
-#define ___ASM_SPARC_OF_PLATFORM_H
-/*
- *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
- *			 <benh@kernel.crashing.org>
- *    Modified for Sparc by merging parts of asm/of_device.h
- *		by Stephen Rothwell
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#endif
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 429513ae8f6e..a79f59bf61a0 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -52,8 +52,6 @@ extern void of_unregister_platform_driver(struct of_platform_driver *drv);
 extern struct of_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
 					 struct device *parent);
-#include <asm/of_platform.h>
-
 extern struct of_device *of_find_device_by_node(struct device_node *np);
 
 extern int of_bus_type_init(struct bus_type *bus, const char *name);
-- 
cgit v1.2.3-59-g8ed1b


From 295960429675e17ec658320ebb24385727032bed Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Tue, 29 Jun 2010 11:15:54 -0600
Subject: of: remove asm/of_device.h

It is mostly unused now.  Sparc has a few defines left in it, but they
can be moved to other headers.  Removing this header means that new
architectures adding CONFIG_OF support don't need to also add this
header file.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
---
 arch/microblaze/include/asm/of_device.h | 13 -------------
 arch/powerpc/include/asm/of_device.h    |  3 ---
 arch/sparc/include/asm/device.h         |  2 ++
 arch/sparc/include/asm/of_device.h      | 19 -------------------
 arch/sparc/include/asm/prom.h           |  4 ++++
 include/linux/of_device.h               |  2 +-
 6 files changed, 7 insertions(+), 36 deletions(-)
 delete mode 100644 arch/microblaze/include/asm/of_device.h
 delete mode 100644 arch/powerpc/include/asm/of_device.h
 delete mode 100644 arch/sparc/include/asm/of_device.h

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/of_device.h b/arch/microblaze/include/asm/of_device.h
deleted file mode 100644
index 47e8d42aee8f..000000000000
--- a/arch/microblaze/include/asm/of_device.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (C) 2007-2008 Michal Simek <monstr@monstr.eu>
- *
- * based on PowerPC of_device.h
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#ifndef _ASM_MICROBLAZE_OF_DEVICE_H
-#define _ASM_MICROBLAZE_OF_DEVICE_H
-#endif /* _ASM_MICROBLAZE_OF_DEVICE_H */
diff --git a/arch/powerpc/include/asm/of_device.h b/arch/powerpc/include/asm/of_device.h
deleted file mode 100644
index 04f76717f82c..000000000000
--- a/arch/powerpc/include/asm/of_device.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#ifndef _ASM_POWERPC_OF_DEVICE_H
-#define _ASM_POWERPC_OF_DEVICE_H
-#endif /* _ASM_POWERPC_OF_DEVICE_H */
diff --git a/arch/sparc/include/asm/device.h b/arch/sparc/include/asm/device.h
index fb220e482039..daa6a8a5e9cd 100644
--- a/arch/sparc/include/asm/device.h
+++ b/arch/sparc/include/asm/device.h
@@ -19,6 +19,8 @@ struct dev_archdata {
 	int			numa_node;
 };
 
+extern void of_propagate_archdata(struct platform_device *bus);
+
 struct pdev_archdata {
 	struct resource		resource[PROMREG_MAX];
 	unsigned int		irqs[PROMINTR_MAX];
diff --git a/arch/sparc/include/asm/of_device.h b/arch/sparc/include/asm/of_device.h
deleted file mode 100644
index 22b9828fe693..000000000000
--- a/arch/sparc/include/asm/of_device.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _ASM_SPARC_OF_DEVICE_H
-#define _ASM_SPARC_OF_DEVICE_H
-#ifdef __KERNEL__
-
-#include <linux/device.h>
-#include <linux/of.h>
-#include <linux/mod_devicetable.h>
-#include <asm/openprom.h>
-
-extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name);
-extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size);
-
-extern void of_propagate_archdata(struct of_device *bus);
-
-/* This is just here during the transition */
-#include <linux/of_platform.h>
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_SPARC_OF_DEVICE_H */
diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index ac695742df85..d35df5ace18a 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -51,6 +51,10 @@ extern void prom_build_devicetree(void);
 extern void of_populate_present_mask(void);
 extern void of_fill_in_cpu_data(void);
 
+struct resource;
+extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name);
+extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size);
+
 /* These routines are here to provide compatibility with how powerpc
  * handles IRQ mapping for OF device nodes.  We precompute and permanently
  * register them in the of_device objects, whereas powerpc computes them
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 8cd1fe7864e3..0f1911994559 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -16,12 +16,12 @@
  */
 #define of_device platform_device
 #include <linux/platform_device.h>
+#include <linux/of_platform.h> /* temporary until merge */
 
 #ifdef CONFIG_OF_DEVICE
 #include <linux/device.h>
 #include <linux/of.h>
 #include <linux/mod_devicetable.h>
-#include <asm/of_device.h>
 
 #define	to_of_device(d) container_of(d, struct of_device, dev)
 
-- 
cgit v1.2.3-59-g8ed1b


From 94a0cb1fc61ab7a0d47d268a7764374efeb2160b Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 22 Jul 2010 13:59:23 -0600
Subject: of/device: Replace of_device with platform_device in includes and
 core code

of_device is currently just an #define alias to platform_device until it
gets removed entirely.  This patch removes references to it from the
include directories and the core drivers/of code.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
---
 arch/powerpc/include/asm/macio.h   |  2 +-
 arch/sparc/include/asm/floppy_64.h |  6 +++---
 arch/sparc/include/asm/parport.h   |  4 ++--
 drivers/of/device.c                | 22 +++++++++++-----------
 drivers/of/platform.c              | 24 ++++++++++++------------
 include/linux/of_device.h          | 10 +++++-----
 include/linux/of_platform.h        | 16 ++++++++--------
 7 files changed, 42 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/macio.h b/arch/powerpc/include/asm/macio.h
index 675e159b5ef4..7ab82c825a03 100644
--- a/arch/powerpc/include/asm/macio.h
+++ b/arch/powerpc/include/asm/macio.h
@@ -38,7 +38,7 @@ struct macio_dev
 {
 	struct macio_bus	*bus;		/* macio bus this device is on */
 	struct macio_dev	*media_bay;	/* Device is part of a media bay */
-	struct of_device	ofdev;
+	struct platform_device	ofdev;
 	struct device_dma_parameters dma_parms; /* ide needs that */
 	int			n_resources;
 	struct resource		resource[MACIO_DEV_COUNT_RESOURCES];
diff --git a/arch/sparc/include/asm/floppy_64.h b/arch/sparc/include/asm/floppy_64.h
index 4f5bde638f72..6597ce874d78 100644
--- a/arch/sparc/include/asm/floppy_64.h
+++ b/arch/sparc/include/asm/floppy_64.h
@@ -43,7 +43,7 @@ struct sun_flpy_controller {
 /* You'll only ever find one controller on an Ultra anyways. */
 static struct sun_flpy_controller *sun_fdc = (struct sun_flpy_controller *)-1;
 unsigned long fdc_status;
-static struct of_device *floppy_op = NULL;
+static struct platform_device *floppy_op = NULL;
 
 struct sun_floppy_ops {
 	unsigned char	(*fd_inb) (unsigned long port);
@@ -548,7 +548,7 @@ static unsigned long __init sun_floppy_init(void)
 {
 	static int initialized = 0;
 	struct device_node *dp;
-	struct of_device *op;
+	struct platform_device *op;
 	const char *prop;
 	char state[128];
 
@@ -661,7 +661,7 @@ static unsigned long __init sun_floppy_init(void)
 		config = 0;
 		for (dp = ebus_dp->child; dp; dp = dp->sibling) {
 			if (!strcmp(dp->name, "ecpp")) {
-				struct of_device *ecpp_op;
+				struct platform_device *ecpp_op;
 
 				ecpp_op = of_find_device_by_node(dp);
 				if (ecpp_op)
diff --git a/arch/sparc/include/asm/parport.h b/arch/sparc/include/asm/parport.h
index 4891fbce1114..4f7afa01b2ae 100644
--- a/arch/sparc/include/asm/parport.h
+++ b/arch/sparc/include/asm/parport.h
@@ -103,7 +103,7 @@ static inline unsigned int get_dma_residue(unsigned int dmanr)
 	return ebus_dma_residue(&sparc_ebus_dmas[dmanr].info);
 }
 
-static int __devinit ecpp_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit ecpp_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	unsigned long base = op->resource[0].start;
 	unsigned long config = op->resource[1].start;
@@ -192,7 +192,7 @@ out_err:
 	return err;
 }
 
-static int __devexit ecpp_remove(struct of_device *op)
+static int __devexit ecpp_remove(struct platform_device *op)
 {
 	struct parport *p = dev_get_drvdata(&op->dev);
 	int slot = p->dma;
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 12a44b493511..0d8a0644f540 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -26,7 +26,7 @@ const struct of_device_id *of_match_device(const struct of_device_id *matches,
 }
 EXPORT_SYMBOL(of_match_device);
 
-struct of_device *of_dev_get(struct of_device *dev)
+struct platform_device *of_dev_get(struct platform_device *dev)
 {
 	struct device *tmp;
 
@@ -34,13 +34,13 @@ struct of_device *of_dev_get(struct of_device *dev)
 		return NULL;
 	tmp = get_device(&dev->dev);
 	if (tmp)
-		return to_of_device(tmp);
+		return to_platform_device(tmp);
 	else
 		return NULL;
 }
 EXPORT_SYMBOL(of_dev_get);
 
-void of_dev_put(struct of_device *dev)
+void of_dev_put(struct platform_device *dev)
 {
 	if (dev)
 		put_device(&dev->dev);
@@ -50,18 +50,18 @@ EXPORT_SYMBOL(of_dev_put);
 static ssize_t devspec_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 
-	ofdev = to_of_device(dev);
+	ofdev = to_platform_device(dev);
 	return sprintf(buf, "%s\n", ofdev->dev.of_node->full_name);
 }
 
 static ssize_t name_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 
-	ofdev = to_of_device(dev);
+	ofdev = to_platform_device(dev);
 	return sprintf(buf, "%s\n", ofdev->dev.of_node->name);
 }
 
@@ -90,15 +90,15 @@ struct device_attribute of_platform_device_attrs[] = {
  */
 void of_release_dev(struct device *dev)
 {
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 
-	ofdev = to_of_device(dev);
+	ofdev = to_platform_device(dev);
 	of_node_put(ofdev->dev.of_node);
 	kfree(ofdev);
 }
 EXPORT_SYMBOL(of_release_dev);
 
-int of_device_register(struct of_device *ofdev)
+int of_device_register(struct platform_device *ofdev)
 {
 	BUG_ON(ofdev->dev.of_node == NULL);
 
@@ -119,7 +119,7 @@ int of_device_register(struct of_device *ofdev)
 }
 EXPORT_SYMBOL(of_device_register);
 
-void of_device_unregister(struct of_device *ofdev)
+void of_device_unregister(struct platform_device *ofdev)
 {
 	device_unregister(&ofdev->dev);
 }
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index f3f1ec81ef48..9f3840cdcde5 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -94,11 +94,11 @@ static int of_platform_device_probe(struct device *dev)
 {
 	int error = -ENODEV;
 	struct of_platform_driver *drv;
-	struct of_device *of_dev;
+	struct platform_device *of_dev;
 	const struct of_device_id *match;
 
 	drv = to_of_platform_driver(dev->driver);
-	of_dev = to_of_device(dev);
+	of_dev = to_platform_device(dev);
 
 	if (!drv->probe)
 		return error;
@@ -116,7 +116,7 @@ static int of_platform_device_probe(struct device *dev)
 
 static int of_platform_device_remove(struct device *dev)
 {
-	struct of_device *of_dev = to_of_device(dev);
+	struct platform_device *of_dev = to_platform_device(dev);
 	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
 
 	if (dev->driver && drv->remove)
@@ -126,7 +126,7 @@ static int of_platform_device_remove(struct device *dev)
 
 static void of_platform_device_shutdown(struct device *dev)
 {
-	struct of_device *of_dev = to_of_device(dev);
+	struct platform_device *of_dev = to_platform_device(dev);
 	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
 
 	if (dev->driver && drv->shutdown)
@@ -137,7 +137,7 @@ static void of_platform_device_shutdown(struct device *dev)
 
 static int of_platform_legacy_suspend(struct device *dev, pm_message_t mesg)
 {
-	struct of_device *of_dev = to_of_device(dev);
+	struct platform_device *of_dev = to_platform_device(dev);
 	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
 	int ret = 0;
 
@@ -148,7 +148,7 @@ static int of_platform_legacy_suspend(struct device *dev, pm_message_t mesg)
 
 static int of_platform_legacy_resume(struct device *dev)
 {
-	struct of_device *of_dev = to_of_device(dev);
+	struct platform_device *of_dev = to_platform_device(dev);
 	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
 	int ret = 0;
 
@@ -543,11 +543,11 @@ static void of_device_make_bus_id(struct device *dev)
  * @bus_id: Name to assign to the device.  May be null to use default name.
  * @parent: Parent device.
  */
-struct of_device *of_device_alloc(struct device_node *np,
+struct platform_device *of_device_alloc(struct device_node *np,
 				  const char *bus_id,
 				  struct device *parent)
 {
-	struct of_device *dev;
+	struct platform_device *dev;
 	int rc, i, num_reg = 0, num_irq = 0;
 	struct resource *res, temp_res;
 
@@ -600,11 +600,11 @@ EXPORT_SYMBOL(of_device_alloc);
  * @bus_id: name to assign device
  * @parent: Linux device model parent device.
  */
-struct of_device *of_platform_device_create(struct device_node *np,
+struct platform_device *of_platform_device_create(struct device_node *np,
 					    const char *bus_id,
 					    struct device *parent)
 {
-	struct of_device *dev;
+	struct platform_device *dev;
 
 	dev = of_device_alloc(np, bus_id, parent);
 	if (!dev)
@@ -642,7 +642,7 @@ static int of_platform_bus_create(const struct device_node *bus,
 				  struct device *parent)
 {
 	struct device_node *child;
-	struct of_device *dev;
+	struct platform_device *dev;
 	int rc = 0;
 
 	for_each_child_of_node(bus, child) {
@@ -678,7 +678,7 @@ int of_platform_bus_probe(struct device_node *root,
 			  struct device *parent)
 {
 	struct device_node *child;
-	struct of_device *dev;
+	struct platform_device *dev;
 	int rc = 0;
 
 	if (matches == NULL)
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 0f1911994559..e11a0be7893a 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -39,14 +39,14 @@ static inline int of_driver_match_device(const struct device *dev,
 	return of_match_device(drv->of_match_table, dev) != NULL;
 }
 
-extern struct of_device *of_dev_get(struct of_device *dev);
-extern void of_dev_put(struct of_device *dev);
+extern struct platform_device *of_dev_get(struct platform_device *dev);
+extern void of_dev_put(struct platform_device *dev);
 
-extern int of_device_register(struct of_device *ofdev);
-extern void of_device_unregister(struct of_device *ofdev);
+extern int of_device_register(struct platform_device *ofdev);
+extern void of_device_unregister(struct platform_device *ofdev);
 extern void of_release_dev(struct device *dev);
 
-static inline void of_device_free(struct of_device *dev)
+static inline void of_device_free(struct platform_device *dev)
 {
 	of_release_dev(&dev->dev);
 }
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index a79f59bf61a0..b24c5a5b0426 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -27,13 +27,13 @@ extern const struct of_device_id of_default_bus_ids[];
  */
 struct of_platform_driver
 {
-	int	(*probe)(struct of_device* dev,
+	int	(*probe)(struct platform_device* dev,
 			 const struct of_device_id *match);
-	int	(*remove)(struct of_device* dev);
+	int	(*remove)(struct platform_device* dev);
 
-	int	(*suspend)(struct of_device* dev, pm_message_t state);
-	int	(*resume)(struct of_device* dev);
-	int	(*shutdown)(struct of_device* dev);
+	int	(*suspend)(struct platform_device* dev, pm_message_t state);
+	int	(*resume)(struct platform_device* dev);
+	int	(*shutdown)(struct platform_device* dev);
 
 	struct device_driver	driver;
 	struct platform_driver	platform_driver;
@@ -49,16 +49,16 @@ extern void of_unregister_driver(struct of_platform_driver *drv);
 extern int of_register_platform_driver(struct of_platform_driver *drv);
 extern void of_unregister_platform_driver(struct of_platform_driver *drv);
 
-extern struct of_device *of_device_alloc(struct device_node *np,
+extern struct platform_device *of_device_alloc(struct device_node *np,
 					 const char *bus_id,
 					 struct device *parent);
-extern struct of_device *of_find_device_by_node(struct device_node *np);
+extern struct platform_device *of_find_device_by_node(struct device_node *np);
 
 extern int of_bus_type_init(struct bus_type *bus, const char *name);
 
 #if !defined(CONFIG_SPARC) /* SPARC has its own device registration method */
 /* Platform devices and busses creation */
-extern struct of_device *of_platform_device_create(struct device_node *np,
+extern struct platform_device *of_platform_device_create(struct device_node *np,
 						   const char *bus_id,
 						   struct device *parent);
 
-- 
cgit v1.2.3-59-g8ed1b


From c0dd394ca5e78649b7013c3ce2d6338af9f228f0 Mon Sep 17 00:00:00 2001
From: Jonas Bonn <jonas@southpole.se>
Date: Fri, 23 Jul 2010 20:19:24 +0200
Subject: of: remove of_default_bus_ids

This list used was by only two platforms with all other platforms defining an
own list of valid bus id's to pass to of_platform_bus_probe.  This patch:

i)   copies the default list to the two platforms that depended on it (powerpc)
ii)  remove the usage of of_default_bus_ids in of_platform_bus_probe
iii) removes the definition of the list from all architectures that defined it

Passing a NULL 'matches' parameter to of_platform_bus_probe is still valid; the
function returns no error in that case as the NULL value is equivalent to an
empty list.

Signed-off-by: Jonas Bonn <jonas@southpole.se>
[grant.likely@secretlab.ca: added __initdata annotations, warn on and return error on missing match table, and fix whitespace errors]
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/microblaze/kernel/Makefile           |  2 +-
 arch/microblaze/kernel/of_platform.c      | 49 -------------------------------
 arch/powerpc/kernel/of_platform.c         | 24 ---------------
 arch/powerpc/platforms/cell/qpace_setup.c | 14 ++++++++-
 arch/powerpc/platforms/cell/setup.c       | 14 ++++++++-
 drivers/of/platform.c                     |  4 +--
 include/linux/of_platform.h               |  2 --
 7 files changed, 28 insertions(+), 81 deletions(-)
 delete mode 100644 arch/microblaze/kernel/of_platform.c

(limited to 'include/linux')

diff --git a/arch/microblaze/kernel/Makefile b/arch/microblaze/kernel/Makefile
index 727e2cbff9c6..7fcc5f7b5a46 100644
--- a/arch/microblaze/kernel/Makefile
+++ b/arch/microblaze/kernel/Makefile
@@ -16,7 +16,7 @@ extra-y := head.o vmlinux.lds
 
 obj-y += dma.o exceptions.o \
 	hw_exception_handler.o init_task.o intc.o irq.o \
-	of_platform.o process.o prom.o prom_parse.o ptrace.o \
+	process.o prom.o prom_parse.o ptrace.o \
 	setup.o signal.o sys_microblaze.o timer.o traps.o reset.o
 
 obj-y += cpu/
diff --git a/arch/microblaze/kernel/of_platform.c b/arch/microblaze/kernel/of_platform.c
deleted file mode 100644
index 6cffadbe2fcd..000000000000
--- a/arch/microblaze/kernel/of_platform.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
- *			 <benh@kernel.crashing.org>
- *    and		 Arnd Bergmann, IBM Corp.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#undef DEBUG
-
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mod_devicetable.h>
-#include <linux/pci.h>
-#include <linux/of.h>
-#include <linux/of_device.h>
-#include <linux/of_platform.h>
-
-#include <linux/errno.h>
-#include <linux/topology.h>
-#include <asm/atomic.h>
-
-/*
- * The list of OF IDs below is used for matching bus types in the
- * system whose devices are to be exposed as of_platform_devices.
- *
- * This is the default list valid for most platforms. This file provides
- * functions who can take an explicit list if necessary though
- *
- * The search is always performed recursively looking for children of
- * the provided device_node and recursively if such a children matches
- * a bus type in the list
- */
-
-const struct of_device_id of_default_bus_ids[] = {
-	{ .type = "soc", },
-	{ .compatible = "soc", },
-	{ .type = "plb5", },
-	{ .type = "plb4", },
-	{ .type = "opb", },
-	{ .type = "simple", },
-	{},
-};
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index 760a7af7fdb5..b2c363ef38ad 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -28,30 +28,6 @@
 #include <asm/ppc-pci.h>
 #include <asm/atomic.h>
 
-/*
- * The list of OF IDs below is used for matching bus types in the
- * system whose devices are to be exposed as of_platform_devices.
- *
- * This is the default list valid for most platforms. This file provides
- * functions who can take an explicit list if necessary though
- *
- * The search is always performed recursively looking for children of
- * the provided device_node and recursively if such a children matches
- * a bus type in the list
- */
-
-const struct of_device_id of_default_bus_ids[] = {
-	{ .type = "soc", },
-	{ .compatible = "soc", },
-	{ .type = "spider", },
-	{ .type = "axon", },
-	{ .type = "plb5", },
-	{ .type = "plb4", },
-	{ .type = "opb", },
-	{ .type = "ebc", },
-	{},
-};
-
 #ifdef CONFIG_PPC_OF_PLATFORM_PCI
 
 /* The probing of PCI controllers from of_platform is currently
diff --git a/arch/powerpc/platforms/cell/qpace_setup.c b/arch/powerpc/platforms/cell/qpace_setup.c
index c5ce02e84c8e..1b5749042756 100644
--- a/arch/powerpc/platforms/cell/qpace_setup.c
+++ b/arch/powerpc/platforms/cell/qpace_setup.c
@@ -61,12 +61,24 @@ static void qpace_progress(char *s, unsigned short hex)
 	printk("*** %04x : %s\n", hex, s ? s : "");
 }
 
+static const struct of_device_id qpace_bus_ids[] __initdata = {
+	{ .type = "soc", },
+	{ .compatible = "soc", },
+	{ .type = "spider", },
+	{ .type = "axon", },
+	{ .type = "plb5", },
+	{ .type = "plb4", },
+	{ .type = "opb", },
+	{ .type = "ebc", },
+	{},
+};
+
 static int __init qpace_publish_devices(void)
 {
 	int node;
 
 	/* Publish OF platform devices for southbridge IOs */
-	of_platform_bus_probe(NULL, NULL, NULL);
+	of_platform_bus_probe(NULL, qpace_bus_ids, NULL);
 
 	/* There is no device for the MIC memory controller, thus we create
 	 * a platform device for it to attach the EDAC driver to.
diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c
index 50385db586bd..691995761b3d 100644
--- a/arch/powerpc/platforms/cell/setup.c
+++ b/arch/powerpc/platforms/cell/setup.c
@@ -141,6 +141,18 @@ static int __devinit cell_setup_phb(struct pci_controller *phb)
 	return 0;
 }
 
+static const struct of_device_id cell_bus_ids[] __initdata = {
+	{ .type = "soc", },
+	{ .compatible = "soc", },
+	{ .type = "spider", },
+	{ .type = "axon", },
+	{ .type = "plb5", },
+	{ .type = "plb4", },
+	{ .type = "opb", },
+	{ .type = "ebc", },
+	{},
+};
+
 static int __init cell_publish_devices(void)
 {
 	struct device_node *root = of_find_node_by_path("/");
@@ -148,7 +160,7 @@ static int __init cell_publish_devices(void)
 	int node;
 
 	/* Publish OF platform devices for southbridge IOs */
-	of_platform_bus_probe(NULL, NULL, NULL);
+	of_platform_bus_probe(NULL, cell_bus_ids, NULL);
 
 	/* On spider based blades, we need to manually create the OF
 	 * platform devices for the PCI host bridges
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index f79f40b516c6..033a224a9fda 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -701,9 +701,7 @@ int of_platform_bus_probe(struct device_node *root,
 	struct platform_device *dev;
 	int rc = 0;
 
-	if (matches == NULL)
-		matches = of_default_bus_ids;
-	if (matches == OF_NO_DEEP_PROBE)
+	if (WARN_ON(!matches || matches == OF_NO_DEEP_PROBE))
 		return -EINVAL;
 	if (root == NULL)
 		root = of_find_node_by_path("/");
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index b24c5a5b0426..4e6d989c06df 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -19,8 +19,6 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 
-extern const struct of_device_id of_default_bus_ids[];
-
 /*
  * An of_platform_driver driver is attached to a basic of_device on
  * the "platform bus" (platform_bus_type).
-- 
cgit v1.2.3-59-g8ed1b


From c1f79426e2df5ef96fe3e76de6c7606d15bf390b Mon Sep 17 00:00:00 2001
From: Stefan Assmann <sassmann@redhat.com>
Date: Thu, 22 Jul 2010 02:50:21 +0000
Subject: sysfs: add attribute to indicate hw address assignment type

Add addr_assign_type to struct net_device and expose it via sysfs.
This new attribute has the purpose of giving user-space the ability to
distinguish between different assignment types of MAC addresses.

For example user-space can treat NICs with randomly generated MAC
addresses differently than NICs that have permanent (locally assigned)
MAC addresses.
For the former udev could write a persistent net rule by matching the
device path instead of the MAC address.
There's also the case of devices that 'steal' MAC addresses from slave
devices. In which it is also be beneficial for user-space to be aware
of the fact.

This patch also introduces a helper function to assist adoption of
drivers that generate MAC addresses randomly.

Signed-off-by: Stefan Assmann <sassmann@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 14 ++++++++++++++
 include/linux/netdevice.h   |  6 ++++++
 net/core/net-sysfs.c        |  2 ++
 3 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 3d7a6687d247..848480bc2bf9 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -126,6 +126,20 @@ static inline void random_ether_addr(u8 *addr)
 	addr [0] |= 0x02;	/* set local assignment bit (IEEE802) */
 }
 
+/**
+ * dev_hw_addr_random - Create random MAC and set device flag
+ * @dev: pointer to net_device structure
+ * @addr: Pointer to a six-byte array containing the Ethernet address
+ *
+ * Generate random MAC to be used by a device and set addr_assign_type
+ * so the state can be read by sysfs and be used by udev.
+ */
+static inline void dev_hw_addr_random(struct net_device *dev, u8 *hwaddr)
+{
+	dev->addr_assign_type |= NET_ADDR_RANDOM;
+	random_ether_addr(hwaddr);
+}
+
 /**
  * compare_ether_addr - Compare two Ethernet addresses
  * @addr1: Pointer to a six-byte array containing the Ethernet address
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b6262898ece0..1bca6171b1aa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -66,6 +66,11 @@ struct wireless_dev;
 #define HAVE_FREE_NETDEV		/* free_netdev() */
 #define HAVE_NETDEV_PRIV		/* netdev_priv() */
 
+/* hardware address assignment types */
+#define NET_ADDR_PERM		0	/* address is permanent (default) */
+#define NET_ADDR_RANDOM		1	/* address is generated randomly */
+#define NET_ADDR_STOLEN		2	/* address is stolen from other device */
+
 /* Backlog congestion levels */
 #define NET_RX_SUCCESS		0	/* keep 'em coming, baby */
 #define NET_RX_DROP		1	/* packet dropped */
@@ -919,6 +924,7 @@ struct net_device {
 
 	/* Interface address info. */
 	unsigned char		perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
+	unsigned char		addr_assign_type; /* hw address assignment type */
 	unsigned char		addr_len;	/* hardware address length	*/
 	unsigned short          dev_id;		/* for shared network cards */
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d2b596537d41..af4dfbadf2a0 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -95,6 +95,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
 }
 
 NETDEVICE_SHOW(dev_id, fmt_hex);
+NETDEVICE_SHOW(addr_assign_type, fmt_dec);
 NETDEVICE_SHOW(addr_len, fmt_dec);
 NETDEVICE_SHOW(iflink, fmt_dec);
 NETDEVICE_SHOW(ifindex, fmt_dec);
@@ -295,6 +296,7 @@ static ssize_t show_ifalias(struct device *dev,
 }
 
 static struct device_attribute net_class_attributes[] = {
+	__ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
 	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
 	__ATTR(dev_id, S_IRUGO, show_dev_id, NULL),
 	__ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias),
-- 
cgit v1.2.3-59-g8ed1b


From fed66381d65a35198639f564365e61a7f256bf79 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 22 Jul 2010 19:09:08 +0000
Subject: net: pskb_expand_head() optimization

Move frags[] at the end of struct skb_shared_info, and make
pskb_expand_head() copy only the used part of it instead of whole array.

This should avoid kmemcheck warnings and speedup pskb_expand_head() as
well, avoiding a lot of cache misses.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 3 ++-
 net/core/skbuff.c      | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f5aa87e1e0c8..d89876b806a0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -202,10 +202,11 @@ struct skb_shared_info {
 	 */
 	atomic_t	dataref;
 
-	skb_frag_t	frags[MAX_SKB_FRAGS];
 	/* Intermediate layers must ensure that destructor_arg
 	 * remains valid until skb destructor */
 	void *		destructor_arg;
+	/* must be last field, see pskb_expand_head() */
+	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
 
 /* We divide dataref into two halves.  The higher 16 bits hold references
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 76d33ca5f037..7da58a25ad92 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -817,7 +817,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	memcpy(data + nhead, skb->head, skb->tail - skb->head);
 #endif
 	memcpy(data + size, skb_end_pointer(skb),
-	       sizeof(struct skb_shared_info));
+	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 		get_page(skb_shinfo(skb)->frags[i].page);
-- 
cgit v1.2.3-59-g8ed1b


From 6cda9fa2575ec0869fe77b0bdf295c0e51868cab Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 25 Jul 2010 20:39:03 +0900
Subject: nilfs2: avoid rec_len overflow with 64KB block size

With 64KB blocksize, a directory entry can have size 64KB which does
not fit into 16 bits we have for entry length.  So this patch stores
0xffff instead and converts value when read from / written to disk.

Nilfs derives its directory implementation from ext2 filesystem, and
this draws upon the corresponding change on ext2.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/dir.c           | 26 ++++++++++++++------------
 include/linux/nilfs2_fs.h | 18 ++++++++++++++++++
 2 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index d8d183e6d095..b60277b44468 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -141,7 +141,7 @@ static void nilfs_check_page(struct page *page)
 	}
 	for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
 		p = (struct nilfs_dir_entry *)(kaddr + offs);
-		rec_len = le16_to_cpu(p->rec_len);
+		rec_len = nilfs_rec_len_from_disk(p->rec_len);
 
 		if (rec_len < NILFS_DIR_REC_LEN(1))
 			goto Eshort;
@@ -235,7 +235,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
  */
 static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
 {
-	return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+	return (struct nilfs_dir_entry *)((char *)p +
+					  nilfs_rec_len_from_disk(p->rec_len));
 }
 
 static unsigned char
@@ -326,7 +327,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 					goto success;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
 		}
 		nilfs_put_page(page);
 	}
@@ -441,7 +442,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 		    struct page *page, struct inode *inode)
 {
 	unsigned from = (char *) de - (char *) page_address(page);
-	unsigned to = from + le16_to_cpu(de->rec_len);
+	unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
 	struct address_space *mapping = page->mapping;
 	int err;
 
@@ -497,7 +498,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 				/* We hit i_size */
 				name_len = 0;
 				rec_len = chunk_size;
-				de->rec_len = cpu_to_le16(chunk_size);
+				de->rec_len = nilfs_rec_len_to_disk(chunk_size);
 				de->inode = 0;
 				goto got_it;
 			}
@@ -511,7 +512,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 			if (nilfs_match(namelen, name, de))
 				goto out_unlock;
 			name_len = NILFS_DIR_REC_LEN(de->name_len);
-			rec_len = le16_to_cpu(de->rec_len);
+			rec_len = nilfs_rec_len_from_disk(de->rec_len);
 			if (!de->inode && rec_len >= reclen)
 				goto got_it;
 			if (rec_len >= name_len + reclen)
@@ -534,8 +535,8 @@ got_it:
 		struct nilfs_dir_entry *de1;
 
 		de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
-		de1->rec_len = cpu_to_le16(rec_len - name_len);
-		de->rec_len = cpu_to_le16(name_len);
+		de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len);
+		de->rec_len = nilfs_rec_len_to_disk(name_len);
 		de = de1;
 	}
 	de->name_len = namelen;
@@ -566,7 +567,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	struct inode *inode = mapping->host;
 	char *kaddr = page_address(page);
 	unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
-	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	unsigned to = ((char *)dir - kaddr) +
+		nilfs_rec_len_from_disk(dir->rec_len);
 	struct nilfs_dir_entry *pde = NULL;
 	struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
 	int err;
@@ -587,7 +589,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	err = nilfs_prepare_chunk(page, mapping, from, to);
 	BUG_ON(err);
 	if (pde)
-		pde->rec_len = cpu_to_le16(to - from);
+		pde->rec_len = nilfs_rec_len_to_disk(to - from);
 	dir->inode = 0;
 	nilfs_commit_chunk(page, mapping, from, to);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -621,14 +623,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	memset(kaddr, 0, chunk_size);
 	de = (struct nilfs_dir_entry *)kaddr;
 	de->name_len = 1;
-	de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
+	de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
 	memcpy(de->name, ".\0\0", 4);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
 
 	de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
 	de->name_len = 2;
-	de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
+	de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
 	de->inode = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, "..\0", 4);
 	nilfs_set_de_type(de, inode);
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index 7dd4cd494490..970828a5ffc5 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -326,7 +326,25 @@ enum {
 #define NILFS_DIR_ROUND			(NILFS_DIR_PAD - 1)
 #define NILFS_DIR_REC_LEN(name_len)	(((name_len) + 12 + NILFS_DIR_ROUND) & \
 					~NILFS_DIR_ROUND)
+#define NILFS_MAX_REC_LEN		((1<<16)-1)
 
+static inline unsigned nilfs_rec_len_from_disk(__le16 dlen)
+{
+	unsigned len = le16_to_cpu(dlen);
+
+	if (len == NILFS_MAX_REC_LEN)
+		return 1 << 16;
+	return len;
+}
+
+static inline __le16 nilfs_rec_len_to_disk(unsigned len)
+{
+	if (len == (1 << 16))
+		return cpu_to_le16(NILFS_MAX_REC_LEN);
+	else if (len > (1 << 16))
+		BUG();
+	return cpu_to_le16(len);
+}
 
 /**
  * struct nilfs_finfo - file information
-- 
cgit v1.2.3-59-g8ed1b


From 89c0fd014d34d409a7b196667c2b9a4813b6c968 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 25 Jul 2010 22:44:53 +0900
Subject: nilfs2: reject filesystem with unsupported block size

This inserts sanity check that refuses to mount a filesystem with
unsupported block size.

Previously, kernel code of nilfs was looking only limitation of
devices though mkfs.nilfs2 limits the range of block sizes; there was
no check that prevents rec_len overflow with larger block sizes.

With this change, block sizes larger than 64KB or smaller than 1KB
will get rejected explicitly by kernel.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/the_nilfs.c     | 9 ++++++++-
 include/linux/nilfs2_fs.h | 6 ++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index da67b560f3c3..37de1f062d81 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -671,7 +671,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 		goto out;
 	}
 
-	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
+	blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "NILFS: unable to set blocksize\n");
 		err = -EINVAL;
@@ -690,6 +690,13 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 		goto failed_sbh;
 
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
+	if (blocksize < NILFS_MIN_BLOCK_SIZE ||
+	    blocksize > NILFS_MAX_BLOCK_SIZE) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "filesystem blocksize %d\n", blocksize);
+		err = -EINVAL;
+		goto failed_sbh;
+	}
 	if (sb->s_blocksize != blocksize) {
 		int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
 
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index 970828a5ffc5..f5487b6f91ed 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -286,6 +286,12 @@ struct nilfs_super_block {
 
 #define NILFS_NAME_LEN 255
 
+/*
+ * Block size limitations
+ */
+#define NILFS_MIN_BLOCK_SIZE		1024
+#define NILFS_MAX_BLOCK_SIZE		65536
+
 /*
  * The new version of the directory entry.  Since V0 structures are
  * stored in intel byte order, and the name_len field could never be
-- 
cgit v1.2.3-59-g8ed1b


From ba9f507a1bea5ca2fc4a19e227c56b60fd5faca3 Mon Sep 17 00:00:00 2001
From: Xiaolong Chen <xiaolong.chen@gmail.com>
Date: Mon, 26 Jul 2010 01:01:11 -0700
Subject: Input: adp5588-keys - export unused GPIO pins

This patch allows exporting GPIO pins not used by the keypad itself
to be accessible from elsewhere.

Signed-off-by: Xiaolong Chen <xiao-long.chen@motorola.com>
Signed-off-by: Yuanbo Ye <yuan-bo.ye@motorola.com>
Signed-off-by: Tao Hu <taohu@motorola.com>
Acked-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/adp5588-keys.c | 209 +++++++++++++++++++++++++++++++++-
 include/linux/i2c/adp5588.h           |   1 +
 2 files changed, 208 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c
index 9096db73c3c8..c39ec93c0c58 100644
--- a/drivers/input/keyboard/adp5588-keys.c
+++ b/drivers/input/keyboard/adp5588-keys.c
@@ -19,6 +19,7 @@
 #include <linux/platform_device.h>
 #include <linux/input.h>
 #include <linux/i2c.h>
+#include <linux/gpio.h>
 #include <linux/slab.h>
 
 #include <linux/i2c/adp5588.h>
@@ -54,6 +55,10 @@
 
 #define KEYP_MAX_EVENT		10
 
+#define MAXGPIO			18
+#define ADP_BANK(offs)		((offs) >> 3)
+#define ADP_BIT(offs)		(1u << ((offs) & 0x7))
+
 /*
  * Early pre 4.0 Silicon required to delay readout by at least 25ms,
  * since the Event Counter Register updated 25ms after the interrupt
@@ -69,6 +74,14 @@ struct adp5588_kpad {
 	unsigned short keycode[ADP5588_KEYMAPSIZE];
 	const struct adp5588_gpi_map *gpimap;
 	unsigned short gpimapsize;
+#ifdef CONFIG_GPIOLIB
+	unsigned char gpiomap[MAXGPIO];
+	bool export_gpio;
+	struct gpio_chip gc;
+	struct mutex gpio_lock;	/* Protect cached dir, dat_out */
+	u8 dat_out[3];
+	u8 dir[3];
+#endif
 };
 
 static int adp5588_read(struct i2c_client *client, u8 reg)
@@ -86,6 +99,183 @@ static int adp5588_write(struct i2c_client *client, u8 reg, u8 val)
 	return i2c_smbus_write_byte_data(client, reg, val);
 }
 
+#ifdef CONFIG_GPIOLIB
+static int adp5588_gpio_get_value(struct gpio_chip *chip, unsigned off)
+{
+	struct adp5588_kpad *kpad = container_of(chip, struct adp5588_kpad, gc);
+	unsigned int bank = ADP_BANK(kpad->gpiomap[off]);
+	unsigned int bit = ADP_BIT(kpad->gpiomap[off]);
+
+	return !!(adp5588_read(kpad->client, GPIO_DAT_STAT1 + bank) & bit);
+}
+
+static void adp5588_gpio_set_value(struct gpio_chip *chip,
+				   unsigned off, int val)
+{
+	struct adp5588_kpad *kpad = container_of(chip, struct adp5588_kpad, gc);
+	unsigned int bank = ADP_BANK(kpad->gpiomap[off]);
+	unsigned int bit = ADP_BIT(kpad->gpiomap[off]);
+
+	mutex_lock(&kpad->gpio_lock);
+
+	if (val)
+		kpad->dat_out[bank] |= bit;
+	else
+		kpad->dat_out[bank] &= ~bit;
+
+	adp5588_write(kpad->client, GPIO_DAT_OUT1 + bank,
+			   kpad->dat_out[bank]);
+
+	mutex_unlock(&kpad->gpio_lock);
+}
+
+static int adp5588_gpio_direction_input(struct gpio_chip *chip, unsigned off)
+{
+	struct adp5588_kpad *kpad = container_of(chip, struct adp5588_kpad, gc);
+	unsigned int bank = ADP_BANK(kpad->gpiomap[off]);
+	unsigned int bit = ADP_BIT(kpad->gpiomap[off]);
+	int ret;
+
+	mutex_lock(&kpad->gpio_lock);
+
+	kpad->dir[bank] &= ~bit;
+	ret = adp5588_write(kpad->client, GPIO_DIR1 + bank, kpad->dir[bank]);
+
+	mutex_unlock(&kpad->gpio_lock);
+
+	return ret;
+}
+
+static int adp5588_gpio_direction_output(struct gpio_chip *chip,
+					 unsigned off, int val)
+{
+	struct adp5588_kpad *kpad = container_of(chip, struct adp5588_kpad, gc);
+	unsigned int bank = ADP_BANK(kpad->gpiomap[off]);
+	unsigned int bit = ADP_BIT(kpad->gpiomap[off]);
+	int ret;
+
+	mutex_lock(&kpad->gpio_lock);
+
+	kpad->dir[bank] |= bit;
+
+	if (val)
+		kpad->dat_out[bank] |= bit;
+	else
+		kpad->dat_out[bank] &= ~bit;
+
+	ret = adp5588_write(kpad->client, GPIO_DAT_OUT1 + bank,
+				 kpad->dat_out[bank]);
+	ret |= adp5588_write(kpad->client, GPIO_DIR1 + bank,
+				 kpad->dir[bank]);
+
+	mutex_unlock(&kpad->gpio_lock);
+
+	return ret;
+}
+
+static int __devinit adp5588_gpio_add(struct device *dev)
+{
+	struct adp5588_kpad *kpad = dev_get_drvdata(dev);
+	const struct adp5588_kpad_platform_data *pdata = dev->platform_data;
+	const struct adp5588_gpio_platform_data *gpio_data = pdata->gpio_data;
+	int i, error;
+
+	if (gpio_data) {
+		int j = 0;
+		bool pin_used[MAXGPIO];
+
+		for (i = 0; i < pdata->rows; i++)
+			pin_used[i] = true;
+
+		for (i = 0; i < pdata->cols; i++)
+			pin_used[i + GPI_PIN_COL_BASE - GPI_PIN_BASE] = true;
+
+		for (i = 0; i < kpad->gpimapsize; i++)
+			pin_used[kpad->gpimap[i].pin - GPI_PIN_BASE] = true;
+
+		for (i = 0; i < MAXGPIO; i++) {
+			if (!pin_used[i])
+				kpad->gpiomap[j++] = i;
+		}
+		kpad->gc.ngpio = j;
+
+		if (kpad->gc.ngpio)
+			kpad->export_gpio = true;
+	}
+
+	if (!kpad->export_gpio) {
+		dev_info(dev, "No unused gpios left to export\n");
+		return 0;
+	}
+
+	kpad->gc.direction_input = adp5588_gpio_direction_input;
+	kpad->gc.direction_output = adp5588_gpio_direction_output;
+	kpad->gc.get = adp5588_gpio_get_value;
+	kpad->gc.set = adp5588_gpio_set_value;
+	kpad->gc.can_sleep = 1;
+
+	kpad->gc.base = gpio_data->gpio_start;
+	kpad->gc.label = kpad->client->name;
+	kpad->gc.owner = THIS_MODULE;
+
+	mutex_init(&kpad->gpio_lock);
+
+	error = gpiochip_add(&kpad->gc);
+	if (error) {
+		dev_err(dev, "gpiochip_add failed, err: %d\n", error);
+		return error;
+	}
+
+	for (i = 0; i <= ADP_BANK(MAXGPIO); i++) {
+		kpad->dat_out[i] = adp5588_read(kpad->client,
+						GPIO_DAT_OUT1 + i);
+		kpad->dir[i] = adp5588_read(kpad->client, GPIO_DIR1 + i);
+	}
+
+	if (gpio_data->setup) {
+		error = gpio_data->setup(kpad->client,
+					 kpad->gc.base, kpad->gc.ngpio,
+					 gpio_data->context);
+		if (error)
+			dev_warn(dev, "setup failed, %d\n", error);
+	}
+
+	return 0;
+}
+
+static void __devexit adp5588_gpio_remove(struct device *dev)
+{
+	struct adp5588_kpad *kpad = dev_get_drvdata(dev);
+	const struct adp5588_kpad_platform_data *pdata = dev->platform_data;
+	const struct adp5588_gpio_platform_data *gpio_data = pdata->gpio_data;
+	int error;
+
+	if (!kpad->export_gpio)
+		return;
+
+	if (gpio_data->teardown) {
+		error = gpio_data->teardown(kpad->client,
+					    kpad->gc.base, kpad->gc.ngpio,
+					    gpio_data->context);
+		if (error)
+			dev_warn(dev, "teardown failed %d\n", error);
+	}
+
+	error = gpiochip_remove(&kpad->gc);
+	if (error)
+		dev_warn(dev, "gpiochip_remove failed %d\n", error);
+}
+#else
+static inline int adp5588_gpio_add(struct device *dev)
+{
+	return 0;
+}
+
+static inline void adp5588_gpio_remove(struct device *dev)
+{
+}
+#endif
+
 static void adp5588_report_events(struct adp5588_kpad *kpad, int ev_cnt)
 {
 	int i, j;
@@ -150,7 +340,8 @@ static irqreturn_t adp5588_irq(int irq, void *handle)
 
 static int __devinit adp5588_setup(struct i2c_client *client)
 {
-	struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
+	const struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
+	const struct adp5588_gpio_platform_data *gpio_data = pdata->gpio_data;
 	int i, ret;
 	unsigned char evt_mode1 = 0, evt_mode2 = 0, evt_mode3 = 0;
 
@@ -184,6 +375,15 @@ static int __devinit adp5588_setup(struct i2c_client *client)
 		ret |= adp5588_write(client, GPI_EM3, evt_mode3);
 	}
 
+	if (gpio_data) {
+		for (i = 0; i <= ADP_BANK(MAXGPIO); i++) {
+			int pull_mask = gpio_data->pullup_dis_mask;
+
+			ret |= adp5588_write(client, GPIO_PULL1 + i,
+				(pull_mask >> (8 * i)) & 0xFF);
+		}
+	}
+
 	ret |= adp5588_write(client, INT_STAT, CMP2_INT | CMP1_INT |
 					OVR_FLOW_INT | K_LCK_INT |
 					GPI_INT | KE_INT); /* Status is W1C */
@@ -240,7 +440,7 @@ static int __devinit adp5588_probe(struct i2c_client *client,
 					const struct i2c_device_id *id)
 {
 	struct adp5588_kpad *kpad;
-	struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
+	const struct adp5588_kpad_platform_data *pdata = client->dev.platform_data;
 	struct input_dev *input;
 	unsigned int revid;
 	int ret, i;
@@ -381,6 +581,10 @@ static int __devinit adp5588_probe(struct i2c_client *client,
 	if (kpad->gpimapsize)
 		adp5588_report_switch_state(kpad);
 
+	error = adp5588_gpio_add(&client->dev);
+	if (error)
+		goto err_free_irq;
+
 	device_init_wakeup(&client->dev, 1);
 	i2c_set_clientdata(client, kpad);
 
@@ -407,6 +611,7 @@ static int __devexit adp5588_remove(struct i2c_client *client)
 	free_irq(client->irq, kpad);
 	cancel_delayed_work_sync(&kpad->work);
 	input_unregister_device(kpad->input);
+	adp5588_gpio_remove(&client->dev);
 	kfree(kpad);
 
 	return 0;
diff --git a/include/linux/i2c/adp5588.h b/include/linux/i2c/adp5588.h
index b5f57c498e24..269181b8f623 100644
--- a/include/linux/i2c/adp5588.h
+++ b/include/linux/i2c/adp5588.h
@@ -123,6 +123,7 @@ struct adp5588_kpad_platform_data {
 	unsigned short unlock_key2;	/* Unlock Key 2 */
 	const struct adp5588_gpi_map *gpimap;
 	unsigned short gpimapsize;
+	const struct adp5588_gpio_platform_data *gpio_data;
 };
 
 struct adp5588_gpio_platform_data {
-- 
cgit v1.2.3-59-g8ed1b


From 40e2e97316af6e62affab7a392e792494b8d9dde Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 18 Jul 2010 21:17:09 +0000
Subject: direct-io: move aio_complete into ->end_io

Filesystems with unwritten extent support must not complete an AIO request
until the transaction to convert the extent has been commited.  That means
the aio_complete calls needs to be moved into the ->end_io callback so
that the filesystem can control when to call it exactly.

This makes a bit of a mess out of dio_complete and the ->end_io callback
prototype even more complicated.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/direct-io.c              | 26 ++++++++++++++------------
 fs/ext4/inode.c             | 10 +++++++---
 fs/ocfs2/aops.c             |  7 ++++++-
 fs/xfs/linux-2.6/xfs_aops.c |  7 ++++++-
 fs/xfs/linux-2.6/xfs_aops.h |  2 ++
 include/linux/fs.h          |  3 ++-
 6 files changed, 37 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7600aacf531d..a10cb91cadea 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
  * filesystems can use it to hold additional state between get_block calls and
  * dio_complete.
  */
-static int dio_complete(struct dio *dio, loff_t offset, int ret)
+static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
 {
 	ssize_t transferred = 0;
 
@@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 			transferred = dio->i_size - offset;
 	}
 
-	if (dio->end_io && dio->result)
-		dio->end_io(dio->iocb, offset, transferred,
-			    dio->map_bh.b_private);
-
-	if (dio->flags & DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
-
 	if (ret == 0)
 		ret = dio->page_errors;
 	if (ret == 0)
@@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 	if (ret == 0)
 		ret = transferred;
 
+	if (dio->end_io && dio->result) {
+		dio->end_io(dio->iocb, offset, transferred,
+			    dio->map_bh.b_private, ret, is_async);
+	} else if (is_async) {
+		aio_complete(dio->iocb, ret, 0);
+	}
+
+	if (dio->flags & DIO_LOCKING)
+		/* lockdep: non-owner release */
+		up_read_non_owner(&dio->inode->i_alloc_sem);
+
 	return ret;
 }
 
@@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
-		aio_complete(dio->iocb, ret, 0);
+		dio_complete(dio, dio->iocb->ki_pos, 0, true);
 		kfree(dio);
 	}
 }
@@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (ret2 == 0) {
-		ret = dio_complete(dio, offset, ret);
+		ret = dio_complete(dio, offset, ret, false);
 		kfree(dio);
 	} else
 		BUG_ON(ret != -EIOCBQUEUED);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 42272d67955a..0afc8c1d8cf3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3775,7 +3775,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 }
 
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			    ssize_t size, void *private)
+			    ssize_t size, void *private, int ret,
+			    bool is_async)
 {
         ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
@@ -3784,7 +3785,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	/* if not async direct IO or dio with 0 bytes write, just return */
 	if (!io_end || !size)
-		return;
+		goto out;
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p"
 		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
@@ -3795,7 +3796,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	if (io_end->flag != EXT4_IO_UNWRITTEN){
 		ext4_free_io_end(io_end);
 		iocb->private = NULL;
-		return;
+		goto out;
 	}
 
 	io_end->offset = offset;
@@ -3812,6 +3813,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	list_add_tail(&io_end->list, &ei->i_completed_io_list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	iocb->private = NULL;
+out:
+	if (is_async)
+		aio_complete(iocb, ret, 0);
 }
 
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 356e976772bf..96337a4fbbdf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -578,7 +578,9 @@ bail:
 static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     loff_t offset,
 			     ssize_t bytes,
-			     void *private)
+			     void *private,
+			     int ret,
+			     bool is_async)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 	int level;
@@ -592,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	if (!level)
 		up_read(&inode->i_alloc_sem);
 	ocfs2_rw_unlock(inode, level);
+
+	if (is_async)
+		aio_complete(iocb, ret, 0);
 }
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 8abbf0532ea1..95d1e2695c3a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1406,7 +1406,9 @@ xfs_end_io_direct(
 	struct kiocb	*iocb,
 	loff_t		offset,
 	ssize_t		size,
-	void		*private)
+	void		*private,
+	int		ret,
+	bool		is_async)
 {
 	xfs_ioend_t	*ioend = iocb->private;
 
@@ -1452,6 +1454,9 @@ xfs_end_io_direct(
 	 * against double-freeing.
 	 */
 	iocb->private = NULL;
+
+	if (is_async)
+		aio_complete(iocb, ret, 0);
 }
 
 STATIC ssize_t
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 319da173cc1a..c5057fb6237a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -37,6 +37,8 @@ typedef struct xfs_ioend {
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
 	struct work_struct	io_work;	/* xfsdatad work queue */
+	struct kiocb		*io_iocb;
+	int			io_result;
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68ca1b0491af..f91affb7d530 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -415,7 +415,8 @@ struct buffer_head;
 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
-			ssize_t bytes, void *private);
+			ssize_t bytes, void *private, int ret,
+			bool is_async);
 
 /*
  * Attribute flags.  These should be or-ed together to figure out what
-- 
cgit v1.2.3-59-g8ed1b


From 98864ff58dd2b8ef9e72b0d2c70f34e7ff24a2ee Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Sat, 22 May 2010 23:59:11 +0100
Subject: ARM: OMAP: Convert OMAPFB and VRAM SDRAM reservation to LMB

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/plat-omap/common.c            |  4 ++--
 arch/arm/plat-omap/fb.c                | 30 +++++++++++++++++++-----------
 arch/arm/plat-omap/include/plat/vram.h |  4 ++--
 drivers/video/omap2/vram.c             | 33 +++++++++++++++------------------
 include/linux/omapfb.h                 |  2 +-
 5 files changed, 39 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c
index 9f6bbc178a74..ebed82699eb2 100644
--- a/arch/arm/plat-omap/common.c
+++ b/arch/arm/plat-omap/common.c
@@ -85,8 +85,8 @@ EXPORT_SYMBOL(omap_get_var_config);
 
 void __init omap_reserve(void)
 {
-	omapfb_reserve_sdram();
-	omap_vram_reserve_sdram();
+	omapfb_reserve_sdram_memblock();
+	omap_vram_reserve_sdram_memblock();
 }
 
 /*
diff --git a/arch/arm/plat-omap/fb.c b/arch/arm/plat-omap/fb.c
index 97db493904fa..0054b9501a53 100644
--- a/arch/arm/plat-omap/fb.c
+++ b/arch/arm/plat-omap/fb.c
@@ -26,7 +26,7 @@
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/io.h>
 #include <linux/omapfb.h>
 
@@ -173,25 +173,27 @@ static int check_fbmem_region(int region_idx, struct omapfb_mem_region *rg,
 
 static int valid_sdram(unsigned long addr, unsigned long size)
 {
-	struct bootmem_data *bdata = NODE_DATA(0)->bdata;
-	unsigned long sdram_start, sdram_end;
+	struct memblock_property res;
 
-	sdram_start = bdata->node_min_pfn << PAGE_SHIFT;
-	sdram_end = bdata->node_low_pfn << PAGE_SHIFT;
-
-	return addr >= sdram_start && sdram_end - addr >= size;
+	res.base = addr;
+	res.size = size;
+	return !memblock_find(&res) && res.base == addr && res.size == size;
 }
 
 static int reserve_sdram(unsigned long addr, unsigned long size)
 {
-	return reserve_bootmem(addr, size, BOOTMEM_EXCLUSIVE);
+	if (memblock_is_region_reserved(addr, size))
+		return -EBUSY;
+	if (memblock_reserve(addr, size))
+		return -ENOMEM;
+	return 0;
 }
 
 /*
  * Called from map_io. We need to call to this early enough so that we
  * can reserve the fixed SDRAM regions before VM could get hold of them.
  */
-void __init omapfb_reserve_sdram(void)
+void __init omapfb_reserve_sdram_memblock(void)
 {
 	unsigned long reserved = 0;
 	int i;
@@ -386,7 +388,10 @@ static inline int omap_init_fb(void)
 
 arch_initcall(omap_init_fb);
 
-void omapfb_reserve_sdram(void) {}
+void omapfb_reserve_sdram_memblock(void)
+{
+}
+
 unsigned long omapfb_reserve_sram(unsigned long sram_pstart,
 				  unsigned long sram_vstart,
 				  unsigned long sram_size,
@@ -402,7 +407,10 @@ void omapfb_set_platform_data(struct omapfb_platform_data *data)
 {
 }
 
-void omapfb_reserve_sdram(void) {}
+void omapfb_reserve_sdram_memblock(void)
+{
+}
+
 unsigned long omapfb_reserve_sram(unsigned long sram_pstart,
 				  unsigned long sram_vstart,
 				  unsigned long sram_size,
diff --git a/arch/arm/plat-omap/include/plat/vram.h b/arch/arm/plat-omap/include/plat/vram.h
index edd4987758a6..0aa4ecd12c7d 100644
--- a/arch/arm/plat-omap/include/plat/vram.h
+++ b/arch/arm/plat-omap/include/plat/vram.h
@@ -38,7 +38,7 @@ extern void omap_vram_get_info(unsigned long *vram, unsigned long *free_vram,
 extern void omap_vram_set_sdram_vram(u32 size, u32 start);
 extern void omap_vram_set_sram_vram(u32 size, u32 start);
 
-extern void omap_vram_reserve_sdram(void);
+extern void omap_vram_reserve_sdram_memblock(void);
 extern unsigned long omap_vram_reserve_sram(unsigned long sram_pstart,
 					    unsigned long sram_vstart,
 					    unsigned long sram_size,
@@ -48,7 +48,7 @@ extern unsigned long omap_vram_reserve_sram(unsigned long sram_pstart,
 static inline void omap_vram_set_sdram_vram(u32 size, u32 start) { }
 static inline void omap_vram_set_sram_vram(u32 size, u32 start) { }
 
-static inline void omap_vram_reserve_sdram(void) { }
+static inline void omap_vram_reserve_sdram_memblock(void) { }
 static inline unsigned long omap_vram_reserve_sram(unsigned long sram_pstart,
 					    unsigned long sram_vstart,
 					    unsigned long sram_size,
diff --git a/drivers/video/omap2/vram.c b/drivers/video/omap2/vram.c
index 3b1237ad85ed..f6fdc2085f3e 100644
--- a/drivers/video/omap2/vram.c
+++ b/drivers/video/omap2/vram.c
@@ -25,7 +25,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/completion.h>
 #include <linux/debugfs.h>
 #include <linux/jiffies.h>
@@ -525,10 +525,8 @@ early_param("vram", omap_vram_early_vram);
  * Called from map_io. We need to call to this early enough so that we
  * can reserve the fixed SDRAM regions before VM could get hold of them.
  */
-void __init omap_vram_reserve_sdram(void)
+void __init omap_vram_reserve_sdram_memblock(void)
 {
-	struct bootmem_data	*bdata;
-	unsigned long		sdram_start, sdram_size;
 	u32 paddr;
 	u32 size = 0;
 
@@ -555,29 +553,28 @@ void __init omap_vram_reserve_sdram(void)
 
 	size = PAGE_ALIGN(size);
 
-	bdata = NODE_DATA(0)->bdata;
-	sdram_start = bdata->node_min_pfn << PAGE_SHIFT;
-	sdram_size = (bdata->node_low_pfn << PAGE_SHIFT) - sdram_start;
-
 	if (paddr) {
-		if ((paddr & ~PAGE_MASK) || paddr < sdram_start ||
-				paddr + size > sdram_start + sdram_size) {
+		struct memblock_property res;
+
+		res.base = paddr;
+		res.size = size;
+		if ((paddr & ~PAGE_MASK) || memblock_find(&res) ||
+		    res.base != paddr || res.size != size) {
 			pr_err("Illegal SDRAM region for VRAM\n");
 			return;
 		}
 
-		if (reserve_bootmem(paddr, size, BOOTMEM_EXCLUSIVE) < 0) {
-			pr_err("FB: failed to reserve VRAM\n");
+		if (memblock_is_region_reserved(paddr, size)) {
+			pr_err("FB: failed to reserve VRAM - busy\n");
 			return;
 		}
-	} else {
-		if (size > sdram_size) {
-			pr_err("Illegal SDRAM size for VRAM\n");
+
+		if (memblock_reserve(paddr, size) < 0) {
+			pr_err("FB: failed to reserve VRAM - no memory\n");
 			return;
 		}
-
-		paddr = virt_to_phys(alloc_bootmem_pages(size));
-		BUG_ON(paddr & ~PAGE_MASK);
+	} else {
+		paddr = memblock_alloc_base(size, PAGE_SIZE, MEMBLOCK_REAL_LIMIT);
 	}
 
 	omap_vram_add_region(paddr, size);
diff --git a/include/linux/omapfb.h b/include/linux/omapfb.h
index 9bdd91486b49..7e4cd616bcb5 100644
--- a/include/linux/omapfb.h
+++ b/include/linux/omapfb.h
@@ -253,7 +253,7 @@ struct omapfb_platform_data {
 /* in arch/arm/plat-omap/fb.c */
 extern void omapfb_set_platform_data(struct omapfb_platform_data *data);
 extern void omapfb_set_ctrl_platform_data(void *pdata);
-extern void omapfb_reserve_sdram(void);
+extern void omapfb_reserve_sdram_memblock(void);
 
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From ec489aa8f993f8d2ec962ce113071faac482aa27 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@stericsson.com>
Date: Wed, 2 Jun 2010 08:13:52 +0100
Subject: ARM: 6157/2: PL011 TX/RX split of LCR for ST-Ericssons derivative

In the ST-Ericsson version of the PL011 the TX and RX have different
control registers.

Cc: Alessandro Rubini <rubini@unipv.it>
Signed-off-by: Marcin Mielczarczyk <marcin.mielczarczyk@tieto.com>
Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/amba-pl011.c | 61 +++++++++++++++++++++++++++++++++++++--------
 include/linux/amba/serial.h |  2 ++
 2 files changed, 52 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index eb4cb480b93e..5644cf2385bb 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -69,9 +69,11 @@
 struct uart_amba_port {
 	struct uart_port	port;
 	struct clk		*clk;
-	unsigned int		im;	/* interrupt mask */
+	unsigned int		im;		/* interrupt mask */
 	unsigned int		old_status;
-	unsigned int		ifls;	/* vendor-specific */
+	unsigned int		ifls;		/* vendor-specific */
+	unsigned int		lcrh_tx;	/* vendor-specific */
+	unsigned int		lcrh_rx;	/* vendor-specific */
 	bool			autorts;
 };
 
@@ -79,16 +81,22 @@ struct uart_amba_port {
 struct vendor_data {
 	unsigned int		ifls;
 	unsigned int		fifosize;
+	unsigned int		lcrh_tx;
+	unsigned int		lcrh_rx;
 };
 
 static struct vendor_data vendor_arm = {
 	.ifls			= UART011_IFLS_RX4_8|UART011_IFLS_TX4_8,
 	.fifosize		= 16,
+	.lcrh_tx		= UART011_LCRH,
+	.lcrh_rx		= UART011_LCRH,
 };
 
 static struct vendor_data vendor_st = {
 	.ifls			= UART011_IFLS_RX_HALF|UART011_IFLS_TX_HALF,
 	.fifosize		= 64,
+	.lcrh_tx		= ST_UART011_LCRH_TX,
+	.lcrh_rx		= ST_UART011_LCRH_RX,
 };
 
 static void pl011_stop_tx(struct uart_port *port)
@@ -327,12 +335,12 @@ static void pl011_break_ctl(struct uart_port *port, int break_state)
 	unsigned int lcr_h;
 
 	spin_lock_irqsave(&uap->port.lock, flags);
-	lcr_h = readw(uap->port.membase + UART011_LCRH);
+	lcr_h = readw(uap->port.membase + uap->lcrh_tx);
 	if (break_state == -1)
 		lcr_h |= UART01x_LCRH_BRK;
 	else
 		lcr_h &= ~UART01x_LCRH_BRK;
-	writew(lcr_h, uap->port.membase + UART011_LCRH);
+	writew(lcr_h, uap->port.membase + uap->lcrh_tx);
 	spin_unlock_irqrestore(&uap->port.lock, flags);
 }
 
@@ -393,7 +401,17 @@ static int pl011_startup(struct uart_port *port)
 	writew(cr, uap->port.membase + UART011_CR);
 	writew(0, uap->port.membase + UART011_FBRD);
 	writew(1, uap->port.membase + UART011_IBRD);
-	writew(0, uap->port.membase + UART011_LCRH);
+	writew(0, uap->port.membase + uap->lcrh_rx);
+	if (uap->lcrh_tx != uap->lcrh_rx) {
+		int i;
+		/*
+		 * Wait 10 PCLKs before writing LCRH_TX register,
+		 * to get this delay write read only register 10 times
+		 */
+		for (i = 0; i < 10; ++i)
+			writew(0xff, uap->port.membase + UART011_MIS);
+		writew(0, uap->port.membase + uap->lcrh_tx);
+	}
 	writew(0, uap->port.membase + UART01x_DR);
 	while (readw(uap->port.membase + UART01x_FR) & UART01x_FR_BUSY)
 		barrier();
@@ -422,10 +440,19 @@ static int pl011_startup(struct uart_port *port)
 	return retval;
 }
 
+static void pl011_shutdown_channel(struct uart_amba_port *uap,
+					unsigned int lcrh)
+{
+      unsigned long val;
+
+      val = readw(uap->port.membase + lcrh);
+      val &= ~(UART01x_LCRH_BRK | UART01x_LCRH_FEN);
+      writew(val, uap->port.membase + lcrh);
+}
+
 static void pl011_shutdown(struct uart_port *port)
 {
 	struct uart_amba_port *uap = (struct uart_amba_port *)port;
-	unsigned long val;
 
 	/*
 	 * disable all interrupts
@@ -450,9 +477,9 @@ static void pl011_shutdown(struct uart_port *port)
 	/*
 	 * disable break condition and fifos
 	 */
-	val = readw(uap->port.membase + UART011_LCRH);
-	val &= ~(UART01x_LCRH_BRK | UART01x_LCRH_FEN);
-	writew(val, uap->port.membase + UART011_LCRH);
+	pl011_shutdown_channel(uap, uap->lcrh_rx);
+	if (uap->lcrh_rx != uap->lcrh_tx)
+		pl011_shutdown_channel(uap, uap->lcrh_tx);
 
 	/*
 	 * Shut down the clock producer
@@ -561,7 +588,17 @@ pl011_set_termios(struct uart_port *port, struct ktermios *termios,
 	 * NOTE: MUST BE WRITTEN AFTER UARTLCR_M & UARTLCR_L
 	 * ----------^----------^----------^----------^-----
 	 */
-	writew(lcr_h, port->membase + UART011_LCRH);
+	writew(lcr_h, port->membase + uap->lcrh_rx);
+	if (uap->lcrh_rx != uap->lcrh_tx) {
+		int i;
+		/*
+		 * Wait 10 PCLKs before writing LCRH_TX register,
+		 * to get this delay write read only register 10 times
+		 */
+		for (i = 0; i < 10; ++i)
+			writew(0xff, uap->port.membase + UART011_MIS);
+		writew(lcr_h, port->membase + uap->lcrh_tx);
+	}
 	writew(old_cr, port->membase + UART011_CR);
 
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -688,7 +725,7 @@ pl011_console_get_options(struct uart_amba_port *uap, int *baud,
 	if (readw(uap->port.membase + UART011_CR) & UART01x_CR_UARTEN) {
 		unsigned int lcr_h, ibrd, fbrd;
 
-		lcr_h = readw(uap->port.membase + UART011_LCRH);
+		lcr_h = readw(uap->port.membase + uap->lcrh_tx);
 
 		*parity = 'n';
 		if (lcr_h & UART01x_LCRH_PEN) {
@@ -800,6 +837,8 @@ static int pl011_probe(struct amba_device *dev, struct amba_id *id)
 	}
 
 	uap->ifls = vendor->ifls;
+	uap->lcrh_rx = vendor->lcrh_rx;
+	uap->lcrh_tx = vendor->lcrh_tx;
 	uap->port.dev = &dev->dev;
 	uap->port.mapbase = dev->res.start;
 	uap->port.membase = base;
diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 5a5a7fd62490..93c96a66c518 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -38,10 +38,12 @@
 #define UART01x_FR		0x18	/* Flag register (Read only). */
 #define UART010_IIR		0x1C	/* Interrupt indentification register (Read). */
 #define UART010_ICR		0x1C	/* Interrupt clear register (Write). */
+#define ST_UART011_LCRH_RX	0x1C    /* Rx line control register. */
 #define UART01x_ILPR		0x20	/* IrDA low power counter register. */
 #define UART011_IBRD		0x24	/* Integer baud rate divisor register. */
 #define UART011_FBRD		0x28	/* Fractional baud rate divisor register. */
 #define UART011_LCRH		0x2c	/* Line control register. */
+#define ST_UART011_LCRH_TX	0x2c    /* Tx Line control register. */
 #define UART011_CR		0x30	/* Control register. */
 #define UART011_IFLS		0x34	/* Interrupt fifo level select. */
 #define UART011_IMSC		0x38	/* Interrupt mask. */
-- 
cgit v1.2.3-59-g8ed1b


From ac3e3fb424d44109dda3b1a3459e1b30fa60ac4a Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@stericsson.com>
Date: Wed, 2 Jun 2010 20:40:22 +0100
Subject: ARM: 6158/2: PL011 baudrate extension for ST-Ericssons derivative

Implementation of the ST-Ericsson baudrate extension in the PL011
block. In this modified variant it is possible to change the
sampling factor from 16 to 8, and thanks to this we can get higher
baudrates while still using the same peripheral clock.

Also replace the simple division to determine the baud divisor
with DIV_ROUND_CLOSEST() rather than a simple integer division.

Cc: Alessandro Rubini <rubini@unipv.it>
Cc: Jerzy Kasenberg <jerzy.kasenberg@tieto.com>
Signed-off-by: Marcin Mielczarczyk <marcin.mielczarczyk@tieto.com>
Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/amba-pl011.c | 27 +++++++++++++++++++++++++--
 include/linux/amba/serial.h |  1 +
 2 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index 5644cf2385bb..f67e09da6d33 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -74,6 +74,7 @@ struct uart_amba_port {
 	unsigned int		ifls;		/* vendor-specific */
 	unsigned int		lcrh_tx;	/* vendor-specific */
 	unsigned int		lcrh_rx;	/* vendor-specific */
+	bool			oversampling;   /* vendor-specific */
 	bool			autorts;
 };
 
@@ -83,6 +84,7 @@ struct vendor_data {
 	unsigned int		fifosize;
 	unsigned int		lcrh_tx;
 	unsigned int		lcrh_rx;
+	bool			oversampling;
 };
 
 static struct vendor_data vendor_arm = {
@@ -90,6 +92,7 @@ static struct vendor_data vendor_arm = {
 	.fifosize		= 16,
 	.lcrh_tx		= UART011_LCRH,
 	.lcrh_rx		= UART011_LCRH,
+	.oversampling		= false,
 };
 
 static struct vendor_data vendor_st = {
@@ -97,6 +100,7 @@ static struct vendor_data vendor_st = {
 	.fifosize		= 64,
 	.lcrh_tx		= ST_UART011_LCRH_TX,
 	.lcrh_rx		= ST_UART011_LCRH_RX,
+	.oversampling		= true,
 };
 
 static void pl011_stop_tx(struct uart_port *port)
@@ -499,8 +503,13 @@ pl011_set_termios(struct uart_port *port, struct ktermios *termios,
 	/*
 	 * Ask the core to calculate the divisor for us.
 	 */
-	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16);
-	quot = port->uartclk * 4 / baud;
+	baud = uart_get_baud_rate(port, termios, old, 0,
+				  port->uartclk/(uap->oversampling ? 8 : 16));
+
+	if (baud > port->uartclk/16)
+		quot = DIV_ROUND_CLOSEST(port->uartclk * 8, baud);
+	else
+		quot = DIV_ROUND_CLOSEST(port->uartclk * 4, baud);
 
 	switch (termios->c_cflag & CSIZE) {
 	case CS5:
@@ -579,6 +588,13 @@ pl011_set_termios(struct uart_port *port, struct ktermios *termios,
 		uap->autorts = false;
 	}
 
+	if (uap->oversampling) {
+		if (baud > port->uartclk/16)
+			old_cr |= ST_UART011_CR_OVSFACT;
+		else
+			old_cr &= ~ST_UART011_CR_OVSFACT;
+	}
+
 	/* Set baud rate */
 	writew(quot & 0x3f, port->membase + UART011_FBRD);
 	writew(quot >> 6, port->membase + UART011_IBRD);
@@ -744,6 +760,12 @@ pl011_console_get_options(struct uart_amba_port *uap, int *baud,
 		fbrd = readw(uap->port.membase + UART011_FBRD);
 
 		*baud = uap->port.uartclk * 4 / (64 * ibrd + fbrd);
+
+		if (uap->oversampling) {
+			if (readw(uap->port.membase + UART011_CR)
+				  & ST_UART011_CR_OVSFACT)
+				*baud *= 2;
+		}
 	}
 }
 
@@ -839,6 +861,7 @@ static int pl011_probe(struct amba_device *dev, struct amba_id *id)
 	uap->ifls = vendor->ifls;
 	uap->lcrh_rx = vendor->lcrh_rx;
 	uap->lcrh_tx = vendor->lcrh_tx;
+	uap->oversampling = vendor->oversampling;
 	uap->port.dev = &dev->dev;
 	uap->port.mapbase = dev->res.start;
 	uap->port.membase = base;
diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 93c96a66c518..e1b634b635f2 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -86,6 +86,7 @@
 #define UART010_CR_TIE 		0x0020
 #define UART010_CR_RIE 		0x0010
 #define UART010_CR_MSIE		0x0008
+#define ST_UART011_CR_OVSFACT	0x0008	/* Oversampling factor */
 #define UART01x_CR_IIRLP	0x0004	/* SIR low power mode */
 #define UART01x_CR_SIREN	0x0002	/* SIR enable */
 #define UART01x_CR_UARTEN	0x0001	/* UART enable */
-- 
cgit v1.2.3-59-g8ed1b


From ce3bf7ab22527183634a76512d9854a38615e4d5 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:19 -0700
Subject: time: Implement timespec_add

After accidentally misusing timespec_add_safe, I wanted to make sure
we don't accidently trip over that issue again, so I created a simple
timespec_add() function which we can use to replace the instances
of timespec_add_safe() that don't want the overflow detection.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1279068988-21864-3-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/time.h      | 16 ++++++++++++++++
 kernel/time/timekeeping.c |  6 +++---
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index ea3559f0b3f2..9072df83de1f 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -76,9 +76,25 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon,
 			    const unsigned int min, const unsigned int sec);
 
 extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec);
+
+/*
+ * timespec_add_safe assumes both values are positive and checks
+ * for overflow. It will return TIME_T_MAX if the reutrn would be
+ * smaller then either of the arguments.
+ */
 extern struct timespec timespec_add_safe(const struct timespec lhs,
 					 const struct timespec rhs);
 
+
+static inline struct timespec timespec_add(struct timespec lhs,
+						struct timespec rhs)
+{
+	struct timespec ts_delta;
+	set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec,
+				lhs.tv_nsec + rhs.tv_nsec);
+	return ts_delta;
+}
+
 /*
  * sub = lhs - rhs, in normalized form
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index caf8d4d4f5c8..623fe3d504dc 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -579,9 +579,9 @@ static int timekeeping_resume(struct sys_device *dev)
 
 	if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
 		ts = timespec_sub(ts, timekeeping_suspend_time);
-		xtime = timespec_add_safe(xtime, ts);
+		xtime = timespec_add(xtime, ts);
 		wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
-		total_sleep_time = timespec_add_safe(total_sleep_time, ts);
+		total_sleep_time = timespec_add(total_sleep_time, ts);
 	}
 	/* re-base the last cycle value */
 	timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -887,7 +887,7 @@ EXPORT_SYMBOL_GPL(getboottime);
  */
 void monotonic_to_bootbased(struct timespec *ts)
 {
-	*ts = timespec_add_safe(*ts, total_sleep_time);
+	*ts = timespec_add(*ts, total_sleep_time);
 }
 EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
 
-- 
cgit v1.2.3-59-g8ed1b


From 7615856ebfee52b080c22d263ca4debbd0df0ac1 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:23 -0700
Subject: timkeeping: Fix update_vsyscall to provide wall_to_monotonic offset

update_vsyscall() did not provide the wall_to_monotoinc offset,
so arch specific implementations tend to reference wall_to_monotonic
directly. This limits future cleanups in the timekeeping core, so
this patch fixes the update_vsyscall interface to provide
wall_to_monotonic, allowing wall_to_monotonic to be made static
as planned in Documentation/feature-removal-schedule.txt

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tony Luck <tony.luck@intel.com>
LKML-Reference: <1279068988-21864-7-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/ia64/kernel/time.c       | 7 ++++---
 arch/powerpc/kernel/time.c    | 8 ++++----
 arch/s390/kernel/time.c       | 8 ++++----
 arch/x86/kernel/vsyscall_64.c | 6 +++---
 include/linux/clocksource.h   | 6 ++++--
 kernel/time/timekeeping.c     | 9 ++++++---
 6 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 653b3c46ea82..ed6f22eb5b12 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -471,7 +471,8 @@ void update_vsyscall_tz(void)
 {
 }
 
-void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult)
+void update_vsyscall(struct timespec *wall, struct timespec *wtm,
+			struct clocksource *c, u32 mult)
 {
         unsigned long flags;
 
@@ -487,9 +488,9 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult)
 	/* copy kernel time structures */
         fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec;
         fsyscall_gtod_data.wall_time.tv_nsec = wall->tv_nsec;
-        fsyscall_gtod_data.monotonic_time.tv_sec = wall_to_monotonic.tv_sec
+	fsyscall_gtod_data.monotonic_time.tv_sec = wtm->tv_sec
 							+ wall->tv_sec;
-        fsyscall_gtod_data.monotonic_time.tv_nsec = wall_to_monotonic.tv_nsec
+	fsyscall_gtod_data.monotonic_time.tv_nsec = wtm->tv_nsec
 							+ wall->tv_nsec;
 
 	/* normalize */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 0711d60f40b0..e215f76bba1c 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -849,8 +849,8 @@ static cycle_t timebase_read(struct clocksource *cs)
 	return (cycle_t)get_tb();
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
-		     u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+			struct clocksource *clock, u32 mult)
 {
 	u64 new_tb_to_xs, new_stamp_xsec;
 
@@ -882,8 +882,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
 	vdso_data->tb_orig_stamp = clock->cycle_last;
 	vdso_data->stamp_xsec = new_stamp_xsec;
 	vdso_data->tb_to_xs = new_tb_to_xs;
-	vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
-	vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+	vdso_data->wtom_clock_sec = wtm->tv_sec;
+	vdso_data->wtom_clock_nsec = wtm->tv_nsec;
 	vdso_data->stamp_xtime = *wall_time;
 	smp_wmb();
 	++(vdso_data->tb_update_count);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index a2163c95eb98..aeb30c6f279c 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -207,8 +207,8 @@ struct clocksource * __init clocksource_default_clock(void)
 	return &clocksource_tod;
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
-		     u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+			struct clocksource *clock, u32 mult)
 {
 	if (clock != &clocksource_tod)
 		return;
@@ -219,8 +219,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
 	vdso_data->xtime_tod_stamp = clock->cycle_last;
 	vdso_data->xtime_clock_sec = wall_time->tv_sec;
 	vdso_data->xtime_clock_nsec = wall_time->tv_nsec;
-	vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
-	vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+	vdso_data->wtom_clock_sec = wtm->tv_sec;
+	vdso_data->wtom_clock_nsec = wtm->tv_nsec;
 	vdso_data->ntp_mult = mult;
 	smp_wmb();
 	++vdso_data->tb_update_count;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dce0c3c5a787..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
-		     u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+			struct clocksource *clock, u32 mult)
 {
 	unsigned long flags;
 
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
 	vsyscall_gtod_data.clock.shift = clock->shift;
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+	vsyscall_gtod_data.wall_to_monotonic = *wtm;
 	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 5ea3c60c160c..21677d99a161 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -313,11 +313,13 @@ clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec)
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
 extern void
-update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult);
+update_vsyscall(struct timespec *ts, struct timespec *wtm,
+			struct clocksource *c, u32 mult);
 extern void update_vsyscall_tz(void);
 #else
 static inline void
-update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult)
+update_vsyscall(struct timespec *ts, struct timespec *wtm,
+			struct clocksource *c, u32 mult)
 {
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 73edd4074b50..b15c3acafd5a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -170,7 +170,8 @@ void timekeeping_leap_insert(int leapsecond)
 {
 	xtime.tv_sec += leapsecond;
 	wall_to_monotonic.tv_sec -= leapsecond;
-	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+			timekeeper.mult);
 }
 
 /**
@@ -326,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
 	timekeeper.ntp_error = 0;
 	ntp_clear();
 
-	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
 
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
@@ -809,7 +811,8 @@ void update_wall_time(void)
 	}
 
 	/* check to see if there is a new clocksource to use */
-	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 8ab4351a4c888016620f43bde605b3d0964af339 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:25 -0700
Subject: hrtimer: Cleanup direct access to wall_to_monotonic

Provides an accessor function to replace hrtimer.c's
direct access of wall_to_monotonic.

This will allow wall_to_monotonic to be made static as
planned in Documentation/feature-removal-schedule.txt

Signed-off-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1279068988-21864-9-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/time.h      | 3 ++-
 kernel/hrtimer.c          | 9 ++++-----
 kernel/time/timekeeping.c | 5 +++++
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 9072df83de1f..a57e0f67b3d0 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -126,7 +126,8 @@ extern int timekeeping_suspended;
 
 unsigned long get_seconds(void);
 struct timespec current_kernel_time(void);
-struct timespec __current_kernel_time(void); /* does not hold xtime_lock */
+struct timespec __current_kernel_time(void); /* does not take xtime_lock */
+struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */
 struct timespec get_monotonic_coarse(void);
 
 #define CURRENT_TIME		(current_kernel_time())
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..809f48c70553 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 	do {
 		seq = read_seqbegin(&xtime_lock);
 		xts = __current_kernel_time();
-		tom = wall_to_monotonic;
+		tom = __get_wall_to_monotonic();
 	} while (read_seqretry(&xtime_lock, seq));
 
 	xtim = timespec_to_ktime(xts);
@@ -612,7 +612,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 static void retrigger_next_event(void *arg)
 {
 	struct hrtimer_cpu_base *base;
-	struct timespec realtime_offset;
+	struct timespec realtime_offset, wtm;
 	unsigned long seq;
 
 	if (!hrtimer_hres_active())
@@ -620,10 +620,9 @@ static void retrigger_next_event(void *arg)
 
 	do {
 		seq = read_seqbegin(&xtime_lock);
-		set_normalized_timespec(&realtime_offset,
-					-wall_to_monotonic.tv_sec,
-					-wall_to_monotonic.tv_nsec);
+		wtm = __get_wall_to_monotonic();
 	} while (read_seqretry(&xtime_lock, seq));
+	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
 
 	base = &__get_cpu_var(hrtimer_bases);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b15c3acafd5a..fb61c2ed3660 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -858,6 +858,11 @@ struct timespec __current_kernel_time(void)
 	return xtime;
 }
 
+struct timespec __get_wall_to_monotonic(void)
+{
+	return wall_to_monotonic;
+}
+
 struct timespec current_kernel_time(void)
 {
 	struct timespec now;
-- 
cgit v1.2.3-59-g8ed1b


From 0fb86b06298b6cd3205cac2e68a499f269282dac Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:26 -0700
Subject: timekeeping: Make xtime and wall_to_monotonic static

This patch makes xtime and wall_to_monotonic static, as planned in
Documentation/feature-removal-schedule.txt. This will allow for
further cleanups to the timekeeping core.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1279068988-21864-10-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/feature-removal-schedule.txt | 10 ----------
 include/linux/time.h                       |  2 --
 kernel/time/timekeeping.c                  |  4 ++--
 3 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 1571c0c83dba..cd648dbb5146 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -549,16 +549,6 @@ Who:	Avi Kivity <avi@redhat.com>
 
 ----------------------------
 
-What:	xtime, wall_to_monotonic
-When:	2.6.36+
-Files:	kernel/time/timekeeping.c include/linux/time.h
-Why:	Cleaning up timekeeping internal values. Please use
-	existing timekeeping accessor functions to access
-	the equivalent functionality.
-Who:	John Stultz <johnstul@us.ibm.com>
-
-----------------------------
-
 What:	KVM kernel-allocated memory slots
 When:	July 2010
 Why:	Since 2.6.25, kvm supports user-allocated memory slots, which are
diff --git a/include/linux/time.h b/include/linux/time.h
index a57e0f67b3d0..cb34e35fabac 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -113,8 +113,6 @@ static inline struct timespec timespec_sub(struct timespec lhs,
 #define timespec_valid(ts) \
 	(((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC))
 
-extern struct timespec xtime;
-extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
 
 extern void read_persistent_clock(struct timespec *ts);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb61c2ed3660..e14c839e9faa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
  * - wall_to_monotonic is no longer the boot time, getboottime must be
  * used instead.
  */
-struct timespec xtime __attribute__ ((aligned (16)));
-struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+static struct timespec xtime __attribute__ ((aligned (16)));
+static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
 static struct timespec total_sleep_time;
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 852db46d55e85b475a72e665ca08d3317769ceef Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Tue, 13 Jul 2010 17:56:28 -0700
Subject: clocksource: Add __clocksource_updatefreq_hz/khz methods

To properly handle clocksources that change frequencies
at the clocksource->enable() point, this patch adds
a method that will update the clocksource's mult/shift and
max_idle_ns values.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1279068988-21864-12-git-send-email-johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h | 11 +++++++++++
 kernel/time/clocksource.c   | 29 ++++++++++++++++++++++++-----
 2 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 21677d99a161..c37b21ad5a3b 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -292,6 +292,8 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec);
  */
 extern int
 __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq);
+extern void
+__clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq);
 
 static inline int clocksource_register_hz(struct clocksource *cs, u32 hz)
 {
@@ -303,6 +305,15 @@ static inline int clocksource_register_khz(struct clocksource *cs, u32 khz)
 	return __clocksource_register_scale(cs, 1000, khz);
 }
 
+static inline void __clocksource_updatefreq_hz(struct clocksource *cs, u32 hz)
+{
+	__clocksource_updatefreq_scale(cs, 1, hz);
+}
+
+static inline void __clocksource_updatefreq_khz(struct clocksource *cs, u32 khz)
+{
+	__clocksource_updatefreq_scale(cs, 1000, khz);
+}
 
 static inline void
 clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c543d21b4e54..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs)
 #define MAX_UPDATE_LENGTH 5 /* Seconds */
 
 /**
- * __clocksource_register_scale - Used to install new clocksources
+ * __clocksource_updatefreq_scale - Used update clocksource with new freq
  * @t:		clocksource to be registered
  * @scale:	Scale factor multiplied against freq to get clocksource hz
  * @freq:	clocksource frequency (cycles per second) divided by scale
  *
- * Returns -EBUSY if registration fails, zero otherwise.
+ * This should only be called from the clocksource->enable() method.
  *
  * This *SHOULD NOT* be called directly! Please use the
- * clocksource_register_hz() or clocksource_register_khz helper functions.
+ * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
  */
-int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
-
 	/*
 	 * Ideally we want to use  some of the limits used in
 	 * clocksource_max_deferment, to provide a more informed
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 				      NSEC_PER_SEC/scale,
 				      MAX_UPDATE_LENGTH*scale);
 	cs->max_idle_ns = clocksource_max_deferment(cs);
+}
+EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+
+/**
+ * __clocksource_register_scale - Used to install new clocksources
+ * @t:		clocksource to be registered
+ * @scale:	Scale factor multiplied against freq to get clocksource hz
+ * @freq:	clocksource frequency (cycles per second) divided by scale
+ *
+ * Returns -EBUSY if registration fails, zero otherwise.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * clocksource_register_hz() or clocksource_register_khz helper functions.
+ */
+int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+
+	/* Intialize mult/shift and max_idle_ns */
+	__clocksource_updatefreq_scale(cs, scale, freq);
 
+	/* Add clocksource to the clcoksource list */
 	mutex_lock(&clocksource_mutex);
 	clocksource_enqueue(cs);
 	clocksource_select();
-- 
cgit v1.2.3-59-g8ed1b


From a0d40c80256e31b23849f2ba781b74bf0218a1fa Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 26 Mar 2010 15:28:51 -0700
Subject: vmap: add flag to allow lazy unmap to be disabled at runtime

Add a flag to force lazy_max_pages() to zero to prevent any outstanding
mapped pages.  We'll need this for Xen.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: Nick Piggin <npiggin@suse.de>
---
 include/linux/vmalloc.h | 2 ++
 mm/vmalloc.c            | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 227c2a585e4f..b840fdaf438c 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -7,6 +7,8 @@
 
 struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 
+extern bool vmap_lazy_unmap;
+
 /* bits in flags of vmalloc's vm_struct below */
 #define VM_IOREMAP	0x00000001	/* ioremap() and friends */
 #define VM_ALLOC	0x00000002	/* vmalloc() */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae007462b7f6..7f35fe2cf9e7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,7 @@
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
 
+bool vmap_lazy_unmap __read_mostly = true;
 
 /*** Page table manipulation functions ***/
 
@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
 {
 	unsigned int log;
 
+	if (!vmap_lazy_unmap)
+		return 0;
+
 	log = fls(num_online_cpus());
 
 	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
-- 
cgit v1.2.3-59-g8ed1b


From 47def82672b3ba4e7c5e9a4fe48a556f8684d0d6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 27 Jul 2010 11:56:05 -0400
Subject: jbd2: Remove __GFP_NOFAIL from jbd2 layer

__GFP_NOFAIL is going away, so add our own retry loop.  Also add
jbd2__journal_start() and jbd2__journal_restart() which take a gfp
mask, so that file systems can optionally (re)start transaction
handles using GFP_KERNEL.  If they do this, then they need to be
prepared to handle receiving an PTR_ERR(-ENOMEM) error, and be ready
to reflect that error up to userspace.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c     | 15 ++++++++++---
 fs/jbd2/transaction.c | 61 +++++++++++++++++++++++++++++++++++----------------
 include/linux/jbd2.h  |  4 +++-
 3 files changed, 57 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f7bf15787d68..a79d3345b55a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,6 +41,7 @@
 #include <linux/hash.h>
 #include <linux/log2.h>
 #include <linux/vmalloc.h>
+#include <linux/backing-dev.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -48,8 +49,6 @@
 #include <asm/uaccess.h>
 #include <asm/page.h>
 
-EXPORT_SYMBOL(jbd2_journal_start);
-EXPORT_SYMBOL(jbd2_journal_restart);
 EXPORT_SYMBOL(jbd2_journal_extend);
 EXPORT_SYMBOL(jbd2_journal_stop);
 EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -311,7 +310,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	 */
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 
-	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+retry_alloc:
+	new_bh = alloc_buffer_head(GFP_NOFS);
+	if (!new_bh) {
+		/*
+		 * Failure is not an option, but __GFP_NOFAIL is going
+		 * away; so we retry ourselves here.
+		 */
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		goto retry_alloc;
+	}
+
 	/* keep subsequent assertions sane */
 	new_bh->b_state = 0;
 	init_buffer(new_bh, NULL, NULL);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e214d68620ac..001e95fb0fe1 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -26,6 +26,8 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -83,30 +85,38 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  * transaction's buffer credits.
  */
 
-static int start_this_handle(journal_t *journal, handle_t *handle)
+static int start_this_handle(journal_t *journal, handle_t *handle,
+			     int gfp_mask)
 {
 	transaction_t *transaction;
 	int needed;
 	int nblocks = handle->h_buffer_credits;
 	transaction_t *new_transaction = NULL;
-	int ret = 0;
 	unsigned long ts = jiffies;
 
 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
 		       current->comm, nblocks,
 		       journal->j_max_transaction_buffers);
-		ret = -ENOSPC;
-		goto out;
+		return -ENOSPC;
 	}
 
 alloc_transaction:
 	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction),
-						GFP_NOFS|__GFP_NOFAIL);
+		new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
 		if (!new_transaction) {
-			ret = -ENOMEM;
-			goto out;
+			/*
+			 * If __GFP_FS is not present, then we may be
+			 * being called from inside the fs writeback
+			 * layer, so we MUST NOT fail.  Since
+			 * __GFP_NOFAIL is going away, we will arrange
+			 * to retry the allocation ourselves.
+			 */
+			if ((gfp_mask & __GFP_FS) == 0) {
+				congestion_wait(BLK_RW_ASYNC, HZ/50);
+				goto alloc_transaction;
+			}
+			return -ENOMEM;
 		}
 	}
 
@@ -123,8 +133,8 @@ repeat_locked:
 	if (is_journal_aborted(journal) ||
 	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
 		spin_unlock(&journal->j_state_lock);
-		ret = -EROFS;
-		goto out;
+		kfree(new_transaction);
+		return -EROFS;
 	}
 
 	/* Wait on the journal's transaction barrier if necessary */
@@ -240,10 +250,8 @@ repeat_locked:
 	spin_unlock(&journal->j_state_lock);
 
 	lock_map_acquire(&handle->h_lockdep_map);
-out:
-	if (unlikely(new_transaction))		/* It's usually NULL */
-		kfree(new_transaction);
-	return ret;
+	kfree(new_transaction);
+	return 0;
 }
 
 static struct lock_class_key jbd2_handle_key;
@@ -278,7 +286,7 @@ static handle_t *new_handle(int nblocks)
  *
  * Return a pointer to a newly allocated handle, or NULL on failure
  */
-handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 {
 	handle_t *handle = journal_current_handle();
 	int err;
@@ -298,7 +306,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 
 	current->journal_info = handle;
 
-	err = start_this_handle(journal, handle);
+	err = start_this_handle(journal, handle, gfp_mask);
 	if (err < 0) {
 		jbd2_free_handle(handle);
 		current->journal_info = NULL;
@@ -308,6 +316,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 out:
 	return handle;
 }
+EXPORT_SYMBOL(jbd2__journal_start);
+
+
+handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+{
+	return jbd2__journal_start(journal, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_start);
+
 
 /**
  * int jbd2_journal_extend() - extend buffer credits.
@@ -394,8 +411,7 @@ out:
  * transaction capabable of guaranteeing the requested number of
  * credits.
  */
-
-int jbd2_journal_restart(handle_t *handle, int nblocks)
+int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
@@ -428,10 +444,17 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
 
 	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
-	ret = start_this_handle(journal, handle);
+	ret = start_this_handle(journal, handle, gfp_mask);
 	return ret;
 }
+EXPORT_SYMBOL(jbd2__journal_restart);
+
 
+int jbd2_journal_restart(handle_t *handle, int nblocks)
+{
+	return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_restart);
 
 /**
  * void jbd2_journal_lock_updates () - establish a transaction barrier.
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a4d2e9f7088a..5a72bc75b273 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1081,7 +1081,9 @@ static inline handle_t *journal_current_handle(void)
  */
 
 extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
-extern int	 jbd2_journal_restart (handle_t *, int nblocks);
+extern handle_t *jbd2__journal_start(journal_t *, int nblocks, int gfp_mask);
+extern int	 jbd2_journal_restart(handle_t *, int nblocks);
+extern int	 jbd2__journal_restart(handle_t *, int nblocks, int gfp_mask);
 extern int	 jbd2_journal_extend (handle_t *, int nblocks);
 extern int	 jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
-- 
cgit v1.2.3-59-g8ed1b


From 552ef8024f909d9b3a7442d0ab0d48a22de24e9e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 27 Jul 2010 11:56:06 -0400
Subject: direct-io: move aio_complete into ->end_io

Filesystems with unwritten extent support must not complete an AIO request
until the transaction to convert the extent has been commited.  That means
the aio_complete calls needs to be moved into the ->end_io callback so
that the filesystem can control when to call it exactly.

This makes a bit of a mess out of dio_complete and the ->end_io callback
prototype even more complicated.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/direct-io.c              | 26 ++++++++++++++------------
 fs/ext4/inode.c             | 10 +++++++---
 fs/ocfs2/aops.c             |  7 ++++++-
 fs/xfs/linux-2.6/xfs_aops.c |  7 ++++++-
 fs/xfs/linux-2.6/xfs_aops.h |  2 ++
 include/linux/fs.h          |  3 ++-
 6 files changed, 37 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7600aacf531d..a10cb91cadea 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
  * filesystems can use it to hold additional state between get_block calls and
  * dio_complete.
  */
-static int dio_complete(struct dio *dio, loff_t offset, int ret)
+static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
 {
 	ssize_t transferred = 0;
 
@@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 			transferred = dio->i_size - offset;
 	}
 
-	if (dio->end_io && dio->result)
-		dio->end_io(dio->iocb, offset, transferred,
-			    dio->map_bh.b_private);
-
-	if (dio->flags & DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
-
 	if (ret == 0)
 		ret = dio->page_errors;
 	if (ret == 0)
@@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 	if (ret == 0)
 		ret = transferred;
 
+	if (dio->end_io && dio->result) {
+		dio->end_io(dio->iocb, offset, transferred,
+			    dio->map_bh.b_private, ret, is_async);
+	} else if (is_async) {
+		aio_complete(dio->iocb, ret, 0);
+	}
+
+	if (dio->flags & DIO_LOCKING)
+		/* lockdep: non-owner release */
+		up_read_non_owner(&dio->inode->i_alloc_sem);
+
 	return ret;
 }
 
@@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
-		aio_complete(dio->iocb, ret, 0);
+		dio_complete(dio, dio->iocb->ki_pos, 0, true);
 		kfree(dio);
 	}
 }
@@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (ret2 == 0) {
-		ret = dio_complete(dio, offset, ret);
+		ret = dio_complete(dio, offset, ret, false);
 		kfree(dio);
 	} else
 		BUG_ON(ret != -EIOCBQUEUED);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 699d1d01c5df..609159e990de 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3775,7 +3775,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 }
 
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			    ssize_t size, void *private)
+			    ssize_t size, void *private, int ret,
+			    bool is_async)
 {
         ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
@@ -3784,7 +3785,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	/* if not async direct IO or dio with 0 bytes write, just return */
 	if (!io_end || !size)
-		return;
+		goto out;
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p"
 		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
@@ -3795,7 +3796,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	if (io_end->flag != EXT4_IO_UNWRITTEN){
 		ext4_free_io_end(io_end);
 		iocb->private = NULL;
-		return;
+		goto out;
 	}
 
 	io_end->offset = offset;
@@ -3812,6 +3813,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	list_add_tail(&io_end->list, &ei->i_completed_io_list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	iocb->private = NULL;
+out:
+	if (is_async)
+		aio_complete(iocb, ret, 0);
 }
 
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca20cc18..1d2b1f156bcf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -609,7 +609,9 @@ bail:
 static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     loff_t offset,
 			     ssize_t bytes,
-			     void *private)
+			     void *private,
+			     int ret,
+			     bool is_async)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 	int level;
@@ -623,6 +625,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	if (!level)
 		up_read(&inode->i_alloc_sem);
 	ocfs2_rw_unlock(inode, level);
+
+	if (is_async)
+		aio_complete(iocb, ret, 0);
 }
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 34640d6dbdcb..5895aaf62ace 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1599,7 +1599,9 @@ xfs_end_io_direct(
 	struct kiocb	*iocb,
 	loff_t		offset,
 	ssize_t		size,
-	void		*private)
+	void		*private,
+	int		ret,
+	bool		is_async)
 {
 	xfs_ioend_t	*ioend = iocb->private;
 
@@ -1645,6 +1647,9 @@ xfs_end_io_direct(
 	 * against double-freeing.
 	 */
 	iocb->private = NULL;
+
+	if (is_async)
+		aio_complete(iocb, ret, 0);
 }
 
 STATIC ssize_t
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 4cfc6ea87df8..9f566d92ae3a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -37,6 +37,8 @@ typedef struct xfs_ioend {
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
 	struct work_struct	io_work;	/* xfsdatad work queue */
+	struct kiocb		*io_iocb;
+	int			io_result;
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 471e1ff5079a..a0912f6075e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -415,7 +415,8 @@ struct buffer_head;
 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
-			ssize_t bytes, void *private);
+			ssize_t bytes, void *private, int ret,
+			bool is_async);
 
 /*
  * Attribute flags.  These should be or-ed together to figure out what
-- 
cgit v1.2.3-59-g8ed1b


From 94fe8c683cea97fe2c59a5f0dc206aa329c5763c Mon Sep 17 00:00:00 2001
From: Richard Röjfors <richard.rojfors@pelagicore.com>
Date: Tue, 27 Jul 2010 12:57:01 +0000
Subject: ks8842: Support DMA when accessed via timberdale
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for RX and TX DMA via the DMA API,
this is only supported when the KS8842 is accessed via timberdale.

There is no support for DMA on the generic bus interface it self,
a state machine inside the FPGA is handling RX and TX transfers to/from
buffers in the FPGA. The host CPU can do DMA to and from these buffers.

The FPGA has to handle the RX interrupts, so these must be enabled in
the ks8842 but not in the FPGA. The driver must not disable the RX interrupt
that would mean that the data transfers into the FPGA buffers would stop.

The host shall not enable TX interrupts since TX is handled by the FPGA,
the host is notified by DMA callbacks when transfers are finished.

Which DMA channels to use are added as parameters in the platform data struct.

Signed-off-by: Richard Röjfors <richard.rojfors@pelagicore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ks8842.c   | 464 ++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/ks8842.h |   4 +
 2 files changed, 447 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ks8842.c b/drivers/net/ks8842.c
index 289b0bee3462..3fe38c787f29 100644
--- a/drivers/net/ks8842.c
+++ b/drivers/net/ks8842.c
@@ -30,6 +30,9 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/ks8842.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
 
 #define DRV_NAME "ks8842"
 
@@ -82,6 +85,15 @@
 #define IRQ_RX_ERROR	0x0080
 #define ENABLED_IRQS	(IRQ_LINK_CHANGE | IRQ_TX | IRQ_RX | IRQ_RX_STOPPED | \
 		IRQ_TX_STOPPED | IRQ_RX_OVERRUN | IRQ_RX_ERROR)
+/* When running via timberdale in DMA mode, the RX interrupt should be
+   enabled in the KS8842, but not in the FPGA IP, since the IP handles
+   RX DMA internally.
+   TX interrupts are not needed it is handled by the FPGA the driver is
+   notified via DMA callbacks.
+*/
+#define ENABLED_IRQS_DMA_IP	(IRQ_LINK_CHANGE | IRQ_RX_STOPPED | \
+	IRQ_TX_STOPPED | IRQ_RX_OVERRUN | IRQ_RX_ERROR)
+#define ENABLED_IRQS_DMA	(ENABLED_IRQS_DMA_IP | IRQ_RX)
 #define REG_ISR		0x02
 #define REG_RXSR	0x04
 #define RXSR_VALID	0x8000
@@ -124,6 +136,28 @@
 #define	MICREL_KS884X		0x01	/* 0=Timeberdale(FPGA), 1=Micrel */
 #define	KS884X_16BIT		0x02	/*  1=16bit, 0=32bit */
 
+#define DMA_BUFFER_SIZE		2048
+
+struct ks8842_tx_dma_ctl {
+	struct dma_chan *chan;
+	struct dma_async_tx_descriptor *adesc;
+	void *buf;
+	struct scatterlist sg;
+	int channel;
+};
+
+struct ks8842_rx_dma_ctl {
+	struct dma_chan *chan;
+	struct dma_async_tx_descriptor *adesc;
+	struct sk_buff  *skb;
+	struct scatterlist sg;
+	struct tasklet_struct tasklet;
+	int channel;
+};
+
+#define KS8842_USE_DMA(adapter) (((adapter)->dma_tx.channel != -1) && \
+	 ((adapter)->dma_rx.channel != -1))
+
 struct ks8842_adapter {
 	void __iomem	*hw_addr;
 	int		irq;
@@ -132,8 +166,19 @@ struct ks8842_adapter {
 	spinlock_t	lock; /* spinlock to be interrupt safe */
 	struct work_struct timeout_work;
 	struct net_device *netdev;
+	struct device *dev;
+	struct ks8842_tx_dma_ctl	dma_tx;
+	struct ks8842_rx_dma_ctl	dma_rx;
 };
 
+static void ks8842_dma_rx_cb(void *data);
+static void ks8842_dma_tx_cb(void *data);
+
+static inline void ks8842_resume_dma(struct ks8842_adapter *adapter)
+{
+	iowrite32(1, adapter->hw_addr + REQ_TIMB_DMA_RESUME);
+}
+
 static inline void ks8842_select_bank(struct ks8842_adapter *adapter, u16 bank)
 {
 	iowrite16(bank, adapter->hw_addr + REG_SELECT_BANK);
@@ -297,8 +342,19 @@ static void ks8842_reset_hw(struct ks8842_adapter *adapter)
 	ks8842_write16(adapter, 18, 0xffff, REG_ISR);
 
 	/* enable interrupts */
-	ks8842_write16(adapter, 18, ENABLED_IRQS, REG_IER);
-
+	if (KS8842_USE_DMA(adapter)) {
+		/* When running in DMA Mode the RX interrupt is not enabled in
+		   timberdale because RX data is received by DMA callbacks
+		   it must still be enabled in the KS8842 because it indicates
+		   to timberdale when there is RX data for it's DMA FIFOs */
+		iowrite16(ENABLED_IRQS_DMA_IP, adapter->hw_addr + REG_TIMB_IER);
+		ks8842_write16(adapter, 18, ENABLED_IRQS_DMA, REG_IER);
+	} else {
+		if (!(adapter->conf_flags & MICREL_KS884X))
+			iowrite16(ENABLED_IRQS,
+				adapter->hw_addr + REG_TIMB_IER);
+		ks8842_write16(adapter, 18, ENABLED_IRQS, REG_IER);
+	}
 	/* enable the switch */
 	ks8842_write16(adapter, 32, 0x1, REG_SW_ID_AND_ENABLE);
 }
@@ -371,6 +427,53 @@ static inline u16 ks8842_tx_fifo_space(struct ks8842_adapter *adapter)
 	return ks8842_read16(adapter, 16, REG_TXMIR) & 0x1fff;
 }
 
+static int ks8842_tx_frame_dma(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct ks8842_adapter *adapter = netdev_priv(netdev);
+	struct ks8842_tx_dma_ctl *ctl = &adapter->dma_tx;
+	u8 *buf = ctl->buf;
+
+	if (ctl->adesc) {
+		netdev_dbg(netdev, "%s: TX ongoing\n", __func__);
+		/* transfer ongoing */
+		return NETDEV_TX_BUSY;
+	}
+
+	sg_dma_len(&ctl->sg) = skb->len + sizeof(u32);
+
+	/* copy data to the TX buffer */
+	/* the control word, enable IRQ, port 1 and the length */
+	*buf++ = 0x00;
+	*buf++ = 0x01; /* Port 1 */
+	*buf++ = skb->len & 0xff;
+	*buf++ = (skb->len >> 8) & 0xff;
+	skb_copy_from_linear_data(skb, buf, skb->len);
+
+	dma_sync_single_range_for_device(adapter->dev,
+		sg_dma_address(&ctl->sg), 0, sg_dma_len(&ctl->sg),
+		DMA_TO_DEVICE);
+
+	/* make sure the length is a multiple of 4 */
+	if (sg_dma_len(&ctl->sg) % 4)
+		sg_dma_len(&ctl->sg) += 4 - sg_dma_len(&ctl->sg) % 4;
+
+	ctl->adesc = ctl->chan->device->device_prep_slave_sg(ctl->chan,
+		&ctl->sg, 1, DMA_TO_DEVICE,
+		DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP);
+	if (!ctl->adesc)
+		return NETDEV_TX_BUSY;
+
+	ctl->adesc->callback_param = netdev;
+	ctl->adesc->callback = ks8842_dma_tx_cb;
+	ctl->adesc->tx_submit(ctl->adesc);
+
+	netdev->stats.tx_bytes += skb->len;
+
+	dev_kfree_skb(skb);
+
+	return NETDEV_TX_OK;
+}
+
 static int ks8842_tx_frame(struct sk_buff *skb, struct net_device *netdev)
 {
 	struct ks8842_adapter *adapter = netdev_priv(netdev);
@@ -422,6 +525,121 @@ static int ks8842_tx_frame(struct sk_buff *skb, struct net_device *netdev)
 	return NETDEV_TX_OK;
 }
 
+static void ks8842_update_rx_err_counters(struct net_device *netdev, u32 status)
+{
+	netdev_dbg(netdev, "RX error, status: %x\n", status);
+
+	netdev->stats.rx_errors++;
+	if (status & RXSR_TOO_LONG)
+		netdev->stats.rx_length_errors++;
+	if (status & RXSR_CRC_ERROR)
+		netdev->stats.rx_crc_errors++;
+	if (status & RXSR_RUNT)
+		netdev->stats.rx_frame_errors++;
+}
+
+static void ks8842_update_rx_counters(struct net_device *netdev, u32 status,
+	int len)
+{
+	netdev_dbg(netdev, "RX packet, len: %d\n", len);
+
+	netdev->stats.rx_packets++;
+	netdev->stats.rx_bytes += len;
+	if (status & RXSR_MULTICAST)
+		netdev->stats.multicast++;
+}
+
+static int __ks8842_start_new_rx_dma(struct net_device *netdev)
+{
+	struct ks8842_adapter *adapter = netdev_priv(netdev);
+	struct ks8842_rx_dma_ctl *ctl = &adapter->dma_rx;
+	struct scatterlist *sg = &ctl->sg;
+	int err;
+
+	ctl->skb = netdev_alloc_skb(netdev, DMA_BUFFER_SIZE);
+	if (ctl->skb) {
+		sg_init_table(sg, 1);
+		sg_dma_address(sg) = dma_map_single(adapter->dev,
+			ctl->skb->data, DMA_BUFFER_SIZE, DMA_FROM_DEVICE);
+		err = dma_mapping_error(adapter->dev, sg_dma_address(sg));
+		if (unlikely(err)) {
+			sg_dma_address(sg) = 0;
+			goto out;
+		}
+
+		sg_dma_len(sg) = DMA_BUFFER_SIZE;
+
+		ctl->adesc = ctl->chan->device->device_prep_slave_sg(ctl->chan,
+			sg, 1, DMA_FROM_DEVICE,
+			DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP);
+
+		if (!ctl->adesc)
+			goto out;
+
+		ctl->adesc->callback_param = netdev;
+		ctl->adesc->callback = ks8842_dma_rx_cb;
+		ctl->adesc->tx_submit(ctl->adesc);
+	} else {
+		err = -ENOMEM;
+		sg_dma_address(sg) = 0;
+		goto out;
+	}
+
+	return err;
+out:
+	if (sg_dma_address(sg))
+		dma_unmap_single(adapter->dev, sg_dma_address(sg),
+			DMA_BUFFER_SIZE, DMA_FROM_DEVICE);
+	sg_dma_address(sg) = 0;
+	if (ctl->skb)
+		dev_kfree_skb(ctl->skb);
+
+	ctl->skb = NULL;
+
+	printk(KERN_ERR DRV_NAME": Failed to start RX DMA: %d\n", err);
+	return err;
+}
+
+static void ks8842_rx_frame_dma_tasklet(unsigned long arg)
+{
+	struct net_device *netdev = (struct net_device *)arg;
+	struct ks8842_adapter *adapter = netdev_priv(netdev);
+	struct ks8842_rx_dma_ctl *ctl = &adapter->dma_rx;
+	struct sk_buff *skb = ctl->skb;
+	dma_addr_t addr = sg_dma_address(&ctl->sg);
+	u32 status;
+
+	ctl->adesc = NULL;
+
+	/* kick next transfer going */
+	__ks8842_start_new_rx_dma(netdev);
+
+	/* now handle the data we got */
+	dma_unmap_single(adapter->dev, addr, DMA_BUFFER_SIZE, DMA_FROM_DEVICE);
+
+	status = *((u32 *)skb->data);
+
+	netdev_dbg(netdev, "%s - rx_data: status: %x\n",
+		__func__, status & 0xffff);
+
+	/* check the status */
+	if ((status & RXSR_VALID) && !(status & RXSR_ERROR)) {
+		int len = (status >> 16) & 0x7ff;
+
+		ks8842_update_rx_counters(netdev, status, len);
+
+		/* reserve 4 bytes which is the status word */
+		skb_reserve(skb, 4);
+		skb_put(skb, len);
+
+		skb->protocol = eth_type_trans(skb, netdev);
+		netif_rx(skb);
+	} else {
+		ks8842_update_rx_err_counters(netdev, status);
+		dev_kfree_skb(skb);
+	}
+}
+
 static void ks8842_rx_frame(struct net_device *netdev,
 	struct ks8842_adapter *adapter)
 {
@@ -445,13 +663,9 @@ static void ks8842_rx_frame(struct net_device *netdev,
 	if ((status & RXSR_VALID) && !(status & RXSR_ERROR)) {
 		struct sk_buff *skb = netdev_alloc_skb_ip_align(netdev, len);
 
-		netdev_dbg(netdev, "%s, got package, len: %d\n", __func__, len);
 		if (skb) {
 
-			netdev->stats.rx_packets++;
-			netdev->stats.rx_bytes += len;
-			if (status & RXSR_MULTICAST)
-				netdev->stats.multicast++;
+			ks8842_update_rx_counters(netdev, status, len);
 
 			if (adapter->conf_flags & KS884X_16BIT) {
 				u16 *data16 = (u16 *)skb_put(skb, len);
@@ -477,16 +691,8 @@ static void ks8842_rx_frame(struct net_device *netdev,
 			netif_rx(skb);
 		} else
 			netdev->stats.rx_dropped++;
-	} else {
-		netdev_dbg(netdev, "RX error, status: %x\n", status);
-		netdev->stats.rx_errors++;
-		if (status & RXSR_TOO_LONG)
-			netdev->stats.rx_length_errors++;
-		if (status & RXSR_CRC_ERROR)
-			netdev->stats.rx_crc_errors++;
-		if (status & RXSR_RUNT)
-			netdev->stats.rx_frame_errors++;
-	}
+	} else
+		ks8842_update_rx_err_counters(netdev, status);
 
 	/* set high watermark to 3K */
 	ks8842_clear_bits(adapter, 0, 1 << 12, REG_QRFCR);
@@ -541,6 +747,12 @@ void ks8842_tasklet(unsigned long arg)
 	isr = ks8842_read16(adapter, 18, REG_ISR);
 	netdev_dbg(netdev, "%s - ISR: 0x%x\n", __func__, isr);
 
+	/* when running in DMA mode, do not ack RX interrupts, it is handled
+	   internally by timberdale, otherwise it's DMA FIFO:s would stop
+	*/
+	if (KS8842_USE_DMA(adapter))
+		isr &= ~IRQ_RX;
+
 	/* Ack */
 	ks8842_write16(adapter, 18, isr, REG_ISR);
 
@@ -554,9 +766,11 @@ void ks8842_tasklet(unsigned long arg)
 	if (isr & IRQ_LINK_CHANGE)
 		ks8842_update_link_status(netdev, adapter);
 
-	if (isr & (IRQ_RX | IRQ_RX_ERROR))
+	/* should not get IRQ_RX when running DMA mode */
+	if (isr & (IRQ_RX | IRQ_RX_ERROR) && !KS8842_USE_DMA(adapter))
 		ks8842_handle_rx(netdev, adapter);
 
+	/* should only happen when in PIO mode */
 	if (isr & IRQ_TX)
 		ks8842_handle_tx(netdev, adapter);
 
@@ -575,8 +789,17 @@ void ks8842_tasklet(unsigned long arg)
 
 	/* re-enable interrupts, put back the bank selection register */
 	spin_lock_irqsave(&adapter->lock, flags);
-	ks8842_write16(adapter, 18, ENABLED_IRQS, REG_IER);
+	if (KS8842_USE_DMA(adapter))
+		ks8842_write16(adapter, 18, ENABLED_IRQS_DMA, REG_IER);
+	else
+		ks8842_write16(adapter, 18, ENABLED_IRQS, REG_IER);
 	iowrite16(entry_bank, adapter->hw_addr + REG_SELECT_BANK);
+
+	/* Make sure timberdale continues DMA operations, they are stopped while
+	   we are handling the ks8842 because we might change bank */
+	if (KS8842_USE_DMA(adapter))
+		ks8842_resume_dma(adapter);
+
 	spin_unlock_irqrestore(&adapter->lock, flags);
 }
 
@@ -592,8 +815,12 @@ static irqreturn_t ks8842_irq(int irq, void *devid)
 	netdev_dbg(netdev, "%s - ISR: 0x%x\n", __func__, isr);
 
 	if (isr) {
-		/* disable IRQ */
-		ks8842_write16(adapter, 18, 0x00, REG_IER);
+		if (KS8842_USE_DMA(adapter))
+			/* disable all but RX IRQ, since the FPGA relies on it*/
+			ks8842_write16(adapter, 18, IRQ_RX, REG_IER);
+		else
+			/* disable IRQ */
+			ks8842_write16(adapter, 18, 0x00, REG_IER);
 
 		/* schedule tasklet */
 		tasklet_schedule(&adapter->tasklet);
@@ -603,9 +830,151 @@ static irqreturn_t ks8842_irq(int irq, void *devid)
 
 	iowrite16(entry_bank, adapter->hw_addr + REG_SELECT_BANK);
 
+	/* After an interrupt, tell timberdale to continue DMA operations.
+	   DMA is disabled while we are handling the ks8842 because we might
+	   change bank */
+	ks8842_resume_dma(adapter);
+
 	return ret;
 }
 
+static void ks8842_dma_rx_cb(void *data)
+{
+	struct net_device	*netdev = data;
+	struct ks8842_adapter	*adapter = netdev_priv(netdev);
+
+	netdev_dbg(netdev, "RX DMA finished\n");
+	/* schedule tasklet */
+	if (adapter->dma_rx.adesc)
+		tasklet_schedule(&adapter->dma_rx.tasklet);
+}
+
+static void ks8842_dma_tx_cb(void *data)
+{
+	struct net_device		*netdev = data;
+	struct ks8842_adapter		*adapter = netdev_priv(netdev);
+	struct ks8842_tx_dma_ctl	*ctl = &adapter->dma_tx;
+
+	netdev_dbg(netdev, "TX DMA finished\n");
+
+	if (!ctl->adesc)
+		return;
+
+	netdev->stats.tx_packets++;
+	ctl->adesc = NULL;
+
+	if (netif_queue_stopped(netdev))
+		netif_wake_queue(netdev);
+}
+
+static void ks8842_stop_dma(struct ks8842_adapter *adapter)
+{
+	struct ks8842_tx_dma_ctl *tx_ctl = &adapter->dma_tx;
+	struct ks8842_rx_dma_ctl *rx_ctl = &adapter->dma_rx;
+
+	tx_ctl->adesc = NULL;
+	if (tx_ctl->chan)
+		tx_ctl->chan->device->device_control(tx_ctl->chan,
+			DMA_TERMINATE_ALL, 0);
+
+	rx_ctl->adesc = NULL;
+	if (rx_ctl->chan)
+		rx_ctl->chan->device->device_control(rx_ctl->chan,
+			DMA_TERMINATE_ALL, 0);
+
+	if (sg_dma_address(&rx_ctl->sg))
+		dma_unmap_single(adapter->dev, sg_dma_address(&rx_ctl->sg),
+			DMA_BUFFER_SIZE, DMA_FROM_DEVICE);
+	sg_dma_address(&rx_ctl->sg) = 0;
+
+	dev_kfree_skb(rx_ctl->skb);
+	rx_ctl->skb = NULL;
+}
+
+static void ks8842_dealloc_dma_bufs(struct ks8842_adapter *adapter)
+{
+	struct ks8842_tx_dma_ctl *tx_ctl = &adapter->dma_tx;
+	struct ks8842_rx_dma_ctl *rx_ctl = &adapter->dma_rx;
+
+	ks8842_stop_dma(adapter);
+
+	if (tx_ctl->chan)
+		dma_release_channel(tx_ctl->chan);
+	tx_ctl->chan = NULL;
+
+	if (rx_ctl->chan)
+		dma_release_channel(rx_ctl->chan);
+	rx_ctl->chan = NULL;
+
+	tasklet_kill(&rx_ctl->tasklet);
+
+	if (sg_dma_address(&tx_ctl->sg))
+		dma_unmap_single(adapter->dev, sg_dma_address(&tx_ctl->sg),
+			DMA_BUFFER_SIZE, DMA_TO_DEVICE);
+	sg_dma_address(&tx_ctl->sg) = 0;
+
+	kfree(tx_ctl->buf);
+	tx_ctl->buf = NULL;
+}
+
+static bool ks8842_dma_filter_fn(struct dma_chan *chan, void *filter_param)
+{
+	return chan->chan_id == (int)filter_param;
+}
+
+static int ks8842_alloc_dma_bufs(struct net_device *netdev)
+{
+	struct ks8842_adapter *adapter = netdev_priv(netdev);
+	struct ks8842_tx_dma_ctl *tx_ctl = &adapter->dma_tx;
+	struct ks8842_rx_dma_ctl *rx_ctl = &adapter->dma_rx;
+	int err;
+
+	dma_cap_mask_t mask;
+
+	dma_cap_zero(mask);
+	dma_cap_set(DMA_SLAVE, mask);
+	dma_cap_set(DMA_PRIVATE, mask);
+
+	sg_init_table(&tx_ctl->sg, 1);
+
+	tx_ctl->chan = dma_request_channel(mask, ks8842_dma_filter_fn,
+		(void *)tx_ctl->channel);
+	if (!tx_ctl->chan) {
+		err = -ENODEV;
+		goto err;
+	}
+
+	/* allocate DMA buffer */
+	tx_ctl->buf = kmalloc(DMA_BUFFER_SIZE, GFP_KERNEL);
+	if (!tx_ctl->buf) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	sg_dma_address(&tx_ctl->sg) = dma_map_single(adapter->dev,
+		tx_ctl->buf, DMA_BUFFER_SIZE, DMA_TO_DEVICE);
+	err = dma_mapping_error(adapter->dev,
+		sg_dma_address(&tx_ctl->sg));
+	if (err) {
+		sg_dma_address(&tx_ctl->sg) = 0;
+		goto err;
+	}
+
+	rx_ctl->chan = dma_request_channel(mask, ks8842_dma_filter_fn,
+		(void *)rx_ctl->channel);
+	if (!rx_ctl->chan) {
+		err = -ENODEV;
+		goto err;
+	}
+
+	tasklet_init(&rx_ctl->tasklet, ks8842_rx_frame_dma_tasklet,
+		(unsigned long)netdev);
+
+	return 0;
+err:
+	ks8842_dealloc_dma_bufs(adapter);
+	return err;
+}
 
 /* Netdevice operations */
 
@@ -616,6 +985,25 @@ static int ks8842_open(struct net_device *netdev)
 
 	netdev_dbg(netdev, "%s - entry\n", __func__);
 
+	if (KS8842_USE_DMA(adapter)) {
+		err = ks8842_alloc_dma_bufs(netdev);
+
+		if (!err) {
+			/* start RX dma */
+			err = __ks8842_start_new_rx_dma(netdev);
+			if (err)
+				ks8842_dealloc_dma_bufs(adapter);
+		}
+
+		if (err) {
+			printk(KERN_WARNING DRV_NAME
+				": Failed to initiate DMA, running PIO\n");
+			ks8842_dealloc_dma_bufs(adapter);
+			adapter->dma_rx.channel = -1;
+			adapter->dma_tx.channel = -1;
+		}
+	}
+
 	/* reset the HW */
 	ks8842_reset_hw(adapter);
 
@@ -641,6 +1029,9 @@ static int ks8842_close(struct net_device *netdev)
 
 	cancel_work_sync(&adapter->timeout_work);
 
+	if (KS8842_USE_DMA(adapter))
+		ks8842_dealloc_dma_bufs(adapter);
+
 	/* free the irq */
 	free_irq(adapter->irq, netdev);
 
@@ -658,6 +1049,17 @@ static netdev_tx_t ks8842_xmit_frame(struct sk_buff *skb,
 
 	netdev_dbg(netdev, "%s: entry\n", __func__);
 
+	if (KS8842_USE_DMA(adapter)) {
+		unsigned long flags;
+		ret = ks8842_tx_frame_dma(skb, netdev);
+		/* for now only allow one transfer at the time */
+		spin_lock_irqsave(&adapter->lock, flags);
+		if (adapter->dma_tx.adesc)
+			netif_stop_queue(netdev);
+		spin_unlock_irqrestore(&adapter->lock, flags);
+		return ret;
+	}
+
 	ret = ks8842_tx_frame(skb, netdev);
 
 	if (ks8842_tx_fifo_space(adapter) <  netdev->mtu + 8)
@@ -693,6 +1095,10 @@ static void ks8842_tx_timeout_work(struct work_struct *work)
 	netdev_dbg(netdev, "%s: entry\n", __func__);
 
 	spin_lock_irqsave(&adapter->lock, flags);
+
+	if (KS8842_USE_DMA(adapter))
+		ks8842_stop_dma(adapter);
+
 	/* disable interrupts */
 	ks8842_write16(adapter, 18, 0, REG_IER);
 	ks8842_write16(adapter, 18, 0xFFFF, REG_ISR);
@@ -706,6 +1112,9 @@ static void ks8842_tx_timeout_work(struct work_struct *work)
 	ks8842_write_mac_addr(adapter, netdev->dev_addr);
 
 	ks8842_update_link_status(netdev, adapter);
+
+	if (KS8842_USE_DMA(adapter))
+		__ks8842_start_new_rx_dma(netdev);
 }
 
 static void ks8842_tx_timeout(struct net_device *netdev)
@@ -765,6 +1174,19 @@ static int __devinit ks8842_probe(struct platform_device *pdev)
 		goto err_get_irq;
 	}
 
+	adapter->dev = (pdev->dev.parent) ? pdev->dev.parent : &pdev->dev;
+
+	/* DMA is only supported when accessed via timberdale */
+	if (!(adapter->conf_flags & MICREL_KS884X) && pdata &&
+		(pdata->tx_dma_channel != -1) &&
+		(pdata->rx_dma_channel != -1)) {
+		adapter->dma_rx.channel = pdata->rx_dma_channel;
+		adapter->dma_tx.channel = pdata->tx_dma_channel;
+	} else {
+		adapter->dma_rx.channel = -1;
+		adapter->dma_tx.channel = -1;
+	}
+
 	tasklet_init(&adapter->tasklet, ks8842_tasklet, (unsigned long)netdev);
 	spin_lock_init(&adapter->lock);
 
diff --git a/include/linux/ks8842.h b/include/linux/ks8842.h
index da0341b8ca0a..14ba4452296e 100644
--- a/include/linux/ks8842.h
+++ b/include/linux/ks8842.h
@@ -25,10 +25,14 @@
  * struct ks8842_platform_data - Platform data of the KS8842 network driver
  * @macaddr:	The MAC address of the device, set to all 0:s to use the on in
  *		the chip.
+ * @rx_dma_channel:	The DMA channel to use for RX, -1 for none.
+ * @tx_dma_channel:	The DMA channel to use for TX, -1 for none.
  *
  */
 struct ks8842_platform_data {
 	u8 macaddr[ETH_ALEN];
+	int rx_dma_channel;
+	int tx_dma_channel;
 };
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From b3c567e474b5ba4447b6e16063a3b0cffc22d205 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Wed, 21 Jul 2010 13:28:10 +0530
Subject: intel_mid: Add Mrst & Mfld DMA Drivers

This patch add DMA drivers for DMA controllers in Langwell chipset
of Intel(R) Moorestown platform and DMA controllers in Penwell of
Intel(R) Medfield platfrom

This patch adds support for Moorestown DMAC1 and DMAC2 controllers.
It also add support for Medfiled GP DMA and DMAC1 controllers.
These controllers supports memory to peripheral and peripheral to
memory transfers. It support only single block transfers.

This driver is based on Kernel DMA engine
Anyone who wishes to use this controller should use DMA engine APIs

This controller exposes DMA_SLAVE capabilities and notifies the client drivers
of DMA transaction completion

Config option required to be enabled CONFIG_INTEL_MID_DMAC=y

Signed-off-by: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/Kconfig              |   13 +
 drivers/dma/Makefile             |    1 +
 drivers/dma/intel_mid_dma.c      | 1143 ++++++++++++++++++++++++++++++++++++++
 drivers/dma/intel_mid_dma_regs.h |  260 +++++++++
 include/linux/intel_mid_dma.h    |   86 +++
 5 files changed, 1503 insertions(+)
 create mode 100644 drivers/dma/intel_mid_dma.c
 create mode 100644 drivers/dma/intel_mid_dma_regs.h
 create mode 100644 include/linux/intel_mid_dma.h

(limited to 'include/linux')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 9e01e96fee94..6a3e5bfa6742 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -33,6 +33,19 @@ if DMADEVICES
 
 comment "DMA Devices"
 
+config INTEL_MID_DMAC
+	tristate "Intel MID DMA support for Peripheral DMA controllers"
+	depends on PCI && X86
+	select DMA_ENGINE
+	default n
+	help
+	  Enable support for the Intel(R) MID DMA engine present
+	  in Intel MID chipsets.
+
+	  Say Y here if you have such a chipset.
+
+	  If unsure, say N.
+
 config ASYNC_TX_DISABLE_CHANNEL_SWITCH
 	bool
 
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 0fe5ebbfda5d..27b6c69ae639 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -7,6 +7,7 @@ endif
 
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
+obj-$(CONFIG_INTEL_MID_DMAC) += intel_mid_dma.o
 obj-$(CONFIG_DMATEST) += dmatest.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioat/
 obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
diff --git a/drivers/dma/intel_mid_dma.c b/drivers/dma/intel_mid_dma.c
new file mode 100644
index 000000000000..c2591e8d9b6e
--- /dev/null
+++ b/drivers/dma/intel_mid_dma.c
@@ -0,0 +1,1143 @@
+/*
+ *  intel_mid_dma.c - Intel Langwell DMA Drivers
+ *
+ *  Copyright (C) 2008-10 Intel Corp
+ *  Author: Vinod Koul <vinod.koul@intel.com>
+ *  The driver design is based on dw_dmac driver
+ *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *
+ */
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/intel_mid_dma.h>
+
+#define MAX_CHAN	4 /*max ch across controllers*/
+#include "intel_mid_dma_regs.h"
+
+#define INTEL_MID_DMAC1_ID		0x0814
+#define INTEL_MID_DMAC2_ID		0x0813
+#define INTEL_MID_GP_DMAC2_ID		0x0827
+#define INTEL_MFLD_DMAC1_ID		0x0830
+#define LNW_PERIPHRAL_MASK_BASE		0xFFAE8008
+#define LNW_PERIPHRAL_MASK_SIZE		0x10
+#define LNW_PERIPHRAL_STATUS		0x0
+#define LNW_PERIPHRAL_MASK		0x8
+
+struct intel_mid_dma_probe_info {
+	u8 max_chan;
+	u8 ch_base;
+	u16 block_size;
+	u32 pimr_mask;
+};
+
+#define INFO(_max_chan, _ch_base, _block_size, _pimr_mask) \
+	((kernel_ulong_t)&(struct intel_mid_dma_probe_info) {	\
+		.max_chan = (_max_chan),			\
+		.ch_base = (_ch_base),				\
+		.block_size = (_block_size),			\
+		.pimr_mask = (_pimr_mask),			\
+	})
+
+/*****************************************************************************
+Utility Functions*/
+/**
+ * get_ch_index	-	convert status to channel
+ * @status: status mask
+ * @base: dma ch base value
+ *
+ * Modify the status mask and return the channel index needing
+ * attention (or -1 if neither)
+ */
+static int get_ch_index(int *status, unsigned int base)
+{
+	int i;
+	for (i = 0; i < MAX_CHAN; i++) {
+		if (*status & (1 << (i + base))) {
+			*status = *status & ~(1 << (i + base));
+			pr_debug("MDMA: index %d New status %x\n", i, *status);
+			return i;
+		}
+	}
+	return -1;
+}
+
+/**
+ * get_block_ts	-	calculates dma transaction length
+ * @len: dma transfer length
+ * @tx_width: dma transfer src width
+ * @block_size: dma controller max block size
+ *
+ * Based on src width calculate the DMA trsaction length in data items
+ * return data items or FFFF if exceeds max length for block
+ */
+static int get_block_ts(int len, int tx_width, int block_size)
+{
+	int byte_width = 0, block_ts = 0;
+
+	switch (tx_width) {
+	case LNW_DMA_WIDTH_8BIT:
+		byte_width = 1;
+		break;
+	case LNW_DMA_WIDTH_16BIT:
+		byte_width = 2;
+		break;
+	case LNW_DMA_WIDTH_32BIT:
+	default:
+		byte_width = 4;
+		break;
+	}
+
+	block_ts = len/byte_width;
+	if (block_ts > block_size)
+		block_ts = 0xFFFF;
+	return block_ts;
+}
+
+/*****************************************************************************
+DMAC1 interrupt Functions*/
+
+/**
+ * dmac1_mask_periphral_intr -	mask the periphral interrupt
+ * @midc: dma channel for which masking is required
+ *
+ * Masks the DMA periphral interrupt
+ * this is valid for DMAC1 family controllers only
+ * This controller should have periphral mask registers already mapped
+ */
+static void dmac1_mask_periphral_intr(struct intel_mid_dma_chan *midc)
+{
+	u32 pimr;
+	struct middma_device *mid = to_middma_device(midc->chan.device);
+
+	if (mid->pimr_mask) {
+		pimr = readl(mid->mask_reg + LNW_PERIPHRAL_MASK);
+		pimr |= mid->pimr_mask;
+		writel(pimr, mid->mask_reg + LNW_PERIPHRAL_MASK);
+	}
+	return;
+}
+
+/**
+ * dmac1_unmask_periphral_intr -	unmask the periphral interrupt
+ * @midc: dma channel for which masking is required
+ *
+ * UnMasks the DMA periphral interrupt,
+ * this is valid for DMAC1 family controllers only
+ * This controller should have periphral mask registers already mapped
+ */
+static void dmac1_unmask_periphral_intr(struct intel_mid_dma_chan *midc)
+{
+	u32 pimr;
+	struct middma_device *mid = to_middma_device(midc->chan.device);
+
+	if (mid->pimr_mask) {
+		pimr = readl(mid->mask_reg + LNW_PERIPHRAL_MASK);
+		pimr &= ~mid->pimr_mask;
+		writel(pimr, mid->mask_reg + LNW_PERIPHRAL_MASK);
+	}
+	return;
+}
+
+/**
+ * enable_dma_interrupt -	enable the periphral interrupt
+ * @midc: dma channel for which enable interrupt is required
+ *
+ * Enable the DMA periphral interrupt,
+ * this is valid for DMAC1 family controllers only
+ * This controller should have periphral mask registers already mapped
+ */
+static void enable_dma_interrupt(struct intel_mid_dma_chan *midc)
+{
+	dmac1_unmask_periphral_intr(midc);
+
+	/*en ch interrupts*/
+	iowrite32(UNMASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_TFR);
+	iowrite32(UNMASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_ERR);
+	return;
+}
+
+/**
+ * disable_dma_interrupt -	disable the periphral interrupt
+ * @midc: dma channel for which disable interrupt is required
+ *
+ * Disable the DMA periphral interrupt,
+ * this is valid for DMAC1 family controllers only
+ * This controller should have periphral mask registers already mapped
+ */
+static void disable_dma_interrupt(struct intel_mid_dma_chan *midc)
+{
+	/*Check LPE PISR, make sure fwd is disabled*/
+	dmac1_mask_periphral_intr(midc);
+	iowrite32(MASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_BLOCK);
+	iowrite32(MASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_TFR);
+	iowrite32(MASK_INTR_REG(midc->ch_id), midc->dma_base + MASK_ERR);
+	return;
+}
+
+/*****************************************************************************
+DMA channel helper Functions*/
+/**
+ * mid_desc_get		-	get a descriptor
+ * @midc: dma channel for which descriptor is required
+ *
+ * Obtain a descriptor for the channel. Returns NULL if none are free.
+ * Once the descriptor is returned it is private until put on another
+ * list or freed
+ */
+static struct intel_mid_dma_desc *midc_desc_get(struct intel_mid_dma_chan *midc)
+{
+	struct intel_mid_dma_desc *desc, *_desc;
+	struct intel_mid_dma_desc *ret = NULL;
+
+	spin_lock_bh(&midc->lock);
+	list_for_each_entry_safe(desc, _desc, &midc->free_list, desc_node) {
+		if (async_tx_test_ack(&desc->txd)) {
+			list_del(&desc->desc_node);
+			ret = desc;
+			break;
+		}
+	}
+	spin_unlock_bh(&midc->lock);
+	return ret;
+}
+
+/**
+ * mid_desc_put		-	put a descriptor
+ * @midc: dma channel for which descriptor is required
+ * @desc: descriptor to put
+ *
+ * Return a descriptor from lwn_desc_get back to the free pool
+ */
+static void midc_desc_put(struct intel_mid_dma_chan *midc,
+			struct intel_mid_dma_desc *desc)
+{
+	if (desc) {
+		spin_lock_bh(&midc->lock);
+		list_add_tail(&desc->desc_node, &midc->free_list);
+		spin_unlock_bh(&midc->lock);
+	}
+}
+/**
+ * midc_dostart		-		begin a DMA transaction
+ * @midc: channel for which txn is to be started
+ * @first: first descriptor of series
+ *
+ * Load a transaction into the engine. This must be called with midc->lock
+ * held and bh disabled.
+ */
+static void midc_dostart(struct intel_mid_dma_chan *midc,
+			struct intel_mid_dma_desc *first)
+{
+	struct middma_device *mid = to_middma_device(midc->chan.device);
+
+	/*  channel is idle */
+	if (midc->in_use && test_ch_en(midc->dma_base, midc->ch_id)) {
+		/*error*/
+		pr_err("ERR_MDMA: channel is busy in start\n");
+		/* The tasklet will hopefully advance the queue... */
+		return;
+	}
+
+	/*write registers and en*/
+	iowrite32(first->sar, midc->ch_regs + SAR);
+	iowrite32(first->dar, midc->ch_regs + DAR);
+	iowrite32(first->cfg_hi, midc->ch_regs + CFG_HIGH);
+	iowrite32(first->cfg_lo, midc->ch_regs + CFG_LOW);
+	iowrite32(first->ctl_lo, midc->ch_regs + CTL_LOW);
+	iowrite32(first->ctl_hi, midc->ch_regs + CTL_HIGH);
+	pr_debug("MDMA:TX SAR %x,DAR %x,CFGL %x,CFGH %x,CTLH %x, CTLL %x\n",
+		(int)first->sar, (int)first->dar, first->cfg_hi,
+		first->cfg_lo, first->ctl_hi, first->ctl_lo);
+
+	iowrite32(ENABLE_CHANNEL(midc->ch_id), mid->dma_base + DMA_CHAN_EN);
+	first->status = DMA_IN_PROGRESS;
+}
+
+/**
+ * midc_descriptor_complete	-	process completed descriptor
+ * @midc: channel owning the descriptor
+ * @desc: the descriptor itself
+ *
+ * Process a completed descriptor and perform any callbacks upon
+ * the completion. The completion handling drops the lock during the
+ * callbacks but must be called with the lock held.
+ */
+static void midc_descriptor_complete(struct intel_mid_dma_chan *midc,
+	       struct intel_mid_dma_desc *desc)
+{
+	struct dma_async_tx_descriptor	*txd = &desc->txd;
+	dma_async_tx_callback callback_txd = NULL;
+	void *param_txd = NULL;
+
+	midc->completed = txd->cookie;
+	callback_txd = txd->callback;
+	param_txd = txd->callback_param;
+
+	list_move(&desc->desc_node, &midc->free_list);
+
+	spin_unlock_bh(&midc->lock);
+	if (callback_txd) {
+		pr_debug("MDMA: TXD callback set ... calling\n");
+		callback_txd(param_txd);
+		spin_lock_bh(&midc->lock);
+		return;
+	}
+	spin_lock_bh(&midc->lock);
+
+}
+/**
+ * midc_scan_descriptors -		check the descriptors in channel
+ *					mark completed when tx is completete
+ * @mid: device
+ * @midc: channel to scan
+ *
+ * Walk the descriptor chain for the device and process any entries
+ * that are complete.
+ */
+static void midc_scan_descriptors(struct middma_device *mid,
+				struct intel_mid_dma_chan *midc)
+{
+	struct intel_mid_dma_desc *desc = NULL, *_desc = NULL;
+
+	/*tx is complete*/
+	list_for_each_entry_safe(desc, _desc, &midc->active_list, desc_node) {
+		if (desc->status == DMA_IN_PROGRESS)  {
+			desc->status = DMA_SUCCESS;
+			midc_descriptor_complete(midc, desc);
+		}
+	}
+	return;
+}
+
+/*****************************************************************************
+DMA engine callback Functions*/
+/**
+ * intel_mid_dma_tx_submit -	callback to submit DMA transaction
+ * @tx: dma engine descriptor
+ *
+ * Submit the DMA trasaction for this descriptor, start if ch idle
+ */
+static dma_cookie_t intel_mid_dma_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct intel_mid_dma_desc	*desc = to_intel_mid_dma_desc(tx);
+	struct intel_mid_dma_chan	*midc = to_intel_mid_dma_chan(tx->chan);
+	dma_cookie_t		cookie;
+
+	spin_lock_bh(&midc->lock);
+	cookie = midc->chan.cookie;
+
+	if (++cookie < 0)
+		cookie = 1;
+
+	midc->chan.cookie = cookie;
+	desc->txd.cookie = cookie;
+
+
+	if (list_empty(&midc->active_list)) {
+		midc_dostart(midc, desc);
+		list_add_tail(&desc->desc_node, &midc->active_list);
+	} else {
+		list_add_tail(&desc->desc_node, &midc->queue);
+	}
+	spin_unlock_bh(&midc->lock);
+
+	return cookie;
+}
+
+/**
+ * intel_mid_dma_issue_pending -	callback to issue pending txn
+ * @chan: chan where pending trascation needs to be checked and submitted
+ *
+ * Call for scan to issue pending descriptors
+ */
+static void intel_mid_dma_issue_pending(struct dma_chan *chan)
+{
+	struct intel_mid_dma_chan	*midc = to_intel_mid_dma_chan(chan);
+
+	spin_lock_bh(&midc->lock);
+	if (!list_empty(&midc->queue))
+		midc_scan_descriptors(to_middma_device(chan->device), midc);
+	spin_unlock_bh(&midc->lock);
+}
+
+/**
+ * intel_mid_dma_tx_status -	Return status of txn
+ * @chan: chan for where status needs to be checked
+ * @cookie: cookie for txn
+ * @txstate: DMA txn state
+ *
+ * Return status of DMA txn
+ */
+static enum dma_status intel_mid_dma_tx_status(struct dma_chan *chan,
+						dma_cookie_t cookie,
+						struct dma_tx_state *txstate)
+{
+	struct intel_mid_dma_chan	*midc = to_intel_mid_dma_chan(chan);
+	dma_cookie_t		last_used;
+	dma_cookie_t		last_complete;
+	int				ret;
+
+	last_complete = midc->completed;
+	last_used = chan->cookie;
+
+	ret = dma_async_is_complete(cookie, last_complete, last_used);
+	if (ret != DMA_SUCCESS) {
+		midc_scan_descriptors(to_middma_device(chan->device), midc);
+
+		last_complete = midc->completed;
+		last_used = chan->cookie;
+
+		ret = dma_async_is_complete(cookie, last_complete, last_used);
+	}
+
+	if (txstate) {
+		txstate->last = last_complete;
+		txstate->used = last_used;
+		txstate->residue = 0;
+	}
+	return ret;
+}
+
+/**
+ * intel_mid_dma_device_control -	DMA device control
+ * @chan: chan for DMA control
+ * @cmd: control cmd
+ * @arg: cmd arg value
+ *
+ * Perform DMA control command
+ */
+static int intel_mid_dma_device_control(struct dma_chan *chan,
+			enum dma_ctrl_cmd cmd, unsigned long arg)
+{
+	struct intel_mid_dma_chan	*midc = to_intel_mid_dma_chan(chan);
+	struct middma_device	*mid = to_middma_device(chan->device);
+	struct intel_mid_dma_desc	*desc, *_desc;
+	LIST_HEAD(list);
+
+	if (cmd != DMA_TERMINATE_ALL)
+		return -ENXIO;
+
+	spin_lock_bh(&midc->lock);
+	if (midc->in_use == false) {
+		spin_unlock_bh(&midc->lock);
+		return 0;
+	}
+	list_splice_init(&midc->free_list, &list);
+	midc->descs_allocated = 0;
+	midc->slave = NULL;
+
+	/* Disable interrupts */
+	disable_dma_interrupt(midc);
+
+	spin_unlock_bh(&midc->lock);
+	list_for_each_entry_safe(desc, _desc, &list, desc_node) {
+		pr_debug("MDMA: freeing descriptor %p\n", desc);
+		pci_pool_free(mid->dma_pool, desc, desc->txd.phys);
+	}
+	return 0;
+}
+
+/**
+ * intel_mid_dma_prep_slave_sg -	Prep slave sg txn
+ * @chan: chan for DMA transfer
+ * @sgl: scatter gather list
+ * @sg_len: length of sg txn
+ * @direction: DMA transfer dirtn
+ * @flags: DMA flags
+ *
+ * Do DMA sg txn: NOT supported now
+ */
+static struct dma_async_tx_descriptor *intel_mid_dma_prep_slave_sg(
+			struct dma_chan *chan, struct scatterlist *sgl,
+			unsigned int sg_len, enum dma_data_direction direction,
+			unsigned long flags)
+{
+	/*not supported now*/
+	return NULL;
+}
+
+/**
+ * intel_mid_dma_prep_memcpy -	Prep memcpy txn
+ * @chan: chan for DMA transfer
+ * @dest: destn address
+ * @src: src address
+ * @len: DMA transfer len
+ * @flags: DMA flags
+ *
+ * Perform a DMA memcpy. Note we support slave periphral DMA transfers only
+ * The periphral txn details should be filled in slave structure properly
+ * Returns the descriptor for this txn
+ */
+static struct dma_async_tx_descriptor *intel_mid_dma_prep_memcpy(
+			struct dma_chan *chan, dma_addr_t dest,
+			dma_addr_t src, size_t len, unsigned long flags)
+{
+	struct intel_mid_dma_chan *midc;
+	struct intel_mid_dma_desc *desc = NULL;
+	struct intel_mid_dma_slave *mids;
+	union intel_mid_dma_ctl_lo ctl_lo;
+	union intel_mid_dma_ctl_hi ctl_hi;
+	union intel_mid_dma_cfg_lo cfg_lo;
+	union intel_mid_dma_cfg_hi cfg_hi;
+	enum intel_mid_dma_width width = 0;
+
+	pr_debug("MDMA: Prep for memcpy\n");
+	WARN_ON(!chan);
+	if (!len)
+		return NULL;
+
+	mids = chan->private;
+	WARN_ON(!mids);
+
+	midc = to_intel_mid_dma_chan(chan);
+	WARN_ON(!midc);
+
+	pr_debug("MDMA:called for DMA %x CH %d Length %zu\n",
+				midc->dma->pci_id, midc->ch_id, len);
+	pr_debug("MDMA:Cfg passed Mode %x, Dirn %x, HS %x, Width %x\n",
+		mids->cfg_mode, mids->dirn, mids->hs_mode, mids->src_width);
+
+	/*calculate CFG_LO*/
+	if (mids->hs_mode == LNW_DMA_SW_HS) {
+		cfg_lo.cfg_lo = 0;
+		cfg_lo.cfgx.hs_sel_dst = 1;
+		cfg_lo.cfgx.hs_sel_src = 1;
+	} else if (mids->hs_mode == LNW_DMA_HW_HS)
+		cfg_lo.cfg_lo = 0x00000;
+
+	/*calculate CFG_HI*/
+	if (mids->cfg_mode == LNW_DMA_MEM_TO_MEM) {
+		/*SW HS only*/
+		cfg_hi.cfg_hi = 0;
+	} else {
+		cfg_hi.cfg_hi = 0;
+		if (midc->dma->pimr_mask) {
+			cfg_hi.cfgx.protctl = 0x0; /*default value*/
+			cfg_hi.cfgx.fifo_mode = 1;
+			if (mids->dirn == DMA_TO_DEVICE) {
+				cfg_hi.cfgx.src_per = 0;
+				if (mids->device_instance == 0)
+					cfg_hi.cfgx.dst_per = 3;
+				if (mids->device_instance == 1)
+					cfg_hi.cfgx.dst_per = 1;
+			} else if (mids->dirn == DMA_FROM_DEVICE) {
+				if (mids->device_instance == 0)
+					cfg_hi.cfgx.src_per = 2;
+				if (mids->device_instance == 1)
+					cfg_hi.cfgx.src_per = 0;
+				cfg_hi.cfgx.dst_per = 0;
+			}
+		} else {
+			cfg_hi.cfgx.protctl = 0x1; /*default value*/
+			cfg_hi.cfgx.src_per = cfg_hi.cfgx.dst_per =
+					midc->ch_id - midc->dma->chan_base;
+		}
+	}
+
+	/*calculate CTL_HI*/
+	ctl_hi.ctlx.reser = 0;
+	width = mids->src_width;
+
+	ctl_hi.ctlx.block_ts = get_block_ts(len, width, midc->dma->block_size);
+	pr_debug("MDMA:calc len %d for block size %d\n",
+				ctl_hi.ctlx.block_ts, midc->dma->block_size);
+	/*calculate CTL_LO*/
+	ctl_lo.ctl_lo = 0;
+	ctl_lo.ctlx.int_en = 1;
+	ctl_lo.ctlx.dst_tr_width = mids->dst_width;
+	ctl_lo.ctlx.src_tr_width = mids->src_width;
+	ctl_lo.ctlx.dst_msize = mids->src_msize;
+	ctl_lo.ctlx.src_msize = mids->dst_msize;
+
+	if (mids->cfg_mode == LNW_DMA_MEM_TO_MEM) {
+		ctl_lo.ctlx.tt_fc = 0;
+		ctl_lo.ctlx.sinc = 0;
+		ctl_lo.ctlx.dinc = 0;
+	} else {
+		if (mids->dirn == DMA_TO_DEVICE) {
+			ctl_lo.ctlx.sinc = 0;
+			ctl_lo.ctlx.dinc = 2;
+			ctl_lo.ctlx.tt_fc = 1;
+		} else if (mids->dirn == DMA_FROM_DEVICE) {
+			ctl_lo.ctlx.sinc = 2;
+			ctl_lo.ctlx.dinc = 0;
+			ctl_lo.ctlx.tt_fc = 2;
+		}
+	}
+
+	pr_debug("MDMA:Calc CTL LO %x, CTL HI %x, CFG LO %x, CFG HI %x\n",
+		ctl_lo.ctl_lo, ctl_hi.ctl_hi, cfg_lo.cfg_lo, cfg_hi.cfg_hi);
+
+	enable_dma_interrupt(midc);
+
+	desc = midc_desc_get(midc);
+	if (desc == NULL)
+		goto err_desc_get;
+	desc->sar = src;
+	desc->dar = dest ;
+	desc->len = len;
+	desc->cfg_hi = cfg_hi.cfg_hi;
+	desc->cfg_lo = cfg_lo.cfg_lo;
+	desc->ctl_lo = ctl_lo.ctl_lo;
+	desc->ctl_hi = ctl_hi.ctl_hi;
+	desc->width = width;
+	desc->dirn = mids->dirn;
+	return &desc->txd;
+
+err_desc_get:
+	pr_err("ERR_MDMA: Failed to get desc\n");
+	midc_desc_put(midc, desc);
+	return NULL;
+}
+
+/**
+ * intel_mid_dma_free_chan_resources -	Frees dma resources
+ * @chan: chan requiring attention
+ *
+ * Frees the allocated resources on this DMA chan
+ */
+static void intel_mid_dma_free_chan_resources(struct dma_chan *chan)
+{
+	struct intel_mid_dma_chan	*midc = to_intel_mid_dma_chan(chan);
+	struct middma_device	*mid = to_middma_device(chan->device);
+	struct intel_mid_dma_desc	*desc, *_desc;
+
+	if (true == midc->in_use) {
+		/*trying to free ch in use!!!!!*/
+		pr_err("ERR_MDMA: trying to free ch in use\n");
+	}
+
+	spin_lock_bh(&midc->lock);
+	midc->descs_allocated = 0;
+	list_for_each_entry_safe(desc, _desc, &midc->active_list, desc_node) {
+		list_del(&desc->desc_node);
+		pci_pool_free(mid->dma_pool, desc, desc->txd.phys);
+	}
+	list_for_each_entry_safe(desc, _desc, &midc->free_list, desc_node) {
+		list_del(&desc->desc_node);
+		pci_pool_free(mid->dma_pool, desc, desc->txd.phys);
+	}
+	list_for_each_entry_safe(desc, _desc, &midc->queue, desc_node) {
+		list_del(&desc->desc_node);
+		pci_pool_free(mid->dma_pool, desc, desc->txd.phys);
+	}
+	spin_unlock_bh(&midc->lock);
+	midc->in_use = false;
+	/* Disable CH interrupts */
+	iowrite32(MASK_INTR_REG(midc->ch_id), mid->dma_base + MASK_BLOCK);
+	iowrite32(MASK_INTR_REG(midc->ch_id), mid->dma_base + MASK_ERR);
+}
+
+/**
+ * intel_mid_dma_alloc_chan_resources -	Allocate dma resources
+ * @chan: chan requiring attention
+ *
+ * Allocates DMA resources on this chan
+ * Return the descriptors allocated
+ */
+static int intel_mid_dma_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct intel_mid_dma_chan	*midc = to_intel_mid_dma_chan(chan);
+	struct middma_device	*mid = to_middma_device(chan->device);
+	struct intel_mid_dma_desc	*desc;
+	dma_addr_t		phys;
+	int	i = 0;
+
+
+	/* ASSERT:  channel is idle */
+	if (test_ch_en(mid->dma_base, midc->ch_id)) {
+		/*ch is not idle*/
+		pr_err("ERR_MDMA: ch not idle\n");
+		return -EIO;
+	}
+	midc->completed = chan->cookie = 1;
+
+	spin_lock_bh(&midc->lock);
+	while (midc->descs_allocated < DESCS_PER_CHANNEL) {
+		spin_unlock_bh(&midc->lock);
+		desc = pci_pool_alloc(mid->dma_pool, GFP_KERNEL, &phys);
+		if (!desc) {
+			pr_err("ERR_MDMA: desc failed\n");
+			return -ENOMEM;
+			/*check*/
+		}
+		dma_async_tx_descriptor_init(&desc->txd, chan);
+		desc->txd.tx_submit = intel_mid_dma_tx_submit;
+		desc->txd.flags = DMA_CTRL_ACK;
+		desc->txd.phys = phys;
+		spin_lock_bh(&midc->lock);
+		i = ++midc->descs_allocated;
+		list_add_tail(&desc->desc_node, &midc->free_list);
+	}
+	spin_unlock_bh(&midc->lock);
+	midc->in_use = false;
+	pr_debug("MID_DMA: Desc alloc done ret: %d desc\n", i);
+	return i;
+}
+
+/**
+ * midc_handle_error -	Handle DMA txn error
+ * @mid: controller where error occured
+ * @midc: chan where error occured
+ *
+ * Scan the descriptor for error
+ */
+static void midc_handle_error(struct middma_device *mid,
+		struct intel_mid_dma_chan *midc)
+{
+	midc_scan_descriptors(mid, midc);
+}
+
+/**
+ * dma_tasklet -	DMA interrupt tasklet
+ * @data: tasklet arg (the controller structure)
+ *
+ * Scan the controller for interrupts for completion/error
+ * Clear the interrupt and call for handling completion/error
+ */
+static void dma_tasklet(unsigned long data)
+{
+	struct middma_device *mid = NULL;
+	struct intel_mid_dma_chan *midc = NULL;
+	u32 status;
+	int i;
+
+	mid = (struct middma_device *)data;
+	if (mid == NULL) {
+		pr_err("ERR_MDMA: tasklet Null param\n");
+		return;
+	}
+	pr_debug("MDMA: in tasklet for device %x\n", mid->pci_id);
+	status = ioread32(mid->dma_base + RAW_TFR);
+	pr_debug("MDMA:RAW_TFR %x\n", status);
+	status &= mid->intr_mask;
+	while (status) {
+		/*txn interrupt*/
+		i = get_ch_index(&status, mid->chan_base);
+		if (i < 0) {
+			pr_err("ERR_MDMA:Invalid ch index %x\n", i);
+			return;
+		}
+		midc = &mid->ch[i];
+		if (midc == NULL) {
+			pr_err("ERR_MDMA:Null param midc\n");
+			return;
+		}
+		pr_debug("MDMA:Tx complete interrupt %x, Ch No %d Index %d\n",
+				status, midc->ch_id, i);
+		/*clearing this interrupts first*/
+		iowrite32((1 << midc->ch_id), mid->dma_base + CLEAR_TFR);
+		iowrite32((1 << midc->ch_id), mid->dma_base + CLEAR_BLOCK);
+
+		spin_lock_bh(&midc->lock);
+		midc_scan_descriptors(mid, midc);
+		pr_debug("MDMA:Scan of desc... complete, unmasking\n");
+		iowrite32(UNMASK_INTR_REG(midc->ch_id),
+				mid->dma_base + MASK_TFR);
+		spin_unlock_bh(&midc->lock);
+	}
+
+	status = ioread32(mid->dma_base + RAW_ERR);
+	status &= mid->intr_mask;
+	while (status) {
+		/*err interrupt*/
+		i = get_ch_index(&status, mid->chan_base);
+		if (i < 0) {
+			pr_err("ERR_MDMA:Invalid ch index %x\n", i);
+			return;
+		}
+		midc = &mid->ch[i];
+		if (midc == NULL) {
+			pr_err("ERR_MDMA:Null param midc\n");
+			return;
+		}
+		pr_debug("MDMA:Tx complete interrupt %x, Ch No %d Index %d\n",
+				status, midc->ch_id, i);
+
+		iowrite32((1 << midc->ch_id), mid->dma_base + CLEAR_ERR);
+		spin_lock_bh(&midc->lock);
+		midc_handle_error(mid, midc);
+		iowrite32(UNMASK_INTR_REG(midc->ch_id),
+				mid->dma_base + MASK_ERR);
+		spin_unlock_bh(&midc->lock);
+	}
+	pr_debug("MDMA:Exiting takslet...\n");
+	return;
+}
+
+static void dma_tasklet1(unsigned long data)
+{
+	pr_debug("MDMA:in takslet1...\n");
+	return dma_tasklet(data);
+}
+
+static void dma_tasklet2(unsigned long data)
+{
+	pr_debug("MDMA:in takslet2...\n");
+	return dma_tasklet(data);
+}
+
+/**
+ * intel_mid_dma_interrupt -	DMA ISR
+ * @irq: IRQ where interrupt occurred
+ * @data: ISR cllback data (the controller structure)
+ *
+ * See if this is our interrupt if so then schedule the tasklet
+ * otherwise ignore
+ */
+static irqreturn_t intel_mid_dma_interrupt(int irq, void *data)
+{
+	struct middma_device *mid = data;
+	u32 status;
+	int call_tasklet = 0;
+
+	/*DMA Interrupt*/
+	pr_debug("MDMA:Got an interrupt on irq %d\n", irq);
+	if (!mid) {
+		pr_err("ERR_MDMA:null pointer mid\n");
+		return -EINVAL;
+	}
+
+	status = ioread32(mid->dma_base + RAW_TFR);
+	pr_debug("MDMA: Status %x, Mask %x\n", status, mid->intr_mask);
+	status &= mid->intr_mask;
+	if (status) {
+		/*need to disable intr*/
+		iowrite32((status << 8), mid->dma_base + MASK_TFR);
+		pr_debug("MDMA: Calling tasklet %x\n", status);
+		call_tasklet = 1;
+	}
+	status = ioread32(mid->dma_base + RAW_ERR);
+	status &= mid->intr_mask;
+	if (status) {
+		iowrite32(MASK_INTR_REG(status), mid->dma_base + MASK_ERR);
+		call_tasklet = 1;
+	}
+	if (call_tasklet)
+		tasklet_schedule(&mid->tasklet);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t intel_mid_dma_interrupt1(int irq, void *data)
+{
+	return intel_mid_dma_interrupt(irq, data);
+}
+
+static irqreturn_t intel_mid_dma_interrupt2(int irq, void *data)
+{
+	return intel_mid_dma_interrupt(irq, data);
+}
+
+/**
+ * mid_setup_dma -	Setup the DMA controller
+ * @pdev: Controller PCI device structure
+ *
+ * Initilize the DMA controller, channels, registers with DMA engine,
+ * ISR. Initilize DMA controller channels.
+ */
+static int mid_setup_dma(struct pci_dev *pdev)
+{
+	struct middma_device *dma = pci_get_drvdata(pdev);
+	int err, i;
+	unsigned int irq_level;
+
+	/* DMA coherent memory pool for DMA descriptor allocations */
+	dma->dma_pool = pci_pool_create("intel_mid_dma_desc_pool", pdev,
+					sizeof(struct intel_mid_dma_desc),
+					32, 0);
+	if (NULL == dma->dma_pool) {
+		pr_err("ERR_MDMA:pci_pool_create failed\n");
+		err = -ENOMEM;
+		kfree(dma);
+		goto err_dma_pool;
+	}
+
+	INIT_LIST_HEAD(&dma->common.channels);
+	dma->pci_id = pdev->device;
+	if (dma->pimr_mask) {
+		dma->mask_reg = ioremap(LNW_PERIPHRAL_MASK_BASE,
+					LNW_PERIPHRAL_MASK_SIZE);
+		if (dma->mask_reg == NULL) {
+			pr_err("ERR_MDMA:Cant map periphral intr space !!\n");
+			return -ENOMEM;
+		}
+	} else
+		dma->mask_reg = NULL;
+
+	pr_debug("MDMA:Adding %d channel for this controller\n", dma->max_chan);
+	/*init CH structures*/
+	dma->intr_mask = 0;
+	for (i = 0; i < dma->max_chan; i++) {
+		struct intel_mid_dma_chan *midch = &dma->ch[i];
+
+		midch->chan.device = &dma->common;
+		midch->chan.cookie =  1;
+		midch->chan.chan_id = i;
+		midch->ch_id = dma->chan_base + i;
+		pr_debug("MDMA:Init CH %d, ID %d\n", i, midch->ch_id);
+
+		midch->dma_base = dma->dma_base;
+		midch->ch_regs = dma->dma_base + DMA_CH_SIZE * midch->ch_id;
+		midch->dma = dma;
+		dma->intr_mask |= 1 << (dma->chan_base + i);
+		spin_lock_init(&midch->lock);
+
+		INIT_LIST_HEAD(&midch->active_list);
+		INIT_LIST_HEAD(&midch->queue);
+		INIT_LIST_HEAD(&midch->free_list);
+		/*mask interrupts*/
+		iowrite32(MASK_INTR_REG(midch->ch_id),
+			dma->dma_base + MASK_BLOCK);
+		iowrite32(MASK_INTR_REG(midch->ch_id),
+			dma->dma_base + MASK_SRC_TRAN);
+		iowrite32(MASK_INTR_REG(midch->ch_id),
+			dma->dma_base + MASK_DST_TRAN);
+		iowrite32(MASK_INTR_REG(midch->ch_id),
+			dma->dma_base + MASK_ERR);
+		iowrite32(MASK_INTR_REG(midch->ch_id),
+			dma->dma_base + MASK_TFR);
+
+		disable_dma_interrupt(midch);
+		list_add_tail(&midch->chan.device_node, &dma->common.channels);
+	}
+	pr_debug("MDMA: Calc Mask as %x for this controller\n", dma->intr_mask);
+
+	/*init dma structure*/
+	dma_cap_zero(dma->common.cap_mask);
+	dma_cap_set(DMA_MEMCPY, dma->common.cap_mask);
+	dma_cap_set(DMA_SLAVE, dma->common.cap_mask);
+	dma_cap_set(DMA_PRIVATE, dma->common.cap_mask);
+	dma->common.dev = &pdev->dev;
+	dma->common.chancnt = dma->max_chan;
+
+	dma->common.device_alloc_chan_resources =
+					intel_mid_dma_alloc_chan_resources;
+	dma->common.device_free_chan_resources =
+					intel_mid_dma_free_chan_resources;
+
+	dma->common.device_tx_status = intel_mid_dma_tx_status;
+	dma->common.device_prep_dma_memcpy = intel_mid_dma_prep_memcpy;
+	dma->common.device_issue_pending = intel_mid_dma_issue_pending;
+	dma->common.device_prep_slave_sg = intel_mid_dma_prep_slave_sg;
+	dma->common.device_control = intel_mid_dma_device_control;
+
+	/*enable dma cntrl*/
+	iowrite32(REG_BIT0, dma->dma_base + DMA_CFG);
+
+	/*register irq */
+	if (dma->pimr_mask) {
+		irq_level = IRQF_SHARED;
+		pr_debug("MDMA:Requesting irq shared for DMAC1\n");
+		err = request_irq(pdev->irq, intel_mid_dma_interrupt1,
+			IRQF_SHARED, "INTEL_MID_DMAC1", dma);
+		if (0 != err)
+			goto err_irq;
+	} else {
+		dma->intr_mask = 0x03;
+		irq_level = 0;
+		pr_debug("MDMA:Requesting irq for DMAC2\n");
+		err = request_irq(pdev->irq, intel_mid_dma_interrupt2,
+			0, "INTEL_MID_DMAC2", dma);
+		if (0 != err)
+			goto err_irq;
+	}
+	/*register device w/ engine*/
+	err = dma_async_device_register(&dma->common);
+	if (0 != err) {
+		pr_err("ERR_MDMA:device_register failed: %d\n", err);
+		goto err_engine;
+	}
+	if (dma->pimr_mask) {
+		pr_debug("setting up tasklet1 for DMAC1\n");
+		tasklet_init(&dma->tasklet, dma_tasklet1, (unsigned long)dma);
+	} else {
+		pr_debug("setting up tasklet2 for DMAC2\n");
+		tasklet_init(&dma->tasklet, dma_tasklet2, (unsigned long)dma);
+	}
+	return 0;
+
+err_engine:
+	free_irq(pdev->irq, dma);
+err_irq:
+	pci_pool_destroy(dma->dma_pool);
+	kfree(dma);
+err_dma_pool:
+	pr_err("ERR_MDMA:setup_dma failed: %d\n", err);
+	return err;
+
+}
+
+/**
+ * middma_shutdown -	Shutdown the DMA controller
+ * @pdev: Controller PCI device structure
+ *
+ * Called by remove
+ * Unregister DMa controller, clear all structures and free interrupt
+ */
+static void middma_shutdown(struct pci_dev *pdev)
+{
+	struct middma_device *device = pci_get_drvdata(pdev);
+
+	dma_async_device_unregister(&device->common);
+	pci_pool_destroy(device->dma_pool);
+	if (device->mask_reg)
+		iounmap(device->mask_reg);
+	if (device->dma_base)
+		iounmap(device->dma_base);
+	free_irq(pdev->irq, device);
+	return;
+}
+
+/**
+ * intel_mid_dma_probe -	PCI Probe
+ * @pdev: Controller PCI device structure
+ * @id: pci device id structure
+ *
+ * Initilize the PCI device, map BARs, query driver data.
+ * Call setup_dma to complete contoller and chan initilzation
+ */
+static int __devinit intel_mid_dma_probe(struct pci_dev *pdev,
+					const struct pci_device_id *id)
+{
+	struct middma_device *device;
+	u32 base_addr, bar_size;
+	struct intel_mid_dma_probe_info *info;
+	int err;
+
+	pr_debug("MDMA: probe for %x\n", pdev->device);
+	info = (void *)id->driver_data;
+	pr_debug("MDMA: CH %d, base %d, block len %d, Periphral mask %x\n",
+				info->max_chan, info->ch_base,
+				info->block_size, info->pimr_mask);
+
+	err = pci_enable_device(pdev);
+	if (err)
+		goto err_enable_device;
+
+	err = pci_request_regions(pdev, "intel_mid_dmac");
+	if (err)
+		goto err_request_regions;
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+	if (err)
+		goto err_set_dma_mask;
+
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+	if (err)
+		goto err_set_dma_mask;
+
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (!device) {
+		pr_err("ERR_MDMA:kzalloc failed probe\n");
+		err = -ENOMEM;
+		goto err_kzalloc;
+	}
+	device->pdev = pci_dev_get(pdev);
+
+	base_addr = pci_resource_start(pdev, 0);
+	bar_size  = pci_resource_len(pdev, 0);
+	device->dma_base = ioremap_nocache(base_addr, DMA_REG_SIZE);
+	if (!device->dma_base) {
+		pr_err("ERR_MDMA:ioremap failed\n");
+		err = -ENOMEM;
+		goto err_ioremap;
+	}
+	pci_set_drvdata(pdev, device);
+	pci_set_master(pdev);
+	device->max_chan = info->max_chan;
+	device->chan_base = info->ch_base;
+	device->block_size = info->block_size;
+	device->pimr_mask = info->pimr_mask;
+
+	err = mid_setup_dma(pdev);
+	if (err)
+		goto err_dma;
+
+	return 0;
+
+err_dma:
+	iounmap(device->dma_base);
+err_ioremap:
+	pci_dev_put(pdev);
+	kfree(device);
+err_kzalloc:
+err_set_dma_mask:
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+err_request_regions:
+err_enable_device:
+	pr_err("ERR_MDMA:Probe failed %d\n", err);
+	return err;
+}
+
+/**
+ * intel_mid_dma_remove -	PCI remove
+ * @pdev: Controller PCI device structure
+ *
+ * Free up all resources and data
+ * Call shutdown_dma to complete contoller and chan cleanup
+ */
+static void __devexit intel_mid_dma_remove(struct pci_dev *pdev)
+{
+	struct middma_device *device = pci_get_drvdata(pdev);
+	middma_shutdown(pdev);
+	pci_dev_put(pdev);
+	kfree(device);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+}
+
+/******************************************************************************
+* PCI stuff
+*/
+static struct pci_device_id intel_mid_dma_ids[] = {
+	{ PCI_VDEVICE(INTEL, INTEL_MID_DMAC1_ID),	INFO(2, 6, 4095, 0x200020)},
+	{ PCI_VDEVICE(INTEL, INTEL_MID_DMAC2_ID),	INFO(2, 0, 2047, 0)},
+	{ PCI_VDEVICE(INTEL, INTEL_MID_GP_DMAC2_ID),	INFO(2, 0, 2047, 0)},
+	{ PCI_VDEVICE(INTEL, INTEL_MFLD_DMAC1_ID),	INFO(4, 0, 4095, 0x400040)},
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, intel_mid_dma_ids);
+
+static struct pci_driver intel_mid_dma_pci = {
+	.name		=	"Intel MID DMA",
+	.id_table	=	intel_mid_dma_ids,
+	.probe		=	intel_mid_dma_probe,
+	.remove		=	__devexit_p(intel_mid_dma_remove),
+};
+
+static int __init intel_mid_dma_init(void)
+{
+	pr_debug("INFO_MDMA: LNW DMA Driver Version %s\n",
+			INTEL_MID_DMA_DRIVER_VERSION);
+	return pci_register_driver(&intel_mid_dma_pci);
+}
+fs_initcall(intel_mid_dma_init);
+
+static void __exit intel_mid_dma_exit(void)
+{
+	pci_unregister_driver(&intel_mid_dma_pci);
+}
+module_exit(intel_mid_dma_exit);
+
+MODULE_AUTHOR("Vinod Koul <vinod.koul@intel.com>");
+MODULE_DESCRIPTION("Intel (R) MID DMAC Driver");
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(INTEL_MID_DMA_DRIVER_VERSION);
diff --git a/drivers/dma/intel_mid_dma_regs.h b/drivers/dma/intel_mid_dma_regs.h
new file mode 100644
index 000000000000..d81aa658ab09
--- /dev/null
+++ b/drivers/dma/intel_mid_dma_regs.h
@@ -0,0 +1,260 @@
+/*
+ *  intel_mid_dma_regs.h - Intel MID DMA Drivers
+ *
+ *  Copyright (C) 2008-10 Intel Corp
+ *  Author: Vinod Koul <vinod.koul@intel.com>
+ *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *
+ */
+#ifndef __INTEL_MID_DMAC_REGS_H__
+#define __INTEL_MID_DMAC_REGS_H__
+
+#include <linux/dmaengine.h>
+#include <linux/dmapool.h>
+#include <linux/pci_ids.h>
+
+#define INTEL_MID_DMA_DRIVER_VERSION "1.0.5"
+
+#define	REG_BIT0		0x00000001
+#define	REG_BIT8		0x00000100
+
+#define UNMASK_INTR_REG(chan_num) \
+	((REG_BIT0 << chan_num) | (REG_BIT8 << chan_num))
+#define MASK_INTR_REG(chan_num) (REG_BIT8 << chan_num)
+
+#define ENABLE_CHANNEL(chan_num) \
+	((REG_BIT0 << chan_num) | (REG_BIT8 << chan_num))
+
+#define DESCS_PER_CHANNEL	16
+/*DMA Registers*/
+/*registers associated with channel programming*/
+#define DMA_REG_SIZE		0x400
+#define DMA_CH_SIZE		0x58
+
+/*CH X REG = (DMA_CH_SIZE)*CH_NO + REG*/
+#define SAR			0x00 /* Source Address Register*/
+#define DAR			0x08 /* Destination Address Register*/
+#define CTL_LOW			0x18 /* Control Register*/
+#define CTL_HIGH		0x1C /* Control Register*/
+#define CFG_LOW			0x40 /* Configuration Register Low*/
+#define CFG_HIGH		0x44 /* Configuration Register high*/
+
+#define STATUS_TFR		0x2E8
+#define STATUS_BLOCK		0x2F0
+#define STATUS_ERR		0x308
+
+#define RAW_TFR			0x2C0
+#define RAW_BLOCK		0x2C8
+#define RAW_ERR			0x2E0
+
+#define MASK_TFR		0x310
+#define MASK_BLOCK		0x318
+#define MASK_SRC_TRAN		0x320
+#define MASK_DST_TRAN		0x328
+#define MASK_ERR		0x330
+
+#define CLEAR_TFR		0x338
+#define CLEAR_BLOCK		0x340
+#define CLEAR_SRC_TRAN		0x348
+#define CLEAR_DST_TRAN		0x350
+#define CLEAR_ERR		0x358
+
+#define INTR_STATUS		0x360
+#define DMA_CFG			0x398
+#define DMA_CHAN_EN		0x3A0
+
+/*DMA channel control registers*/
+union intel_mid_dma_ctl_lo {
+	struct {
+		u32	int_en:1;	/*enable or disable interrupts*/
+					/*should be 0*/
+		u32	dst_tr_width:3;	/*destination transfer width*/
+					/*usually 32 bits = 010*/
+		u32	src_tr_width:3; /*source transfer width*/
+					/*usually 32 bits = 010*/
+		u32	dinc:2;		/*destination address inc/dec*/
+					/*For mem:INC=00, Periphral NoINC=11*/
+		u32	sinc:2;		/*source address inc or dec, as above*/
+		u32	dst_msize:3;	/*destination burst transaction length*/
+					/*always = 16 ie 011*/
+		u32	src_msize:3;	/*source burst transaction length*/
+					/*always = 16 ie 011*/
+		u32	reser1:3;
+		u32	tt_fc:3;	/*transfer type and flow controller*/
+					/*M-M = 000
+					  P-M = 010
+					  M-P = 001*/
+		u32	dms:2;		/*destination master select = 0*/
+		u32	sms:2;		/*source master select = 0*/
+		u32	llp_dst_en:1;	/*enable/disable destination LLP = 0*/
+		u32	llp_src_en:1;	/*enable/disable source LLP = 0*/
+		u32	reser2:3;
+	} ctlx;
+	u32	ctl_lo;
+};
+
+union intel_mid_dma_ctl_hi {
+	struct {
+		u32	block_ts:12;	/*block transfer size*/
+					/*configured by DMAC*/
+		u32	reser:20;
+	} ctlx;
+	u32	ctl_hi;
+
+};
+
+/*DMA channel configuration registers*/
+union intel_mid_dma_cfg_lo {
+	struct {
+		u32	reser1:5;
+		u32	ch_prior:3;	/*channel priority = 0*/
+		u32	ch_susp:1;	/*channel suspend = 0*/
+		u32	fifo_empty:1;	/*FIFO empty or not R bit = 0*/
+		u32	hs_sel_dst:1;	/*select HW/SW destn handshaking*/
+					/*HW = 0, SW = 1*/
+		u32	hs_sel_src:1;	/*select HW/SW src handshaking*/
+		u32	reser2:6;
+		u32	dst_hs_pol:1;	/*dest HS interface polarity*/
+		u32	src_hs_pol:1;	/*src HS interface polarity*/
+		u32	max_abrst:10;	/*max AMBA burst len = 0 (no sw limit*/
+		u32	reload_src:1;	/*auto reload src addr =1 if src is P*/
+		u32	reload_dst:1;	/*AR destn addr =1 if dstn is P*/
+	} cfgx;
+	u32	cfg_lo;
+};
+
+union intel_mid_dma_cfg_hi {
+	struct {
+		u32	fcmode:1;	/*flow control mode = 1*/
+		u32	fifo_mode:1;	/*FIFO mode select = 1*/
+		u32	protctl:3;	/*protection control = 0*/
+		u32	rsvd:2;
+		u32	src_per:4;	/*src hw HS interface*/
+		u32	dst_per:4;	/*dstn hw HS interface*/
+		u32	reser2:17;
+	} cfgx;
+	u32	cfg_hi;
+};
+
+/**
+ * struct intel_mid_dma_chan - internal mid representation of a DMA channel
+ * @chan: dma_chan strcture represetation for mid chan
+ * @ch_regs: MMIO register space pointer to channel register
+ * @dma_base: MMIO register space DMA engine base pointer
+ * @ch_id: DMA channel id
+ * @lock: channel spinlock
+ * @completed: DMA cookie
+ * @active_list: current active descriptors
+ * @queue: current queued up descriptors
+ * @free_list: current free descriptors
+ * @slave: dma slave struture
+ * @descs_allocated: total number of decsiptors allocated
+ * @dma: dma device struture pointer
+ * @in_use: bool representing if ch is in use or not
+ */
+struct intel_mid_dma_chan {
+	struct dma_chan		chan;
+	void __iomem		*ch_regs;
+	void __iomem		*dma_base;
+	int			ch_id;
+	spinlock_t		lock;
+	dma_cookie_t		completed;
+	struct list_head	active_list;
+	struct list_head	queue;
+	struct list_head	free_list;
+	struct intel_mid_dma_slave	*slave;
+	unsigned int		descs_allocated;
+	struct middma_device	*dma;
+	bool			in_use;
+};
+
+static inline struct intel_mid_dma_chan *to_intel_mid_dma_chan(
+						struct dma_chan *chan)
+{
+	return container_of(chan, struct intel_mid_dma_chan, chan);
+}
+
+/**
+ * struct middma_device - internal representation of a DMA device
+ * @pdev: PCI device
+ * @dma_base: MMIO register space pointer of DMA
+ * @dma_pool: for allocating DMA descriptors
+ * @common: embedded struct dma_device
+ * @tasklet: dma tasklet for processing interrupts
+ * @ch: per channel data
+ * @pci_id: DMA device PCI ID
+ * @intr_mask: Interrupt mask to be used
+ * @mask_reg: MMIO register for periphral mask
+ * @chan_base: Base ch index (read from driver data)
+ * @max_chan: max number of chs supported (from drv_data)
+ * @block_size: Block size of DMA transfer supported (from drv_data)
+ * @pimr_mask: MMIO register addr for periphral interrupt (from drv_data)
+ */
+struct middma_device {
+	struct pci_dev		*pdev;
+	void __iomem		*dma_base;
+	struct pci_pool		*dma_pool;
+	struct dma_device	common;
+	struct tasklet_struct   tasklet;
+	struct intel_mid_dma_chan ch[MAX_CHAN];
+	unsigned int		pci_id;
+	unsigned int		intr_mask;
+	void __iomem		*mask_reg;
+	int			chan_base;
+	int			max_chan;
+	int			block_size;
+	unsigned int		pimr_mask;
+};
+
+static inline struct middma_device *to_middma_device(struct dma_device *common)
+{
+	return container_of(common, struct middma_device, common);
+}
+
+struct intel_mid_dma_desc {
+	void __iomem			*block; /*ch ptr*/
+	struct list_head		desc_node;
+	struct dma_async_tx_descriptor	txd;
+	size_t				len;
+	dma_addr_t			sar;
+	dma_addr_t			dar;
+	u32				cfg_hi;
+	u32				cfg_lo;
+	u32				ctl_lo;
+	u32				ctl_hi;
+	dma_addr_t			next;
+	enum dma_data_direction		dirn;
+	enum dma_status			status;
+	enum intel_mid_dma_width	width; /*width of DMA txn*/
+	enum intel_mid_dma_mode		cfg_mode; /*mode configuration*/
+
+};
+
+static inline int test_ch_en(void __iomem *dma, u32 ch_no)
+{
+	u32 en_reg = ioread32(dma + DMA_CHAN_EN);
+	return (en_reg >> ch_no) & 0x1;
+}
+
+static inline struct intel_mid_dma_desc *to_intel_mid_dma_desc
+		(struct dma_async_tx_descriptor *txd)
+{
+	return container_of(txd, struct intel_mid_dma_desc, txd);
+}
+#endif /*__INTEL_MID_DMAC_REGS_H__*/
diff --git a/include/linux/intel_mid_dma.h b/include/linux/intel_mid_dma.h
new file mode 100644
index 000000000000..d9d08b6269b6
--- /dev/null
+++ b/include/linux/intel_mid_dma.h
@@ -0,0 +1,86 @@
+/*
+ *  intel_mid_dma.h - Intel MID DMA Drivers
+ *
+ *  Copyright (C) 2008-10 Intel Corp
+ *  Author: Vinod Koul <vinod.koul@intel.com>
+ *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *
+ */
+#ifndef __INTEL_MID_DMA_H__
+#define __INTEL_MID_DMA_H__
+
+#include <linux/dmaengine.h>
+
+/*DMA transaction width, src and dstn width would be same
+The DMA length must be width aligned,
+for 32 bit width the length must be 32 bit (4bytes) aligned only*/
+enum intel_mid_dma_width {
+	LNW_DMA_WIDTH_8BIT = 0x0,
+	LNW_DMA_WIDTH_16BIT = 0x1,
+	LNW_DMA_WIDTH_32BIT = 0x2,
+};
+
+/*DMA mode configurations*/
+enum intel_mid_dma_mode {
+	LNW_DMA_PER_TO_MEM = 0, /*periphral to memory configuration*/
+	LNW_DMA_MEM_TO_PER,	/*memory to periphral configuration*/
+	LNW_DMA_MEM_TO_MEM,	/*mem to mem confg (testing only)*/
+};
+
+/*DMA handshaking*/
+enum intel_mid_dma_hs_mode {
+	LNW_DMA_HW_HS = 0,	/*HW Handshaking only*/
+	LNW_DMA_SW_HS = 1,	/*SW Handshaking not recommended*/
+};
+
+/*Burst size configuration*/
+enum intel_mid_dma_msize {
+	LNW_DMA_MSIZE_1 = 0x0,
+	LNW_DMA_MSIZE_4 = 0x1,
+	LNW_DMA_MSIZE_8 = 0x2,
+	LNW_DMA_MSIZE_16 = 0x3,
+	LNW_DMA_MSIZE_32 = 0x4,
+	LNW_DMA_MSIZE_64 = 0x5,
+};
+
+/**
+ * struct intel_mid_dma_slave - DMA slave structure
+ *
+ * @dirn: DMA trf direction
+ * @src_width: tx register width
+ * @dst_width: rx register width
+ * @hs_mode: HW/SW handshaking mode
+ * @cfg_mode: DMA data transfer mode (per-per/mem-per/mem-mem)
+ * @src_msize: Source DMA burst size
+ * @dst_msize: Dst DMA burst size
+ * @device_instance: DMA peripheral device instance, we can have multiple
+ *		peripheral device connected to single DMAC
+ */
+struct intel_mid_dma_slave {
+	enum dma_data_direction		dirn;
+	enum intel_mid_dma_width	src_width; /*width of DMA src txn*/
+	enum intel_mid_dma_width	dst_width; /*width of DMA dst txn*/
+	enum intel_mid_dma_hs_mode	hs_mode;  /*handshaking*/
+	enum intel_mid_dma_mode		cfg_mode; /*mode configuration*/
+	enum intel_mid_dma_msize	src_msize; /*size if src burst*/
+	enum intel_mid_dma_msize	dst_msize; /*size of dst burst*/
+	unsigned int		device_instance; /*0, 1 for periphral instance*/
+};
+
+#endif /*__INTEL_MID_DMA_H__*/
-- 
cgit v1.2.3-59-g8ed1b


From d7926ee38f5c6e0bbebe712304f99a4c67e40f84 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <samudrala.sridhar@gmail.com>
Date: Sun, 30 May 2010 22:24:39 +0200
Subject: cgroups: Add an API to attach a task to current task's cgroup

Add a new kernel API to attach a task to current task's cgroup
in all the active hierarchies.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Paul Menage <menage@google.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  7 +++++++
 kernel/cgroup.c        | 23 +++++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0c621604baa1..e0aa067d1b11 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -570,6 +570,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
+int cgroup_attach_task_current_cg(struct task_struct *);
 
 /*
  * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
@@ -626,6 +627,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 	return -EINVAL;
 }
 
+/* No cgroups - nothing to do */
+static inline int cgroup_attach_task_current_cg(struct task_struct *t)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_CGROUPS */
 
 #endif /* _LINUX_CGROUP_H */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 422cb19f156e..37642ad9cca8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1788,6 +1788,29 @@ out:
 	return retval;
 }
 
+/**
+ * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_current_cg(struct task_struct *tsk)
+{
+	struct cgroupfs_root *root;
+	struct cgroup *cur_cg;
+	int retval = 0;
+
+	cgroup_lock();
+	for_each_active_root(root) {
+		cur_cg = task_cgroup_from_root(current, root);
+		retval = cgroup_attach_task(cur_cg, tsk);
+		if (retval)
+			break;
+	}
+	cgroup_unlock();
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
+
 /*
  * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
  * held. May take task_lock of task
-- 
cgit v1.2.3-59-g8ed1b


From e9fd702a58c49dbb14481dca88dad44758da393a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 20:12:04 -0500
Subject: audit: convert audit watches to use fsnotify instead of inotify

Audit currently uses inotify to pin inodes in core and to detect when
watched inodes are deleted or unmounted.  This patch uses fsnotify instead
of inotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/fsnotify_backend.h |   5 +-
 kernel/audit_watch.c             | 208 ++++++++++++++++++++++++++++-----------
 2 files changed, 152 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 4d6f47b51189..8f8341e9f021 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -58,9 +58,12 @@
 				   FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
 				   FS_DELETE)
 
+#define FS_MOVE			(FS_MOVED_FROM | FS_MOVED_TO)
+
 /* listeners that hard code group numbers near the top */
 #define DNOTIFY_GROUP_NUM	UINT_MAX
-#define INOTIFY_GROUP_NUM	(DNOTIFY_GROUP_NUM-1)
+#define AUDIT_WATCH_GROUP_NUM  (DNOTIFY_GROUP_NUM-1)
+#define INOTIFY_GROUP_NUM      (AUDIT_WATCH_GROUP_NUM-1)
 
 struct fsnotify_group;
 struct fsnotify_event;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index c2ca7168bfd1..ff5be849473d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
 #include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <linux/inotify.h>
 #include <linux/security.h>
 #include "audit.h"
 
 /*
  * Reference counting:
  *
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
  * 	event.  Each audit_watch holds a reference to its associated parent.
  *
  * audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -57,26 +57,27 @@ struct audit_watch {
 struct audit_parent {
 	struct list_head	ilist;	/* tmp list used to free parents */
 	struct list_head	watches; /* anchor for audit_watch->wlist */
-	struct inotify_watch	wdata;	/* inotify watch data */
+	struct fsnotify_mark_entry mark; /* fsnotify mark on the inode */
 	unsigned		flags;	/* status flags */
 };
 
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
+/* fsnotify handle. */
+struct fsnotify_group *audit_watch_group;
 
 /*
  * audit_parent status flags:
  *
  * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
  * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * Technically not needed for FS_DELETE_SELF or FS_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for FS_MOVE_SELF which
  * we can receive while holding nameidata.
  */
 #define AUDIT_PARENT_INVALID	0x001
 
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+/* fsnotify events we care about. */
+#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+			FS_MOVE_SELF | FS_EVENT_ON_CHILD)
 
 static void audit_free_parent(struct audit_parent *parent)
 {
@@ -84,14 +85,45 @@ static void audit_free_parent(struct audit_parent *parent)
 	kfree(parent);
 }
 
-static void audit_destroy_watch(struct inotify_watch *i_watch)
+static void audit_watch_free_mark(struct fsnotify_mark_entry *entry)
 {
 	struct audit_parent *parent;
 
-	parent = container_of(i_watch, struct audit_parent, wdata);
+	parent = container_of(entry, struct audit_parent, mark);
 	audit_free_parent(parent);
 }
 
+static void audit_get_parent(struct audit_parent *parent)
+{
+	if (likely(parent))
+		fsnotify_get_mark(&parent->mark);
+}
+
+static void audit_put_parent(struct audit_parent *parent)
+{
+	if (likely(parent))
+		fsnotify_put_mark(&parent->mark);
+}
+
+/*
+ * Find and return the audit_parent on the given inode.  If found a reference
+ * is taken on this parent.
+ */
+static inline struct audit_parent *audit_find_parent(struct inode *inode)
+{
+	struct audit_parent *parent = NULL;
+	struct fsnotify_mark_entry *entry;
+
+	spin_lock(&inode->i_lock);
+	entry = fsnotify_find_mark_entry(audit_watch_group, inode);
+	spin_unlock(&inode->i_lock);
+
+	if (entry)
+		parent = container_of(entry, struct audit_parent, mark);
+
+	return parent;
+}
+
 void audit_get_watch(struct audit_watch *watch)
 {
 	atomic_inc(&watch->count);
@@ -110,7 +142,7 @@ void audit_put_watch(struct audit_watch *watch)
 void audit_remove_watch(struct audit_watch *watch)
 {
 	list_del(&watch->wlist);
-	put_inotify_watch(&watch->parent->wdata);
+	audit_put_parent(watch->parent);
 	watch->parent = NULL;
 	audit_put_watch(watch); /* match initial get */
 }
@@ -130,8 +162,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
 /* Initialize a parent watch entry. */
 static struct audit_parent *audit_init_parent(struct nameidata *ndp)
 {
+	struct inode *inode = ndp->path.dentry->d_inode;
 	struct audit_parent *parent;
-	s32 wd;
+	int ret;
 
 	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
 	if (unlikely(!parent))
@@ -140,14 +173,14 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
 	INIT_LIST_HEAD(&parent->watches);
 	parent->flags = 0;
 
-	inotify_init_watch(&parent->wdata);
-	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
-	get_inotify_watch(&parent->wdata);
-	wd = inotify_add_watch(audit_ih, &parent->wdata,
-			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
-	if (wd < 0) {
+	fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+	parent->mark.mask = AUDIT_FS_WATCH;
+	/* grab a ref so fsnotify mark hangs around until we take audit_filter_mutex */
+	audit_get_parent(parent);
+	ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode);
+	if (ret < 0) {
 		audit_free_parent(parent);
-		return ERR_PTR(wd);
+		return ERR_PTR(ret);
 	}
 
 	return parent;
@@ -176,7 +209,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
 {
 	struct audit_watch *watch;
 
-	if (!audit_ih)
+	if (!audit_watch_group)
 		return -EOPNOTSUPP;
 
 	if (path[0] != '/' || path[len-1] == '/' ||
@@ -214,7 +247,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
 
 	new->dev = old->dev;
 	new->ino = old->ino;
-	get_inotify_watch(&old->parent->wdata);
+	audit_get_parent(old->parent);
 	new->parent = old->parent;
 
 out:
@@ -335,19 +368,21 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 		audit_remove_watch(w);
 	}
 	mutex_unlock(&audit_filter_mutex);
+
+	fsnotify_destroy_mark_by_entry(&parent->mark);
 }
 
 /* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
+ * Generates an FS_IGNORED event. */
 void audit_watch_inotify_unregister(struct list_head *in_list)
 {
 	struct audit_parent *p, *n;
 
 	list_for_each_entry_safe(p, n, in_list, ilist) {
 		list_del(&p->ilist);
-		inotify_rm_watch(audit_ih, &p->wdata);
-		/* the unpin matching the pin in audit_remove_watch_rule() */
-		unpin_inotify_watch(&p->wdata);
+		fsnotify_destroy_mark_by_entry(&p->mark);
+		/* matches the get in audit_remove_watch_rule() */
+		audit_put_parent(p);
 	}
 }
 
@@ -399,7 +434,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
 	}
 }
 
-/* Associate the given rule with an existing parent inotify_watch.
+/* Associate the given rule with an existing parent.
  * Caller must hold audit_filter_mutex. */
 static void audit_add_to_parent(struct audit_krule *krule,
 				struct audit_parent *parent)
@@ -407,6 +442,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
 	struct audit_watch *w, *watch = krule->watch;
 	int watch_found = 0;
 
+	BUG_ON(!mutex_is_locked(&audit_filter_mutex));
+
 	list_for_each_entry(w, &parent->watches, wlist) {
 		if (strcmp(watch->path, w->path))
 			continue;
@@ -423,7 +460,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
 	}
 
 	if (!watch_found) {
-		get_inotify_watch(&parent->wdata);
+		audit_get_parent(parent);
 		watch->parent = parent;
 
 		list_add(&watch->wlist, &parent->watches);
@@ -436,7 +473,6 @@ static void audit_add_to_parent(struct audit_krule *krule,
 int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 {
 	struct audit_watch *watch = krule->watch;
-	struct inotify_watch *i_watch;
 	struct audit_parent *parent;
 	struct nameidata *ndp = NULL, *ndw = NULL;
 	int h, ret = 0;
@@ -462,8 +498,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 	 * inotify watch is found, inotify_find_watch() grabs a reference before
 	 * returning.
 	 */
-	if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
-			       &i_watch) < 0) {
+	parent = audit_find_parent(ndp->path.dentry->d_inode);
+	if (!parent) {
 		parent = audit_init_parent(ndp);
 		if (IS_ERR(parent)) {
 			/* caller expects mutex locked */
@@ -471,8 +507,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 			ret = PTR_ERR(parent);
 			goto error;
 		}
-	} else
-		parent = container_of(i_watch, struct audit_parent, wdata);
+	}
 
 	mutex_lock(&audit_filter_mutex);
 
@@ -482,8 +517,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 	else
 		audit_add_to_parent(krule, parent);
 
-	/* match get in audit_init_parent or inotify_find_watch */
-	put_inotify_watch(&parent->wdata);
+	/* match get in audit_find_parent or audit_init_parent */
+	audit_put_parent(parent);
 
 	h = audit_hash_ino((u32)watch->ino);
 	*list = &audit_inode_hash[h];
@@ -504,52 +539,105 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
 		audit_remove_watch(watch);
 
 		if (list_empty(&parent->watches)) {
-			/* Put parent on the inotify un-registration
-			 * list.  Grab a reference before releasing
+			/* Put parent on the un-registration list.
+			 * Grab a reference before releasing
 			 * audit_filter_mutex, to be released in
-			 * audit_inotify_unregister().
+			 * audit_watch_inotify_unregister().
 			 * If filesystem is going away, just leave
 			 * the sucker alone, eviction will take
 			 * care of it. */
-			if (pin_inotify_watch(&parent->wdata))
-				list_add(&parent->ilist, list);
+			audit_get_parent(parent);
+			list_add(&parent->ilist, list);
 		}
 	}
 }
 
-/* Update watch data in audit rules based on inotify events. */
-static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
-			 u32 cookie, const char *dname, struct inode *inode)
+static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
 {
+	struct fsnotify_mark_entry *entry;
+	bool send;
+
+	spin_lock(&inode->i_lock);
+	entry = fsnotify_find_mark_entry(group, inode);
+	spin_unlock(&inode->i_lock);
+	if (!entry)
+		return false;
+
+	mask = (mask & ~FS_EVENT_ON_CHILD);
+	send = (entry->mask & mask);
+
+	/* find took a reference */
+	fsnotify_put_mark(entry);
+
+	return send;
+}
+
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	struct inode *inode;
+	__u32 mask = event->mask;
+	const char *dname = event->file_name;
 	struct audit_parent *parent;
 
-	parent = container_of(i_watch, struct audit_parent, wdata);
+	BUG_ON(group != audit_watch_group);
+
+	parent = audit_find_parent(event->to_tell);
+	if (unlikely(!parent))
+		return 0;
+
+	switch (event->data_type) {
+	case (FSNOTIFY_EVENT_PATH):
+		inode = event->path.dentry->d_inode;
+		break;
+	case (FSNOTIFY_EVENT_INODE):
+		inode = event->inode;
+		break;
+	default:
+		BUG();
+		inode = NULL;
+		break;
+	};
 
-	if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
+	if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
 		audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
-	else if (mask & (IN_DELETE|IN_MOVED_FROM))
+	else if (mask & (FS_DELETE|FS_MOVED_FROM))
 		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
-	/* inotify automatically removes the watch and sends IN_IGNORED */
-	else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
+	else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
 		audit_remove_parent_watches(parent);
-	/* inotify does not remove the watch, so remove it manually */
-	else if(mask & IN_MOVE_SELF) {
-		audit_remove_parent_watches(parent);
-		inotify_remove_watch_locked(audit_ih, i_watch);
-	} else if (mask & IN_IGNORED)
-		put_inotify_watch(i_watch);
+	/* moved put_inotify_watch to freeing mark */
+
+	/* matched the ref taken by audit_find_parent */
+	audit_put_parent(parent);
+
+	return 0;
+}
+
+static void audit_watch_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(entry, struct audit_parent, mark);
+	/* taken from audit_handle_ievent & FS_IGNORED please figure out what I match... */
+	audit_put_parent(parent);
 }
 
-static const struct inotify_operations audit_inotify_ops = {
-	.handle_event   = audit_handle_ievent,
-	.destroy_watch  = audit_destroy_watch,
+static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+	.should_send_event = 	audit_watch_should_send_event,
+	.handle_event = 	audit_watch_handle_event,
+	.free_group_priv = 	NULL,
+	.freeing_mark = 	audit_watch_freeing_mark,
+	.free_event_priv = 	NULL,
 };
 
 static int __init audit_watch_init(void)
 {
-	audit_ih = inotify_init(&audit_inotify_ops);
-	if (IS_ERR(audit_ih))
-		audit_panic("cannot initialize inotify handle");
+	audit_watch_group = fsnotify_obtain_group(AUDIT_WATCH_GROUP_NUM, AUDIT_FS_WATCH,
+						  &audit_watch_fsnotify_ops);
+	if (IS_ERR(audit_watch_group)) {
+		audit_watch_group = NULL;
+		audit_panic("cannot create audit fsnotify group");
+	}
 	return 0;
 }
 subsys_initcall(audit_watch_init);
-- 
cgit v1.2.3-59-g8ed1b


From 9e1c74321d87a8b079f04d89e750b39a43365e1f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 20:12:05 -0500
Subject: fsnotify: duplicate fsnotify_mark_entry data between 2 marks

Simple copy fsnotify information from one mark to another in preparation
for the second mark to replace the first.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inode_mark.c           | 10 +++++++++-
 include/linux/fsnotify_backend.h |  2 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0399bcbe09c8..a13cf9e9233a 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -282,12 +282,20 @@ struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *grou
 	return NULL;
 }
 
+void fsnotify_duplicate_mark(struct fsnotify_mark_entry *new, struct fsnotify_mark_entry *old)
+{
+	assert_spin_locked(&old->lock);
+	new->inode = old->inode;
+	new->group = old->group;
+	new->mask = old->mask;
+	new->free_mark = old->free_mark;
+}
+
 /*
  * Nothing fancy, just initialize lists and locks and counters.
  */
 void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
 			void (*free_mark)(struct fsnotify_mark_entry *entry))
-
 {
 	spin_lock_init(&entry->lock);
 	atomic_set(&entry->refcnt, 1);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 8f8341e9f021..390516732956 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -342,6 +342,8 @@ extern void fsnotify_recalc_inode_mask(struct inode *inode);
 extern void fsnotify_init_mark(struct fsnotify_mark_entry *entry, void (*free_mark)(struct fsnotify_mark_entry *entry));
 /* find (and take a reference) to a mark associated with group and inode */
 extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode);
+/* copy the values from old into new */
+extern void fsnotify_duplicate_mark(struct fsnotify_mark_entry *new, struct fsnotify_mark_entry *old);
 /* attach the mark to both the group and the inode */
 extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode);
 /* given a mark, flag it to be freed when all references are dropped */
-- 
cgit v1.2.3-59-g8ed1b


From 40554c3dae83bd892b7fbfaa2ea9de739cbcf065 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 20:12:05 -0500
Subject: fsnotify: allow addition of duplicate fsnotify marks

This patch allows a task to add a second fsnotify mark to an inode for the
same group.  This mark will be added to the end of the inode's list and
this will never be found by the stand fsnotify_find_mark() function.   This
is useful if a user wants to add a new mark before removing the old one.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c      | 2 +-
 fs/notify/inode_mark.c           | 8 +++++---
 fs/notify/inotify/inotify_user.c | 2 +-
 include/linux/fsnotify_backend.h | 2 +-
 kernel/audit_watch.c             | 2 +-
 5 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7e54e52964dd..85b97fca14de 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -362,7 +362,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 		dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
 		spin_lock(&entry->lock);
 	} else {
-		fsnotify_add_mark(new_entry, dnotify_group, inode);
+		fsnotify_add_mark(new_entry, dnotify_group, inode, 0);
 		spin_lock(&new_entry->lock);
 		entry = new_entry;
 		dnentry = new_dnentry;
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index a13cf9e9233a..7d2962e5328e 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -312,9 +312,10 @@ void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
  * event types should be delivered to which group and for which inodes.
  */
 int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
-		      struct fsnotify_group *group, struct inode *inode)
+		      struct fsnotify_group *group, struct inode *inode,
+		      int allow_dups)
 {
-	struct fsnotify_mark_entry *lentry;
+	struct fsnotify_mark_entry *lentry = NULL;
 	int ret = 0;
 
 	inode = igrab(inode);
@@ -331,7 +332,8 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 	spin_lock(&group->mark_lock);
 	spin_lock(&inode->i_lock);
 
-	lentry = fsnotify_find_mark_entry(group, inode);
+	if (!allow_dups)
+		lentry = fsnotify_find_mark_entry(group, inode);
 	if (!lentry) {
 		entry->group = group;
 		entry->inode = inode;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 653c507b1bb3..f22a04005db2 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -651,7 +651,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode, 0);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
 		inotify_remove_from_idr(group, tmp_ientry);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 390516732956..1679f250d59e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -345,7 +345,7 @@ extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_grou
 /* copy the values from old into new */
 extern void fsnotify_duplicate_mark(struct fsnotify_mark_entry *new, struct fsnotify_mark_entry *old);
 /* attach the mark to both the group and the inode */
-extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode);
+extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode, int allow_dups);
 /* given a mark, flag it to be freed when all references are dropped */
 extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry);
 /* run all the marks in a group, and flag them to be freed */
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 75ab53987ece..c44de0c4fc47 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -161,7 +161,7 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
 
 	fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
 	parent->mark.mask = AUDIT_FS_WATCH;
-	ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode);
+	ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, 0);
 	if (ret < 0) {
 		audit_free_parent(parent);
 		return ERR_PTR(ret);
-- 
cgit v1.2.3-59-g8ed1b


From 28a3a7eb3b1f3e7d834e19f06e794e429058a4dd Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 20:12:05 -0500
Subject: audit: reimplement audit_trees using fsnotify rather than inotify

Simply switch audit_trees from using inotify to using fsnotify for it's
inode pinning and disappearing act information.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/fsnotify_backend.h |   5 +-
 init/Kconfig                     |   2 +-
 kernel/audit_tree.c              | 234 ++++++++++++++++++++++-----------------
 kernel/auditsc.c                 |   4 +-
 4 files changed, 136 insertions(+), 109 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1679f250d59e..e25284371020 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -62,8 +62,9 @@
 
 /* listeners that hard code group numbers near the top */
 #define DNOTIFY_GROUP_NUM	UINT_MAX
-#define AUDIT_WATCH_GROUP_NUM  (DNOTIFY_GROUP_NUM-1)
-#define INOTIFY_GROUP_NUM      (AUDIT_WATCH_GROUP_NUM-1)
+#define AUDIT_WATCH_GROUP_NUM	(DNOTIFY_GROUP_NUM-1)
+#define AUDIT_TREE_GROUP_NUM	(AUDIT_WATCH_GROUP_NUM-1)
+#define INOTIFY_GROUP_NUM	(AUDIT_TREE_GROUP_NUM-1)
 
 struct fsnotify_group;
 struct fsnotify_event;
diff --git a/init/Kconfig b/init/Kconfig
index 5cff9a980c39..84e33c49a0cb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -326,7 +326,7 @@ config AUDITSYSCALL
 config AUDIT_TREE
 	def_bool y
 	depends on AUDITSYSCALL
-	select INOTIFY
+	select FSNOTIFY
 
 menu "RCU Subsystem"
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..a164600dd82e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
 #include "audit.h"
-#include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
 
 struct audit_chunk {
 	struct list_head hash;
-	struct inotify_watch watch;
+	struct fsnotify_mark_entry mark;
 	struct list_head trees;		/* with root here */
 	int dead;
 	int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
  * tree is refcounted; one reference for "some rules on rules_list refer to
  * it", one for each chunk with pointer to it.
  *
- * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
  * of watch contributes 1 to .refs).
  *
  * node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
  * that makes a difference.  Some.
  */
 
-static struct inotify_handle *rtree_ih;
+static struct fsnotify_group *audit_tree_group;
 
 static struct audit_tree *alloc_tree(const char *s)
 {
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
 	return tree->pathname;
 }
 
-static struct audit_chunk *alloc_chunk(int count)
-{
-	struct audit_chunk *chunk;
-	size_t size;
-	int i;
-
-	size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
-	chunk = kzalloc(size, GFP_KERNEL);
-	if (!chunk)
-		return NULL;
-
-	INIT_LIST_HEAD(&chunk->hash);
-	INIT_LIST_HEAD(&chunk->trees);
-	chunk->count = count;
-	atomic_long_set(&chunk->refs, 1);
-	for (i = 0; i < count; i++) {
-		INIT_LIST_HEAD(&chunk->owners[i].list);
-		chunk->owners[i].index = i;
-	}
-	inotify_init_watch(&chunk->watch);
-	return chunk;
-}
-
 static void free_chunk(struct audit_chunk *chunk)
 {
 	int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
 	audit_put_chunk(chunk);
 }
 
+static void audit_tree_destroy_watch(struct fsnotify_mark_entry *entry)
+{
+	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+	call_rcu(&chunk->head, __put_chunk);
+}
+
+static struct audit_chunk *alloc_chunk(int count)
+{
+	struct audit_chunk *chunk;
+	size_t size;
+	int i;
+
+	size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
+	chunk = kzalloc(size, GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+
+	INIT_LIST_HEAD(&chunk->hash);
+	INIT_LIST_HEAD(&chunk->trees);
+	chunk->count = count;
+	atomic_long_set(&chunk->refs, 1);
+	for (i = 0; i < count; i++) {
+		INIT_LIST_HEAD(&chunk->owners[i].list);
+		chunk->owners[i].index = i;
+	}
+	fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+	return chunk;
+}
+
 enum {HASH_SIZE = 128};
 static struct list_head chunk_hash_heads[HASH_SIZE];
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
 	return chunk_hash_heads + n % HASH_SIZE;
 }
 
-/* hash_lock is held by caller */
+/* hash_lock & entry->lock is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
-	struct list_head *list = chunk_hash(chunk->watch.inode);
+	struct fsnotify_mark_entry *entry = &chunk->mark;
+	struct list_head *list;
+
+	if (!entry->inode)
+		return;
+	list = chunk_hash(entry->inode);
 	list_add_rcu(&chunk->hash, list);
 }
 
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 	struct audit_chunk *p;
 
 	list_for_each_entry_rcu(p, list, hash) {
-		if (p->watch.inode == inode) {
+		/* mark.inode may have gone NULL, but who cares? */
+		if (p->mark.inode == inode) {
 			atomic_long_inc(&p->refs);
 			return p;
 		}
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
 static void untag_chunk(struct node *p)
 {
 	struct audit_chunk *chunk = find_chunk(p);
+	struct fsnotify_mark_entry *entry = &chunk->mark;
 	struct audit_chunk *new;
 	struct audit_tree *owner;
 	int size = chunk->count - 1;
 	int i, j;
 
-	if (!pin_inotify_watch(&chunk->watch)) {
-		/*
-		 * Filesystem is shutting down; all watches are getting
-		 * evicted, just take it off the node list for this
-		 * tree and let the eviction logics take care of the
-		 * rest.
-		 */
-		owner = p->owner;
-		if (owner->root == chunk) {
-			list_del_init(&owner->same_root);
-			owner->root = NULL;
-		}
-		list_del_init(&p->list);
-		p->owner = NULL;
-		put_tree(owner);
-		return;
-	}
+	fsnotify_get_mark(entry);
 
 	spin_unlock(&hash_lock);
 
-	/*
-	 * pin_inotify_watch() succeeded, so the watch won't go away
-	 * from under us.
-	 */
-	mutex_lock(&chunk->watch.inode->inotify_mutex);
-	if (chunk->dead) {
-		mutex_unlock(&chunk->watch.inode->inotify_mutex);
+	spin_lock(&entry->lock);
+	if (chunk->dead || !entry->inode) {
+		spin_unlock(&entry->lock);
 		goto out;
 	}
 
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
 		list_del_init(&p->list);
 		list_del_rcu(&chunk->hash);
 		spin_unlock(&hash_lock);
-		inotify_evict_watch(&chunk->watch);
-		mutex_unlock(&chunk->watch.inode->inotify_mutex);
-		put_inotify_watch(&chunk->watch);
+		spin_unlock(&entry->lock);
+		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_put_mark(entry);
 		goto out;
 	}
 
 	new = alloc_chunk(size);
 	if (!new)
 		goto Fallback;
-	if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) {
+	fsnotify_duplicate_mark(&new->mark, entry);
+	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, 1)) {
 		free_chunk(new);
 		goto Fallback;
 	}
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
 	list_for_each_entry(owner, &new->trees, same_root)
 		owner->root = new;
 	spin_unlock(&hash_lock);
-	inotify_evict_watch(&chunk->watch);
-	mutex_unlock(&chunk->watch.inode->inotify_mutex);
-	put_inotify_watch(&chunk->watch);
+	spin_unlock(&entry->lock);
+	fsnotify_destroy_mark_by_entry(entry);
+	fsnotify_put_mark(entry);
 	goto out;
 
 Fallback:
@@ -314,31 +308,33 @@ Fallback:
 	p->owner = NULL;
 	put_tree(owner);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&chunk->watch.inode->inotify_mutex);
+	spin_unlock(&entry->lock);
 out:
-	unpin_inotify_watch(&chunk->watch);
+	fsnotify_put_mark(entry);
 	spin_lock(&hash_lock);
 }
 
 static int create_chunk(struct inode *inode, struct audit_tree *tree)
 {
+	struct fsnotify_mark_entry *entry;
 	struct audit_chunk *chunk = alloc_chunk(1);
 	if (!chunk)
 		return -ENOMEM;
 
-	if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) {
+	entry = &chunk->mark;
+	if (fsnotify_add_mark(entry, audit_tree_group, inode, 0)) {
 		free_chunk(chunk);
 		return -ENOSPC;
 	}
 
-	mutex_lock(&inode->inotify_mutex);
+	spin_lock(&entry->lock);
 	spin_lock(&hash_lock);
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
-		inotify_evict_watch(&chunk->watch);
-		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(&chunk->watch);
+		spin_unlock(&entry->lock);
+		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_put_mark(entry);
 		return 0;
 	}
 	chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,33 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	}
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&inode->inotify_mutex);
+	spin_unlock(&entry->lock);
 	return 0;
 }
 
 /* the first tagged inode becomes root of tree */
 static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 {
-	struct inotify_watch *watch;
+	struct fsnotify_mark_entry *old_entry, *chunk_entry;
 	struct audit_tree *owner;
 	struct audit_chunk *chunk, *old;
 	struct node *p;
 	int n;
 
-	if (inotify_find_watch(rtree_ih, inode, &watch) < 0)
+	spin_lock(&inode->i_lock);
+	old_entry = fsnotify_find_mark_entry(audit_tree_group, inode);
+	spin_unlock(&inode->i_lock);
+	if (!old_entry)
 		return create_chunk(inode, tree);
 
-	old = container_of(watch, struct audit_chunk, watch);
+	old = container_of(old_entry, struct audit_chunk, mark);
 
 	/* are we already there? */
 	spin_lock(&hash_lock);
 	for (n = 0; n < old->count; n++) {
 		if (old->owners[n].owner == tree) {
 			spin_unlock(&hash_lock);
-			put_inotify_watch(&old->watch);
+			fsnotify_put_mark(old_entry);
 			return 0;
 		}
 	}
@@ -382,25 +381,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
 	chunk = alloc_chunk(old->count + 1);
 	if (!chunk) {
-		put_inotify_watch(&old->watch);
+		fsnotify_put_mark(old_entry);
 		return -ENOMEM;
 	}
 
-	mutex_lock(&inode->inotify_mutex);
-	if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
-		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(&old->watch);
+	chunk_entry = &chunk->mark;
+
+	spin_lock(&old_entry->lock);
+	if (!old_entry->inode) {
+		/* old_entry is being shot, lets just lie */
+		spin_unlock(&old_entry->lock);
+		fsnotify_put_mark(old_entry);
 		free_chunk(chunk);
+		return -ENOENT;
+	}
+
+	fsnotify_duplicate_mark(chunk_entry, old_entry);
+	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, 1)) {
+		spin_unlock(&old_entry->lock);
+		free_chunk(chunk);
+		fsnotify_put_mark(old_entry);
 		return -ENOSPC;
 	}
+
+	/* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
+	spin_lock(&chunk_entry->lock);
 	spin_lock(&hash_lock);
+
+	/* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
-		inotify_evict_watch(&chunk->watch);
-		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(&old->watch);
-		put_inotify_watch(&chunk->watch);
+		spin_unlock(&chunk_entry->lock);
+		spin_unlock(&old_entry->lock);
+
+		fsnotify_destroy_mark_by_entry(chunk_entry);
+
+		fsnotify_put_mark(chunk_entry);
+		fsnotify_put_mark(old_entry);
 		return 0;
 	}
 	list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +444,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		list_add(&tree->same_root, &chunk->trees);
 	}
 	spin_unlock(&hash_lock);
-	inotify_evict_watch(&old->watch);
-	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
-	put_inotify_watch(&old->watch); /* and kill it */
+	spin_unlock(&chunk_entry->lock);
+	spin_unlock(&old_entry->lock);
+	fsnotify_destroy_mark_by_entry(old_entry);
+	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+	fsnotify_put_mark(old_entry); /* and kill it */
 	return 0;
 }
 
@@ -584,7 +603,9 @@ void audit_trim_trees(void)
 
 		spin_lock(&hash_lock);
 		list_for_each_entry(node, &tree->chunks, list) {
-			struct inode *inode = find_chunk(node)->watch.inode;
+			struct audit_chunk *chunk = find_chunk(node);
+			/* this could be NULL if the watch is dieing else where... */
+			struct inode *inode = chunk->mark.inode;
 			node->index |= 1U<<31;
 			if (iterate_mounts(compare_root, inode, root_mnt))
 				node->index &= ~(1U<<31);
@@ -846,7 +867,6 @@ void audit_kill_trees(struct list_head *list)
  *  Here comes the stuff asynchronous to auditctl operations
  */
 
-/* inode->inotify_mutex is locked */
 static void evict_chunk(struct audit_chunk *chunk)
 {
 	struct audit_tree *owner;
@@ -885,35 +905,41 @@ static void evict_chunk(struct audit_chunk *chunk)
 	mutex_unlock(&audit_filter_mutex);
 }
 
-static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
-                         u32 cookie, const char *dname, struct inode *inode)
+static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
 {
-	struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
+	BUG();
+	return -EOPNOTSUPP;
+}
 
-	if (mask & IN_IGNORED) {
-		evict_chunk(chunk);
-		put_inotify_watch(watch);
-	}
+static void audit_tree_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+{
+	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+
+	evict_chunk(chunk);
+	fsnotify_put_mark(entry);
 }
 
-static void destroy_watch(struct inotify_watch *watch)
+static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
 {
-	struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
-	call_rcu(&chunk->head, __put_chunk);
+	return 0;
 }
 
-static const struct inotify_operations rtree_inotify_ops = {
-	.handle_event	= handle_event,
-	.destroy_watch	= destroy_watch,
+static const struct fsnotify_ops audit_tree_ops = {
+	.handle_event = audit_tree_handle_event,
+	.should_send_event = audit_tree_send_event,
+	.free_group_priv = NULL,
+	.free_event_priv = NULL,
+	.freeing_mark = audit_tree_freeing_mark,
 };
 
 static int __init audit_tree_init(void)
 {
 	int i;
 
-	rtree_ih = inotify_init(&rtree_inotify_ops);
-	if (IS_ERR(rtree_ih))
-		audit_panic("cannot initialize inotify handle for rectree watches");
+	audit_tree_group = fsnotify_obtain_group(AUDIT_TREE_GROUP_NUM,
+						 0, &audit_tree_ops);
+	if (IS_ERR(audit_tree_group))
+		audit_panic("cannot initialize fsnotify group for rectree watches");
 
 	for (i = 0; i < HASH_SIZE; i++)
 		INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 240063c370e6..786901cd8217 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1725,7 +1725,7 @@ static inline void handle_one(const struct inode *inode)
 	struct audit_tree_refs *p;
 	struct audit_chunk *chunk;
 	int count;
-	if (likely(list_empty(&inode->inotify_watches)))
+	if (likely(hlist_empty(&inode->i_fsnotify_mark_entries)))
 		return;
 	context = current->audit_context;
 	p = context->trees;
@@ -1768,7 +1768,7 @@ retry:
 	seq = read_seqbegin(&rename_lock);
 	for(;;) {
 		struct inode *inode = d->d_inode;
-		if (inode && unlikely(!list_empty(&inode->inotify_watches))) {
+		if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_mark_entries))) {
 			struct audit_chunk *chunk;
 			chunk = audit_tree_lookup(inode);
 			if (chunk) {
-- 
cgit v1.2.3-59-g8ed1b


From 2dfc1cae4c42b93b831b2417540df2b895ab7108 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 20:30:52 -0500
Subject: inotify: remove inotify in kernel interface

nothing uses inotify in the kernel, drop it!

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 Documentation/feature-removal-schedule.txt |   8 -
 fs/inode.c                                 |   6 -
 fs/notify/inotify/Kconfig                  |  15 -
 fs/notify/inotify/Makefile                 |   1 -
 fs/notify/inotify/inotify.c                | 873 -----------------------------
 fs/open.c                                  |   1 +
 include/linux/fs.h                         |   5 -
 include/linux/fsnotify.h                   |  50 +-
 include/linux/inotify.h                    | 174 ------
 kernel/auditsc.c                           |   1 -
 10 files changed, 4 insertions(+), 1130 deletions(-)
 delete mode 100644 fs/notify/inotify/inotify.c

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 1571c0c83dba..a8188bd3dab8 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -367,14 +367,6 @@ When:	2.6.33
 Why:	Should be implemented in userspace, policy daemon.
 Who:	Johannes Berg <johannes@sipsolutions.net>
 
----------------------------
-
-What:	CONFIG_INOTIFY
-When:	2.6.33
-Why:	last user (audit) will be converted to the newer more generic
-	and more easily maintained fsnotify subsystem
-Who:	Eric Paris <eparis@redhat.com>
-
 ----------------------------
 
 What:	lock_policy_rwsem_* and unlock_policy_rwsem_* will not be
diff --git a/fs/inode.c b/fs/inode.c
index 722860b323a9..8e1bee998796 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,7 +20,6 @@
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
-#include <linux/inotify.h>
 #include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
@@ -264,10 +263,6 @@ void inode_init_once(struct inode *inode)
 	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
 	i_size_ordered_init(inode);
-#ifdef CONFIG_INOTIFY
-	INIT_LIST_HEAD(&inode->inotify_watches);
-	mutex_init(&inode->inotify_mutex);
-#endif
 #ifdef CONFIG_FSNOTIFY
 	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
 #endif
@@ -413,7 +408,6 @@ int invalidate_inodes(struct super_block *sb)
 
 	down_write(&iprune_sem);
 	spin_lock(&inode_lock);
-	inotify_unmount_inodes(&sb->s_inodes);
 	fsnotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b3a159b21cfd..b981fc0c8379 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,18 +1,3 @@
-config INOTIFY
-	bool "Inotify file change notification support"
-	default n
-	---help---
-	  Say Y here to enable legacy in kernel inotify support.  Inotify is a
-	  file change notification system.  It is a replacement for dnotify.
-	  This option only provides the legacy inotify in kernel API.  There
-	  are no in tree kernel users of this interface since it is deprecated.
-	  You only need this if you are loading an out of tree kernel module
-	  that uses inotify.
-
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
-
-	  If unsure, say N.
-
 config INOTIFY_USER
 	bool "Inotify support for userspace"
 	select ANON_INODES
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index 943828171362..a380dabe09de 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1 @@
-obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
deleted file mode 100644
index 27b75ebc7460..000000000000
--- a/fs/notify/inotify/inotify.c
+++ /dev/null
@@ -1,873 +0,0 @@
-/*
- * fs/inotify.c - inode-based file event notifications
- *
- * Authors:
- *	John McCutchan	<ttb@tentacle.dhs.org>
- *	Robert Love	<rml@novell.com>
- *
- * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
- *
- * Copyright (C) 2005 John McCutchan
- * Copyright 2006 Hewlett-Packard Development Company, L.P.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/writeback.h>
-#include <linux/inotify.h>
-#include <linux/fsnotify_backend.h>
-
-static atomic_t inotify_cookie;
-
-/*
- * Lock ordering:
- *
- * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
- * iprune_mutex (synchronize shrink_icache_memory())
- * 	inode_lock (protects the super_block->s_inodes list)
- * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
- *
- * The inode->inotify_mutex and inotify_handle->mutex and held during execution
- * of a caller's event handler.  Thus, the caller must not hold any locks
- * taken in their event handler while calling any of the published inotify
- * interfaces.
- */
-
-/*
- * Lifetimes of the three main data structures--inotify_handle, inode, and
- * inotify_watch--are managed by reference count.
- *
- * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
- * Additional references can bump the count via get_inotify_handle() and drop
- * the count via put_inotify_handle().
- *
- * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
- * to remove_watch_no_event().  Additional references can bump the count via
- * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
- * is reponsible for the final put after receiving IN_IGNORED, or when using
- * IN_ONESHOT after receiving the first event.  Inotify does the final put if
- * inotify_destroy() is called.
- *
- * inode: Pinned so long as the inode is associated with a watch, from
- * inotify_add_watch() to the final put_inotify_watch().
- */
-
-/*
- * struct inotify_handle - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_handle {
-	struct idr		idr;		/* idr mapping wd -> watch */
-	struct mutex		mutex;		/* protects this bad boy */
-	struct list_head	watches;	/* list of watches */
-	atomic_t		count;		/* reference count */
-	u32			last_wd;	/* the last wd allocated */
-	const struct inotify_operations *in_ops; /* inotify caller operations */
-};
-
-static inline void get_inotify_handle(struct inotify_handle *ih)
-{
-	atomic_inc(&ih->count);
-}
-
-static inline void put_inotify_handle(struct inotify_handle *ih)
-{
-	if (atomic_dec_and_test(&ih->count)) {
-		idr_destroy(&ih->idr);
-		kfree(ih);
-	}
-}
-
-/**
- * get_inotify_watch - grab a reference to an inotify_watch
- * @watch: watch to grab
- */
-void get_inotify_watch(struct inotify_watch *watch)
-{
-	atomic_inc(&watch->count);
-}
-EXPORT_SYMBOL_GPL(get_inotify_watch);
-
-int pin_inotify_watch(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	if (atomic_inc_not_zero(&sb->s_active)) {
-		atomic_inc(&watch->count);
-		return 1;
-	}
-	return 0;
-}
-
-/**
- * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * watch references if the count reaches zero.  inotify_watch is freed by
- * inotify callers via the destroy_watch() op.
- * @watch: watch to release
- */
-void put_inotify_watch(struct inotify_watch *watch)
-{
-	if (atomic_dec_and_test(&watch->count)) {
-		struct inotify_handle *ih = watch->ih;
-
-		iput(watch->inode);
-		ih->in_ops->destroy_watch(watch);
-		put_inotify_handle(ih);
-	}
-}
-EXPORT_SYMBOL_GPL(put_inotify_watch);
-
-void unpin_inotify_watch(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	put_inotify_watch(watch);
-	deactivate_super(sb);
-}
-
-/*
- * inotify_handle_get_wd - returns the next WD for use by the given handle
- *
- * Callers must hold ih->mutex.  This function can sleep.
- */
-static int inotify_handle_get_wd(struct inotify_handle *ih,
-				 struct inotify_watch *watch)
-{
-	int ret;
-
-	do {
-		if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
-			return -ENOSPC;
-		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
-	} while (ret == -EAGAIN);
-
-	if (likely(!ret))
-		ih->last_wd = watch->wd;
-
-	return ret;
-}
-
-/*
- * inotify_inode_watched - returns nonzero if there are watches on this inode
- * and zero otherwise.  We call this lockless, we do not care if we race.
- */
-static inline int inotify_inode_watched(struct inode *inode)
-{
-	return !list_empty(&inode->inotify_watches);
-}
-
-/*
- * Get child dentry flag into synch with parent inode.
- * Flag should always be clear for negative dentrys.
- */
-static void set_dentry_child_flags(struct inode *inode, int watched)
-{
-	struct dentry *alias;
-
-	spin_lock(&dcache_lock);
-	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
-		struct dentry *child;
-
-		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
-			if (!child->d_inode)
-				continue;
-
-			spin_lock(&child->d_lock);
-			if (watched)
-				child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-			else
-				child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
-			spin_unlock(&child->d_lock);
-		}
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * inotify_find_handle - find the watch associated with the given inode and
- * handle
- *
- * Callers must hold inode->inotify_mutex.
- */
-static struct inotify_watch *inode_find_handle(struct inode *inode,
-					       struct inotify_handle *ih)
-{
-	struct inotify_watch *watch;
-
-	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-		if (watch->ih == ih)
-			return watch;
-	}
-
-	return NULL;
-}
-
-/*
- * remove_watch_no_event - remove watch without the IN_IGNORED event.
- *
- * Callers must hold both inode->inotify_mutex and ih->mutex.
- */
-static void remove_watch_no_event(struct inotify_watch *watch,
-				  struct inotify_handle *ih)
-{
-	list_del(&watch->i_list);
-	list_del(&watch->h_list);
-
-	if (!inotify_inode_watched(watch->inode))
-		set_dentry_child_flags(watch->inode, 0);
-
-	idr_remove(&ih->idr, watch->wd);
-}
-
-/**
- * inotify_remove_watch_locked - Remove a watch from both the handle and the
- * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
- * watched.  May be invoked from a caller's event handler.
- * @ih: inotify handle associated with watch
- * @watch: watch to remove
- *
- * Callers must hold both inode->inotify_mutex and ih->mutex.
- */
-void inotify_remove_watch_locked(struct inotify_handle *ih,
-				 struct inotify_watch *watch)
-{
-	remove_watch_no_event(watch, ih);
-	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
-}
-EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
-
-/* Kernel API for producing events */
-
-/*
- * inotify_d_instantiate - instantiate dcache entry for inode
- */
-void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
-{
-	struct dentry *parent;
-
-	if (!inode)
-		return;
-
-	spin_lock(&entry->d_lock);
-	parent = entry->d_parent;
-	if (parent->d_inode && inotify_inode_watched(parent->d_inode))
-		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-	spin_unlock(&entry->d_lock);
-}
-
-/*
- * inotify_d_move - dcache entry has been moved
- */
-void inotify_d_move(struct dentry *entry)
-{
-	struct dentry *parent;
-
-	parent = entry->d_parent;
-	if (inotify_inode_watched(parent->d_inode))
-		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-	else
-		entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
-}
-
-/**
- * inotify_inode_queue_event - queue an event to all watches on this inode
- * @inode: inode event is originating from
- * @mask: event mask describing this event
- * @cookie: cookie for synchronization, or zero
- * @name: filename, if any
- * @n_inode: inode associated with name
- */
-void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
-			       const char *name, struct inode *n_inode)
-{
-	struct inotify_watch *watch, *next;
-
-	if (!inotify_inode_watched(inode))
-		return;
-
-	mutex_lock(&inode->inotify_mutex);
-	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		u32 watch_mask = watch->mask;
-		if (watch_mask & mask) {
-			struct inotify_handle *ih= watch->ih;
-			mutex_lock(&ih->mutex);
-			if (watch_mask & IN_ONESHOT)
-				remove_watch_no_event(watch, ih);
-			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
-						 name, n_inode);
-			mutex_unlock(&ih->mutex);
-		}
-	}
-	mutex_unlock(&inode->inotify_mutex);
-}
-EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
-
-/**
- * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
- * @dentry: the dentry in question, we queue against this dentry's parent
- * @mask: event mask describing this event
- * @cookie: cookie for synchronization, or zero
- * @name: filename, if any
- */
-void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
-				       u32 cookie, const char *name)
-{
-	struct dentry *parent;
-	struct inode *inode;
-
-	if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
-		return;
-
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	inode = parent->d_inode;
-
-	if (inotify_inode_watched(inode)) {
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		inotify_inode_queue_event(inode, mask, cookie, name,
-					  dentry->d_inode);
-		dput(parent);
-	} else
-		spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
-
-/**
- * inotify_get_cookie - return a unique cookie for use in synchronizing events.
- */
-u32 inotify_get_cookie(void)
-{
-	return atomic_inc_return(&inotify_cookie);
-}
-EXPORT_SYMBOL_GPL(inotify_get_cookie);
-
-/**
- * inotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @list: list of inodes being unmounted (sb->s_inodes)
- *
- * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
- * We temporarily drop inode_lock, however, and CAN block.
- */
-void inotify_unmount_inodes(struct list_head *list)
-{
-	struct inode *inode, *next_i, *need_iput = NULL;
-
-	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
-		struct inotify_watch *watch, *next_w;
-		struct inode *need_iput_tmp;
-		struct list_head *watches;
-
-		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
-		 * I_WILL_FREE, or I_NEW which is fine because by that point
-		 * the inode cannot have any associated watches.
-		 */
-		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-
-		/*
-		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with MS_ACTIVE clear would actually
-		 * evict all inodes with zero i_count from icache which is
-		 * unnecessarily violent and may in fact be illegal to do.
-		 */
-		if (!atomic_read(&inode->i_count))
-			continue;
-
-		need_iput_tmp = need_iput;
-		need_iput = NULL;
-		/* In case inotify_remove_watch_locked() drops a reference. */
-		if (inode != need_iput_tmp)
-			__iget(inode);
-		else
-			need_iput_tmp = NULL;
-		/* In case the dropping of a reference would nuke next_i. */
-		if ((&next_i->i_sb_list != list) &&
-				atomic_read(&next_i->i_count) &&
-				!(next_i->i_state & (I_CLEAR | I_FREEING |
-					I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
-		}
-
-		/*
-		 * We can safely drop inode_lock here because we hold
-		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.  Finally,
-		 * iprune_mutex keeps shrink_icache_memory() away.
-		 */
-		spin_unlock(&inode_lock);
-
-		if (need_iput_tmp)
-			iput(need_iput_tmp);
-
-		/* for each watch, send IN_UNMOUNT and then remove it */
-		mutex_lock(&inode->inotify_mutex);
-		watches = &inode->inotify_watches;
-		list_for_each_entry_safe(watch, next_w, watches, i_list) {
-			struct inotify_handle *ih= watch->ih;
-			get_inotify_watch(watch);
-			mutex_lock(&ih->mutex);
-			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
-						 NULL, NULL);
-			inotify_remove_watch_locked(ih, watch);
-			mutex_unlock(&ih->mutex);
-			put_inotify_watch(watch);
-		}
-		mutex_unlock(&inode->inotify_mutex);
-		iput(inode);		
-
-		spin_lock(&inode_lock);
-	}
-}
-EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
-
-/**
- * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
- * @inode: inode that is about to be removed
- */
-void inotify_inode_is_dead(struct inode *inode)
-{
-	struct inotify_watch *watch, *next;
-
-	mutex_lock(&inode->inotify_mutex);
-	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		struct inotify_handle *ih = watch->ih;
-		mutex_lock(&ih->mutex);
-		inotify_remove_watch_locked(ih, watch);
-		mutex_unlock(&ih->mutex);
-	}
-	mutex_unlock(&inode->inotify_mutex);
-}
-EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
-
-/* Kernel Consumer API */
-
-/**
- * inotify_init - allocate and initialize an inotify instance
- * @ops: caller's inotify operations
- */
-struct inotify_handle *inotify_init(const struct inotify_operations *ops)
-{
-	struct inotify_handle *ih;
-
-	ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
-	if (unlikely(!ih))
-		return ERR_PTR(-ENOMEM);
-
-	idr_init(&ih->idr);
-	INIT_LIST_HEAD(&ih->watches);
-	mutex_init(&ih->mutex);
-	ih->last_wd = 0;
-	ih->in_ops = ops;
-	atomic_set(&ih->count, 0);
-	get_inotify_handle(ih);
-
-	return ih;
-}
-EXPORT_SYMBOL_GPL(inotify_init);
-
-/**
- * inotify_init_watch - initialize an inotify watch
- * @watch: watch to initialize
- */
-void inotify_init_watch(struct inotify_watch *watch)
-{
-	INIT_LIST_HEAD(&watch->h_list);
-	INIT_LIST_HEAD(&watch->i_list);
-	atomic_set(&watch->count, 0);
-	get_inotify_watch(watch); /* initial get */
-}
-EXPORT_SYMBOL_GPL(inotify_init_watch);
-
-/*
- * Watch removals suck violently.  To kick the watch out we need (in this
- * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
- * a hold on inode; however, for all other cases we need to make damn sure
- * we don't race with umount.  We can *NOT* just grab a reference to a
- * watch - inotify_unmount_inodes() will happily sail past it and we'll end
- * with reference to inode potentially outliving its superblock.  Ideally
- * we just want to grab an active reference to superblock if we can; that
- * will make sure we won't go into inotify_umount_inodes() until we are
- * done.  Cleanup is just deactivate_super().  However, that leaves a messy
- * case - what if we *are* racing with umount() and active references to
- * superblock can't be acquired anymore?  We can bump ->s_count, grab
- * ->s_umount, which will wait until the superblock is shut down and the
- * watch in question is pining for fjords.
- *
- * And yes, this is far beyond mere "not very pretty"; so's the entire
- * concept of inotify to start with.
- */
-
-/**
- * pin_to_kill - pin the watch down for removal
- * @ih: inotify handle
- * @watch: watch to kill
- *
- * Called with ih->mutex held, drops it.  Possible return values:
- * 0 - nothing to do, it has died
- * 1 - remove it, drop the reference and deactivate_super()
- */
-static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-
-	if (atomic_inc_not_zero(&sb->s_active)) {
-		get_inotify_watch(watch);
-		mutex_unlock(&ih->mutex);
-		return 1;	/* the best outcome */
-	}
-	spin_lock(&sb_lock);
-	sb->s_count++;
-	spin_unlock(&sb_lock);
-	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
-	down_read(&sb->s_umount);
-	/* fs is already shut down; the watch is dead */
-	drop_super(sb);
-	return 0;
-}
-
-static void unpin_and_kill(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	put_inotify_watch(watch);
-	deactivate_super(sb);
-}
-
-/**
- * inotify_destroy - clean up and destroy an inotify instance
- * @ih: inotify handle
- */
-void inotify_destroy(struct inotify_handle *ih)
-{
-	/*
-	 * Destroy all of the watches for this handle. Unfortunately, not very
-	 * pretty.  We cannot do a simple iteration over the list, because we
-	 * do not know the inode until we iterate to the watch.  But we need to
-	 * hold inode->inotify_mutex before ih->mutex.  The following works.
-	 *
-	 * AV: it had to become even uglier to start working ;-/
-	 */
-	while (1) {
-		struct inotify_watch *watch;
-		struct list_head *watches;
-		struct super_block *sb;
-		struct inode *inode;
-
-		mutex_lock(&ih->mutex);
-		watches = &ih->watches;
-		if (list_empty(watches)) {
-			mutex_unlock(&ih->mutex);
-			break;
-		}
-		watch = list_first_entry(watches, struct inotify_watch, h_list);
-		sb = watch->inode->i_sb;
-		if (!pin_to_kill(ih, watch))
-			continue;
-
-		inode = watch->inode;
-		mutex_lock(&inode->inotify_mutex);
-		mutex_lock(&ih->mutex);
-
-		/* make sure we didn't race with another list removal */
-		if (likely(idr_find(&ih->idr, watch->wd))) {
-			remove_watch_no_event(watch, ih);
-			put_inotify_watch(watch);
-		}
-
-		mutex_unlock(&ih->mutex);
-		mutex_unlock(&inode->inotify_mutex);
-		unpin_and_kill(watch);
-	}
-
-	/* free this handle: the put matching the get in inotify_init() */
-	put_inotify_handle(ih);
-}
-EXPORT_SYMBOL_GPL(inotify_destroy);
-
-/**
- * inotify_find_watch - find an existing watch for an (ih,inode) pair
- * @ih: inotify handle
- * @inode: inode to watch
- * @watchp: pointer to existing inotify_watch
- *
- * Caller must pin given inode (via nameidata).
- */
-s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
-		       struct inotify_watch **watchp)
-{
-	struct inotify_watch *old;
-	int ret = -ENOENT;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	old = inode_find_handle(inode, ih);
-	if (unlikely(old)) {
-		get_inotify_watch(old); /* caller must put watch */
-		*watchp = old;
-		ret = old->wd;
-	}
-
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_find_watch);
-
-/**
- * inotify_find_update_watch - find and update the mask of an existing watch
- * @ih: inotify handle
- * @inode: inode's watch to update
- * @mask: mask of events to watch
- *
- * Caller must pin given inode (via nameidata).
- */
-s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
-			      u32 mask)
-{
-	struct inotify_watch *old;
-	int mask_add = 0;
-	int ret;
-
-	if (mask & IN_MASK_ADD)
-		mask_add = 1;
-
-	/* don't allow invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask))
-		return -EINVAL;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/*
-	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
-	 * are already watching.  We just update the mask and return its wd.
-	 */
-	old = inode_find_handle(inode, ih);
-	if (unlikely(!old)) {
-		ret = -ENOENT;
-		goto out;
-	}
-
-	if (mask_add)
-		old->mask |= mask;
-	else
-		old->mask = mask;
-	ret = old->wd;
-out:
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_find_update_watch);
-
-/**
- * inotify_add_watch - add a watch to an inotify instance
- * @ih: inotify handle
- * @watch: caller allocated watch structure
- * @inode: inode to watch
- * @mask: mask of events to watch
- *
- * Caller must pin given inode (via nameidata).
- * Caller must ensure it only calls inotify_add_watch() once per watch.
- * Calls inotify_handle_get_wd() so may sleep.
- */
-s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
-		      struct inode *inode, u32 mask)
-{
-	int ret = 0;
-	int newly_watched;
-
-	/* don't allow invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask))
-		return -EINVAL;
-	watch->mask = mask;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/* Initialize a new watch */
-	ret = inotify_handle_get_wd(ih, watch);
-	if (unlikely(ret))
-		goto out;
-	ret = watch->wd;
-
-	/* save a reference to handle and bump the count to make it official */
-	get_inotify_handle(ih);
-	watch->ih = ih;
-
-	/*
-	 * Save a reference to the inode and bump the ref count to make it
-	 * official.  We hold a reference to nameidata, which makes this safe.
-	 */
-	watch->inode = igrab(inode);
-
-	/* Add the watch to the handle's and the inode's list */
-	newly_watched = !inotify_inode_watched(inode);
-	list_add(&watch->h_list, &ih->watches);
-	list_add(&watch->i_list, &inode->inotify_watches);
-	/*
-	 * Set child flags _after_ adding the watch, so there is no race
-	 * windows where newly instantiated children could miss their parent's
-	 * watched flag.
-	 */
-	if (newly_watched)
-		set_dentry_child_flags(inode, 1);
-
-out:
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_add_watch);
-
-/**
- * inotify_clone_watch - put the watch next to existing one
- * @old: already installed watch
- * @new: new watch
- *
- * Caller must hold the inotify_mutex of inode we are dealing with;
- * it is expected to remove the old watch before unlocking the inode.
- */
-s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
-{
-	struct inotify_handle *ih = old->ih;
-	int ret = 0;
-
-	new->mask = old->mask;
-	new->ih = ih;
-
-	mutex_lock(&ih->mutex);
-
-	/* Initialize a new watch */
-	ret = inotify_handle_get_wd(ih, new);
-	if (unlikely(ret))
-		goto out;
-	ret = new->wd;
-
-	get_inotify_handle(ih);
-
-	new->inode = igrab(old->inode);
-
-	list_add(&new->h_list, &ih->watches);
-	list_add(&new->i_list, &old->inode->inotify_watches);
-out:
-	mutex_unlock(&ih->mutex);
-	return ret;
-}
-
-void inotify_evict_watch(struct inotify_watch *watch)
-{
-	get_inotify_watch(watch);
-	mutex_lock(&watch->ih->mutex);
-	inotify_remove_watch_locked(watch->ih, watch);
-	mutex_unlock(&watch->ih->mutex);
-}
-
-/**
- * inotify_rm_wd - remove a watch from an inotify instance
- * @ih: inotify handle
- * @wd: watch descriptor to remove
- *
- * Can sleep.
- */
-int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
-{
-	struct inotify_watch *watch;
-	struct super_block *sb;
-	struct inode *inode;
-
-	mutex_lock(&ih->mutex);
-	watch = idr_find(&ih->idr, wd);
-	if (unlikely(!watch)) {
-		mutex_unlock(&ih->mutex);
-		return -EINVAL;
-	}
-	sb = watch->inode->i_sb;
-	if (!pin_to_kill(ih, watch))
-		return 0;
-
-	inode = watch->inode;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/* make sure that we did not race */
-	if (likely(idr_find(&ih->idr, wd) == watch))
-		inotify_remove_watch_locked(ih, watch);
-
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	unpin_and_kill(watch);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(inotify_rm_wd);
-
-/**
- * inotify_rm_watch - remove a watch from an inotify instance
- * @ih: inotify handle
- * @watch: watch to remove
- *
- * Can sleep.
- */
-int inotify_rm_watch(struct inotify_handle *ih,
-		     struct inotify_watch *watch)
-{
-	return inotify_rm_wd(ih, watch->wd);
-}
-EXPORT_SYMBOL_GPL(inotify_rm_watch);
-
-/*
- * inotify_setup - core initialization function
- */
-static int __init inotify_setup(void)
-{
-	BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
-	BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
-	BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
-	BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
-	BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
-	BUILD_BUG_ON(IN_OPEN != FS_OPEN);
-	BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
-	BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
-	BUILD_BUG_ON(IN_CREATE != FS_CREATE);
-	BUILD_BUG_ON(IN_DELETE != FS_DELETE);
-	BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
-	BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
-	BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
-
-	BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
-	BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
-	BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
-	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
-
-	atomic_set(&inotify_cookie, 0);
-
-	return 0;
-}
-
-module_init(inotify_setup);
diff --git a/fs/open.c b/fs/open.c
index 5463266db9e6..94d54d3efa8b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
 #include <linux/falloc.h>
 #include <linux/fs_struct.h>
 #include <linux/ima.h>
+#include <linux/dnotify.h>
 
 #include "internal.h"
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68ca1b0491af..e5598d2f99b9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -771,11 +771,6 @@ struct inode {
 	struct hlist_head	i_fsnotify_mark_entries; /* fsnotify mark entries */
 #endif
 
-#ifdef CONFIG_INOTIFY
-	struct list_head	inotify_watches; /* watches on this inode */
-	struct mutex		inotify_mutex;	/* protects the watches list */
-#endif
-
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 01755909ce81..f958e93feb97 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -11,8 +11,6 @@
  * (C) Copyright 2005 Robert Love
  */
 
-#include <linux/dnotify.h>
-#include <linux/inotify.h>
 #include <linux/fsnotify_backend.h>
 #include <linux/audit.h>
 #include <linux/slab.h>
@@ -25,16 +23,12 @@ static inline void fsnotify_d_instantiate(struct dentry *entry,
 						struct inode *inode)
 {
 	__fsnotify_d_instantiate(entry, inode);
-
-	inotify_d_instantiate(entry, inode);
 }
 
 /* Notify this dentry's parent about a child's events. */
 static inline void fsnotify_parent(struct dentry *dentry, __u32 mask)
 {
 	__fsnotify_parent(dentry, mask);
-
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 }
 
 /*
@@ -48,8 +42,6 @@ static inline void fsnotify_d_move(struct dentry *entry)
 	 * cares about events from this entry.
 	 */
 	__fsnotify_update_dcache_flags(entry);
-
-	inotify_d_move(entry);
 }
 
 /*
@@ -57,8 +49,6 @@ static inline void fsnotify_d_move(struct dentry *entry)
  */
 static inline void fsnotify_link_count(struct inode *inode)
 {
-	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
-
 	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
@@ -70,7 +60,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 				 int isdir, struct inode *target, struct dentry *moved)
 {
 	struct inode *source = moved->d_inode;
-	u32 in_cookie = inotify_get_cookie();
 	u32 fs_cookie = fsnotify_get_cookie();
 	__u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM);
 	__u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO);
@@ -80,31 +69,18 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		old_dir_mask |= FS_DN_RENAME;
 
 	if (isdir) {
-		isdir = IN_ISDIR;
 		old_dir_mask |= FS_IN_ISDIR;
 		new_dir_mask |= FS_IN_ISDIR;
 	}
 
-	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name,
-				  source);
-	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name,
-				  source);
-
 	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie);
 	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie);
 
-	if (target) {
-		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
-		inotify_inode_is_dead(target);
-
-		/* this is really a link_count change not a removal */
+	if (target)
 		fsnotify_link_count(target);
-	}
 
-	if (source) {
-		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
+	if (source)
 		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
-	}
 	audit_inode_child(moved, new_dir);
 }
 
@@ -134,9 +110,6 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
  */
 static inline void fsnotify_inoderemove(struct inode *inode)
 {
-	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
-	inotify_inode_is_dead(inode);
-
 	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	__fsnotify_inode_delete(inode);
 }
@@ -146,8 +119,6 @@ static inline void fsnotify_inoderemove(struct inode *inode)
  */
 static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
-	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
-				  dentry->d_inode);
 	audit_inode_child(dentry, inode);
 
 	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
@@ -160,8 +131,6 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
  */
 static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry)
 {
-	inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name,
-				  inode);
 	fsnotify_link_count(inode);
 	audit_inode_child(new_dentry, dir);
 
@@ -176,7 +145,6 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	__u32 mask = (FS_CREATE | FS_IN_ISDIR);
 	struct inode *d_inode = dentry->d_inode;
 
-	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry, inode);
 
 	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
@@ -193,8 +161,6 @@ static inline void fsnotify_access(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
-
 	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
@@ -210,8 +176,6 @@ static inline void fsnotify_modify(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
-
 	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
@@ -227,8 +191,6 @@ static inline void fsnotify_open(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
-
 	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
@@ -246,8 +208,6 @@ static inline void fsnotify_close(struct file *file)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
-
 	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
@@ -263,8 +223,6 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
-
 	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
@@ -299,14 +257,12 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 	if (mask) {
 		if (S_ISDIR(inode->i_mode))
 			mask |= FS_IN_ISDIR;
-		inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
-
 		fsnotify_parent(dentry, mask);
 		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	}
 }
 
-#if defined(CONFIG_INOTIFY) || defined(CONFIG_FSNOTIFY)	/* notify helpers */
+#if defined(CONFIG_FSNOTIFY)	/* notify helpers */
 
 /*
  * fsnotify_oldname_init - save off the old filename before we change it
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 37ea2894b3c0..959a38b8f75d 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -69,178 +69,4 @@ struct inotify_event {
 #define IN_CLOEXEC O_CLOEXEC
 #define IN_NONBLOCK O_NONBLOCK
 
-#ifdef __KERNEL__
-
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-/*
- * struct inotify_watch - represents a watch request on a specific inode
- *
- * h_list is protected by ih->mutex of the associated inotify_handle.
- * i_list, mask are protected by inode->inotify_mutex of the associated inode.
- * ih, inode, and wd are never written to once the watch is created.
- *
- * Callers must use the established inotify interfaces to access inotify_watch
- * contents.  The content of this structure is private to the inotify
- * implementation.
- */
-struct inotify_watch {
-	struct list_head	h_list;	/* entry in inotify_handle's list */
-	struct list_head	i_list;	/* entry in inode's list */
-	atomic_t		count;	/* reference count */
-	struct inotify_handle	*ih;	/* associated inotify handle */
-	struct inode		*inode;	/* associated inode */
-	__s32			wd;	/* watch descriptor */
-	__u32			mask;	/* event mask for this watch */
-};
-
-struct inotify_operations {
-	void (*handle_event)(struct inotify_watch *, u32, u32, u32,
-			     const char *, struct inode *);
-	void (*destroy_watch)(struct inotify_watch *);
-};
-
-#ifdef CONFIG_INOTIFY
-
-/* Kernel API for producing events */
-
-extern void inotify_d_instantiate(struct dentry *, struct inode *);
-extern void inotify_d_move(struct dentry *);
-extern void inotify_inode_queue_event(struct inode *, __u32, __u32,
-				      const char *, struct inode *);
-extern void inotify_dentry_parent_queue_event(struct dentry *, __u32, __u32,
-					      const char *);
-extern void inotify_unmount_inodes(struct list_head *);
-extern void inotify_inode_is_dead(struct inode *);
-extern u32 inotify_get_cookie(void);
-
-/* Kernel Consumer API */
-
-extern struct inotify_handle *inotify_init(const struct inotify_operations *);
-extern void inotify_init_watch(struct inotify_watch *);
-extern void inotify_destroy(struct inotify_handle *);
-extern __s32 inotify_find_watch(struct inotify_handle *, struct inode *,
-				struct inotify_watch **);
-extern __s32 inotify_find_update_watch(struct inotify_handle *, struct inode *,
-				       u32);
-extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *,
-			       struct inode *, __u32);
-extern __s32 inotify_clone_watch(struct inotify_watch *, struct inotify_watch *);
-extern void inotify_evict_watch(struct inotify_watch *);
-extern int inotify_rm_watch(struct inotify_handle *, struct inotify_watch *);
-extern int inotify_rm_wd(struct inotify_handle *, __u32);
-extern void inotify_remove_watch_locked(struct inotify_handle *,
-					struct inotify_watch *);
-extern void get_inotify_watch(struct inotify_watch *);
-extern void put_inotify_watch(struct inotify_watch *);
-extern int pin_inotify_watch(struct inotify_watch *);
-extern void unpin_inotify_watch(struct inotify_watch *);
-
-#else
-
-static inline void inotify_d_instantiate(struct dentry *dentry,
-					struct inode *inode)
-{
-}
-
-static inline void inotify_d_move(struct dentry *dentry)
-{
-}
-
-static inline void inotify_inode_queue_event(struct inode *inode,
-					     __u32 mask, __u32 cookie,
-					     const char *filename,
-					     struct inode *n_inode)
-{
-}
-
-static inline void inotify_dentry_parent_queue_event(struct dentry *dentry,
-						     __u32 mask, __u32 cookie,
-						     const char *filename)
-{
-}
-
-static inline void inotify_unmount_inodes(struct list_head *list)
-{
-}
-
-static inline void inotify_inode_is_dead(struct inode *inode)
-{
-}
-
-static inline u32 inotify_get_cookie(void)
-{
-	return 0;
-}
-
-static inline struct inotify_handle *inotify_init(const struct inotify_operations *ops)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void inotify_init_watch(struct inotify_watch *watch)
-{
-}
-
-static inline void inotify_destroy(struct inotify_handle *ih)
-{
-}
-
-static inline __s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
-				       struct inotify_watch **watchp)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline __s32 inotify_find_update_watch(struct inotify_handle *ih,
-					      struct inode *inode, u32 mask)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline __s32 inotify_add_watch(struct inotify_handle *ih,
-				      struct inotify_watch *watch,
-				      struct inode *inode, __u32 mask)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int inotify_rm_watch(struct inotify_handle *ih,
-				   struct inotify_watch *watch)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int inotify_rm_wd(struct inotify_handle *ih, __u32 wd)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void inotify_remove_watch_locked(struct inotify_handle *ih,
-					       struct inotify_watch *watch)
-{
-}
-
-static inline void get_inotify_watch(struct inotify_watch *watch)
-{
-}
-
-static inline void put_inotify_watch(struct inotify_watch *watch)
-{
-}
-
-extern inline int pin_inotify_watch(struct inotify_watch *watch)
-{
-	return 0;
-}
-
-extern inline void unpin_inotify_watch(struct inotify_watch *watch)
-{
-}
-
-#endif	/* CONFIG_INOTIFY */
-
-#endif	/* __KERNEL __ */
-
 #endif	/* _LINUX_INOTIFY_H */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 786901cd8217..853185f7ba7e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
 #include <linux/binfmts.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
-#include <linux/inotify.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
 
-- 
cgit v1.2.3-59-g8ed1b


From 7b0a04fbfb35650941af87728d4891515b4fc179 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:21 -0500
Subject: fsnotify: provide the data type to should_send_event

fanotify is only interested in event types which contain enough information
to open the original file in the context of the fanotify listener.  Since
fanotify may not want to send events if that data isn't present we pass
the data type to the should_send_event function call so fanotify can express
its lack of interest.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          | 3 ++-
 fs/notify/fsnotify.c                 | 2 +-
 fs/notify/inotify/inotify_fsnotify.c | 3 ++-
 include/linux/fsnotify_backend.h     | 3 ++-
 kernel/audit_tree.c                  | 3 ++-
 kernel/audit_watch.c                 | 3 ++-
 6 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 85b97fca14de..6f30f496e235 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -133,7 +133,8 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  * userspace notification for that pair.
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
-				      struct inode *inode, __u32 mask)
+				      struct inode *inode, __u32 mask,
+				      int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fcc2f064af83..fc06e4789392 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -157,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
 	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
 		if (test_mask & group->mask) {
-			if (!group->ops->should_send_event(group, to_tell, mask))
+			if (!group->ops->should_send_event(group, to_tell, mask, data_is))
 				continue;
 			if (!event) {
 				event = fsnotify_create_event(to_tell, mask, data,
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e27960cd76ab..fc7c4952e6a2 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -86,7 +86,8 @@ static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnot
 	inotify_ignored_and_remove_idr(entry, group);
 }
 
-static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
+static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
+				      __u32 mask, int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index e25284371020..61aed0c54fe9 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,7 +84,8 @@ struct fsnotify_event_private_data;
  *		valid group and inode to use to clean up.
  */
 struct fsnotify_ops {
-	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, __u32 mask);
+	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
+				  __u32 mask, int data_type);
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index a164600dd82e..b5417cd65216 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -919,7 +919,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark_entry *entry, struct fs
 	fsnotify_put_mark(entry);
 }
 
-static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
+static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
+				  __u32 mask, int data_type)
 {
 	return 0;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f8543a41115b..67d8f2f52874 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -505,7 +505,8 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 	}
 }
 
-static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
+static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
+					  __u32 mask, int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
-- 
cgit v1.2.3-59-g8ed1b


From 8112e2d6a7356e8c3ff1f7f3c86f375ed0305705 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:21 -0500
Subject: fsnotify: include data in should_send calls

fanotify is going to need to look at file->private_data to know if an event
should be sent or not.  This passes the data (which might be a file,
dentry, inode, or none) to the should_send function calls so fanotify can
get that information when available

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          | 2 +-
 fs/notify/fsnotify.c                 | 3 ++-
 fs/notify/inotify/inotify_fsnotify.c | 2 +-
 include/linux/fsnotify_backend.h     | 2 +-
 kernel/audit_tree.c                  | 2 +-
 kernel/audit_watch.c                 | 2 +-
 6 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 6f30f496e235..a213b83a59cf 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -134,7 +134,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
 				      struct inode *inode, __u32 mask,
-				      int data_type)
+				      void *data, int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fc06e4789392..523337b600a0 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -157,7 +157,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
 	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
 		if (test_mask & group->mask) {
-			if (!group->ops->should_send_event(group, to_tell, mask, data_is))
+			if (!group->ops->should_send_event(group, to_tell, mask,
+							   data, data_is))
 				continue;
 			if (!event) {
 				event = fsnotify_create_event(to_tell, mask, data,
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index fc7c4952e6a2..1f33234cc308 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -87,7 +87,7 @@ static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnot
 }
 
 static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
-				      __u32 mask, int data_type)
+				      __u32 mask, void *data, int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 61aed0c54fe9..2766df67f1ec 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -85,7 +85,7 @@ struct fsnotify_event_private_data;
  */
 struct fsnotify_ops {
 	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
-				  __u32 mask, int data_type);
+				  __u32 mask, void *data, int data_type);
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b5417cd65216..e3d63b596ef0 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -920,7 +920,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark_entry *entry, struct fs
 }
 
 static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
-				  __u32 mask, int data_type)
+				  __u32 mask, void *data, int data_type)
 {
 	return 0;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 67d8f2f52874..85c43aa292e0 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -506,7 +506,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 }
 
 static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
-					  __u32 mask, int data_type)
+					  __u32 mask, void *data, int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
-- 
cgit v1.2.3-59-g8ed1b


From 2a12a9d7814631e918dec93abad856e692d5286d Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:21 -0500
Subject: fsnotify: pass a file instead of an inode to open, read, and write

fanotify, the upcoming notification system actually needs a struct path so it can
do opens in the context of listeners, and it needs a file so it can get f_flags
from the original process.  Close was the only operation that already was passing
a struct file to the notification hook.  This patch passes a file for access,
modify, and open as well as they are easily available to these hooks.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/compat.c              |  5 ++---
 fs/exec.c                |  4 ++--
 fs/nfsd/vfs.c            |  4 ++--
 fs/open.c                |  2 +-
 fs/read_write.c          |  8 ++++----
 include/linux/fsnotify.h | 15 +++++++++------
 6 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/compat.c b/fs/compat.c
index 6490d2134ff3..ce02278b9c83 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1193,11 +1193,10 @@ out:
 	if (iov != iovstack)
 		kfree(iov);
 	if ((ret + (type == READ)) > 0) {
-		struct dentry *dentry = file->f_path.dentry;
 		if (type == READ)
-			fsnotify_access(dentry);
+			fsnotify_access(file);
 		else
-			fsnotify_modify(dentry);
+			fsnotify_modify(file);
 	}
 	return ret;
 }
diff --git a/fs/exec.c b/fs/exec.c
index e19de6a80339..f2de04a01a2a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -129,7 +129,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	fsnotify_open(file->f_path.dentry);
+	fsnotify_open(file);
 
 	error = -ENOEXEC;
 	if(file->f_op) {
@@ -683,7 +683,7 @@ struct file *open_exec(const char *name)
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	fsnotify_open(file->f_path.dentry);
+	fsnotify_open(file);
 
 	err = deny_write_access(file);
 	if (err)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3c111120b619..16114a8e79d4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -951,7 +951,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		nfsdstats.io_read += host_err;
 		*count = host_err;
 		err = 0;
-		fsnotify_access(file->f_path.dentry);
+		fsnotify_access(file);
 	} else 
 		err = nfserrno(host_err);
 out:
@@ -1062,7 +1062,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		goto out_nfserr;
 	*cnt = host_err;
 	nfsdstats.io_write += host_err;
-	fsnotify_modify(file->f_path.dentry);
+	fsnotify_modify(file);
 
 	/* clear setuid/setgid flag after write */
 	if (inode->i_mode & (S_ISUID | S_ISGID))
diff --git a/fs/open.c b/fs/open.c
index 94d54d3efa8b..bf082635e257 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -889,7 +889,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
 			} else {
-				fsnotify_open(f->f_path.dentry);
+				fsnotify_open(f);
 				fd_install(fd, f);
 			}
 		}
diff --git a/fs/read_write.c b/fs/read_write.c
index 9c0485236e68..74e36586e4d3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -311,7 +311,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 		else
 			ret = do_sync_read(file, buf, count, pos);
 		if (ret > 0) {
-			fsnotify_access(file->f_path.dentry);
+			fsnotify_access(file);
 			add_rchar(current, ret);
 		}
 		inc_syscr(current);
@@ -367,7 +367,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 		else
 			ret = do_sync_write(file, buf, count, pos);
 		if (ret > 0) {
-			fsnotify_modify(file->f_path.dentry);
+			fsnotify_modify(file);
 			add_wchar(current, ret);
 		}
 		inc_syscw(current);
@@ -675,9 +675,9 @@ out:
 		kfree(iov);
 	if ((ret + (type == READ)) > 0) {
 		if (type == READ)
-			fsnotify_access(file->f_path.dentry);
+			fsnotify_access(file);
 		else
-			fsnotify_modify(file->f_path.dentry);
+			fsnotify_modify(file);
 	}
 	return ret;
 }
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index f958e93feb97..845e57abfb86 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -153,8 +153,9 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 /*
  * fsnotify_access - file was read
  */
-static inline void fsnotify_access(struct dentry *dentry)
+static inline void fsnotify_access(struct file *file)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	__u32 mask = FS_ACCESS;
 
@@ -162,14 +163,15 @@ static inline void fsnotify_access(struct dentry *dentry)
 		mask |= FS_IN_ISDIR;
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
 /*
  * fsnotify_modify - file was modified
  */
-static inline void fsnotify_modify(struct dentry *dentry)
+static inline void fsnotify_modify(struct file *file)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	__u32 mask = FS_MODIFY;
 
@@ -177,14 +179,15 @@ static inline void fsnotify_modify(struct dentry *dentry)
 		mask |= FS_IN_ISDIR;
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
 /*
  * fsnotify_open - file was opened
  */
-static inline void fsnotify_open(struct dentry *dentry)
+static inline void fsnotify_open(struct file *file)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	__u32 mask = FS_OPEN;
 
@@ -192,7 +195,7 @@ static inline void fsnotify_open(struct dentry *dentry)
 		mask |= FS_IN_ISDIR;
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 28c60e37f874dcbb93c4afc839ba5e4911c4f4bc Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:21 -0500
Subject: fsnotify: send struct file when sending events to parents when
 possible

fanotify needs a path in order to open an fd to the object which changed.
Currently notifications to inode's parents are done using only the inode.
For some parental notification we have the entire file, send that so
fanotify can use it.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 13 ++++++++++---
 include/linux/fsnotify.h         | 40 +++++++++++++++++++++-------------------
 include/linux/fsnotify_backend.h |  4 ++--
 3 files changed, 33 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 523337b600a0..806beede24a3 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -78,13 +78,16 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
 	bool send = false;
 	bool should_update_children = false;
 
+	if (file)
+		dentry = file->f_path.dentry;
+
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 		return;
 
@@ -115,8 +118,12 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
 		 * specifies these are events which came from a child. */
 		mask |= FS_EVENT_ON_CHILD;
 
-		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-			 dentry->d_name.name, 0);
+		if (file)
+			fsnotify(p_inode, mask, file, FSNOTIFY_EVENT_FILE,
+				 dentry->d_name.name, 0);
+		else
+			fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+				 dentry->d_name.name, 0);
 		dput(parent);
 	}
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 845e57abfb86..04ea03ea8090 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -26,9 +26,14 @@ static inline void fsnotify_d_instantiate(struct dentry *entry,
 }
 
 /* Notify this dentry's parent about a child's events. */
-static inline void fsnotify_parent(struct dentry *dentry, __u32 mask)
+static inline void fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 {
-	__fsnotify_parent(dentry, mask);
+	BUG_ON(file && dentry);
+
+	if (file)
+		dentry = file->f_path.dentry;
+
+	__fsnotify_parent(file, dentry, mask);
 }
 
 /*
@@ -102,7 +107,7 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 	if (isdir)
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(dentry, mask);
+	fsnotify_parent(NULL, dentry, mask);
 }
 
 /*
@@ -155,14 +160,13 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
  */
 static inline void fsnotify_access(struct file *file)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	__u32 mask = FS_ACCESS;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(dentry, mask);
+	fsnotify_parent(file, NULL, mask);
 	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
@@ -171,14 +175,13 @@ static inline void fsnotify_access(struct file *file)
  */
 static inline void fsnotify_modify(struct file *file)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	__u32 mask = FS_MODIFY;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(dentry, mask);
+	fsnotify_parent(file, NULL, mask);
 	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
@@ -187,14 +190,13 @@ static inline void fsnotify_modify(struct file *file)
  */
 static inline void fsnotify_open(struct file *file)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(dentry, mask);
+	fsnotify_parent(file, NULL, mask);
 	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
@@ -203,15 +205,14 @@ static inline void fsnotify_open(struct file *file)
  */
 static inline void fsnotify_close(struct file *file)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	fmode_t mode = file->f_mode;
 	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(dentry, mask);
+	fsnotify_parent(file, NULL, mask);
 	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
@@ -226,7 +227,7 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(dentry, mask);
+	fsnotify_parent(NULL, dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
@@ -260,7 +261,8 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 	if (mask) {
 		if (S_ISDIR(inode->i_mode))
 			mask |= FS_IN_ISDIR;
-		fsnotify_parent(dentry, mask);
+
+		fsnotify_parent(NULL, dentry, mask);
 		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	}
 }
@@ -283,7 +285,7 @@ static inline void fsnotify_oldname_free(const char *old_name)
 	kfree(old_name);
 }
 
-#else	/* CONFIG_INOTIFY || CONFIG_FSNOTIFY */
+#else	/* CONFIG_FSNOTIFY */
 
 static inline const char *fsnotify_oldname_init(const char *name)
 {
@@ -294,6 +296,6 @@ static inline void fsnotify_oldname_free(const char *old_name)
 {
 }
 
-#endif	/* ! CONFIG_INOTIFY */
+#endif	/*  CONFIG_FSNOTIFY */
 
 #endif	/* _LINUX_FS_NOTIFY_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 2766df67f1ec..0e0c2b76b067 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -259,7 +259,7 @@ struct fsnotify_mark_entry {
 /* main fsnotify call to send events */
 extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		     const char *name, u32 cookie);
-extern void __fsnotify_parent(struct dentry *dentry, __u32 mask);
+extern void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern u32 fsnotify_get_cookie(void);
 
@@ -367,7 +367,7 @@ static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int d
 			    const char *name, u32 cookie)
 {}
 
-static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+static inline void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 {}
 
 static inline void __fsnotify_inode_delete(struct inode *inode)
-- 
cgit v1.2.3-59-g8ed1b


From 74766bbfa99adf8cb8119df6121851edba21c9d9 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:21 -0500
Subject: fsnotify: per group notification queue merge types

inotify only wishes to merge a new event with the last event on the
notification fifo.  fanotify is willing to merge any events including by
means of bitwise OR masks of multiple events together.  This patch moves
the inotify event merging logic out of the generic fsnotify notification.c
and into the inotify code.  This allows each use of fsnotify to provide
their own merge functionality.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_fsnotify.c | 56 ++++++++++++++++++++++++++-
 fs/notify/inotify/inotify_user.c     |  2 +-
 fs/notify/notification.c             | 73 ++++++++++--------------------------
 include/linux/fsnotify_backend.h     |  6 ++-
 4 files changed, 79 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1f33234cc308..0a0f5d0f0d0a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -32,6 +32,60 @@
 
 #include "inotify.h"
 
+/*
+ * Check if 2 events contain the same information.  We do not compare private data
+ * but at this moment that isn't a problem for any know fsnotify listeners.
+ */
+static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+{
+	if ((old->mask == new->mask) &&
+	    (old->to_tell == new->to_tell) &&
+	    (old->data_type == new->data_type) &&
+	    (old->name_len == new->name_len)) {
+		switch (old->data_type) {
+		case (FSNOTIFY_EVENT_INODE):
+			/* remember, after old was put on the wait_q we aren't
+			 * allowed to look at the inode any more, only thing
+			 * left to check was if the file_name is the same */
+			if (!old->name_len ||
+			    !strcmp(old->file_name, new->file_name))
+				return true;
+			break;
+		case (FSNOTIFY_EVENT_PATH):
+			if ((old->path.mnt == new->path.mnt) &&
+			    (old->path.dentry == new->path.dentry))
+				return true;
+			break;
+		case (FSNOTIFY_EVENT_NONE):
+			if (old->mask & FS_Q_OVERFLOW)
+				return true;
+			else if (old->mask & FS_IN_IGNORED)
+				return false;
+			return true;
+		};
+	}
+	return false;
+}
+
+static int inotify_merge(struct list_head *list, struct fsnotify_event *event)
+{
+	struct fsnotify_event_holder *last_holder;
+	struct fsnotify_event *last_event;
+	int ret = 0;
+
+	/* and the list better be locked by something too */
+	spin_lock(&event->lock);
+
+	last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
+	last_event = last_holder->event;
+	if (event_compare(last_event, event))
+		ret = -EEXIST;
+
+	spin_unlock(&event->lock);
+
+	return ret;
+}
+
 static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
 {
 	struct fsnotify_mark_entry *entry;
@@ -62,7 +116,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	fsn_event_priv->group = group;
 	event_priv->wd = wd;
 
-	ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
+	ret = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
 	if (ret) {
 		inotify_free_event_priv(fsn_event_priv);
 		/* EEXIST says we tail matched, EOVERFLOW isn't something
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f2b542479e91..cbe16df326f8 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -531,7 +531,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	fsn_event_priv->group = group;
 	event_priv->wd = ientry->wd;
 
-	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
+	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
 	if (ret)
 		inotify_free_event_priv(fsn_event_priv);
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b34ce7ad0409..6dc96b35e4a7 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -104,7 +104,8 @@ struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
 
 void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
 {
-	kmem_cache_free(fsnotify_event_holder_cachep, holder);
+	if (holder)
+		kmem_cache_free(fsnotify_event_holder_cachep, holder);
 }
 
 /*
@@ -128,54 +129,18 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
 	return priv;
 }
 
-/*
- * Check if 2 events contain the same information.  We do not compare private data
- * but at this moment that isn't a problem for any know fsnotify listeners.
- */
-static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
-{
-	if ((old->mask == new->mask) &&
-	    (old->to_tell == new->to_tell) &&
-	    (old->data_type == new->data_type) &&
-	    (old->name_len == new->name_len)) {
-		switch (old->data_type) {
-		case (FSNOTIFY_EVENT_INODE):
-			/* remember, after old was put on the wait_q we aren't
-			 * allowed to look at the inode any more, only thing
-			 * left to check was if the file_name is the same */
-			if (!old->name_len ||
-			    !strcmp(old->file_name, new->file_name))
-				return true;
-			break;
-		case (FSNOTIFY_EVENT_PATH):
-			if ((old->path.mnt == new->path.mnt) &&
-			    (old->path.dentry == new->path.dentry))
-				return true;
-			break;
-		case (FSNOTIFY_EVENT_NONE):
-			if (old->mask & FS_Q_OVERFLOW)
-				return true;
-			else if (old->mask & FS_IN_IGNORED)
-				return false;
-			return false;
-		};
-	}
-	return false;
-}
-
 /*
  * Add an event to the group notification queue.  The group can later pull this
  * event off the queue to deal with.  If the event is successfully added to the
  * group's notification queue, a reference is taken on event.
  */
 int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
-			      struct fsnotify_event_private_data *priv)
+			      struct fsnotify_event_private_data *priv,
+			      int (*merge)(struct list_head *, struct fsnotify_event *))
 {
 	struct fsnotify_event_holder *holder = NULL;
 	struct list_head *list = &group->notification_list;
-	struct fsnotify_event_holder *last_holder;
-	struct fsnotify_event *last_event;
-	int ret = 0;
+	int rc = 0;
 
 	/*
 	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -196,11 +161,23 @@ alloc_holder:
 
 	if (group->q_len >= group->max_events) {
 		event = q_overflow_event;
-		ret = -EOVERFLOW;
+		rc = -EOVERFLOW;
 		/* sorry, no private data on the overflow event */
 		priv = NULL;
 	}
 
+	if (!list_empty(list) && merge) {
+		int ret;
+
+		ret = merge(list, event);
+		if (ret) {
+			mutex_unlock(&group->notification_mutex);
+			if (holder != &event->holder)
+				fsnotify_destroy_event_holder(holder);
+			return ret;
+		}
+	}
+
 	spin_lock(&event->lock);
 
 	if (list_empty(&event->holder.event_list)) {
@@ -215,18 +192,6 @@ alloc_holder:
 		goto alloc_holder;
 	}
 
-	if (!list_empty(list)) {
-		last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
-		last_event = last_holder->event;
-		if (event_compare(last_event, event)) {
-			spin_unlock(&event->lock);
-			mutex_unlock(&group->notification_mutex);
-			if (holder != &event->holder)
-				fsnotify_destroy_event_holder(holder);
-			return -EEXIST;
-		}
-	}
-
 	group->q_len++;
 	holder->event = event;
 
@@ -238,7 +203,7 @@ alloc_holder:
 	mutex_unlock(&group->notification_mutex);
 
 	wake_up(&group->notification_waitq);
-	return ret;
+	return rc;
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0e0c2b76b067..25789d45fad8 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -328,8 +328,10 @@ extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struc
 									   struct fsnotify_event *event);
 
 /* attach the event to the group notification queue */
-extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
-				     struct fsnotify_event_private_data *priv);
+extern int fsnotify_add_notify_event(struct fsnotify_group *group,
+				     struct fsnotify_event *event,
+				     struct fsnotify_event_private_data *priv,
+				     int (*merge)(struct list_head *, struct fsnotify_event *));
 /* true if the group notification queue is empty */
 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
-- 
cgit v1.2.3-59-g8ed1b


From b4e4e1407312ae5a267ed7d716e6d4e7120a8430 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:21 -0500
Subject: fsnotify: clone existing events

fsnotify_clone_event will take an event, clone it, and return the cloned
event to the caller.  Since events may be in use by multiple fsnotify
groups simultaneously certain event entries (such as the mask) cannot be
changed after the event was created.  Since fanotify would like to merge
events happening on the same file it needs a new clean event to work with
so it can change any fields it wishes.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/notification.c         | 29 +++++++++++++++++++++++++----
 include/linux/fsnotify_backend.h |  3 +++
 2 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 6dc96b35e4a7..bc9470c7ece7 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -284,11 +284,33 @@ static void initialize_event(struct fsnotify_event *event)
 
 	spin_lock_init(&event->lock);
 
-	event->data_type = FSNOTIFY_EVENT_NONE;
-
 	INIT_LIST_HEAD(&event->private_data_list);
 }
 
+struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
+{
+	struct fsnotify_event *event;
+
+	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	if (!event)
+		return NULL;
+
+	memcpy(event, old_event, sizeof(*event));
+	initialize_event(event);
+
+	if (event->name_len) {
+		event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
+		if (!event->file_name) {
+			kmem_cache_free(fsnotify_event_cachep, event);
+			return NULL;
+		}
+	}
+	if (event->data_type == FSNOTIFY_EVENT_PATH)
+		path_get(&event->path);
+
+	return event;
+}
+
 /*
  * fsnotify_create_event - Allocate a new event which will be sent to each
  * group's handle_event function if the group was interested in this
@@ -324,6 +346,7 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 
 	event->sync_cookie = cookie;
 	event->to_tell = to_tell;
+	event->data_type = data_type;
 
 	switch (data_type) {
 	case FSNOTIFY_EVENT_FILE: {
@@ -340,12 +363,10 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		event->path.dentry = path->dentry;
 		event->path.mnt = path->mnt;
 		path_get(&event->path);
-		event->data_type = FSNOTIFY_EVENT_PATH;
 		break;
 	}
 	case FSNOTIFY_EVENT_INODE:
 		event->inode = data;
-		event->data_type = FSNOTIFY_EVENT_INODE;
 		break;
 	case FSNOTIFY_EVENT_NONE:
 		event->inode = NULL;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 25789d45fad8..3a7fff235539 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -363,6 +363,9 @@ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32
 						    void *data, int data_is, const char *name,
 						    u32 cookie, gfp_t gfp);
 
+/* fanotify likes to change events after they are on lists... */
+extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event);
+
 #else
 
 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
-- 
cgit v1.2.3-59-g8ed1b


From 1201a5361b9bd6512ae01e6f2b7aa79d458cafb1 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:22 -0500
Subject: fsnotify: replace an event on a list

fanotify would like to clone events already on its notification list, make
changes to the new event, and then replace the old event on the list with
the new event.  This patch implements the replace functionality of that
process.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/notification.c         | 56 ++++++++++++++++++++++++++++++++++++++++
 include/linux/fsnotify_backend.h |  2 ++
 2 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index bc9470c7ece7..b493c378445f 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -287,6 +287,62 @@ static void initialize_event(struct fsnotify_event *event)
 	INIT_LIST_HEAD(&event->private_data_list);
 }
 
+/*
+ * Caller damn well better be holding whatever mutex is protecting the
+ * old_holder->event_list.
+ */
+int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
+			   struct fsnotify_event *new_event)
+{
+	struct fsnotify_event *old_event = old_holder->event;
+	struct fsnotify_event_holder *new_holder = NULL;
+
+	/*
+	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
+	 * Check if we expect to be able to use that holder.  If not alloc a new
+	 * holder.
+	 * For the overflow event it's possible that something will use the in
+	 * event holder before we get the lock so we may need to jump back and
+	 * alloc a new holder, this can't happen for most events...
+	 */
+	if (!list_empty(&new_event->holder.event_list)) {
+alloc_holder:
+		new_holder = fsnotify_alloc_event_holder();
+		if (!new_holder)
+			return -ENOMEM;
+	}
+
+	spin_lock(&old_event->lock);
+	spin_lock(&new_event->lock);
+
+	if (list_empty(&new_event->holder.event_list)) {
+		if (unlikely(new_holder))
+			fsnotify_destroy_event_holder(new_holder);
+		new_holder = &new_event->holder;
+	} else if (unlikely(!new_holder)) {
+		/* between the time we checked above and got the lock the in
+		 * event holder was used, go back and get a new one */
+		spin_unlock(&new_event->lock);
+		spin_unlock(&old_event->lock);
+		goto alloc_holder;
+	}
+
+	new_holder->event = new_event;
+	list_replace_init(&old_holder->event_list, &new_holder->event_list);
+
+	spin_unlock(&new_event->lock);
+	spin_unlock(&old_event->lock);
+
+	/* event == holder means we are referenced through the in event holder */
+	if (old_holder != &old_event->holder)
+		fsnotify_destroy_event_holder(old_holder);
+
+	fsnotify_get_event(new_event); /* on the list take reference */
+	fsnotify_put_event(old_event); /* off the list, drop reference */
+
+	return 0;
+}
+
 struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
 {
 	struct fsnotify_event *event;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 3a7fff235539..427f6ffab127 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -365,6 +365,8 @@ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32
 
 /* fanotify likes to change events after they are on lists... */
 extern struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event);
+extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
+				  struct fsnotify_event *new_event);
 
 #else
 
-- 
cgit v1.2.3-59-g8ed1b


From 74be0cc82835aecad332a29896b0f212ba893403 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:22 -0500
Subject: fsnotify: remove group_num altogether

The original fsnotify interface has a group-num which was intended to be
able to find a group after it was added.  I no longer think this is a
necessary thing to do and so we remove the group_num.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c      |  3 +--
 fs/notify/group.c                | 48 ++--------------------------------------
 fs/notify/inotify/inotify_user.c | 11 +--------
 include/linux/fsnotify_backend.h | 10 +--------
 kernel/audit_tree.c              |  3 +--
 kernel/audit_watch.c             |  2 +-
 6 files changed, 7 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index a213b83a59cf..1f46aeac3387 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -433,8 +433,7 @@ static int __init dnotify_init(void)
 	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
 	dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
 
-	dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
-					      0, &dnotify_fsnotify_ops);
+	dnotify_group = fsnotify_obtain_group(0, &dnotify_fsnotify_ops);
 	if (IS_ERR(dnotify_group))
 		panic("unable to allocate fsnotify group for dnotify\n");
 	return 0;
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 777ca8212255..62fb8961a57b 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -77,15 +77,6 @@ void fsnotify_recalc_group_mask(struct fsnotify_group *group)
 		fsnotify_recalc_global_mask();
 }
 
-/*
- * Take a reference to a group so things found under the fsnotify_grp_mutex
- * can't get freed under us
- */
-static void fsnotify_get_group(struct fsnotify_group *group)
-{
-	atomic_inc(&group->refcnt);
-}
-
 /*
  * Final freeing of a group
  */
@@ -170,41 +161,15 @@ void fsnotify_put_group(struct fsnotify_group *group)
 	fsnotify_destroy_group(group);
 }
 
-/*
- * Simply run the fsnotify_groups list and find a group which matches
- * the given parameters.  If a group is found we take a reference to that
- * group.
- */
-static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
-						  const struct fsnotify_ops *ops)
-{
-	struct fsnotify_group *group_iter;
-	struct fsnotify_group *group = NULL;
-
-	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
-
-	list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
-		if (group_iter->group_num == group_num) {
-			if ((group_iter->mask == mask) &&
-			    (group_iter->ops == ops)) {
-				fsnotify_get_group(group_iter);
-				group = group_iter;
-			} else
-				group = ERR_PTR(-EEXIST);
-		}
-	}
-	return group;
-}
-
 /*
  * Either finds an existing group which matches the group_num, mask, and ops or
  * creates a new group and adds it to the global group list.  In either case we
  * take a reference for the group returned.
  */
-struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
+struct fsnotify_group *fsnotify_obtain_group(__u32 mask,
 					     const struct fsnotify_ops *ops)
 {
-	struct fsnotify_group *group, *tgroup;
+	struct fsnotify_group *group;
 
 	/* very low use, simpler locking if we just always alloc */
 	group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
@@ -214,7 +179,6 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
 	atomic_set(&group->refcnt, 1);
 
 	group->on_group_list = 0;
-	group->group_num = group_num;
 	group->mask = mask;
 
 	mutex_init(&group->notification_mutex);
@@ -230,14 +194,6 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
 	group->ops = ops;
 
 	mutex_lock(&fsnotify_grp_mutex);
-	tgroup = fsnotify_find_group(group_num, mask, ops);
-	if (tgroup) {
-		/* group already exists */
-		mutex_unlock(&fsnotify_grp_mutex);
-		/* destroy the new one we made */
-		fsnotify_put_group(group);
-		return tgroup;
-	}
 
 	/* group not found, add a new one */
 	list_add_rcu(&group->group_list, &fsnotify_groups);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cbe16df326f8..cae317f5bd9d 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -51,12 +51,6 @@ int inotify_max_user_watches __read_mostly;
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 struct kmem_cache *event_priv_cachep __read_mostly;
 
-/*
- * When inotify registers a new group it increments this and uses that
- * value as an offset to set the fsnotify group "name" and priority.
- */
-static atomic_t inotify_grp_num;
-
 #ifdef CONFIG_SYSCTL
 
 #include <linux/sysctl.h>
@@ -700,11 +694,8 @@ retry:
 static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
 {
 	struct fsnotify_group *group;
-	unsigned int grp_num;
 
-	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
-	grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
-	group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
+	group = fsnotify_obtain_group(0, &inotify_fsnotify_ops);
 	if (IS_ERR(group))
 		return group;
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 427f6ffab127..57e503d017c8 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -60,12 +60,6 @@
 
 #define FS_MOVE			(FS_MOVED_FROM | FS_MOVED_TO)
 
-/* listeners that hard code group numbers near the top */
-#define DNOTIFY_GROUP_NUM	UINT_MAX
-#define AUDIT_WATCH_GROUP_NUM	(DNOTIFY_GROUP_NUM-1)
-#define AUDIT_TREE_GROUP_NUM	(AUDIT_WATCH_GROUP_NUM-1)
-#define INOTIFY_GROUP_NUM	(AUDIT_TREE_GROUP_NUM-1)
-
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark_entry;
@@ -124,7 +118,6 @@ struct fsnotify_group {
 	 * closed.
 	 */
 	atomic_t refcnt;		/* things with interest in this group */
-	unsigned int group_num;		/* simply prevents accidental group collision */
 
 	const struct fsnotify_ops *ops;	/* how this group handles things */
 
@@ -312,8 +305,7 @@ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode
 /* must call when a group changes its ->mask */
 extern void fsnotify_recalc_global_mask(void);
 /* get a reference to an existing or create a new group */
-extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num,
-						    __u32 mask,
+extern struct fsnotify_group *fsnotify_obtain_group(__u32 mask,
 						    const struct fsnotify_ops *ops);
 /* run all marks associated with this group and update group->mask */
 extern void fsnotify_recalc_group_mask(struct fsnotify_group *group);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e3d63b596ef0..59065e72a2eb 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -937,8 +937,7 @@ static int __init audit_tree_init(void)
 {
 	int i;
 
-	audit_tree_group = fsnotify_obtain_group(AUDIT_TREE_GROUP_NUM,
-						 0, &audit_tree_ops);
+	audit_tree_group = fsnotify_obtain_group(0, &audit_tree_ops);
 	if (IS_ERR(audit_tree_group))
 		audit_panic("cannot initialize fsnotify group for rectree watches");
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 85c43aa292e0..c500104d38c2 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -577,7 +577,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = {
 
 static int __init audit_watch_init(void)
 {
-	audit_watch_group = fsnotify_obtain_group(AUDIT_WATCH_GROUP_NUM, AUDIT_FS_WATCH,
+	audit_watch_group = fsnotify_obtain_group(AUDIT_FS_WATCH,
 						  &audit_watch_fsnotify_ops);
 	if (IS_ERR(audit_watch_group)) {
 		audit_watch_group = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From ffab83402f01555a5fa32efb48a4dd0ce8d12ef5 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:22 -0500
Subject: fsnotify: fsnotify_obtain_group should be fsnotify_alloc_group

fsnotify_obtain_group was intended to be able to find an already existing
group.  Nothing uses that functionality.  This just renames it to
fsnotify_alloc_group so it is clear what it is doing.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c      |  2 +-
 fs/notify/group.c                | 10 +++-------
 fs/notify/inotify/inotify_user.c |  2 +-
 include/linux/fsnotify_backend.h |  4 ++--
 kernel/audit_tree.c              |  2 +-
 kernel/audit_watch.c             |  4 ++--
 6 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1f46aeac3387..51e4fe33d6bb 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -433,7 +433,7 @@ static int __init dnotify_init(void)
 	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
 	dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
 
-	dnotify_group = fsnotify_obtain_group(0, &dnotify_fsnotify_ops);
+	dnotify_group = fsnotify_alloc_group(0, &dnotify_fsnotify_ops);
 	if (IS_ERR(dnotify_group))
 		panic("unable to allocate fsnotify group for dnotify\n");
 	return 0;
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 934860e98095..1d20d26d5fee 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -162,16 +162,13 @@ void fsnotify_put_group(struct fsnotify_group *group)
 }
 
 /*
- * Either finds an existing group which matches the group_num, mask, and ops or
- * creates a new group and adds it to the global group list.  In either case we
- * take a reference for the group returned.
+ * Create a new fsnotify_group and hold a reference for the group returned.
  */
-struct fsnotify_group *fsnotify_obtain_group(__u32 mask,
-					     const struct fsnotify_ops *ops)
+struct fsnotify_group *fsnotify_alloc_group(__u32 mask,
+					    const struct fsnotify_ops *ops)
 {
 	struct fsnotify_group *group;
 
-	/* very low use, simpler locking if we just always alloc */
 	group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
 	if (!group)
 		return ERR_PTR(-ENOMEM);
@@ -192,7 +189,6 @@ struct fsnotify_group *fsnotify_obtain_group(__u32 mask,
 
 	mutex_lock(&fsnotify_grp_mutex);
 
-	/* group not found, add a new one */
 	list_add_rcu(&group->group_list, &fsnotify_groups);
 	group->on_group_list = 1;
 	/* being on the fsnotify_groups list holds one num_marks */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cae317f5bd9d..25a2854186e9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -695,7 +695,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
 {
 	struct fsnotify_group *group;
 
-	group = fsnotify_obtain_group(0, &inotify_fsnotify_ops);
+	group = fsnotify_alloc_group(0, &inotify_fsnotify_ops);
 	if (IS_ERR(group))
 		return group;
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 57e503d017c8..7d3c03e46862 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -305,11 +305,11 @@ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode
 /* must call when a group changes its ->mask */
 extern void fsnotify_recalc_global_mask(void);
 /* get a reference to an existing or create a new group */
-extern struct fsnotify_group *fsnotify_obtain_group(__u32 mask,
+extern struct fsnotify_group *fsnotify_alloc_group(__u32 mask,
 						    const struct fsnotify_ops *ops);
 /* run all marks associated with this group and update group->mask */
 extern void fsnotify_recalc_group_mask(struct fsnotify_group *group);
-/* drop reference on a group from fsnotify_obtain_group */
+/* drop reference on a group from fsnotify_alloc_group */
 extern void fsnotify_put_group(struct fsnotify_group *group);
 
 /* take a reference to an event */
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 59065e72a2eb..813274d4edad 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -937,7 +937,7 @@ static int __init audit_tree_init(void)
 {
 	int i;
 
-	audit_tree_group = fsnotify_obtain_group(0, &audit_tree_ops);
+	audit_tree_group = fsnotify_alloc_group(0, &audit_tree_ops);
 	if (IS_ERR(audit_tree_group))
 		audit_panic("cannot initialize fsnotify group for rectree watches");
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index c500104d38c2..0f03a6ab96ed 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -577,8 +577,8 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = {
 
 static int __init audit_watch_init(void)
 {
-	audit_watch_group = fsnotify_obtain_group(AUDIT_FS_WATCH,
-						  &audit_watch_fsnotify_ops);
+	audit_watch_group = fsnotify_alloc_group(AUDIT_FS_WATCH,
+						 &audit_watch_fsnotify_ops);
 	if (IS_ERR(audit_watch_group)) {
 		audit_watch_group = NULL;
 		audit_panic("cannot create audit fsnotify group");
-- 
cgit v1.2.3-59-g8ed1b


From 0d2e2a1d00d7d23e5bd9bb0935cde7c3d5835c56 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:22 -0500
Subject: fsnotify: drop mask argument from fsnotify_alloc_group

Nothing uses the mask argument to fsnotify_alloc_group.  This patch drops
that argument.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c      | 2 +-
 fs/notify/group.c                | 8 +-------
 fs/notify/inotify/inotify_user.c | 2 +-
 include/linux/fsnotify_backend.h | 3 +--
 kernel/audit_tree.c              | 2 +-
 kernel/audit_watch.c             | 2 +-
 6 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 51e4fe33d6bb..e0a847bd53be 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -433,7 +433,7 @@ static int __init dnotify_init(void)
 	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
 	dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
 
-	dnotify_group = fsnotify_alloc_group(0, &dnotify_fsnotify_ops);
+	dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
 	if (IS_ERR(dnotify_group))
 		panic("unable to allocate fsnotify group for dnotify\n");
 	return 0;
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 1d20d26d5fee..1657349c30a6 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -164,8 +164,7 @@ void fsnotify_put_group(struct fsnotify_group *group)
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
  */
-struct fsnotify_group *fsnotify_alloc_group(__u32 mask,
-					    const struct fsnotify_ops *ops)
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 {
 	struct fsnotify_group *group;
 
@@ -175,8 +174,6 @@ struct fsnotify_group *fsnotify_alloc_group(__u32 mask,
 
 	atomic_set(&group->refcnt, 1);
 
-	group->mask = mask;
-
 	mutex_init(&group->notification_mutex);
 	INIT_LIST_HEAD(&group->notification_list);
 	init_waitqueue_head(&group->notification_waitq);
@@ -196,8 +193,5 @@ struct fsnotify_group *fsnotify_alloc_group(__u32 mask,
 
 	mutex_unlock(&fsnotify_grp_mutex);
 
-	if (mask)
-		fsnotify_recalc_global_mask();
-
 	return group;
 }
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 25a2854186e9..a48d68a68b25 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -695,7 +695,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
 {
 	struct fsnotify_group *group;
 
-	group = fsnotify_alloc_group(0, &inotify_fsnotify_ops);
+	group = fsnotify_alloc_group(&inotify_fsnotify_ops);
 	if (IS_ERR(group))
 		return group;
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 7d3c03e46862..58326049ab29 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -305,8 +305,7 @@ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode
 /* must call when a group changes its ->mask */
 extern void fsnotify_recalc_global_mask(void);
 /* get a reference to an existing or create a new group */
-extern struct fsnotify_group *fsnotify_alloc_group(__u32 mask,
-						    const struct fsnotify_ops *ops);
+extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
 /* run all marks associated with this group and update group->mask */
 extern void fsnotify_recalc_group_mask(struct fsnotify_group *group);
 /* drop reference on a group from fsnotify_alloc_group */
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 813274d4edad..04f16887406b 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -937,7 +937,7 @@ static int __init audit_tree_init(void)
 {
 	int i;
 
-	audit_tree_group = fsnotify_alloc_group(0, &audit_tree_ops);
+	audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
 	if (IS_ERR(audit_tree_group))
 		audit_panic("cannot initialize fsnotify group for rectree watches");
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 87408b282118..83d5f9674cec 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -585,7 +585,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = {
 
 static int __init audit_watch_init(void)
 {
-	audit_watch_group = fsnotify_alloc_group(0, &audit_watch_fsnotify_ops);
+	audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
 	if (IS_ERR(audit_watch_group)) {
 		audit_watch_group = NULL;
 		audit_panic("cannot create audit fsnotify group");
-- 
cgit v1.2.3-59-g8ed1b


From 19c2a0e1a2f60112c158342ba5f568f72b741c2c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:23 -0500
Subject: fsnotify: rename fsnotify_groups to fsnotify_inode_groups

Simple renaming patch.  fsnotify is about to support mount point listeners
so I am renaming fsnotify_groups and fsnotify_mask to indicate these are lists
used only for groups which have watches on inodes.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             |  6 +++---
 fs/notify/fsnotify.h             |  8 ++++----
 fs/notify/group.c                | 30 +++++++++++++++++++-----------
 include/linux/fsnotify_backend.h |  6 +++---
 4 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 806beede24a3..23b5cfbeed50 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -148,10 +148,10 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
-	if (list_empty(&fsnotify_groups))
+	if (list_empty(&fsnotify_inode_groups))
 		return;
 
-	if (!(test_mask & fsnotify_mask))
+	if (!(test_mask & fsnotify_inode_mask))
 		return;
 
 	if (!(test_mask & to_tell->i_fsnotify_mask))
@@ -162,7 +162,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	 * anything other than walk the list so it's crazy to pre-allocate.
 	 */
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
-	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
+	list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list) {
 		if (test_mask & group->mask) {
 			if (!group->ops->should_send_event(group, to_tell, mask,
 							   data, data_is))
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 4dc240824b2d..ec5aee43bdc5 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -8,10 +8,10 @@
 
 /* protects reads of fsnotify_groups */
 extern struct srcu_struct fsnotify_grp_srcu;
-/* all groups which receive fsnotify events */
-extern struct list_head fsnotify_groups;
-/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
-extern __u32 fsnotify_mask;
+/* all groups which receive inode fsnotify events */
+extern struct list_head fsnotify_inode_groups;
+/* all bitwise OR of all event types (FS_*) for all fsnotify_inode_groups */
+extern __u32 fsnotify_inode_mask;
 
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 1657349c30a6..c80809745312 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -33,9 +33,9 @@ static DEFINE_MUTEX(fsnotify_grp_mutex);
 /* protects reads while running the fsnotify_groups list */
 struct srcu_struct fsnotify_grp_srcu;
 /* all groups registered to receive filesystem notifications */
-LIST_HEAD(fsnotify_groups);
+LIST_HEAD(fsnotify_inode_groups);
 /* bitwise OR of all events (FS_*) interesting to some group on this system */
-__u32 fsnotify_mask;
+__u32 fsnotify_inode_mask;
 
 /*
  * When a new group registers or changes it's set of interesting events
@@ -48,10 +48,10 @@ void fsnotify_recalc_global_mask(void)
 	int idx;
 
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
-	list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
+	list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list)
 		mask |= group->mask;
 	srcu_read_unlock(&fsnotify_grp_srcu, idx);
-	fsnotify_mask = mask;
+	fsnotify_inode_mask = mask;
 }
 
 /*
@@ -77,6 +77,17 @@ void fsnotify_recalc_group_mask(struct fsnotify_group *group)
 		fsnotify_recalc_global_mask();
 }
 
+static void fsnotify_add_group(struct fsnotify_group *group)
+{
+	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
+
+	group->on_inode_group_list = 1;
+	/* being on the fsnotify_groups list holds one num_marks */
+	atomic_inc(&group->num_marks);
+
+	list_add_tail_rcu(&group->inode_group_list, &fsnotify_inode_groups);
+}
+
 /*
  * Final freeing of a group
  */
@@ -118,9 +129,9 @@ static void __fsnotify_evict_group(struct fsnotify_group *group)
 {
 	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
 
-	if (group->on_group_list)
-		list_del_rcu(&group->group_list);
-	group->on_group_list = 0;
+	if (group->on_inode_group_list)
+		list_del_rcu(&group->inode_group_list);
+	group->on_inode_group_list = 0;
 }
 
 /*
@@ -186,10 +197,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 
 	mutex_lock(&fsnotify_grp_mutex);
 
-	list_add_rcu(&group->group_list, &fsnotify_groups);
-	group->on_group_list = 1;
-	/* being on the fsnotify_groups list holds one num_marks */
-	atomic_inc(&group->num_marks);
+	fsnotify_add_group(group);
 
 	mutex_unlock(&fsnotify_grp_mutex);
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 58326049ab29..21079ade5620 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -95,10 +95,10 @@ struct fsnotify_ops {
 struct fsnotify_group {
 	/*
 	 * global list of all groups receiving events from fsnotify.
-	 * anchored by fsnotify_groups and protected by either fsnotify_grp_mutex
+	 * anchored by fsnotify_inode_groups and protected by either fsnotify_grp_mutex
 	 * or fsnotify_grp_srcu depending on write vs read.
 	 */
-	struct list_head group_list;
+	struct list_head inode_group_list;
 
 	/*
 	 * Defines all of the event types in which this group is interested.
@@ -136,7 +136,7 @@ struct fsnotify_group {
 	struct list_head mark_entries;	/* all inode mark entries for this group */
 
 	/* prevents double list_del of group_list.  protected by global fsnotify_grp_mutex */
-	bool on_group_list;
+	bool on_inode_group_list;
 
 	/* groups can define private fields here or use the void *private */
 	union {
-- 
cgit v1.2.3-59-g8ed1b


From 7131485a93679ff9a543b74df280cfd119eb03ca Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:23 -0500
Subject: fsnotify: mount point listeners list and global mask

currently all of the notification systems implemented select which inodes
they care about and receive messages only about those inodes (or the
children of those inodes.)  This patch begins to flesh out fsnotify support
for the concept of listeners that want to hear notification for an inode
accessed below a given monut point.  This patch implements a second list
of fsnotify groups to hold these types of groups and a second global mask
to hold the events of interest for this type of group.

The reason we want a second group list and mask is because the inode based
notification should_send_event support which makes each group look for a mark
on the given inode.  With one nfsmount listener that means that every group would
have to take the inode->i_lock, look for their mark, not find one, and return
for every operation.   By seperating vfsmount from inode listeners only when
there is a inode listener will the inode groups have to look for their
mark and take the inode lock.  vfsmount listeners will have to grab the lock and
look for a mark but there should be fewer of them, and one vfsmount listener
won't cause the i_lock to be grabbed and released for every fsnotify group
on every io operation.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 92 ++++++++++++++++++++++++++++++----------
 fs/notify/fsnotify.h             |  6 +++
 fs/notify/group.c                | 33 ++++++++++++--
 fs/notify/inode_mark.c           |  7 +++
 include/linux/fsnotify_backend.h |  5 +++
 5 files changed, 117 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 23b5cfbeed50..a61aaa710825 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -21,6 +21,7 @@
 #include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/srcu.h>
 
 #include <linux/fsnotify_backend.h>
@@ -134,6 +135,45 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
+static void send_to_group(__u32 mask,
+			  struct fsnotify_group *group,
+			  void *data, int data_is, const char *file_name,
+			  u32 cookie, struct fsnotify_event **event,
+			  struct inode *to_tell)
+{
+	if (!group->ops->should_send_event(group, to_tell, mask,
+					   data, data_is))
+		return;
+	if (!*event) {
+		*event = fsnotify_create_event(to_tell, mask, data,
+						data_is, file_name,
+						cookie, GFP_KERNEL);
+		/*
+		 * shit, we OOM'd and now we can't tell, maybe
+		 * someday someone else will want to do something
+		 * here
+		 */
+		if (!*event)
+			return;
+	}
+	group->ops->handle_event(group, *event);
+}
+
+static bool needed_by_vfsmount(__u32 test_mask, void *data, int data_is)
+{
+	struct path *path;
+
+	if (data_is == FSNOTIFY_EVENT_PATH)
+		path = (struct path *)data;
+	else if (data_is == FSNOTIFY_EVENT_FILE)
+		path = &((struct file *)data)->f_path;
+	else
+		return false;
+
+	/* hook in this when mnt->mnt_fsnotify_mask is defined */
+	/* return (test_mask & path->mnt->mnt_fsnotify_mask); */
+	return false;
+}
 /*
  * This is the main call to fsnotify.  The VFS calls into hook specific functions
  * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
@@ -148,38 +188,46 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
-	if (list_empty(&fsnotify_inode_groups))
-		return;
+	/* if no fsnotify listeners, nothing to do */
+	if (list_empty(&fsnotify_inode_groups) &&
+	    list_empty(&fsnotify_vfsmount_groups))
+                return;
+ 
+	/* if none of the directed listeners or vfsmount listeners care */
+	if (!(test_mask & fsnotify_inode_mask) &&
+	    !(test_mask & fsnotify_vfsmount_mask))
+                return;
+ 
+	/* if this inode's directed listeners don't care and nothing on the vfsmount
+	 * listeners list cares, nothing to do */
+	if (!(test_mask & to_tell->i_fsnotify_mask) &&
+	    !needed_by_vfsmount(test_mask, data, data_is))
+                return;
 
-	if (!(test_mask & fsnotify_inode_mask))
-		return;
-
-	if (!(test_mask & to_tell->i_fsnotify_mask))
-		return;
 	/*
 	 * SRCU!!  the groups list is very very much read only and the path is
 	 * very hot.  The VAST majority of events are not going to need to do
 	 * anything other than walk the list so it's crazy to pre-allocate.
 	 */
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
-	list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list) {
-		if (test_mask & group->mask) {
-			if (!group->ops->should_send_event(group, to_tell, mask,
-							   data, data_is))
-				continue;
-			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data,
-							      data_is, file_name, cookie,
-							      GFP_KERNEL);
-				/* shit, we OOM'd and now we can't tell, maybe
-				 * someday someone else will want to do something
-				 * here */
-				if (!event)
-					break;
+
+	if (test_mask & to_tell->i_fsnotify_mask) {
+		list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list) {
+			if (test_mask & group->mask) {
+				send_to_group(mask, group, data, data_is,
+					      file_name, cookie, &event, to_tell);
 			}
-			group->ops->handle_event(group, event);
 		}
 	}
+	if (needed_by_vfsmount(test_mask, data, data_is)) {
+		list_for_each_entry_rcu(group, &fsnotify_vfsmount_groups, vfsmount_group_list) {
+			if (test_mask & group->mask) {
+				send_to_group(mask, group, data, data_is,
+					      file_name, cookie, &event, to_tell);
+			}
+		}
+	}
+
 	srcu_read_unlock(&fsnotify_grp_srcu, idx);
 	/*
 	 * fsnotify_create_event() took a reference so the event can't be cleaned
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 5bd22412017f..2ba59158969f 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -10,14 +10,20 @@
 extern struct srcu_struct fsnotify_grp_srcu;
 /* all groups which receive inode fsnotify events */
 extern struct list_head fsnotify_inode_groups;
+/* all groups which receive vfsmount fsnotify events */
+extern struct list_head fsnotify_vfsmount_groups;
 /* all bitwise OR of all event types (FS_*) for all fsnotify_inode_groups */
 extern __u32 fsnotify_inode_mask;
+/* all bitwise OR of all event types (FS_*) for all fsnotify_vfsmount_groups */
+extern __u32 fsnotify_vfsmount_mask;
 
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
 /* add a group to the inode group list */
 extern void fsnotify_add_inode_group(struct fsnotify_group *group);
+/* add a group to the vfsmount group list */
+extern void fsnotify_add_vfsmount_group(struct fsnotify_group *group);
 /* final kfree of a group */
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
 
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 34fccbd2809c..aa4654fe6ec2 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -32,10 +32,14 @@
 static DEFINE_MUTEX(fsnotify_grp_mutex);
 /* protects reads while running the fsnotify_groups list */
 struct srcu_struct fsnotify_grp_srcu;
-/* all groups registered to receive filesystem notifications */
+/* all groups registered to receive inode filesystem notifications */
 LIST_HEAD(fsnotify_inode_groups);
+/* all groups registered to receive mount point filesystem notifications */
+LIST_HEAD(fsnotify_vfsmount_groups);
 /* bitwise OR of all events (FS_*) interesting to some group on this system */
 __u32 fsnotify_inode_mask;
+/* bitwise OR of all events (FS_*) interesting to some group on this system */
+__u32 fsnotify_vfsmount_mask;
 
 /*
  * When a new group registers or changes it's set of interesting events
@@ -44,14 +48,20 @@ __u32 fsnotify_inode_mask;
 void fsnotify_recalc_global_mask(void)
 {
 	struct fsnotify_group *group;
-	__u32 mask = 0;
+	__u32 inode_mask = 0;
+	__u32 vfsmount_mask = 0;
 	int idx;
 
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
 	list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list)
-		mask |= group->mask;
+		inode_mask |= group->mask;
+	list_for_each_entry_rcu(group, &fsnotify_vfsmount_groups, vfsmount_group_list)
+		vfsmount_mask |= group->mask;
+		
 	srcu_read_unlock(&fsnotify_grp_srcu, idx);
-	fsnotify_inode_mask = mask;
+
+	fsnotify_inode_mask = inode_mask;
+	fsnotify_vfsmount_mask = vfsmount_mask;
 }
 
 /*
@@ -77,6 +87,17 @@ void fsnotify_recalc_group_mask(struct fsnotify_group *group)
 		fsnotify_recalc_global_mask();
 }
 
+void fsnotify_add_vfsmount_group(struct fsnotify_group *group)
+{
+	mutex_lock(&fsnotify_grp_mutex);
+
+	if (!group->on_vfsmount_group_list)
+		list_add_tail_rcu(&group->vfsmount_group_list, &fsnotify_vfsmount_groups);
+	group->on_vfsmount_group_list = 1;
+
+	mutex_unlock(&fsnotify_grp_mutex);
+}
+
 void fsnotify_add_inode_group(struct fsnotify_group *group)
 {
 	mutex_lock(&fsnotify_grp_mutex);
@@ -132,6 +153,9 @@ static void __fsnotify_evict_group(struct fsnotify_group *group)
 	if (group->on_inode_group_list)
 		list_del_rcu(&group->inode_group_list);
 	group->on_inode_group_list = 0;
+	if (group->on_vfsmount_group_list)
+		list_del_rcu(&group->vfsmount_group_list);
+	group->on_vfsmount_group_list = 0;
 }
 
 /*
@@ -197,6 +221,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 	group->max_events = UINT_MAX;
 
 	INIT_LIST_HEAD(&group->inode_group_list);
+	INIT_LIST_HEAD(&group->vfsmount_group_list);
 
 	spin_lock_init(&group->mark_lock);
 	INIT_LIST_HEAD(&group->mark_entries);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index a3230c485531..beffebb64627 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -328,6 +328,13 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 	 */
 	if (unlikely(list_empty(&group->inode_group_list)))
 		fsnotify_add_inode_group(group);
+	/*
+	 * XXX This is where we could also do the fsnotify_add_vfsmount_group
+	 * if we are setting and vfsmount mark....
+
+	if (unlikely(list_empty(&group->vfsmount_group_list)))
+		fsnotify_add_vfsmount_group(group);
+	 */
 
 	/*
 	 * LOCKING ORDER!!!!
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 21079ade5620..dea48bee057d 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -99,6 +99,10 @@ struct fsnotify_group {
 	 * or fsnotify_grp_srcu depending on write vs read.
 	 */
 	struct list_head inode_group_list;
+	/*
+	 * same as above except anchored by fsnotify_vfsmount_groups
+	 */
+	struct list_head vfsmount_group_list;
 
 	/*
 	 * Defines all of the event types in which this group is interested.
@@ -137,6 +141,7 @@ struct fsnotify_group {
 
 	/* prevents double list_del of group_list.  protected by global fsnotify_grp_mutex */
 	bool on_inode_group_list;
+	bool on_vfsmount_group_list;
 
 	/* groups can define private fields here or use the void *private */
 	union {
-- 
cgit v1.2.3-59-g8ed1b


From 3a9fb89f4cd04c23e16397befba92efb5d989b74 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:23 -0500
Subject: fsnotify: include vfsmount in should_send_event when appropriate

To ensure that a group will not duplicate events when it receives it based
on the vfsmount and the inode should_send_event test we should distinguish
those two cases.  We pass a vfsmount to this function so groups can make
their own determinations.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          |  4 ++--
 fs/notify/fsnotify.c                 | 39 ++++++++++++++++++------------------
 fs/notify/inotify/inotify_fsnotify.c |  3 ++-
 include/linux/fsnotify_backend.h     |  3 ++-
 kernel/audit_tree.c                  |  3 ++-
 kernel/audit_watch.c                 |  3 ++-
 6 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index e0a847bd53be..9eddafa4c7ba 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -133,8 +133,8 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  * userspace notification for that pair.
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
-				      struct inode *inode, __u32 mask,
-				      void *data, int data_type)
+				      struct inode *inode, struct vfsmount *mnt,
+				      __u32 mask, void *data, int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index a61aaa710825..78c440c343a8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -135,13 +135,12 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
-static void send_to_group(__u32 mask,
-			  struct fsnotify_group *group,
-			  void *data, int data_is, const char *file_name,
-			  u32 cookie, struct fsnotify_event **event,
-			  struct inode *to_tell)
+static void send_to_group(struct fsnotify_group *group, struct inode *to_tell,
+			  struct vfsmount *mnt, __u32 mask, void *data,
+			  int data_is, u32 cookie, const char *file_name,
+			  struct fsnotify_event **event)
 {
-	if (!group->ops->should_send_event(group, to_tell, mask,
+	if (!group->ops->should_send_event(group, to_tell, mnt, mask,
 					   data, data_is))
 		return;
 	if (!*event) {
@@ -159,15 +158,9 @@ static void send_to_group(__u32 mask,
 	group->ops->handle_event(group, *event);
 }
 
-static bool needed_by_vfsmount(__u32 test_mask, void *data, int data_is)
+static bool needed_by_vfsmount(__u32 test_mask, struct vfsmount *mnt)
 {
-	struct path *path;
-
-	if (data_is == FSNOTIFY_EVENT_PATH)
-		path = (struct path *)data;
-	else if (data_is == FSNOTIFY_EVENT_FILE)
-		path = &((struct file *)data)->f_path;
-	else
+	if (!mnt)
 		return false;
 
 	/* hook in this when mnt->mnt_fsnotify_mask is defined */
@@ -184,6 +177,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
+	struct vfsmount *mnt = NULL;
 	int idx;
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
@@ -198,10 +192,15 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	    !(test_mask & fsnotify_vfsmount_mask))
                 return;
  
+	if (data_is == FSNOTIFY_EVENT_PATH)
+		mnt = ((struct path *)data)->mnt;
+	else if (data_is == FSNOTIFY_EVENT_FILE)
+		mnt = ((struct file *)data)->f_path.mnt;
+
 	/* if this inode's directed listeners don't care and nothing on the vfsmount
 	 * listeners list cares, nothing to do */
 	if (!(test_mask & to_tell->i_fsnotify_mask) &&
-	    !needed_by_vfsmount(test_mask, data, data_is))
+	    !needed_by_vfsmount(test_mask, mnt))
                 return;
 
 	/*
@@ -214,16 +213,16 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	if (test_mask & to_tell->i_fsnotify_mask) {
 		list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list) {
 			if (test_mask & group->mask) {
-				send_to_group(mask, group, data, data_is,
-					      file_name, cookie, &event, to_tell);
+				send_to_group(group, to_tell, NULL, mask, data, data_is,
+					      cookie, file_name, &event);
 			}
 		}
 	}
-	if (needed_by_vfsmount(test_mask, data, data_is)) {
+	if (needed_by_vfsmount(test_mask, mnt)) {
 		list_for_each_entry_rcu(group, &fsnotify_vfsmount_groups, vfsmount_group_list) {
 			if (test_mask & group->mask) {
-				send_to_group(mask, group, data, data_is,
-					      file_name, cookie, &event, to_tell);
+				send_to_group(group, to_tell, mnt, mask, data, data_is,
+					      cookie, file_name, &event);
 			}
 		}
 	}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 0a0f5d0f0d0a..8075ae708ed4 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -141,7 +141,8 @@ static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnot
 }
 
 static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
-				      __u32 mask, void *data, int data_type)
+				      struct vfsmount *mnt, __u32 mask, void *data,
+				      int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index dea48bee057d..c2a04b7e4fca 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -79,7 +79,8 @@ struct fsnotify_event_private_data;
  */
 struct fsnotify_ops {
 	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
-				  __u32 mask, void *data, int data_type);
+				  struct vfsmount *mnt, __u32 mask, void *data,
+				  int data_type);
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 04f16887406b..ecf0bf260d09 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -920,7 +920,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark_entry *entry, struct fs
 }
 
 static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
-				  __u32 mask, void *data, int data_type)
+				  struct vfsmount *mnt, __u32 mask, void *data,
+				  int data_type)
 {
 	return 0;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 83d5f9674cec..6304ee5d7642 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -514,7 +514,8 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 }
 
 static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
-					  __u32 mask, void *data, int data_type)
+					  struct vfsmount *mnt, __u32 mask, void *data,
+					  int data_type)
 {
 	struct fsnotify_mark_entry *entry;
 	bool send;
-- 
cgit v1.2.3-59-g8ed1b


From 2823e04de4f1a49087b58ff2bb8f61361ffd9321 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:23 -0500
Subject: fsnotify: put inode specific fields in an fsnotify_mark in a union

The addition of marks on vfs mounts will be simplified if the inode
specific parts of a mark and the vfsmnt specific parts of a mark are
actually in a union so naming can be easy.  This patch just implements the
inode struct and the union.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          |  4 ++--
 fs/notify/inode_mark.c               | 28 ++++++++++++++--------------
 fs/notify/inotify/inotify_fsnotify.c |  2 +-
 fs/notify/inotify/inotify_user.c     | 10 +++++-----
 include/linux/fsnotify_backend.h     | 17 +++++++++++++----
 kernel/audit_tree.c                  | 16 ++++++++--------
 6 files changed, 43 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 9eddafa4c7ba..fc3a9dc567c5 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -70,8 +70,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
 	if (old_mask == new_mask)
 		return;
 
-	if (entry->inode)
-		fsnotify_recalc_inode_mask(entry->inode);
+	if (entry->i.inode)
+		fsnotify_recalc_inode_mask(entry->i.inode);
 }
 
 /*
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index beffebb64627..6731408c49f7 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -117,7 +117,7 @@ static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i.i_list)
 		new_mask |= entry->mask;
 	inode->i_fsnotify_mask = new_mask;
 }
@@ -148,7 +148,7 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	spin_lock(&entry->lock);
 
 	group = entry->group;
-	inode = entry->inode;
+	inode = entry->i.inode;
 
 	BUG_ON(group && !inode);
 	BUG_ON(!group && inode);
@@ -165,8 +165,8 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	spin_lock(&group->mark_lock);
 	spin_lock(&inode->i_lock);
 
-	hlist_del_init(&entry->i_list);
-	entry->inode = NULL;
+	hlist_del_init(&entry->i.i_list);
+	entry->i.inode = NULL;
 
 	list_del_init(&entry->g_list);
 	entry->group = NULL;
@@ -248,14 +248,14 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
 	LIST_HEAD(free_list);
 
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
-		list_add(&entry->free_i_list, &free_list);
-		hlist_del_init(&entry->i_list);
+	hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i.i_list) {
+		list_add(&entry->i.free_i_list, &free_list);
+		hlist_del_init(&entry->i.i_list);
 		fsnotify_get_mark(entry);
 	}
 	spin_unlock(&inode->i_lock);
 
-	list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
+	list_for_each_entry_safe(entry, lentry, &free_list, i.free_i_list) {
 		fsnotify_destroy_mark_by_entry(entry);
 		fsnotify_put_mark(entry);
 	}
@@ -273,7 +273,7 @@ struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *grou
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i.i_list) {
 		if (entry->group == group) {
 			fsnotify_get_mark(entry);
 			return entry;
@@ -285,7 +285,7 @@ struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *grou
 void fsnotify_duplicate_mark(struct fsnotify_mark_entry *new, struct fsnotify_mark_entry *old)
 {
 	assert_spin_locked(&old->lock);
-	new->inode = old->inode;
+	new->i.inode = old->i.inode;
 	new->group = old->group;
 	new->mask = old->mask;
 	new->free_mark = old->free_mark;
@@ -299,10 +299,10 @@ void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
 {
 	spin_lock_init(&entry->lock);
 	atomic_set(&entry->refcnt, 1);
-	INIT_HLIST_NODE(&entry->i_list);
+	INIT_HLIST_NODE(&entry->i.i_list);
 	entry->group = NULL;
 	entry->mask = 0;
-	entry->inode = NULL;
+	entry->i.inode = NULL;
 	entry->free_mark = free_mark;
 }
 
@@ -350,9 +350,9 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 		lentry = fsnotify_find_mark_entry(group, inode);
 	if (!lentry) {
 		entry->group = group;
-		entry->inode = inode;
+		entry->i.inode = inode;
 
-		hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
+		hlist_add_head(&entry->i.i_list, &inode->i_fsnotify_mark_entries);
 		list_add(&entry->g_list, &group->mark_entries);
 
 		fsnotify_get_mark(entry); /* for i_list and g_list */
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 8075ae708ed4..3edb51cfcfbe 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -193,7 +193,7 @@ static int idr_callback(int id, void *p, void *data)
 	 */
 	if (entry)
 		printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
-			entry->group, entry->inode, ientry->wd);
+			entry->group, entry->i.inode, ientry->wd);
 	return 0;
 }
 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a48d68a68b25..4b1587f9df3b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -445,7 +445,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
 	if (wd == -1) {
 		WARN_ONCE(1, "%s: ientry=%p ientry->wd=%d ientry->group=%p"
 			" ientry->inode=%p\n", __func__, ientry, ientry->wd,
-			ientry->fsn_entry.group, ientry->fsn_entry.inode);
+			ientry->fsn_entry.group, ientry->fsn_entry.i.inode);
 		goto out;
 	}
 
@@ -454,7 +454,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
 	if (unlikely(!found_ientry)) {
 		WARN_ONCE(1, "%s: ientry=%p ientry->wd=%d ientry->group=%p"
 			" ientry->inode=%p\n", __func__, ientry, ientry->wd,
-			ientry->fsn_entry.group, ientry->fsn_entry.inode);
+			ientry->fsn_entry.group, ientry->fsn_entry.i.inode);
 		goto out;
 	}
 
@@ -468,9 +468,9 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
 			"entry->inode=%p found_ientry=%p found_ientry->wd=%d "
 			"found_ientry->group=%p found_ientry->inode=%p\n",
 			__func__, ientry, ientry->wd, ientry->fsn_entry.group,
-			ientry->fsn_entry.inode, found_ientry, found_ientry->wd,
+			ientry->fsn_entry.i.inode, found_ientry, found_ientry->wd,
 			found_ientry->fsn_entry.group,
-			found_ientry->fsn_entry.inode);
+			found_ientry->fsn_entry.i.inode);
 		goto out;
 	}
 
@@ -482,7 +482,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
 	if (unlikely(atomic_read(&ientry->fsn_entry.refcnt) < 3)) {
 		printk(KERN_ERR "%s: ientry=%p ientry->wd=%d ientry->group=%p"
 			" ientry->inode=%p\n", __func__, ientry, ientry->wd,
-			ientry->fsn_entry.group, ientry->fsn_entry.inode);
+			ientry->fsn_entry.group, ientry->fsn_entry.i.inode);
 		/* we can't really recover with bad ref cnting.. */
 		BUG();
 	}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c2a04b7e4fca..dca7f2cbde90 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -226,6 +226,15 @@ struct fsnotify_event {
 	struct list_head private_data_list;	/* groups can store private data here */
 };
 
+/*
+ * Inode specific fields in an fsnotify_mark_entry
+ */
+struct fsnotify_inode_mark {
+	struct inode *inode;		/* inode this entry is associated with */
+	struct hlist_node i_list;	/* list of mark_entries by inode->i_fsnotify_mark_entries */
+	struct list_head free_i_list;	/* tmp list used when freeing this mark */
+};
+
 /*
  * a mark is simply an entry attached to an in core inode which allows an
  * fsnotify listener to indicate they are either no longer interested in events
@@ -241,12 +250,12 @@ struct fsnotify_mark_entry {
 	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
 	 * in kernel that found and may be using this mark. */
 	atomic_t refcnt;		/* active things looking at this mark */
-	struct inode *inode;		/* inode this entry is associated with */
 	struct fsnotify_group *group;	/* group this mark entry is for */
-	struct hlist_node i_list;	/* list of mark_entries by inode->i_fsnotify_mark_entries */
 	struct list_head g_list;	/* list of mark_entries by group->i_fsnotify_mark_entries */
-	spinlock_t lock;		/* protect group, inode, and killme */
-	struct list_head free_i_list;	/* tmp list used when freeing this mark */
+	spinlock_t lock;		/* protect group and inode */
+	union {
+		struct fsnotify_inode_mark i;
+	};
 	struct list_head free_g_list;	/* tmp list used when freeing this mark */
 	void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */
 };
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ecf0bf260d09..c21b05d25224 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -179,9 +179,9 @@ static void insert_hash(struct audit_chunk *chunk)
 	struct fsnotify_mark_entry *entry = &chunk->mark;
 	struct list_head *list;
 
-	if (!entry->inode)
+	if (!entry->i.inode)
 		return;
-	list = chunk_hash(entry->inode);
+	list = chunk_hash(entry->i.inode);
 	list_add_rcu(&chunk->hash, list);
 }
 
@@ -193,7 +193,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 
 	list_for_each_entry_rcu(p, list, hash) {
 		/* mark.inode may have gone NULL, but who cares? */
-		if (p->mark.inode == inode) {
+		if (p->mark.i.inode == inode) {
 			atomic_long_inc(&p->refs);
 			return p;
 		}
@@ -233,7 +233,7 @@ static void untag_chunk(struct node *p)
 	spin_unlock(&hash_lock);
 
 	spin_lock(&entry->lock);
-	if (chunk->dead || !entry->inode) {
+	if (chunk->dead || !entry->i.inode) {
 		spin_unlock(&entry->lock);
 		goto out;
 	}
@@ -259,7 +259,7 @@ static void untag_chunk(struct node *p)
 	if (!new)
 		goto Fallback;
 	fsnotify_duplicate_mark(&new->mark, entry);
-	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, 1)) {
+	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, 1)) {
 		free_chunk(new);
 		goto Fallback;
 	}
@@ -388,7 +388,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	chunk_entry = &chunk->mark;
 
 	spin_lock(&old_entry->lock);
-	if (!old_entry->inode) {
+	if (!old_entry->i.inode) {
 		/* old_entry is being shot, lets just lie */
 		spin_unlock(&old_entry->lock);
 		fsnotify_put_mark(old_entry);
@@ -397,7 +397,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	}
 
 	fsnotify_duplicate_mark(chunk_entry, old_entry);
-	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, 1)) {
+	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, 1)) {
 		spin_unlock(&old_entry->lock);
 		free_chunk(chunk);
 		fsnotify_put_mark(old_entry);
@@ -605,7 +605,7 @@ void audit_trim_trees(void)
 		list_for_each_entry(node, &tree->chunks, list) {
 			struct audit_chunk *chunk = find_chunk(node);
 			/* this could be NULL if the watch is dieing else where... */
-			struct inode *inode = chunk->mark.inode;
+			struct inode *inode = chunk->mark.i.inode;
 			node->index |= 1U<<31;
 			if (iterate_mounts(compare_root, inode, root_mnt))
 				node->index &= ~(1U<<31);
-- 
cgit v1.2.3-59-g8ed1b


From 4136510dd61a1ca151fc5b9d8c1ebd5a8ce2e8f4 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:23 -0500
Subject: fsnotify: add vfsmount specific fields to the fsnotify_mark_entry
 union

vfsmount marks need mostly the same data as inode specific fields, but for
consistency and understandability we put that data in a vfsmount specific
struct inside a union with inode specific data.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/fsnotify_backend.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index dca7f2cbde90..0c0fd4ee2840 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -235,6 +235,15 @@ struct fsnotify_inode_mark {
 	struct list_head free_i_list;	/* tmp list used when freeing this mark */
 };
 
+/*
+ * Mount point specific fields in an fsnotify_mark_entry
+ */
+struct fsnotify_vfsmount_mark {
+	struct vfsmount *mnt;		/* inode this entry is associated with */
+	struct hlist_node m_list;	/* list of mark_entries by inode->i_fsnotify_mark_entries */
+	struct list_head free_m_list;	/* tmp list used when freeing this mark */
+};
+
 /*
  * a mark is simply an entry attached to an in core inode which allows an
  * fsnotify listener to indicate they are either no longer interested in events
@@ -255,6 +264,7 @@ struct fsnotify_mark_entry {
 	spinlock_t lock;		/* protect group and inode */
 	union {
 		struct fsnotify_inode_mark i;
+		struct fsnotify_vfsmount_mark m;
 	};
 	struct list_head free_g_list;	/* tmp list used when freeing this mark */
 	void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */
-- 
cgit v1.2.3-59-g8ed1b


From 098cf2fc77ee190c92bf9d08d69a13305f2487ec Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:24 -0500
Subject: fsnotify: add flags to fsnotify_mark_entries

To differentiate between inode and vfsmount (or other future) types of
marks we add a flags field and set the inode bit on inode marks (the only
currently supported type of mark)

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inode_mark.c           | 2 ++
 include/linux/fsnotify_backend.h | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 6731408c49f7..b00065842b3e 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -322,6 +322,8 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 	if (unlikely(!inode))
 		return -EINVAL;
 
+	entry->flags = FSNOTIFY_MARK_FLAG_INODE;
+
 	/*
 	 * if this group isn't being testing for inode type events we need
 	 * to start testing
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0c0fd4ee2840..cf165857199b 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -267,6 +267,9 @@ struct fsnotify_mark_entry {
 		struct fsnotify_vfsmount_mark m;
 	};
 	struct list_head free_g_list;	/* tmp list used when freeing this mark */
+#define FSNOTIFY_MARK_FLAG_INODE	0x01
+#define FSNOTIFY_MARK_FLAG_VFSMOUNT	0x02
+	unsigned int flags;		/* vfsmount or inode mark? */
 	void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 72acc854427948efed7a83da27f7dc3239ac9afc Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Thu, 17 Dec 2009 21:24:24 -0500
Subject: fsnotify: kill FSNOTIFY_EVENT_FILE

Some fsnotify operations send a struct file.  This is more information than
we technically need.  We instead send a struct path in all cases instead of
sometimes a path and sometimes a file.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 12 +++++-------
 fs/notify/notification.c         |  9 ---------
 include/linux/fsnotify.h         | 36 +++++++++++++++++++-----------------
 include/linux/fsnotify_backend.h |  5 ++---
 4 files changed, 26 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 78c440c343a8..60e84fd338dd 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -79,15 +79,15 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
+void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
 	bool send = false;
 	bool should_update_children = false;
 
-	if (file)
-		dentry = file->f_path.dentry;
+	if (!dentry)
+		dentry = path->dentry;
 
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 		return;
@@ -119,8 +119,8 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 		 * specifies these are events which came from a child. */
 		mask |= FS_EVENT_ON_CHILD;
 
-		if (file)
-			fsnotify(p_inode, mask, file, FSNOTIFY_EVENT_FILE,
+		if (path)
+			fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
 				 dentry->d_name.name, 0);
 		else
 			fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
@@ -194,8 +194,6 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
  
 	if (data_is == FSNOTIFY_EVENT_PATH)
 		mnt = ((struct path *)data)->mnt;
-	else if (data_is == FSNOTIFY_EVENT_FILE)
-		mnt = ((struct file *)data)->f_path.mnt;
 
 	/* if this inode's directed listeners don't care and nothing on the vfsmount
 	 * listeners list cares, nothing to do */
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index dafd0b7687b8..066f1f988bac 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -390,15 +390,6 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 	event->data_type = data_type;
 
 	switch (data_type) {
-	case FSNOTIFY_EVENT_FILE: {
-		struct file *file = data;
-		struct path *path = &file->f_path;
-		event->path.dentry = path->dentry;
-		event->path.mnt = path->mnt;
-		path_get(&event->path);
-		event->data_type = FSNOTIFY_EVENT_PATH;
-		break;
-	}
 	case FSNOTIFY_EVENT_PATH: {
 		struct path *path = data;
 		event->path.dentry = path->dentry;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 04ea03ea8090..06d296d85ebf 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -26,14 +26,12 @@ static inline void fsnotify_d_instantiate(struct dentry *entry,
 }
 
 /* Notify this dentry's parent about a child's events. */
-static inline void fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
+static inline void fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
-	BUG_ON(file && dentry);
+	if (!dentry)
+		dentry = path->dentry;
 
-	if (file)
-		dentry = file->f_path.dentry;
-
-	__fsnotify_parent(file, dentry, mask);
+	__fsnotify_parent(path, dentry, mask);
 }
 
 /*
@@ -160,14 +158,15 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
  */
 static inline void fsnotify_access(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct path *path = &file->f_path;
+	struct inode *inode = path->dentry->d_inode;
 	__u32 mask = FS_ACCESS;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(file, NULL, mask);
-	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+	fsnotify_parent(path, NULL, mask);
+	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
 
 /*
@@ -175,14 +174,15 @@ static inline void fsnotify_access(struct file *file)
  */
 static inline void fsnotify_modify(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct path *path = &file->f_path;
+	struct inode *inode = path->dentry->d_inode;
 	__u32 mask = FS_MODIFY;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(file, NULL, mask);
-	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+	fsnotify_parent(path, NULL, mask);
+	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
 
 /*
@@ -190,14 +190,15 @@ static inline void fsnotify_modify(struct file *file)
  */
 static inline void fsnotify_open(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct path *path = &file->f_path;
+	struct inode *inode = path->dentry->d_inode;
 	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(file, NULL, mask);
-	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+	fsnotify_parent(path, NULL, mask);
+	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
 
 /*
@@ -205,6 +206,7 @@ static inline void fsnotify_open(struct file *file)
  */
 static inline void fsnotify_close(struct file *file)
 {
+	struct path *path = &file->f_path;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	fmode_t mode = file->f_mode;
 	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
@@ -212,8 +214,8 @@ static inline void fsnotify_close(struct file *file)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(file, NULL, mask);
-	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+	fsnotify_parent(path, NULL, mask);
+	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index cf165857199b..7a6ba755acc3 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -214,7 +214,6 @@ struct fsnotify_event {
 #define FSNOTIFY_EVENT_NONE	0
 #define FSNOTIFY_EVENT_PATH	1
 #define FSNOTIFY_EVENT_INODE	2
-#define FSNOTIFY_EVENT_FILE	3
 	int data_type;		/* which of the above union we have */
 	atomic_t refcnt;	/* how many groups still are using/need to send this event */
 	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
@@ -280,7 +279,7 @@ struct fsnotify_mark_entry {
 /* main fsnotify call to send events */
 extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		     const char *name, u32 cookie);
-extern void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask);
+extern void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern u32 fsnotify_get_cookie(void);
 
@@ -393,7 +392,7 @@ static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int d
 			    const char *name, u32 cookie)
 {}
 
-static inline void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
+static inline void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {}
 
 static inline void __fsnotify_inode_delete(struct inode *inode)
-- 
cgit v1.2.3-59-g8ed1b


From e61ce86737b4d60521e4e71f9892fe4bdcfb688b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:24 -0500
Subject: fsnotify: rename fsnotify_mark_entry to just fsnotify_mark

The name is long and it serves no real purpose.  So rename
fsnotify_mark_entry to just fsnotify_mark.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/inode.c                           |  2 +-
 fs/notify/dnotify/dnotify.c          | 24 +++++++++---------
 fs/notify/group.c                    |  8 +++---
 fs/notify/inode_mark.c               | 48 ++++++++++++++++++------------------
 fs/notify/inotify/inotify.h          |  6 ++---
 fs/notify/inotify/inotify_fsnotify.c |  8 +++---
 fs/notify/inotify/inotify_user.c     |  8 +++---
 include/linux/fs.h                   |  2 +-
 include/linux/fsnotify.h             | 18 +++++++-------
 include/linux/fsnotify_backend.h     | 38 ++++++++++++++--------------
 kernel/audit_tree.c                  | 14 +++++------
 kernel/audit_watch.c                 |  8 +++---
 kernel/auditsc.c                     |  4 +--
 13 files changed, 94 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 8e1bee998796..a2da778467bb 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -264,7 +264,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
-	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
 #endif
 }
 EXPORT_SYMBOL(inode_init_once);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index fc3a9dc567c5..e6edae60894d 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -34,12 +34,12 @@ static struct fsnotify_group *dnotify_group __read_mostly;
 static DEFINE_MUTEX(dnotify_mark_mutex);
 
 /*
- * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
+ * dnotify will attach one of these to each inode (i_fsnotify_marks) which
  * is being watched by dnotify.  If multiple userspace applications are watching
  * the same directory with dnotify their information is chained in dn
  */
 struct dnotify_mark_entry {
-	struct fsnotify_mark_entry fsn_entry;
+	struct fsnotify_mark fsn_entry;
 	struct dnotify_struct *dn;
 };
 
@@ -51,7 +51,7 @@ struct dnotify_mark_entry {
  * it calls the fsnotify function so it can update the set of all events relevant
  * to this inode.
  */
-static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
+static void dnotify_recalc_inode_mask(struct fsnotify_mark *entry)
 {
 	__u32 new_mask, old_mask;
 	struct dnotify_struct *dn;
@@ -85,7 +85,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
 static int dnotify_handle_event(struct fsnotify_group *group,
 				struct fsnotify_event *event)
 {
-	struct fsnotify_mark_entry *entry = NULL;
+	struct fsnotify_mark *entry = NULL;
 	struct dnotify_mark_entry *dnentry;
 	struct inode *to_tell;
 	struct dnotify_struct *dn;
@@ -136,7 +136,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 				      struct inode *inode, struct vfsmount *mnt,
 				      __u32 mask, void *data, int data_type)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	bool send;
 
 	/* !dir_notify_enable should never get here, don't waste time checking
@@ -163,7 +163,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	return send;
 }
 
-static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
+static void dnotify_free_mark(struct fsnotify_mark *entry)
 {
 	struct dnotify_mark_entry *dnentry = container_of(entry,
 							  struct dnotify_mark_entry,
@@ -184,14 +184,14 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
 
 /*
  * Called every time a file is closed.  Looks first for a dnotify mark on the
- * inode.  If one is found run all of the ->dn entries attached to that
+ * inode.  If one is found run all of the ->dn structures attached to that
  * mark for one relevant to this process closing the file and remove that
  * dnotify_struct.  If that was the last dnotify_struct also remove the
- * fsnotify_mark_entry.
+ * fsnotify_mark.
  */
 void dnotify_flush(struct file *filp, fl_owner_t id)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	struct dnotify_mark_entry *dnentry;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
@@ -260,7 +260,7 @@ static __u32 convert_arg(unsigned long arg)
 
 /*
  * If multiple processes watch the same inode with dnotify there is only one
- * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
+ * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
  * onto that mark.  This function either attaches the new dnotify_struct onto
  * that list, or it |= the mask onto an existing dnofiy_struct.
  */
@@ -298,7 +298,7 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
 int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 {
 	struct dnotify_mark_entry *new_dnentry, *dnentry;
-	struct fsnotify_mark_entry *new_entry, *entry;
+	struct fsnotify_mark *new_entry, *entry;
 	struct dnotify_struct *dn;
 	struct inode *inode;
 	fl_owner_t id = current->files;
@@ -378,7 +378,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
 	 * the dnotify_mark_mutex and entry->lock.  Since closing the fd is the
-	 * only time we clean up the mark entries we need to get our mark off
+	 * only time we clean up the marks we need to get our mark off
 	 * the list. */
 	if (f != filp) {
 		/* if we added ourselves, shoot ourselves, it's possible that
diff --git a/fs/notify/group.c b/fs/notify/group.c
index aa4654fe6ec2..b70e7d21dfde 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -74,10 +74,10 @@ void fsnotify_recalc_group_mask(struct fsnotify_group *group)
 {
 	__u32 mask = 0;
 	__u32 old_mask = group->mask;
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 
 	spin_lock(&group->mark_lock);
-	list_for_each_entry(entry, &group->mark_entries, g_list)
+	list_for_each_entry(entry, &group->marks_list, g_list)
 		mask |= entry->mask;
 	spin_unlock(&group->mark_lock);
 
@@ -133,7 +133,7 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
  */
 static void fsnotify_destroy_group(struct fsnotify_group *group)
 {
-	/* clear all inode mark entries for this group */
+	/* clear all inode marks for this group */
 	fsnotify_clear_marks_by_group(group);
 
 	/* past the point of no return, matches the initial value of 1 */
@@ -224,7 +224,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 	INIT_LIST_HEAD(&group->vfsmount_group_list);
 
 	spin_lock_init(&group->mark_lock);
-	INIT_LIST_HEAD(&group->mark_entries);
+	INIT_LIST_HEAD(&group->marks_list);
 
 	group->ops = ops;
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index b00065842b3e..7e69f6b08d4e 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -38,12 +38,12 @@
  * that lock to dereference either of these things (they could be NULL even with
  * the lock)
  *
- * group->mark_lock protects the mark_entries list anchored inside a given group
+ * group->mark_lock protects the marks_list anchored inside a given group
  * and each entry is hooked via the g_list.  It also sorta protects the
  * free_g_list, which when used is anchored by a private list on the stack of the
  * task which held the group->mark_lock.
  *
- * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
+ * inode->i_lock protects the i_fsnotify_marks list anchored inside a
  * given inode and each entry is hooked via the i_list. (and sorta the
  * free_i_list)
  *
@@ -61,7 +61,7 @@
  *   need to be cleaned up. (fsnotify_clear_marks_by_group)
  *
  * Worst case we are given an inode and need to clean up all the marks on that
- * inode.  We take i_lock and walk the i_fsnotify_mark_entries safely.  For each
+ * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
  * mark on the list we take a reference (so the mark can't disappear under us).
  * We remove that mark form the inode's list of marks and we add this mark to a
  * private list anchored on the stack using i_free_list;  At this point we no
@@ -95,12 +95,12 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
-void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
+void fsnotify_get_mark(struct fsnotify_mark *entry)
 {
 	atomic_inc(&entry->refcnt);
 }
 
-void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
+void fsnotify_put_mark(struct fsnotify_mark *entry)
 {
 	if (atomic_dec_and_test(&entry->refcnt))
 		entry->free_mark(entry);
@@ -111,13 +111,13 @@ void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
  */
 static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	struct hlist_node *pos;
 	__u32 new_mask = 0;
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i.i_list)
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_marks, i.i_list)
 		new_mask |= entry->mask;
 	inode->i_fsnotify_mask = new_mask;
 }
@@ -140,7 +140,7 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
  * The caller had better be holding a reference to this mark so we don't actually
  * do the final put under the entry->lock
  */
-void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
+void fsnotify_destroy_mark_by_entry(struct fsnotify_mark *entry)
 {
 	struct fsnotify_group *group;
 	struct inode *inode;
@@ -174,7 +174,7 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	fsnotify_put_mark(entry); /* for i_list and g_list */
 
 	/*
-	 * this mark is now off the inode->i_fsnotify_mark_entries list and we
+	 * this mark is now off the inode->i_fsnotify_marks list and we
 	 * hold the inode->i_lock, so this is the perfect time to update the
 	 * inode->i_fsnotify_mask
 	 */
@@ -221,11 +221,11 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
  */
 void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
 {
-	struct fsnotify_mark_entry *lentry, *entry;
+	struct fsnotify_mark *lentry, *entry;
 	LIST_HEAD(free_list);
 
 	spin_lock(&group->mark_lock);
-	list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
+	list_for_each_entry_safe(entry, lentry, &group->marks_list, g_list) {
 		list_add(&entry->free_g_list, &free_list);
 		list_del_init(&entry->g_list);
 		fsnotify_get_mark(entry);
@@ -243,12 +243,12 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
  */
 void fsnotify_clear_marks_by_inode(struct inode *inode)
 {
-	struct fsnotify_mark_entry *entry, *lentry;
+	struct fsnotify_mark *entry, *lentry;
 	struct hlist_node *pos, *n;
 	LIST_HEAD(free_list);
 
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i.i_list) {
+	hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_marks, i.i_list) {
 		list_add(&entry->i.free_i_list, &free_list);
 		hlist_del_init(&entry->i.i_list);
 		fsnotify_get_mark(entry);
@@ -265,15 +265,15 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
  * given a group and inode, find the mark associated with that combination.
  * if found take a reference to that mark and return it, else return NULL
  */
-struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
-						     struct inode *inode)
+struct fsnotify_mark *fsnotify_find_mark_entry(struct fsnotify_group *group,
+					       struct inode *inode)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	struct hlist_node *pos;
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i.i_list) {
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_marks, i.i_list) {
 		if (entry->group == group) {
 			fsnotify_get_mark(entry);
 			return entry;
@@ -282,7 +282,7 @@ struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *grou
 	return NULL;
 }
 
-void fsnotify_duplicate_mark(struct fsnotify_mark_entry *new, struct fsnotify_mark_entry *old)
+void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
 {
 	assert_spin_locked(&old->lock);
 	new->i.inode = old->i.inode;
@@ -294,8 +294,8 @@ void fsnotify_duplicate_mark(struct fsnotify_mark_entry *new, struct fsnotify_ma
 /*
  * Nothing fancy, just initialize lists and locks and counters.
  */
-void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
-			void (*free_mark)(struct fsnotify_mark_entry *entry))
+void fsnotify_init_mark(struct fsnotify_mark *entry,
+			void (*free_mark)(struct fsnotify_mark *entry))
 {
 	spin_lock_init(&entry->lock);
 	atomic_set(&entry->refcnt, 1);
@@ -311,11 +311,11 @@ void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
  * These marks may be used for the fsnotify backend to determine which
  * event types should be delivered to which group and for which inodes.
  */
-int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
+int fsnotify_add_mark(struct fsnotify_mark *entry,
 		      struct fsnotify_group *group, struct inode *inode,
 		      int allow_dups)
 {
-	struct fsnotify_mark_entry *lentry = NULL;
+	struct fsnotify_mark *lentry = NULL;
 	int ret = 0;
 
 	inode = igrab(inode);
@@ -354,8 +354,8 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 		entry->group = group;
 		entry->i.inode = inode;
 
-		hlist_add_head(&entry->i.i_list, &inode->i_fsnotify_mark_entries);
-		list_add(&entry->g_list, &group->mark_entries);
+		hlist_add_head(&entry->i.i_list, &inode->i_fsnotify_marks);
+		list_add(&entry->g_list, &group->marks_list);
 
 		fsnotify_get_mark(entry); /* for i_list and g_list */
 
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index f234f3a4c8ca..07be6df2428f 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -10,12 +10,12 @@ struct inotify_event_private_data {
 };
 
 struct inotify_inode_mark_entry {
-	/* fsnotify_mark_entry MUST be the first thing */
-	struct fsnotify_mark_entry fsn_entry;
+	/* fsnotify_mark MUST be the first thing */
+	struct fsnotify_mark fsn_entry;
 	int wd;
 };
 
-extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *entry,
 					   struct fsnotify_group *group);
 extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
 
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 3edb51cfcfbe..f33a9bd32e5d 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -88,7 +88,7 @@ static int inotify_merge(struct list_head *list, struct fsnotify_event *event)
 
 static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	struct inotify_inode_mark_entry *ientry;
 	struct inode *to_tell;
 	struct inotify_event_private_data *event_priv;
@@ -135,7 +135,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	return ret;
 }
 
-static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+static void inotify_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
 {
 	inotify_ignored_and_remove_idr(entry, group);
 }
@@ -144,7 +144,7 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 				      struct vfsmount *mnt, __u32 mask, void *data,
 				      int data_type)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	bool send;
 
 	spin_lock(&inode->i_lock);
@@ -171,7 +171,7 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
  */
 static int idr_callback(int id, void *p, void *data)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	struct inotify_inode_mark_entry *ientry;
 	static bool warned = false;
 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4b1587f9df3b..7be5dcf07ac7 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -386,7 +386,7 @@ static struct inotify_inode_mark_entry *inotify_idr_find_locked(struct fsnotify_
 
 	ientry = idr_find(idr, wd);
 	if (ientry) {
-		struct fsnotify_mark_entry *fsn_entry = &ientry->fsn_entry;
+		struct fsnotify_mark *fsn_entry = &ientry->fsn_entry;
 
 		fsnotify_get_mark(fsn_entry);
 		/* One ref for being in the idr, one ref we just took */
@@ -499,7 +499,7 @@ out:
 /*
  * Send IN_IGNORED for this wd, remove this wd from the idr.
  */
-void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+void inotify_ignored_and_remove_idr(struct fsnotify_mark *entry,
 				    struct fsnotify_group *group)
 {
 	struct inotify_inode_mark_entry *ientry;
@@ -541,7 +541,7 @@ skip_send_ignore:
 }
 
 /* ding dong the mark is dead */
-static void inotify_free_mark(struct fsnotify_mark_entry *entry)
+static void inotify_free_mark(struct fsnotify_mark *entry)
 {
 	struct inotify_inode_mark_entry *ientry;
 
@@ -554,7 +554,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 					 struct inode *inode,
 					 u32 arg)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	struct inotify_inode_mark_entry *ientry;
 	__u32 old_mask, new_mask;
 	__u32 mask;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e5598d2f99b9..85fe89c43487 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -768,7 +768,7 @@ struct inode {
 
 #ifdef CONFIG_FSNOTIFY
 	__u32			i_fsnotify_mask; /* all events this inode cares about */
-	struct hlist_head	i_fsnotify_mark_entries; /* fsnotify mark entries */
+	struct hlist_head	i_fsnotify_marks;
 #endif
 
 	unsigned long		i_state;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 06d296d85ebf..62e93a9dd115 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -19,10 +19,10 @@
  * fsnotify_d_instantiate - instantiate a dentry for inode
  * Called with dcache_lock held.
  */
-static inline void fsnotify_d_instantiate(struct dentry *entry,
-						struct inode *inode)
+static inline void fsnotify_d_instantiate(struct dentry *dentry,
+					  struct inode *inode)
 {
-	__fsnotify_d_instantiate(entry, inode);
+	__fsnotify_d_instantiate(dentry, inode);
 }
 
 /* Notify this dentry's parent about a child's events. */
@@ -35,16 +35,16 @@ static inline void fsnotify_parent(struct path *path, struct dentry *dentry, __u
 }
 
 /*
- * fsnotify_d_move - entry has been moved
- * Called with dcache_lock and entry->d_lock held.
+ * fsnotify_d_move - dentry has been moved
+ * Called with dcache_lock and dentry->d_lock held.
  */
-static inline void fsnotify_d_move(struct dentry *entry)
+static inline void fsnotify_d_move(struct dentry *dentry)
 {
 	/*
-	 * On move we need to update entry->d_flags to indicate if the new parent
-	 * cares about events from this entry.
+	 * On move we need to update dentry->d_flags to indicate if the new parent
+	 * cares about events from this dentry.
 	 */
-	__fsnotify_update_dcache_flags(entry);
+	__fsnotify_update_dcache_flags(dentry);
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 7a6ba755acc3..59c072e8fddd 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -62,7 +62,7 @@
 
 struct fsnotify_group;
 struct fsnotify_event;
-struct fsnotify_mark_entry;
+struct fsnotify_mark;
 struct fsnotify_event_private_data;
 
 /*
@@ -83,7 +83,7 @@ struct fsnotify_ops {
 				  int data_type);
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
-	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+	void (*freeing_mark)(struct fsnotify_mark *entry, struct fsnotify_group *group);
 	void (*free_event_priv)(struct fsnotify_event_private_data *priv);
 };
 
@@ -133,12 +133,12 @@ struct fsnotify_group {
 	unsigned int q_len;			/* events on the queue */
 	unsigned int max_events;		/* maximum events allowed on the list */
 
-	/* stores all fastapth entries assoc with this group so they can be cleaned on unregister */
-	spinlock_t mark_lock;		/* protect mark_entries list */
+	/* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
+	spinlock_t mark_lock;		/* protect marks_list */
 	atomic_t num_marks;		/* 1 for each mark entry and 1 for not being
 					 * past the point of no return when freeing
 					 * a group */
-	struct list_head mark_entries;	/* all inode mark entries for this group */
+	struct list_head marks_list;	/* all inode marks for this group */
 
 	/* prevents double list_del of group_list.  protected by global fsnotify_grp_mutex */
 	bool on_inode_group_list;
@@ -226,20 +226,20 @@ struct fsnotify_event {
 };
 
 /*
- * Inode specific fields in an fsnotify_mark_entry
+ * Inode specific fields in an fsnotify_mark
  */
 struct fsnotify_inode_mark {
 	struct inode *inode;		/* inode this entry is associated with */
-	struct hlist_node i_list;	/* list of mark_entries by inode->i_fsnotify_mark_entries */
+	struct hlist_node i_list;	/* list of marks by inode->i_fsnotify_marks */
 	struct list_head free_i_list;	/* tmp list used when freeing this mark */
 };
 
 /*
- * Mount point specific fields in an fsnotify_mark_entry
+ * Mount point specific fields in an fsnotify_mark
  */
 struct fsnotify_vfsmount_mark {
 	struct vfsmount *mnt;		/* inode this entry is associated with */
-	struct hlist_node m_list;	/* list of mark_entries by inode->i_fsnotify_mark_entries */
+	struct hlist_node m_list;	/* list of marks by inode->i_fsnotify_marks */
 	struct list_head free_m_list;	/* tmp list used when freeing this mark */
 };
 
@@ -253,13 +253,13 @@ struct fsnotify_vfsmount_mark {
  * (such as dnotify) will flush these when the open fd is closed and not at
  * inode eviction or modification.
  */
-struct fsnotify_mark_entry {
+struct fsnotify_mark {
 	__u32 mask;			/* mask this mark entry is for */
 	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
 	 * in kernel that found and may be using this mark. */
 	atomic_t refcnt;		/* active things looking at this mark */
 	struct fsnotify_group *group;	/* group this mark entry is for */
-	struct list_head g_list;	/* list of mark_entries by group->i_fsnotify_mark_entries */
+	struct list_head g_list;	/* list of marks by group->i_fsnotify_marks */
 	spinlock_t lock;		/* protect group and inode */
 	union {
 		struct fsnotify_inode_mark i;
@@ -269,7 +269,7 @@ struct fsnotify_mark_entry {
 #define FSNOTIFY_MARK_FLAG_INODE	0x01
 #define FSNOTIFY_MARK_FLAG_VFSMOUNT	0x02
 	unsigned int flags;		/* vfsmount or inode mark? */
-	void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */
+	void (*free_mark)(struct fsnotify_mark *entry); /* called on final put+free */
 };
 
 #ifdef CONFIG_FSNOTIFY
@@ -361,19 +361,19 @@ extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group
 
 /* run all marks associated with an inode and update inode->i_fsnotify_mask */
 extern void fsnotify_recalc_inode_mask(struct inode *inode);
-extern void fsnotify_init_mark(struct fsnotify_mark_entry *entry, void (*free_mark)(struct fsnotify_mark_entry *entry));
+extern void fsnotify_init_mark(struct fsnotify_mark *entry, void (*free_mark)(struct fsnotify_mark *entry));
 /* find (and take a reference) to a mark associated with group and inode */
-extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode);
+extern struct fsnotify_mark *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode);
 /* copy the values from old into new */
-extern void fsnotify_duplicate_mark(struct fsnotify_mark_entry *new, struct fsnotify_mark_entry *old);
+extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
 /* attach the mark to both the group and the inode */
-extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode, int allow_dups);
+extern int fsnotify_add_mark(struct fsnotify_mark *entry, struct fsnotify_group *group, struct inode *inode, int allow_dups);
 /* given a mark, flag it to be freed when all references are dropped */
-extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry);
+extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark *entry);
 /* run all the marks in a group, and flag them to be freed */
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
-extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry);
-extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
+extern void fsnotify_get_mark(struct fsnotify_mark *entry);
+extern void fsnotify_put_mark(struct fsnotify_mark *entry);
 extern void fsnotify_unmount_inodes(struct list_head *list);
 
 /* put here because inotify does some weird stuff when destroying watches */
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index c21b05d25224..f16f909fbbc1 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -22,7 +22,7 @@ struct audit_tree {
 
 struct audit_chunk {
 	struct list_head hash;
-	struct fsnotify_mark_entry mark;
+	struct fsnotify_mark mark;
 	struct list_head trees;		/* with root here */
 	int dead;
 	int count;
@@ -134,7 +134,7 @@ static void __put_chunk(struct rcu_head *rcu)
 	audit_put_chunk(chunk);
 }
 
-static void audit_tree_destroy_watch(struct fsnotify_mark_entry *entry)
+static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
 {
 	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
 	call_rcu(&chunk->head, __put_chunk);
@@ -176,7 +176,7 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
 /* hash_lock & entry->lock is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
-	struct fsnotify_mark_entry *entry = &chunk->mark;
+	struct fsnotify_mark *entry = &chunk->mark;
 	struct list_head *list;
 
 	if (!entry->i.inode)
@@ -222,7 +222,7 @@ static struct audit_chunk *find_chunk(struct node *p)
 static void untag_chunk(struct node *p)
 {
 	struct audit_chunk *chunk = find_chunk(p);
-	struct fsnotify_mark_entry *entry = &chunk->mark;
+	struct fsnotify_mark *entry = &chunk->mark;
 	struct audit_chunk *new;
 	struct audit_tree *owner;
 	int size = chunk->count - 1;
@@ -316,7 +316,7 @@ out:
 
 static int create_chunk(struct inode *inode, struct audit_tree *tree)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	struct audit_chunk *chunk = alloc_chunk(1);
 	if (!chunk)
 		return -ENOMEM;
@@ -354,7 +354,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 /* the first tagged inode becomes root of tree */
 static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 {
-	struct fsnotify_mark_entry *old_entry, *chunk_entry;
+	struct fsnotify_mark *old_entry, *chunk_entry;
 	struct audit_tree *owner;
 	struct audit_chunk *chunk, *old;
 	struct node *p;
@@ -911,7 +911,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify
 	return -EOPNOTSUPP;
 }
 
-static void audit_tree_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
 {
 	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6304ee5d7642..d8cb55a5c059 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -56,7 +56,7 @@ struct audit_watch {
 
 struct audit_parent {
 	struct list_head	watches; /* anchor for audit_watch->wlist */
-	struct fsnotify_mark_entry mark; /* fsnotify mark on the inode */
+	struct fsnotify_mark mark; /* fsnotify mark on the inode */
 };
 
 /* fsnotify handle. */
@@ -72,7 +72,7 @@ static void audit_free_parent(struct audit_parent *parent)
 	kfree(parent);
 }
 
-static void audit_watch_free_mark(struct fsnotify_mark_entry *entry)
+static void audit_watch_free_mark(struct fsnotify_mark *entry)
 {
 	struct audit_parent *parent;
 
@@ -99,7 +99,7 @@ static void audit_put_parent(struct audit_parent *parent)
 static inline struct audit_parent *audit_find_parent(struct inode *inode)
 {
 	struct audit_parent *parent = NULL;
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 
 	spin_lock(&inode->i_lock);
 	entry = fsnotify_find_mark_entry(audit_watch_group, inode);
@@ -517,7 +517,7 @@ static bool audit_watch_should_send_event(struct fsnotify_group *group, struct i
 					  struct vfsmount *mnt, __u32 mask, void *data,
 					  int data_type)
 {
-	struct fsnotify_mark_entry *entry;
+	struct fsnotify_mark *entry;
 	bool send;
 
 	spin_lock(&inode->i_lock);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 853185f7ba7e..b87a63beb66c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1724,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
 	struct audit_tree_refs *p;
 	struct audit_chunk *chunk;
 	int count;
-	if (likely(hlist_empty(&inode->i_fsnotify_mark_entries)))
+	if (likely(hlist_empty(&inode->i_fsnotify_marks)))
 		return;
 	context = current->audit_context;
 	p = context->trees;
@@ -1767,7 +1767,7 @@ retry:
 	seq = read_seqbegin(&rename_lock);
 	for(;;) {
 		struct inode *inode = d->d_inode;
-		if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_mark_entries))) {
+		if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
 			struct audit_chunk *chunk;
 			chunk = audit_tree_lookup(inode);
 			if (chunk) {
-- 
cgit v1.2.3-59-g8ed1b


From d07754412f9cdc2f4a99318d5ee81ace6715ea99 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:24 -0500
Subject: fsnotify: rename fsnotify_find_mark_entry to fsnotify_find_mark

the _entry portion of fsnotify functions is useless.  Drop it.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          | 14 +++++++-------
 fs/notify/inode_mark.c               | 14 +++++++-------
 fs/notify/inotify/inotify_fsnotify.c |  4 ++--
 fs/notify/inotify/inotify_user.c     |  6 +++---
 include/linux/fsnotify_backend.h     |  4 ++--
 kernel/audit_tree.c                  | 12 ++++++------
 kernel/audit_watch.c                 |  8 ++++----
 7 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index e6edae60894d..b202bc590c61 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -96,7 +96,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 	to_tell = event->to_tell;
 
 	spin_lock(&to_tell->i_lock);
-	entry = fsnotify_find_mark_entry(group, to_tell);
+	entry = fsnotify_find_mark(group, to_tell);
 	spin_unlock(&to_tell->i_lock);
 
 	/* unlikely since we alreay passed dnotify_should_send_event() */
@@ -148,7 +148,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 		return false;
 
 	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
+	entry = fsnotify_find_mark(group, inode);
 	spin_unlock(&inode->i_lock);
 
 	/* no mark means no dnotify watch */
@@ -158,7 +158,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	mask = (mask & ~FS_EVENT_ON_CHILD);
 	send = (mask & entry->mask);
 
-	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
+	fsnotify_put_mark(entry); /* matches fsnotify_find_mark */
 
 	return send;
 }
@@ -202,7 +202,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 		return;
 
 	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(dnotify_group, inode);
+	entry = fsnotify_find_mark(dnotify_group, inode);
 	spin_unlock(&inode->i_lock);
 	if (!entry)
 		return;
@@ -226,7 +226,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 
 	/* nothing else could have found us thanks to the dnotify_mark_mutex */
 	if (dnentry->dn == NULL)
-		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_destroy_mark(entry);
 
 	fsnotify_recalc_group_mask(dnotify_group);
 
@@ -357,7 +357,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* add the new_entry or find an old one. */
 	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(dnotify_group, inode);
+	entry = fsnotify_find_mark(dnotify_group, inode);
 	spin_unlock(&inode->i_lock);
 	if (entry) {
 		dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
@@ -414,7 +414,7 @@ out:
 	spin_unlock(&entry->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_destroy_mark(entry);
 
 	fsnotify_recalc_group_mask(dnotify_group);
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 7e69f6b08d4e..01c42632eb2a 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -56,7 +56,7 @@
  * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
  * - The inode is being evicted from cache. (fsnotify_inode_delete)
  * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
- * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark_by_entry)
+ * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
  * - The fsnotify_group associated with the mark is going away and all such marks
  *   need to be cleaned up. (fsnotify_clear_marks_by_group)
  *
@@ -140,7 +140,7 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
  * The caller had better be holding a reference to this mark so we don't actually
  * do the final put under the entry->lock
  */
-void fsnotify_destroy_mark_by_entry(struct fsnotify_mark *entry)
+void fsnotify_destroy_mark(struct fsnotify_mark *entry)
 {
 	struct fsnotify_group *group;
 	struct inode *inode;
@@ -233,7 +233,7 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
 	spin_unlock(&group->mark_lock);
 
 	list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
-		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_destroy_mark(entry);
 		fsnotify_put_mark(entry);
 	}
 }
@@ -256,7 +256,7 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 
 	list_for_each_entry_safe(entry, lentry, &free_list, i.free_i_list) {
-		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_destroy_mark(entry);
 		fsnotify_put_mark(entry);
 	}
 }
@@ -265,8 +265,8 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
  * given a group and inode, find the mark associated with that combination.
  * if found take a reference to that mark and return it, else return NULL
  */
-struct fsnotify_mark *fsnotify_find_mark_entry(struct fsnotify_group *group,
-					       struct inode *inode)
+struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_group *group,
+					 struct inode *inode)
 {
 	struct fsnotify_mark *entry;
 	struct hlist_node *pos;
@@ -349,7 +349,7 @@ int fsnotify_add_mark(struct fsnotify_mark *entry,
 	spin_lock(&inode->i_lock);
 
 	if (!allow_dups)
-		lentry = fsnotify_find_mark_entry(group, inode);
+		lentry = fsnotify_find_mark(group, inode);
 	if (!lentry) {
 		entry->group = group;
 		entry->i.inode = inode;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index f33a9bd32e5d..f8a2a6eda133 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -98,7 +98,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	to_tell = event->to_tell;
 
 	spin_lock(&to_tell->i_lock);
-	entry = fsnotify_find_mark_entry(group, to_tell);
+	entry = fsnotify_find_mark(group, to_tell);
 	spin_unlock(&to_tell->i_lock);
 	/* race with watch removal?  We already passes should_send */
 	if (unlikely(!entry))
@@ -148,7 +148,7 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 	bool send;
 
 	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
+	entry = fsnotify_find_mark(group, inode);
 	spin_unlock(&inode->i_lock);
 	if (!entry)
 		return false;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 7be5dcf07ac7..118085c9d2d9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -567,7 +567,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 		return -EINVAL;
 
 	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
+	entry = fsnotify_find_mark(group, inode);
 	spin_unlock(&inode->i_lock);
 	if (!entry)
 		return -ENOENT;
@@ -607,7 +607,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	/* return the wd */
 	ret = ientry->wd;
 
-	/* match the get from fsnotify_find_mark_entry() */
+	/* match the get from fsnotify_find_mark() */
 	fsnotify_put_mark(entry);
 
 	return ret;
@@ -823,7 +823,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 
 	ret = 0;
 
-	fsnotify_destroy_mark_by_entry(&ientry->fsn_entry);
+	fsnotify_destroy_mark(&ientry->fsn_entry);
 
 	/* match ref taken by inotify_idr_find */
 	fsnotify_put_mark(&ientry->fsn_entry);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 59c072e8fddd..83b6bfeb2d66 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -363,13 +363,13 @@ extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group
 extern void fsnotify_recalc_inode_mask(struct inode *inode);
 extern void fsnotify_init_mark(struct fsnotify_mark *entry, void (*free_mark)(struct fsnotify_mark *entry));
 /* find (and take a reference) to a mark associated with group and inode */
-extern struct fsnotify_mark *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode);
+extern struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_group *group, struct inode *inode);
 /* copy the values from old into new */
 extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
 /* attach the mark to both the group and the inode */
 extern int fsnotify_add_mark(struct fsnotify_mark *entry, struct fsnotify_group *group, struct inode *inode, int allow_dups);
 /* given a mark, flag it to be freed when all references are dropped */
-extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark *entry);
+extern void fsnotify_destroy_mark(struct fsnotify_mark *entry);
 /* run all the marks in a group, and flag them to be freed */
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark *entry);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f16f909fbbc1..b20fb055d712 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -250,7 +250,7 @@ static void untag_chunk(struct node *p)
 		list_del_rcu(&chunk->hash);
 		spin_unlock(&hash_lock);
 		spin_unlock(&entry->lock);
-		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_destroy_mark(entry);
 		fsnotify_put_mark(entry);
 		goto out;
 	}
@@ -293,7 +293,7 @@ static void untag_chunk(struct node *p)
 		owner->root = new;
 	spin_unlock(&hash_lock);
 	spin_unlock(&entry->lock);
-	fsnotify_destroy_mark_by_entry(entry);
+	fsnotify_destroy_mark(entry);
 	fsnotify_put_mark(entry);
 	goto out;
 
@@ -333,7 +333,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		spin_unlock(&hash_lock);
 		chunk->dead = 1;
 		spin_unlock(&entry->lock);
-		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_destroy_mark(entry);
 		fsnotify_put_mark(entry);
 		return 0;
 	}
@@ -361,7 +361,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	int n;
 
 	spin_lock(&inode->i_lock);
-	old_entry = fsnotify_find_mark_entry(audit_tree_group, inode);
+	old_entry = fsnotify_find_mark(audit_tree_group, inode);
 	spin_unlock(&inode->i_lock);
 	if (!old_entry)
 		return create_chunk(inode, tree);
@@ -415,7 +415,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		spin_unlock(&chunk_entry->lock);
 		spin_unlock(&old_entry->lock);
 
-		fsnotify_destroy_mark_by_entry(chunk_entry);
+		fsnotify_destroy_mark(chunk_entry);
 
 		fsnotify_put_mark(chunk_entry);
 		fsnotify_put_mark(old_entry);
@@ -446,7 +446,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	spin_unlock(&hash_lock);
 	spin_unlock(&chunk_entry->lock);
 	spin_unlock(&old_entry->lock);
-	fsnotify_destroy_mark_by_entry(old_entry);
+	fsnotify_destroy_mark(old_entry);
 	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
 	fsnotify_put_mark(old_entry); /* and kill it */
 	return 0;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d8cb55a5c059..24ecbebf4354 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -102,7 +102,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
 	struct fsnotify_mark *entry;
 
 	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(audit_watch_group, inode);
+	entry = fsnotify_find_mark(audit_watch_group, inode);
 	spin_unlock(&inode->i_lock);
 
 	if (entry)
@@ -354,7 +354,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 	}
 	mutex_unlock(&audit_filter_mutex);
 
-	fsnotify_destroy_mark_by_entry(&parent->mark);
+	fsnotify_destroy_mark(&parent->mark);
 
 	fsnotify_recalc_group_mask(audit_watch_group);
 
@@ -504,7 +504,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 
 		if (list_empty(&parent->watches)) {
 			audit_get_parent(parent);
-			fsnotify_destroy_mark_by_entry(&parent->mark);
+			fsnotify_destroy_mark(&parent->mark);
 			audit_put_parent(parent);
 		}
 	}
@@ -521,7 +521,7 @@ static bool audit_watch_should_send_event(struct fsnotify_group *group, struct i
 	bool send;
 
 	spin_lock(&inode->i_lock);
-	entry = fsnotify_find_mark_entry(group, inode);
+	entry = fsnotify_find_mark(group, inode);
 	spin_unlock(&inode->i_lock);
 	if (!entry)
 		return false;
-- 
cgit v1.2.3-59-g8ed1b


From 841bdc10f573aa010dd5818d35a5690b7d9f73ce Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:24 -0500
Subject: fsnotify: rename mark_entry to just mark

previously I used mark_entry when talking about marks on inodes.  The
_entry is pretty useless.  Just use "mark" instead.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/group.c                |   6 +-
 fs/notify/inode_mark.c           | 148 +++++++++++++++++++--------------------
 include/linux/fsnotify_backend.h |  26 +++----
 3 files changed, 90 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/group.c b/fs/notify/group.c
index b70e7d21dfde..9e9eb406afdd 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -74,11 +74,11 @@ void fsnotify_recalc_group_mask(struct fsnotify_group *group)
 {
 	__u32 mask = 0;
 	__u32 old_mask = group->mask;
-	struct fsnotify_mark *entry;
+	struct fsnotify_mark *mark;
 
 	spin_lock(&group->mark_lock);
-	list_for_each_entry(entry, &group->marks_list, g_list)
-		mask |= entry->mask;
+	list_for_each_entry(mark, &group->marks_list, g_list)
+		mask |= mark->mask;
 	spin_unlock(&group->mark_lock);
 
 	group->mask = mask;
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 01c42632eb2a..27c1b43ad739 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -30,21 +30,21 @@
  * There are 3 spinlocks involved with fsnotify inode marks and they MUST
  * be taken in order as follows:
  *
- * entry->lock
+ * mark->lock
  * group->mark_lock
  * inode->i_lock
  *
- * entry->lock protects 2 things, entry->group and entry->inode.  You must hold
+ * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
  * that lock to dereference either of these things (they could be NULL even with
  * the lock)
  *
  * group->mark_lock protects the marks_list anchored inside a given group
- * and each entry is hooked via the g_list.  It also sorta protects the
+ * and each mark is hooked via the g_list.  It also sorta protects the
  * free_g_list, which when used is anchored by a private list on the stack of the
  * task which held the group->mark_lock.
  *
  * inode->i_lock protects the i_fsnotify_marks list anchored inside a
- * given inode and each entry is hooked via the i_list. (and sorta the
+ * given inode and each mark is hooked via the i_list. (and sorta the
  * free_i_list)
  *
  *
@@ -95,15 +95,15 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
-void fsnotify_get_mark(struct fsnotify_mark *entry)
+void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
-	atomic_inc(&entry->refcnt);
+	atomic_inc(&mark->refcnt);
 }
 
-void fsnotify_put_mark(struct fsnotify_mark *entry)
+void fsnotify_put_mark(struct fsnotify_mark *mark)
 {
-	if (atomic_dec_and_test(&entry->refcnt))
-		entry->free_mark(entry);
+	if (atomic_dec_and_test(&mark->refcnt))
+		mark->free_mark(mark);
 }
 
 /*
@@ -111,14 +111,14 @@ void fsnotify_put_mark(struct fsnotify_mark *entry)
  */
 static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
 {
-	struct fsnotify_mark *entry;
+	struct fsnotify_mark *mark;
 	struct hlist_node *pos;
 	__u32 new_mask = 0;
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_marks, i.i_list)
-		new_mask |= entry->mask;
+	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
+		new_mask |= mark->mask;
 	inode->i_fsnotify_mask = new_mask;
 }
 
@@ -138,40 +138,40 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
 /*
  * Any time a mark is getting freed we end up here.
  * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the entry->lock
+ * do the final put under the mark->lock
  */
-void fsnotify_destroy_mark(struct fsnotify_mark *entry)
+void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 {
 	struct fsnotify_group *group;
 	struct inode *inode;
 
-	spin_lock(&entry->lock);
+	spin_lock(&mark->lock);
 
-	group = entry->group;
-	inode = entry->i.inode;
+	group = mark->group;
+	inode = mark->i.inode;
 
 	BUG_ON(group && !inode);
 	BUG_ON(!group && inode);
 
 	/* if !group something else already marked this to die */
 	if (!group) {
-		spin_unlock(&entry->lock);
+		spin_unlock(&mark->lock);
 		return;
 	}
 
 	/* 1 from caller and 1 for being on i_list/g_list */
-	BUG_ON(atomic_read(&entry->refcnt) < 2);
+	BUG_ON(atomic_read(&mark->refcnt) < 2);
 
 	spin_lock(&group->mark_lock);
 	spin_lock(&inode->i_lock);
 
-	hlist_del_init(&entry->i.i_list);
-	entry->i.inode = NULL;
+	hlist_del_init(&mark->i.i_list);
+	mark->i.inode = NULL;
 
-	list_del_init(&entry->g_list);
-	entry->group = NULL;
+	list_del_init(&mark->g_list);
+	mark->group = NULL;
 
-	fsnotify_put_mark(entry); /* for i_list and g_list */
+	fsnotify_put_mark(mark); /* for i_list and g_list */
 
 	/*
 	 * this mark is now off the inode->i_fsnotify_marks list and we
@@ -182,21 +182,21 @@ void fsnotify_destroy_mark(struct fsnotify_mark *entry)
 
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&group->mark_lock);
-	spin_unlock(&entry->lock);
+	spin_unlock(&mark->lock);
 
 	/*
 	 * Some groups like to know that marks are being freed.  This is a
-	 * callback to the group function to let it know that this entry
+	 * callback to the group function to let it know that this mark
 	 * is being freed.
 	 */
 	if (group->ops->freeing_mark)
-		group->ops->freeing_mark(entry, group);
+		group->ops->freeing_mark(mark, group);
 
 	/*
 	 * __fsnotify_update_child_dentry_flags(inode);
 	 *
 	 * I really want to call that, but we can't, we have no idea if the inode
-	 * still exists the second we drop the entry->lock.
+	 * still exists the second we drop the mark->lock.
 	 *
 	 * The next time an event arrive to this inode from one of it's children
 	 * __fsnotify_parent will see that the inode doesn't care about it's
@@ -221,20 +221,20 @@ void fsnotify_destroy_mark(struct fsnotify_mark *entry)
  */
 void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
 {
-	struct fsnotify_mark *lentry, *entry;
+	struct fsnotify_mark *lmark, *mark;
 	LIST_HEAD(free_list);
 
 	spin_lock(&group->mark_lock);
-	list_for_each_entry_safe(entry, lentry, &group->marks_list, g_list) {
-		list_add(&entry->free_g_list, &free_list);
-		list_del_init(&entry->g_list);
-		fsnotify_get_mark(entry);
+	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
+		list_add(&mark->free_g_list, &free_list);
+		list_del_init(&mark->g_list);
+		fsnotify_get_mark(mark);
 	}
 	spin_unlock(&group->mark_lock);
 
-	list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
-		fsnotify_destroy_mark(entry);
-		fsnotify_put_mark(entry);
+	list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
+		fsnotify_destroy_mark(mark);
+		fsnotify_put_mark(mark);
 	}
 }
 
@@ -243,21 +243,21 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
  */
 void fsnotify_clear_marks_by_inode(struct inode *inode)
 {
-	struct fsnotify_mark *entry, *lentry;
+	struct fsnotify_mark *mark, *lmark;
 	struct hlist_node *pos, *n;
 	LIST_HEAD(free_list);
 
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_marks, i.i_list) {
-		list_add(&entry->i.free_i_list, &free_list);
-		hlist_del_init(&entry->i.i_list);
-		fsnotify_get_mark(entry);
+	hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
+		list_add(&mark->i.free_i_list, &free_list);
+		hlist_del_init(&mark->i.i_list);
+		fsnotify_get_mark(mark);
 	}
 	spin_unlock(&inode->i_lock);
 
-	list_for_each_entry_safe(entry, lentry, &free_list, i.free_i_list) {
-		fsnotify_destroy_mark(entry);
-		fsnotify_put_mark(entry);
+	list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
+		fsnotify_destroy_mark(mark);
+		fsnotify_put_mark(mark);
 	}
 }
 
@@ -268,15 +268,15 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
 struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_group *group,
 					 struct inode *inode)
 {
-	struct fsnotify_mark *entry;
+	struct fsnotify_mark *mark;
 	struct hlist_node *pos;
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_marks, i.i_list) {
-		if (entry->group == group) {
-			fsnotify_get_mark(entry);
-			return entry;
+	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
+		if (mark->group == group) {
+			fsnotify_get_mark(mark);
+			return mark;
 		}
 	}
 	return NULL;
@@ -294,35 +294,35 @@ void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *ol
 /*
  * Nothing fancy, just initialize lists and locks and counters.
  */
-void fsnotify_init_mark(struct fsnotify_mark *entry,
-			void (*free_mark)(struct fsnotify_mark *entry))
+void fsnotify_init_mark(struct fsnotify_mark *mark,
+			void (*free_mark)(struct fsnotify_mark *mark))
 {
-	spin_lock_init(&entry->lock);
-	atomic_set(&entry->refcnt, 1);
-	INIT_HLIST_NODE(&entry->i.i_list);
-	entry->group = NULL;
-	entry->mask = 0;
-	entry->i.inode = NULL;
-	entry->free_mark = free_mark;
+	spin_lock_init(&mark->lock);
+	atomic_set(&mark->refcnt, 1);
+	INIT_HLIST_NODE(&mark->i.i_list);
+	mark->group = NULL;
+	mark->mask = 0;
+	mark->i.inode = NULL;
+	mark->free_mark = free_mark;
 }
 
 /*
- * Attach an initialized mark entry to a given group and inode.
+ * Attach an initialized mark mark to a given group and inode.
  * These marks may be used for the fsnotify backend to determine which
  * event types should be delivered to which group and for which inodes.
  */
-int fsnotify_add_mark(struct fsnotify_mark *entry,
+int fsnotify_add_mark(struct fsnotify_mark *mark,
 		      struct fsnotify_group *group, struct inode *inode,
 		      int allow_dups)
 {
-	struct fsnotify_mark *lentry = NULL;
+	struct fsnotify_mark *lmark = NULL;
 	int ret = 0;
 
 	inode = igrab(inode);
 	if (unlikely(!inode))
 		return -EINVAL;
 
-	entry->flags = FSNOTIFY_MARK_FLAG_INODE;
+	mark->flags = FSNOTIFY_MARK_FLAG_INODE;
 
 	/*
 	 * if this group isn't being testing for inode type events we need
@@ -340,24 +340,24 @@ int fsnotify_add_mark(struct fsnotify_mark *entry,
 
 	/*
 	 * LOCKING ORDER!!!!
-	 * entry->lock
+	 * mark->lock
 	 * group->mark_lock
 	 * inode->i_lock
 	 */
-	spin_lock(&entry->lock);
+	spin_lock(&mark->lock);
 	spin_lock(&group->mark_lock);
 	spin_lock(&inode->i_lock);
 
 	if (!allow_dups)
-		lentry = fsnotify_find_mark(group, inode);
-	if (!lentry) {
-		entry->group = group;
-		entry->i.inode = inode;
+		lmark = fsnotify_find_mark(group, inode);
+	if (!lmark) {
+		mark->group = group;
+		mark->i.inode = inode;
 
-		hlist_add_head(&entry->i.i_list, &inode->i_fsnotify_marks);
-		list_add(&entry->g_list, &group->marks_list);
+		hlist_add_head(&mark->i.i_list, &inode->i_fsnotify_marks);
+		list_add(&mark->g_list, &group->marks_list);
 
-		fsnotify_get_mark(entry); /* for i_list and g_list */
+		fsnotify_get_mark(mark); /* for i_list and g_list */
 
 		atomic_inc(&group->num_marks);
 
@@ -366,12 +366,12 @@ int fsnotify_add_mark(struct fsnotify_mark *entry,
 
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&group->mark_lock);
-	spin_unlock(&entry->lock);
+	spin_unlock(&mark->lock);
 
-	if (lentry) {
+	if (lmark) {
 		ret = -EEXIST;
 		iput(inode);
-		fsnotify_put_mark(lentry);
+		fsnotify_put_mark(lmark);
 	} else {
 		__fsnotify_update_child_dentry_flags(inode);
 	}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 83b6bfeb2d66..ff654c1932f2 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -83,7 +83,7 @@ struct fsnotify_ops {
 				  int data_type);
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
-	void (*freeing_mark)(struct fsnotify_mark *entry, struct fsnotify_group *group);
+	void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
 	void (*free_event_priv)(struct fsnotify_event_private_data *priv);
 };
 
@@ -135,7 +135,7 @@ struct fsnotify_group {
 
 	/* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
 	spinlock_t mark_lock;		/* protect marks_list */
-	atomic_t num_marks;		/* 1 for each mark entry and 1 for not being
+	atomic_t num_marks;		/* 1 for each mark and 1 for not being
 					 * past the point of no return when freeing
 					 * a group */
 	struct list_head marks_list;	/* all inode marks for this group */
@@ -229,7 +229,7 @@ struct fsnotify_event {
  * Inode specific fields in an fsnotify_mark
  */
 struct fsnotify_inode_mark {
-	struct inode *inode;		/* inode this entry is associated with */
+	struct inode *inode;		/* inode this mark is associated with */
 	struct hlist_node i_list;	/* list of marks by inode->i_fsnotify_marks */
 	struct list_head free_i_list;	/* tmp list used when freeing this mark */
 };
@@ -238,13 +238,13 @@ struct fsnotify_inode_mark {
  * Mount point specific fields in an fsnotify_mark
  */
 struct fsnotify_vfsmount_mark {
-	struct vfsmount *mnt;		/* inode this entry is associated with */
+	struct vfsmount *mnt;		/* vfsmount this mark is associated with */
 	struct hlist_node m_list;	/* list of marks by inode->i_fsnotify_marks */
 	struct list_head free_m_list;	/* tmp list used when freeing this mark */
 };
 
 /*
- * a mark is simply an entry attached to an in core inode which allows an
+ * a mark is simply an object attached to an in core inode which allows an
  * fsnotify listener to indicate they are either no longer interested in events
  * of a type matching mask or only interested in those events.
  *
@@ -254,11 +254,11 @@ struct fsnotify_vfsmount_mark {
  * inode eviction or modification.
  */
 struct fsnotify_mark {
-	__u32 mask;			/* mask this mark entry is for */
+	__u32 mask;			/* mask this mark is for */
 	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
 	 * in kernel that found and may be using this mark. */
 	atomic_t refcnt;		/* active things looking at this mark */
-	struct fsnotify_group *group;	/* group this mark entry is for */
+	struct fsnotify_group *group;	/* group this mark is for */
 	struct list_head g_list;	/* list of marks by group->i_fsnotify_marks */
 	spinlock_t lock;		/* protect group and inode */
 	union {
@@ -269,7 +269,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_INODE	0x01
 #define FSNOTIFY_MARK_FLAG_VFSMOUNT	0x02
 	unsigned int flags;		/* vfsmount or inode mark? */
-	void (*free_mark)(struct fsnotify_mark *entry); /* called on final put+free */
+	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
 };
 
 #ifdef CONFIG_FSNOTIFY
@@ -361,19 +361,19 @@ extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group
 
 /* run all marks associated with an inode and update inode->i_fsnotify_mask */
 extern void fsnotify_recalc_inode_mask(struct inode *inode);
-extern void fsnotify_init_mark(struct fsnotify_mark *entry, void (*free_mark)(struct fsnotify_mark *entry));
+extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark));
 /* find (and take a reference) to a mark associated with group and inode */
 extern struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_group *group, struct inode *inode);
 /* copy the values from old into new */
 extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
 /* attach the mark to both the group and the inode */
-extern int fsnotify_add_mark(struct fsnotify_mark *entry, struct fsnotify_group *group, struct inode *inode, int allow_dups);
+extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, int allow_dups);
 /* given a mark, flag it to be freed when all references are dropped */
-extern void fsnotify_destroy_mark(struct fsnotify_mark *entry);
+extern void fsnotify_destroy_mark(struct fsnotify_mark *mark);
 /* run all the marks in a group, and flag them to be freed */
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
-extern void fsnotify_get_mark(struct fsnotify_mark *entry);
-extern void fsnotify_put_mark(struct fsnotify_mark *entry);
+extern void fsnotify_get_mark(struct fsnotify_mark *mark);
+extern void fsnotify_put_mark(struct fsnotify_mark *mark);
 extern void fsnotify_unmount_inodes(struct list_head *list);
 
 /* put here because inotify does some weird stuff when destroying watches */
-- 
cgit v1.2.3-59-g8ed1b


From ecf081d1a73b077916f514f2ec744ded32b88ca1 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:25 -0500
Subject: vfs: introduce FMODE_NONOTIFY

This is a new f_mode which can only be set by the kernel.  It indicates
that the fd was opened by fanotify and should not cause future fanotify
events.  This is needed to prevent fanotify livelock.  An example of
obvious livelock is from fanotify close events.

Process A closes file1
This creates a close event for file1.
fanotify opens file1 for Listener X
Listener X deals with the event and closes its fd for file1.
This creates a close event for file1.
fanotify opens file1 for Listener X
Listener X deals with the event and closes its fd for file1.
This creates a close event for file1.
fanotify opens file1 for Listener X
Listener X deals with the event and closes its fd for file1.
notice a pattern?

The fix is to add the FMODE_NONOTIFY bit to the open filp done by the kernel
for fanotify.  Thus when that file is used it will not generate future
events.

This patch simply defines the bit.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/asm-generic/fcntl.h |  8 ++++++++
 include/linux/fs.h          |  6 +++++-
 include/linux/fsnotify.h    | 24 ++++++++++++++++--------
 3 files changed, 29 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index fcd268ce0674..009bd6149d99 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -3,6 +3,14 @@
 
 #include <linux/types.h>
 
+/*
+ * FMODE_EXEC is 0x20
+ * FMODE_NONOTIFY is 0x800000
+ * These cannot be used by userspace O_* until internal and external open
+ * flags are split.
+ * -Eric Paris
+ */
+
 #define O_ACCMODE	00000003
 #define O_RDONLY	00000000
 #define O_WRONLY	00000001
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 85fe89c43487..50ef4d4c95bf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -90,6 +90,9 @@ struct inodes_stat_t {
 /* Expect random access pattern */
 #define FMODE_RANDOM		((__force fmode_t)0x1000)
 
+/* File was opened by fanotify and shouldn't generate fanotify events */
+#define FMODE_NONOTIFY		((__force fmode_t)8388608)
+
 /*
  * The below are the various read and write types that we support. Some of
  * them include behavioral modifiers that send information down to the
@@ -2508,7 +2511,8 @@ int proc_nr_files(struct ctl_table *table, int write,
 int __init get_filesystem_list(char *buf);
 
 #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
-#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE))
+#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
+					    (flag & FMODE_NONOTIFY)))
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 62e93a9dd115..5184a2b786c1 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -165,8 +165,10 @@ static inline void fsnotify_access(struct file *file)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(path, NULL, mask);
-	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	if (!(file->f_mode & FMODE_NONOTIFY)) {
+		fsnotify_parent(path, NULL, mask);
+		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	}
 }
 
 /*
@@ -181,8 +183,10 @@ static inline void fsnotify_modify(struct file *file)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(path, NULL, mask);
-	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	if (!(file->f_mode & FMODE_NONOTIFY)) {
+		fsnotify_parent(path, NULL, mask);
+		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	}
 }
 
 /*
@@ -197,8 +201,10 @@ static inline void fsnotify_open(struct file *file)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(path, NULL, mask);
-	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	if (!(file->f_mode & FMODE_NONOTIFY)) {
+		fsnotify_parent(path, NULL, mask);
+		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	}
 }
 
 /*
@@ -214,8 +220,10 @@ static inline void fsnotify_close(struct file *file)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	fsnotify_parent(path, NULL, mask);
-	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	if (!(file->f_mode & FMODE_NONOTIFY)) {
+		fsnotify_parent(path, NULL, mask);
+		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	}
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 12ed2e36c98aec6c41559222e311f4aa15d254b6 Mon Sep 17 00:00:00 2001
From: "Signed-off-by: Wu Fengguang" <fengguang.wu@intel.com>
Date: Mon, 8 Feb 2010 12:31:29 -0500
Subject: fanotify: FMODE_NONOTIFY and __O_SYNC in sparc conflict

sparc used the same value as FMODE_NONOTIFY so change FMODE_NONOTIFY to be
something unique.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/asm-generic/fcntl.h | 2 +-
 include/linux/fs.h          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index 009bd6149d99..e3cbc38bdcc2 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -5,7 +5,7 @@
 
 /*
  * FMODE_EXEC is 0x20
- * FMODE_NONOTIFY is 0x800000
+ * FMODE_NONOTIFY is 0x1000000
  * These cannot be used by userspace O_* until internal and external open
  * flags are split.
  * -Eric Paris
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 50ef4d4c95bf..f9a003278758 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -91,7 +91,7 @@ struct inodes_stat_t {
 #define FMODE_RANDOM		((__force fmode_t)0x1000)
 
 /* File was opened by fanotify and shouldn't generate fanotify events */
-#define FMODE_NONOTIFY		((__force fmode_t)8388608)
+#define FMODE_NONOTIFY		((__force fmode_t)16777216) /* 0x1000000 */
 
 /*
  * The below are the various read and write types that we support. Some of
-- 
cgit v1.2.3-59-g8ed1b


From ff0b16a9850e8a240ad59e10b0a1291a8fcf7cbc Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:25 -0500
Subject: fanotify: fscking all notification system

fanotify is a novel file notification system which bases notification on
giving userspace both an event type (open, close, read, write) and an open
file descriptor to the object in question.  This should address a number of
races and problems with other notification systems like inotify and dnotify
and should allow the future implementation of blocking or access controlled
notification.  These are useful for on access scanners or hierachical storage
management schemes.

This patch just implements the basics of the fsnotify functions.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/Kconfig             |  1 +
 fs/notify/Makefile            |  1 +
 fs/notify/fanotify/Kconfig    | 11 ++++++
 fs/notify/fanotify/Makefile   |  1 +
 fs/notify/fanotify/fanotify.c | 78 +++++++++++++++++++++++++++++++++++++++++++
 fs/notify/fanotify/fanotify.h | 12 +++++++
 include/linux/Kbuild          |  1 +
 include/linux/fanotify.h      | 40 ++++++++++++++++++++++
 8 files changed, 145 insertions(+)
 create mode 100644 fs/notify/fanotify/Kconfig
 create mode 100644 fs/notify/fanotify/Makefile
 create mode 100644 fs/notify/fanotify/fanotify.c
 create mode 100644 fs/notify/fanotify/fanotify.h
 create mode 100644 include/linux/fanotify.h

(limited to 'include/linux')

diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index dffbb0911d02..22c629eedd82 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,3 +3,4 @@ config FSNOTIFY
 
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
+source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 0922cc826c46..396a38779371 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -2,3 +2,4 @@ obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o
 
 obj-y			+= dnotify/
 obj-y			+= inotify/
+obj-y			+= fanotify/
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
new file mode 100644
index 000000000000..f9d7ae081f85
--- /dev/null
+++ b/fs/notify/fanotify/Kconfig
@@ -0,0 +1,11 @@
+config FANOTIFY
+	bool "Filesystem wide access notification"
+	select FSNOTIFY
+	default y
+	---help---
+	   Say Y here to enable fanotify suport.  fanotify is a file access
+	   notification system which differs from inotify in that it sends
+	   and open file descriptor to the userspace listener along with
+	   the event.
+
+	   If unsure, say Y.
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
new file mode 100644
index 000000000000..e7d39c05b0fe
--- /dev/null
+++ b/fs/notify/fanotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_FANOTIFY)		+= fanotify.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
new file mode 100644
index 000000000000..3ffb9dbcab08
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.c
@@ -0,0 +1,78 @@
+#include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/kernel.h> /* UINT_MAX */
+#include <linux/types.h>
+
+#include "fanotify.h"
+
+static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	int ret;
+
+
+	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
+	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	ret = fsnotify_add_notify_event(group, event, NULL, NULL);
+
+	return ret;
+}
+
+static bool fanotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
+				       struct vfsmount *mnt, __u32 mask, void *data,
+				       int data_type)
+{
+	struct fsnotify_mark *fsn_mark;
+	bool send;
+
+	pr_debug("%s: group=%p inode=%p mask=%x data=%p data_type=%d\n",
+		 __func__, group, inode, mask, data, data_type);
+
+	/* sorry, fanotify only gives a damn about files and dirs */
+	if (!S_ISREG(inode->i_mode) &&
+	    !S_ISDIR(inode->i_mode))
+		return false;
+
+	/* if we don't have enough info to send an event to userspace say no */
+	if (data_type != FSNOTIFY_EVENT_PATH)
+		return false;
+
+	fsn_mark = fsnotify_find_mark(group, inode);
+	if (!fsn_mark)
+		return false;
+
+	/* if the event is for a child and this inode doesn't care about
+	 * events on the child, don't send it! */
+	if ((mask & FS_EVENT_ON_CHILD) &&
+	    !(fsn_mark->mask & FS_EVENT_ON_CHILD)) {
+		send = false;
+	} else {
+		/*
+		 * We care about children, but do we care about this particular
+		 * type of event?
+		 */
+		mask = (mask & ~FS_EVENT_ON_CHILD);
+		send = (fsn_mark->mask & mask);
+	}
+
+	/* find took a reference */
+	fsnotify_put_mark(fsn_mark);
+
+	return send;
+}
+
+const struct fsnotify_ops fanotify_fsnotify_ops = {
+	.handle_event = fanotify_handle_event,
+	.should_send_event = fanotify_should_send_event,
+	.free_group_priv = NULL,
+	.free_event_priv = NULL,
+	.freeing_mark = NULL,
+};
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
new file mode 100644
index 000000000000..50765eb30fe4
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,12 @@
+#include <linux/fanotify.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/net.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+static inline bool fanotify_mask_valid(__u32 mask)
+{
+	if (mask & ~((__u32)FAN_ALL_INCOMING_EVENTS))
+		return false;
+	return true;
+}
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 2fc8e14cc24a..d5cca9a05f14 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -210,6 +210,7 @@ unifdef-y += ethtool.h
 unifdef-y += eventpoll.h
 unifdef-y += signalfd.h
 unifdef-y += ext2_fs.h
+unifdef-y += fanotify.h
 unifdef-y += fb.h
 unifdef-y += fcntl.h
 unifdef-y += filter.h
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
new file mode 100644
index 000000000000..b560f86d1401
--- /dev/null
+++ b/include/linux/fanotify.h
@@ -0,0 +1,40 @@
+#ifndef _LINUX_FANOTIFY_H
+#define _LINUX_FANOTIFY_H
+
+#include <linux/types.h>
+
+/* the following events that user-space can register for */
+#define FAN_ACCESS		0x00000001	/* File was accessed */
+#define FAN_MODIFY		0x00000002	/* File was modified */
+#define FAN_CLOSE_WRITE		0x00000008	/* Unwrittable file closed */
+#define FAN_CLOSE_NOWRITE	0x00000010	/* Writtable file closed */
+#define FAN_OPEN		0x00000020	/* File was opened */
+
+#define FAN_EVENT_ON_CHILD	0x08000000	/* interested in child events */
+
+/* FIXME currently Q's have no limit.... */
+#define FAN_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
+
+/* helper events */
+#define FAN_CLOSE		(FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
+
+/*
+ * All of the events - we build the list by hand so that we can add flags in
+ * the future and not break backward compatibility.  Apps will get only the
+ * events that they originally wanted.  Be sure to add new events here!
+ */
+#define FAN_ALL_EVENTS (FAN_ACCESS |\
+			FAN_MODIFY |\
+			FAN_CLOSE |\
+			FAN_OPEN)
+
+/*
+ * All legal FAN bits userspace can request (although possibly not all
+ * at the same time.
+ */
+#define FAN_ALL_INCOMING_EVENTS	(FAN_ALL_EVENTS |\
+				 FAN_EVENT_ON_CHILD)
+#ifdef __KERNEL__
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_FANOTIFY_H */
-- 
cgit v1.2.3-59-g8ed1b


From 11637e4b7dc098e9a863f0a619d55ebc60f5949e Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:25 -0500
Subject: fanotify: fanotify_init syscall declaration

This patch defines a new syscall fanotify_init() of the form:

int sys_fanotify_init(unsigned int flags, unsigned int event_f_flags,
		      unsigned int priority)

This syscall is used to create and fanotify group.  This is very similar to
the inotify_init() syscall.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 arch/x86/ia32/ia32entry.S          |  1 +
 arch/x86/include/asm/unistd_32.h   |  3 ++-
 arch/x86/include/asm/unistd_64.h   |  2 ++
 arch/x86/kernel/syscall_table_32.S |  1 +
 fs/notify/fanotify/Makefile        |  2 +-
 fs/notify/fanotify/fanotify_user.c | 13 +++++++++++++
 include/linux/syscalls.h           |  2 ++
 kernel/sys_ni.c                    |  3 +++
 8 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 fs/notify/fanotify/fanotify_user.c

(limited to 'include/linux')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e790bc1fbfa3..586cb3be2e32 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,5 @@ ia32_sys_call_table:
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_event_open
 	.quad compat_sys_recvmmsg
+	.quad sys_fanotify_init
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f8f8a4..981c7e7ad804 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,11 @@
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
+#define __NR_fanotify_init	338
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 338
+#define NR_syscalls 339
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b0e81e..4f23e04bdb34 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_recvmmsg				299
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_fanotify_init			300
+__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..e38793b50e1d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,4 @@ ENTRY(sys_call_table)
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
 	.long sys_recvmmsg
+	.long sys_fanotify_init
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
index e7d39c05b0fe..0999213e7e6e 100644
--- a/fs/notify/fanotify/Makefile
+++ b/fs/notify/fanotify/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_FANOTIFY)		+= fanotify.o
+obj-$(CONFIG_FANOTIFY)		+= fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 000000000000..cf176fc7086b
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,13 @@
+#include <linux/fcntl.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include "fanotify.h"
+
+SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
+		unsigned int, priority)
+{
+	return -ENOSYS;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 13ebb5413a79..198dcc9bd025 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -813,6 +813,8 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
 asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  struct timespec __user *, const sigset_t __user *,
 			  size_t);
+asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags,
+				  unsigned int priority);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..2c4adc2decc3 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,6 @@ cond_syscall(sys_eventfd2);
 
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
+
+/* fanotify! */
+cond_syscall(sys_fanotify_init);
-- 
cgit v1.2.3-59-g8ed1b


From 52c923dd079df49f58016a9e56df184b132611d6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:26 -0500
Subject: fanotify: fanotify_init syscall implementation

NAME
	fanotify_init - initialize an fanotify group

SYNOPSIS
	int fanotify_init(unsigned int flags, unsigned int event_f_flags, int priority);

DESCRIPTION
	fanotify_init() initializes a new fanotify instance and returns a file
	descriptor associated with the new fanotify event queue.

	The following values can be OR'd into the flags field:

	FAN_NONBLOCK Set the O_NONBLOCK file status flag on the new open file description.
		Using this flag saves extra calls to fcntl(2) to achieve the same
		result.

	FAN_CLOEXEC Set the close-on-exec (FD_CLOEXEC) flag on the new file descriptor.
		See the description of the O_CLOEXEC flag in open(2) for reasons why
		this may be useful.

	The event_f_flags argument is unused and must be set to 0

	The priority argument is unused and must be set to 0

RETURN VALUE
	On success, this system call return a new file descriptor. On error, -1 is
	returned, and errno is set to indicate the error.

ERRORS
	EINVAL An invalid value was specified in flags.

	EINVAL A non-zero valid was passed in event_f_flags or in priority

	ENFILE The system limit on the total number of file descriptors has been reached.

	ENOMEM Insufficient kernel memory is available.

CONFORMING TO
	These system calls are Linux-specific.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.h      |  2 ++
 fs/notify/fanotify/fanotify_user.c | 61 +++++++++++++++++++++++++++++++++++++-
 include/linux/fanotify.h           |  4 +++
 3 files changed, 66 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 50765eb30fe4..dd656cfab1ba 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -4,6 +4,8 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+extern const struct fsnotify_ops fanotify_fsnotify_ops;
+
 static inline bool fanotify_mask_valid(__u32 mask)
 {
 	if (mask & ~((__u32)FAN_ALL_INCOMING_EVENTS))
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf176fc7086b..67c0b5e4a488 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1,13 +1,72 @@
 #include <linux/fcntl.h>
 #include <linux/fs.h>
+#include <linux/anon_inodes.h>
 #include <linux/fsnotify_backend.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 
 #include "fanotify.h"
 
+static int fanotify_release(struct inode *ignored, struct file *file)
+{
+	struct fsnotify_group *group = file->private_data;
+
+	pr_debug("%s: file=%p group=%p\n", __func__, file, group);
+
+	/* matches the fanotify_init->fsnotify_alloc_group */
+	fsnotify_put_group(group);
+
+	return 0;
+}
+
+static const struct file_operations fanotify_fops = {
+	.poll		= NULL,
+	.read		= NULL,
+	.fasync		= NULL,
+	.release	= fanotify_release,
+	.unlocked_ioctl	= NULL,
+	.compat_ioctl	= NULL,
+};
+
+/* fanotify syscalls */
 SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
 		unsigned int, priority)
 {
-	return -ENOSYS;
+	struct fsnotify_group *group;
+	int f_flags, fd;
+
+	pr_debug("%s: flags=%d event_f_flags=%d priority=%d\n",
+		__func__, flags, event_f_flags, priority);
+
+	if (event_f_flags)
+		return -EINVAL;
+	if (priority)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (flags & ~FAN_ALL_INIT_FLAGS)
+		return -EINVAL;
+
+	f_flags = (O_RDONLY | FMODE_NONOTIFY);
+	if (flags & FAN_CLOEXEC)
+		f_flags |= O_CLOEXEC;
+	if (flags & FAN_NONBLOCK)
+		f_flags |= O_NONBLOCK;
+
+	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
+	group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
+	if (fd < 0)
+		goto out_put_group;
+
+	return fd;
+
+out_put_group:
+	fsnotify_put_group(group);
+	return fd;
 }
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index b560f86d1401..00bc6d4fbb58 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -18,6 +18,10 @@
 /* helper events */
 #define FAN_CLOSE		(FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
 
+#define FAN_CLOEXEC		0x00000001
+#define FAN_NONBLOCK		0x00000002
+
+#define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK)
 /*
  * All of the events - we build the list by hand so that we can add flags in
  * the future and not break backward compatibility.  Apps will get only the
-- 
cgit v1.2.3-59-g8ed1b


From bbaa4168b2d2d8cc674e6d35806e8426aef464b8 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:26 -0500
Subject: fanotify: sys_fanotify_mark declartion

This patch simply declares the new sys_fanotify_mark syscall

int fanotify_mark(int fanotify_fd, unsigned int flags, u64_mask,
		  int dfd const char *pathname)

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 arch/x86/ia32/ia32entry.S          | 1 +
 arch/x86/ia32/sys_ia32.c           | 9 +++++++++
 arch/x86/include/asm/sys_ia32.h    | 3 +++
 arch/x86/include/asm/unistd_32.h   | 3 ++-
 arch/x86/include/asm/unistd_64.h   | 2 ++
 arch/x86/kernel/syscall_table_32.S | 1 +
 fs/notify/fanotify/fanotify_user.c | 6 ++++++
 include/linux/syscalls.h           | 3 +++
 kernel/sys_ni.c                    | 1 +
 9 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 586cb3be2e32..17cf65c94804 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -843,4 +843,5 @@ ia32_sys_call_table:
 	.quad sys_perf_event_open
 	.quad compat_sys_recvmmsg
 	.quad sys_fanotify_init
+	.quad sys32_fanotify_mark
 ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 626be156d88d..3d093311d5e2 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
 	return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
 			     ((u64)len_hi << 32) | len_lo);
 }
+
+asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags,
+				    u32 mask_lo, u32 mask_hi,
+				    int fd, const char  __user *pathname)
+{
+	return sys_fanotify_mark(fanotify_fd, flags,
+				 ((u64)mask_hi << 32) | mask_lo,
+				 fd, pathname);
+}
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 3ad421784ae7..cf4e2e381cba 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
 
 /* ia32/ipc32.c */
 asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+
+asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int,
+				    const char __user *);
 #endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 981c7e7ad804..80b799cd74f7 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -344,10 +344,11 @@
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
 #define __NR_fanotify_init	338
+#define __NR_fanotify_mark	339
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 339
+#define NR_syscalls 340
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4f23e04bdb34..5b7b1d585616 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -665,6 +665,8 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 #define __NR_fanotify_init			300
 __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
+#define __NR_fanotify_mark			301
+__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index e38793b50e1d..07ad5eb7cc5c 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -338,3 +338,4 @@ ENTRY(sys_call_table)
 	.long sys_perf_event_open
 	.long sys_recvmmsg
 	.long sys_fanotify_init
+	.long sys_fanotify_mark
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 67c0b5e4a488..55d6e379f2b6 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -70,3 +70,9 @@ out_put_group:
 	fsnotify_put_group(group);
 	return fd;
 }
+
+SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
+		__u64, mask, int, dfd, const char  __user *, pathname)
+{
+	return -ENOSYS;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 198dcc9bd025..5b05c37059e9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -815,6 +815,9 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  size_t);
 asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags,
 				  unsigned int priority);
+asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
+				  u64 mask, int fd,
+				  const char  __user *pathname);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c4adc2decc3..bad369ec5403 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -184,3 +184,4 @@ cond_syscall(sys_perf_event_open);
 
 /* fanotify! */
 cond_syscall(sys_fanotify_init);
+cond_syscall(sys_fanotify_mark);
-- 
cgit v1.2.3-59-g8ed1b


From 2a3edf86040a7e15684525a2aadc29f532c51325 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:26 -0500
Subject: fanotify: fanotify_mark syscall implementation

NAME
	fanotify_mark - add, remove, or modify an fanotify mark on a
filesystem object

SYNOPSIS
	int fanotify_mark(int fanotify_fd, unsigned int flags, u64 mask,
			  int dfd, const char *pathname)

DESCRIPTION
	fanotify_mark() is used to add remove or modify a mark on a filesystem
	object.  Marks are used to indicate that the fanotify group is
	interested in events which occur on that object.  At this point in
	time marks may only be added to files and directories.

	fanotify_fd must be a file descriptor returned by fanotify_init()

	The flags field must contain exactly one of the following:

	FAN_MARK_ADD - or the bits in mask and ignored mask into the mark
	FAN_MARK_REMOVE - bitwise remove the bits in mask and ignored mark
		from the mark

	The following values can be OR'd into the flags field:

	FAN_MARK_DONT_FOLLOW - same meaning as O_NOFOLLOW as described in open(2)
	FAN_MARK_ONLYDIR - same meaning as O_DIRECTORY as described in open(2)

	dfd may be any of the following:
	AT_FDCWD: the object will be lookup up based on pathname similar
		to open(2)

	file descriptor of a directory: if pathname is not NULL the
		object to modify will be lookup up similar to openat(2)

	file descriptor of the final object: if pathname is NULL the
		object to modify will be the object referenced by dfd

	The mask is the bitwise OR of the set of events of interest such as:
	FAN_ACCESS		- object was accessed (read)
	FAN_MODIFY		- object was modified (write)
	FAN_CLOSE_WRITE		- object was writable and was closed
	FAN_CLOSE_NOWRITE	- object was read only and was closed
	FAN_OPEN		- object was opened
	FAN_EVENT_ON_CHILD	- interested in objected that happen to
				  children.  Only relavent when the object
				  is a directory
	FAN_Q_OVERFLOW		- event queue overflowed (not implemented)

RETURN VALUE
	On success, this system call returns 0. On error, -1 is
	returned, and errno is set to indicate the error.

ERRORS
	EINVAL An invalid value was specified in flags.

	EINVAL An invalid value was specified in mask.

	EINVAL An invalid value was specified in ignored_mask.

	EINVAL fanotify_fd is not a file descriptor as returned by
	fanotify_init()

	EBADF fanotify_fd is not a valid file descriptor

	EBADF dfd is not a valid file descriptor and path is NULL.

	ENOTDIR dfd is not a directory and path is not NULL

	EACCESS no search permissions on some part of the path

	ENENT file not found

	ENOMEM Insufficient kernel memory is available.

CONFORMING TO
	These system calls are Linux-specific.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.h      |  18 +++
 fs/notify/fanotify/fanotify_user.c | 239 ++++++++++++++++++++++++++++++++++++-
 include/linux/fanotify.h           |  13 ++
 3 files changed, 269 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index dd656cfab1ba..59c3331a0e81 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -6,6 +6,24 @@
 
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 
+static inline bool fanotify_mark_flags_valid(unsigned int flags)
+{
+	/* must be either and add or a remove */
+	if (!(flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)))
+		return false;
+
+	/* cannot be both add and remove */
+	if ((flags & FAN_MARK_ADD) &&
+	    (flags & FAN_MARK_REMOVE))
+		return false;
+
+	/* cannot have more flags than we know about */
+	if (flags & ~FAN_ALL_MARK_FLAGS)
+		return false;
+
+	return true;
+}
+
 static inline bool fanotify_mask_valid(__u32 mask)
 {
 	if (mask & ~((__u32)FAN_ALL_INCOMING_EVENTS))
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 55d6e379f2b6..bc4fa48157f1 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1,12 +1,18 @@
 #include <linux/fcntl.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/anon_inodes.h>
 #include <linux/fsnotify_backend.h>
+#include <linux/init.h>
+#include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/types.h>
 
 #include "fanotify.h"
 
+static struct kmem_cache *fanotify_mark_cache __read_mostly;
+
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
 	struct fsnotify_group *group = file->private_data;
@@ -28,6 +34,185 @@ static const struct file_operations fanotify_fops = {
 	.compat_ioctl	= NULL,
 };
 
+static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+	kmem_cache_free(fanotify_mark_cache, fsn_mark);
+}
+
+static int fanotify_find_path(int dfd, const char __user *filename,
+			      struct path *path, unsigned int flags)
+{
+	int ret;
+
+	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
+		 dfd, filename, flags);
+
+	if (filename == NULL) {
+		struct file *file;
+		int fput_needed;
+
+		ret = -EBADF;
+		file = fget_light(dfd, &fput_needed);
+		if (!file)
+			goto out;
+
+		ret = -ENOTDIR;
+		if ((flags & FAN_MARK_ONLYDIR) &&
+		    !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) {
+			fput_light(file, fput_needed);
+			goto out;
+		}
+
+		*path = file->f_path;
+		path_get(path);
+		fput_light(file, fput_needed);
+	} else {
+		unsigned int lookup_flags = 0;
+
+		if (!(flags & FAN_MARK_DONT_FOLLOW))
+			lookup_flags |= LOOKUP_FOLLOW;
+		if (flags & FAN_MARK_ONLYDIR)
+			lookup_flags |= LOOKUP_DIRECTORY;
+
+		ret = user_path_at(dfd, filename, lookup_flags, path);
+		if (ret)
+			goto out;
+	}
+
+	/* you can only watch an inode if you have read permissions on it */
+	ret = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (ret)
+		path_put(path);
+out:
+	return ret;
+}
+
+static int fanotify_remove_mark(struct fsnotify_group *group,
+				struct inode *inode,
+				__u32 mask)
+{
+	struct fsnotify_mark *fsn_mark;
+	__u32 new_mask;
+
+	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__,
+		 group, inode, mask);
+
+	fsn_mark = fsnotify_find_mark(group, inode);
+	if (!fsn_mark)
+		return -ENOENT;
+
+	spin_lock(&fsn_mark->lock);
+	fsn_mark->mask &= ~mask;
+	new_mask = fsn_mark->mask;
+	spin_unlock(&fsn_mark->lock);
+
+	if (!new_mask)
+		fsnotify_destroy_mark(fsn_mark);
+	else
+		fsnotify_recalc_inode_mask(inode);
+
+	fsnotify_recalc_group_mask(group);
+
+	/* matches the fsnotify_find_mark() */
+	fsnotify_put_mark(fsn_mark);
+
+	return 0;
+}
+
+static int fanotify_add_mark(struct fsnotify_group *group,
+			     struct inode *inode,
+			     __u32 mask)
+{
+	struct fsnotify_mark *fsn_mark;
+	__u32 old_mask, new_mask;
+	int ret;
+
+	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__,
+		 group, inode, mask);
+
+	fsn_mark = fsnotify_find_mark(group, inode);
+	if (!fsn_mark) {
+		struct fsnotify_mark *new_fsn_mark;
+
+		ret = -ENOMEM;
+		new_fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+		if (!new_fsn_mark)
+			goto out;
+
+		fsnotify_init_mark(new_fsn_mark, fanotify_free_mark);
+		ret = fsnotify_add_mark(new_fsn_mark, group, inode, 0);
+		if (ret) {
+			fanotify_free_mark(new_fsn_mark);
+			goto out;
+		}
+
+		fsn_mark = new_fsn_mark;
+	}
+
+	ret = 0;
+
+	spin_lock(&fsn_mark->lock);
+	old_mask = fsn_mark->mask;
+	fsn_mark->mask |= mask;
+	new_mask = fsn_mark->mask;
+	spin_unlock(&fsn_mark->lock);
+
+	/* we made changes to a mask, update the group mask and the inode mask
+	 * so things happen quickly. */
+	if (old_mask != new_mask) {
+		/* more bits in old than in new? */
+		int dropped = (old_mask & ~new_mask);
+		/* more bits in this mark than the inode's mask? */
+		int do_inode = (new_mask & ~inode->i_fsnotify_mask);
+		/* more bits in this mark than the group? */
+		int do_group = (new_mask & ~group->mask);
+
+		/* update the inode with this new mark */
+		if (dropped || do_inode)
+			fsnotify_recalc_inode_mask(inode);
+
+		/* update the group mask with the new mask */
+		if (dropped || do_group)
+			fsnotify_recalc_group_mask(group);
+	}
+
+	/* match the init or the find.... */
+	fsnotify_put_mark(fsn_mark);
+out:
+	return ret;
+}
+
+static int fanotify_update_mark(struct fsnotify_group *group,
+				struct inode *inode, int flags,
+				__u32 mask)
+{
+	pr_debug("%s: group=%p inode=%p flags=%x mask=%x\n", __func__,
+		 group, inode, flags, mask);
+
+	if (flags & FAN_MARK_ADD)
+		fanotify_add_mark(group, inode, mask);
+	else if (flags & FAN_MARK_REMOVE)
+		fanotify_remove_mark(group, inode, mask);
+	else
+		BUG();
+
+	return 0;
+}
+
+static bool fanotify_mark_validate_input(int flags,
+					 __u32 mask)
+{
+	pr_debug("%s: flags=%x mask=%x\n", __func__, flags, mask);
+
+	/* are flags valid of this operation? */
+	if (!fanotify_mark_flags_valid(flags))
+		return false;
+	/* is the mask valid? */
+	if (!fanotify_mask_valid(mask))
+		return false;
+	return true;
+}
+
 /* fanotify syscalls */
 SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
 		unsigned int, priority)
@@ -74,5 +259,57 @@ out_put_group:
 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
 		__u64, mask, int, dfd, const char  __user *, pathname)
 {
-	return -ENOSYS;
+	struct inode *inode;
+	struct fsnotify_group *group;
+	struct file *filp;
+	struct path path;
+	int ret, fput_needed;
+
+	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
+		 __func__, fanotify_fd, flags, dfd, pathname, mask);
+
+	/* we only use the lower 32 bits as of right now. */
+	if (mask & ((__u64)0xffffffff << 32))
+		return -EINVAL;
+
+	if (!fanotify_mark_validate_input(flags, mask))
+		return -EINVAL;
+
+	filp = fget_light(fanotify_fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an fanotify instance */
+	ret = -EINVAL;
+	if (unlikely(filp->f_op != &fanotify_fops))
+		goto fput_and_out;
+
+	ret = fanotify_find_path(dfd, pathname, &path, flags);
+	if (ret)
+		goto fput_and_out;
+
+	/* inode held in place by reference to path; group by fget on fd */
+	inode = path.dentry->d_inode;
+	group = filp->private_data;
+
+	/* create/update an inode mark */
+	ret = fanotify_update_mark(group, inode, flags, mask);
+
+	path_put(&path);
+fput_and_out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+/*
+ * fanotify_user_setup - Our initialization function.  Note that we cannnot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init fanotify_user_setup(void)
+{
+	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+
+	return 0;
 }
+device_initcall(fanotify_user_setup);
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 00bc6d4fbb58..95aeea2a3ca6 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -18,10 +18,23 @@
 /* helper events */
 #define FAN_CLOSE		(FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
 
+/* flags used for fanotify_init() */
 #define FAN_CLOEXEC		0x00000001
 #define FAN_NONBLOCK		0x00000002
 
 #define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK)
+
+/* flags used for fanotify_modify_mark() */
+#define FAN_MARK_ADD		0x00000001
+#define FAN_MARK_REMOVE		0x00000002
+#define FAN_MARK_DONT_FOLLOW	0x00000004
+#define FAN_MARK_ONLYDIR	0x00000008
+
+#define FAN_ALL_MARK_FLAGS	(FAN_MARK_ADD |\
+				 FAN_MARK_REMOVE |\
+				 FAN_MARK_DONT_FOLLOW |\
+				 FAN_MARK_ONLYDIR)
+
 /*
  * All of the events - we build the list by hand so that we can add flags in
  * the future and not break backward compatibility.  Apps will get only the
-- 
cgit v1.2.3-59-g8ed1b


From a1014f102322398e67524b68b3300acf384e6c1f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:26 -0500
Subject: fanotify: send events using read

Send events to userspace by reading the file descriptor from fanotify_init().
One will get blocks of data which look like:

struct fanotify_event_metadata {
	__u32 event_len;
	__u32 vers;
	__s32 fd;
	__u64 mask;
	__s64 pid;
	__u64 cookie;
} __attribute__ ((packed));

Simple code to retrieve and deal with events is below

	while ((len = read(fan_fd, buf, sizeof(buf))) > 0) {
		struct fanotify_event_metadata *metadata;

		metadata = (void *)buf;
		while(FAN_EVENT_OK(metadata, len)) {
			[PROCESS HERE!!]
			if (metadata->fd >= 0 && close(metadata->fd) != 0)
				goto fail;
			metadata = FAN_EVENT_NEXT(metadata, len);
		}
	}

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.h      |   5 +
 fs/notify/fanotify/fanotify_user.c | 220 ++++++++++++++++++++++++++++++++++++-
 include/linux/fanotify.h           |  24 ++++
 3 files changed, 245 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 59c3331a0e81..5608783c6bca 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -30,3 +30,8 @@ static inline bool fanotify_mask_valid(__u32 mask)
 		return false;
 	return true;
 }
+
+static inline __u32 fanotify_outgoing_mask(__u32 mask)
+{
+	return mask & FAN_ALL_OUTGOING_EVENTS;
+}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index bc4fa48157f1..a99550f83f8a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -4,15 +4,202 @@
 #include <linux/anon_inodes.h>
 #include <linux/fsnotify_backend.h>
 #include <linux/init.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/poll.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include <asm/ioctls.h>
 
 #include "fanotify.h"
 
 static struct kmem_cache *fanotify_mark_cache __read_mostly;
 
+/*
+ * Get an fsnotify notification event if one exists and is small
+ * enough to fit in "count". Return an error pointer if the count
+ * is not large enough.
+ *
+ * Called with the group->notification_mutex held.
+ */
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+					    size_t count)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+
+	if (fsnotify_notify_queue_is_empty(group))
+		return NULL;
+
+	if (FAN_EVENT_METADATA_LEN > count)
+		return ERR_PTR(-EINVAL);
+
+	/* held the notification_mutex the whole time, so this is the
+	 * same event we peeked above */
+	return fsnotify_remove_notify_event(group);
+}
+
+static int create_and_fill_fd(struct fsnotify_group *group,
+			      struct fanotify_event_metadata *metadata,
+			      struct fsnotify_event *event)
+{
+	int client_fd;
+	struct dentry *dentry;
+	struct vfsmount *mnt;
+	struct file *new_file;
+
+	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, group,
+		 metadata, event);
+
+	client_fd = get_unused_fd();
+	if (client_fd < 0)
+		return client_fd;
+
+	if (event->data_type != FSNOTIFY_EVENT_PATH) {
+		WARN_ON(1);
+		put_unused_fd(client_fd);
+		return -EINVAL;
+	}
+
+	/*
+	 * we need a new file handle for the userspace program so it can read even if it was
+	 * originally opened O_WRONLY.
+	 */
+	dentry = dget(event->path.dentry);
+	mnt = mntget(event->path.mnt);
+	/* it's possible this event was an overflow event.  in that case dentry and mnt
+	 * are NULL;  That's fine, just don't call dentry open */
+	if (dentry && mnt)
+		new_file = dentry_open(dentry, mnt,
+				       O_RDONLY | O_LARGEFILE | FMODE_NONOTIFY,
+				       current_cred());
+	else
+		new_file = ERR_PTR(-EOVERFLOW);
+	if (IS_ERR(new_file)) {
+		/*
+		 * we still send an event even if we can't open the file.  this
+		 * can happen when say tasks are gone and we try to open their
+		 * /proc files or we try to open a WRONLY file like in sysfs
+		 * we just send the errno to userspace since there isn't much
+		 * else we can do.
+		 */
+		put_unused_fd(client_fd);
+		client_fd = PTR_ERR(new_file);
+	} else {
+		fd_install(client_fd, new_file);
+	}
+
+	metadata->fd = client_fd;
+
+	return 0;
+}
+
+static ssize_t fill_event_metadata(struct fsnotify_group *group,
+				   struct fanotify_event_metadata *metadata,
+				   struct fsnotify_event *event)
+{
+	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
+		 group, metadata, event);
+
+	metadata->event_len = FAN_EVENT_METADATA_LEN;
+	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->mask = fanotify_outgoing_mask(event->mask);
+
+	return create_and_fill_fd(group, metadata, event);
+
+}
+
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+				  struct fsnotify_event *event,
+				  char __user *buf)
+{
+	struct fanotify_event_metadata fanotify_event_metadata;
+	int ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	ret = fill_event_metadata(group, &fanotify_event_metadata, event);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
+		return -EFAULT;
+
+	return FAN_EVENT_METADATA_LEN;
+}
+
+/* intofiy userspace file descriptor functions */
+static unsigned int fanotify_poll(struct file *file, poll_table *wait)
+{
+	struct fsnotify_group *group = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &group->notification_waitq, wait);
+	mutex_lock(&group->notification_mutex);
+	if (!fsnotify_notify_queue_is_empty(group))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&group->notification_mutex);
+
+	return ret;
+}
+
+static ssize_t fanotify_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *kevent;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	group = file->private_data;
+
+	pr_debug("%s: group=%p\n", __func__, group);
+
+	while (1) {
+		prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&group->notification_mutex);
+		kevent = get_one_event(group, count);
+		mutex_unlock(&group->notification_mutex);
+
+		if (kevent) {
+			ret = PTR_ERR(kevent);
+			if (IS_ERR(kevent))
+				break;
+			ret = copy_event_to_user(group, kevent, buf);
+			fsnotify_put_event(kevent);
+			if (ret < 0)
+				break;
+			buf += ret;
+			count -= ret;
+			continue;
+		}
+
+		ret = -EAGAIN;
+		if (file->f_flags & O_NONBLOCK)
+			break;
+		ret = -EINTR;
+		if (signal_pending(current))
+			break;
+
+		if (start != buf)
+			break;
+
+		schedule();
+	}
+
+	finish_wait(&group->notification_waitq, &wait);
+	if (start != buf && ret != -EFAULT)
+		ret = buf - start;
+	return ret;
+}
+
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
 	struct fsnotify_group *group = file->private_data;
@@ -25,13 +212,38 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	return 0;
 }
 
+static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event_holder *holder;
+	void __user *p;
+	int ret = -ENOTTY;
+	size_t send_len = 0;
+
+	group = file->private_data;
+
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		mutex_lock(&group->notification_mutex);
+		list_for_each_entry(holder, &group->notification_list, event_list)
+			send_len += FAN_EVENT_METADATA_LEN;
+		mutex_unlock(&group->notification_mutex);
+		ret = put_user(send_len, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
 static const struct file_operations fanotify_fops = {
-	.poll		= NULL,
-	.read		= NULL,
+	.poll		= fanotify_poll,
+	.read		= fanotify_read,
 	.fasync		= NULL,
 	.release	= fanotify_release,
-	.unlocked_ioctl	= NULL,
-	.compat_ioctl	= NULL,
+	.unlocked_ioctl	= fanotify_ioctl,
+	.compat_ioctl	= fanotify_ioctl,
 };
 
 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 95aeea2a3ca6..c1c66162a46c 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -51,6 +51,30 @@
  */
 #define FAN_ALL_INCOMING_EVENTS	(FAN_ALL_EVENTS |\
 				 FAN_EVENT_ON_CHILD)
+
+#define FAN_ALL_OUTGOING_EVENTS	(FAN_ALL_EVENTS |\
+				 FAN_Q_OVERFLOW)
+
+#define FANOTIFY_METADATA_VERSION	1
+
+struct fanotify_event_metadata {
+	__u32 event_len;
+	__u32 vers;
+	__s32 fd;
+	__u64 mask;
+} __attribute__ ((packed));
+
+/* Helper functions to deal with fanotify_event_metadata buffers */
+#define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))
+
+#define FAN_EVENT_NEXT(meta, len) ((len) -= (meta)->event_len, \
+				   (struct fanotify_event_metadata*)(((char *)(meta)) + \
+				   (meta)->event_len))
+
+#define FAN_EVENT_OK(meta, len)	((long)(len) >= (long)FAN_EVENT_METADATA_LEN && \
+				(long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \
+				(long)(meta)->event_len <= (long)(len))
+
 #ifdef __KERNEL__
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3-59-g8ed1b


From 32c3263221bd63316815286dccacdc7abfd7f3c4 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Thu, 17 Dec 2009 21:24:27 -0500
Subject: fanotify: Add pids to events

Pass the process identifiers of the triggering processes to fanotify
listeners: this information is useful for event filtering and logging.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c      | 5 +++--
 fs/notify/fanotify/fanotify_user.c | 1 +
 fs/notify/notification.c           | 3 +++
 include/linux/fanotify.h           | 1 +
 include/linux/fsnotify_backend.h   | 1 +
 5 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 5b0b6b485a9c..881067dc7923 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -10,8 +10,9 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 {
 	pr_debug("%s: old=%p new=%p\n", __func__, old, new);
 
-	if ((old->to_tell == new->to_tell) &&
-	    (old->data_type == new->data_type)) {
+	if (old->to_tell == new->to_tell &&
+	    old->data_type == new->data_type &&
+	    old->tgid == new->tgid) {
 		switch (old->data_type) {
 		case (FSNOTIFY_EVENT_PATH):
 			if ((old->path.mnt == new->path.mnt) &&
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf9c30009825..66e38fc052b2 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -103,6 +103,7 @@ static ssize_t fill_event_metadata(struct fsnotify_group *group,
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
 	metadata->mask = fanotify_outgoing_mask(event->mask);
+	metadata->pid = pid_vnr(event->tgid);
 	metadata->fd = create_fd(group, event);
 
 	return metadata->fd;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 066f1f988bac..7fc8d004084c 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -93,6 +93,7 @@ void fsnotify_put_event(struct fsnotify_event *event)
 		BUG_ON(!list_empty(&event->private_data_list));
 
 		kfree(event->file_name);
+		put_pid(event->tgid);
 		kmem_cache_free(fsnotify_event_cachep, event);
 	}
 }
@@ -346,6 +347,7 @@ struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
 			return NULL;
 		}
 	}
+	event->tgid = get_pid(old_event->tgid);
 	if (event->data_type == FSNOTIFY_EVENT_PATH)
 		path_get(&event->path);
 
@@ -385,6 +387,7 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		event->name_len = strlen(event->file_name);
 	}
 
+	event->tgid = get_pid(task_tgid(current));
 	event->sync_cookie = cookie;
 	event->to_tell = to_tell;
 	event->data_type = data_type;
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index c1c66162a46c..5f633af4d1b0 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -62,6 +62,7 @@ struct fanotify_event_metadata {
 	__u32 vers;
 	__s32 fd;
 	__u64 mask;
+	__s64 pid;
 } __attribute__ ((packed));
 
 /* Helper functions to deal with fanotify_event_metadata buffers */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index ff654c1932f2..7d93572ec568 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -221,6 +221,7 @@ struct fsnotify_event {
 	u32 sync_cookie;	/* used to corrolate events, namely inotify mv events */
 	char *file_name;
 	size_t name_len;
+	struct pid *tgid;
 
 	struct list_head private_data_list;	/* groups can store private data here */
 };
-- 
cgit v1.2.3-59-g8ed1b


From 5444e2981c31d0ed7465475e451b8437084337e5 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:27 -0500
Subject: fsnotify: split generic and inode specific mark code

currently all marking is done by functions in inode-mark.c.  Some of this
is pretty generic and should be instead done in a generic function and we
should only put the inode specific code in inode-mark.c

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/Makefile                   |   3 +-
 fs/notify/dnotify/dnotify.c          |  12 +-
 fs/notify/fanotify/fanotify.c        |   2 +-
 fs/notify/fanotify/fanotify_user.c   |   8 +-
 fs/notify/fsnotify.h                 |   7 +
 fs/notify/inode_mark.c               | 246 +++--------------------------
 fs/notify/inotify/inotify_fsnotify.c |   4 +-
 fs/notify/inotify/inotify_user.c     |   4 +-
 fs/notify/mark.c                     | 294 +++++++++++++++++++++++++++++++++++
 include/linux/fsnotify_backend.h     |   5 +-
 kernel/audit_tree.c                  |   8 +-
 kernel/audit_watch.c                 |   6 +-
 12 files changed, 347 insertions(+), 252 deletions(-)
 create mode 100644 fs/notify/mark.c

(limited to 'include/linux')

diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 396a38779371..8f7f3b024a2e 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,5 @@
-obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o
+obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o \
+				   mark.o
 
 obj-y			+= dnotify/
 obj-y			+= inotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index cac2eb896639..69f42df9ba45 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -95,7 +95,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 
 	to_tell = event->to_tell;
 
-	fsn_mark = fsnotify_find_mark(group, to_tell);
+	fsn_mark = fsnotify_find_inode_mark(group, to_tell);
 	if (unlikely(!fsn_mark))
 		return 0;
 	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
@@ -143,14 +143,14 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	if (!S_ISDIR(inode->i_mode))
 		return false;
 
-	fsn_mark = fsnotify_find_mark(group, inode);
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark)
 		return false;
 
 	mask = (mask & ~FS_EVENT_ON_CHILD);
 	send = (mask & fsn_mark->mask);
 
-	fsnotify_put_mark(fsn_mark); /* matches fsnotify_find_mark */
+	fsnotify_put_mark(fsn_mark); /* matches fsnotify_find_inode_mark */
 
 	return send;
 }
@@ -193,7 +193,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
-	fsn_mark = fsnotify_find_mark(dnotify_group, inode);
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
 	if (!fsn_mark)
 		return;
 	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
@@ -346,12 +346,12 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	mutex_lock(&dnotify_mark_mutex);
 
 	/* add the new_fsn_mark or find an old one. */
-	fsn_mark = fsnotify_find_mark(dnotify_group, inode);
+	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
 	if (fsn_mark) {
 		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, 0);
+		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
 		spin_lock(&new_fsn_mark->lock);
 		fsn_mark = new_fsn_mark;
 		dn_mark = new_dn_mark;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 881067dc7923..aa5e92661142 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -118,7 +118,7 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, struct inod
 	if (data_type != FSNOTIFY_EVENT_PATH)
 		return false;
 
-	fsn_mark = fsnotify_find_mark(group, inode);
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark)
 		return false;
 
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 66e38fc052b2..05351936a725 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -305,7 +305,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
 	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__,
 		 group, inode, mask);
 
-	fsn_mark = fsnotify_find_mark(group, inode);
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark)
 		return -ENOENT;
 
@@ -321,7 +321,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
 
 	fsnotify_recalc_group_mask(group);
 
-	/* matches the fsnotify_find_mark() */
+	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
 
 	return 0;
@@ -338,7 +338,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
 	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__,
 		 group, inode, mask);
 
-	fsn_mark = fsnotify_find_mark(group, inode);
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
 		struct fsnotify_mark *new_fsn_mark;
 
@@ -348,7 +348,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
 			goto out;
 
 		fsnotify_init_mark(new_fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(new_fsn_mark, group, inode, 0);
+		ret = fsnotify_add_mark(new_fsn_mark, group, inode, NULL, 0);
 		if (ret) {
 			fanotify_free_mark(new_fsn_mark);
 			goto out;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 2ba59158969f..7c7a904b802d 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -20,6 +20,11 @@ extern __u32 fsnotify_vfsmount_mask;
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
+/* add a mark to an inode */
+extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
+				   struct fsnotify_group *group, struct inode *inode,
+				   int allow_dups);
+
 /* add a group to the inode group list */
 extern void fsnotify_add_inode_group(struct fsnotify_group *group);
 /* add a group to the vfsmount group list */
@@ -27,6 +32,8 @@ extern void fsnotify_add_vfsmount_group(struct fsnotify_group *group);
 /* final kfree of a group */
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
 
+/* inode specific destruction of a mark */
+extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
 /* run the list of all marks associated with inode and flag them to be freed */
 extern void fsnotify_clear_marks_by_inode(struct inode *inode);
 /*
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index ba6f9833561b..c925579ba011 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -16,72 +16,6 @@
  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-/*
- * fsnotify inode mark locking/lifetime/and refcnting
- *
- * REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object.  The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guarenteed to survive until the reference is dropped.
- *
- * LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
- *
- * mark->lock
- * group->mark_lock
- * inode->i_lock
- *
- * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the marks_list anchored inside a given group
- * and each mark is hooked via the g_list.  It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
- *
- * inode->i_lock protects the i_fsnotify_marks list anchored inside a
- * given inode and each mark is hooked via the i_list. (and sorta the
- * free_i_list)
- *
- *
- * LIFETIME:
- * Inode marks survive between when they are added to an inode and when their
- * refcnt==0.
- *
- * The inode mark can be cleared for a number of different reasons including:
- * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
- * - The inode is being evicted from cache. (fsnotify_inode_delete)
- * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
- * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
- * - The fsnotify_group associated with the mark is going away and all such marks
- *   need to be cleaned up. (fsnotify_clear_marks_by_group)
- *
- * Worst case we are given an inode and need to clean up all the marks on that
- * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
- * mark on the list we take a reference (so the mark can't disappear under us).
- * We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list;  At this point we no
- * longer fear anything finding the mark using the inode's list of marks.
- *
- * We can safely and locklessly run the private list on the stack of everything
- * we just unattached from the original inode.  For each mark on the private list
- * we grab the mark-> and can thus dereference mark->group and mark->inode.  If
- * we see the group and inode are not NULL we take those locks.  Now holding all
- * 3 locks we can completely remove the mark from other tasks finding it in the
- * future.  Remember, 10 things might already be referencing this mark, but they
- * better be holding a ref.  We drop our reference we took before we unhooked it
- * from the inode.  When the ref hits 0 we can free the mark.
- *
- * Very similarly for freeing by group, except we use free_g_list.
- *
- * This has the very interesting property of being able to run concurrently with
- * any (or all) other directions.
- */
-
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -95,17 +29,6 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
-void fsnotify_get_mark(struct fsnotify_mark *mark)
-{
-	atomic_inc(&mark->refcnt);
-}
-
-void fsnotify_put_mark(struct fsnotify_mark *mark)
-{
-	if (atomic_dec_and_test(&mark->refcnt))
-		mark->free_mark(mark);
-}
-
 /*
  * Recalculate the mask of events relevant to a given inode locked.
  */
@@ -135,44 +58,18 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
 	__fsnotify_update_child_dentry_flags(inode);
 }
 
-/*
- * Any time a mark is getting freed we end up here.
- * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the mark->lock
- */
-void fsnotify_destroy_mark(struct fsnotify_mark *mark)
+void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
 {
-	struct fsnotify_group *group;
-	struct inode *inode;
-
-	spin_lock(&mark->lock);
+	struct inode *inode = mark->i.inode;
 
-	group = mark->group;
-	inode = mark->i.inode;
+	assert_spin_locked(&mark->lock);
+	assert_spin_locked(&mark->group->mark_lock);
 
-	BUG_ON(group && !inode);
-	BUG_ON(!group && inode);
-
-	/* if !group something else already marked this to die */
-	if (!group) {
-		spin_unlock(&mark->lock);
-		return;
-	}
-
-	/* 1 from caller and 1 for being on i_list/g_list */
-	BUG_ON(atomic_read(&mark->refcnt) < 2);
-
-	spin_lock(&group->mark_lock);
 	spin_lock(&inode->i_lock);
 
 	hlist_del_init(&mark->i.i_list);
 	mark->i.inode = NULL;
 
-	list_del_init(&mark->g_list);
-	mark->group = NULL;
-
-	fsnotify_put_mark(mark); /* for i_list and g_list */
-
 	/*
 	 * this mark is now off the inode->i_fsnotify_marks list and we
 	 * hold the inode->i_lock, so this is the perfect time to update the
@@ -181,61 +78,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 	fsnotify_recalc_inode_mask_locked(inode);
 
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&group->mark_lock);
-	spin_unlock(&mark->lock);
-
-	/*
-	 * Some groups like to know that marks are being freed.  This is a
-	 * callback to the group function to let it know that this mark
-	 * is being freed.
-	 */
-	if (group->ops->freeing_mark)
-		group->ops->freeing_mark(mark, group);
-
-	/*
-	 * __fsnotify_update_child_dentry_flags(inode);
-	 *
-	 * I really want to call that, but we can't, we have no idea if the inode
-	 * still exists the second we drop the mark->lock.
-	 *
-	 * The next time an event arrive to this inode from one of it's children
-	 * __fsnotify_parent will see that the inode doesn't care about it's
-	 * children and will update all of these flags then.  So really this
-	 * is just a lazy update (and could be a perf win...)
-	 */
-
-
-	iput(inode);
-
-	/*
-	 * it's possible that this group tried to destroy itself, but this
-	 * this mark was simultaneously being freed by inode.  If that's the
-	 * case, we finish freeing the group here.
-	 */
-	if (unlikely(atomic_dec_and_test(&group->num_marks)))
-		fsnotify_final_destroy_group(group);
-}
-
-/*
- * Given a group, destroy all of the marks associated with that group.
- */
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
-{
-	struct fsnotify_mark *lmark, *mark;
-	LIST_HEAD(free_list);
-
-	spin_lock(&group->mark_lock);
-	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
-		list_add(&mark->free_g_list, &free_list);
-		list_del_init(&mark->g_list);
-		fsnotify_get_mark(mark);
-	}
-	spin_unlock(&group->mark_lock);
-
-	list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
-		fsnotify_destroy_mark(mark);
-		fsnotify_put_mark(mark);
-	}
 }
 
 /*
@@ -261,8 +103,12 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
 	}
 }
 
-static struct fsnotify_mark *fsnotify_find_mark_locked(struct fsnotify_group *group,
-						       struct inode *inode)
+/*
+ * given a group and inode, find the mark associated with that combination.
+ * if found take a reference to that mark and return it, else return NULL
+ */
+struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group,
+						      struct inode *inode)
 {
 	struct fsnotify_mark *mark;
 	struct hlist_node *pos;
@@ -282,50 +128,26 @@ static struct fsnotify_mark *fsnotify_find_mark_locked(struct fsnotify_group *gr
  * given a group and inode, find the mark associated with that combination.
  * if found take a reference to that mark and return it, else return NULL
  */
-struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_group *group,
-					 struct inode *inode)
+struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
+					       struct inode *inode)
 {
 	struct fsnotify_mark *mark;
 
 	spin_lock(&inode->i_lock);
-	mark = fsnotify_find_mark_locked(group, inode);
+	mark = fsnotify_find_inode_mark_locked(group, inode);
 	spin_unlock(&inode->i_lock);
 
 	return mark;
 }
 
-void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
-{
-	assert_spin_locked(&old->lock);
-	new->i.inode = old->i.inode;
-	new->group = old->group;
-	new->mask = old->mask;
-	new->free_mark = old->free_mark;
-}
-
-/*
- * Nothing fancy, just initialize lists and locks and counters.
- */
-void fsnotify_init_mark(struct fsnotify_mark *mark,
-			void (*free_mark)(struct fsnotify_mark *mark))
-{
-	spin_lock_init(&mark->lock);
-	atomic_set(&mark->refcnt, 1);
-	INIT_HLIST_NODE(&mark->i.i_list);
-	mark->group = NULL;
-	mark->mask = 0;
-	mark->i.inode = NULL;
-	mark->free_mark = free_mark;
-}
-
 /*
  * Attach an initialized mark mark to a given group and inode.
  * These marks may be used for the fsnotify backend to determine which
  * event types should be delivered to which group and for which inodes.
  */
-int fsnotify_add_mark(struct fsnotify_mark *mark,
-		      struct fsnotify_group *group, struct inode *inode,
-		      int allow_dups)
+int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
+			    struct fsnotify_group *group, struct inode *inode,
+			    int allow_dups)
 {
 	struct fsnotify_mark *lmark = NULL;
 	int ret = 0;
@@ -336,56 +158,26 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
 
 	mark->flags = FSNOTIFY_MARK_FLAG_INODE;
 
-	/*
-	 * if this group isn't being testing for inode type events we need
-	 * to start testing
-	 */
-	if (unlikely(list_empty(&group->inode_group_list)))
-		fsnotify_add_inode_group(group);
-	/*
-	 * XXX This is where we could also do the fsnotify_add_vfsmount_group
-	 * if we are setting and vfsmount mark....
-
-	if (unlikely(list_empty(&group->vfsmount_group_list)))
-		fsnotify_add_vfsmount_group(group);
-	 */
+	assert_spin_locked(&mark->lock);
+	assert_spin_locked(&group->mark_lock);
 
-	/*
-	 * LOCKING ORDER!!!!
-	 * mark->lock
-	 * group->mark_lock
-	 * inode->i_lock
-	 */
-	spin_lock(&mark->lock);
-	spin_lock(&group->mark_lock);
 	spin_lock(&inode->i_lock);
 
 	if (!allow_dups)
-		lmark = fsnotify_find_mark_locked(group, inode);
+		lmark = fsnotify_find_inode_mark_locked(group, inode);
 	if (!lmark) {
-		mark->group = group;
 		mark->i.inode = inode;
 
 		hlist_add_head(&mark->i.i_list, &inode->i_fsnotify_marks);
-		list_add(&mark->g_list, &group->marks_list);
-
-		fsnotify_get_mark(mark); /* for i_list and g_list */
-
-		atomic_inc(&group->num_marks);
 
 		fsnotify_recalc_inode_mask_locked(inode);
 	}
 
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&group->mark_lock);
-	spin_unlock(&mark->lock);
 
 	if (lmark) {
 		ret = -EEXIST;
 		iput(inode);
-		fsnotify_put_mark(lmark);
-	} else {
-		__fsnotify_update_child_dentry_flags(inode);
 	}
 
 	return ret;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index cc8f6bcbb4a3..1d237e1bf7b1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -97,7 +97,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 
 	to_tell = event->to_tell;
 
-	fsn_mark = fsnotify_find_mark(group, to_tell);
+	fsn_mark = fsnotify_find_inode_mark(group, to_tell);
 	/* race with watch removal?  We already passes should_send */
 	if (unlikely(!fsn_mark))
 		return 0;
@@ -145,7 +145,7 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 	struct fsnotify_mark *fsn_mark;
 	bool send;
 
-	fsn_mark = fsnotify_find_mark(group, inode);
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark)
 		return false;
 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ad5a1ea7827e..a12315a7553d 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -566,7 +566,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	if (unlikely(!mask))
 		return -EINVAL;
 
-	fsn_mark = fsnotify_find_mark(group, inode);
+	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark)
 		return -ENOENT;
 
@@ -644,7 +644,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, 0);
+	ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
 		inotify_remove_from_idr(group, tmp_i_mark);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
new file mode 100644
index 000000000000..e56e8768d676
--- /dev/null
+++ b/fs/notify/mark.c
@@ -0,0 +1,294 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * fsnotify inode mark locking/lifetime/and refcnting
+ *
+ * REFCNT:
+ * The mark->refcnt tells how many "things" in the kernel currently are
+ * referencing this object.  The object typically will live inside the kernel
+ * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
+ * which can find this object holding the appropriete locks, can take a reference
+ * and the object itself is guarenteed to survive until the reference is dropped.
+ *
+ * LOCKING:
+ * There are 3 spinlocks involved with fsnotify inode marks and they MUST
+ * be taken in order as follows:
+ *
+ * mark->lock
+ * group->mark_lock
+ * inode->i_lock
+ *
+ * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
+ * that lock to dereference either of these things (they could be NULL even with
+ * the lock)
+ *
+ * group->mark_lock protects the marks_list anchored inside a given group
+ * and each mark is hooked via the g_list.  It also sorta protects the
+ * free_g_list, which when used is anchored by a private list on the stack of the
+ * task which held the group->mark_lock.
+ *
+ * inode->i_lock protects the i_fsnotify_marks list anchored inside a
+ * given inode and each mark is hooked via the i_list. (and sorta the
+ * free_i_list)
+ *
+ *
+ * LIFETIME:
+ * Inode marks survive between when they are added to an inode and when their
+ * refcnt==0.
+ *
+ * The inode mark can be cleared for a number of different reasons including:
+ * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
+ * - The inode is being evicted from cache. (fsnotify_inode_delete)
+ * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
+ * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
+ * - The fsnotify_group associated with the mark is going away and all such marks
+ *   need to be cleaned up. (fsnotify_clear_marks_by_group)
+ *
+ * Worst case we are given an inode and need to clean up all the marks on that
+ * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
+ * mark on the list we take a reference (so the mark can't disappear under us).
+ * We remove that mark form the inode's list of marks and we add this mark to a
+ * private list anchored on the stack using i_free_list;  At this point we no
+ * longer fear anything finding the mark using the inode's list of marks.
+ *
+ * We can safely and locklessly run the private list on the stack of everything
+ * we just unattached from the original inode.  For each mark on the private list
+ * we grab the mark-> and can thus dereference mark->group and mark->inode.  If
+ * we see the group and inode are not NULL we take those locks.  Now holding all
+ * 3 locks we can completely remove the mark from other tasks finding it in the
+ * future.  Remember, 10 things might already be referencing this mark, but they
+ * better be holding a ref.  We drop our reference we took before we unhooked it
+ * from the inode.  When the ref hits 0 we can free the mark.
+ *
+ * Very similarly for freeing by group, except we use free_g_list.
+ *
+ * This has the very interesting property of being able to run concurrently with
+ * any (or all) other directions.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/writeback.h> /* for inode_lock */
+
+#include <asm/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+void fsnotify_get_mark(struct fsnotify_mark *mark)
+{
+	atomic_inc(&mark->refcnt);
+}
+
+void fsnotify_put_mark(struct fsnotify_mark *mark)
+{
+	if (atomic_dec_and_test(&mark->refcnt))
+		mark->free_mark(mark);
+}
+
+/*
+ * Any time a mark is getting freed we end up here.
+ * The caller had better be holding a reference to this mark so we don't actually
+ * do the final put under the mark->lock
+ */
+void fsnotify_destroy_mark(struct fsnotify_mark *mark)
+{
+	struct fsnotify_group *group;
+	struct inode *inode;
+
+	spin_lock(&mark->lock);
+
+	group = mark->group;
+	inode = mark->i.inode;
+
+	BUG_ON(group && !inode);
+	BUG_ON(!group && inode);
+
+	/* if !group something else already marked this to die */
+	if (!group) {
+		spin_unlock(&mark->lock);
+		return;
+	}
+
+	/* 1 from caller and 1 for being on i_list/g_list */
+	BUG_ON(atomic_read(&mark->refcnt) < 2);
+
+	spin_lock(&group->mark_lock);
+
+	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
+		fsnotify_destroy_inode_mark(mark);
+	else
+		BUG();
+
+	list_del_init(&mark->g_list);
+	mark->group = NULL;
+
+	fsnotify_put_mark(mark); /* for i_list and g_list */
+
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&mark->lock);
+
+	/*
+	 * Some groups like to know that marks are being freed.  This is a
+	 * callback to the group function to let it know that this mark
+	 * is being freed.
+	 */
+	if (group->ops->freeing_mark)
+		group->ops->freeing_mark(mark, group);
+
+	/*
+	 * __fsnotify_update_child_dentry_flags(inode);
+	 *
+	 * I really want to call that, but we can't, we have no idea if the inode
+	 * still exists the second we drop the mark->lock.
+	 *
+	 * The next time an event arrive to this inode from one of it's children
+	 * __fsnotify_parent will see that the inode doesn't care about it's
+	 * children and will update all of these flags then.  So really this
+	 * is just a lazy update (and could be a perf win...)
+	 */
+
+
+	iput(inode);
+
+	/*
+	 * it's possible that this group tried to destroy itself, but this
+	 * this mark was simultaneously being freed by inode.  If that's the
+	 * case, we finish freeing the group here.
+	 */
+	if (unlikely(atomic_dec_and_test(&group->num_marks)))
+		fsnotify_final_destroy_group(group);
+}
+
+/*
+ * Attach an initialized mark to a given group and fs object.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which group.
+ */
+int fsnotify_add_mark(struct fsnotify_mark *mark,
+		      struct fsnotify_group *group, struct inode *inode,
+		      struct vfsmount *mnt, int allow_dups)
+{
+	int ret = 0;
+
+	BUG_ON(mnt);
+	BUG_ON(inode && mnt);
+	BUG_ON(!inode && !mnt);
+
+	/*
+	 * if this group isn't being testing for inode type events we need
+	 * to start testing
+	 */
+	if (inode && unlikely(list_empty(&group->inode_group_list)))
+		fsnotify_add_inode_group(group);
+	else if (mnt && unlikely(list_empty(&group->vfsmount_group_list)))
+		fsnotify_add_vfsmount_group(group);
+
+	/*
+	 * LOCKING ORDER!!!!
+	 * mark->lock
+	 * group->mark_lock
+	 * inode->i_lock
+	 */
+	spin_lock(&mark->lock);
+	spin_lock(&group->mark_lock);
+
+	mark->group = group;
+	list_add(&mark->g_list, &group->marks_list);
+	atomic_inc(&group->num_marks);
+	fsnotify_get_mark(mark); /* for i_list and g_list */
+
+	if (inode) {
+		ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
+		if (ret)
+			goto err;
+	} else {
+		BUG();
+	}
+
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&mark->lock);
+
+	if (inode)
+		__fsnotify_update_child_dentry_flags(inode);
+
+	return ret;
+err:
+	mark->group = NULL;
+	list_del_init(&mark->g_list);
+	atomic_dec(&group->num_marks);
+	fsnotify_put_mark(mark);
+
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&mark->lock);
+
+	return ret;
+}
+
+/*
+ * Given a group, destroy all of the marks associated with that group.
+ */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+{
+	struct fsnotify_mark *lmark, *mark;
+	LIST_HEAD(free_list);
+
+	spin_lock(&group->mark_lock);
+	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
+		list_add(&mark->free_g_list, &free_list);
+		list_del_init(&mark->g_list);
+		fsnotify_get_mark(mark);
+	}
+	spin_unlock(&group->mark_lock);
+
+	list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
+		fsnotify_destroy_mark(mark);
+		fsnotify_put_mark(mark);
+	}
+}
+
+void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
+{
+	assert_spin_locked(&old->lock);
+	new->i.inode = old->i.inode;
+	new->m.mnt = old->m.mnt;
+	new->group = old->group;
+	new->mask = old->mask;
+	new->free_mark = old->free_mark;
+}
+
+/*
+ * Nothing fancy, just initialize lists and locks and counters.
+ */
+void fsnotify_init_mark(struct fsnotify_mark *mark,
+			void (*free_mark)(struct fsnotify_mark *mark))
+{
+	spin_lock_init(&mark->lock);
+	atomic_set(&mark->refcnt, 1);
+	INIT_HLIST_NODE(&mark->i.i_list);
+	mark->group = NULL;
+	mark->mask = 0;
+	mark->i.inode = NULL;
+	mark->free_mark = free_mark;
+}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 7d93572ec568..27cccbecbf23 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -364,11 +364,12 @@ extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group
 extern void fsnotify_recalc_inode_mask(struct inode *inode);
 extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark));
 /* find (and take a reference) to a mark associated with group and inode */
-extern struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_group *group, struct inode *inode);
+extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode);
 /* copy the values from old into new */
 extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
 /* attach the mark to both the group and the inode */
-extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, int allow_dups);
+extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
+			     struct inode *inode, struct vfsmount *mnt, int allow_dups);
 /* given a mark, flag it to be freed when all references are dropped */
 extern void fsnotify_destroy_mark(struct fsnotify_mark *mark);
 /* run all the marks in a group, and flag them to be freed */
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 80f8ac328aad..cfb97d752a61 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -259,7 +259,7 @@ static void untag_chunk(struct node *p)
 	if (!new)
 		goto Fallback;
 	fsnotify_duplicate_mark(&new->mark, entry);
-	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, 1)) {
+	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
 		free_chunk(new);
 		goto Fallback;
 	}
@@ -322,7 +322,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		return -ENOMEM;
 
 	entry = &chunk->mark;
-	if (fsnotify_add_mark(entry, audit_tree_group, inode, 0)) {
+	if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
 		free_chunk(chunk);
 		return -ENOSPC;
 	}
@@ -360,7 +360,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	struct node *p;
 	int n;
 
-	old_entry = fsnotify_find_mark(audit_tree_group, inode);
+	old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
 	if (!old_entry)
 		return create_chunk(inode, tree);
 
@@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	}
 
 	fsnotify_duplicate_mark(chunk_entry, old_entry);
-	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, 1)) {
+	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
 		spin_unlock(&old_entry->lock);
 		free_chunk(chunk);
 		fsnotify_put_mark(old_entry);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d85fa538a722..7499397a6100 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -101,7 +101,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
 	struct audit_parent *parent = NULL;
 	struct fsnotify_mark *entry;
 
-	entry = fsnotify_find_mark(audit_watch_group, inode);
+	entry = fsnotify_find_inode_mark(audit_watch_group, inode);
 	if (entry)
 		parent = container_of(entry, struct audit_parent, mark);
 
@@ -158,7 +158,7 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
 
 	fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
 	parent->mark.mask = AUDIT_FS_WATCH;
-	ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, 0);
+	ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
 	if (ret < 0) {
 		audit_free_parent(parent);
 		return ERR_PTR(ret);
@@ -517,7 +517,7 @@ static bool audit_watch_should_send_event(struct fsnotify_group *group, struct i
 	struct fsnotify_mark *entry;
 	bool send;
 
-	entry = fsnotify_find_mark(group, inode);
+	entry = fsnotify_find_inode_mark(group, inode);
 	if (!entry)
 		return false;
 
-- 
cgit v1.2.3-59-g8ed1b


From 2504c5d63b811b71bbaa8d5d5af163e698f4df1f Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Thu, 17 Dec 2009 21:24:27 -0500
Subject: fsnotify/vfsmount: add fsnotify fields to struct vfsmount

This patch adds the list and mask fields needed to support vfsmount marks.
These are the same fields fsnotify needs on an inode.  They are not used,
just declared and we note where the cleanup hook should be (the function is
not yet defined)

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/namespace.c        | 4 ++++
 fs/notify/fsnotify.c  | 4 +---
 include/linux/mount.h | 6 +++++-
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 88058de59c7c..a2d681a6b5e9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
 #include <linux/log2.h>
 #include <linux/idr.h>
 #include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
@@ -150,6 +151,9 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
+#ifdef CONFIG_FSNOTIFY
+		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+#endif
 #ifdef CONFIG_SMP
 		mnt->mnt_writers = alloc_percpu(int);
 		if (!mnt->mnt_writers)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 60e84fd338dd..e0bf86953e1b 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -163,9 +163,7 @@ static bool needed_by_vfsmount(__u32 test_mask, struct vfsmount *mnt)
 	if (!mnt)
 		return false;
 
-	/* hook in this when mnt->mnt_fsnotify_mask is defined */
-	/* return (test_mask & path->mnt->mnt_fsnotify_mask); */
-	return false;
+	return (test_mask & mnt->mnt_fsnotify_mask);
 }
 /*
  * This is the main call to fsnotify.  The VFS calls into hook specific functions
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4bd05474d11d..907210bd9f9c 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -56,7 +56,11 @@ struct vfsmount {
 	struct list_head mnt_mounts;	/* list of children, anchored here */
 	struct list_head mnt_child;	/* and going through their mnt_child */
 	int mnt_flags;
-	/* 4 bytes hole on 64bits arches */
+	/* 4 bytes hole on 64bits arches without fsnotify */
+#ifdef CONFIG_FSNOTIFY
+	__u32 mnt_fsnotify_mask;
+	struct hlist_head mnt_fsnotify_marks;
+#endif
 	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
 	struct list_head mnt_list;
 	struct list_head mnt_expire;	/* link in fs-specific expiry list */
-- 
cgit v1.2.3-59-g8ed1b


From 0d48b7f01f442bc88a69aa98f3b6b015f2817608 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:27 -0500
Subject: fsnotify: vfsmount marks generic functions

Much like inode-mark.c has all of the code dealing with marks on inodes
this patch adds a vfsmount-mark.c which has similar code but is intended
for marks on vfsmounts.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/Makefile               |   2 +-
 fs/notify/fsnotify.h             |   6 ++
 fs/notify/mark.c                 |  20 ++---
 fs/notify/vfsmount_mark.c        | 171 +++++++++++++++++++++++++++++++++++++++
 include/linux/fsnotify_backend.h |   2 +
 5 files changed, 191 insertions(+), 10 deletions(-)
 create mode 100644 fs/notify/vfsmount_mark.c

(limited to 'include/linux')

diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 8f7f3b024a2e..ae5f33a6d868 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o \
-				   mark.o
+				   mark.o vfsmount_mark.o
 
 obj-y			+= dnotify/
 obj-y			+= inotify/
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 7c7a904b802d..38f3fb5cef28 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -24,6 +24,10 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
 extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 				   struct fsnotify_group *group, struct inode *inode,
 				   int allow_dups);
+/* add a mark to a vfsmount */
+extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
+				      struct fsnotify_group *group, struct vfsmount *mnt,
+				      int allow_dups);
 
 /* add a group to the inode group list */
 extern void fsnotify_add_inode_group(struct fsnotify_group *group);
@@ -32,6 +36,8 @@ extern void fsnotify_add_vfsmount_group(struct fsnotify_group *group);
 /* final kfree of a group */
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
 
+/* vfsmount specific destruction of a mark */
+extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
 /* inode specific destruction of a mark */
 extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
 /* run the list of all marks associated with inode and flag them to be freed */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 57bb1d74a2b6..d296ec9ffb2a 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -115,15 +115,11 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 {
 	struct fsnotify_group *group;
-	struct inode *inode;
+	struct inode *inode = NULL;
 
 	spin_lock(&mark->lock);
 
 	group = mark->group;
-	inode = mark->i.inode;
-
-	BUG_ON(group && !inode);
-	BUG_ON(!group && inode);
 
 	/* if !group something else already marked this to die */
 	if (!group) {
@@ -136,8 +132,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 
 	spin_lock(&group->mark_lock);
 
-	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
+	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
 		fsnotify_destroy_inode_mark(mark);
+		inode = mark->i.inode;
+	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
+		fsnotify_destroy_vfsmount_mark(mark);
 	else
 		BUG();
 
@@ -169,8 +168,8 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 	 * is just a lazy update (and could be a perf win...)
 	 */
 
-
-	iput(inode);
+	if (inode)
+		iput(inode);
 
 	/*
 	 * it's possible that this group tried to destroy itself, but this
@@ -192,7 +191,6 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
 {
 	int ret = 0;
 
-	BUG_ON(mnt);
 	BUG_ON(inode && mnt);
 	BUG_ON(!inode && !mnt);
 
@@ -223,6 +221,10 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
 		ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
 		if (ret)
 			goto err;
+	} else if (mnt) {
+		ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
+		if (ret)
+			goto err;
 	} else {
 		BUG();
 	}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
new file mode 100644
index 000000000000..1b61d0a942de
--- /dev/null
+++ b/fs/notify/vfsmount_mark.c
@@ -0,0 +1,171 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/writeback.h> /* for inode_lock */
+
+#include <asm/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark, *lmark;
+	struct hlist_node *pos, *n;
+	LIST_HEAD(free_list);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+	hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) {
+		list_add(&mark->m.free_m_list, &free_list);
+		hlist_del_init(&mark->m.m_list);
+		fsnotify_get_mark(mark);
+	}
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
+		fsnotify_destroy_mark(mark);
+		fsnotify_put_mark(mark);
+	}
+}
+
+/*
+ * Recalculate the mask of events relevant to a given vfsmount locked.
+ */
+static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	struct hlist_node *pos;
+	__u32 new_mask = 0;
+
+	assert_spin_locked(&mnt->mnt_root->d_lock);
+
+	hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list)
+		new_mask |= mark->mask;
+	mnt->mnt_fsnotify_mask = new_mask;
+}
+
+/*
+ * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
+ * any notifier is interested in hearing for this mount point
+ */
+void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
+{
+	spin_lock(&mnt->mnt_root->d_lock);
+	fsnotify_recalc_vfsmount_mask_locked(mnt);
+	spin_unlock(&mnt->mnt_root->d_lock);
+}
+
+void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
+{
+	struct vfsmount *mnt = mark->m.mnt;
+
+	assert_spin_locked(&mark->lock);
+	assert_spin_locked(&mark->group->mark_lock);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+
+	hlist_del_init(&mark->m.m_list);
+	mark->m.mnt = NULL;
+
+	fsnotify_recalc_vfsmount_mask_locked(mnt);
+
+	spin_unlock(&mnt->mnt_root->d_lock);
+}
+
+static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
+								struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	struct hlist_node *pos;
+
+	assert_spin_locked(&mnt->mnt_root->d_lock);
+
+	hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
+		if (mark->group == group) {
+			fsnotify_get_mark(mark);
+			return mark;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * given a group and vfsmount, find the mark associated with that combination.
+ * if found take a reference to that mark and return it, else return NULL
+ */
+struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
+						  struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+
+	spin_lock(&mnt->mnt_root->d_lock);
+	mark = fsnotify_find_vfsmount_mark_locked(group, mnt);
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	return mark;
+}
+
+/*
+ * Attach an initialized mark to a given group and vfsmount.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which groups.
+ */
+int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
+			       struct fsnotify_group *group, struct vfsmount *mnt,
+			       int allow_dups)
+{
+	struct fsnotify_mark *lmark = NULL;
+	int ret = 0;
+
+	mark->flags = FSNOTIFY_MARK_FLAG_VFSMOUNT;
+
+	/*
+	 * LOCKING ORDER!!!!
+	 * mark->lock
+	 * group->mark_lock
+	 * mnt->mnt_root->d_lock
+	 */
+	assert_spin_locked(&mark->lock);
+	assert_spin_locked(&group->mark_lock);
+
+	spin_lock(&mnt->mnt_root->d_lock);
+
+	if (!allow_dups)
+		lmark = fsnotify_find_vfsmount_mark_locked(group, mnt);
+	if (!lmark) {
+		mark->m.mnt = mnt;
+
+		hlist_add_head(&mark->m.m_list, &mnt->mnt_fsnotify_marks);
+
+		fsnotify_recalc_vfsmount_mask_locked(mnt);
+	} else {
+		ret = -EEXIST;
+	}
+
+	spin_unlock(&mnt->mnt_root->d_lock);
+
+	return ret;
+}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 27cccbecbf23..f21ff1bd4b5a 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -360,6 +360,8 @@ extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group
 
 /* functions used to manipulate the marks attached to inodes */
 
+/* run all marks associated with a vfsmount and update mnt->mnt_fsnotify_mask */
+extern void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt);
 /* run all marks associated with an inode and update inode->i_fsnotify_mask */
 extern void fsnotify_recalc_inode_mask(struct inode *inode);
 extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark));
-- 
cgit v1.2.3-59-g8ed1b


From ca9c726eea013394d1e846331b117effb21ead83 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Thu, 17 Dec 2009 21:24:27 -0500
Subject: fsnotify: Infrastructure for per-mount watches

Per-mount watches allow groups to listen to fsnotify events on an entire
mount.  This patch simply adds and initializes the fields needed in the
vfsmount struct to make this happen.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/namespace.c                   | 1 +
 fs/notify/fsnotify.c             | 5 +++++
 fs/notify/fsnotify.h             | 2 ++
 include/linux/fsnotify.h         | 8 ++++++++
 include/linux/fsnotify_backend.h | 4 ++++
 5 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index a2d681a6b5e9..1969d6b2571e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -614,6 +614,7 @@ static inline void __mntput(struct vfsmount *mnt)
 	 * provides barriers, so count_mnt_writers() below is safe.  AV
 	 */
 	WARN_ON(count_mnt_writers(mnt));
+	fsnotify_vfsmount_delete(mnt);
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index e0bf86953e1b..7f14ddc3efc2 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -36,6 +36,11 @@ void __fsnotify_inode_delete(struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
 
+void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
+{
+	fsnotify_clear_marks_by_mount(mnt);
+}
+
 /*
  * Given an inode, first check if we care what happens to our children.  Inotify
  * and dnotify both tell their parents about events.  If we care about any event
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 38f3fb5cef28..204353c0f663 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -42,6 +42,8 @@ extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
 extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
 /* run the list of all marks associated with inode and flag them to be freed */
 extern void fsnotify_clear_marks_by_inode(struct inode *inode);
+/* run the list of all marks associated with vfsmount and flag them to be freed */
+extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
  * about events that happen to its children.
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 5184a2b786c1..06c0e50c7968 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -95,6 +95,14 @@ static inline void fsnotify_inode_delete(struct inode *inode)
 	__fsnotify_inode_delete(inode);
 }
 
+/*
+ * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed
+ */
+static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
+{
+	__fsnotify_vfsmount_delete(mnt);
+}
+
 /*
  * fsnotify_nameremove - a filename was removed from a directory
  */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index f21ff1bd4b5a..1af42cbfc429 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -282,6 +282,7 @@ extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		     const char *name, u32 cookie);
 extern void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
+extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
 extern u32 fsnotify_get_cookie(void);
 
 static inline int fsnotify_inode_watches_children(struct inode *inode)
@@ -402,6 +403,9 @@ static inline void __fsnotify_parent(struct path *path, struct dentry *dentry, _
 static inline void __fsnotify_inode_delete(struct inode *inode)
 {}
 
+static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
+{}
+
 static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
 {}
 
-- 
cgit v1.2.3-59-g8ed1b


From 1c529063a3e4c15eaae28db31326a7aaab7091b5 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:28 -0500
Subject: fanotify: should_send_event needs to handle vfsmounts

currently should_send_event in fanotify only cares about marks on inodes.
This patch extends that interface to indicate that it cares about events
that happened on vfsmounts.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c    | 56 ++++++++++++++++++++++++++++++++--------
 include/linux/fsnotify_backend.h |  2 ++
 2 files changed, 47 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index aa5e92661142..202be8adb2ec 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -2,6 +2,7 @@
 #include <linux/fsnotify_backend.h>
 #include <linux/init.h>
 #include <linux/kernel.h> /* UINT_MAX */
+#include <linux/mount.h>
 #include <linux/types.h>
 
 #include "fanotify.h"
@@ -99,24 +100,35 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_e
 	return ret;
 }
 
-static bool fanotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
-				       struct vfsmount *mnt, __u32 mask, void *data,
-				       int data_type)
+static bool should_send_vfsmount_event(struct fsnotify_group *group, struct vfsmount *mnt,
+				       __u32 mask)
 {
 	struct fsnotify_mark *fsn_mark;
 	bool send;
 
-	pr_debug("%s: group=%p inode=%p mask=%x data=%p data_type=%d\n",
-		 __func__, group, inode, mask, data, data_type);
+	pr_debug("%s: group=%p vfsmount=%p mask=%x\n",
+		 __func__, group, mnt, mask);
 
-	/* sorry, fanotify only gives a damn about files and dirs */
-	if (!S_ISREG(inode->i_mode) &&
-	    !S_ISDIR(inode->i_mode))
+	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	if (!fsn_mark)
 		return false;
 
-	/* if we don't have enough info to send an event to userspace say no */
-	if (data_type != FSNOTIFY_EVENT_PATH)
-		return false;
+	send = (mask & fsn_mark->mask);
+
+	/* find took a reference */
+	fsnotify_put_mark(fsn_mark);
+
+	return send;
+}
+
+static bool should_send_inode_event(struct fsnotify_group *group, struct inode *inode,
+				    __u32 mask)
+{
+	struct fsnotify_mark *fsn_mark;
+	bool send;
+
+	pr_debug("%s: group=%p inode=%p mask=%x\n",
+		 __func__, group, inode, mask);
 
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark)
@@ -142,6 +154,28 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, struct inod
 	return send;
 }
 
+static bool fanotify_should_send_event(struct fsnotify_group *group, struct inode *to_tell,
+				       struct vfsmount *mnt, __u32 mask, void *data,
+				       int data_type)
+{
+	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x data=%p data_type=%d\n",
+		 __func__, group, to_tell, mnt, mask, data, data_type);
+
+	/* sorry, fanotify only gives a damn about files and dirs */
+	if (!S_ISREG(to_tell->i_mode) &&
+	    !S_ISDIR(to_tell->i_mode))
+		return false;
+
+	/* if we don't have enough info to send an event to userspace say no */
+	if (data_type != FSNOTIFY_EVENT_PATH)
+		return false;
+
+	if (mnt)
+		return should_send_vfsmount_event(group, mnt, mask);
+	else
+		return should_send_inode_event(group, to_tell, mask);
+}
+
 const struct fsnotify_ops fanotify_fsnotify_ops = {
 	.handle_event = fanotify_handle_event,
 	.should_send_event = fanotify_should_send_event,
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1af42cbfc429..2d2f015fb700 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -368,6 +368,8 @@ extern void fsnotify_recalc_inode_mask(struct inode *inode);
 extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark));
 /* find (and take a reference) to a mark associated with group and inode */
 extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode);
+/* find (and take a reference) to a mark associated with group and vfsmount */
+extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt);
 /* copy the values from old into new */
 extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
 /* attach the mark to both the group and the inode */
-- 
cgit v1.2.3-59-g8ed1b


From 0ff21db9fcc39042b814dad8a4b7508710a75235 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:29 -0500
Subject: fanotify: hooks the fanotify_mark syscall to the vfsmount code

Create a new fanotify_mark flag which indicates we should attach the mark
to the vfsmount holding the object referenced by dfd and pathname rather
than the inode itself.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 15 +++++++++++----
 include/linux/fanotify.h           |  4 +++-
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index db80a0d89d24..81267260d1b9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -485,7 +485,8 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 			      __u64 mask, int dfd,
 			      const char  __user * pathname)
 {
-	struct inode *inode;
+	struct inode *inode = NULL;
+	struct vfsmount *mnt = NULL;
 	struct fsnotify_group *group;
 	struct file *filp;
 	struct path path;
@@ -515,16 +516,22 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 		goto fput_and_out;
 
 	/* inode held in place by reference to path; group by fget on fd */
-	inode = path.dentry->d_inode;
+	if (!(flags & FAN_MARK_ON_VFSMOUNT))
+		inode = path.dentry->d_inode;
+	else
+		mnt = path.mnt;
 	group = filp->private_data;
 
 	/* create/update an inode mark */
 	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
 	case FAN_MARK_ADD:
-		ret = fanotify_add_inode_mark(group, inode, mask);
+		if (flags & FAN_MARK_ON_VFSMOUNT)
+			ret = fanotify_add_vfsmount_mark(group, mnt, mask);
+		else
+			ret = fanotify_add_inode_mark(group, inode, mask);
 		break;
 	case FAN_MARK_REMOVE:
-		ret = fanotify_remove_mark(group, inode, NULL, mask);
+		ret = fanotify_remove_mark(group, inode, mnt, mask);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 5f633af4d1b0..e25d348188ca 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -29,11 +29,13 @@
 #define FAN_MARK_REMOVE		0x00000002
 #define FAN_MARK_DONT_FOLLOW	0x00000004
 #define FAN_MARK_ONLYDIR	0x00000008
+#define FAN_MARK_ON_VFSMOUNT	0x00000010
 
 #define FAN_ALL_MARK_FLAGS	(FAN_MARK_ADD |\
 				 FAN_MARK_REMOVE |\
 				 FAN_MARK_DONT_FOLLOW |\
-				 FAN_MARK_ONLYDIR)
+				 FAN_MARK_ONLYDIR |\
+				 FAN_MARK_ON_VFSMOUNT)
 
 /*
  * All of the events - we build the list by hand so that we can add flags in
-- 
cgit v1.2.3-59-g8ed1b


From eac8e9e80ccbd30801b7b76a2ee4c6c5a681e53c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Thu, 17 Dec 2009 21:24:29 -0500
Subject: fanotify: rename FAN_MARK_ON_VFSMOUNT to FAN_MARK_MOUNT

the term 'vfsmount' isn't sensicle to userspace.  instead call is 'mount.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 4 ++--
 include/linux/fanotify.h           | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 81267260d1b9..091371e1bde3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -516,7 +516,7 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 		goto fput_and_out;
 
 	/* inode held in place by reference to path; group by fget on fd */
-	if (!(flags & FAN_MARK_ON_VFSMOUNT))
+	if (!(flags & FAN_MARK_MOUNT))
 		inode = path.dentry->d_inode;
 	else
 		mnt = path.mnt;
@@ -525,7 +525,7 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	/* create/update an inode mark */
 	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
 	case FAN_MARK_ADD:
-		if (flags & FAN_MARK_ON_VFSMOUNT)
+		if (flags & FAN_MARK_MOUNT)
 			ret = fanotify_add_vfsmount_mark(group, mnt, mask);
 		else
 			ret = fanotify_add_inode_mark(group, inode, mask);
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index e25d348188ca..5ee22fb274e5 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -29,13 +29,13 @@
 #define FAN_MARK_REMOVE		0x00000002
 #define FAN_MARK_DONT_FOLLOW	0x00000004
 #define FAN_MARK_ONLYDIR	0x00000008
-#define FAN_MARK_ON_VFSMOUNT	0x00000010
+#define FAN_MARK_MOUNT		0x00000010
 
 #define FAN_ALL_MARK_FLAGS	(FAN_MARK_ADD |\
 				 FAN_MARK_REMOVE |\
 				 FAN_MARK_DONT_FOLLOW |\
 				 FAN_MARK_ONLYDIR |\
-				 FAN_MARK_ON_VFSMOUNT)
+				 FAN_MARK_MOUNT)
 
 /*
  * All of the events - we build the list by hand so that we can add flags in
-- 
cgit v1.2.3-59-g8ed1b


From 88380fe66e0ac22529f5426ab27d67da00ed2628 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Thu, 17 Dec 2009 21:24:29 -0500
Subject: fanotify: remove fanotify.h declarations

fanotify_mark_validate functions are all needlessly declared in headers as
static inlines.  Instead just do the checks where they are needed for code
readability.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.h      | 25 -------------------------
 fs/notify/fanotify/fanotify_user.c | 25 ++++++++++---------------
 include/linux/fanotify.h           |  7 -------
 3 files changed, 10 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 5608783c6bca..4d5723a74a8e 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -6,31 +6,6 @@
 
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 
-static inline bool fanotify_mark_flags_valid(unsigned int flags)
-{
-	/* must be either and add or a remove */
-	if (!(flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)))
-		return false;
-
-	/* cannot be both add and remove */
-	if ((flags & FAN_MARK_ADD) &&
-	    (flags & FAN_MARK_REMOVE))
-		return false;
-
-	/* cannot have more flags than we know about */
-	if (flags & ~FAN_ALL_MARK_FLAGS)
-		return false;
-
-	return true;
-}
-
-static inline bool fanotify_mask_valid(__u32 mask)
-{
-	if (mask & ~((__u32)FAN_ALL_INCOMING_EVENTS))
-		return false;
-	return true;
-}
-
 static inline __u32 fanotify_outgoing_mask(__u32 mask)
 {
 	return mask & FAN_ALL_OUTGOING_EVENTS;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 00628d3ce5a2..618867e4d30f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -430,20 +430,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	return 0;
 }
 
-static bool fanotify_mark_validate_input(int flags,
-					 __u32 mask)
-{
-	pr_debug("%s: flags=%x mask=%x\n", __func__, flags, mask);
-
-	/* are flags valid of this operation? */
-	if (!fanotify_mark_flags_valid(flags))
-		return false;
-	/* is the mask valid? */
-	if (!fanotify_mask_valid(mask))
-		return false;
-	return true;
-}
-
 /* fanotify syscalls */
 SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
 		unsigned int, priority)
@@ -505,7 +491,16 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	if (mask & ((__u64)0xffffffff << 32))
 		return -EINVAL;
 
-	if (!fanotify_mark_validate_input(flags, mask))
+	if (flags & ~FAN_ALL_MARK_FLAGS)
+		return -EINVAL;
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
+	case FAN_MARK_ADD:
+	case FAN_MARK_REMOVE:
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
 		return -EINVAL;
 
 	filp = fget_light(fanotify_fd, &fput_needed);
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 5ee22fb274e5..90e59b24fd04 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -47,13 +47,6 @@
 			FAN_CLOSE |\
 			FAN_OPEN)
 
-/*
- * All legal FAN bits userspace can request (although possibly not all
- * at the same time.
- */
-#define FAN_ALL_INCOMING_EVENTS	(FAN_ALL_EVENTS |\
-				 FAN_EVENT_ON_CHILD)
-
 #define FAN_ALL_OUTGOING_EVENTS	(FAN_ALL_EVENTS |\
 				 FAN_Q_OVERFLOW)
 
-- 
cgit v1.2.3-59-g8ed1b


From 90b1e7a57880fb66437ab7db39e1e65ca0372822 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:33 -0500
Subject: fsnotify: allow marks to not pin inodes in core

inotify marks must pin inodes in core.  dnotify doesn't technically need to
since they are closed when the directory is closed.  fanotify also need to
pin inodes in core as it works today.  But the next step is to introduce
the concept of 'ignored masks' which is actually a mask of events for an
inode of no interest.  I claim that these should be liberally sent to the
kernel and should not pin the inode in core.  If the inode is brought back
in the listener will get an event it may have thought excluded, but this is
not a serious situation and one any listener should deal with.

This patch lays the ground work for non-pinning inode marks by using lazy
inode pinning.  We do not pin a mark until it has a non-zero mask entry.  If a
listener new sets a mask we never pin the inode.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c        |  2 +-
 fs/notify/fanotify/fanotify_user.c |  4 ++--
 fs/notify/fsnotify.h               |  2 ++
 fs/notify/inode_mark.c             | 35 +++++++++++++++++++++++++++--------
 fs/notify/inotify/inotify_user.c   | 12 +++++-------
 fs/notify/mark.c                   | 17 ++++++++++++++++-
 include/linux/fsnotify_backend.h   |  7 +++++--
 7 files changed, 58 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 69f42df9ba45..6624c2ee8786 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -65,7 +65,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
 	new_mask = 0;
 	for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
 		new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
-	fsn_mark->mask = new_mask;
+	fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
 
 	if (old_mask == new_mask)
 		return;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 84155841a025..3320f0c57e31 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -302,7 +302,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, __u3
 
 	spin_lock(&fsn_mark->lock);
 	oldmask = fsn_mark->mask;
-	fsn_mark->mask = oldmask & ~mask;
+	fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
 	spin_unlock(&fsn_mark->lock);
 
 	if (!(oldmask & ~mask))
@@ -359,7 +359,7 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, __u32 mas
 
 	spin_lock(&fsn_mark->lock);
 	oldmask = fsn_mark->mask;
-	fsn_mark->mask = oldmask | mask;
+	fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
 	spin_unlock(&fsn_mark->lock);
 
 	return mask & ~oldmask;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 204353c0f663..1be54f6f9e7d 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -20,6 +20,8 @@ extern __u32 fsnotify_vfsmount_mask;
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
+extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
+						__u32 mask);
 /* add a mark to an inode */
 extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 				   struct fsnotify_group *group, struct inode *inode,
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index c925579ba011..4292f9e23ae8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -141,7 +141,32 @@ struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
 }
 
 /*
- * Attach an initialized mark mark to a given group and inode.
+ * If we are setting a mark mask on an inode mark we should pin the inode
+ * in memory.
+ */
+void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
+					 __u32 mask)
+{
+	struct inode *inode;
+
+	assert_spin_locked(&mark->lock);
+
+	if (mask &&
+	    mark->i.inode &&
+	    !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
+		mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
+		inode = igrab(mark->i.inode);
+		/*
+		 * we shouldn't be able to get here if the inode wasn't
+		 * already safely held in memory.  But bug in case it
+		 * ever is wrong.
+		 */
+		BUG_ON(!inode);
+	}
+}
+
+/*
+ * Attach an initialized mark to a given group and inode.
  * These marks may be used for the fsnotify backend to determine which
  * event types should be delivered to which group and for which inodes.
  */
@@ -152,10 +177,6 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 	struct fsnotify_mark *lmark = NULL;
 	int ret = 0;
 
-	inode = igrab(inode);
-	if (unlikely(!inode))
-		return -EINVAL;
-
 	mark->flags = FSNOTIFY_MARK_FLAG_INODE;
 
 	assert_spin_locked(&mark->lock);
@@ -175,10 +196,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 
 	spin_unlock(&inode->i_lock);
 
-	if (lmark) {
+	if (lmark)
 		ret = -EEXIST;
-		iput(inode);
-	}
 
 	return ret;
 }
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a12315a7553d..19d274057bfa 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -575,13 +575,11 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	spin_lock(&fsn_mark->lock);
 
 	old_mask = fsn_mark->mask;
-	if (add) {
-		fsn_mark->mask |= mask;
-		new_mask = fsn_mark->mask;
-	} else {
-		fsn_mark->mask = mask;
-		new_mask = fsn_mark->mask;
-	}
+	if (add)
+		fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
+	else
+		fsnotify_set_mark_mask_locked(fsn_mark, mask);
+	new_mask = fsn_mark->mask;
 
 	spin_unlock(&fsn_mark->lock);
 
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d296ec9ffb2a..0ebc3fd7089b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -168,7 +168,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 	 * is just a lazy update (and could be a perf win...)
 	 */
 
-	if (inode)
+	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
 		iput(inode);
 
 	/*
@@ -180,6 +180,17 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 		fsnotify_final_destroy_group(group);
 }
 
+void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
+{
+	assert_spin_locked(&mark->lock);
+
+	mark->mask = mask;
+
+	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
+		fsnotify_set_inode_mark_mask_locked(mark, mask);
+}
+
+
 /*
  * Attach an initialized mark to a given group and fs object.
  * These marks may be used for the fsnotify backend to determine which
@@ -230,6 +241,10 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
 	}
 
 	spin_unlock(&group->mark_lock);
+
+	/* this will pin the object if appropriate */
+	fsnotify_set_mark_mask_locked(mark, mark->mask);
+
 	spin_unlock(&mark->lock);
 
 	if (inode)
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 2d2f015fb700..489c881ed4ec 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -267,8 +267,9 @@ struct fsnotify_mark {
 		struct fsnotify_vfsmount_mark m;
 	};
 	struct list_head free_g_list;	/* tmp list used when freeing this mark */
-#define FSNOTIFY_MARK_FLAG_INODE	0x01
-#define FSNOTIFY_MARK_FLAG_VFSMOUNT	0x02
+#define FSNOTIFY_MARK_FLAG_INODE		0x01
+#define FSNOTIFY_MARK_FLAG_VFSMOUNT		0x02
+#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED	0x04
 	unsigned int flags;		/* vfsmount or inode mark? */
 	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
 };
@@ -372,6 +373,8 @@ extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *gro
 extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt);
 /* copy the values from old into new */
 extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
+/* set the mask of a mark (might pin the object into memory */
+extern void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask);
 /* attach the mark to both the group and the inode */
 extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
 			     struct inode *inode, struct vfsmount *mnt, int allow_dups);
-- 
cgit v1.2.3-59-g8ed1b


From 33af5e32e0bb73c704b5e156f4411cdb53e6cc59 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:33 -0500
Subject: fsnotify: ignored_mask - excluding notification

The ignored_mask is a new mask which is part of fsnotify marks.  A group's
should_send_event() function can use the ignored mask to determine that
certain events are not of interest.  In particular if a group registers a
mask including FS_OPEN on a vfsmount they could add FS_OPEN to the
ignored_mask for individual inodes and not send open events for those
inodes.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/mark.c                 | 6 ++++++
 include/linux/fsnotify_backend.h | 3 +++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 0ebc3fd7089b..cb1d822f227f 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -190,6 +190,12 @@ void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
 		fsnotify_set_inode_mark_mask_locked(mark, mask);
 }
 
+void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
+{
+	assert_spin_locked(&mark->lock);
+
+	mark->ignored_mask = mask;
+}
 
 /*
  * Attach an initialized mark to a given group and fs object.
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 489c881ed4ec..018416ec5ce4 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -266,6 +266,7 @@ struct fsnotify_mark {
 		struct fsnotify_inode_mark i;
 		struct fsnotify_vfsmount_mark m;
 	};
+	__u32 ignored_mask;		/* events types to ignore */
 	struct list_head free_g_list;	/* tmp list used when freeing this mark */
 #define FSNOTIFY_MARK_FLAG_INODE		0x01
 #define FSNOTIFY_MARK_FLAG_VFSMOUNT		0x02
@@ -373,6 +374,8 @@ extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *gro
 extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt);
 /* copy the values from old into new */
 extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
+/* set the ignored_mask of a mark */
+extern void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask);
 /* set the mask of a mark (might pin the object into memory */
 extern void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask);
 /* attach the mark to both the group and the inode */
-- 
cgit v1.2.3-59-g8ed1b


From b9e4e3bd0495fea9e8f8e712889c9cd8ffa43c94 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:33 -0500
Subject: fanotify: allow users to set an ignored_mask

Change the sys_fanotify_mark() system call so users can set ignored_masks
on inodes.  Remember, if a user new sets a real mask, and only sets ignored
masks, the ignore will never be pinned in memory.  Thus ignored_masks can
be lost under memory pressure and the user may again get events they
previously thought were ignored.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 54 +++++++++++++++++++++++++-------------
 include/linux/fanotify.h           |  4 ++-
 2 files changed, 39 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 3320f0c57e31..ad02d475770f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -296,13 +296,20 @@ out:
 	return ret;
 }
 
-static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, __u32 mask)
+static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
+					    __u32 mask,
+					    unsigned int flags)
 {
 	__u32 oldmask;
 
 	spin_lock(&fsn_mark->lock);
-	oldmask = fsn_mark->mask;
-	fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
+	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		oldmask = fsn_mark->mask;
+		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
+	} else {
+		oldmask = fsn_mark->ignored_mask;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
+	}
 	spin_unlock(&fsn_mark->lock);
 
 	if (!(oldmask & ~mask))
@@ -312,7 +319,8 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, __u3
 }
 
 static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
-					 struct vfsmount *mnt, __u32 mask)
+					 struct vfsmount *mnt, __u32 mask,
+					 unsigned int flags)
 {
 	struct fsnotify_mark *fsn_mark = NULL;
 	__u32 removed;
@@ -321,7 +329,7 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 	if (!fsn_mark)
 		return -ENOENT;
 
-	removed = fanotify_mark_remove_from_mask(fsn_mark, mask);
+	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
 	fsnotify_put_mark(fsn_mark);
 	if (removed & group->mask)
 		fsnotify_recalc_group_mask(group);
@@ -332,7 +340,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 }
 
 static int fanotify_remove_inode_mark(struct fsnotify_group *group,
-				      struct inode *inode, __u32 mask)
+				      struct inode *inode, __u32 mask,
+				      unsigned int flags)
 {
 	struct fsnotify_mark *fsn_mark = NULL;
 	__u32 removed;
@@ -341,7 +350,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	if (!fsn_mark)
 		return -ENOENT;
 
-	removed = fanotify_mark_remove_from_mask(fsn_mark, mask);
+	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
 	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
 
@@ -353,20 +362,28 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	return 0;
 }
 
-static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, __u32 mask)
+static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
+				       __u32 mask,
+				       unsigned int flags)
 {
 	__u32 oldmask;
 
 	spin_lock(&fsn_mark->lock);
-	oldmask = fsn_mark->mask;
-	fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
+	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		oldmask = fsn_mark->mask;
+		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
+	} else {
+		oldmask = fsn_mark->ignored_mask;
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask));
+	}
 	spin_unlock(&fsn_mark->lock);
 
 	return mask & ~oldmask;
 }
 
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
-				      struct vfsmount *mnt, __u32 mask)
+				      struct vfsmount *mnt, __u32 mask,
+				      unsigned int flags)
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
@@ -386,7 +403,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 			return ret;
 		}
 	}
-	added = fanotify_mark_add_to_mask(fsn_mark, mask);
+	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
 	fsnotify_put_mark(fsn_mark);
 	if (added) {
 		if (added & ~group->mask)
@@ -398,7 +415,8 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 }
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
-				   struct inode *inode, __u32 mask)
+				   struct inode *inode, __u32 mask,
+				   unsigned int flags)
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
@@ -420,7 +438,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 			return ret;
 		}
 	}
-	added = fanotify_mark_add_to_mask(fsn_mark, mask);
+	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
 	fsnotify_put_mark(fsn_mark);
 	if (added) {
 		if (added & ~group->mask)
@@ -528,15 +546,15 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
 	case FAN_MARK_ADD:
 		if (flags & FAN_MARK_MOUNT)
-			ret = fanotify_add_vfsmount_mark(group, mnt, mask);
+			ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
 		else
-			ret = fanotify_add_inode_mark(group, inode, mask);
+			ret = fanotify_add_inode_mark(group, inode, mask, flags);
 		break;
 	case FAN_MARK_REMOVE:
 		if (flags & FAN_MARK_MOUNT)
-			ret = fanotify_remove_vfsmount_mark(group, mnt, mask);
+			ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
 		else
-			ret = fanotify_remove_inode_mark(group, inode, mask);
+			ret = fanotify_remove_inode_mark(group, inode, mask, flags);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 90e59b24fd04..b8daa9f9b560 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -30,12 +30,14 @@
 #define FAN_MARK_DONT_FOLLOW	0x00000004
 #define FAN_MARK_ONLYDIR	0x00000008
 #define FAN_MARK_MOUNT		0x00000010
+#define FAN_MARK_IGNORED_MASK	0x00000020
 
 #define FAN_ALL_MARK_FLAGS	(FAN_MARK_ADD |\
 				 FAN_MARK_REMOVE |\
 				 FAN_MARK_DONT_FOLLOW |\
 				 FAN_MARK_ONLYDIR |\
-				 FAN_MARK_MOUNT)
+				 FAN_MARK_MOUNT |\
+				 FAN_MARK_IGNORED_MASK)
 
 /*
  * All of the events - we build the list by hand so that we can add flags in
-- 
cgit v1.2.3-59-g8ed1b


From c908370fc1ac27fd7e1fc0f34c693047b26564ce Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:33 -0500
Subject: fsnotify: allow ignored_mask to survive modification

Some inodes a group may want to never hear about a set of events even if
the inode is modified.  We add a new mark flag which indicates that these
marks should not have their ignored_mask cleared on modification.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 6 ++++--
 include/linux/fsnotify_backend.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 3ad940d0bac1..54d58d5f72c1 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -148,7 +148,8 @@ void __fsnotify_flush_ignored_mask(struct inode *inode, void *data, int data_is)
 	if (!hlist_empty(&inode->i_fsnotify_marks)) {
 		spin_lock(&inode->i_lock);
 		hlist_for_each_entry(mark, node, &inode->i_fsnotify_marks, i.i_list) {
-			mark->ignored_mask = 0;
+			if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+				mark->ignored_mask = 0;
 		}
 		spin_unlock(&inode->i_lock);
 	}
@@ -160,7 +161,8 @@ void __fsnotify_flush_ignored_mask(struct inode *inode, void *data, int data_is)
 		if (mnt && !hlist_empty(&mnt->mnt_fsnotify_marks)) {
 			spin_lock(&mnt->mnt_root->d_lock);
 			hlist_for_each_entry(mark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
-				mark->ignored_mask = 0;
+				if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+					mark->ignored_mask = 0;
 			}
 			spin_unlock(&mnt->mnt_root->d_lock);
 		}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 018416ec5ce4..8ca19df8a171 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -271,6 +271,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_INODE		0x01
 #define FSNOTIFY_MARK_FLAG_VFSMOUNT		0x02
 #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED	0x04
+#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x08
 	unsigned int flags;		/* vfsmount or inode mark? */
 	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
 };
-- 
cgit v1.2.3-59-g8ed1b


From c9778a98e7440fb73e0d27b8155a688663a0d493 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:33 -0500
Subject: fanotify: allow ignored_masks to survive modify

Some users may want to truely ignore an inode even if it has been modified.
Say you are wanting a mount which contains a log file and you really don't
want any notification about that file.  This patch allows the listener to
do that.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 2 ++
 include/linux/fanotify.h           | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ad02d475770f..3e275f17e7b7 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -375,6 +375,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 	} else {
 		oldmask = fsn_mark->ignored_mask;
 		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask));
+		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
+			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 	}
 	spin_unlock(&fsn_mark->lock);
 
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index b8daa9f9b560..e43934d0b74c 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -31,13 +31,15 @@
 #define FAN_MARK_ONLYDIR	0x00000008
 #define FAN_MARK_MOUNT		0x00000010
 #define FAN_MARK_IGNORED_MASK	0x00000020
+#define FAN_MARK_IGNORED_SURV_MODIFY	0x00000040
 
 #define FAN_ALL_MARK_FLAGS	(FAN_MARK_ADD |\
 				 FAN_MARK_REMOVE |\
 				 FAN_MARK_DONT_FOLLOW |\
 				 FAN_MARK_ONLYDIR |\
 				 FAN_MARK_MOUNT |\
-				 FAN_MARK_IGNORED_MASK)
+				 FAN_MARK_IGNORED_MASK |\
+				 FAN_MARK_IGNORED_SURV_MODIFY)
 
 /*
  * All of the events - we build the list by hand so that we can add flags in
-- 
cgit v1.2.3-59-g8ed1b


From 4d92604cc90aa18bbbe0f6e23b7a9fdb612836d3 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:34 -0500
Subject: fanotify: clear all fanotify marks

fanotify listeners may want to clear all marks.  They may want to do this
to destroy all of their inode marks which have nothing but ignores.
Realistically this is useful for av vendors who update policy and want to
clear all of their cached allows.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 12 ++++++++++--
 fs/notify/inode_mark.c             |  8 ++++++++
 fs/notify/mark.c                   | 21 ++++++++++++++++-----
 fs/notify/vfsmount_mark.c          |  5 +++++
 include/linux/fanotify.h           |  1 +
 include/linux/fsnotify_backend.h   |  6 ++++++
 6 files changed, 46 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 3e275f17e7b7..9fe760baf69f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -514,9 +514,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 
 	if (flags & ~FAN_ALL_MARK_FLAGS)
 		return -EINVAL;
-	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
 	case FAN_MARK_ADD:
 	case FAN_MARK_REMOVE:
+	case FAN_MARK_FLUSH:
 		break;
 	default:
 		return -EINVAL;
@@ -545,7 +546,7 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	group = filp->private_data;
 
 	/* create/update an inode mark */
-	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
+	switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
 	case FAN_MARK_ADD:
 		if (flags & FAN_MARK_MOUNT)
 			ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
@@ -558,6 +559,13 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 		else
 			ret = fanotify_remove_inode_mark(group, inode, mask, flags);
 		break;
+	case FAN_MARK_FLUSH:
+		if (flags & FAN_MARK_MOUNT)
+			fsnotify_clear_vfsmount_marks_by_group(group);
+		else
+			fsnotify_clear_inode_marks_by_group(group);
+		fsnotify_recalc_group_mask(group);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4292f9e23ae8..0c0a48b1659f 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -103,6 +103,14 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
 	}
 }
 
+/*
+ * Given a group clear all of the inode marks associated with that group.
+ */
+void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
+}
+
 /*
  * given a group and inode, find the mark associated with that combination.
  * if found take a reference to that mark and return it, else return NULL
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index cb1d822f227f..1e824e64441d 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -270,18 +270,21 @@ err:
 }
 
 /*
- * Given a group, destroy all of the marks associated with that group.
+ * clear any marks in a group in which mark->flags & flags is true
  */
-void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
+					 unsigned int flags)
 {
 	struct fsnotify_mark *lmark, *mark;
 	LIST_HEAD(free_list);
 
 	spin_lock(&group->mark_lock);
 	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
-		list_add(&mark->free_g_list, &free_list);
-		list_del_init(&mark->g_list);
-		fsnotify_get_mark(mark);
+		if (mark->flags & flags) {
+			list_add(&mark->free_g_list, &free_list);
+			list_del_init(&mark->g_list);
+			fsnotify_get_mark(mark);
+		}
 	}
 	spin_unlock(&group->mark_lock);
 
@@ -291,6 +294,14 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
 	}
 }
 
+/*
+ * Given a group, destroy all of the marks associated with that group.
+ */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
+}
+
 void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
 {
 	assert_spin_locked(&old->lock);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 1b61d0a942de..8f1aa02f4f02 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -51,6 +51,11 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
 	}
 }
 
+void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
+{
+	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
+}
+
 /*
  * Recalculate the mask of events relevant to a given vfsmount locked.
  */
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index e43934d0b74c..385896c9f828 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -32,6 +32,7 @@
 #define FAN_MARK_MOUNT		0x00000010
 #define FAN_MARK_IGNORED_MASK	0x00000020
 #define FAN_MARK_IGNORED_SURV_MODIFY	0x00000040
+#define FAN_MARK_FLUSH		0x00000080
 
 #define FAN_ALL_MARK_FLAGS	(FAN_MARK_ADD |\
 				 FAN_MARK_REMOVE |\
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 8ca19df8a171..be4a36ed2008 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -384,6 +384,12 @@ extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *
 			     struct inode *inode, struct vfsmount *mnt, int allow_dups);
 /* given a mark, flag it to be freed when all references are dropped */
 extern void fsnotify_destroy_mark(struct fsnotify_mark *mark);
+/* run all the marks in a group, and clear all of the vfsmount marks */
+extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
+/* run all the marks in a group, and clear all of the inode marks */
+extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
+/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
+extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
 /* run all the marks in a group, and flag them to be freed */
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark *mark);
-- 
cgit v1.2.3-59-g8ed1b


From cb2d429faf2cae62d3c51e28099a181d5fe8c244 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:34 -0500
Subject: fsnotify: add group priorities

This introduces an ordering to fsnotify groups.  With purely asynchronous
notification based "things" implementing fsnotify (inotify, dnotify) ordering
isn't particularly important.  But if people want to use fsnotify for the
basis of sycronous notification or blocking notification ordering becomes
important.

eg. A Hierarchical Storage Management listener would need to get its event
before an AV scanner could get its event (since the HSM would need to
bring the data in for the AV scanner to scan.)  Typically asynchronous notification
would want to run after the AV scanner made any relevant access decisions
so as to not send notification about an event that was denied.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c |  4 ++--
 fs/notify/group.c                  | 40 ++++++++++++++++++++++++++++++++++++--
 include/linux/fsnotify_backend.h   |  1 +
 3 files changed, 41 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9fe760baf69f..84d3e2047de3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -463,8 +463,6 @@ SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
 
 	if (event_f_flags)
 		return -EINVAL;
-	if (priority)
-		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -483,6 +481,8 @@ SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
 	if (IS_ERR(group))
 		return PTR_ERR(group);
 
+	group->priority = priority;
+
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
 		goto out_put_group;
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 9e9eb406afdd..ada913fd4f7f 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -89,10 +89,27 @@ void fsnotify_recalc_group_mask(struct fsnotify_group *group)
 
 void fsnotify_add_vfsmount_group(struct fsnotify_group *group)
 {
+	struct fsnotify_group *group_iter;
+	unsigned int priority = group->priority;
+
 	mutex_lock(&fsnotify_grp_mutex);
 
-	if (!group->on_vfsmount_group_list)
+	if (!group->on_vfsmount_group_list) {
+		list_for_each_entry(group_iter, &fsnotify_vfsmount_groups,
+				    vfsmount_group_list) {
+			/* insert in front of this one? */
+			if (priority < group_iter->priority) {
+				/* list_add_tail() insert in front of group_iter */
+				list_add_tail_rcu(&group->inode_group_list,
+						  &group_iter->inode_group_list);
+				goto out;
+			}
+		}
+
+		/* apparently we need to be the last entry */
 		list_add_tail_rcu(&group->vfsmount_group_list, &fsnotify_vfsmount_groups);
+	}
+out:
 	group->on_vfsmount_group_list = 1;
 
 	mutex_unlock(&fsnotify_grp_mutex);
@@ -100,10 +117,27 @@ void fsnotify_add_vfsmount_group(struct fsnotify_group *group)
 
 void fsnotify_add_inode_group(struct fsnotify_group *group)
 {
+	struct fsnotify_group *group_iter;
+	unsigned int priority = group->priority;
+
 	mutex_lock(&fsnotify_grp_mutex);
 
-	if (!group->on_inode_group_list)
+	/* add to global group list, priority 0 first, UINT_MAX last */
+	if (!group->on_inode_group_list) {
+		list_for_each_entry(group_iter, &fsnotify_inode_groups,
+				    inode_group_list) {
+			if (priority < group_iter->priority) {
+				/* list_add_tail() insert in front of group_iter */
+				list_add_tail_rcu(&group->inode_group_list,
+						  &group_iter->inode_group_list);
+				goto out;
+			}
+		}
+
+		/* apparently we need to be the last entry */
 		list_add_tail_rcu(&group->inode_group_list, &fsnotify_inode_groups);
+	}
+out:
 	group->on_inode_group_list = 1;
 
 	mutex_unlock(&fsnotify_grp_mutex);
@@ -226,6 +260,8 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 	spin_lock_init(&group->mark_lock);
 	INIT_LIST_HEAD(&group->marks_list);
 
+	group->priority = UINT_MAX;
+
 	group->ops = ops;
 
 	return group;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index be4a36ed2008..8b2e095e5907 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -140,6 +140,7 @@ struct fsnotify_group {
 					 * a group */
 	struct list_head marks_list;	/* all inode marks for this group */
 
+	unsigned int priority;		/* order of this group compared to others */
 	/* prevents double list_del of group_list.  protected by global fsnotify_grp_mutex */
 	bool on_inode_group_list;
 	bool on_vfsmount_group_list;
-- 
cgit v1.2.3-59-g8ed1b


From 6e5f77b32e9097a8a68a8d453799676cacf70cad Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:34 -0500
Subject: fsnotify: intoduce a notification merge argument

Each group can define their own notification (and secondary_q) merge
function.  Inotify does tail drop, fanotify does matching and drop which
can actually allocate a completely new event.  But for fanotify to properly
deal with permissions events it needs to know the new event which was
ultimately added to the notification queue.  This patch just implements a
void ** argument which is passed to the merge function.  fanotify can use
this field to pass the new event back to higher layers.

Signed-off-by: Eric Paris <eparis@redhat.com>
for fanotify to properly deal with permissions events
---
 fs/notify/fanotify/fanotify.c        | 6 ++++--
 fs/notify/inotify/inotify_fsnotify.c | 6 ++++--
 fs/notify/inotify/inotify_user.c     | 2 +-
 fs/notify/notification.c             | 7 +++++--
 include/linux/fsnotify_backend.h     | 5 ++++-
 5 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 060b177146e8..95a330d2f8a1 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -27,7 +27,9 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 	return false;
 }
 
-static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
+static int fanotify_merge(struct list_head *list,
+			  struct fsnotify_event *event,
+			  void **arg)
 {
 	struct fsnotify_event_holder *test_holder;
 	struct fsnotify_event *test_event;
@@ -92,7 +94,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_e
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	ret = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
+	ret = fsnotify_add_notify_event(group, event, NULL, fanotify_merge, NULL);
 	/* -EEXIST means this event was merged with another, not that it was an error */
 	if (ret == -EEXIST)
 		ret = 0;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1d237e1bf7b1..daa666a6e6c9 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -67,7 +67,9 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 	return false;
 }
 
-static int inotify_merge(struct list_head *list, struct fsnotify_event *event)
+static int inotify_merge(struct list_head *list,
+			 struct fsnotify_event *event,
+			 void **arg)
 {
 	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
@@ -114,7 +116,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	fsn_event_priv->group = group;
 	event_priv->wd = wd;
 
-	ret = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
+	ret = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge, NULL);
 	if (ret) {
 		inotify_free_event_priv(fsn_event_priv);
 		/* EEXIST says we tail matched, EOVERFLOW isn't something
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 19d274057bfa..1ce71f5b9589 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -525,7 +525,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 	fsn_event_priv->group = group;
 	event_priv->wd = i_mark->wd;
 
-	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
+	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL, NULL);
 	if (ret)
 		inotify_free_event_priv(fsn_event_priv);
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7fc8d004084c..2d50a40ab1e4 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -137,7 +137,10 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
  */
 int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
 			      struct fsnotify_event_private_data *priv,
-			      int (*merge)(struct list_head *, struct fsnotify_event *))
+			      int (*merge)(struct list_head *,
+					   struct fsnotify_event *,
+					   void **arg),
+			      void **arg)
 {
 	struct fsnotify_event_holder *holder = NULL;
 	struct list_head *list = &group->notification_list;
@@ -170,7 +173,7 @@ alloc_holder:
 	if (!list_empty(list) && merge) {
 		int ret;
 
-		ret = merge(list, event);
+		ret = merge(list, event, arg);
 		if (ret) {
 			mutex_unlock(&group->notification_mutex);
 			if (holder != &event->holder)
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 8b2e095e5907..afc690192972 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -355,7 +355,10 @@ extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struc
 extern int fsnotify_add_notify_event(struct fsnotify_group *group,
 				     struct fsnotify_event *event,
 				     struct fsnotify_event_private_data *priv,
-				     int (*merge)(struct list_head *, struct fsnotify_event *));
+				     int (*merge)(struct list_head *,
+						  struct fsnotify_event *,
+						  void **),
+				     void **arg);
 /* true if the group notification queue is empty */
 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
-- 
cgit v1.2.3-59-g8ed1b


From 59b0df211bd9699d7e0d01fcf9345a149f75b033 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 8 Feb 2010 12:53:52 -0500
Subject: fsnotify: use unsigned char * for dentry->d_name.name

fsnotify was using char * when it passed around the d_name.name string
internally but it is actually an unsigned char *.  This patch switches
fsnotify to use unsigned and should silence some pointer signess warnings
which have popped out of xfs.  I do not add -Wpointer-sign to the fsnotify
code as there are still issues with kstrdup and strlen which would pop
out needless warnings.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/namei.c                       |  2 +-
 fs/notify/fsnotify.c             |  5 +++--
 fs/notify/notification.c         |  4 ++--
 include/linux/fsnotify.h         | 12 ++++++------
 include/linux/fsnotify_backend.h |  9 +++++----
 5 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 868d0cb9d473..3479b176a4cd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2635,7 +2635,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	int error;
 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
-	const char *old_name;
+	const unsigned char *old_name;
 
 	if (old_dentry->d_inode == new_dentry->d_inode)
  		return 0;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 54d58d5f72c1..c5adf833bf6a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -171,7 +171,7 @@ void __fsnotify_flush_ignored_mask(struct inode *inode, void *data, int data_is)
 
 static void send_to_group(struct fsnotify_group *group, struct inode *to_tell,
 			  struct vfsmount *mnt, __u32 mask, void *data,
-			  int data_is, u32 cookie, const char *file_name,
+			  int data_is, u32 cookie, const unsigned char *file_name,
 			  struct fsnotify_event **event)
 {
 	if (!group->ops->should_send_event(group, to_tell, mnt, mask,
@@ -206,7 +206,8 @@ static bool needed_by_vfsmount(__u32 test_mask, struct vfsmount *mnt)
  * out to all of the registered fsnotify_group.  Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
+void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+	      const unsigned char *file_name, u32 cookie)
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 2d50a40ab1e4..b35faafacd38 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -370,8 +370,8 @@ struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
  * @name the filename, if available
  */
 struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-					     int data_type, const char *name, u32 cookie,
-					     gfp_t gfp)
+					     int data_type, const unsigned char *name,
+					     u32 cookie, gfp_t gfp)
 {
 	struct fsnotify_event *event;
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 06c0e50c7968..b8cf161f5a6d 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -59,14 +59,14 @@ static inline void fsnotify_link_count(struct inode *inode)
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
  */
 static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
-				 const char *old_name,
+				 const unsigned char *old_name,
 				 int isdir, struct inode *target, struct dentry *moved)
 {
 	struct inode *source = moved->d_inode;
 	u32 fs_cookie = fsnotify_get_cookie();
 	__u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM);
 	__u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO);
-	const char *new_name = moved->d_name.name;
+	const unsigned char *new_name = moved->d_name.name;
 
 	if (old_dir == new_dir)
 		old_dir_mask |= FS_DN_RENAME;
@@ -290,7 +290,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 /*
  * fsnotify_oldname_init - save off the old filename before we change it
  */
-static inline const char *fsnotify_oldname_init(const char *name)
+static inline const unsigned char *fsnotify_oldname_init(const unsigned char *name)
 {
 	return kstrdup(name, GFP_KERNEL);
 }
@@ -298,19 +298,19 @@ static inline const char *fsnotify_oldname_init(const char *name)
 /*
  * fsnotify_oldname_free - free the name we got from fsnotify_oldname_init
  */
-static inline void fsnotify_oldname_free(const char *old_name)
+static inline void fsnotify_oldname_free(const unsigned char *old_name)
 {
 	kfree(old_name);
 }
 
 #else	/* CONFIG_FSNOTIFY */
 
-static inline const char *fsnotify_oldname_init(const char *name)
+static inline const char *fsnotify_oldname_init(const unsigned char *name)
 {
 	return NULL;
 }
 
-static inline void fsnotify_oldname_free(const char *old_name)
+static inline void fsnotify_oldname_free(const unsigned char *old_name)
 {
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index afc690192972..efe9ba321cf2 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -220,7 +220,7 @@ struct fsnotify_event {
 	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
 
 	u32 sync_cookie;	/* used to corrolate events, namely inotify mv events */
-	char *file_name;
+	const unsigned char *file_name;
 	size_t name_len;
 	struct pid *tgid;
 
@@ -283,7 +283,7 @@ struct fsnotify_mark {
 
 /* main fsnotify call to send events */
 extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
-		     const char *name, u32 cookie);
+		     const unsigned char *name, u32 cookie);
 extern void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
@@ -402,7 +402,8 @@ extern void fsnotify_unmount_inodes(struct list_head *list);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-						    void *data, int data_is, const char *name,
+						    void *data, int data_is,
+						    const unsigned char *name,
 						    u32 cookie, gfp_t gfp);
 
 /* fanotify likes to change events after they are on lists... */
@@ -413,7 +414,7 @@ extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
 #else
 
 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
-			    const char *name, u32 cookie)
+			    const unsigned char *name, u32 cookie)
 {}
 
 static inline void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
-- 
cgit v1.2.3-59-g8ed1b


From 6e006701ccc1590500186ef21e074bd900c5dd67 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 20 Jan 2010 22:27:56 +0200
Subject: dnotify: move dir_notify_enable declaration

Move dir_notify_enable declaration to where it belongs -- dnotify.h .

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/dnotify.h | 1 +
 include/linux/fs.h      | 3 ---
 kernel/sysctl.c         | 1 +
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index ecc06286226d..3290555a52ee 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -28,6 +28,7 @@ struct dnotify_struct {
 			    FS_CREATE | FS_DN_RENAME |\
 			    FS_MOVED_FROM | FS_MOVED_TO)
 
+extern int dir_notify_enable;
 extern void dnotify_flush(struct file *, fl_owner_t);
 extern int fcntl_dirnotify(int, struct file *, unsigned long);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f9a003278758..d92c212476f9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -410,9 +410,6 @@ extern int get_max_files(void);
 extern int sysctl_nr_open;
 extern struct inodes_stat_t inodes_stat;
 extern int leases_enable, lease_break_time;
-#ifdef CONFIG_DNOTIFY
-extern int dir_notify_enable;
-#endif
 
 struct buffer_head;
 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d24f761f4876..7b983dbfe0ec 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,6 +44,7 @@
 #include <linux/times.h>
 #include <linux/limits.h>
 #include <linux/dcache.h>
+#include <linux/dnotify.h>
 #include <linux/syscalls.h>
 #include <linux/vmstat.h>
 #include <linux/nfs_fs.h>
-- 
cgit v1.2.3-59-g8ed1b


From d14f1729483fad3a8817fbbcbd017678b7d1ad26 Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Thu, 25 Feb 2010 20:28:57 -0500
Subject: sysctl extern cleanup: inotify

Extern declarations in sysctl.c should be move to their own head file, and
then include them in relavant .c files.

Move inotify_table extern declaration to linux/inotify.h

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/inotify.h | 5 +++++
 kernel/sysctl.c         | 6 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 959a38b8f75d..94d209a1b689 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -69,4 +69,9 @@ struct inotify_event {
 #define IN_CLOEXEC O_CLOEXEC
 #define IN_NONBLOCK O_NONBLOCK
 
+#ifdef __KERNEL__
+#include <linux/sysctl.h>
+extern struct ctl_table inotify_table[]; /* for sysctl */
+#endif
+
 #endif	/* _LINUX_INOTIFY_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7b983dbfe0ec..fe30db7bdb0a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -131,6 +131,9 @@ static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
+#ifdef CONFIG_INOTIFY_USER
+#include <linux/inotify.h>
+#endif
 #ifdef CONFIG_SPARC
 #include <asm/system.h>
 #endif
@@ -207,9 +210,6 @@ static struct ctl_table fs_table[];
 static struct ctl_table debug_table[];
 static struct ctl_table dev_table[];
 extern struct ctl_table random_table[];
-#ifdef CONFIG_INOTIFY_USER
-extern struct ctl_table inotify_table[];
-#endif
 #ifdef CONFIG_EPOLL
 extern struct ctl_table epoll_table[];
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From c4ec54b40d33f8016fea970a383cc584dd0e6019 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:34 -0500
Subject: fsnotify: new fsnotify hooks and events types for access decisions

introduce a new fsnotify hook, fsnotify_perm(), which is called from the
security code.  This hook is used to allow fsnotify groups to make access
control decisions about events on the system.  We also must change the
generic fsnotify function to return an error code if we intend these hooks
to be in any way useful.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 47 ++++++++++++++++++++--------------------
 include/linux/fsnotify.h         | 19 ++++++++++++++++
 include/linux/fsnotify_backend.h | 15 ++++++++-----
 include/linux/security.h         |  1 +
 security/security.c              | 16 ++++++++++++--
 5 files changed, 68 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index c5adf833bf6a..668268627894 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -169,27 +169,22 @@ void __fsnotify_flush_ignored_mask(struct inode *inode, void *data, int data_is)
 	}
 }
 
-static void send_to_group(struct fsnotify_group *group, struct inode *to_tell,
-			  struct vfsmount *mnt, __u32 mask, void *data,
-			  int data_is, u32 cookie, const unsigned char *file_name,
-			  struct fsnotify_event **event)
+static int send_to_group(struct fsnotify_group *group, struct inode *to_tell,
+			 struct vfsmount *mnt, __u32 mask, void *data,
+			 int data_is, u32 cookie, const unsigned char *file_name,
+			 struct fsnotify_event **event)
 {
 	if (!group->ops->should_send_event(group, to_tell, mnt, mask,
 					   data, data_is))
-		return;
+		return 0;
 	if (!*event) {
 		*event = fsnotify_create_event(to_tell, mask, data,
 						data_is, file_name,
 						cookie, GFP_KERNEL);
-		/*
-		 * shit, we OOM'd and now we can't tell, maybe
-		 * someday someone else will want to do something
-		 * here
-		 */
 		if (!*event)
-			return;
+			return -ENOMEM;
 	}
-	group->ops->handle_event(group, *event);
+	return group->ops->handle_event(group, *event);
 }
 
 static bool needed_by_vfsmount(__u32 test_mask, struct vfsmount *mnt)
@@ -206,20 +201,20 @@ static bool needed_by_vfsmount(__u32 test_mask, struct vfsmount *mnt)
  * out to all of the registered fsnotify_group.  Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
-	      const unsigned char *file_name, u32 cookie)
+int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+	     const unsigned char *file_name, u32 cookie)
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
 	struct vfsmount *mnt = NULL;
-	int idx;
+	int idx, ret = 0;
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
 	/* if no fsnotify listeners, nothing to do */
 	if (list_empty(&fsnotify_inode_groups) &&
 	    list_empty(&fsnotify_vfsmount_groups))
-                return;
+		return 0;
  
 	if (mask & FS_MODIFY)
 		__fsnotify_flush_ignored_mask(to_tell, data, data_is);
@@ -227,7 +222,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	/* if none of the directed listeners or vfsmount listeners care */
 	if (!(test_mask & fsnotify_inode_mask) &&
 	    !(test_mask & fsnotify_vfsmount_mask))
-                return;
+		return 0;
  
 	if (data_is == FSNOTIFY_EVENT_PATH)
 		mnt = ((struct path *)data)->mnt;
@@ -236,7 +231,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	 * listeners list cares, nothing to do */
 	if (!(test_mask & to_tell->i_fsnotify_mask) &&
 	    !needed_by_vfsmount(test_mask, mnt))
-                return;
+		return 0;
 
 	/*
 	 * SRCU!!  the groups list is very very much read only and the path is
@@ -248,20 +243,24 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	if (test_mask & to_tell->i_fsnotify_mask) {
 		list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list) {
 			if (test_mask & group->mask) {
-				send_to_group(group, to_tell, NULL, mask, data, data_is,
-					      cookie, file_name, &event);
+				ret = send_to_group(group, to_tell, NULL, mask, data, data_is,
+						    cookie, file_name, &event);
+				if (ret)
+					goto out;
 			}
 		}
 	}
 	if (needed_by_vfsmount(test_mask, mnt)) {
 		list_for_each_entry_rcu(group, &fsnotify_vfsmount_groups, vfsmount_group_list) {
 			if (test_mask & group->mask) {
-				send_to_group(group, to_tell, mnt, mask, data, data_is,
-					      cookie, file_name, &event);
+				ret = send_to_group(group, to_tell, mnt, mask, data, data_is,
+						    cookie, file_name, &event);
+				if (ret)
+					goto out;
 			}
 		}
 	}
-
+out:
 	srcu_read_unlock(&fsnotify_grp_srcu, idx);
 	/*
 	 * fsnotify_create_event() took a reference so the event can't be cleaned
@@ -269,6 +268,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	 */
 	if (event)
 		fsnotify_put_event(event);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(fsnotify);
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index b8cf161f5a6d..64efda9aae62 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -34,6 +34,25 @@ static inline void fsnotify_parent(struct path *path, struct dentry *dentry, __u
 	__fsnotify_parent(path, dentry, mask);
 }
 
+/* simple call site for access decisions */
+static inline int fsnotify_perm(struct file *file, int mask)
+{
+	struct path *path = &file->f_path;
+	struct inode *inode = path->dentry->d_inode;
+	__u32 fsnotify_mask;
+
+	if (file->f_mode & FMODE_NONOTIFY)
+		return 0;
+	if (!(mask & (MAY_READ | MAY_OPEN)))
+		return 0;
+	if (mask & MAY_READ)
+		fsnotify_mask = FS_ACCESS_PERM;
+	if (mask & MAY_OPEN)
+		fsnotify_mask = FS_OPEN_PERM;
+
+	return fsnotify(inode, fsnotify_mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+}
+
 /*
  * fsnotify_d_move - dentry has been moved
  * Called with dcache_lock and dentry->d_lock held.
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index efe9ba321cf2..c34728e7d8cb 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -41,6 +41,9 @@
 #define FS_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
 #define FS_IN_IGNORED		0x00008000	/* last inotify event here */
 
+#define FS_OPEN_PERM		0x00010000	/* open event in an permission hook */
+#define FS_ACCESS_PERM		0x00020000	/* access event in a permissions hook */
+
 #define FS_IN_ISDIR		0x40000000	/* event occurred against dir */
 #define FS_IN_ONESHOT		0x80000000	/* only send event once */
 
@@ -282,8 +285,8 @@ struct fsnotify_mark {
 /* called from the vfs helpers */
 
 /* main fsnotify call to send events */
-extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
-		     const unsigned char *name, u32 cookie);
+extern int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+		    const unsigned char *name, u32 cookie);
 extern void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
@@ -413,9 +416,11 @@ extern int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
 
 #else
 
-static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
-			    const unsigned char *name, u32 cookie)
-{}
+static inline int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+			   const unsigned char *name, u32 cookie)
+{
+	return 0;
+}
 
 static inline void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {}
diff --git a/include/linux/security.h b/include/linux/security.h
index 0c8819170463..24fc29540aa3 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -23,6 +23,7 @@
 #define __LINUX_SECURITY_H
 
 #include <linux/fs.h>
+#include <linux/fsnotify.h>
 #include <linux/binfmts.h>
 #include <linux/signal.h>
 #include <linux/resource.h>
diff --git a/security/security.c b/security/security.c
index 351942a4ca0e..f6ac27cd3452 100644
--- a/security/security.c
+++ b/security/security.c
@@ -620,7 +620,13 @@ void security_inode_getsecid(const struct inode *inode, u32 *secid)
 
 int security_file_permission(struct file *file, int mask)
 {
-	return security_ops->file_permission(file, mask);
+	int ret;
+
+	ret = security_ops->file_permission(file, mask);
+	if (ret)
+		return ret;
+
+	return fsnotify_perm(file, mask);
 }
 
 int security_file_alloc(struct file *file)
@@ -684,7 +690,13 @@ int security_file_receive(struct file *file)
 
 int security_dentry_open(struct file *file, const struct cred *cred)
 {
-	return security_ops->dentry_open(file, cred);
+	int ret;
+
+	ret = security_ops->dentry_open(file, cred);
+	if (ret)
+		return ret;
+
+	return fsnotify_perm(file, MAY_OPEN);
 }
 
 int security_task_create(unsigned long clone_flags)
-- 
cgit v1.2.3-59-g8ed1b


From 9e66e4233db9c7e31e9ee706be2c9ddd54cf99b3 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:34 -0500
Subject: fanotify: permissions and blocking

This is the backend work needed for fanotify to support the new
FS_OPEN_PERM and FS_ACCESS_PERM fsnotify events.  This is done using the
new fsnotify secondary queue.  No userspace interface is provided actually
respond to or request these events.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/Kconfig         | 14 ++++++++++
 fs/notify/fanotify/fanotify.c      | 54 +++++++++++++++++++++++++++++++++++---
 fs/notify/fanotify/fanotify_user.c |  5 ++++
 include/linux/fanotify.h           | 18 +++++++++++++
 include/linux/fsnotify_backend.h   | 12 +++++++++
 5 files changed, 99 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 668e5df28e28..566de30395c2 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -10,3 +10,17 @@ config FANOTIFY
 	   the event.
 
 	   If unsure, say Y.
+
+config FANOTIFY_ACCESS_PERMISSIONS
+	bool "fanotify permissions checking"
+	depends on FANOTIFY
+	depends on SECURITY
+	default n
+	---help---
+	   Say Y here is you want fanotify listeners to be able to make permissions
+	   decisions concerning filesystem events.  This is used by some fanotify
+	   listeners which need to scan files before allowing the system access to
+	   use those files.  This is used by some anti-malware vendors and by some
+	   hierarchical storage managent systems.
+
+	   If unsure, say N.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 4feed8601e29..52d0a55a249e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -2,9 +2,12 @@
 #include <linux/fdtable.h>
 #include <linux/fsnotify_backend.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
 #include <linux/kernel.h> /* UINT_MAX */
 #include <linux/mount.h>
+#include <linux/sched.h>
 #include <linux/types.h>
+#include <linux/wait.h>
 
 static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 {
@@ -88,10 +91,37 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static int fanotify_get_response_from_access(struct fsnotify_group *group,
+					     struct fsnotify_event *event)
+{
+	int ret;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	wait_event(group->fanotify_data.access_waitq, event->response);
+
+	/* userspace responded, convert to something usable */
+	spin_lock(&event->lock);
+	switch (event->response) {
+	case FAN_ALLOW:
+		ret = 0;
+		break;
+	case FAN_DENY:
+	default:
+		ret = -EPERM;
+	}
+	event->response = 0;
+	spin_unlock(&event->lock);
+
+	return ret;
+}
+#endif
+
 static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
 {
 	int ret;
-	struct fsnotify_event *used_event;
+	struct fsnotify_event *notify_event = NULL;
 
 	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
 	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
@@ -100,15 +130,31 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_e
 	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
 	BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
 	BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
+	BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
+	BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	ret = fsnotify_add_notify_event(group, event, NULL, fanotify_merge, (void **)&used_event);
+	ret = fsnotify_add_notify_event(group, event, NULL, fanotify_merge,
+					(void **)&notify_event);
 	/* -EEXIST means this event was merged with another, not that it was an error */
 	if (ret == -EEXIST)
 		ret = 0;
-	if (used_event)
-		fsnotify_put_event(used_event);
+	if (ret)
+		goto out;
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (event->mask & FAN_ALL_PERM_EVENTS) {
+		/* if we merged we need to wait on the new event */
+		if (notify_event)
+			event = notify_event;
+		ret = fanotify_get_response_from_access(group, event);
+	}
+#endif
+
+out:
+	if (notify_event)
+		fsnotify_put_event(notify_event);
 	return ret;
 }
 
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 84d3e2047de3..09d9bdb62af3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -482,6 +482,11 @@ SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
 		return PTR_ERR(group);
 
 	group->priority = priority;
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	mutex_init(&group->fanotify_data.access_mutex);
+	init_waitqueue_head(&group->fanotify_data.access_waitq);
+	INIT_LIST_HEAD(&group->fanotify_data.access_list);
+#endif
 
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 385896c9f828..02f80676c238 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -15,6 +15,9 @@
 /* FIXME currently Q's have no limit.... */
 #define FAN_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
 
+#define FAN_OPEN_PERM		0x00010000	/* File open in perm check */
+#define FAN_ACCESS_PERM		0x00020000	/* File accessed in perm check */
+
 /* helper events */
 #define FAN_CLOSE		(FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
 
@@ -52,7 +55,14 @@
 			FAN_CLOSE |\
 			FAN_OPEN)
 
+/*
+ * All events which require a permission response from userspace
+ */
+#define FAN_ALL_PERM_EVENTS (FAN_OPEN_PERM |\
+			     FAN_ACCESS_PERM)
+
 #define FAN_ALL_OUTGOING_EVENTS	(FAN_ALL_EVENTS |\
+				 FAN_ALL_PERM_EVENTS |\
 				 FAN_Q_OVERFLOW)
 
 #define FANOTIFY_METADATA_VERSION	1
@@ -65,6 +75,10 @@ struct fanotify_event_metadata {
 	__s64 pid;
 } __attribute__ ((packed));
 
+/* Legit userspace responses to a _PERM event */
+#define FAN_ALLOW	0x01
+#define FAN_DENY	0x02
+
 /* Helper functions to deal with fanotify_event_metadata buffers */
 #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))
 
@@ -78,5 +92,9 @@ struct fanotify_event_metadata {
 
 #ifdef __KERNEL__
 
+struct fanotify_wait {
+	struct fsnotify_event *event;
+	__s32 fd;
+};
 #endif /* __KERNEL__ */
 #endif /* _LINUX_FANOTIFY_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c34728e7d8cb..b0d00fd6bfad 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -159,6 +159,14 @@ struct fsnotify_group {
 			struct fasync_struct    *fa;    /* async notification */
 			struct user_struct      *user;
 		} inotify_data;
+#endif
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+		struct fanotify_group_private_data {
+			/* allows a group to block waiting for a userspace response */
+			struct mutex access_mutex;
+			struct list_head access_list;
+			wait_queue_head_t access_waitq;
+		} fanotify_data;
 #endif
 	};
 };
@@ -227,6 +235,10 @@ struct fsnotify_event {
 	size_t name_len;
 	struct pid *tgid;
 
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	__u32 response;	/* userspace answer to question */
+#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
+
 	struct list_head private_data_list;	/* groups can store private data here */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From b2d879096ac799722e6017ee82c0586f0d101c9c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 17 Dec 2009 21:24:34 -0500
Subject: fanotify: userspace interface for permission responses

fanotify groups need to respond to events which include permissions types.
To do so groups will send a response using write() on the fanotify_fd they
have open.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c      |   3 +
 fs/notify/fanotify/fanotify_user.c | 182 +++++++++++++++++++++++++++++++++++--
 include/linux/fanotify.h           |   5 +
 3 files changed, 184 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 52d0a55a249e..bbcfccd4a8ea 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -114,6 +114,9 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
 	event->response = 0;
 	spin_unlock(&event->lock);
 
+	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
+		 group, event, ret);
+	
 	return ret;
 }
 #endif
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 09d9bdb62af3..87f0be852f71 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -18,6 +18,13 @@
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 
 static struct kmem_cache *fanotify_mark_cache __read_mostly;
+static struct kmem_cache *fanotify_response_event_cache __read_mostly;
+
+struct fanotify_response_event {
+	struct list_head list;
+	__s32 fd;
+	struct fsnotify_event *event;
+};
 
 /*
  * Get an fsnotify notification event if one exists and is small
@@ -110,23 +117,152 @@ static ssize_t fill_event_metadata(struct fsnotify_group *group,
 	return metadata->fd;
 }
 
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group,
+						  __s32 fd)
+{
+	struct fanotify_response_event *re, *return_re = NULL;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+	list_for_each_entry(re, &group->fanotify_data.access_list, list) {
+		if (re->fd != fd)
+			continue;
+
+		list_del_init(&re->list);
+		return_re = re;
+		break;
+	}
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	pr_debug("%s: found return_re=%p\n", __func__, return_re);
+
+	return return_re;
+}
+
+static int process_access_response(struct fsnotify_group *group,
+				   struct fanotify_response *response_struct)
+{
+	struct fanotify_response_event *re;
+	__s32 fd = response_struct->fd;
+	__u32 response = response_struct->response;
+
+	pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
+		 fd, response);
+	/*
+	 * make sure the response is valid, if invalid we do nothing and either
+	 * userspace can send a valid responce or we will clean it up after the
+	 * timeout
+	 */
+	switch (response) {
+	case FAN_ALLOW:
+	case FAN_DENY:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (fd < 0)
+		return -EINVAL;
+
+	re = dequeue_re(group, fd);
+	if (!re)
+		return -ENOENT;
+
+	re->event->response = response;
+
+	wake_up(&group->fanotify_data.access_waitq);
+
+	kmem_cache_free(fanotify_response_event_cache, re);
+
+	return 0;
+}
+
+static int prepare_for_access_response(struct fsnotify_group *group,
+				       struct fsnotify_event *event,
+				       __s32 fd)
+{
+	struct fanotify_response_event *re;
+
+	if (!(event->mask & FAN_ALL_PERM_EVENTS))
+		return 0;
+
+	re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
+	if (!re)
+		return -ENOMEM;
+
+	re->event = event;
+	re->fd = fd;
+
+	mutex_lock(&group->fanotify_data.access_mutex);
+	list_add_tail(&re->list, &group->fanotify_data.access_list);
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	return 0;
+}
+
+static void remove_access_response(struct fsnotify_group *group,
+				   struct fsnotify_event *event,
+				   __s32 fd)
+{
+	struct fanotify_response_event *re;
+
+	if (!(event->mask & FAN_ALL_PERM_EVENTS))
+		return;
+
+	re = dequeue_re(group, fd);
+	if (!re)
+		return;
+
+	BUG_ON(re->event != event);
+
+	kmem_cache_free(fanotify_response_event_cache, re);
+
+	return;
+}
+#else
+static int prepare_for_access_response(struct fsnotify_group *group,
+				       struct fsnotify_event *event,
+				       __s32 fd)
+{
+	return 0;
+}
+
+static void remove_access_response(struct fsnotify_group *group,
+				   struct fsnotify_event *event,
+				   __s32 fd)
+{
+	return 0;
+}
+#endif
+
 static ssize_t copy_event_to_user(struct fsnotify_group *group,
 				  struct fsnotify_event *event,
 				  char __user *buf)
 {
 	struct fanotify_event_metadata fanotify_event_metadata;
-	int ret;
+	int fd, ret;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	ret = fill_event_metadata(group, &fanotify_event_metadata, event);
-	if (ret < 0)
-		return ret;
+	fd = fill_event_metadata(group, &fanotify_event_metadata, event);
+	if (fd < 0)
+		return fd;
+
+	ret = prepare_for_access_response(group, event, fd);
+	if (ret)
+		goto out_close_fd;
 
+	ret = -EFAULT;
 	if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
-		return -EFAULT;
+		goto out_kill_access_response;
 
 	return FAN_EVENT_METADATA_LEN;
+
+out_kill_access_response:
+	remove_access_response(group, event, fd);
+out_close_fd:
+	sys_close(fd);
+	return ret;
 }
 
 /* intofiy userspace file descriptor functions */
@@ -197,6 +333,33 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 	return ret;
 }
 
+static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
+{
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	struct fanotify_response response = { .fd = -1, .response = -1 };
+	struct fsnotify_group *group;
+	int ret;
+
+	group = file->private_data;
+
+	if (count > sizeof(response))
+		count = sizeof(response);
+
+	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
+
+	if (copy_from_user(&response, buf, count))
+		return -EFAULT;
+
+	ret = process_access_response(group, &response);
+	if (ret < 0)
+		count = ret;
+
+	return count;
+#else
+	return -EINVAL;
+#endif
+}
+
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
 	struct fsnotify_group *group = file->private_data;
@@ -237,6 +400,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 static const struct file_operations fanotify_fops = {
 	.poll		= fanotify_poll,
 	.read		= fanotify_read,
+	.write		= fanotify_write,
 	.fasync		= NULL,
 	.release	= fanotify_release,
 	.unlocked_ioctl	= fanotify_ioctl,
@@ -470,7 +634,7 @@ SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
 	if (flags & ~FAN_ALL_INIT_FLAGS)
 		return -EINVAL;
 
-	f_flags = (O_RDONLY | FMODE_NONOTIFY);
+	f_flags = O_RDWR | FMODE_NONOTIFY;
 	if (flags & FAN_CLOEXEC)
 		f_flags |= O_CLOEXEC;
 	if (flags & FAN_NONBLOCK)
@@ -527,7 +691,11 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	default:
 		return -EINVAL;
 	}
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
+#else
 	if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
+#endif
 		return -EINVAL;
 
 	filp = fget_light(fanotify_fd, &fput_needed);
@@ -600,6 +768,8 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
 static int __init fanotify_user_setup(void)
 {
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+	fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
+						   SLAB_PANIC);
 
 	return 0;
 }
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 02f80676c238..f0949a57ca9d 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -75,6 +75,11 @@ struct fanotify_event_metadata {
 	__s64 pid;
 } __attribute__ ((packed));
 
+struct fanotify_response {
+	__s32 fd;
+	__u32 response;
+} __attribute__ ((packed));
+
 /* Legit userspace responses to a _PERM event */
 #define FAN_ALLOW	0x01
 #define FAN_DENY	0x02
-- 
cgit v1.2.3-59-g8ed1b


From fb1cfb88c8597d847553f39efc2bbd41c72c5f50 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 12 May 2010 11:42:29 -0400
Subject: fsnotify: initialize mask in fsnotify_perm

akpm got a warning the fsnotify_mask could be used uninitialized in
fsnotify_perm().  It's not actually possible but his compiler complained
about it.  This patch just initializes it to 0 to shut up the compiler.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/fsnotify.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 64efda9aae62..59d0df43ff9d 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -39,16 +39,18 @@ static inline int fsnotify_perm(struct file *file, int mask)
 {
 	struct path *path = &file->f_path;
 	struct inode *inode = path->dentry->d_inode;
-	__u32 fsnotify_mask;
+	__u32 fsnotify_mask = 0;
 
 	if (file->f_mode & FMODE_NONOTIFY)
 		return 0;
 	if (!(mask & (MAY_READ | MAY_OPEN)))
 		return 0;
-	if (mask & MAY_READ)
-		fsnotify_mask = FS_ACCESS_PERM;
 	if (mask & MAY_OPEN)
 		fsnotify_mask = FS_OPEN_PERM;
+	else if (mask & MAY_READ)
+		fsnotify_mask = FS_ACCESS_PERM;
+	else
+		BUG();
 
 	return fsnotify(inode, fsnotify_mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 08ae89380a8210a9965d04083e1de78cb8bca4b1 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 27 May 2010 09:41:40 -0400
Subject: fanotify: drop the useless priority argument

The priority argument in fanotify is useless.  Kill it.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c |  8 +++-----
 fs/notify/group.c                  | 10 +++-------
 include/linux/fsnotify_backend.h   |  1 -
 include/linux/syscalls.h           |  3 +--
 4 files changed, 7 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 7c869fa23ec6..664102084766 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -616,14 +616,13 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 }
 
 /* fanotify syscalls */
-SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
-		unsigned int, priority)
+SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 {
 	struct fsnotify_group *group;
 	int f_flags, fd;
 
-	pr_debug("%s: flags=%d event_f_flags=%d priority=%d\n",
-		__func__, flags, event_f_flags, priority);
+	pr_debug("%s: flags=%d event_f_flags=%d\n",
+		__func__, flags, event_f_flags);
 
 	if (event_f_flags)
 		return -EINVAL;
@@ -645,7 +644,6 @@ SYSCALL_DEFINE3(fanotify_init, unsigned int, flags, unsigned int, event_f_flags,
 	if (IS_ERR(group))
 		return PTR_ERR(group);
 
-	group->priority = priority;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	mutex_init(&group->fanotify_data.access_mutex);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ada913fd4f7f..7ac65ed4735b 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -90,7 +90,6 @@ void fsnotify_recalc_group_mask(struct fsnotify_group *group)
 void fsnotify_add_vfsmount_group(struct fsnotify_group *group)
 {
 	struct fsnotify_group *group_iter;
-	unsigned int priority = group->priority;
 
 	mutex_lock(&fsnotify_grp_mutex);
 
@@ -98,7 +97,7 @@ void fsnotify_add_vfsmount_group(struct fsnotify_group *group)
 		list_for_each_entry(group_iter, &fsnotify_vfsmount_groups,
 				    vfsmount_group_list) {
 			/* insert in front of this one? */
-			if (priority < group_iter->priority) {
+			if (group < group_iter) {
 				/* list_add_tail() insert in front of group_iter */
 				list_add_tail_rcu(&group->inode_group_list,
 						  &group_iter->inode_group_list);
@@ -118,15 +117,14 @@ out:
 void fsnotify_add_inode_group(struct fsnotify_group *group)
 {
 	struct fsnotify_group *group_iter;
-	unsigned int priority = group->priority;
 
 	mutex_lock(&fsnotify_grp_mutex);
 
-	/* add to global group list, priority 0 first, UINT_MAX last */
+	/* add to global group list */
 	if (!group->on_inode_group_list) {
 		list_for_each_entry(group_iter, &fsnotify_inode_groups,
 				    inode_group_list) {
-			if (priority < group_iter->priority) {
+			if (group < group_iter) {
 				/* list_add_tail() insert in front of group_iter */
 				list_add_tail_rcu(&group->inode_group_list,
 						  &group_iter->inode_group_list);
@@ -260,8 +258,6 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 	spin_lock_init(&group->mark_lock);
 	INIT_LIST_HEAD(&group->marks_list);
 
-	group->priority = UINT_MAX;
-
 	group->ops = ops;
 
 	return group;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index b0d00fd6bfad..b9b3f24ad4fc 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -143,7 +143,6 @@ struct fsnotify_group {
 					 * a group */
 	struct list_head marks_list;	/* all inode marks for this group */
 
-	unsigned int priority;		/* order of this group compared to others */
 	/* prevents double list_del of group_list.  protected by global fsnotify_grp_mutex */
 	bool on_inode_group_list;
 	bool on_vfsmount_group_list;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5b05c37059e9..0ec26a74f20a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -813,8 +813,7 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
 asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  struct timespec __user *, const sigset_t __user *,
 			  size_t);
-asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags,
-				  unsigned int priority);
+asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
 asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
 				  u64 mask, int fd,
 				  const char  __user *pathname);
-- 
cgit v1.2.3-59-g8ed1b


From 8c1934c8d70b22ca8333b216aec6c7d09fdbd6a6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:37 -0400
Subject: inotify: allow users to request not to recieve events on unlinked
 children

An inotify watch on a directory will send events for children even if those
children have been unlinked.  This patch add a new inotify flag IN_EXCL_UNLINK
which allows a watch to specificy they don't care about unlinked children.
This should fix performance problems seen by tasks which add a watch to
/tmp and then are overrun with events when other processes are reading and
writing to unlinked files they created in /tmp.

https://bugzilla.kernel.org/show_bug.cgi?id=16296

Requested-by: Matthias Clasen <mclasen@redhat.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_fsnotify.c | 9 +++++++++
 fs/notify/inotify/inotify_user.c     | 2 +-
 include/linux/fsnotify_backend.h     | 1 +
 include/linux/inotify.h              | 1 +
 4 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 388a150c3d4a..9d332e7f5a5c 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -22,6 +22,7 @@
  * General Public License for more details.
  */
 
+#include <linux/dcache.h> /* d_unlinked */
 #include <linux/fs.h> /* struct inode */
 #include <linux/fsnotify_backend.h>
 #include <linux/inotify.h>
@@ -157,6 +158,14 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 	mask = (mask & ~FS_EVENT_ON_CHILD);
 	send = (fsn_mark->mask & mask);
 
+	if (send && (fsn_mark->mask & FS_EXCL_UNLINK) &&
+	    (data_type == FSNOTIFY_EVENT_PATH)) {
+		struct path *path  = data;
+
+		if (d_unlinked(path->dentry))
+			send = false;
+	}
+
 	/* find took a reference */
 	fsnotify_put_mark(fsn_mark);
 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f381dafe8efb..dfc80f70e517 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -97,7 +97,7 @@ static inline __u32 inotify_arg_to_mask(u32 arg)
 	mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
 
 	/* mask off the flags used to open the fd */
-	mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
+	mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
 
 	return mask;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index b9b3f24ad4fc..4b809fcd4996 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -44,6 +44,7 @@
 #define FS_OPEN_PERM		0x00010000	/* open event in an permission hook */
 #define FS_ACCESS_PERM		0x00020000	/* access event in a permissions hook */
 
+#define FS_EXCL_UNLINK		0x04000000	/* do not send events if object is unlinked */
 #define FS_IN_ISDIR		0x40000000	/* event occurred against dir */
 #define FS_IN_ONESHOT		0x80000000	/* only send event once */
 
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 94d209a1b689..b74f2ef2c368 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -51,6 +51,7 @@ struct inotify_event {
 /* special flags */
 #define IN_ONLYDIR		0x01000000	/* only watch the path if it is a directory */
 #define IN_DONT_FOLLOW		0x02000000	/* don't follow a sym link */
+#define IN_EXCL_UNLINK		0x04000000	/* exclude events on unlinked objects */
 #define IN_MASK_ADD		0x20000000	/* add to the mask of an already existing watch */
 #define IN_ISDIR		0x40000000	/* event occurred against dir */
 #define IN_ONESHOT		0x80000000	/* only send event once */
-- 
cgit v1.2.3-59-g8ed1b


From f874e1ac21d7708464dc656a10312542c54719f1 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:37 -0400
Subject: inotify: force inotify and fsnotify use same bits

inotify uses bits called IN_* and fsnotify uses bits called FS_*.  These
need to line up.  This patch adds build time checks to make sure noone can
change these bits so they are not the same.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 21 +++++++++++++++++++++
 include/linux/inotify.h          |  9 +++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dfc80f70e517..c8203ce28ab7 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -839,6 +839,27 @@ out:
  */
 static int __init inotify_user_setup(void)
 {
+	BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
+	BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(IN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
+	BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
+	BUILD_BUG_ON(IN_CREATE != FS_CREATE);
+	BUILD_BUG_ON(IN_DELETE != FS_DELETE);
+	BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
+	BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
+	BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
+	BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
+	BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
+	BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
+	BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
+	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
+
+	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
+
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
 
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index b74f2ef2c368..d33041e2a42a 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -73,6 +73,15 @@ struct inotify_event {
 #ifdef __KERNEL__
 #include <linux/sysctl.h>
 extern struct ctl_table inotify_table[]; /* for sysctl */
+
+#define ALL_INOTIFY_BITS (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
+			  IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \
+			  IN_MOVED_TO | IN_CREATE | IN_DELETE | \
+			  IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT | \
+			  IN_Q_OVERFLOW | IN_IGNORED | IN_ONLYDIR | \
+			  IN_DONT_FOLLOW | IN_EXCL_UNLINK | IN_MASK_ADD | \
+			  IN_ISDIR | IN_ONESHOT)
+
 #endif
 
 #endif	/* _LINUX_INOTIFY_H */
-- 
cgit v1.2.3-59-g8ed1b


From 20dee624ca40db227aa70cb3f44d2d6cb4fdbab4 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:37 -0400
Subject: fsnotify: check to make sure all fsnotify bits are unique

This patch adds a check to make sure that all fsnotify bits are unique and we
cannot accidentally use the same bit for 2 different fsnotify event types.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 2 ++
 include/linux/fsnotify_backend.h | 9 +++++++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9810babb1a3b..076c10e959d5 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -275,6 +275,8 @@ EXPORT_SYMBOL_GPL(fsnotify);
 
 static __init int fsnotify_init(void)
 {
+	BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
+
 	return init_srcu_struct(&fsnotify_grp_srcu);
 }
 subsys_initcall(fsnotify_init);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 4b809fcd4996..a46355db1e47 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -64,6 +64,15 @@
 
 #define FS_MOVE			(FS_MOVED_FROM | FS_MOVED_TO)
 
+#define ALL_FSNOTIFY_EVENTS (FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
+			     FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN | \
+			     FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE | \
+			     FS_DELETE | FS_DELETE_SELF | FS_MOVE_SELF | \
+			     FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
+			     FS_OPEN_PERM | FS_ACCESS_PERM | FS_EXCL_UNLINK | \
+			     FS_IN_ISDIR | FS_IN_ONESHOT | FS_DN_RENAME | \
+			     FS_DN_MULTISHOT | FS_EVENT_ON_CHILD)
+
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark;
-- 
cgit v1.2.3-59-g8ed1b


From 80af2588676483ac4e998b5092e9d008dab3ab62 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:37 -0400
Subject: fanotify: groups can specify their f_flags for new fd

Currently fanotify fds opened for thier listeners are done with f_flags
equal to O_RDONLY | O_LARGEFILE.  This patch instead takes f_flags from the
fanotify_init syscall and uses those when opening files in the context of
the listener.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 6 ++----
 include/linux/fsnotify_backend.h   | 7 +++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index da01091f93eb..7182c83be90e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -81,7 +81,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
 	 * are NULL;  That's fine, just don't call dentry open */
 	if (dentry && mnt)
 		new_file = dentry_open(dentry, mnt,
-				       O_RDONLY | O_LARGEFILE | FMODE_NONOTIFY,
+				       group->fanotify_data.f_flags | FMODE_NONOTIFY,
 				       current_cred());
 	else
 		new_file = ERR_PTR(-EOVERFLOW);
@@ -625,9 +625,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	pr_debug("%s: flags=%d event_f_flags=%d\n",
 		__func__, flags, event_f_flags);
 
-	if (event_f_flags)
-		return -EINVAL;
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
@@ -645,6 +642,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	if (IS_ERR(group))
 		return PTR_ERR(group);
 
+	group->fanotify_data.f_flags = event_f_flags;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	mutex_init(&group->fanotify_data.access_mutex);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a46355db1e47..a83859d7d36e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -169,14 +169,17 @@ struct fsnotify_group {
 			struct user_struct      *user;
 		} inotify_data;
 #endif
-#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+#ifdef CONFIG_FANOTIFY
 		struct fanotify_group_private_data {
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 			/* allows a group to block waiting for a userspace response */
 			struct mutex access_mutex;
 			struct list_head access_list;
 			wait_queue_head_t access_waitq;
+#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
+			int f_flags;
 		} fanotify_data;
-#endif
+#endif /* CONFIG_FANOTIFY */
 	};
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From f70ab54cc6c3907b0727ba332b3976f80f3846d0 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:37 -0400
Subject: fsnotify: fsnotify_add_notify_event should return an event

Rather than the horrific void ** argument and such just to pass the
fanotify_merge event back to the caller of fsnotify_add_notify_event() have
those things return an event if it was different than the event suggusted to
be added.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c        | 103 ++++++++++++++++-------------------
 fs/notify/inotify/inotify_fsnotify.c |  28 +++++-----
 fs/notify/inotify/inotify_user.c     |  11 +++-
 fs/notify/notification.c             |  42 +++++++++-----
 include/linux/fsnotify_backend.h     |  12 ++--
 5 files changed, 101 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index bbcfccd4a8ea..f3c40c0e2b86 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -30,65 +30,58 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 	return false;
 }
 
-/* Note, if we return an event in *arg that a reference is being held... */
-static int fanotify_merge(struct list_head *list,
-			  struct fsnotify_event *event,
-			  void **arg)
+/* and the list better be locked by something too! */
+static struct fsnotify_event *fanotify_merge(struct list_head *list,
+					     struct fsnotify_event *event)
 {
 	struct fsnotify_event_holder *test_holder;
-	struct fsnotify_event *test_event;
+	struct fsnotify_event *test_event = NULL;
 	struct fsnotify_event *new_event;
-	struct fsnotify_event **return_event = (struct fsnotify_event **)arg;
-	int ret = 0;
 
 	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
 
-	*return_event = NULL;
-
-	/* and the list better be locked by something too! */
 
 	list_for_each_entry_reverse(test_holder, list, event_list) {
-		test_event = test_holder->event;
-		if (should_merge(test_event, event)) {
-			fsnotify_get_event(test_event);
-			*return_event = test_event;
-
-			ret = -EEXIST;
-			/* if they are exactly the same we are done */
-			if (test_event->mask == event->mask)
-				goto out;
-
-			/*
-			 * if the refcnt == 1 this is the only queue
-			 * for this event and so we can update the mask
-			 * in place.
-			 */
-			if (atomic_read(&test_event->refcnt) == 1) {
-				test_event->mask |= event->mask;
-				goto out;
-			}
-
-			/* can't allocate memory, merge was no possible */
-			new_event = fsnotify_clone_event(test_event);
-			if (unlikely(!new_event)) {
-				ret = 0;
-				goto out;
-			}
-
-			/* we didn't return the test_event, so drop that ref */
-			fsnotify_put_event(test_event);
-			/* the reference we return on new_event is from clone */
-			*return_event = new_event;
-
-			/* build new event and replace it on the list */
-			new_event->mask = (test_event->mask | event->mask);
-			fsnotify_replace_event(test_holder, new_event);
-
+		if (should_merge(test_holder->event, event)) {
+			test_event = test_holder->event;
 			break;
 		}
 	}
-out:
-	return ret;
+
+	if (!test_event)
+		return NULL;
+
+	fsnotify_get_event(test_event);
+
+	/* if they are exactly the same we are done */
+	if (test_event->mask == event->mask)
+		return test_event;
+
+	/*
+	 * if the refcnt == 2 this is the only queue
+	 * for this event and so we can update the mask
+	 * in place.
+	 */
+	if (atomic_read(&test_event->refcnt) == 2) {
+		test_event->mask |= event->mask;
+		return test_event;
+	}
+
+	new_event = fsnotify_clone_event(test_event);
+
+	/* done with test_event */
+	fsnotify_put_event(test_event);
+
+	/* couldn't allocate memory, merge was not possible */
+	if (unlikely(!new_event))
+		return ERR_PTR(-ENOMEM);
+
+	/* build new event and replace it on the list */
+	new_event->mask = (test_event->mask | event->mask);
+	fsnotify_replace_event(test_holder, new_event);
+
+	/* we hold a reference on new_event from clone_event */
+	return new_event;
 }
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -123,7 +116,7 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
 
 static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
 {
-	int ret;
+	int ret = 0;
 	struct fsnotify_event *notify_event = NULL;
 
 	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
@@ -138,13 +131,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_e
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	ret = fsnotify_add_notify_event(group, event, NULL, fanotify_merge,
-					(void **)&notify_event);
-	/* -EEXIST means this event was merged with another, not that it was an error */
-	if (ret == -EEXIST)
-		ret = 0;
-	if (ret)
-		goto out;
+	notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
+	if (IS_ERR(notify_event))
+		return PTR_ERR(notify_event);
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	if (event->mask & FAN_ALL_PERM_EVENTS) {
@@ -155,9 +144,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_e
 	}
 #endif
 
-out:
 	if (notify_event)
 		fsnotify_put_event(notify_event);
+
 	return ret;
 }
 
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 906b72761b17..73a1106b3542 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -68,13 +68,11 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 	return false;
 }
 
-static int inotify_merge(struct list_head *list,
-			 struct fsnotify_event *event,
-			 void **arg)
+static struct fsnotify_event *inotify_merge(struct list_head *list,
+					    struct fsnotify_event *event)
 {
 	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
-	int ret = 0;
 
 	/* and the list better be locked by something too */
 	spin_lock(&event->lock);
@@ -82,11 +80,13 @@ static int inotify_merge(struct list_head *list,
 	last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
 	last_event = last_holder->event;
 	if (event_compare(last_event, event))
-		ret = -EEXIST;
+		fsnotify_get_event(last_event);
+	else
+		last_event = NULL;
 
 	spin_unlock(&event->lock);
 
-	return ret;
+	return last_event;
 }
 
 static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
@@ -96,7 +96,8 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	struct inode *to_tell;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
-	int wd, ret;
+	struct fsnotify_event *added_event;
+	int wd, ret = 0;
 
 	pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
 		 event, event->to_tell, event->mask);
@@ -120,14 +121,13 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	fsn_event_priv->group = group;
 	event_priv->wd = wd;
 
-	ret = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge, NULL);
-	if (ret) {
+	added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
+	if (added_event) {
 		inotify_free_event_priv(fsn_event_priv);
-		/* EEXIST says we tail matched, EOVERFLOW isn't something
-		 * to report up the stack. */
-		if ((ret == -EEXIST) ||
-		    (ret == -EOVERFLOW))
-			ret = 0;
+		if (!IS_ERR(added_event))
+			fsnotify_put_event(added_event);
+		else
+			ret = PTR_ERR(added_event);
 	}
 
 	if (fsn_mark->mask & IN_ONESHOT)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1068e1ca9cb0..a4cd227c4c76 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -516,7 +516,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 				    struct fsnotify_group *group)
 {
 	struct inotify_inode_mark *i_mark;
-	struct fsnotify_event *ignored_event;
+	struct fsnotify_event *ignored_event, *notify_event;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
 	int ret;
@@ -538,9 +538,14 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 	fsn_event_priv->group = group;
 	event_priv->wd = i_mark->wd;
 
-	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL, NULL);
-	if (ret)
+	notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
+	if (notify_event) {
+		if (IS_ERR(notify_event))
+			ret = PTR_ERR(notify_event);
+		else
+			fsnotify_put_event(notify_event);
 		inotify_free_event_priv(fsn_event_priv);
+	}
 
 skip_send_ignore:
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index e6dde25fb99b..f39260f8f865 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -137,16 +137,14 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
  * event off the queue to deal with.  If the event is successfully added to the
  * group's notification queue, a reference is taken on event.
  */
-int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
-			      struct fsnotify_event_private_data *priv,
-			      int (*merge)(struct list_head *,
-					   struct fsnotify_event *,
-					   void **arg),
-			      void **arg)
+struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
+						 struct fsnotify_event_private_data *priv,
+						 struct fsnotify_event *(*merge)(struct list_head *,
+										 struct fsnotify_event *))
 {
+	struct fsnotify_event *return_event = NULL;
 	struct fsnotify_event_holder *holder = NULL;
 	struct list_head *list = &group->notification_list;
-	int rc = 0;
 
 	pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
 
@@ -162,27 +160,37 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
 alloc_holder:
 		holder = fsnotify_alloc_event_holder();
 		if (!holder)
-			return -ENOMEM;
+			return ERR_PTR(-ENOMEM);
 	}
 
 	mutex_lock(&group->notification_mutex);
 
 	if (group->q_len >= group->max_events) {
 		event = q_overflow_event;
-		rc = -EOVERFLOW;
+
+		/*
+		 * we need to return the overflow event
+		 * which means we need a ref
+		 */
+		fsnotify_get_event(event);
+		return_event = event;
+
 		/* sorry, no private data on the overflow event */
 		priv = NULL;
 	}
 
 	if (!list_empty(list) && merge) {
-		int ret;
+		struct fsnotify_event *tmp;
 
-		ret = merge(list, event, arg);
-		if (ret) {
+		tmp = merge(list, event);
+		if (tmp) {
 			mutex_unlock(&group->notification_mutex);
+
+			if (return_event)
+				fsnotify_put_event(return_event);
 			if (holder != &event->holder)
 				fsnotify_destroy_event_holder(holder);
-			return ret;
+			return tmp;
 		}
 	}
 
@@ -197,6 +205,12 @@ alloc_holder:
 		 * event holder was used, go back and get a new one */
 		spin_unlock(&event->lock);
 		mutex_unlock(&group->notification_mutex);
+
+		if (return_event) {
+			fsnotify_put_event(return_event);
+			return_event = NULL;
+		}
+
 		goto alloc_holder;
 	}
 
@@ -211,7 +225,7 @@ alloc_holder:
 	mutex_unlock(&group->notification_mutex);
 
 	wake_up(&group->notification_waitq);
-	return rc;
+	return return_event;
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a83859d7d36e..564b5ea4a831 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -379,13 +379,11 @@ extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struc
 									   struct fsnotify_event *event);
 
 /* attach the event to the group notification queue */
-extern int fsnotify_add_notify_event(struct fsnotify_group *group,
-				     struct fsnotify_event *event,
-				     struct fsnotify_event_private_data *priv,
-				     int (*merge)(struct list_head *,
-						  struct fsnotify_event *,
-						  void **),
-				     void **arg);
+extern struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group,
+							struct fsnotify_event *event,
+							struct fsnotify_event_private_data *priv,
+							struct fsnotify_event *(*merge)(struct list_head *,
+											struct fsnotify_event *));
 /* true if the group notification queue is empty */
 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
-- 
cgit v1.2.3-59-g8ed1b


From 3bcf3860a4ff9bbc522820b4b765e65e4deceb3e Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:37 -0400
Subject: fsnotify: store struct file not struct path

Al explains that calling dentry_open() with a mnt/dentry pair is only
garunteed to be safe if they are already used in an open struct file.  To
make sure this is the case don't store and use a struct path in fsnotify,
always use a struct file.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c        |  8 ++++----
 fs/notify/fanotify/fanotify_user.c   |  6 +++---
 fs/notify/fsnotify.c                 | 16 ++++++++--------
 fs/notify/inotify/inotify_fsnotify.c | 12 ++++++------
 fs/notify/notification.c             | 20 +++++++++----------
 include/linux/fsnotify.h             | 37 ++++++++++++++++--------------------
 include/linux/fsnotify_backend.h     | 16 ++++++++--------
 kernel/audit_watch.c                 |  4 ++--
 8 files changed, 56 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f3c40c0e2b86..c2a3029052bc 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -17,9 +17,9 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 	    old->data_type == new->data_type &&
 	    old->tgid == new->tgid) {
 		switch (old->data_type) {
-		case (FSNOTIFY_EVENT_PATH):
-			if ((old->path.mnt == new->path.mnt) &&
-			    (old->path.dentry == new->path.dentry))
+		case (FSNOTIFY_EVENT_FILE):
+			if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
+			    (old->file->f_path.dentry == new->file->f_path.dentry))
 				return true;
 		case (FSNOTIFY_EVENT_NONE):
 			return true;
@@ -226,7 +226,7 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, struct inod
 		return false;
 
 	/* if we don't have enough info to send an event to userspace say no */
-	if (data_type != FSNOTIFY_EVENT_PATH)
+	if (data_type != FSNOTIFY_EVENT_FILE)
 		return false;
 
 	if (mnt)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 7182c83be90e..50cea74bf1c8 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -65,7 +65,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
 	if (client_fd < 0)
 		return client_fd;
 
-	if (event->data_type != FSNOTIFY_EVENT_PATH) {
+	if (event->data_type != FSNOTIFY_EVENT_FILE) {
 		WARN_ON(1);
 		put_unused_fd(client_fd);
 		return -EINVAL;
@@ -75,8 +75,8 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
 	 * we need a new file handle for the userspace program so it can read even if it was
 	 * originally opened O_WRONLY.
 	 */
-	dentry = dget(event->path.dentry);
-	mnt = mntget(event->path.mnt);
+	dentry = dget(event->file->f_path.dentry);
+	mnt = mntget(event->file->f_path.mnt);
 	/* it's possible this event was an overflow event.  in that case dentry and mnt
 	 * are NULL;  That's fine, just don't call dentry open */
 	if (dentry && mnt)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 72aae4045314..4788c866473a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -84,7 +84,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
@@ -92,7 +92,7 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 	bool should_update_children = false;
 
 	if (!dentry)
-		dentry = path->dentry;
+		dentry = file->f_path.dentry;
 
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 		return;
@@ -124,8 +124,8 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 		 * specifies these are events which came from a child. */
 		mask |= FS_EVENT_ON_CHILD;
 
-		if (path)
-			fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
+		if (file)
+			fsnotify(p_inode, mask, file, FSNOTIFY_EVENT_FILE,
 				 dentry->d_name.name, 0);
 		else
 			fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
@@ -154,10 +154,10 @@ void __fsnotify_flush_ignored_mask(struct inode *inode, void *data, int data_is)
 		spin_unlock(&inode->i_lock);
 	}
 
-	if (data_is == FSNOTIFY_EVENT_PATH) {
+	if (data_is == FSNOTIFY_EVENT_FILE) {
 		struct vfsmount *mnt;
 
-		mnt = ((struct path *)data)->mnt;
+		mnt = ((struct file *)data)->f_path.mnt;
 		if (mnt && !hlist_empty(&mnt->mnt_fsnotify_marks)) {
 			spin_lock(&mnt->mnt_root->d_lock);
 			hlist_for_each_entry(mark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
@@ -228,8 +228,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	    !(test_mask & fsnotify_vfsmount_mask))
 		return 0;
  
-	if (data_is == FSNOTIFY_EVENT_PATH)
-		mnt = ((struct path *)data)->mnt;
+	if (data_is == FSNOTIFY_EVENT_FILE)
+		mnt = ((struct file *)data)->f_path.mnt;
 
 	/* if this inode's directed listeners don't care and nothing on the vfsmount
 	 * listeners list cares, nothing to do */
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 73a1106b3542..3c506e0364cc 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -52,9 +52,9 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 			    !strcmp(old->file_name, new->file_name))
 				return true;
 			break;
-		case (FSNOTIFY_EVENT_PATH):
-			if ((old->path.mnt == new->path.mnt) &&
-			    (old->path.dentry == new->path.dentry))
+		case (FSNOTIFY_EVENT_FILE):
+			if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
+			    (old->file->f_path.dentry == new->file->f_path.dentry))
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_NONE):
@@ -165,10 +165,10 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 	send = (fsn_mark->mask & mask);
 
 	if (send && (fsn_mark->mask & FS_EXCL_UNLINK) &&
-	    (data_type == FSNOTIFY_EVENT_PATH)) {
-		struct path *path  = data;
+	    (data_type == FSNOTIFY_EVENT_FILE)) {
+		struct file *file  = data;
 
-		if (d_unlinked(path->dentry))
+		if (d_unlinked(file->f_path.dentry))
 			send = false;
 	}
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index f39260f8f865..c106cdd7ff5e 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -31,6 +31,7 @@
  * allocated and used.
  */
 
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -89,8 +90,8 @@ void fsnotify_put_event(struct fsnotify_event *event)
 	if (atomic_dec_and_test(&event->refcnt)) {
 		pr_debug("%s: event=%p\n", __func__, event);
 
-		if (event->data_type == FSNOTIFY_EVENT_PATH)
-			path_put(&event->path);
+		if (event->data_type == FSNOTIFY_EVENT_FILE)
+			fput(event->file);
 
 		BUG_ON(!list_empty(&event->private_data_list));
 
@@ -375,8 +376,8 @@ struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
 		}
 	}
 	event->tgid = get_pid(old_event->tgid);
-	if (event->data_type == FSNOTIFY_EVENT_PATH)
-		path_get(&event->path);
+	if (event->data_type == FSNOTIFY_EVENT_FILE)
+		get_file(event->file);
 
 	return event;
 }
@@ -423,11 +424,9 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 	event->data_type = data_type;
 
 	switch (data_type) {
-	case FSNOTIFY_EVENT_PATH: {
-		struct path *path = data;
-		event->path.dentry = path->dentry;
-		event->path.mnt = path->mnt;
-		path_get(&event->path);
+	case FSNOTIFY_EVENT_FILE: {
+		event->file = data;
+		get_file(event->file);
 		break;
 	}
 	case FSNOTIFY_EVENT_INODE:
@@ -435,8 +434,7 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		break;
 	case FSNOTIFY_EVENT_NONE:
 		event->inode = NULL;
-		event->path.dentry = NULL;
-		event->path.mnt = NULL;
+		event->file = NULL;
 		break;
 	default:
 		BUG();
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 59d0df43ff9d..e4e2204187ee 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -26,19 +26,18 @@ static inline void fsnotify_d_instantiate(struct dentry *dentry,
 }
 
 /* Notify this dentry's parent about a child's events. */
-static inline void fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+static inline void fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 {
 	if (!dentry)
-		dentry = path->dentry;
+		dentry = file->f_path.dentry;
 
-	__fsnotify_parent(path, dentry, mask);
+	__fsnotify_parent(file, dentry, mask);
 }
 
 /* simple call site for access decisions */
 static inline int fsnotify_perm(struct file *file, int mask)
 {
-	struct path *path = &file->f_path;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	__u32 fsnotify_mask = 0;
 
 	if (file->f_mode & FMODE_NONOTIFY)
@@ -52,7 +51,7 @@ static inline int fsnotify_perm(struct file *file, int mask)
 	else
 		BUG();
 
-	return fsnotify(inode, fsnotify_mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+	return fsnotify(inode, fsnotify_mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
 /*
@@ -187,16 +186,15 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
  */
 static inline void fsnotify_access(struct file *file)
 {
-	struct path *path = &file->f_path;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	__u32 mask = FS_ACCESS;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
-		fsnotify_parent(path, NULL, mask);
-		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+		fsnotify_parent(file, NULL, mask);
+		fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 	}
 }
 
@@ -205,16 +203,15 @@ static inline void fsnotify_access(struct file *file)
  */
 static inline void fsnotify_modify(struct file *file)
 {
-	struct path *path = &file->f_path;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	__u32 mask = FS_MODIFY;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
-		fsnotify_parent(path, NULL, mask);
-		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+		fsnotify_parent(file, NULL, mask);
+		fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 	}
 }
 
@@ -223,16 +220,15 @@ static inline void fsnotify_modify(struct file *file)
  */
 static inline void fsnotify_open(struct file *file)
 {
-	struct path *path = &file->f_path;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
-		fsnotify_parent(path, NULL, mask);
-		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+		fsnotify_parent(file, NULL, mask);
+		fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 	}
 }
 
@@ -241,7 +237,6 @@ static inline void fsnotify_open(struct file *file)
  */
 static inline void fsnotify_close(struct file *file)
 {
-	struct path *path = &file->f_path;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	fmode_t mode = file->f_mode;
 	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
@@ -250,8 +245,8 @@ static inline void fsnotify_close(struct file *file)
 		mask |= FS_IN_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
-		fsnotify_parent(path, NULL, mask);
-		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+		fsnotify_parent(file, NULL, mask);
+		fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 564b5ea4a831..3410d388163e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -223,20 +223,20 @@ struct fsnotify_event {
 	/* to_tell may ONLY be dereferenced during handle_event(). */
 	struct inode *to_tell;	/* either the inode the event happened to or its parent */
 	/*
-	 * depending on the event type we should have either a path or inode
-	 * We hold a reference on path, but NOT on inode.  Since we have the ref on
-	 * the path, it may be dereferenced at any point during this object's
+	 * depending on the event type we should have either a file or inode
+	 * We hold a reference on file, but NOT on inode.  Since we have the ref on
+	 * the file, it may be dereferenced at any point during this object's
 	 * lifetime.  That reference is dropped when this object's refcnt hits
-	 * 0.  If this event contains an inode instead of a path, the inode may
+	 * 0.  If this event contains an inode instead of a file, the inode may
 	 * ONLY be used during handle_event().
 	 */
 	union {
-		struct path path;
+		struct file *file;
 		struct inode *inode;
 	};
 /* when calling fsnotify tell it if the data is a path or inode */
 #define FSNOTIFY_EVENT_NONE	0
-#define FSNOTIFY_EVENT_PATH	1
+#define FSNOTIFY_EVENT_FILE	1
 #define FSNOTIFY_EVENT_INODE	2
 	int data_type;		/* which of the above union we have */
 	atomic_t refcnt;	/* how many groups still are using/need to send this event */
@@ -311,7 +311,7 @@ struct fsnotify_mark {
 /* main fsnotify call to send events */
 extern int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		    const unsigned char *name, u32 cookie);
-extern void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask);
+extern void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
 extern u32 fsnotify_get_cookie(void);
@@ -444,7 +444,7 @@ static inline int fsnotify(struct inode *to_tell, __u32 mask, void *data, int da
 	return 0;
 }
 
-static inline void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+static inline void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 {}
 
 static inline void __fsnotify_inode_delete(struct inode *inode)
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 7499397a6100..b955a22d8ff1 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -545,8 +545,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotif
 		return 0;
 
 	switch (event->data_type) {
-	case (FSNOTIFY_EVENT_PATH):
-		inode = event->path.dentry->d_inode;
+	case (FSNOTIFY_EVENT_FILE):
+		inode = event->file->f_path.dentry->d_inode;
 		break;
 	case (FSNOTIFY_EVENT_INODE):
 		inode = event->inode;
-- 
cgit v1.2.3-59-g8ed1b


From 700307a29ad61090dcf1d45f8f4a135f5e9211ae Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:38 -0400
Subject: fsnotify: use an explicit flag to indicate fsnotify_destroy_mark has
 been called

Currently fsnotify check is mark->group is NULL to decide if
fsnotify_destroy_mark() has already been called or not.  With the upcoming
rcu work it is a heck of a lot easier to use an explicit flag than worry
about group being set to NULL.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inode_mark.c           |  2 +-
 fs/notify/mark.c                 | 11 +++++++----
 fs/notify/vfsmount_mark.c        |  2 +-
 include/linux/fsnotify_backend.h |  1 +
 4 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 455cb41c729b..37b460f302b7 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -187,7 +187,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 	struct hlist_node *node, *last = NULL;
 	int ret = 0;
 
-	mark->flags = FSNOTIFY_MARK_FLAG_INODE;
+	mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
 
 	assert_spin_locked(&mark->lock);
 	assert_spin_locked(&group->mark_lock);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 8f3b0e7a543d..69c5a166930c 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -121,12 +121,14 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 
 	group = mark->group;
 
-	/* if !group something else already marked this to die */
-	if (!group) {
+	/* something else already called this function on this mark */
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
 		spin_unlock(&mark->lock);
 		return;
 	}
 
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+
 	/* 1 from caller and 1 for being on i_list/g_list */
 	BUG_ON(atomic_read(&mark->refcnt) < 2);
 
@@ -141,7 +143,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 		BUG();
 
 	list_del_init(&mark->g_list);
-	mark->group = NULL;
 
 	fsnotify_put_mark(mark); /* for i_list and g_list */
 
@@ -229,6 +230,8 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
 	spin_lock(&mark->lock);
 	spin_lock(&group->mark_lock);
 
+	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+
 	mark->group = group;
 	list_add(&mark->g_list, &group->marks_list);
 	atomic_inc(&group->num_marks);
@@ -258,7 +261,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
 
 	return ret;
 err:
-	mark->group = NULL;
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
 	list_del_init(&mark->g_list);
 	atomic_dec(&group->num_marks);
 	fsnotify_put_mark(mark);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index b7ae64030021..56772b578fbd 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -145,7 +145,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 	struct hlist_node *node, *last = NULL;
 	int ret = 0;
 
-	mark->flags = FSNOTIFY_MARK_FLAG_VFSMOUNT;
+	mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
 
 	assert_spin_locked(&mark->lock);
 	assert_spin_locked(&group->mark_lock);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 3410d388163e..8e24cdf72928 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -300,6 +300,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_VFSMOUNT		0x02
 #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED	0x04
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x08
+#define FSNOTIFY_MARK_FLAG_ALIVE		0x10
 	unsigned int flags;		/* vfsmount or inode mark? */
 	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
 };
-- 
cgit v1.2.3-59-g8ed1b


From 75c1be487a690db43da2c1234fcacd84c982803c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:38 -0400
Subject: fsnotify: srcu to protect read side of inode and vfsmount locks

Currently reading the inode->i_fsnotify_marks or
vfsmount->mnt_fsnotify_marks lists are protected by a spinlock on both the
read and the write side.  This patch protects the read side of those lists
with a new single srcu.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             | 69 ++++++++++++++++++++++++++--------------
 fs/notify/fsnotify.h             |  5 +--
 fs/notify/group.c                | 16 +++-------
 fs/notify/mark.c                 | 60 ++++++++++++++++++++++++++++++++--
 include/linux/fsnotify_backend.h |  1 +
 5 files changed, 111 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4788c866473a..4678b416241e 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -144,14 +144,15 @@ void __fsnotify_flush_ignored_mask(struct inode *inode, void *data, int data_is)
 {
 	struct fsnotify_mark *mark;
 	struct hlist_node *node;
+	int idx;
+
+	idx = srcu_read_lock(&fsnotify_mark_srcu);
 
 	if (!hlist_empty(&inode->i_fsnotify_marks)) {
-		spin_lock(&inode->i_lock);
-		hlist_for_each_entry(mark, node, &inode->i_fsnotify_marks, i.i_list) {
+		hlist_for_each_entry_rcu(mark, node, &inode->i_fsnotify_marks, i.i_list) {
 			if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
 				mark->ignored_mask = 0;
 		}
-		spin_unlock(&inode->i_lock);
 	}
 
 	if (data_is == FSNOTIFY_EVENT_FILE) {
@@ -159,14 +160,14 @@ void __fsnotify_flush_ignored_mask(struct inode *inode, void *data, int data_is)
 
 		mnt = ((struct file *)data)->f_path.mnt;
 		if (mnt && !hlist_empty(&mnt->mnt_fsnotify_marks)) {
-			spin_lock(&mnt->mnt_root->d_lock);
-			hlist_for_each_entry(mark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
+			hlist_for_each_entry_rcu(mark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
 				if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
 					mark->ignored_mask = 0;
 			}
-			spin_unlock(&mnt->mnt_root->d_lock);
 		}
 	}
+
+	srcu_read_unlock(&fsnotify_mark_srcu, idx);
 }
 
 static int send_to_group(struct fsnotify_group *group, struct inode *to_tell,
@@ -208,8 +209,10 @@ static bool needed_by_vfsmount(__u32 test_mask, struct vfsmount *mnt)
 int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	     const unsigned char *file_name, u32 cookie)
 {
+	struct fsnotify_mark *mark;
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
+	struct hlist_node *node;
 	struct vfsmount *mnt = NULL;
 	int idx, ret = 0;
 	/* global tests shouldn't care about events on child only the specific event */
@@ -237,35 +240,47 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	    !needed_by_vfsmount(test_mask, mnt))
 		return 0;
 
-	/*
-	 * SRCU!!  the groups list is very very much read only and the path is
-	 * very hot.  The VAST majority of events are not going to need to do
-	 * anything other than walk the list so it's crazy to pre-allocate.
-	 */
-	idx = srcu_read_lock(&fsnotify_grp_srcu);
+	idx = srcu_read_lock(&fsnotify_mark_srcu);
 
 	if (test_mask & to_tell->i_fsnotify_mask) {
-		list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list) {
-			if (test_mask & group->mask) {
-				ret = send_to_group(group, to_tell, NULL, mask, data, data_is,
-						    cookie, file_name, &event);
+		hlist_for_each_entry_rcu(mark, node, &to_tell->i_fsnotify_marks, i.i_list) {
+
+			pr_debug("%s: inode_loop: mark=%p mark->mask=%x mark->ignored_mask=%x\n",
+				 __func__, mark, mark->mask, mark->ignored_mask);
+
+			if (test_mask & mark->mask & ~mark->ignored_mask) {
+				group = mark->group;
+				if (!group)
+					continue;
+				ret = send_to_group(group, to_tell, NULL, mask,
+						    data, data_is, cookie, file_name,
+						    &event);
 				if (ret)
 					goto out;
 			}
 		}
 	}
-	if (needed_by_vfsmount(test_mask, mnt)) {
-		list_for_each_entry_rcu(group, &fsnotify_vfsmount_groups, vfsmount_group_list) {
-			if (test_mask & group->mask) {
-				ret = send_to_group(group, to_tell, mnt, mask, data, data_is,
-						    cookie, file_name, &event);
+
+	if (mnt && (test_mask & mnt->mnt_fsnotify_mask)) {
+		hlist_for_each_entry_rcu(mark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
+
+			pr_debug("%s: mnt_loop: mark=%p mark->mask=%x mark->ignored_mask=%x\n",
+				 __func__, mark, mark->mask, mark->ignored_mask);
+
+			if (test_mask & mark->mask & ~mark->ignored_mask)  {
+				group = mark->group;
+				if (!group)
+					continue;
+				ret = send_to_group(group, to_tell, mnt, mask,
+						    data, data_is, cookie, file_name,
+						    &event);
 				if (ret)
 					goto out;
 			}
 		}
 	}
 out:
-	srcu_read_unlock(&fsnotify_grp_srcu, idx);
+	srcu_read_unlock(&fsnotify_mark_srcu, idx);
 	/*
 	 * fsnotify_create_event() took a reference so the event can't be cleaned
 	 * up while we are still trying to add it to lists, drop that one.
@@ -279,8 +294,14 @@ EXPORT_SYMBOL_GPL(fsnotify);
 
 static __init int fsnotify_init(void)
 {
+	int ret;
+
 	BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
 
-	return init_srcu_struct(&fsnotify_grp_srcu);
+	ret = init_srcu_struct(&fsnotify_mark_srcu);
+	if (ret)
+		panic("initializing fsnotify_mark_srcu");
+
+	return 0;
 }
-subsys_initcall(fsnotify_init);
+core_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 1be54f6f9e7d..7eed86f942ba 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,8 +6,6 @@
 #include <linux/srcu.h>
 #include <linux/types.h>
 
-/* protects reads of fsnotify_groups */
-extern struct srcu_struct fsnotify_grp_srcu;
 /* all groups which receive inode fsnotify events */
 extern struct list_head fsnotify_inode_groups;
 /* all groups which receive vfsmount fsnotify events */
@@ -20,6 +18,9 @@ extern __u32 fsnotify_vfsmount_mask;
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
+/* protects reads of inode and vfsmount marks list */
+extern struct srcu_struct fsnotify_mark_srcu;
+
 extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
 						__u32 mask);
 /* add a mark to an inode */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 7ac65ed4735b..48d3a6d6e47a 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -30,8 +30,6 @@
 
 /* protects writes to fsnotify_groups and fsnotify_mask */
 static DEFINE_MUTEX(fsnotify_grp_mutex);
-/* protects reads while running the fsnotify_groups list */
-struct srcu_struct fsnotify_grp_srcu;
 /* all groups registered to receive inode filesystem notifications */
 LIST_HEAD(fsnotify_inode_groups);
 /* all groups registered to receive mount point filesystem notifications */
@@ -50,18 +48,17 @@ void fsnotify_recalc_global_mask(void)
 	struct fsnotify_group *group;
 	__u32 inode_mask = 0;
 	__u32 vfsmount_mask = 0;
-	int idx;
 
-	idx = srcu_read_lock(&fsnotify_grp_srcu);
+	mutex_lock(&fsnotify_grp_mutex);
 	list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list)
 		inode_mask |= group->mask;
 	list_for_each_entry_rcu(group, &fsnotify_vfsmount_groups, vfsmount_group_list)
 		vfsmount_mask |= group->mask;
-		
-	srcu_read_unlock(&fsnotify_grp_srcu, idx);
 
 	fsnotify_inode_mask = inode_mask;
 	fsnotify_vfsmount_mask = vfsmount_mask;
+
+	mutex_unlock(&fsnotify_grp_mutex);
 }
 
 /*
@@ -168,6 +165,8 @@ static void fsnotify_destroy_group(struct fsnotify_group *group)
 	/* clear all inode marks for this group */
 	fsnotify_clear_marks_by_group(group);
 
+	synchronize_srcu(&fsnotify_mark_srcu);
+
 	/* past the point of no return, matches the initial value of 1 */
 	if (atomic_dec_and_test(&group->num_marks))
 		fsnotify_final_destroy_group(group);
@@ -216,12 +215,7 @@ void fsnotify_put_group(struct fsnotify_group *group)
 	 */
 	__fsnotify_evict_group(group);
 
-	/*
-	 * now it's off the list, so the only thing we might care about is
-	 * srcu access....
-	 */
 	mutex_unlock(&fsnotify_grp_mutex);
-	synchronize_srcu(&fsnotify_grp_srcu);
 
 	/* and now it is really dead. _Nothing_ could be seeing it */
 	fsnotify_recalc_global_mask();
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 69c5a166930c..41f3990f900b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -85,10 +85,12 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/srcu.h>
 #include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
@@ -96,6 +98,11 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
+struct srcu_struct fsnotify_mark_srcu;
+static DEFINE_SPINLOCK(destroy_lock);
+static LIST_HEAD(destroy_list);
+static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
+
 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
 	atomic_inc(&mark->refcnt);
@@ -144,11 +151,14 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 
 	list_del_init(&mark->g_list);
 
-	fsnotify_put_mark(mark); /* for i_list and g_list */
-
 	spin_unlock(&group->mark_lock);
 	spin_unlock(&mark->lock);
 
+	spin_lock(&destroy_lock);
+	list_add(&mark->destroy_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	wake_up(&destroy_waitq);
+
 	/*
 	 * Some groups like to know that marks are being freed.  This is a
 	 * callback to the group function to let it know that this mark
@@ -263,12 +273,17 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
 err:
 	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
 	list_del_init(&mark->g_list);
+	mark->group = NULL;
 	atomic_dec(&group->num_marks);
-	fsnotify_put_mark(mark);
 
 	spin_unlock(&group->mark_lock);
 	spin_unlock(&mark->lock);
 
+	spin_lock(&destroy_lock);
+	list_add(&mark->destroy_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	wake_up(&destroy_waitq);
+
 	return ret;
 }
 
@@ -326,3 +341,42 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 	atomic_set(&mark->refcnt, 1);
 	mark->free_mark = free_mark;
 }
+
+static int fsnotify_mark_destroy(void *ignored)
+{
+	struct fsnotify_mark *mark, *next;
+	LIST_HEAD(private_destroy_list);
+
+	for (;;) {
+		spin_lock(&destroy_lock);
+		list_for_each_entry_safe(mark, next, &destroy_list, destroy_list) {
+			list_del(&mark->destroy_list);
+			list_add(&mark->destroy_list, &private_destroy_list);
+		}
+		spin_unlock(&destroy_lock);
+
+		synchronize_srcu(&fsnotify_mark_srcu);
+
+		list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
+			list_del_init(&mark->destroy_list);
+			fsnotify_put_mark(mark);
+		}
+
+		wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
+	}
+
+	return 0;
+}
+
+static int __init fsnotify_mark_init(void)
+{
+	struct task_struct *thread;
+
+	thread = kthread_run(fsnotify_mark_destroy, NULL,
+			     "fsnotify_mark");
+	if (IS_ERR(thread))
+		panic("unable to start fsnotify mark destruction thread.");
+
+	return 0;
+}
+device_initcall(fsnotify_mark_init);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 8e24cdf72928..84159390969f 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -302,6 +302,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x08
 #define FSNOTIFY_MARK_FLAG_ALIVE		0x10
 	unsigned int flags;		/* vfsmount or inode mark? */
+	struct list_head destroy_list;
 	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 3a9b16b407f10b2a771bcae13fb5791e527d6bcf Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:38 -0400
Subject: fsnotify: send fsnotify_mark to groups in event handling functions

With the change of fsnotify to use srcu walking the marks list instead of
walking the global groups list we now know the mark in question.  The code can
send the mark to the group's handling functions and the groups won't have to
find those marks themselves.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          |  4 +++-
 fs/notify/fanotify/fanotify.c        |  8 +++++---
 fs/notify/fsnotify.c                 | 19 ++++++++++---------
 fs/notify/inotify/inotify_fsnotify.c |  8 +++++---
 include/linux/fsnotify_backend.h     |  7 ++++---
 kernel/audit_tree.c                  |  8 +++++---
 kernel/audit_watch.c                 |  8 +++++---
 7 files changed, 37 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 6624c2ee8786..2cae9be120db 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -83,6 +83,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
  * events.
  */
 static int dnotify_handle_event(struct fsnotify_group *group,
+				struct fsnotify_mark *mark,
 				struct fsnotify_event *event)
 {
 	struct fsnotify_mark *fsn_mark = NULL;
@@ -130,7 +131,8 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
 				      struct inode *inode, struct vfsmount *mnt,
-				      __u32 mask, void *data, int data_type)
+				      struct fsnotify_mark *mark, __u32 mask,
+				      void *data, int data_type)
 {
 	struct fsnotify_mark *fsn_mark;
 	bool send;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index c2a3029052bc..abfba45abe2c 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -114,7 +114,9 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
 }
 #endif
 
-static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+static int fanotify_handle_event(struct fsnotify_group *group,
+				 struct fsnotify_mark *mark,
+				 struct fsnotify_event *event)
 {
 	int ret = 0;
 	struct fsnotify_event *notify_event = NULL;
@@ -214,8 +216,8 @@ static bool should_send_inode_event(struct fsnotify_group *group, struct inode *
 }
 
 static bool fanotify_should_send_event(struct fsnotify_group *group, struct inode *to_tell,
-				       struct vfsmount *mnt, __u32 mask, void *data,
-				       int data_type)
+				       struct vfsmount *mnt, struct fsnotify_mark *mark,
+				       __u32 mask, void *data, int data_type)
 {
 	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x data=%p data_type=%d\n",
 		 __func__, group, to_tell, mnt, mask, data, data_type);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4678b416241e..59d639996cad 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -171,15 +171,16 @@ void __fsnotify_flush_ignored_mask(struct inode *inode, void *data, int data_is)
 }
 
 static int send_to_group(struct fsnotify_group *group, struct inode *to_tell,
-			 struct vfsmount *mnt, __u32 mask, void *data,
-			 int data_is, u32 cookie, const unsigned char *file_name,
+			 struct vfsmount *mnt, struct fsnotify_mark *mark,
+			 __u32 mask, void *data, int data_is, u32 cookie,
+			 const unsigned char *file_name,
 			 struct fsnotify_event **event)
 {
-	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x data=%p data_is=%d"
-		 " cookie=%d event=%p\n", __func__, group, to_tell, mnt,
-		 mask, data, data_is, cookie, *event);
+	pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
+		 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
+		 mnt, mark, mask, data, data_is, cookie, *event);
 
-	if (!group->ops->should_send_event(group, to_tell, mnt, mask,
+	if (!group->ops->should_send_event(group, to_tell, mnt, mark, mask,
 					   data, data_is))
 		return 0;
 	if (!*event) {
@@ -189,7 +190,7 @@ static int send_to_group(struct fsnotify_group *group, struct inode *to_tell,
 		if (!*event)
 			return -ENOMEM;
 	}
-	return group->ops->handle_event(group, *event);
+	return group->ops->handle_event(group, mark, *event);
 }
 
 static bool needed_by_vfsmount(__u32 test_mask, struct vfsmount *mnt)
@@ -252,7 +253,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 				group = mark->group;
 				if (!group)
 					continue;
-				ret = send_to_group(group, to_tell, NULL, mask,
+				ret = send_to_group(group, to_tell, NULL, mark, mask,
 						    data, data_is, cookie, file_name,
 						    &event);
 				if (ret)
@@ -271,7 +272,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 				group = mark->group;
 				if (!group)
 					continue;
-				ret = send_to_group(group, to_tell, mnt, mask,
+				ret = send_to_group(group, to_tell, mnt, mark, mask,
 						    data, data_is, cookie, file_name,
 						    &event);
 				if (ret)
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 3c506e0364cc..dbd76bbb3e21 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -89,7 +89,9 @@ static struct fsnotify_event *inotify_merge(struct list_head *list,
 	return last_event;
 }
 
-static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+static int inotify_handle_event(struct fsnotify_group *group,
+				struct fsnotify_mark *mark,
+				struct fsnotify_event *event)
 {
 	struct fsnotify_mark *fsn_mark;
 	struct inotify_inode_mark *i_mark;
@@ -148,8 +150,8 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
 }
 
 static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
-				      struct vfsmount *mnt, __u32 mask, void *data,
-				      int data_type)
+				      struct vfsmount *mnt, struct fsnotify_mark *mark,
+				      __u32 mask, void *data, int data_type)
 {
 	struct fsnotify_mark *fsn_mark;
 	bool send;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 84159390969f..225dc0c3a48c 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -92,9 +92,10 @@ struct fsnotify_event_private_data;
  */
 struct fsnotify_ops {
 	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
-				  struct vfsmount *mnt, __u32 mask, void *data,
-				  int data_type);
-	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
+				  struct vfsmount *mnt, struct fsnotify_mark *mark,
+				  __u32 mask, void *data, int data_type);
+	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_mark *mark,
+			    struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
 	void (*free_event_priv)(struct fsnotify_event_private_data *priv);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index cfb97d752a61..584b94360217 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -903,7 +903,9 @@ static void evict_chunk(struct audit_chunk *chunk)
 	mutex_unlock(&audit_filter_mutex);
 }
 
-static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+static int audit_tree_handle_event(struct fsnotify_group *group,
+				   struct fsnotify_mark *mark,
+				   struct fsnotify_event *event)
 {
 	BUG();
 	return -EOPNOTSUPP;
@@ -918,8 +920,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 }
 
 static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
-				  struct vfsmount *mnt, __u32 mask, void *data,
-				  int data_type)
+				  struct vfsmount *mnt, struct fsnotify_mark *mark,
+				  __u32 mask, void *data, int data_type)
 {
 	return 0;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index b955a22d8ff1..4d5ea0319a6c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -511,8 +511,8 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 }
 
 static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
-					  struct vfsmount *mnt, __u32 mask, void *data,
-					  int data_type)
+					  struct vfsmount *mnt, struct fsnotify_mark *mark,
+					  __u32 mask, void *data, int data_type)
 {
 	struct fsnotify_mark *entry;
 	bool send;
@@ -531,7 +531,9 @@ static bool audit_watch_should_send_event(struct fsnotify_group *group, struct i
 }
 
 /* Update watch data in audit rules based on fsnotify events. */
-static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+static int audit_watch_handle_event(struct fsnotify_group *group,
+				    struct fsnotify_mark *mark,
+				    struct fsnotify_event *event)
 {
 	struct inode *inode;
 	__u32 mask = event->mask;
-- 
cgit v1.2.3-59-g8ed1b


From 03930979afa63e079e9aefd4d3dd429240711027 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:39 -0400
Subject: fsnotify: remove the global masks

Because we walk the object->fsnotify_marks list instead of the global
fsnotify groups list we don't need the fsnotify_inode_mask and
fsnotify_vfsmount_mask as these were simply shortcuts in fsnotify() for
performance.  They are now extra checks, rip them out.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             |  5 -----
 fs/notify/fsnotify.h             |  4 ----
 fs/notify/group.c                | 39 ++-------------------------------------
 include/linux/fsnotify_backend.h |  2 --
 4 files changed, 2 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 53b31f46d698..9ba29ee747cf 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -227,11 +227,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	if (mask & FS_MODIFY)
 		__fsnotify_flush_ignored_mask(to_tell, data, data_is);
 
-	/* if none of the directed listeners or vfsmount listeners care */
-	if (!(test_mask & fsnotify_inode_mask) &&
-	    !(test_mask & fsnotify_vfsmount_mask))
-		return 0;
- 
 	if (data_is == FSNOTIFY_EVENT_FILE)
 		mnt = ((struct file *)data)->f_path.mnt;
 
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 7eed86f942ba..b41dbf5a125c 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -10,10 +10,6 @@
 extern struct list_head fsnotify_inode_groups;
 /* all groups which receive vfsmount fsnotify events */
 extern struct list_head fsnotify_vfsmount_groups;
-/* all bitwise OR of all event types (FS_*) for all fsnotify_inode_groups */
-extern __u32 fsnotify_inode_mask;
-/* all bitwise OR of all event types (FS_*) for all fsnotify_vfsmount_groups */
-extern __u32 fsnotify_vfsmount_mask;
 
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 48d3a6d6e47a..8da532dd6026 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -34,54 +34,21 @@ static DEFINE_MUTEX(fsnotify_grp_mutex);
 LIST_HEAD(fsnotify_inode_groups);
 /* all groups registered to receive mount point filesystem notifications */
 LIST_HEAD(fsnotify_vfsmount_groups);
-/* bitwise OR of all events (FS_*) interesting to some group on this system */
-__u32 fsnotify_inode_mask;
-/* bitwise OR of all events (FS_*) interesting to some group on this system */
-__u32 fsnotify_vfsmount_mask;
-
-/*
- * When a new group registers or changes it's set of interesting events
- * this function updates the fsnotify_mask to contain all interesting events
- */
-void fsnotify_recalc_global_mask(void)
-{
-	struct fsnotify_group *group;
-	__u32 inode_mask = 0;
-	__u32 vfsmount_mask = 0;
-
-	mutex_lock(&fsnotify_grp_mutex);
-	list_for_each_entry_rcu(group, &fsnotify_inode_groups, inode_group_list)
-		inode_mask |= group->mask;
-	list_for_each_entry_rcu(group, &fsnotify_vfsmount_groups, vfsmount_group_list)
-		vfsmount_mask |= group->mask;
-
-	fsnotify_inode_mask = inode_mask;
-	fsnotify_vfsmount_mask = vfsmount_mask;
-
-	mutex_unlock(&fsnotify_grp_mutex);
-}
 
 /*
  * Update the group->mask by running all of the marks associated with this
- * group and finding the bitwise | of all of the mark->mask.  If we change
- * the group->mask we need to update the global mask of events interesting
- * to the system.
+ * group and finding the bitwise | of all of the mark->mask.
  */
 void fsnotify_recalc_group_mask(struct fsnotify_group *group)
 {
 	__u32 mask = 0;
-	__u32 old_mask = group->mask;
 	struct fsnotify_mark *mark;
 
 	spin_lock(&group->mark_lock);
 	list_for_each_entry(mark, &group->marks_list, g_list)
 		mask |= mark->mask;
-	spin_unlock(&group->mark_lock);
-
 	group->mask = mask;
-
-	if (old_mask != mask)
-		fsnotify_recalc_global_mask();
+	spin_unlock(&group->mark_lock);
 }
 
 void fsnotify_add_vfsmount_group(struct fsnotify_group *group)
@@ -217,8 +184,6 @@ void fsnotify_put_group(struct fsnotify_group *group)
 
 	mutex_unlock(&fsnotify_grp_mutex);
 
-	/* and now it is really dead. _Nothing_ could be seeing it */
-	fsnotify_recalc_global_mask();
 	fsnotify_destroy_group(group);
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 225dc0c3a48c..07d3c8954721 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -365,8 +365,6 @@ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode
 
 /* called from fsnotify listeners, such as fanotify or dnotify */
 
-/* must call when a group changes its ->mask */
-extern void fsnotify_recalc_global_mask(void);
 /* get a reference to an existing or create a new group */
 extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
 /* run all marks associated with this group and update group->mask */
-- 
cgit v1.2.3-59-g8ed1b


From 43709a288ed03aa0e2979ab63dd089b3889645c4 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:39 -0400
Subject: fsnotify: remove group->mask

group->mask is now useless.  It was originally a shortcut for fsnotify to
save on performance.  These checks are now redundant, so we remove them.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c        |  4 ----
 fs/notify/fanotify/fanotify_user.c | 23 +++++------------------
 fs/notify/group.c                  | 16 ----------------
 fs/notify/inotify/inotify_user.c   |  9 ---------
 include/linux/fsnotify_backend.h   | 11 -----------
 kernel/audit_watch.c               |  8 --------
 6 files changed, 5 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index c3dc15879a52..e92b2c87ae94 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -199,8 +199,6 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	if (dn_mark->dn == NULL)
 		fsnotify_destroy_mark(fsn_mark);
 
-	fsnotify_recalc_group_mask(dnotify_group);
-
 	mutex_unlock(&dnotify_mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
@@ -385,8 +383,6 @@ out:
 	if (destroy)
 		fsnotify_destroy_mark(fsn_mark);
 
-	fsnotify_recalc_group_mask(dnotify_group);
-
 	mutex_unlock(&dnotify_mark_mutex);
 	fsnotify_put_mark(fsn_mark);
 out_err:
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 50cea74bf1c8..25a3b4dfcf61 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -496,8 +496,6 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
 	fsnotify_put_mark(fsn_mark);
-	if (removed & group->mask)
-		fsnotify_recalc_group_mask(group);
 	if (removed & mnt->mnt_fsnotify_mask)
 		fsnotify_recalc_vfsmount_mask(mnt);
 
@@ -518,9 +516,6 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
 	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
-
-	if (removed & group->mask)
-		fsnotify_recalc_group_mask(group);
 	if (removed & inode->i_fsnotify_mask)
 		fsnotify_recalc_inode_mask(inode);
 
@@ -572,12 +567,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
 	fsnotify_put_mark(fsn_mark);
-	if (added) {
-		if (added & ~group->mask)
-			fsnotify_recalc_group_mask(group);
-		if (added & ~mnt->mnt_fsnotify_mask)
-			fsnotify_recalc_vfsmount_mask(mnt);
-	}
+	if (added & ~mnt->mnt_fsnotify_mask)
+		fsnotify_recalc_vfsmount_mask(mnt);
+
 	return 0;
 }
 
@@ -607,12 +599,8 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
 	fsnotify_put_mark(fsn_mark);
-	if (added) {
-		if (added & ~group->mask)
-			fsnotify_recalc_group_mask(group);
-		if (added & ~inode->i_fsnotify_mask)
-			fsnotify_recalc_inode_mask(inode);
-	}
+	if (added & ~inode->i_fsnotify_mask)
+		fsnotify_recalc_inode_mask(inode);
 	return 0;
 }
 
@@ -734,7 +722,6 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 			fsnotify_clear_vfsmount_marks_by_group(group);
 		else
 			fsnotify_clear_inode_marks_by_group(group);
-		fsnotify_recalc_group_mask(group);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 8da532dd6026..fc0d966b270f 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -35,22 +35,6 @@ LIST_HEAD(fsnotify_inode_groups);
 /* all groups registered to receive mount point filesystem notifications */
 LIST_HEAD(fsnotify_vfsmount_groups);
 
-/*
- * Update the group->mask by running all of the marks associated with this
- * group and finding the bitwise | of all of the mark->mask.
- */
-void fsnotify_recalc_group_mask(struct fsnotify_group *group)
-{
-	__u32 mask = 0;
-	struct fsnotify_mark *mark;
-
-	spin_lock(&group->mark_lock);
-	list_for_each_entry(mark, &group->marks_list, g_list)
-		mask |= mark->mask;
-	group->mask = mask;
-	spin_unlock(&group->mark_lock);
-}
-
 void fsnotify_add_vfsmount_group(struct fsnotify_group *group)
 {
 	struct fsnotify_group *group_iter;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a4cd227c4c76..bf7f6d776c31 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -606,16 +606,11 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 		int dropped = (old_mask & ~new_mask);
 		/* more bits in this fsn_mark than the inode's mask? */
 		int do_inode = (new_mask & ~inode->i_fsnotify_mask);
-		/* more bits in this fsn_mark than the group? */
-		int do_group = (new_mask & ~group->mask);
 
 		/* update the inode with this new fsn_mark */
 		if (dropped || do_inode)
 			fsnotify_recalc_inode_mask(inode);
 
-		/* update the group mask with the new mask */
-		if (dropped || do_group)
-			fsnotify_recalc_group_mask(group);
 	}
 
 	/* return the wd */
@@ -673,10 +668,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
 	/* return the watch descriptor for this new mark */
 	ret = tmp_i_mark->wd;
 
-	/* if this mark added a new event update the group mask */
-	if (mask & ~group->mask)
-		fsnotify_recalc_group_mask(group);
-
 out_err:
 	/* match the ref from fsnotify_init_mark() */
 	fsnotify_put_mark(&tmp_i_mark->fsn_mark);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 07d3c8954721..c4e7aab87461 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -119,15 +119,6 @@ struct fsnotify_group {
 	 */
 	struct list_head vfsmount_group_list;
 
-	/*
-	 * Defines all of the event types in which this group is interested.
-	 * This mask is a bitwise OR of the FS_* events from above.  Each time
-	 * this mask changes for a group (if it changes) the correct functions
-	 * must be called to update the global structures which indicate global
-	 * interest in event types.
-	 */
-	__u32 mask;
-
 	/*
 	 * How the refcnt is used is up to each group.  When the refcnt hits 0
 	 * fsnotify will clean up all of the resources associated with this group.
@@ -367,8 +358,6 @@ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode
 
 /* get a reference to an existing or create a new group */
 extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
-/* run all marks associated with this group and update group->mask */
-extern void fsnotify_recalc_group_mask(struct fsnotify_group *group);
 /* drop reference on a group from fsnotify_alloc_group */
 extern void fsnotify_put_group(struct fsnotify_group *group);
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 097a61c65fe0..1b87e757845d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -164,8 +164,6 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
 		return ERR_PTR(ret);
 	}
 
-	fsnotify_recalc_group_mask(audit_watch_group);
-
 	return parent;
 }
 
@@ -352,9 +350,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 	mutex_unlock(&audit_filter_mutex);
 
 	fsnotify_destroy_mark(&parent->mark);
-
-	fsnotify_recalc_group_mask(audit_watch_group);
-
 }
 
 /* Get path information necessary for adding watches. */
@@ -505,9 +500,6 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 			audit_put_parent(parent);
 		}
 	}
-
-	fsnotify_recalc_group_mask(audit_watch_group);
-
 }
 
 static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
-- 
cgit v1.2.3-59-g8ed1b


From 02436668d98385f5b5d9ffb695a37dadf98ed8a8 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:39 -0400
Subject: fsnotify: remove global fsnotify groups lists

The global fsnotify groups lists were invented as a way to increase the
performance of fsnotify by shortcutting events which were not interesting.
With the changes to walk the object lists rather than global groups lists
these shortcuts are not useful.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             |   5 --
 fs/notify/fsnotify.h             |   9 ----
 fs/notify/group.c                | 107 +--------------------------------------
 fs/notify/mark.c                 |   9 ----
 include/linux/fsnotify_backend.h |  15 ------
 5 files changed, 2 insertions(+), 143 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9ba29ee747cf..1dd1fde1da08 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -219,11 +219,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
-	/* if no fsnotify listeners, nothing to do */
-	if (list_empty(&fsnotify_inode_groups) &&
-	    list_empty(&fsnotify_vfsmount_groups))
-		return 0;
- 
 	if (mask & FS_MODIFY)
 		__fsnotify_flush_ignored_mask(to_tell, data, data_is);
 
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b41dbf5a125c..85e7d2b431d9 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,11 +6,6 @@
 #include <linux/srcu.h>
 #include <linux/types.h>
 
-/* all groups which receive inode fsnotify events */
-extern struct list_head fsnotify_inode_groups;
-/* all groups which receive vfsmount fsnotify events */
-extern struct list_head fsnotify_vfsmount_groups;
-
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
@@ -28,10 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 				      struct fsnotify_group *group, struct vfsmount *mnt,
 				      int allow_dups);
 
-/* add a group to the inode group list */
-extern void fsnotify_add_inode_group(struct fsnotify_group *group);
-/* add a group to the vfsmount group list */
-extern void fsnotify_add_vfsmount_group(struct fsnotify_group *group);
 /* final kfree of a group */
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
 
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fc0d966b270f..d309f38449cb 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -28,67 +28,6 @@
 
 #include <asm/atomic.h>
 
-/* protects writes to fsnotify_groups and fsnotify_mask */
-static DEFINE_MUTEX(fsnotify_grp_mutex);
-/* all groups registered to receive inode filesystem notifications */
-LIST_HEAD(fsnotify_inode_groups);
-/* all groups registered to receive mount point filesystem notifications */
-LIST_HEAD(fsnotify_vfsmount_groups);
-
-void fsnotify_add_vfsmount_group(struct fsnotify_group *group)
-{
-	struct fsnotify_group *group_iter;
-
-	mutex_lock(&fsnotify_grp_mutex);
-
-	if (!group->on_vfsmount_group_list) {
-		list_for_each_entry(group_iter, &fsnotify_vfsmount_groups,
-				    vfsmount_group_list) {
-			/* insert in front of this one? */
-			if (group < group_iter) {
-				/* list_add_tail() insert in front of group_iter */
-				list_add_tail_rcu(&group->inode_group_list,
-						  &group_iter->inode_group_list);
-				goto out;
-			}
-		}
-
-		/* apparently we need to be the last entry */
-		list_add_tail_rcu(&group->vfsmount_group_list, &fsnotify_vfsmount_groups);
-	}
-out:
-	group->on_vfsmount_group_list = 1;
-
-	mutex_unlock(&fsnotify_grp_mutex);
-}
-
-void fsnotify_add_inode_group(struct fsnotify_group *group)
-{
-	struct fsnotify_group *group_iter;
-
-	mutex_lock(&fsnotify_grp_mutex);
-
-	/* add to global group list */
-	if (!group->on_inode_group_list) {
-		list_for_each_entry(group_iter, &fsnotify_inode_groups,
-				    inode_group_list) {
-			if (group < group_iter) {
-				/* list_add_tail() insert in front of group_iter */
-				list_add_tail_rcu(&group->inode_group_list,
-						  &group_iter->inode_group_list);
-				goto out;
-			}
-		}
-
-		/* apparently we need to be the last entry */
-		list_add_tail_rcu(&group->inode_group_list, &fsnotify_inode_groups);
-	}
-out:
-	group->on_inode_group_list = 1;
-
-	mutex_unlock(&fsnotify_grp_mutex);
-}
-
 /*
  * Final freeing of a group
  */
@@ -123,52 +62,13 @@ static void fsnotify_destroy_group(struct fsnotify_group *group)
 		fsnotify_final_destroy_group(group);
 }
 
-/*
- * Remove this group from the global list of groups that will get events
- * this can be done even if there are still references and things still using
- * this group.  This just stops the group from getting new events.
- */
-static void __fsnotify_evict_group(struct fsnotify_group *group)
-{
-	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
-
-	if (group->on_inode_group_list)
-		list_del_rcu(&group->inode_group_list);
-	group->on_inode_group_list = 0;
-	if (group->on_vfsmount_group_list)
-		list_del_rcu(&group->vfsmount_group_list);
-	group->on_vfsmount_group_list = 0;
-}
-
-/*
- * Called when a group is no longer interested in getting events.  This can be
- * used if a group is misbehaving or if for some reason a group should no longer
- * get any filesystem events.
- */
-void fsnotify_evict_group(struct fsnotify_group *group)
-{
-	mutex_lock(&fsnotify_grp_mutex);
-	__fsnotify_evict_group(group);
-	mutex_unlock(&fsnotify_grp_mutex);
-}
-
 /*
  * Drop a reference to a group.  Free it if it's through.
  */
 void fsnotify_put_group(struct fsnotify_group *group)
 {
-	if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex))
-		return;
-
-	/*
-	 * OK, now we know that there's no other users *and* we hold mutex,
-	 * so no new references will appear
-	 */
-	__fsnotify_evict_group(group);
-
-	mutex_unlock(&fsnotify_grp_mutex);
-
-	fsnotify_destroy_group(group);
+	if (atomic_dec_and_test(&group->refcnt))
+		fsnotify_destroy_group(group);
 }
 
 /*
@@ -195,9 +95,6 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 	init_waitqueue_head(&group->notification_waitq);
 	group->max_events = UINT_MAX;
 
-	INIT_LIST_HEAD(&group->inode_group_list);
-	INIT_LIST_HEAD(&group->vfsmount_group_list);
-
 	spin_lock_init(&group->mark_lock);
 	INIT_LIST_HEAD(&group->marks_list);
 
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 236f29b066ed..325185e514bb 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -222,15 +222,6 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
 	BUG_ON(inode && mnt);
 	BUG_ON(!inode && !mnt);
 
-	/*
-	 * if this group isn't being testing for inode type events we need
-	 * to start testing
-	 */
-	if (inode && unlikely(list_empty(&group->inode_group_list)))
-		fsnotify_add_inode_group(group);
-	else if (mnt && unlikely(list_empty(&group->vfsmount_group_list)))
-		fsnotify_add_vfsmount_group(group);
-
 	/*
 	 * LOCKING ORDER!!!!
 	 * mark->lock
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c4e7aab87461..2e7cc8c2a151 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -108,17 +108,6 @@ struct fsnotify_ops {
  * everything will be cleaned up.
  */
 struct fsnotify_group {
-	/*
-	 * global list of all groups receiving events from fsnotify.
-	 * anchored by fsnotify_inode_groups and protected by either fsnotify_grp_mutex
-	 * or fsnotify_grp_srcu depending on write vs read.
-	 */
-	struct list_head inode_group_list;
-	/*
-	 * same as above except anchored by fsnotify_vfsmount_groups
-	 */
-	struct list_head vfsmount_group_list;
-
 	/*
 	 * How the refcnt is used is up to each group.  When the refcnt hits 0
 	 * fsnotify will clean up all of the resources associated with this group.
@@ -145,10 +134,6 @@ struct fsnotify_group {
 					 * a group */
 	struct list_head marks_list;	/* all inode marks for this group */
 
-	/* prevents double list_del of group_list.  protected by global fsnotify_grp_mutex */
-	bool on_inode_group_list;
-	bool on_vfsmount_group_list;
-
 	/* groups can define private fields here or use the void *private */
 	union {
 		void *private;
-- 
cgit v1.2.3-59-g8ed1b


From ce8f76fb7320297ccbe7c950fd9a2d727dd6a5a0 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:39 -0400
Subject: fsnotify: pass both the vfsmount mark and inode mark

should_send_event() and handle_event() will both need to look up the inode
event if they get a vfsmount event.  Lets just pass both at the same time
since we have them both after walking the lists in lockstep.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          | 18 ++++++----
 fs/notify/fanotify/fanotify.c        | 15 +++++---
 fs/notify/fsnotify.c                 | 70 ++++++++++++++++++++++++------------
 fs/notify/inotify/inotify_fsnotify.c | 12 ++++---
 include/linux/fsnotify_backend.h     |  7 ++--
 kernel/audit_tree.c                  |  6 ++--
 kernel/audit_watch.c                 |  8 +++--
 7 files changed, 91 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index e92b2c87ae94..bda588b831ad 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -83,7 +83,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
  * events.
  */
 static int dnotify_handle_event(struct fsnotify_group *group,
-				struct fsnotify_mark *mark,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
 				struct fsnotify_event *event)
 {
 	struct dnotify_mark *dn_mark;
@@ -93,11 +94,13 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 	struct fown_struct *fown;
 	__u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
 
+	BUG_ON(vfsmount_mark);
+
 	to_tell = event->to_tell;
 
-	dn_mark = container_of(mark, struct dnotify_mark, fsn_mark);
+	dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
 
-	spin_lock(&mark->lock);
+	spin_lock(&inode_mark->lock);
 	prev = &dn_mark->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_mask & test_mask) == 0) {
@@ -111,11 +114,11 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 		else {
 			*prev = dn->dn_next;
 			kmem_cache_free(dnotify_struct_cache, dn);
-			dnotify_recalc_inode_mask(mark);
+			dnotify_recalc_inode_mask(inode_mark);
 		}
 	}
 
-	spin_unlock(&mark->lock);
+	spin_unlock(&inode_mark->lock);
 
 	return 0;
 }
@@ -126,8 +129,9 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
 				      struct inode *inode, struct vfsmount *mnt,
-				      struct fsnotify_mark *mark, __u32 mask,
-				      void *data, int data_type)
+				      struct fsnotify_mark *inode_mark,
+				      struct fsnotify_mark *vfsmount_mark,
+				      __u32 mask, void *data, int data_type)
 {
 	/* not a dir, dnotify doesn't care */
 	if (!S_ISDIR(inode->i_mode))
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index fbd7f35c6134..ef4fa4a45c94 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -115,7 +115,8 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
 #endif
 
 static int fanotify_handle_event(struct fsnotify_group *group,
-				 struct fsnotify_mark *mark,
+				 struct fsnotify_mark *inode_mark,
+				 struct fsnotify_mark *fanotify_mark,
 				 struct fsnotify_event *event)
 {
 	int ret = 0;
@@ -196,8 +197,11 @@ static bool should_send_inode_event(struct fsnotify_group *group,
 		return true;
 }
 
-static bool fanotify_should_send_event(struct fsnotify_group *group, struct inode *to_tell,
-				       struct vfsmount *mnt, struct fsnotify_mark *mark,
+static bool fanotify_should_send_event(struct fsnotify_group *group,
+				       struct inode *to_tell,
+				       struct vfsmount *mnt,
+				       struct fsnotify_mark *inode_mark,
+				       struct fsnotify_mark *vfsmount_mark,
 				       __u32 mask, void *data, int data_type)
 {
 	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x data=%p data_type=%d\n",
@@ -213,9 +217,10 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, struct inod
 		return false;
 
 	if (mnt)
-		return should_send_vfsmount_event(group, mnt, to_tell, mark, mask);
+		return should_send_vfsmount_event(group, mnt, to_tell,
+						  vfsmount_mark, mask);
 	else
-		return should_send_inode_event(group, to_tell, mark, mask);
+		return should_send_inode_event(group, to_tell, inode_mark, mask);
 }
 
 const struct fsnotify_ops fanotify_fsnotify_ops = {
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index cdaa51cb698c..090b64c3b4f9 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -141,28 +141,51 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
 static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
-			 struct fsnotify_mark *mark,
-			__u32 mask, void *data,
+			 struct fsnotify_mark *inode_mark,
+			 struct fsnotify_mark *vfsmount_mark,
+			 __u32 mask, void *data,
 			 int data_is, u32 cookie,
 			 const unsigned char *file_name,
 			 struct fsnotify_event **event)
 {
-	struct fsnotify_group *group = mark->group;
-	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+	struct fsnotify_group *group = inode_mark->group;
+	__u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+	__u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
 	pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
 		 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
-		 mnt, mark, mask, data, data_is, cookie, *event);
+		 mnt, inode_mark, mask, data, data_is, cookie, *event);
+
+	/* clear ignored on inode modification */
+	if (mask & FS_MODIFY) {
+		if (inode_mark &&
+		    !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+			inode_mark->ignored_mask = 0;
+		if (vfsmount_mark &&
+		    !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+			vfsmount_mark->ignored_mask = 0;
+	}
 
-	if ((mask & FS_MODIFY) &&
-	    !(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
-		mark->ignored_mask = 0;
+	/* does the inode mark tell us to do something? */
+	if (inode_mark) {
+		inode_test_mask &= inode_mark->mask;
+		inode_test_mask &= ~inode_mark->ignored_mask;
+	}
 
-	if (!(test_mask & mark->mask & ~mark->ignored_mask))
+	/* does the vfsmount_mark tell us to do something? */
+	if (vfsmount_mark) {
+		vfsmount_test_mask &= vfsmount_mark->mask;
+		vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
+		if (inode_mark)
+			vfsmount_test_mask &= ~inode_mark->ignored_mask;
+	}
+
+	if (!inode_test_mask && !vfsmount_test_mask)
 		return 0;
 
-	if (group->ops->should_send_event(group, to_tell, mnt, mark, mask,
-					  data, data_is) == false)
+	if (group->ops->should_send_event(group, to_tell, mnt, inode_mark,
+					  vfsmount_mark, mask, data,
+					  data_is) == false)
 		return 0;
 
 	if (!*event) {
@@ -172,7 +195,7 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 		if (!*event)
 			return -ENOMEM;
 	}
-	return group->ops->handle_event(group, mark, *event);
+	return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
 }
 
 /*
@@ -213,14 +236,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 
 	if ((mask & FS_MODIFY) ||
 	    (test_mask & to_tell->i_fsnotify_mask))
-		inode_node = to_tell->i_fsnotify_marks.first;
+		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+					      &fsnotify_mark_srcu);
 	else
 		inode_node = NULL;
 
 	if (mnt) {
 		if ((mask & FS_MODIFY) ||
 		    (test_mask & mnt->mnt_fsnotify_mask))
-			vfsmount_node = mnt->mnt_fsnotify_marks.first;
+			vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+							 &fsnotify_mark_srcu);
 		else
 			vfsmount_node = NULL;
 	} else {
@@ -245,26 +270,27 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 
 		if (inode_group < vfsmount_group) {
 			/* handle inode */
-			send_to_group(to_tell, NULL, inode_mark, mask, data,
+			send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
 				      data_is, cookie, file_name, &event);
 			used_inode = true;
 		} else if (vfsmount_group < inode_group) {
-			send_to_group(to_tell, mnt, vfsmount_mark, mask, data,
+			send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
 				      data_is, cookie, file_name, &event);
 			used_vfsmount = true;
 		} else {
-			send_to_group(to_tell, mnt, vfsmount_mark, mask, data,
-				      data_is, cookie, file_name, &event);
+			send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
+				      mask, data, data_is, cookie, file_name,
+				      &event);
 			used_vfsmount = true;
-			send_to_group(to_tell, NULL, inode_mark, mask, data,
-				      data_is, cookie, file_name, &event);
 			used_inode = true;
 		}
 
 		if (used_inode)
-			inode_node = inode_node->next;
+			inode_node = srcu_dereference(inode_node->next,
+						      &fsnotify_mark_srcu);
 		if (used_vfsmount)
-			vfsmount_node = vfsmount_node->next;
+			vfsmount_node = srcu_dereference(vfsmount_node->next,
+							 &fsnotify_mark_srcu);
 	}
 
 	srcu_read_unlock(&fsnotify_mark_srcu, idx);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 7cf518b25daa..e53f49731b6e 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -90,7 +90,8 @@ static struct fsnotify_event *inotify_merge(struct list_head *list,
 }
 
 static int inotify_handle_event(struct fsnotify_group *group,
-				struct fsnotify_mark *mark,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
 				struct fsnotify_event *event)
 {
 	struct inotify_inode_mark *i_mark;
@@ -100,12 +101,14 @@ static int inotify_handle_event(struct fsnotify_group *group,
 	struct fsnotify_event *added_event;
 	int wd, ret = 0;
 
+	BUG_ON(vfsmount_mark);
+
 	pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
 		 event, event->to_tell, event->mask);
 
 	to_tell = event->to_tell;
 
-	i_mark = container_of(mark, struct inotify_inode_mark,
+	i_mark = container_of(inode_mark, struct inotify_inode_mark,
 			      fsn_mark);
 	wd = i_mark->wd;
 
@@ -127,8 +130,8 @@ static int inotify_handle_event(struct fsnotify_group *group,
 			ret = PTR_ERR(added_event);
 	}
 
-	if (mark->mask & IN_ONESHOT)
-		fsnotify_destroy_mark(mark);
+	if (inode_mark->mask & IN_ONESHOT)
+		fsnotify_destroy_mark(inode_mark);
 
 	return ret;
 }
@@ -140,6 +143,7 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
 
 static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
 				      struct vfsmount *mnt, struct fsnotify_mark *mark,
+				      struct fsnotify_mark *vfsmount_mark,
 				      __u32 mask, void *data, int data_type)
 {
 	if ((mark->mask & FS_EXCL_UNLINK) &&
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 2e7cc8c2a151..d38f922977f9 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -92,9 +92,12 @@ struct fsnotify_event_private_data;
  */
 struct fsnotify_ops {
 	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
-				  struct vfsmount *mnt, struct fsnotify_mark *mark,
+				  struct vfsmount *mnt, struct fsnotify_mark *inode_mark,
+				  struct fsnotify_mark *vfsmount_mark,
 				  __u32 mask, void *data, int data_type);
-	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_mark *mark,
+	int (*handle_event)(struct fsnotify_group *group,
+			    struct fsnotify_mark *inode_mark,
+			    struct fsnotify_mark *vfsmount_mark,
 			    struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2abb99f3459d..781ab7f4e35c 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -904,7 +904,8 @@ static void evict_chunk(struct audit_chunk *chunk)
 }
 
 static int audit_tree_handle_event(struct fsnotify_group *group,
-				   struct fsnotify_mark *mark,
+				   struct fsnotify_mark *inode_mark,
+				   struct fsnotify_mark *vfsmonut_mark,
 				   struct fsnotify_event *event)
 {
 	BUG();
@@ -920,7 +921,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 }
 
 static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
-				  struct vfsmount *mnt, struct fsnotify_mark *mark,
+				  struct vfsmount *mnt, struct fsnotify_mark *inode_mark,
+				  struct fsnotify_mark *vfsmount_mark,
 				  __u32 mask, void *data, int data_type)
 {
 	return false;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 1b87e757845d..a273cf340527 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -503,7 +503,8 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 }
 
 static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
-					  struct vfsmount *mnt, struct fsnotify_mark *mark,
+					  struct vfsmount *mnt, struct fsnotify_mark *inode_mark,
+					  struct fsnotify_mark *vfsmount_mark,
 					  __u32 mask, void *data, int data_type)
 {
        return true;
@@ -511,7 +512,8 @@ static bool audit_watch_should_send_event(struct fsnotify_group *group, struct i
 
 /* Update watch data in audit rules based on fsnotify events. */
 static int audit_watch_handle_event(struct fsnotify_group *group,
-				    struct fsnotify_mark *mark,
+				    struct fsnotify_mark *inode_mark,
+				    struct fsnotify_mark *vfsmount_mark,
 				    struct fsnotify_event *event)
 {
 	struct inode *inode;
@@ -519,7 +521,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 	const char *dname = event->file_name;
 	struct audit_parent *parent;
 
-	parent = container_of(mark, struct audit_parent, mark);
+	parent = container_of(inode_mark, struct audit_parent, mark);
 
 	BUG_ON(group != audit_watch_group);
 
-- 
cgit v1.2.3-59-g8ed1b


From 1968f5eed54ce47bde488fd9a450912e4a2d7138 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 28 Jul 2010 10:18:39 -0400
Subject: fanotify: use both marks when possible

fanotify currently, when given a vfsmount_mark will look up (if it exists)
the corresponding inode mark.  This patch drops that lookup and uses the
mark provided.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          |  2 +-
 fs/notify/fanotify/fanotify.c        | 88 ++++++++++++++----------------------
 fs/notify/fsnotify.c                 |  2 +-
 fs/notify/inotify/inotify_fsnotify.c |  4 +-
 include/linux/fsnotify_backend.h     |  2 +-
 kernel/audit_tree.c                  |  2 +-
 kernel/audit_watch.c                 |  2 +-
 7 files changed, 41 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index bda588b831ad..3344bdd5506e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -128,7 +128,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
  * userspace notification for that pair.
  */
 static bool dnotify_should_send_event(struct fsnotify_group *group,
-				      struct inode *inode, struct vfsmount *mnt,
+				      struct inode *inode,
 				      struct fsnotify_mark *inode_mark,
 				      struct fsnotify_mark *vfsmount_mark,
 				      __u32 mask, void *data, int data_type)
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index ef4fa4a45c94..eb8f73c9c131 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -153,59 +153,20 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 	return ret;
 }
 
-static bool should_send_vfsmount_event(struct fsnotify_group *group,
-				       struct vfsmount *mnt,
-				       struct inode *inode,
-				       struct fsnotify_mark *mnt_mark,
-				       __u32 mask)
-{
-	struct fsnotify_mark *inode_mark;
-
-	pr_debug("%s: group=%p vfsmount=%p mark=%p mask=%x\n",
-		 __func__, group, mnt, mnt_mark, mask);
-
-	mask &= mnt_mark->mask;
-	mask &= ~mnt_mark->ignored_mask;
-
-	if (mask) {
-		inode_mark = fsnotify_find_inode_mark(group, inode);
-		if (inode_mark) {
-			mask &= ~inode_mark->ignored_mask;
-			fsnotify_put_mark(inode_mark);
-		}
-	}
-
-	return mask;
-}
-
-static bool should_send_inode_event(struct fsnotify_group *group,
-				    struct inode *inode,
-				    struct fsnotify_mark *mark,
-				    __u32 mask)
-{
-	pr_debug("%s: group=%p inode=%p mark=%p mask=%x\n",
-		 __func__, group, inode, mark, mask);
-
-	/*
-	 * if the event is for a child and this inode doesn't care about
-	 * events on the child, don't send it!
-	 */
-	if ((mask & FS_EVENT_ON_CHILD) &&
-	    !(mark->mask & FS_EVENT_ON_CHILD))
-		return false;
-	else
-		return true;
-}
-
 static bool fanotify_should_send_event(struct fsnotify_group *group,
 				       struct inode *to_tell,
-				       struct vfsmount *mnt,
 				       struct fsnotify_mark *inode_mark,
-				       struct fsnotify_mark *vfsmount_mark,
-				       __u32 mask, void *data, int data_type)
+				       struct fsnotify_mark *vfsmnt_mark,
+				       __u32 event_mask, void *data, int data_type)
 {
-	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x data=%p data_type=%d\n",
-		 __func__, group, to_tell, mnt, mask, data, data_type);
+	__u32 marks_mask, marks_ignored_mask;
+
+	pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
+		 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
+		 inode_mark, vfsmnt_mark, event_mask, data, data_type);
+
+	pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n",
+		 __func__, group, vfsmnt_mark, inode_mark, event_mask);
 
 	/* sorry, fanotify only gives a damn about files and dirs */
 	if (!S_ISREG(to_tell->i_mode) &&
@@ -216,11 +177,30 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
 	if (data_type != FSNOTIFY_EVENT_FILE)
 		return false;
 
-	if (mnt)
-		return should_send_vfsmount_event(group, mnt, to_tell,
-						  vfsmount_mark, mask);
-	else
-		return should_send_inode_event(group, to_tell, inode_mark, mask);
+	if (inode_mark && vfsmnt_mark) {
+		marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
+		marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
+	} else if (inode_mark) {
+		/*
+		 * if the event is for a child and this inode doesn't care about
+		 * events on the child, don't send it!
+		 */
+		if ((event_mask & FS_EVENT_ON_CHILD) &&
+		    !(inode_mark->mask & FS_EVENT_ON_CHILD))
+			return false;
+		marks_mask = inode_mark->mask;
+		marks_ignored_mask = inode_mark->ignored_mask;
+	} else if (vfsmnt_mark) {
+		marks_mask = vfsmnt_mark->mask;
+		marks_ignored_mask = vfsmnt_mark->ignored_mask;
+	} else {
+		BUG();
+	}
+
+	if (event_mask & marks_mask & ~marks_ignored_mask)
+		return true;
+
+	return false;
 }
 
 const struct fsnotify_ops fanotify_fsnotify_ops = {
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 090b64c3b4f9..4d2a82c1ceb1 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -183,7 +183,7 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 	if (!inode_test_mask && !vfsmount_test_mask)
 		return 0;
 
-	if (group->ops->should_send_event(group, to_tell, mnt, inode_mark,
+	if (group->ops->should_send_event(group, to_tell, inode_mark,
 					  vfsmount_mark, mask, data,
 					  data_is) == false)
 		return 0;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e53f49731b6e..5e73eeb2c697 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -142,11 +142,11 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
 }
 
 static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
-				      struct vfsmount *mnt, struct fsnotify_mark *mark,
+				      struct fsnotify_mark *inode_mark,
 				      struct fsnotify_mark *vfsmount_mark,
 				      __u32 mask, void *data, int data_type)
 {
-	if ((mark->mask & FS_EXCL_UNLINK) &&
+	if ((inode_mark->mask & FS_EXCL_UNLINK) &&
 	    (data_type == FSNOTIFY_EVENT_FILE)) {
 		struct file *file  = data;
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index d38f922977f9..9bbfd7204b04 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -92,7 +92,7 @@ struct fsnotify_event_private_data;
  */
 struct fsnotify_ops {
 	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
-				  struct vfsmount *mnt, struct fsnotify_mark *inode_mark,
+				  struct fsnotify_mark *inode_mark,
 				  struct fsnotify_mark *vfsmount_mark,
 				  __u32 mask, void *data, int data_type);
 	int (*handle_event)(struct fsnotify_group *group,
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 781ab7f4e35c..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -921,7 +921,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 }
 
 static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
-				  struct vfsmount *mnt, struct fsnotify_mark *inode_mark,
+				  struct fsnotify_mark *inode_mark,
 				  struct fsnotify_mark *vfsmount_mark,
 				  __u32 mask, void *data, int data_type)
 {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index a273cf340527..6bf2306be7d6 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -503,7 +503,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 }
 
 static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
-					  struct vfsmount *mnt, struct fsnotify_mark *inode_mark,
+					  struct fsnotify_mark *inode_mark,
 					  struct fsnotify_mark *vfsmount_mark,
 					  __u32 mask, void *data, int data_type)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 685fd0b4ea3f0f1d5385610b0d5b57775a8d5842 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 29 Jul 2010 11:16:32 +0100
Subject: irq: Add new IRQ flag IRQF_NO_SUSPEND

A small number of users of IRQF_TIMER are using it for the implied no
suspend behaviour on interrupts which are not timer interrupts.

Therefore add a new IRQF_NO_SUSPEND flag, rename IRQF_TIMER to
__IRQF_TIMER and redefine IRQF_TIMER in terms of these new flags.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: xen-devel@lists.xensource.com
Cc: linux-input@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1280398595-29708-1-git-send-email-ian.campbell@citrix.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h | 7 ++++++-
 kernel/irq/manage.c       | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c2331138ca1b..a0384a4d1e6f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -53,16 +53,21 @@
  * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
  *                Used by threaded interrupts which need to keep the
  *                irq line disabled until the threaded handler has been run.
+ * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
+ *
  */
 #define IRQF_DISABLED		0x00000020
 #define IRQF_SAMPLE_RANDOM	0x00000040
 #define IRQF_SHARED		0x00000080
 #define IRQF_PROBE_SHARED	0x00000100
-#define IRQF_TIMER		0x00000200
+#define __IRQF_TIMER		0x00000200
 #define IRQF_PERCPU		0x00000400
 #define IRQF_NOBALANCING	0x00000800
 #define IRQF_IRQPOLL		0x00001000
 #define IRQF_ONESHOT		0x00002000
+#define IRQF_NO_SUSPEND		0x00004000
+
+#define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
 
 /*
  * Bits used by threaded handlers:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e1497481fe8a..c3003e9d91a3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
 void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
 {
 	if (suspend) {
-		if (!desc->action || (desc->action->flags & IRQF_TIMER))
+		if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
 			return;
 		desc->status |= IRQ_SUSPENDED;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From bb8f563c848faa113059973f68c24a3bb6a9585e Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin.vincent@stericsson.com>
Date: Wed, 21 Jul 2010 12:53:57 +0100
Subject: ARM: 6243/1: mmci: pass power_mode to the translate_vdd callback

Platforms may have some external power control which need to be
controlled from board specific code.  Rename the translate_vdd()
callback to vdd_handler() and pass it the power mode.

Acked-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Rabin Vincent <rabin.vincent@stericsson.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c   | 13 +++----------
 include/linux/amba/mmci.h | 10 ++++++----
 2 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index 3eaa0e9373cd..7ae3eeeefc29 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -493,16 +493,9 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 			/* This implicitly enables the regulator */
 			mmc_regulator_set_ocr(host->vcc, ios->vdd);
 #endif
-		/*
-		 * The translate_vdd function is not used if you have
-		 * an external regulator, or your design is really weird.
-		 * Using it would mean sending in power control BOTH using
-		 * a regulator AND the 4 MMCIPWR bits. If we don't have
-		 * a regulator, we might have some other platform specific
-		 * power control behind this translate function.
-		 */
-		if (!host->vcc && host->plat->translate_vdd)
-			pwr |= host->plat->translate_vdd(mmc_dev(mmc), ios->vdd);
+		if (host->plat->vdd_handler)
+			pwr |= host->plat->vdd_handler(mmc_dev(mmc), ios->vdd,
+						       ios->power_mode);
 		/* The ST version does not have this, fall through to POWER_ON */
 		if (host->hw_designer != AMBA_VENDOR_ST) {
 			pwr |= MCI_PWR_UP;
diff --git a/include/linux/amba/mmci.h b/include/linux/amba/mmci.h
index 7e466fe72025..ca84ce70d5d5 100644
--- a/include/linux/amba/mmci.h
+++ b/include/linux/amba/mmci.h
@@ -15,9 +15,10 @@
  * @ocr_mask: available voltages on the 4 pins from the block, this
  * is ignored if a regulator is used, see the MMC_VDD_* masks in
  * mmc/host.h
- * @translate_vdd: a callback function to translate a MMC_VDD_*
- * mask into a value to be binary or:ed and written into the
- * MMCIPWR register of the block
+ * @vdd_handler: a callback function to translate a MMC_VDD_*
+ * mask into a value to be binary (or set some other custom bits
+ * in MMCIPWR) or:ed and written into the MMCIPWR register of the
+ * block.  May also control external power based on the power_mode.
  * @status: if no GPIO read function was given to the block in
  * gpio_wp (below) this function will be called to determine
  * whether a card is present in the MMC slot or not
@@ -29,7 +30,8 @@
 struct mmci_platform_data {
 	unsigned int f_max;
 	unsigned int ocr_mask;
-	u32 (*translate_vdd)(struct device *, unsigned int);
+	u32 (*vdd_handler)(struct device *, unsigned int vdd,
+			   unsigned char power_mode);
 	unsigned int (*status)(struct device *);
 	int	gpio_wp;
 	int	gpio_cd;
-- 
cgit v1.2.3-59-g8ed1b


From 872e330e38806d835bd6c311c93ab998e2fb9058 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 29 Jul 2010 18:19:22 +0200
Subject: firewire: add isochronous multichannel reception

This adds the DMA context programming and userspace ABI for multichannel
reception, i.e. for listening on multiple channel numbers by means of a
single DMA context.

The use case is reception of more streams than there are IR DMA units
offered by the link layer.  This is already implemented by the older
ohci1394 + ieee1394 + raw1394 stack.  And as discussed recently on
linux1394-devel, this feature is occasionally used in practice.

The big drawbacks of this mode are that buffer layout and interrupt
generation necessarily differ from single-channel reception:  Headers
and trailers are not stripped from packets, packets are not aligned with
buffer chunks, interrupts are per buffer chunk, not per packet.

These drawbacks also cause a rather hefty code footprint to support this
rarely used OHCI-1394 feature.  (367 lines added, among them 94 lines of
added userspace ABI documentation.)

This implementation enforces that a multichannel reception context may
only listen to channels to which no single-channel context on the same
link layer is presently listening to.  OHCI-1394 would allow to overlay
single-channel contexts by the multi-channel context, but this would be
a departure from the present first-come-first-served policy of IR
context creation.

The implementation is heavily based on an earlier one by Jay Fenlason.
Thanks Jay.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-cdev.c  |  93 ++++++++++---
 drivers/firewire/core-iso.c   |  32 ++++-
 drivers/firewire/core.h       |   2 +
 drivers/firewire/ohci.c       | 316 +++++++++++++++++++++++++++++++++---------
 include/linux/firewire-cdev.h | 281 +++++++++++++++++++++++++------------
 include/linux/firewire.h      |  29 ++--
 6 files changed, 560 insertions(+), 193 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index cf989e1635e1..ba23646bb108 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -193,6 +193,11 @@ struct iso_interrupt_event {
 	struct fw_cdev_event_iso_interrupt interrupt;
 };
 
+struct iso_interrupt_mc_event {
+	struct event event;
+	struct fw_cdev_event_iso_interrupt_mc interrupt;
+};
+
 struct iso_resource_event {
 	struct event event;
 	struct fw_cdev_event_iso_resource iso_resource;
@@ -415,6 +420,7 @@ union ioctl_arg {
 	struct fw_cdev_get_cycle_timer2		get_cycle_timer2;
 	struct fw_cdev_send_phy_packet		send_phy_packet;
 	struct fw_cdev_receive_phy_packets	receive_phy_packets;
+	struct fw_cdev_set_iso_channels		set_iso_channels;
 };
 
 static int ioctl_get_info(struct client *client, union ioctl_arg *arg)
@@ -932,26 +938,54 @@ static void iso_callback(struct fw_iso_context *context, u32 cycle,
 		    sizeof(e->interrupt) + header_length, NULL, 0);
 }
 
+static void iso_mc_callback(struct fw_iso_context *context,
+			    dma_addr_t completed, void *data)
+{
+	struct client *client = data;
+	struct iso_interrupt_mc_event *e;
+
+	e = kmalloc(sizeof(*e), GFP_ATOMIC);
+	if (e == NULL) {
+		fw_notify("Out of memory when allocating event\n");
+		return;
+	}
+	e->interrupt.type      = FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL;
+	e->interrupt.closure   = client->iso_closure;
+	e->interrupt.completed = fw_iso_buffer_lookup(&client->buffer,
+						      completed);
+	queue_event(client, &e->event, &e->interrupt,
+		    sizeof(e->interrupt), NULL, 0);
+}
+
 static int ioctl_create_iso_context(struct client *client, union ioctl_arg *arg)
 {
 	struct fw_cdev_create_iso_context *a = &arg->create_iso_context;
 	struct fw_iso_context *context;
+	fw_iso_callback_t cb;
 
 	BUILD_BUG_ON(FW_CDEV_ISO_CONTEXT_TRANSMIT != FW_ISO_CONTEXT_TRANSMIT ||
-		     FW_CDEV_ISO_CONTEXT_RECEIVE  != FW_ISO_CONTEXT_RECEIVE);
-
-	if (a->channel > 63)
-		return -EINVAL;
+		     FW_CDEV_ISO_CONTEXT_RECEIVE  != FW_ISO_CONTEXT_RECEIVE  ||
+		     FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL !=
+					FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL);
 
 	switch (a->type) {
-	case FW_ISO_CONTEXT_RECEIVE:
-		if (a->header_size < 4 || (a->header_size & 3))
+	case FW_ISO_CONTEXT_TRANSMIT:
+		if (a->speed > SCODE_3200 || a->channel > 63)
 			return -EINVAL;
+
+		cb = iso_callback;
 		break;
 
-	case FW_ISO_CONTEXT_TRANSMIT:
-		if (a->speed > SCODE_3200)
+	case FW_ISO_CONTEXT_RECEIVE:
+		if (a->header_size < 4 || (a->header_size & 3) ||
+		    a->channel > 63)
 			return -EINVAL;
+
+		cb = iso_callback;
+		break;
+
+	case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+		cb = (fw_iso_callback_t)iso_mc_callback;
 		break;
 
 	default:
@@ -959,8 +993,7 @@ static int ioctl_create_iso_context(struct client *client, union ioctl_arg *arg)
 	}
 
 	context = fw_iso_context_create(client->device->card, a->type,
-					a->channel, a->speed, a->header_size,
-					iso_callback, client);
+			a->channel, a->speed, a->header_size, cb, client);
 	if (IS_ERR(context))
 		return PTR_ERR(context);
 
@@ -980,6 +1013,17 @@ static int ioctl_create_iso_context(struct client *client, union ioctl_arg *arg)
 	return 0;
 }
 
+static int ioctl_set_iso_channels(struct client *client, union ioctl_arg *arg)
+{
+	struct fw_cdev_set_iso_channels *a = &arg->set_iso_channels;
+	struct fw_iso_context *ctx = client->iso_context;
+
+	if (ctx == NULL || a->handle != 0)
+		return -EINVAL;
+
+	return fw_iso_context_set_channels(ctx, &a->channels);
+}
+
 /* Macros for decoding the iso packet control header. */
 #define GET_PAYLOAD_LENGTH(v)	((v) & 0xffff)
 #define GET_INTERRUPT(v)	(((v) >> 16) & 0x01)
@@ -993,7 +1037,7 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg)
 	struct fw_cdev_queue_iso *a = &arg->queue_iso;
 	struct fw_cdev_iso_packet __user *p, *end, *next;
 	struct fw_iso_context *ctx = client->iso_context;
-	unsigned long payload, buffer_end, transmit_header_bytes;
+	unsigned long payload, buffer_end, transmit_header_bytes = 0;
 	u32 control;
 	int count;
 	struct {
@@ -1013,7 +1057,6 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg)
 	 * use the indirect payload, the iso buffer need not be mapped
 	 * and the a->data pointer is ignored.
 	 */
-
 	payload = (unsigned long)a->data - client->vm_start;
 	buffer_end = client->buffer.page_count << PAGE_SHIFT;
 	if (a->data == 0 || client->buffer.pages == NULL ||
@@ -1022,8 +1065,10 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg)
 		buffer_end = 0;
 	}
 
-	p = (struct fw_cdev_iso_packet __user *)u64_to_uptr(a->packets);
+	if (ctx->type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL && payload & 3)
+		return -EINVAL;
 
+	p = (struct fw_cdev_iso_packet __user *)u64_to_uptr(a->packets);
 	if (!access_ok(VERIFY_READ, p, a->size))
 		return -EFAULT;
 
@@ -1039,19 +1084,24 @@ static int ioctl_queue_iso(struct client *client, union ioctl_arg *arg)
 		u.packet.sy = GET_SY(control);
 		u.packet.header_length = GET_HEADER_LENGTH(control);
 
-		if (ctx->type == FW_ISO_CONTEXT_TRANSMIT) {
-			if (u.packet.header_length % 4 != 0)
+		switch (ctx->type) {
+		case FW_ISO_CONTEXT_TRANSMIT:
+			if (u.packet.header_length & 3)
 				return -EINVAL;
 			transmit_header_bytes = u.packet.header_length;
-		} else {
-			/*
-			 * We require that header_length is a multiple of
-			 * the fixed header size, ctx->header_size.
-			 */
+			break;
+
+		case FW_ISO_CONTEXT_RECEIVE:
 			if (u.packet.header_length == 0 ||
 			    u.packet.header_length % ctx->header_size != 0)
 				return -EINVAL;
-			transmit_header_bytes = 0;
+			break;
+
+		case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+			if (u.packet.payload_length == 0 ||
+			    u.packet.payload_length & 3)
+				return -EINVAL;
+			break;
 		}
 
 		next = (struct fw_cdev_iso_packet __user *)
@@ -1534,6 +1584,7 @@ static int (* const ioctl_handlers[])(struct client *, union ioctl_arg *) = {
 	[0x14] = ioctl_get_cycle_timer2,
 	[0x15] = ioctl_send_phy_packet,
 	[0x16] = ioctl_receive_phy_packets,
+	[0x17] = ioctl_set_iso_channels,
 };
 
 static int dispatch_ioctl(struct client *client,
diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c
index 4fe932e60fb8..0c8e662a5daf 100644
--- a/drivers/firewire/core-iso.c
+++ b/drivers/firewire/core-iso.c
@@ -117,6 +117,23 @@ void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer,
 }
 EXPORT_SYMBOL(fw_iso_buffer_destroy);
 
+/* Convert DMA address to offset into virtually contiguous buffer. */
+size_t fw_iso_buffer_lookup(struct fw_iso_buffer *buffer, dma_addr_t completed)
+{
+	int i;
+	dma_addr_t address;
+	ssize_t offset;
+
+	for (i = 0; i < buffer->page_count; i++) {
+		address = page_private(buffer->pages[i]);
+		offset = (ssize_t)completed - (ssize_t)address;
+		if (offset > 0 && offset <= PAGE_SIZE)
+			return (i << PAGE_SHIFT) + offset;
+	}
+
+	return 0;
+}
+
 struct fw_iso_context *fw_iso_context_create(struct fw_card *card,
 		int type, int channel, int speed, size_t header_size,
 		fw_iso_callback_t callback, void *callback_data)
@@ -133,7 +150,7 @@ struct fw_iso_context *fw_iso_context_create(struct fw_card *card,
 	ctx->channel = channel;
 	ctx->speed = speed;
 	ctx->header_size = header_size;
-	ctx->callback = callback;
+	ctx->callback.sc = callback;
 	ctx->callback_data = callback_data;
 
 	return ctx;
@@ -142,9 +159,7 @@ EXPORT_SYMBOL(fw_iso_context_create);
 
 void fw_iso_context_destroy(struct fw_iso_context *ctx)
 {
-	struct fw_card *card = ctx->card;
-
-	card->driver->free_iso_context(ctx);
+	ctx->card->driver->free_iso_context(ctx);
 }
 EXPORT_SYMBOL(fw_iso_context_destroy);
 
@@ -155,14 +170,17 @@ int fw_iso_context_start(struct fw_iso_context *ctx,
 }
 EXPORT_SYMBOL(fw_iso_context_start);
 
+int fw_iso_context_set_channels(struct fw_iso_context *ctx, u64 *channels)
+{
+	return ctx->card->driver->set_iso_channels(ctx, channels);
+}
+
 int fw_iso_context_queue(struct fw_iso_context *ctx,
 			 struct fw_iso_packet *packet,
 			 struct fw_iso_buffer *buffer,
 			 unsigned long payload)
 {
-	struct fw_card *card = ctx->card;
-
-	return card->driver->queue_iso(ctx, packet, buffer, payload);
+	return ctx->card->driver->queue_iso(ctx, packet, buffer, payload);
 }
 EXPORT_SYMBOL(fw_iso_context_queue);
 
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 28621e44b111..e6239f971be6 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -90,6 +90,8 @@ struct fw_card_driver {
 	int (*start_iso)(struct fw_iso_context *ctx,
 			 s32 cycle, u32 sync, u32 tags);
 
+	int (*set_iso_channels)(struct fw_iso_context *ctx, u64 *channels);
+
 	int (*queue_iso)(struct fw_iso_context *ctx,
 			 struct fw_iso_packet *packet,
 			 struct fw_iso_buffer *buffer,
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 2e4b425847a7..4bda1c1b74ba 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -190,11 +190,13 @@ struct fw_ohci {
 	struct context at_request_ctx;
 	struct context at_response_ctx;
 
-	u32 it_context_mask;
+	u32 it_context_mask;     /* unoccupied IT contexts */
 	struct iso_context *it_context_list;
-	u64 ir_context_channels;
-	u32 ir_context_mask;
+	u64 ir_context_channels; /* unoccupied channels */
+	u32 ir_context_mask;     /* unoccupied IR contexts */
 	struct iso_context *ir_context_list;
+	u64 mc_channels; /* channels in use by the multichannel IR context */
+	bool mc_allocated;
 
 	__be32    *config_rom;
 	dma_addr_t config_rom_bus;
@@ -2197,10 +2199,9 @@ static int handle_ir_packet_per_buffer(struct context *context,
 	__le32 *ir_header;
 	void *p;
 
-	for (pd = d; pd <= last; pd++) {
+	for (pd = d; pd <= last; pd++)
 		if (pd->transfer_status)
 			break;
-	}
 	if (pd > last)
 		/* Descriptor(s) not done yet, stop iteration */
 		return 0;
@@ -2210,16 +2211,38 @@ static int handle_ir_packet_per_buffer(struct context *context,
 
 	if (le16_to_cpu(last->control) & DESCRIPTOR_IRQ_ALWAYS) {
 		ir_header = (__le32 *) p;
-		ctx->base.callback(&ctx->base,
-				   le32_to_cpu(ir_header[0]) & 0xffff,
-				   ctx->header_length, ctx->header,
-				   ctx->base.callback_data);
+		ctx->base.callback.sc(&ctx->base,
+				      le32_to_cpu(ir_header[0]) & 0xffff,
+				      ctx->header_length, ctx->header,
+				      ctx->base.callback_data);
 		ctx->header_length = 0;
 	}
 
 	return 1;
 }
 
+/* d == last because each descriptor block is only a single descriptor. */
+static int handle_ir_buffer_fill(struct context *context,
+				 struct descriptor *d,
+				 struct descriptor *last)
+{
+	struct iso_context *ctx =
+		container_of(context, struct iso_context, context);
+
+	if (!last->transfer_status)
+		/* Descriptor(s) not done yet, stop iteration */
+		return 0;
+
+	if (le16_to_cpu(last->control) & DESCRIPTOR_IRQ_ALWAYS)
+		ctx->base.callback.mc(&ctx->base,
+				      le32_to_cpu(last->data_address) +
+				      le16_to_cpu(last->req_count) -
+				      le16_to_cpu(last->res_count),
+				      ctx->base.callback_data);
+
+	return 1;
+}
+
 static int handle_it_packet(struct context *context,
 			    struct descriptor *d,
 			    struct descriptor *last)
@@ -2245,72 +2268,118 @@ static int handle_it_packet(struct context *context,
 		ctx->header_length += 4;
 	}
 	if (le16_to_cpu(last->control) & DESCRIPTOR_IRQ_ALWAYS) {
-		ctx->base.callback(&ctx->base, le16_to_cpu(last->res_count),
-				   ctx->header_length, ctx->header,
-				   ctx->base.callback_data);
+		ctx->base.callback.sc(&ctx->base, le16_to_cpu(last->res_count),
+				      ctx->header_length, ctx->header,
+				      ctx->base.callback_data);
 		ctx->header_length = 0;
 	}
 	return 1;
 }
 
+static void set_multichannel_mask(struct fw_ohci *ohci, u64 channels)
+{
+	u32 hi = channels >> 32, lo = channels;
+
+	reg_write(ohci, OHCI1394_IRMultiChanMaskHiClear, ~hi);
+	reg_write(ohci, OHCI1394_IRMultiChanMaskLoClear, ~lo);
+	reg_write(ohci, OHCI1394_IRMultiChanMaskHiSet, hi);
+	reg_write(ohci, OHCI1394_IRMultiChanMaskLoSet, lo);
+	mmiowb();
+	ohci->mc_channels = channels;
+}
+
 static struct fw_iso_context *ohci_allocate_iso_context(struct fw_card *card,
 				int type, int channel, size_t header_size)
 {
 	struct fw_ohci *ohci = fw_ohci(card);
-	struct iso_context *ctx, *list;
-	descriptor_callback_t callback;
-	u64 *channels, dont_care = ~0ULL;
-	u32 *mask, regs;
+	struct iso_context *uninitialized_var(ctx);
+	descriptor_callback_t uninitialized_var(callback);
+	u64 *uninitialized_var(channels);
+	u32 *uninitialized_var(mask), uninitialized_var(regs);
 	unsigned long flags;
-	int index, ret = -ENOMEM;
+	int index, ret = -EBUSY;
 
-	if (type == FW_ISO_CONTEXT_TRANSMIT) {
-		channels = &dont_care;
-		mask = &ohci->it_context_mask;
-		list = ohci->it_context_list;
+	spin_lock_irqsave(&ohci->lock, flags);
+
+	switch (type) {
+	case FW_ISO_CONTEXT_TRANSMIT:
+		mask     = &ohci->it_context_mask;
 		callback = handle_it_packet;
-	} else {
+		index    = ffs(*mask) - 1;
+		if (index >= 0) {
+			*mask &= ~(1 << index);
+			regs = OHCI1394_IsoXmitContextBase(index);
+			ctx  = &ohci->it_context_list[index];
+		}
+		break;
+
+	case FW_ISO_CONTEXT_RECEIVE:
 		channels = &ohci->ir_context_channels;
-		mask = &ohci->ir_context_mask;
-		list = ohci->ir_context_list;
+		mask     = &ohci->ir_context_mask;
 		callback = handle_ir_packet_per_buffer;
-	}
+		index    = *channels & 1ULL << channel ? ffs(*mask) - 1 : -1;
+		if (index >= 0) {
+			*channels &= ~(1ULL << channel);
+			*mask     &= ~(1 << index);
+			regs = OHCI1394_IsoRcvContextBase(index);
+			ctx  = &ohci->ir_context_list[index];
+		}
+		break;
 
-	spin_lock_irqsave(&ohci->lock, flags);
-	index = *channels & 1ULL << channel ? ffs(*mask) - 1 : -1;
-	if (index >= 0) {
-		*channels &= ~(1ULL << channel);
-		*mask &= ~(1 << index);
+	case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+		mask     = &ohci->ir_context_mask;
+		callback = handle_ir_buffer_fill;
+		index    = !ohci->mc_allocated ? ffs(*mask) - 1 : -1;
+		if (index >= 0) {
+			ohci->mc_allocated = true;
+			*mask &= ~(1 << index);
+			regs = OHCI1394_IsoRcvContextBase(index);
+			ctx  = &ohci->ir_context_list[index];
+		}
+		break;
+
+	default:
+		index = -1;
+		ret = -ENOSYS;
 	}
+
 	spin_unlock_irqrestore(&ohci->lock, flags);
 
 	if (index < 0)
-		return ERR_PTR(-EBUSY);
-
-	if (type == FW_ISO_CONTEXT_TRANSMIT)
-		regs = OHCI1394_IsoXmitContextBase(index);
-	else
-		regs = OHCI1394_IsoRcvContextBase(index);
+		return ERR_PTR(ret);
 
-	ctx = &list[index];
 	memset(ctx, 0, sizeof(*ctx));
 	ctx->header_length = 0;
 	ctx->header = (void *) __get_free_page(GFP_KERNEL);
-	if (ctx->header == NULL)
+	if (ctx->header == NULL) {
+		ret = -ENOMEM;
 		goto out;
-
+	}
 	ret = context_init(&ctx->context, ohci, regs, callback);
 	if (ret < 0)
 		goto out_with_header;
 
+	if (type == FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL)
+		set_multichannel_mask(ohci, 0);
+
 	return &ctx->base;
 
  out_with_header:
 	free_page((unsigned long)ctx->header);
  out:
 	spin_lock_irqsave(&ohci->lock, flags);
-	*channels |= 1ULL << channel;
+
+	switch (type) {
+	case FW_ISO_CONTEXT_RECEIVE:
+		*channels |= 1ULL << channel;
+		break;
+
+	case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+		ohci->mc_allocated = false;
+		break;
+	}
 	*mask |= 1 << index;
+
 	spin_unlock_irqrestore(&ohci->lock, flags);
 
 	return ERR_PTR(ret);
@@ -2321,10 +2390,11 @@ static int ohci_start_iso(struct fw_iso_context *base,
 {
 	struct iso_context *ctx = container_of(base, struct iso_context, base);
 	struct fw_ohci *ohci = ctx->context.ohci;
-	u32 control, match;
+	u32 control = IR_CONTEXT_ISOCH_HEADER, match;
 	int index;
 
-	if (ctx->base.type == FW_ISO_CONTEXT_TRANSMIT) {
+	switch (ctx->base.type) {
+	case FW_ISO_CONTEXT_TRANSMIT:
 		index = ctx - ohci->it_context_list;
 		match = 0;
 		if (cycle >= 0)
@@ -2334,9 +2404,13 @@ static int ohci_start_iso(struct fw_iso_context *base,
 		reg_write(ohci, OHCI1394_IsoXmitIntEventClear, 1 << index);
 		reg_write(ohci, OHCI1394_IsoXmitIntMaskSet, 1 << index);
 		context_run(&ctx->context, match);
-	} else {
+		break;
+
+	case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+		control |= IR_CONTEXT_BUFFER_FILL|IR_CONTEXT_MULTI_CHANNEL_MODE;
+		/* fall through */
+	case FW_ISO_CONTEXT_RECEIVE:
 		index = ctx - ohci->ir_context_list;
-		control = IR_CONTEXT_ISOCH_HEADER;
 		match = (tags << 28) | (sync << 8) | ctx->base.channel;
 		if (cycle >= 0) {
 			match |= (cycle & 0x07fff) << 12;
@@ -2347,6 +2421,7 @@ static int ohci_start_iso(struct fw_iso_context *base,
 		reg_write(ohci, OHCI1394_IsoRecvIntMaskSet, 1 << index);
 		reg_write(ohci, CONTEXT_MATCH(ctx->context.regs), match);
 		context_run(&ctx->context, control);
+		break;
 	}
 
 	return 0;
@@ -2358,12 +2433,17 @@ static int ohci_stop_iso(struct fw_iso_context *base)
 	struct iso_context *ctx = container_of(base, struct iso_context, base);
 	int index;
 
-	if (ctx->base.type == FW_ISO_CONTEXT_TRANSMIT) {
+	switch (ctx->base.type) {
+	case FW_ISO_CONTEXT_TRANSMIT:
 		index = ctx - ohci->it_context_list;
 		reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, 1 << index);
-	} else {
+		break;
+
+	case FW_ISO_CONTEXT_RECEIVE:
+	case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
 		index = ctx - ohci->ir_context_list;
 		reg_write(ohci, OHCI1394_IsoRecvIntMaskClear, 1 << index);
+		break;
 	}
 	flush_writes(ohci);
 	context_stop(&ctx->context);
@@ -2384,24 +2464,65 @@ static void ohci_free_iso_context(struct fw_iso_context *base)
 
 	spin_lock_irqsave(&ohci->lock, flags);
 
-	if (ctx->base.type == FW_ISO_CONTEXT_TRANSMIT) {
+	switch (base->type) {
+	case FW_ISO_CONTEXT_TRANSMIT:
 		index = ctx - ohci->it_context_list;
 		ohci->it_context_mask |= 1 << index;
-	} else {
+		break;
+
+	case FW_ISO_CONTEXT_RECEIVE:
 		index = ctx - ohci->ir_context_list;
 		ohci->ir_context_mask |= 1 << index;
 		ohci->ir_context_channels |= 1ULL << base->channel;
+		break;
+
+	case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+		index = ctx - ohci->ir_context_list;
+		ohci->ir_context_mask |= 1 << index;
+		ohci->ir_context_channels |= ohci->mc_channels;
+		ohci->mc_channels = 0;
+		ohci->mc_allocated = false;
+		break;
 	}
 
 	spin_unlock_irqrestore(&ohci->lock, flags);
 }
 
-static int ohci_queue_iso_transmit(struct fw_iso_context *base,
-				   struct fw_iso_packet *packet,
-				   struct fw_iso_buffer *buffer,
-				   unsigned long payload)
+static int ohci_set_iso_channels(struct fw_iso_context *base, u64 *channels)
+{
+	struct fw_ohci *ohci = fw_ohci(base->card);
+	unsigned long flags;
+	int ret;
+
+	switch (base->type) {
+	case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+
+		spin_lock_irqsave(&ohci->lock, flags);
+
+		/* Don't allow multichannel to grab other contexts' channels. */
+		if (~ohci->ir_context_channels & ~ohci->mc_channels & *channels) {
+			*channels = ohci->ir_context_channels;
+			ret = -EBUSY;
+		} else {
+			set_multichannel_mask(ohci, *channels);
+			ret = 0;
+		}
+
+		spin_unlock_irqrestore(&ohci->lock, flags);
+
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int queue_iso_transmit(struct iso_context *ctx,
+			      struct fw_iso_packet *packet,
+			      struct fw_iso_buffer *buffer,
+			      unsigned long payload)
 {
-	struct iso_context *ctx = container_of(base, struct iso_context, base);
 	struct descriptor *d, *last, *pd;
 	struct fw_iso_packet *p;
 	__le32 *header;
@@ -2497,14 +2618,12 @@ static int ohci_queue_iso_transmit(struct fw_iso_context *base,
 	return 0;
 }
 
-static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base,
-					struct fw_iso_packet *packet,
-					struct fw_iso_buffer *buffer,
-					unsigned long payload)
+static int queue_iso_packet_per_buffer(struct iso_context *ctx,
+				       struct fw_iso_packet *packet,
+				       struct fw_iso_buffer *buffer,
+				       unsigned long payload)
 {
-	struct iso_context *ctx = container_of(base, struct iso_context, base);
 	struct descriptor *d, *pd;
-	struct fw_iso_packet *p = packet;
 	dma_addr_t d_bus, page_bus;
 	u32 z, header_z, rest;
 	int i, j, length;
@@ -2514,14 +2633,14 @@ static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base,
 	 * The OHCI controller puts the isochronous header and trailer in the
 	 * buffer, so we need at least 8 bytes.
 	 */
-	packet_count = p->header_length / ctx->base.header_size;
+	packet_count = packet->header_length / ctx->base.header_size;
 	header_size  = max(ctx->base.header_size, (size_t)8);
 
 	/* Get header size in number of descriptors. */
 	header_z = DIV_ROUND_UP(header_size, sizeof(*d));
 	page     = payload >> PAGE_SHIFT;
 	offset   = payload & ~PAGE_MASK;
-	payload_per_buffer = p->payload_length / packet_count;
+	payload_per_buffer = packet->payload_length / packet_count;
 
 	for (i = 0; i < packet_count; i++) {
 		/* d points to the header descriptor */
@@ -2533,7 +2652,7 @@ static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base,
 
 		d->control      = cpu_to_le16(DESCRIPTOR_STATUS |
 					      DESCRIPTOR_INPUT_MORE);
-		if (p->skip && i == 0)
+		if (packet->skip && i == 0)
 			d->control |= cpu_to_le16(DESCRIPTOR_WAIT);
 		d->req_count    = cpu_to_le16(header_size);
 		d->res_count    = d->req_count;
@@ -2566,7 +2685,7 @@ static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base,
 		pd->control = cpu_to_le16(DESCRIPTOR_STATUS |
 					  DESCRIPTOR_INPUT_LAST |
 					  DESCRIPTOR_BRANCH_ALWAYS);
-		if (p->interrupt && i == packet_count - 1)
+		if (packet->interrupt && i == packet_count - 1)
 			pd->control |= cpu_to_le16(DESCRIPTOR_IRQ_ALWAYS);
 
 		context_append(&ctx->context, d, z, header_z);
@@ -2575,6 +2694,58 @@ static int ohci_queue_iso_receive_packet_per_buffer(struct fw_iso_context *base,
 	return 0;
 }
 
+static int queue_iso_buffer_fill(struct iso_context *ctx,
+				 struct fw_iso_packet *packet,
+				 struct fw_iso_buffer *buffer,
+				 unsigned long payload)
+{
+	struct descriptor *d;
+	dma_addr_t d_bus, page_bus;
+	int page, offset, rest, z, i, length;
+
+	page   = payload >> PAGE_SHIFT;
+	offset = payload & ~PAGE_MASK;
+	rest   = packet->payload_length;
+
+	/* We need one descriptor for each page in the buffer. */
+	z = DIV_ROUND_UP(offset + rest, PAGE_SIZE);
+
+	if (WARN_ON(offset & 3 || rest & 3 || page + z > buffer->page_count))
+		return -EFAULT;
+
+	for (i = 0; i < z; i++) {
+		d = context_get_descriptors(&ctx->context, 1, &d_bus);
+		if (d == NULL)
+			return -ENOMEM;
+
+		d->control = cpu_to_le16(DESCRIPTOR_INPUT_MORE |
+					 DESCRIPTOR_BRANCH_ALWAYS);
+		if (packet->skip && i == 0)
+			d->control |= cpu_to_le16(DESCRIPTOR_WAIT);
+		if (packet->interrupt && i == z - 1)
+			d->control |= cpu_to_le16(DESCRIPTOR_IRQ_ALWAYS);
+
+		if (offset + rest < PAGE_SIZE)
+			length = rest;
+		else
+			length = PAGE_SIZE - offset;
+		d->req_count = cpu_to_le16(length);
+		d->res_count = d->req_count;
+		d->transfer_status = 0;
+
+		page_bus = page_private(buffer->pages[page]);
+		d->data_address = cpu_to_le32(page_bus + offset);
+
+		rest -= length;
+		offset = 0;
+		page++;
+
+		context_append(&ctx->context, d, 1, 0);
+	}
+
+	return 0;
+}
+
 static int ohci_queue_iso(struct fw_iso_context *base,
 			  struct fw_iso_packet *packet,
 			  struct fw_iso_buffer *buffer,
@@ -2582,14 +2753,20 @@ static int ohci_queue_iso(struct fw_iso_context *base,
 {
 	struct iso_context *ctx = container_of(base, struct iso_context, base);
 	unsigned long flags;
-	int ret;
+	int ret = -ENOSYS;
 
 	spin_lock_irqsave(&ctx->context.ohci->lock, flags);
-	if (base->type == FW_ISO_CONTEXT_TRANSMIT)
-		ret = ohci_queue_iso_transmit(base, packet, buffer, payload);
-	else
-		ret = ohci_queue_iso_receive_packet_per_buffer(base, packet,
-							buffer, payload);
+	switch (base->type) {
+	case FW_ISO_CONTEXT_TRANSMIT:
+		ret = queue_iso_transmit(ctx, packet, buffer, payload);
+		break;
+	case FW_ISO_CONTEXT_RECEIVE:
+		ret = queue_iso_packet_per_buffer(ctx, packet, buffer, payload);
+		break;
+	case FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+		ret = queue_iso_buffer_fill(ctx, packet, buffer, payload);
+		break;
+	}
 	spin_unlock_irqrestore(&ctx->context.ohci->lock, flags);
 
 	return ret;
@@ -2609,6 +2786,7 @@ static const struct fw_card_driver ohci_driver = {
 
 	.allocate_iso_context	= ohci_allocate_iso_context,
 	.free_iso_context	= ohci_free_iso_context,
+	.set_iso_channels	= ohci_set_iso_channels,
 	.queue_iso		= ohci_queue_iso,
 	.start_iso		= ohci_start_iso,
 	.stop_iso		= ohci_stop_iso,
diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 14831119ff71..bc5c26fc1c64 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -25,17 +25,18 @@
 #include <linux/types.h>
 #include <linux/firewire-constants.h>
 
-#define FW_CDEV_EVENT_BUS_RESET			0x00
-#define FW_CDEV_EVENT_RESPONSE			0x01
-#define FW_CDEV_EVENT_REQUEST			0x02
-#define FW_CDEV_EVENT_ISO_INTERRUPT		0x03
-#define FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED	0x04
-#define FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED	0x05
+#define FW_CDEV_EVENT_BUS_RESET				0x00
+#define FW_CDEV_EVENT_RESPONSE				0x01
+#define FW_CDEV_EVENT_REQUEST				0x02
+#define FW_CDEV_EVENT_ISO_INTERRUPT			0x03
+#define FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED		0x04
+#define FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED		0x05
 
 /* available since kernel version 2.6.36 */
-#define FW_CDEV_EVENT_REQUEST2			0x06
-#define FW_CDEV_EVENT_PHY_PACKET_SENT		0x07
-#define FW_CDEV_EVENT_PHY_PACKET_RECEIVED	0x08
+#define FW_CDEV_EVENT_REQUEST2				0x06
+#define FW_CDEV_EVENT_PHY_PACKET_SENT			0x07
+#define FW_CDEV_EVENT_PHY_PACKET_RECEIVED		0x08
+#define FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL	0x09
 
 /**
  * struct fw_cdev_event_common - Common part of all fw_cdev_event_ types
@@ -218,35 +219,41 @@ struct fw_cdev_event_request2 {
  * This event is sent when the controller has completed an &fw_cdev_iso_packet
  * with the %FW_CDEV_ISO_INTERRUPT bit set.
  *
- * Isochronous transmit events:
+ * Isochronous transmit events (context type %FW_CDEV_ISO_CONTEXT_TRANSMIT):
  *
- * In version 1 of the ABI, &header_length is 0.  In version 3 and some
- * implementations of version 2 of the ABI, &header_length is a multiple of 4
- * and &header contains timestamps of all packets up until the interrupt packet.
- * The format of the timestamps is as described below for isochronous reception.
+ * In version 3 and some implementations of version 2 of the ABI, &header_length
+ * is a multiple of 4 and &header contains timestamps of all packets up until
+ * the interrupt packet.  The format of the timestamps is as described below for
+ * isochronous reception.  In version 1 of the ABI, &header_length was 0.
  *
- * Isochronous receive events:
+ * Isochronous receive events (context type %FW_CDEV_ISO_CONTEXT_RECEIVE):
  *
  * The headers stripped of all packets up until and including the interrupt
  * packet are returned in the @header field.  The amount of header data per
  * packet is as specified at iso context creation by
  * &fw_cdev_create_iso_context.header_size.
  *
- * In version 1 of this ABI, header data consisted of the 1394 isochronous
- * packet header, followed by quadlets from the packet payload if
- * &fw_cdev_create_iso_context.header_size > 4.
+ * Hence, _interrupt.header_length / _context.header_size is the number of
+ * packets received in this interrupt event.  The client can now iterate
+ * through the mmap()'ed DMA buffer according to this number of packets and
+ * to the buffer sizes as the client specified in &fw_cdev_queue_iso.
  *
- * In version 2 of this ABI, header data consist of the 1394 isochronous
- * packet header, followed by a timestamp quadlet if
- * &fw_cdev_create_iso_context.header_size > 4, followed by quadlets from the
- * packet payload if &fw_cdev_create_iso_context.header_size > 8.
+ * Since version 2 of this ABI, the portion for each packet in _interrupt.header
+ * consists of the 1394 isochronous packet header, followed by a timestamp
+ * quadlet if &fw_cdev_create_iso_context.header_size > 4, followed by quadlets
+ * from the packet payload if &fw_cdev_create_iso_context.header_size > 8.
  *
- * Behaviour of ver. 1 of this ABI is no longer available since ABI ver. 2.
+ * Format of 1394 iso packet header:  16 bits data_length, 2 bits tag, 6 bits
+ * channel, 4 bits tcode, 4 bits sy, in big endian byte order.
+ * data_length is the actual received size of the packet without the four
+ * 1394 iso packet header bytes.
+ *
+ * Format of timestamp:  16 bits invalid, 3 bits cycleSeconds, 13 bits
+ * cycleCount, in big endian byte order.
  *
- * Format of 1394 iso packet header: 16 bits len, 2 bits tag, 6 bits channel,
- * 4 bits tcode, 4 bits sy, in big endian byte order.  Format of timestamp:
- * 16 bits invalid, 3 bits cycleSeconds, 13 bits cycleCount, in big endian byte
- * order.
+ * In version 1 of the ABI, no timestamp quadlet was inserted; instead, payload
+ * data followed directly after the 1394 is header if header_size > 4.
+ * Behaviour of ver. 1 of this ABI is no longer available since ABI ver. 2.
  */
 struct fw_cdev_event_iso_interrupt {
 	__u64 closure;
@@ -256,6 +263,43 @@ struct fw_cdev_event_iso_interrupt {
 	__u32 header[0];
 };
 
+/**
+ * struct fw_cdev_event_iso_interrupt_mc - An iso buffer chunk was completed
+ * @closure:	See &fw_cdev_event_common;
+ *		set by %FW_CDEV_CREATE_ISO_CONTEXT ioctl
+ * @type:	%FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL
+ * @completed:	Offset into the receive buffer; data before this offest is valid
+ *
+ * This event is sent in multichannel contexts (context type
+ * %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL) for &fw_cdev_iso_packet buffer
+ * chunks that have the %FW_CDEV_ISO_INTERRUPT bit set.  Whether this happens
+ * when a packet is completed and/or when a buffer chunk is completed depends
+ * on the hardware implementation.
+ *
+ * The buffer is continuously filled with the following data, per packet:
+ *  - the 1394 iso packet header as described at &fw_cdev_event_iso_interrupt,
+ *    but in little endian byte order,
+ *  - packet payload (as many bytes as specified in the data_length field of
+ *    the 1394 iso packet header) in big endian byte order,
+ *  - 0...3 padding bytes as needed to align the following trailer quadlet,
+ *  - trailer quadlet, containing the reception timestamp as described at
+ *    &fw_cdev_event_iso_interrupt, but in little endian byte order.
+ *
+ * Hence the per-packet size is data_length (rounded up to a multiple of 4) + 8.
+ * When processing the data, stop before a packet that would cross the
+ * @completed offset.
+ *
+ * A packet near the end of a buffer chunk will typically spill over into the
+ * next queued buffer chunk.  It is the responsibility of the client to check
+ * for this condition, assemble a broken-up packet from its parts, and not to
+ * re-queue any buffer chunks in which as yet unread packet parts reside.
+ */
+struct fw_cdev_event_iso_interrupt_mc {
+	__u64 closure;
+	__u32 type;
+	__u32 completed;
+};
+
 /**
  * struct fw_cdev_event_iso_resource - Iso resources were allocated or freed
  * @closure:	See &fw_cdev_event_common;
@@ -311,16 +355,18 @@ struct fw_cdev_event_phy_packet {
 
 /**
  * union fw_cdev_event - Convenience union of fw_cdev_event_ types
- * @common:        Valid for all types
- * @bus_reset:     Valid if @common.type == %FW_CDEV_EVENT_BUS_RESET
- * @response:      Valid if @common.type == %FW_CDEV_EVENT_RESPONSE
- * @request:       Valid if @common.type == %FW_CDEV_EVENT_REQUEST
- * @request2:      Valid if @common.type == %FW_CDEV_EVENT_REQUEST2
- * @iso_interrupt: Valid if @common.type == %FW_CDEV_EVENT_ISO_INTERRUPT
- * @iso_resource:  Valid if @common.type ==
+ * @common:		Valid for all types
+ * @bus_reset:		Valid if @common.type == %FW_CDEV_EVENT_BUS_RESET
+ * @response:		Valid if @common.type == %FW_CDEV_EVENT_RESPONSE
+ * @request:		Valid if @common.type == %FW_CDEV_EVENT_REQUEST
+ * @request2:		Valid if @common.type == %FW_CDEV_EVENT_REQUEST2
+ * @iso_interrupt:	Valid if @common.type == %FW_CDEV_EVENT_ISO_INTERRUPT
+ * @iso_interrupt_mc:	Valid if @common.type ==
+ *				%FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL
+ * @iso_resource:	Valid if @common.type ==
  *				%FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or
  *				%FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED
- * @phy_packet:    Valid if @common.type ==
+ * @phy_packet:		Valid if @common.type ==
  *				%FW_CDEV_EVENT_PHY_PACKET_SENT or
  *				%FW_CDEV_EVENT_PHY_PACKET_RECEIVED
  *
@@ -337,10 +383,11 @@ union fw_cdev_event {
 	struct fw_cdev_event_bus_reset		bus_reset;
 	struct fw_cdev_event_response		response;
 	struct fw_cdev_event_request		request;
-	struct fw_cdev_event_request2		request2;     /* added in 2.6.36 */
+	struct fw_cdev_event_request2		request2;		/* added in 2.6.36 */
 	struct fw_cdev_event_iso_interrupt	iso_interrupt;
-	struct fw_cdev_event_iso_resource	iso_resource; /* added in 2.6.30 */
-	struct fw_cdev_event_phy_packet		phy_packet;   /* added in 2.6.36 */
+	struct fw_cdev_event_iso_interrupt_mc	iso_interrupt_mc;	/* added in 2.6.36 */
+	struct fw_cdev_event_iso_resource	iso_resource;		/* added in 2.6.30 */
+	struct fw_cdev_event_phy_packet		phy_packet;		/* added in 2.6.36 */
 };
 
 /* available since kernel version 2.6.22 */
@@ -375,6 +422,7 @@ union fw_cdev_event {
 /* available since kernel version 2.6.36 */
 #define FW_CDEV_IOC_SEND_PHY_PACKET    _IOWR('#', 0x15, struct fw_cdev_send_phy_packet)
 #define FW_CDEV_IOC_RECEIVE_PHY_PACKETS _IOW('#', 0x16, struct fw_cdev_receive_phy_packets)
+#define FW_CDEV_IOC_SET_ISO_CHANNELS    _IOW('#', 0x17, struct fw_cdev_set_iso_channels)
 
 /*
  * ABI version history
@@ -391,10 +439,13 @@ union fw_cdev_event {
  *               - shared use and auto-response for FCP registers
  *  3  (2.6.34)  - made &fw_cdev_get_cycle_timer reliable
  *               - added %FW_CDEV_IOC_GET_CYCLE_TIMER2
- *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_*
+ *  4  (2.6.36)  - added %FW_CDEV_EVENT_REQUEST2, %FW_CDEV_EVENT_PHY_PACKET_*,
+ *                 and &fw_cdev_allocate.region_end
  *               - implemented &fw_cdev_event_bus_reset.bm_node_id
  *               - added %FW_CDEV_IOC_SEND_PHY_PACKET, _RECEIVE_PHY_PACKETS
- *               - added &fw_cdev_allocate.region_end
+ *               - added %FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL,
+ *                 %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL, and
+ *                 %FW_CDEV_IOC_SET_ISO_CHANNELS
  */
 #define FW_CDEV_VERSION 3 /* Meaningless; don't use this macro. */
 
@@ -597,34 +648,43 @@ struct fw_cdev_remove_descriptor {
 	__u32 handle;
 };
 
-#define FW_CDEV_ISO_CONTEXT_TRANSMIT	0
-#define FW_CDEV_ISO_CONTEXT_RECEIVE	1
+#define FW_CDEV_ISO_CONTEXT_TRANSMIT			0
+#define FW_CDEV_ISO_CONTEXT_RECEIVE			1
+#define FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL	2 /* added in 2.6.36 */
 
 /**
- * struct fw_cdev_create_iso_context - Create a context for isochronous IO
- * @type:	%FW_CDEV_ISO_CONTEXT_TRANSMIT or %FW_CDEV_ISO_CONTEXT_RECEIVE
- * @header_size: Header size to strip for receive contexts
- * @channel:	Channel to bind to
- * @speed:	Speed for transmit contexts
- * @closure:	To be returned in &fw_cdev_event_iso_interrupt
+ * struct fw_cdev_create_iso_context - Create a context for isochronous I/O
+ * @type:	%FW_CDEV_ISO_CONTEXT_TRANSMIT or %FW_CDEV_ISO_CONTEXT_RECEIVE or
+ *		%FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL
+ * @header_size: Header size to strip in single-channel reception
+ * @channel:	Channel to bind to in single-channel reception or transmission
+ * @speed:	Transmission speed
+ * @closure:	To be returned in &fw_cdev_event_iso_interrupt or
+ *		&fw_cdev_event_iso_interrupt_multichannel
  * @handle:	Handle to context, written back by kernel
  *
  * Prior to sending or receiving isochronous I/O, a context must be created.
  * The context records information about the transmit or receive configuration
  * and typically maps to an underlying hardware resource.  A context is set up
  * for either sending or receiving.  It is bound to a specific isochronous
- * channel.
+ * @channel.
  *
- * If a context was successfully created, the kernel writes back a handle to the
- * context, which must be passed in for subsequent operations on that context.
+ * In case of multichannel reception, @header_size and @channel are ignored
+ * and the channels are selected by %FW_CDEV_IOC_SET_ISO_CHANNELS.
+ *
+ * For %FW_CDEV_ISO_CONTEXT_RECEIVE contexts, @header_size must be at least 4
+ * and must be a multiple of 4.  It is ignored in other context types.
  *
- * For receive contexts, @header_size must be at least 4 and must be a multiple
- * of 4.
+ * @speed is ignored in receive context types.
  *
- * Note that the effect of a @header_size > 4 depends on
- * &fw_cdev_get_info.version, as documented at &fw_cdev_event_iso_interrupt.
+ * If a context was successfully created, the kernel writes back a handle to the
+ * context, which must be passed in for subsequent operations on that context.
  *
+ * Limitations:
  * No more than one iso context can be created per fd.
+ * The total number of contexts that all userspace and kernelspace drivers can
+ * create on a card at a time is a hardware limit, typically 4 or 8 contexts per
+ * direction, and of them at most one multichannel receive context.
  */
 struct fw_cdev_create_iso_context {
 	__u32 type;
@@ -635,6 +695,22 @@ struct fw_cdev_create_iso_context {
 	__u32 handle;
 };
 
+/**
+ * struct fw_cdev_set_iso_channels - Select channels in multichannel reception
+ * @channels:	Bitmask of channels to listen to
+ * @handle:	Handle of the mutichannel receive context
+ *
+ * @channels is the bitwise or of 1ULL << n for each channel n to listen to.
+ *
+ * The ioctl fails with errno %EBUSY if there is already another receive context
+ * on a channel in @channels.  In that case, the bitmask of all unoccupied
+ * channels is returned in @channels.
+ */
+struct fw_cdev_set_iso_channels {
+	__u64 channels;
+	__u32 handle;
+};
+
 #define FW_CDEV_ISO_PAYLOAD_LENGTH(v)	(v)
 #define FW_CDEV_ISO_INTERRUPT		(1 << 16)
 #define FW_CDEV_ISO_SKIP		(1 << 17)
@@ -645,42 +721,72 @@ struct fw_cdev_create_iso_context {
 
 /**
  * struct fw_cdev_iso_packet - Isochronous packet
- * @control:	Contains the header length (8 uppermost bits), the sy field
- *		(4 bits), the tag field (2 bits), a sync flag (1 bit),
- *		a skip flag (1 bit), an interrupt flag (1 bit), and the
+ * @control:	Contains the header length (8 uppermost bits),
+ *		the sy field (4 bits), the tag field (2 bits), a sync flag
+ *		or a skip flag (1 bit), an interrupt flag (1 bit), and the
  *		payload length (16 lowermost bits)
- * @header:	Header and payload
+ * @header:	Header and payload in case of a transmit context.
  *
  * &struct fw_cdev_iso_packet is used to describe isochronous packet queues.
- *
  * Use the FW_CDEV_ISO_ macros to fill in @control.
+ * The @header array is empty in case of receive contexts.
+ *
+ * Context type %FW_CDEV_ISO_CONTEXT_TRANSMIT:
+ *
+ * @control.HEADER_LENGTH must be a multiple of 4.  It specifies the numbers of
+ * bytes in @header that will be prepended to the packet's payload.  These bytes
+ * are copied into the kernel and will not be accessed after the ioctl has
+ * returned.
+ *
+ * The @control.SY and TAG fields are copied to the iso packet header.  These
+ * fields are specified by IEEE 1394a and IEC 61883-1.
+ *
+ * The @control.SKIP flag specifies that no packet is to be sent in a frame.
+ * When using this, all other fields except @control.INTERRUPT must be zero.
+ *
+ * When a packet with the @control.INTERRUPT flag set has been completed, an
+ * &fw_cdev_event_iso_interrupt event will be sent.
+ *
+ * Context type %FW_CDEV_ISO_CONTEXT_RECEIVE:
  *
- * For transmit packets, the header length must be a multiple of 4 and specifies
- * the numbers of bytes in @header that will be prepended to the packet's
- * payload; these bytes are copied into the kernel and will not be accessed
- * after the ioctl has returned.  The sy and tag fields are copied to the iso
- * packet header (these fields are specified by IEEE 1394a and IEC 61883-1).
- * The skip flag specifies that no packet is to be sent in a frame; when using
- * this, all other fields except the interrupt flag must be zero.
- *
- * For receive packets, the header length must be a multiple of the context's
- * header size; if the header length is larger than the context's header size,
- * multiple packets are queued for this entry.  The sy and tag fields are
- * ignored.  If the sync flag is set, the context drops all packets until
- * a packet with a matching sy field is received (the sync value to wait for is
- * specified in the &fw_cdev_start_iso structure).  The payload length defines
- * how many payload bytes can be received for one packet (in addition to payload
- * quadlets that have been defined as headers and are stripped and returned in
- * the &fw_cdev_event_iso_interrupt structure).  If more bytes are received, the
- * additional bytes are dropped.  If less bytes are received, the remaining
- * bytes in this part of the payload buffer will not be written to, not even by
- * the next packet, i.e., packets received in consecutive frames will not
- * necessarily be consecutive in memory.  If an entry has queued multiple
- * packets, the payload length is divided equally among them.
- *
- * When a packet with the interrupt flag set has been completed, the
+ * @control.HEADER_LENGTH must be a multiple of the context's header_size.
+ * If the HEADER_LENGTH is larger than the context's header_size, multiple
+ * packets are queued for this entry.
+ *
+ * The @control.SY and TAG fields are ignored.
+ *
+ * If the @control.SYNC flag is set, the context drops all packets until a
+ * packet with a sy field is received which matches &fw_cdev_start_iso.sync.
+ *
+ * @control.PAYLOAD_LENGTH defines how many payload bytes can be received for
+ * one packet (in addition to payload quadlets that have been defined as headers
+ * and are stripped and returned in the &fw_cdev_event_iso_interrupt structure).
+ * If more bytes are received, the additional bytes are dropped.  If less bytes
+ * are received, the remaining bytes in this part of the payload buffer will not
+ * be written to, not even by the next packet.  I.e., packets received in
+ * consecutive frames will not necessarily be consecutive in memory.  If an
+ * entry has queued multiple packets, the PAYLOAD_LENGTH is divided equally
+ * among them.
+ *
+ * When a packet with the @control.INTERRUPT flag set has been completed, an
  * &fw_cdev_event_iso_interrupt event will be sent.  An entry that has queued
  * multiple receive packets is completed when its last packet is completed.
+ *
+ * Context type %FW_CDEV_ISO_CONTEXT_RECEIVE_MULTICHANNEL:
+ *
+ * Here, &fw_cdev_iso_packet would be more aptly named _iso_buffer_chunk since
+ * it specifies a chunk of the mmap()'ed buffer, while the number and alignment
+ * of packets to be placed into the buffer chunk is not known beforehand.
+ *
+ * @control.PAYLOAD_LENGTH is the size of the buffer chunk and specifies room
+ * for header, payload, padding, and trailer bytes of one or more packets.
+ * It must be a multiple of 4.
+ *
+ * @control.HEADER_LENGTH, TAG and SY are ignored.  SYNC is treated as described
+ * for single-channel reception.
+ *
+ * When a buffer chunk with the @control.INTERRUPT flag set has been filled
+ * entirely, an &fw_cdev_event_iso_interrupt_mc event will be sent.
  */
 struct fw_cdev_iso_packet {
 	__u32 control;
@@ -689,9 +795,9 @@ struct fw_cdev_iso_packet {
 
 /**
  * struct fw_cdev_queue_iso - Queue isochronous packets for I/O
- * @packets:	Userspace pointer to packet data
+ * @packets:	Userspace pointer to an array of &fw_cdev_iso_packet
  * @data:	Pointer into mmap()'ed payload buffer
- * @size:	Size of packet data in bytes
+ * @size:	Size of the @packets array, in bytes
  * @handle:	Isochronous context handle
  *
  * Queue a number of isochronous packets for reception or transmission.
@@ -704,6 +810,9 @@ struct fw_cdev_iso_packet {
  * The kernel may or may not queue all packets, but will write back updated
  * values of the @packets, @data and @size fields, so the ioctl can be
  * resubmitted easily.
+ *
+ * In case of a multichannel receive context, @data must be quadlet-aligned
+ * relative to the buffer start.
  */
 struct fw_cdev_queue_iso {
 	__u64 packets;
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index d974aa4a24c9..1cd637ef62d2 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -372,17 +372,19 @@ void fw_core_remove_descriptor(struct fw_descriptor *desc);
  * scatter-gather streaming (e.g. assembling video frame automatically).
  */
 struct fw_iso_packet {
-	u16 payload_length;	/* Length of indirect payload. */
-	u32 interrupt:1;	/* Generate interrupt on this packet */
-	u32 skip:1;		/* Set to not send packet at all. */
-	u32 tag:2;
-	u32 sy:4;
-	u32 header_length:8;	/* Length of immediate header. */
-	u32 header[0];
+	u16 payload_length;	/* Length of indirect payload		*/
+	u32 interrupt:1;	/* Generate interrupt on this packet	*/
+	u32 skip:1;		/* tx: Set to not send packet at all	*/
+				/* rx: Sync bit, wait for matching sy	*/
+	u32 tag:2;		/* tx: Tag in packet header		*/
+	u32 sy:4;		/* tx: Sy in packet header		*/
+	u32 header_length:8;	/* Length of immediate header		*/
+	u32 header[0];		/* tx: Top of 1394 isoch. data_block	*/
 };
 
-#define FW_ISO_CONTEXT_TRANSMIT	0
-#define FW_ISO_CONTEXT_RECEIVE	1
+#define FW_ISO_CONTEXT_TRANSMIT			0
+#define FW_ISO_CONTEXT_RECEIVE			1
+#define FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL	2
 
 #define FW_ISO_CONTEXT_MATCH_TAG0	 1
 #define FW_ISO_CONTEXT_MATCH_TAG1	 2
@@ -406,24 +408,31 @@ struct fw_iso_buffer {
 int fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card,
 		       int page_count, enum dma_data_direction direction);
 void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, struct fw_card *card);
+size_t fw_iso_buffer_lookup(struct fw_iso_buffer *buffer, dma_addr_t completed);
 
 struct fw_iso_context;
 typedef void (*fw_iso_callback_t)(struct fw_iso_context *context,
 				  u32 cycle, size_t header_length,
 				  void *header, void *data);
+typedef void (*fw_iso_mc_callback_t)(struct fw_iso_context *context,
+				     dma_addr_t completed, void *data);
 struct fw_iso_context {
 	struct fw_card *card;
 	int type;
 	int channel;
 	int speed;
 	size_t header_size;
-	fw_iso_callback_t callback;
+	union {
+		fw_iso_callback_t sc;
+		fw_iso_mc_callback_t mc;
+	} callback;
 	void *callback_data;
 };
 
 struct fw_iso_context *fw_iso_context_create(struct fw_card *card,
 		int type, int channel, int speed, size_t header_size,
 		fw_iso_callback_t callback, void *callback_data);
+int fw_iso_context_set_channels(struct fw_iso_context *ctx, u64 *channels);
 int fw_iso_context_queue(struct fw_iso_context *ctx,
 			 struct fw_iso_packet *packet,
 			 struct fw_iso_buffer *buffer,
-- 
cgit v1.2.3-59-g8ed1b


From c6601225380088018ae93df2ba7f0bb65334d63b Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Fri, 23 Jul 2010 15:04:01 -0600
Subject: of/device: Make of_device_make_bus_id() usable by other code.

The AMBA bus should also use of_device_make_bus_id() when populating device
out of device tree data.  This patch makes the function non-static, and
adds a suitable prototype in of_device.h

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/platform.c     | 2 +-
 include/linux/of_device.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 033a224a9fda..30a4641e798a 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -508,7 +508,7 @@ EXPORT_SYMBOL(of_unregister_driver);
  * value to derive a unique name.  As a last resort it will use the node
  * name followed by a unique number.
  */
-static void of_device_make_bus_id(struct device *dev)
+void of_device_make_bus_id(struct device *dev)
 {
 	static atomic_t bus_no_reg_magic;
 	struct device_node *node = dev->of_node;
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index e11a0be7893a..35aa44ad9f2c 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -27,6 +27,7 @@
 
 extern const struct of_device_id *of_match_device(
 	const struct of_device_id *matches, const struct device *dev);
+extern void of_device_make_bus_id(struct device *dev);
 
 /**
  * of_driver_match_device - Tell if a driver's of_match_table matches a device.
-- 
cgit v1.2.3-59-g8ed1b


From 559e2b7ee7a1c7753d534abcb2742a4775339293 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Fri, 23 Jul 2010 20:11:18 -0600
Subject: of: Provide default of_node_to_nid() implementation.

of_node_to_nid() is only relevant in a few architectures.  Don't force
everyone to implement it anyway.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/microblaze/include/asm/topology.h | 10 ----------
 arch/powerpc/include/asm/prom.h        |  7 +++++++
 arch/powerpc/include/asm/topology.h    |  7 -------
 arch/sparc/include/asm/prom.h          |  3 +--
 include/linux/of.h                     |  5 +++++
 5 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/topology.h b/arch/microblaze/include/asm/topology.h
index 96bcea5a9920..5428f333a02c 100644
--- a/arch/microblaze/include/asm/topology.h
+++ b/arch/microblaze/include/asm/topology.h
@@ -1,11 +1 @@
 #include <asm-generic/topology.h>
-
-#ifndef _ASM_MICROBLAZE_TOPOLOGY_H
-#define _ASM_MICROBLAZE_TOPOLOGY_H
-
-struct device_node;
-static inline int of_node_to_nid(struct device_node *device)
-{
-	return 0;
-}
-#endif /* _ASM_MICROBLAZE_TOPOLOGY_H */
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index da7dd634e7ce..55bccc0a21c3 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -103,6 +103,13 @@ struct device_node *of_find_next_cache_node(struct device_node *np);
 /* Get the MAC address */
 extern const void *of_get_mac_address(struct device_node *np);
 
+#ifdef CONFIG_NUMA
+extern int of_node_to_nid(struct device_node *device);
+#else
+static inline int of_node_to_nid(struct device_node *device) { return 0; }
+#endif
+#define of_node_to_nid of_node_to_nid
+
 /**
  * of_irq_map_pci - Resolve the interrupt for a PCI device
  * @pdev:	the device whose interrupt is to be resolved
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 32adf7280720..09dd38c8882c 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -41,8 +41,6 @@ static inline int cpu_to_node(int cpu)
 			       cpu_all_mask :				\
 			       node_to_cpumask_map[node])
 
-int of_node_to_nid(struct device_node *device);
-
 struct pci_bus;
 #ifdef CONFIG_PCI
 extern int pcibus_to_node(struct pci_bus *bus);
@@ -94,11 +92,6 @@ extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid);
 
 #else
 
-static inline int of_node_to_nid(struct device_node *device)
-{
-	return 0;
-}
-
 static inline void dump_numa_cpu_topology(void) {}
 
 static inline int sysfs_add_device_to_node(struct sys_device *dev, int nid)
diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index c82a7da25f9f..291f12575edd 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -43,8 +43,7 @@ extern int of_getintprop_default(struct device_node *np,
 extern int of_find_in_proplist(const char *list, const char *match, int len);
 #ifdef CONFIG_NUMA
 extern int of_node_to_nid(struct device_node *dp);
-#else
-#define of_node_to_nid(dp)	(-1)
+#define of_node_to_nid of_node_to_nid
 #endif
 
 extern void prom_build_devicetree(void);
diff --git a/include/linux/of.h b/include/linux/of.h
index b0756f33249e..cad7cf0ab278 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -146,6 +146,11 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size)
 
 #define OF_BAD_ADDR	((u64)-1)
 
+#ifndef of_node_to_nid
+static inline int of_node_to_nid(struct device_node *np) { return -1; }
+#define of_node_to_nid of_node_to_nid
+#endif
+
 extern struct device_node *of_find_node_by_name(struct device_node *from,
 	const char *name);
 #define for_each_node_by_name(dn, name) \
-- 
cgit v1.2.3-59-g8ed1b


From 12b15e83289bc7cf2ec9a342412e0c955beeb395 Mon Sep 17 00:00:00 2001
From: Anatolij Gustschin <agust@denx.de>
Date: Tue, 27 Jul 2010 22:35:58 +0200
Subject: of/spi: call of_register_spi_devices() from spi core code

Move of_register_spi_devices() call from drivers to
spi_register_master(). Also change the function to use
the struct device_node pointer from master spi device
instead of passing it as function argument.

Signed-off-by: Anatolij Gustschin <agust@denx.de>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/of/of_spi.c           | 10 ++++++----
 drivers/spi/mpc512x_psc_spi.c |  1 +
 drivers/spi/mpc52xx_psc_spi.c | 10 ++--------
 drivers/spi/mpc52xx_spi.c     |  3 +--
 drivers/spi/spi.c             |  4 ++++
 drivers/spi/spi_mpc8xxx.c     |  4 +---
 drivers/spi/spi_ppc4xx.c      |  2 +-
 drivers/spi/xilinx_spi.c      |  3 +++
 drivers/spi/xilinx_spi_of.c   |  3 ---
 include/linux/of_spi.h        | 11 ++++++++---
 10 files changed, 27 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/of_spi.c b/drivers/of/of_spi.c
index d504f1d1324b..1dbce58a58b0 100644
--- a/drivers/of/of_spi.c
+++ b/drivers/of/of_spi.c
@@ -15,12 +15,11 @@
 /**
  * of_register_spi_devices - Register child devices onto the SPI bus
  * @master:	Pointer to spi_master device
- * @np:		parent node of SPI device nodes
  *
- * Registers an spi_device for each child node of 'np' which has a 'reg'
+ * Registers an spi_device for each child node of master node which has a 'reg'
  * property.
  */
-void of_register_spi_devices(struct spi_master *master, struct device_node *np)
+void of_register_spi_devices(struct spi_master *master)
 {
 	struct spi_device *spi;
 	struct device_node *nc;
@@ -28,7 +27,10 @@ void of_register_spi_devices(struct spi_master *master, struct device_node *np)
 	int rc;
 	int len;
 
-	for_each_child_of_node(np, nc) {
+	if (!master->dev.of_node)
+		return;
+
+	for_each_child_of_node(master->dev.of_node, nc) {
 		/* Alloc an spi_device */
 		spi = spi_alloc_device(master);
 		if (!spi) {
diff --git a/drivers/spi/mpc512x_psc_spi.c b/drivers/spi/mpc512x_psc_spi.c
index 2534b1ec3edd..1bb4315f5f84 100644
--- a/drivers/spi/mpc512x_psc_spi.c
+++ b/drivers/spi/mpc512x_psc_spi.c
@@ -440,6 +440,7 @@ static int __init mpc512x_psc_spi_do_probe(struct device *dev, u32 regaddr,
 	master->setup = mpc512x_psc_spi_setup;
 	master->transfer = mpc512x_psc_spi_transfer;
 	master->cleanup = mpc512x_psc_spi_cleanup;
+	master->dev.of_node = dev->of_node;
 
 	tempp = ioremap(regaddr, size);
 	if (!tempp) {
diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c
index 7104cb739da7..bd81ff90cfb3 100644
--- a/drivers/spi/mpc52xx_psc_spi.c
+++ b/drivers/spi/mpc52xx_psc_spi.c
@@ -17,7 +17,6 @@
 #include <linux/errno.h>
 #include <linux/interrupt.h>
 #include <linux/of_platform.h>
-#include <linux/of_spi.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
 #include <linux/io.h>
@@ -398,6 +397,7 @@ static int __init mpc52xx_psc_spi_do_probe(struct device *dev, u32 regaddr,
 	master->setup = mpc52xx_psc_spi_setup;
 	master->transfer = mpc52xx_psc_spi_transfer;
 	master->cleanup = mpc52xx_psc_spi_cleanup;
+	master->dev.of_node = dev->of_node;
 
 	mps->psc = ioremap(regaddr, size);
 	if (!mps->psc) {
@@ -470,7 +470,6 @@ static int __init mpc52xx_psc_spi_of_probe(struct of_device *op,
 	const u32 *regaddr_p;
 	u64 regaddr64, size64;
 	s16 id = -1;
-	int rc;
 
 	regaddr_p = of_get_address(op->dev.of_node, 0, &size64, NULL);
 	if (!regaddr_p) {
@@ -491,13 +490,8 @@ static int __init mpc52xx_psc_spi_of_probe(struct of_device *op,
 		id = *psc_nump + 1;
 	}
 
-	rc = mpc52xx_psc_spi_do_probe(&op->dev, (u32)regaddr64, (u32)size64,
+	return mpc52xx_psc_spi_do_probe(&op->dev, (u32)regaddr64, (u32)size64,
 				irq_of_parse_and_map(op->dev.of_node, 0), id);
-	if (rc == 0)
-		of_register_spi_devices(dev_get_drvdata(&op->dev),
-					op->dev.of_node);
-
-	return rc;
 }
 
 static int __exit mpc52xx_psc_spi_of_remove(struct of_device *op)
diff --git a/drivers/spi/mpc52xx_spi.c b/drivers/spi/mpc52xx_spi.c
index b1a76bff775f..56136ff00e01 100644
--- a/drivers/spi/mpc52xx_spi.c
+++ b/drivers/spi/mpc52xx_spi.c
@@ -18,7 +18,6 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/spi/spi.h>
-#include <linux/of_spi.h>
 #include <linux/io.h>
 #include <linux/of_gpio.h>
 #include <linux/slab.h>
@@ -439,6 +438,7 @@ static int __devinit mpc52xx_spi_probe(struct of_device *op,
 	master->setup = mpc52xx_spi_setup;
 	master->transfer = mpc52xx_spi_transfer;
 	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_LSB_FIRST;
+	master->dev.of_node = op->dev.of_node;
 
 	dev_set_drvdata(&op->dev, master);
 
@@ -512,7 +512,6 @@ static int __devinit mpc52xx_spi_probe(struct of_device *op,
 	if (rc)
 		goto err_register;
 
-	of_register_spi_devices(master, op->dev.of_node);
 	dev_info(&ms->master->dev, "registered MPC5200 SPI bus\n");
 
 	return rc;
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index b3a1f9259b62..1bb1b88780ce 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/mod_devicetable.h>
 #include <linux/spi/spi.h>
+#include <linux/of_spi.h>
 
 
 /* SPI bustype and spi_master class are registered after board init code
@@ -540,6 +541,9 @@ int spi_register_master(struct spi_master *master)
 	/* populate children from any spi device tables */
 	scan_boardinfo(master);
 	status = 0;
+
+	/* Register devices from the device tree */
+	of_register_spi_devices(master);
 done:
 	return status;
 }
diff --git a/drivers/spi/spi_mpc8xxx.c b/drivers/spi/spi_mpc8xxx.c
index 97ab0a81338a..aad9ae1b9c69 100644
--- a/drivers/spi/spi_mpc8xxx.c
+++ b/drivers/spi/spi_mpc8xxx.c
@@ -38,7 +38,6 @@
 #include <linux/of_platform.h>
 #include <linux/gpio.h>
 #include <linux/of_gpio.h>
-#include <linux/of_spi.h>
 #include <linux/slab.h>
 
 #include <sysdev/fsl_soc.h>
@@ -1009,6 +1008,7 @@ mpc8xxx_spi_probe(struct device *dev, struct resource *mem, unsigned int irq)
 	master->setup = mpc8xxx_spi_setup;
 	master->transfer = mpc8xxx_spi_transfer;
 	master->cleanup = mpc8xxx_spi_cleanup;
+	master->dev.of_node = dev->of_node;
 
 	mpc8xxx_spi = spi_master_get_devdata(master);
 	mpc8xxx_spi->dev = dev;
@@ -1299,8 +1299,6 @@ static int __devinit of_mpc8xxx_spi_probe(struct of_device *ofdev,
 		goto err;
 	}
 
-	of_register_spi_devices(master, np);
-
 	return 0;
 
 err:
diff --git a/drivers/spi/spi_ppc4xx.c b/drivers/spi/spi_ppc4xx.c
index d53466a249d9..0f5fa7e2a550 100644
--- a/drivers/spi/spi_ppc4xx.c
+++ b/drivers/spi/spi_ppc4xx.c
@@ -407,6 +407,7 @@ static int __init spi_ppc4xx_of_probe(struct of_device *op,
 	master = spi_alloc_master(dev, sizeof *hw);
 	if (master == NULL)
 		return -ENOMEM;
+	master->dev.of_node = np;
 	dev_set_drvdata(dev, master);
 	hw = spi_master_get_devdata(master);
 	hw->master = spi_master_get(master);
@@ -545,7 +546,6 @@ static int __init spi_ppc4xx_of_probe(struct of_device *op,
 	}
 
 	dev_info(dev, "driver initialized\n");
-	of_register_spi_devices(master, np);
 
 	return 0;
 
diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c
index 1b47363cb73f..80f2db5bcfd6 100644
--- a/drivers/spi/xilinx_spi.c
+++ b/drivers/spi/xilinx_spi.c
@@ -390,6 +390,9 @@ struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
 
 	master->bus_num = bus_num;
 	master->num_chipselect = pdata->num_chipselect;
+#ifdef CONFIG_OF
+	master->dev.of_node = dev->of_node;
+#endif
 
 	xspi->mem = *mem;
 	xspi->irq = irq;
diff --git a/drivers/spi/xilinx_spi_of.c b/drivers/spi/xilinx_spi_of.c
index 4654805b08d8..87cda0956a83 100644
--- a/drivers/spi/xilinx_spi_of.c
+++ b/drivers/spi/xilinx_spi_of.c
@@ -80,9 +80,6 @@ static int __devinit xilinx_spi_of_probe(struct of_device *ofdev,
 
 	dev_set_drvdata(&ofdev->dev, master);
 
-	/* Add any subnodes on the SPI bus */
-	of_register_spi_devices(master, ofdev->dev.of_node);
-
 	return 0;
 }
 
diff --git a/include/linux/of_spi.h b/include/linux/of_spi.h
index 5f71ee8c0868..9e3e70f78ae6 100644
--- a/include/linux/of_spi.h
+++ b/include/linux/of_spi.h
@@ -9,10 +9,15 @@
 #ifndef __LINUX_OF_SPI_H
 #define __LINUX_OF_SPI_H
 
-#include <linux/of.h>
 #include <linux/spi/spi.h>
 
-extern void of_register_spi_devices(struct spi_master *master,
-				    struct device_node *np);
+#if defined(CONFIG_OF_SPI) || defined(CONFIG_OF_SPI_MODULE)
+extern void of_register_spi_devices(struct spi_master *master);
+#else
+static inline void of_register_spi_devices(struct spi_master *master)
+{
+	return;
+}
+#endif /* CONFIG_OF_SPI */
 
 #endif /* __LINUX_OF_SPI */
-- 
cgit v1.2.3-59-g8ed1b


From 253d2e549818f5a4a52e2db0aba3dacee21e5b38 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 16 Jul 2010 10:19:22 -0700
Subject: PCI: disable mmio during bar sizing

It is a known issue that mmio decoding shall be disabled while doing PCI
bar sizing. Host bridge and other devices (PCI PIC) shall be excluded for
certain platforms. This patch mainly comes from Mathew Willcox's
patch in http://kerneltrap.org/mailarchive/linux-kernel/2007/9/13/258969.

A new flag bit "mmio_alway_on" is added to pci_dev with the intention that
devices with their mmio decoding cannot be disabled during BAR sizing shall
have this bit set, preferrablly in their quirks.

Without this patch, Intel Moorestown platform graphics unit will be
corrupted during bar sizing activities.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/pci/probe.c  | 10 ++++++++++
 drivers/pci/quirks.c | 13 +++++++++++++
 include/linux/pci.h  |  2 ++
 3 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index f4adba2d1dd3..12625d90f8b5 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -163,9 +163,16 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 			struct resource *res, unsigned int pos)
 {
 	u32 l, sz, mask;
+	u16 orig_cmd;
 
 	mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
 
+	if (!dev->mmio_always_on) {
+		pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
+		pci_write_config_word(dev, PCI_COMMAND,
+			orig_cmd & ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO));
+	}
+
 	res->name = pci_name(dev);
 
 	pci_read_config_dword(dev, pos, &l);
@@ -173,6 +180,9 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 	pci_read_config_dword(dev, pos, &sz);
 	pci_write_config_dword(dev, pos, l);
 
+	if (!dev->mmio_always_on)
+		pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
+
 	/*
 	 * All bits set in sz means the device isn't working properly.
 	 * If the BAR isn't implemented, all bits must be 0.  If it's a
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 3a81d9d44019..202efa6f57c4 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -91,6 +91,19 @@ static void __devinit quirk_resource_alignment(struct pci_dev *dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, quirk_resource_alignment);
 
+/*
+ * Decoding should be disabled for a PCI device during BAR sizing to avoid
+ * conflict. But doing so may cause problems on host bridge and perhaps other
+ * key system devices. For devices that need to have mmio decoding always-on,
+ * we need to set the dev->mmio_always_on bit.
+ */
+static void __devinit quirk_mmio_always_on(struct pci_dev *dev)
+{
+	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
+		dev->mmio_always_on = 1;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, quirk_mmio_always_on);
+
 /* The Mellanox Tavor device gives false positive parity errors
  * Mark this device with a broken_parity_status, to allow
  * PCI scanning code to "skip" this now blacklisted device.
diff --git a/include/linux/pci.h b/include/linux/pci.h
index f26fda76b87f..b1d17956a153 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -270,6 +270,8 @@ struct pci_dev {
 	unsigned int	d1_support:1;	/* Low power state D1 is supported */
 	unsigned int	d2_support:1;	/* Low power state D2 is supported */
 	unsigned int	no_d1d2:1;	/* Only allow D0 and D3 */
+	unsigned int	mmio_always_on:1;	/* disallow turning off io/mem
+						   decoding during bar sizing */
 	unsigned int	wakeup_prepared:1;
 	unsigned int	d3_delay;	/* D3->D0 transition time in ms */
 
-- 
cgit v1.2.3-59-g8ed1b


From 911e1c9b05a8e3559a7aa89083930700a0b9e7ee Mon Sep 17 00:00:00 2001
From: Narendra K <Narendra_K@dell.com>
Date: Mon, 26 Jul 2010 05:56:50 -0500
Subject: PCI: export SMBIOS provided firmware instance and label to sysfs

This patch exports SMBIOS provided firmware instance and label of
onboard PCI devices to sysfs.  New files are:
  /sys/bus/pci/devices/.../label which contains the firmware name for
the device in question, and
  /sys/bus/pci/devices/.../index which contains the firmware device type
instance for the given device.

Signed-off-by: Jordan Hargrave <jordan_hargrave@dell.com>
Signed-off-by: Narendra K <narendra_k@dell.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/ABI/testing/sysfs-bus-pci |  27 ++++++
 drivers/firmware/dmi_scan.c             |  25 ++++++
 drivers/pci/Makefile                    |   3 +
 drivers/pci/pci-label.c                 | 143 ++++++++++++++++++++++++++++++++
 drivers/pci/pci-sysfs.c                 |   5 ++
 drivers/pci/pci.h                       |   9 ++
 include/linux/dmi.h                     |   9 ++
 7 files changed, 221 insertions(+)
 create mode 100644 drivers/pci/pci-label.c

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 25be3250f7d6..f979d825d112 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -139,3 +139,30 @@ Contact:	linux-pci@vger.kernel.org
 Description:
 		This symbolic link points to the PCI hotplug controller driver
 		module that manages the hotplug slot.
+
+What:		/sys/bus/pci/devices/.../label
+Date:		July 2010
+Contact:	Narendra K <narendra_k@dell.com>, linux-bugs@dell.com
+Description:
+		Reading this attribute will provide the firmware
+		given name(SMBIOS type 41 string) of the PCI device.
+		The attribute will be created only if the firmware
+		has given a name to the PCI device.
+Users:
+		Userspace applications interested in knowing the
+		firmware assigned name of the PCI device.
+
+What:		/sys/bus/pci/devices/.../index
+Date:		July 2010
+Contact:	Narendra K <narendra_k@dell.com>, linux-bugs@dell.com
+Description:
+		Reading this attribute will provide the firmware
+		given instance(SMBIOS type 41 device type instance)
+		of the PCI device. The attribute will be created
+		only if the firmware has given a device type instance
+		to the PCI device.
+Users:
+		Userspace applications interested in knowing the
+		firmware assigned device type instance of the PCI
+		device that can help in understanding the firmware
+		intended order of the PCI device.
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index d46467271349..b3d22d659990 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -277,6 +277,29 @@ static void __init dmi_save_ipmi_device(const struct dmi_header *dm)
 	list_add_tail(&dev->list, &dmi_devices);
 }
 
+static void __init dmi_save_dev_onboard(int instance, int segment, int bus,
+					int devfn, const char *name)
+{
+	struct dmi_dev_onboard *onboard_dev;
+
+	onboard_dev = dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1);
+	if (!onboard_dev) {
+		printk(KERN_ERR "dmi_save_dev_onboard: out of memory.\n");
+		return;
+	}
+	onboard_dev->instance = instance;
+	onboard_dev->segment = segment;
+	onboard_dev->bus = bus;
+	onboard_dev->devfn = devfn;
+
+	strcpy((char *)&onboard_dev[1], name);
+	onboard_dev->dev.type = DMI_DEV_TYPE_DEV_ONBOARD;
+	onboard_dev->dev.name = (char *)&onboard_dev[1];
+	onboard_dev->dev.device_data = onboard_dev;
+
+	list_add(&onboard_dev->dev.list, &dmi_devices);
+}
+
 static void __init dmi_save_extended_devices(const struct dmi_header *dm)
 {
 	const u8 *d = (u8*) dm + 5;
@@ -285,6 +308,8 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm)
 	if ((*d & 0x80) == 0)
 		return;
 
+	dmi_save_dev_onboard(*(d+1), *(u16 *)(d+2), *(d+4), *(d+5),
+			     dmi_string_nosave(dm, *(d-1)));
 	dmi_save_one_device(*d & 0x7f, dmi_string_nosave(dm, *(d - 1)));
 }
 
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 0b51857fbaf7..dc1aa0922868 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -55,6 +55,9 @@ obj-$(CONFIG_MICROBLAZE) += setup-bus.o
 #
 obj-$(CONFIG_ACPI)    += pci-acpi.o
 
+# SMBIOS provided firmware instance and labels
+obj-$(CONFIG_DMI)    += pci-label.o
+
 # Cardbus & CompactPCI use setup-bus
 obj-$(CONFIG_HOTPLUG) += setup-bus.o
 
diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c
new file mode 100644
index 000000000000..111500e86f94
--- /dev/null
+++ b/drivers/pci/pci-label.c
@@ -0,0 +1,143 @@
+/*
+ * Purpose: Export the firmware instance and label associated with
+ * a pci device to sysfs
+ * Copyright (C) 2010 Dell Inc.
+ * by Narendra K <Narendra_K@dell.com>,
+ * Jordan Hargrave <Jordan_Hargrave@dell.com>
+ *
+ * SMBIOS defines type 41 for onboard pci devices. This code retrieves
+ * the instance number and string from the type 41 record and exports
+ * it to sysfs.
+ *
+ * Please see http://linux.dell.com/wiki/index.php/Oss/libnetdevname for more
+ * information.
+ */
+
+#include <linux/dmi.h>
+#include <linux/sysfs.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include "pci.h"
+
+enum smbios_attr_enum {
+	SMBIOS_ATTR_NONE = 0,
+	SMBIOS_ATTR_LABEL_SHOW,
+	SMBIOS_ATTR_INSTANCE_SHOW,
+};
+
+static mode_t
+find_smbios_instance_string(struct pci_dev *pdev, char *buf,
+			    enum smbios_attr_enum attribute)
+{
+	const struct dmi_device *dmi;
+	struct dmi_dev_onboard *donboard;
+	int bus;
+	int devfn;
+
+	bus = pdev->bus->number;
+	devfn = pdev->devfn;
+
+	dmi = NULL;
+	while ((dmi = dmi_find_device(DMI_DEV_TYPE_DEV_ONBOARD,
+				      NULL, dmi)) != NULL) {
+		donboard = dmi->device_data;
+		if (donboard && donboard->bus == bus &&
+					donboard->devfn == devfn) {
+			if (buf) {
+				if (attribute == SMBIOS_ATTR_INSTANCE_SHOW)
+					return scnprintf(buf, PAGE_SIZE,
+							 "%d\n",
+							 donboard->instance);
+				else if (attribute == SMBIOS_ATTR_LABEL_SHOW)
+					return scnprintf(buf, PAGE_SIZE,
+							 "%s\n",
+							 dmi->name);
+			}
+			return strlen(dmi->name);
+		}
+	}
+	return 0;
+}
+
+static mode_t
+smbios_instance_string_exist(struct kobject *kobj, struct attribute *attr,
+			     int n)
+{
+	struct device *dev;
+	struct pci_dev *pdev;
+
+	dev = container_of(kobj, struct device, kobj);
+	pdev = to_pci_dev(dev);
+
+	return find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE) ?
+					   S_IRUGO : 0;
+}
+
+static ssize_t
+smbioslabel_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev;
+	pdev = to_pci_dev(dev);
+
+	return find_smbios_instance_string(pdev, buf,
+					   SMBIOS_ATTR_LABEL_SHOW);
+}
+
+static ssize_t
+smbiosinstance_show(struct device *dev,
+		    struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev;
+	pdev = to_pci_dev(dev);
+
+	return find_smbios_instance_string(pdev, buf,
+					   SMBIOS_ATTR_INSTANCE_SHOW);
+}
+
+static struct device_attribute smbios_attr_label = {
+	.attr = {.name = "label", .mode = 0444, .owner = THIS_MODULE},
+	.show = smbioslabel_show,
+};
+
+static struct device_attribute smbios_attr_instance = {
+	.attr = {.name = "index", .mode = 0444, .owner = THIS_MODULE},
+	.show = smbiosinstance_show,
+};
+
+static struct attribute *smbios_attributes[] = {
+	&smbios_attr_label.attr,
+	&smbios_attr_instance.attr,
+	NULL,
+};
+
+static struct attribute_group smbios_attr_group = {
+	.attrs = smbios_attributes,
+	.is_visible = smbios_instance_string_exist,
+};
+
+static int
+pci_create_smbiosname_file(struct pci_dev *pdev)
+{
+	if (!sysfs_create_group(&pdev->dev.kobj, &smbios_attr_group))
+		return 0;
+	return -ENODEV;
+}
+
+static void
+pci_remove_smbiosname_file(struct pci_dev *pdev)
+{
+	sysfs_remove_group(&pdev->dev.kobj, &smbios_attr_group);
+}
+
+void pci_create_firmware_label_files(struct pci_dev *pdev)
+{
+	if (!pci_create_smbiosname_file(pdev))
+		;
+}
+
+void pci_remove_firmware_label_files(struct pci_dev *pdev)
+{
+	pci_remove_smbiosname_file(pdev);
+}
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index f7692dc531e4..b5a7d9bfcb24 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1165,6 +1165,8 @@ int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev)
 	if (retval)
 		goto err_vga_file;
 
+	pci_create_firmware_label_files(pdev);
+
 	return 0;
 
 err_vga_file:
@@ -1232,6 +1234,9 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev)
 		sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr);
 		kfree(pdev->rom_attr);
 	}
+
+	pci_remove_firmware_label_files(pdev);
+
 }
 
 static int __init pci_sysfs_init(void)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index f8077b3c8c8c..d930338e0922 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -11,6 +11,15 @@
 extern int pci_uevent(struct device *dev, struct kobj_uevent_env *env);
 extern int pci_create_sysfs_dev_files(struct pci_dev *pdev);
 extern void pci_remove_sysfs_dev_files(struct pci_dev *pdev);
+#ifndef CONFIG_DMI
+static inline void pci_create_firmware_label_files(struct pci_dev *pdev)
+{ return 0; }
+static inline void pci_remove_firmware_label_files(struct pci_dev *pdev)
+{ return 0; }
+#else
+extern void pci_create_firmware_label_files(struct pci_dev *pdev);
+extern void pci_remove_firmware_label_files(struct pci_dev *pdev);
+#endif
 extern void pci_cleanup_rom(struct pci_dev *dev);
 #ifdef HAVE_PCI_MMAP
 extern int pci_mmap_fits(struct pci_dev *pdev, int resno,
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index a8a3e1ac281d..90e087f8d951 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -20,6 +20,7 @@ enum dmi_device_type {
 	DMI_DEV_TYPE_SAS,
 	DMI_DEV_TYPE_IPMI = -1,
 	DMI_DEV_TYPE_OEM_STRING = -2,
+	DMI_DEV_TYPE_DEV_ONBOARD = -3,
 };
 
 struct dmi_header {
@@ -37,6 +38,14 @@ struct dmi_device {
 
 #ifdef CONFIG_DMI
 
+struct dmi_dev_onboard {
+	struct dmi_device dev;
+	int instance;
+	int segment;
+	int bus;
+	int devfn;
+};
+
 extern int dmi_check_system(const struct dmi_system_id *list);
 const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list);
 extern const char * dmi_get_system_info(int field);
-- 
cgit v1.2.3-59-g8ed1b


From 30da55242818a8ca08583188ebcbaccd283ad4d9 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 23 Jul 2010 14:56:28 +0100
Subject: PCI: MSI: Restore read_msi_msg_desc(); add get_cached_msi_msg_desc()

commit 2ca1af9aa3285c6a5f103ed31ad09f7399fc65d7 "PCI: MSI: Remove
unsafe and unnecessary hardware access" changed read_msi_msg_desc() to
return the last MSI message written instead of reading it from the
device, since it may be called while the device is in a reduced
power state.

However, the pSeries platform code really does need to read messages
from the device, since they are initially written by firmware.
Therefore:
- Restore the previous behaviour of read_msi_msg_desc()
- Add new functions get_cached_msi_msg{,_desc}() which return the
  last MSI message written
- Use the new functions where appropriate

Acked-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/ia64/kernel/msi_ia64.c    |  2 +-
 arch/ia64/sn/kernel/msi_sn.c   |  2 +-
 arch/x86/kernel/apic/io_apic.c |  2 +-
 drivers/pci/msi.c              | 47 +++++++++++++++++++++++++++++++++++++-----
 include/linux/msi.h            |  2 ++
 5 files changed, 47 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 6c8922856049..4a746ea838ff 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -25,7 +25,7 @@ static int ia64_set_msi_irq_affinity(unsigned int irq,
 	if (irq_prepare_move(irq, cpu))
 		return -1;
 
-	read_msi_msg(irq, &msg);
+	get_cached_msi_msg(irq, &msg);
 
 	addr = msg.address_lo;
 	addr &= MSI_ADDR_DEST_ID_MASK;
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index ebfdd6a9ae1a..0c72dd463831 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -175,7 +175,7 @@ static int sn_set_msi_irq_affinity(unsigned int irq,
 	 * Release XIO resources for the old MSI PCI address
 	 */
 
-	read_msi_msg(irq, &msg);
+	get_cached_msi_msg(irq, &msg);
         sn_pdev = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
 	pdev = sn_pdev->pdi_linux_pcidev;
 	provider = SN_PCIDEV_BUSPROVIDER(pdev);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..4dc0084ec1b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
 	cfg = desc->chip_data;
 
-	read_msi_msg_desc(desc, &msg);
+	get_cached_msi_msg_desc(desc, &msg);
 
 	msg.data &= ~MSI_DATA_VECTOR_MASK;
 	msg.data |= MSI_DATA_VECTOR(cfg->vector);
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4c14f31f2b4d..69b7be33b3a2 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -197,9 +197,46 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
 	struct msi_desc *entry = get_irq_desc_msi(desc);
 
-	/* We do not touch the hardware (which may not even be
-	 * accessible at the moment) but return the last message
-	 * written.  Assert that this is valid, assuming that
+	BUG_ON(entry->dev->current_state != PCI_D0);
+
+	if (entry->msi_attrib.is_msix) {
+		void __iomem *base = entry->mask_base +
+			entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+		msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
+		msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
+		msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
+	} else {
+		struct pci_dev *dev = entry->dev;
+		int pos = entry->msi_attrib.pos;
+		u16 data;
+
+		pci_read_config_dword(dev, msi_lower_address_reg(pos),
+					&msg->address_lo);
+		if (entry->msi_attrib.is_64) {
+			pci_read_config_dword(dev, msi_upper_address_reg(pos),
+						&msg->address_hi);
+			pci_read_config_word(dev, msi_data_reg(pos, 1), &data);
+		} else {
+			msg->address_hi = 0;
+			pci_read_config_word(dev, msi_data_reg(pos, 0), &data);
+		}
+		msg->data = data;
+	}
+}
+
+void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	read_msi_msg_desc(desc, msg);
+}
+
+void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
+{
+	struct msi_desc *entry = get_irq_desc_msi(desc);
+
+	/* Assert that the cache is valid, assuming that
 	 * valid messages are not all-zeroes. */
 	BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo |
 		 entry->msg.data));
@@ -207,11 +244,11 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 	*msg = entry->msg;
 }
 
-void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	read_msi_msg_desc(desc, msg);
+	get_cached_msi_msg_desc(desc, msg);
 }
 
 void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 6991ab5b24d1..91b05c171854 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -14,8 +14,10 @@ struct irq_desc;
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
 extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
+extern void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
 extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
+extern void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
 struct msi_desc {
-- 
cgit v1.2.3-59-g8ed1b


From f11ac8db5d07b6e99d41ff4aa39d878ee5cef1c5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 Jun 2010 16:35:53 -0400
Subject: NFSv4: Ensure that we track the NFSv4 lock state in read/write
 requests.

This patch fixes bugzilla entry 14501:
  https://bugzilla.kernel.org/show_bug.cgi?id=14501

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c          | 29 +++++++++++++++-----
 fs/nfs/inode.c           | 70 +++++++++++++++++++++++++++++++++++++++++++++---
 fs/nfs/nfs4xdr.c         |  8 +++---
 fs/nfs/pagelist.c        |  8 +++++-
 fs/nfs/read.c            |  1 +
 fs/nfs/write.c           |  5 +++-
 include/linux/nfs_fs.h   | 13 +++++++--
 include/linux/nfs_page.h |  1 +
 include/linux/nfs_xdr.h  |  2 ++
 9 files changed, 118 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ad4cd31d6050..064a80961677 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,6 +69,7 @@ struct nfs_direct_req {
 
 	/* I/O parameters */
 	struct nfs_open_context	*ctx;		/* file open context info */
+	struct nfs_lock_context *l_ctx;		/* Lock context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
 
@@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 	INIT_LIST_HEAD(&dreq->rewrite_list);
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
+	dreq->l_ctx = NULL;
 	spin_lock_init(&dreq->lock);
 	atomic_set(&dreq->io_count, 0);
 	dreq->count = 0;
@@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref)
 {
 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 
+	if (dreq->l_ctx != NULL)
+		nfs_put_lock_context(dreq->l_ctx);
 	if (dreq->ctx != NULL)
 		put_nfs_open_context(dreq->ctx);
 	kmem_cache_free(nfs_direct_cachep, dreq);
@@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
+		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos)
 {
-	ssize_t result = 0;
+	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct nfs_direct_req *dreq;
 
 	dreq = nfs_direct_req_alloc();
-	if (!dreq)
-		return -ENOMEM;
+	if (dreq == NULL)
+		goto out;
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx == NULL)
+		goto out_release;
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+out_release:
 	nfs_direct_req_release(dreq);
-
+out:
 	return result;
 }
 
@@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->args.offset = 0;
 	data->args.count = 0;
 	data->args.context = dreq->ctx;
+	data->args.lock_context = dreq->l_ctx;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
+		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos,
 				size_t count)
 {
-	ssize_t result = 0;
+	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct nfs_direct_req *dreq;
 	size_t wsize = NFS_SERVER(inode)->wsize;
@@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
-		return -ENOMEM;
+		goto out;
 	nfs_alloc_commit_data(dreq);
 
 	if (dreq->commit_data == NULL || count < wsize)
@@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx != NULL)
+		goto out_release;
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+out_release:
 	nfs_direct_req_release(dreq);
-
+out:
 	return result;
 }
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 099b3518feea..ec7a8f96a2c2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -530,6 +530,68 @@ out:
 	return err;
 }
 
+static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
+{
+	atomic_set(&l_ctx->count, 1);
+	l_ctx->lockowner = current->files;
+	l_ctx->pid = current->tgid;
+	INIT_LIST_HEAD(&l_ctx->list);
+}
+
+static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
+{
+	struct nfs_lock_context *pos;
+
+	list_for_each_entry(pos, &ctx->lock_context.list, list) {
+		if (pos->lockowner != current->files)
+			continue;
+		if (pos->pid != current->tgid)
+			continue;
+		atomic_inc(&pos->count);
+		return pos;
+	}
+	return NULL;
+}
+
+struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
+{
+	struct nfs_lock_context *res, *new = NULL;
+	struct inode *inode = ctx->path.dentry->d_inode;
+
+	spin_lock(&inode->i_lock);
+	res = __nfs_find_lock_context(ctx);
+	if (res == NULL) {
+		spin_unlock(&inode->i_lock);
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		if (new == NULL)
+			return NULL;
+		nfs_init_lock_context(new);
+		spin_lock(&inode->i_lock);
+		res = __nfs_find_lock_context(ctx);
+		if (res == NULL) {
+			list_add_tail(&new->list, &ctx->lock_context.list);
+			new->open_context = ctx;
+			res = new;
+			new = NULL;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	kfree(new);
+	return res;
+}
+
+void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
+{
+	struct nfs_open_context *ctx = l_ctx->open_context;
+	struct inode *inode = ctx->path.dentry->d_inode;
+
+	if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
+		return;
+	list_del(&l_ctx->list);
+	spin_unlock(&inode->i_lock);
+	kfree(l_ctx);
+}
+
 /**
  * nfs_close_context - Common close_context() routine NFSv2/v3
  * @ctx: pointer to context
@@ -566,11 +628,11 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
 		path_get(&ctx->path);
 		ctx->cred = get_rpccred(cred);
 		ctx->state = NULL;
-		ctx->lockowner = current->files;
 		ctx->flags = 0;
 		ctx->error = 0;
 		ctx->dir_cookie = 0;
-		atomic_set(&ctx->count, 1);
+		nfs_init_lock_context(&ctx->lock_context);
+		ctx->lock_context.open_context = ctx;
 	}
 	return ctx;
 }
@@ -578,7 +640,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
 struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 {
 	if (ctx != NULL)
-		atomic_inc(&ctx->count);
+		atomic_inc(&ctx->lock_context.count);
 	return ctx;
 }
 
@@ -586,7 +648,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
 	struct inode *inode = ctx->path.dentry->d_inode;
 
-	if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
+	if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
 		return;
 	list_del(&ctx->list);
 	spin_unlock(&inode->i_lock);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1f7781d636ae..873b62f209ea 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1324,14 +1324,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	hdr->replen += decode_putrootfh_maxsz;
 }
 
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
+static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
 {
 	nfs4_stateid stateid;
 	__be32 *p;
 
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
-		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
+		nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner);
 		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
 		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1344,7 +1344,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_READ);
 
-	encode_stateid(xdr, args->context);
+	encode_stateid(xdr, args->context, args->lock_context);
 
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1523,7 +1523,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_WRITE);
 
-	encode_stateid(xdr, args->context);
+	encode_stateid(xdr, args->context, args->lock_context);
 
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a3654e57b589..919490232e17 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -79,6 +79,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	req->wb_pgbase	= offset;
 	req->wb_bytes   = count;
 	req->wb_context = get_nfs_open_context(ctx);
+	req->wb_lock_context = nfs_get_lock_context(ctx);
 	kref_init(&req->wb_kref);
 	return req;
 }
@@ -141,11 +142,16 @@ void nfs_clear_request(struct nfs_page *req)
 {
 	struct page *page = req->wb_page;
 	struct nfs_open_context *ctx = req->wb_context;
+	struct nfs_lock_context *l_ctx = req->wb_lock_context;
 
 	if (page != NULL) {
 		page_cache_release(page);
 		req->wb_page = NULL;
 	}
+	if (l_ctx != NULL) {
+		nfs_put_lock_context(l_ctx);
+		req->wb_lock_context = NULL;
+	}
 	if (ctx != NULL) {
 		put_nfs_open_context(ctx);
 		req->wb_context = NULL;
@@ -235,7 +241,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
 {
 	if (req->wb_context->cred != prev->wb_context->cred)
 		return 0;
-	if (req->wb_context->lockowner != prev->wb_context->lockowner)
+	if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
 		return 0;
 	if (req->wb_context->state != prev->wb_context->state)
 		return 0;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5a33a92e8168..87adc2744246 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -190,6 +190,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->args.pages  = data->pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
+	data->args.lock_context = req->wb_lock_context;
 
 	data->res.fattr   = &data->fattr;
 	data->res.count   = count;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 03df22822c4c..5eccea127cac 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -689,7 +689,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 		req = nfs_page_find_request(page);
 		if (req == NULL)
 			return 0;
-		do_flush = req->wb_page != page || req->wb_context != ctx;
+		do_flush = req->wb_page != page || req->wb_context != ctx ||
+			req->wb_lock_context->lockowner != current->files ||
+			req->wb_lock_context->pid != current->tgid;
 		nfs_release_request(req);
 		if (!do_flush)
 			return 0;
@@ -813,6 +815,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->args.pages  = data->pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
+	data->args.lock_context = req->wb_lock_context;
 	data->args.stable  = NFS_UNSTABLE;
 	if (how & FLUSH_STABLE) {
 		data->args.stable = NFS_DATA_SYNC;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 77c2ae53431c..a9d802615082 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -72,13 +72,20 @@ struct nfs_access_entry {
 	int			mask;
 };
 
+struct nfs_lock_context {
+	atomic_t count;
+	struct list_head list;
+	struct nfs_open_context *open_context;
+	fl_owner_t lockowner;
+	pid_t pid;
+};
+
 struct nfs4_state;
 struct nfs_open_context {
-	atomic_t count;
+	struct nfs_lock_context lock_context;
 	struct path path;
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
-	fl_owner_t lockowner;
 	fmode_t mode;
 
 	unsigned long flags;
@@ -353,6 +360,8 @@ extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode);
+extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
+extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
 extern u64 nfs_compat_user_ino64(u64 fileid);
 extern void nfs_fattr_init(struct nfs_fattr *fattr);
 
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 3c60685d972b..f8b60e7f4c44 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -39,6 +39,7 @@ struct nfs_page {
 	struct list_head	wb_list;	/* Defines state of page: */
 	struct page		*wb_page;	/* page to read in/write out */
 	struct nfs_open_context	*wb_context;	/* File state context info */
+	struct nfs_lock_context	*wb_lock_context;	/* lock context info */
 	atomic_t		wb_complete;	/* i/os we're waiting for */
 	pgoff_t			wb_index;	/* Offset >> PAGE_CACHE_SHIFT */
 	unsigned int		wb_offset,	/* Offset & ~PAGE_CACHE_MASK */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index a319cb926abf..87202c7026e3 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -334,6 +334,7 @@ struct nfs4_delegreturnres {
 struct nfs_readargs {
 	struct nfs_fh *		fh;
 	struct nfs_open_context *context;
+	struct nfs_lock_context *lock_context;
 	__u64			offset;
 	__u32			count;
 	unsigned int		pgbase;
@@ -354,6 +355,7 @@ struct nfs_readres {
 struct nfs_writeargs {
 	struct nfs_fh *		fh;
 	struct nfs_open_context *context;
+	struct nfs_lock_context *lock_context;
 	__u64			offset;
 	__u32			count;
 	enum nfs3_stable_how	stable;
-- 
cgit v1.2.3-59-g8ed1b


From d3c7b7ccc199ee564177ee914c04771d6bc00295 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 1 Jul 2010 12:49:01 -0400
Subject: NFSv4: Add support for the RELEASE_LOCKOWNER operation

This is needed by NFSv4.0 servers in order to keep the number of locking
stateids at a manageable level.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h        |  1 +
 fs/nfs/nfs4proc.c       | 28 +++++++++++++++++++++++++
 fs/nfs/nfs4state.c      |  2 ++
 fs/nfs/nfs4xdr.c        | 55 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs4.h    |  1 +
 include/linux/nfs_xdr.h |  4 ++++
 6 files changed, 91 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index cee871471e8d..deaf37f5a7a9 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -236,6 +236,7 @@ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nam
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page);
+extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
 
 #if defined(CONFIG_NFS_V4_1)
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index de9ff1505a24..5d3e8a2db99f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4414,6 +4414,34 @@ out:
 	return err;
 }
 
+static void nfs4_release_lockowner_release(void *calldata)
+{
+	kfree(calldata);
+}
+
+const struct rpc_call_ops nfs4_release_lockowner_ops = {
+	.rpc_release = nfs4_release_lockowner_release,
+};
+
+void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
+{
+	struct nfs_server *server = lsp->ls_state->owner->so_server;
+	struct nfs_release_lockowner_args *args;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
+	};
+
+	if (server->nfs_client->cl_mvops->minor_version != 0)
+		return;
+	args = kmalloc(sizeof(*args), GFP_NOFS);
+	if (!args)
+		return;
+	args->lock_owner.clientid = server->nfs_client->cl_clientid;
+	args->lock_owner.id = lsp->ls_id.id;
+	msg.rpc_argp = args;
+	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
+}
+
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 13e17e32e3e4..13a4f27e7271 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -701,6 +701,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
+	if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
+		nfs4_release_lockowner(lsp);
 	nfs4_free_lock_state(lsp);
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 49df05afdc64..15185c2abd11 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -220,6 +220,11 @@ static int nfs4_stat_to_errno(int);
 				 4)
 #define decode_locku_maxsz	(op_decode_hdr_maxsz + \
 				 decode_stateid_maxsz)
+#define encode_release_lockowner_maxsz \
+				(op_encode_hdr_maxsz + \
+				 encode_lockowner_maxsz)
+#define decode_release_lockowner_maxsz \
+				(op_decode_hdr_maxsz)
 #define encode_access_maxsz	(op_encode_hdr_maxsz + 1)
 #define decode_access_maxsz	(op_decode_hdr_maxsz + 2)
 #define encode_symlink_maxsz	(op_encode_hdr_maxsz + \
@@ -474,6 +479,12 @@ static int nfs4_stat_to_errno(int);
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_locku_maxsz)
+#define NFS4_enc_release_lockowner_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_lockowner_maxsz)
+#define NFS4_dec_release_lockowner_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_lockowner_maxsz)
 #define NFS4_enc_access_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -1116,6 +1127,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	hdr->replen += decode_locku_maxsz;
 }
 
+static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
+	encode_lockowner(xdr, lowner);
+	hdr->nops++;
+	hdr->replen += decode_release_lockowner_maxsz;
+}
+
 static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
 {
 	int len = name->len;
@@ -2056,6 +2078,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
 	return 0;
 }
 
+static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.minorversion = 0,
+	};
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, req, &hdr);
+	encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
+	encode_nops(&hdr);
+	return 0;
+}
+
 /*
  * Encode a READLINK request
  */
@@ -3981,6 +4017,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 	return status;
 }
 
+static int decode_release_lockowner(struct xdr_stream *xdr)
+{
+	return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
+}
+
 static int decode_lookup(struct xdr_stream *xdr)
 {
 	return decode_op_hdr(xdr, OP_LOOKUP);
@@ -5267,6 +5308,19 @@ out:
 	return status;
 }
 
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (!status)
+		status = decode_release_lockowner(&xdr);
+	return status;
+}
+
 /*
  * Decode READLINK response
  */
@@ -5874,6 +5928,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(GETACL,		enc_getacl,	dec_getacl),
   PROC(SETACL,		enc_setacl,	dec_setacl),
   PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
+  PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
 #if defined(CONFIG_NFS_V4_1)
   PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),
   PROC(CREATE_SESSION,	enc_create_session,	dec_create_session),
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 9b8299af3741..07e40c625972 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -523,6 +523,7 @@ enum {
 	NFSPROC4_CLNT_GETACL,
 	NFSPROC4_CLNT_SETACL,
 	NFSPROC4_CLNT_FS_LOCATIONS,
+	NFSPROC4_CLNT_RELEASE_LOCKOWNER,
 
 	/* nfs41 */
 	NFSPROC4_CLNT_EXCHANGE_ID,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 87202c7026e3..fc461926c412 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -315,6 +315,10 @@ struct nfs_lockt_res {
 	struct nfs4_sequence_res	seq_res;
 };
 
+struct nfs_release_lockowner_args {
+	struct nfs_lowner	lock_owner;
+};
+
 struct nfs4_delegreturnargs {
 	const struct nfs_fh *fhandle;
 	const nfs4_stateid *stateid;
-- 
cgit v1.2.3-59-g8ed1b


From 60347c194acec7ff1b4291ac8e62a5345244c2ee Mon Sep 17 00:00:00 2001
From: Samuli Konttila <samuli.konttila@aavamobile.com>
Date: Fri, 30 Jul 2010 09:02:43 -0700
Subject: Input: cy8ctmg110 - capacitive touchscreen support

Add support for the cy8ctmg110 capacitive touchscreen used on some
embedded devices.

(Some clean up by Alan Cox)

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/Kconfig         |  14 ++
 drivers/input/touchscreen/Makefile        |   3 +-
 drivers/input/touchscreen/cy8ctmg110_ts.c | 363 ++++++++++++++++++++++++++++++
 include/linux/input/cy8ctmg110_pdata.h    |  10 +
 4 files changed, 389 insertions(+), 1 deletion(-)
 create mode 100644 drivers/input/touchscreen/cy8ctmg110_ts.c
 create mode 100644 include/linux/input/cy8ctmg110_pdata.h

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index 7bfcfdff6cf8..61f35184f76c 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -98,6 +98,20 @@ config TOUCHSCREEN_BITSY
 	  To compile this driver as a module, choose M here: the
 	  module will be called h3600_ts_input.
 
+config TOUCHSCREEN_CY8CTMG110
+	tristate "cy8ctmg110 touchscreen"
+	depends on I2C
+	depends on GPIOLIB
+
+	help
+	  Say Y here if you have a cy8ctmg110 capacitive touchscreen on
+	  an AAVA device.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called cy8ctmg110_ts.
+
 config TOUCHSCREEN_DA9034
 	tristate "Touchscreen support for Dialog Semiconductor DA9034"
 	depends on PMIC_DA903X
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index 779de0d9d413..bd6f30b4ff70 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -14,6 +14,8 @@ obj-$(CONFIG_TOUCHSCREEN_AD7879_SPI)	+= ad7879-spi.o
 obj-$(CONFIG_TOUCHSCREEN_ADS7846)	+= ads7846.o
 obj-$(CONFIG_TOUCHSCREEN_ATMEL_TSADCC)	+= atmel_tsadcc.o
 obj-$(CONFIG_TOUCHSCREEN_BITSY)		+= h3600_ts_input.o
+obj-$(CONFIG_TOUCHSCREEN_CY8CTMG110)	+= cy8ctmg110_ts.o
+obj-$(CONFIG_TOUCHSCREEN_DA9034)	+= da9034-ts.o
 obj-$(CONFIG_TOUCHSCREEN_DYNAPRO)	+= dynapro.o
 obj-$(CONFIG_TOUCHSCREEN_HAMPSHIRE)	+= hampshire.o
 obj-$(CONFIG_TOUCHSCREEN_GUNZE)		+= gunze.o
@@ -41,7 +43,6 @@ obj-$(CONFIG_TOUCHSCREEN_TSC2007)	+= tsc2007.o
 obj-$(CONFIG_TOUCHSCREEN_UCB1400)	+= ucb1400_ts.o
 obj-$(CONFIG_TOUCHSCREEN_WACOM_W8001)	+= wacom_w8001.o
 obj-$(CONFIG_TOUCHSCREEN_WM97XX)	+= wm97xx-ts.o
-obj-$(CONFIG_TOUCHSCREEN_DA9034)	+= da9034-ts.o
 wm97xx-ts-$(CONFIG_TOUCHSCREEN_WM9705)	+= wm9705.o
 wm97xx-ts-$(CONFIG_TOUCHSCREEN_WM9712)	+= wm9712.o
 wm97xx-ts-$(CONFIG_TOUCHSCREEN_WM9713)	+= wm9713.o
diff --git a/drivers/input/touchscreen/cy8ctmg110_ts.c b/drivers/input/touchscreen/cy8ctmg110_ts.c
new file mode 100644
index 000000000000..4eb7df0b7f87
--- /dev/null
+++ b/drivers/input/touchscreen/cy8ctmg110_ts.c
@@ -0,0 +1,363 @@
+/*
+ * Driver for cypress touch screen controller
+ *
+ * Copyright (c) 2009 Aava Mobile
+ *
+ * Some cleanups by Alan Cox <alan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/input.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <linux/input/cy8ctmg110_pdata.h>
+
+#define CY8CTMG110_DRIVER_NAME      "cy8ctmg110"
+
+/* Touch coordinates */
+#define CY8CTMG110_X_MIN		0
+#define CY8CTMG110_Y_MIN		0
+#define CY8CTMG110_X_MAX		759
+#define CY8CTMG110_Y_MAX		465
+
+
+/* cy8ctmg110 register definitions */
+#define CY8CTMG110_TOUCH_WAKEUP_TIME	0
+#define CY8CTMG110_TOUCH_SLEEP_TIME	2
+#define CY8CTMG110_TOUCH_X1		3
+#define CY8CTMG110_TOUCH_Y1		5
+#define CY8CTMG110_TOUCH_X2		7
+#define CY8CTMG110_TOUCH_Y2		9
+#define CY8CTMG110_FINGERS		11
+#define CY8CTMG110_GESTURE		12
+#define CY8CTMG110_REG_MAX		13
+
+
+/*
+ * The touch driver structure.
+ */
+struct cy8ctmg110 {
+	struct input_dev *input;
+	char phys[32];
+	struct i2c_client *client;
+	int reset_pin;
+	int irq_pin;
+};
+
+/*
+ * cy8ctmg110_power is the routine that is called when touch hardware
+ * will powered off or on.
+ */
+static void cy8ctmg110_power(struct cy8ctmg110 *ts, bool poweron)
+{
+	if (ts->reset_pin)
+		gpio_direction_output(ts->reset_pin, 1 - poweron);
+}
+
+static int cy8ctmg110_write_regs(struct cy8ctmg110 *tsc, unsigned char reg,
+		unsigned char len, unsigned char *value)
+{
+	struct i2c_client *client = tsc->client;
+	unsigned int ret;
+	unsigned char i2c_data[6];
+
+	BUG_ON(len > 5);
+
+	i2c_data[0] = reg;
+	memcpy(i2c_data + 1, value, len);
+
+	ret = i2c_master_send(client, i2c_data, len + 1);
+	if (ret != 1) {
+		dev_err(&client->dev, "i2c write data cmd failed\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int cy8ctmg110_read_regs(struct cy8ctmg110 *tsc,
+		unsigned char *data, unsigned char len, unsigned char cmd)
+{
+	struct i2c_client *client = tsc->client;
+	unsigned int ret;
+	struct i2c_msg msg[2] = {
+		/* first write slave position to i2c devices */
+		{ client->addr, 0, 1, &cmd },
+		/* Second read data from position */
+		{ client->addr, I2C_M_RD, len, data }
+	};
+
+	ret = i2c_transfer(client->adapter, msg, 2);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int cy8ctmg110_touch_pos(struct cy8ctmg110 *tsc)
+{
+	struct input_dev *input = tsc->input;
+	unsigned char reg_p[CY8CTMG110_REG_MAX];
+	int x, y;
+
+	memset(reg_p, 0, CY8CTMG110_REG_MAX);
+
+	/* Reading coordinates */
+	if (cy8ctmg110_read_regs(tsc, reg_p, 9, CY8CTMG110_TOUCH_X1) != 0)
+		return -EIO;
+
+	y = reg_p[2] << 8 | reg_p[3];
+	x = reg_p[0] << 8 | reg_p[1];
+
+	/* Number of touch */
+	if (reg_p[8] == 0) {
+		input_report_key(input, BTN_TOUCH, 0);
+	} else  {
+		input_report_key(input, BTN_TOUCH, 1);
+		input_report_abs(input, ABS_X, x);
+		input_report_abs(input, ABS_Y, y);
+	}
+
+	input_sync(input);
+
+	return 0;
+}
+
+static int cy8ctmg110_set_sleepmode(struct cy8ctmg110 *ts, bool sleep)
+{
+	unsigned char reg_p[3];
+
+	if (sleep) {
+		reg_p[0] = 0x00;
+		reg_p[1] = 0xff;
+		reg_p[2] = 5;
+	} else {
+		reg_p[0] = 0x10;
+		reg_p[1] = 0xff;
+		reg_p[2] = 0;
+	}
+
+	return cy8ctmg110_write_regs(ts, CY8CTMG110_TOUCH_WAKEUP_TIME, 3, reg_p);
+}
+
+static irqreturn_t cy8ctmg110_irq_thread(int irq, void *dev_id)
+{
+	struct cy8ctmg110 *tsc = dev_id;
+
+	cy8ctmg110_touch_pos(tsc);
+
+	return IRQ_HANDLED;
+}
+
+static int __devinit cy8ctmg110_probe(struct i2c_client *client,
+					const struct i2c_device_id *id)
+{
+	const struct cy8ctmg110_pdata *pdata = client->dev.platform_data;
+	struct cy8ctmg110 *ts;
+	struct input_dev *input_dev;
+	int err;
+
+	/* No pdata no way forward */
+	if (pdata == NULL) {
+		dev_err(&client->dev, "no pdata\n");
+		return -ENODEV;
+	}
+
+	if (!i2c_check_functionality(client->adapter,
+					I2C_FUNC_SMBUS_READ_WORD_DATA))
+		return -EIO;
+
+	ts = kzalloc(sizeof(struct cy8ctmg110), GFP_KERNEL);
+	input_dev = input_allocate_device();
+	if (!ts || !input_dev) {
+		err = -ENOMEM;
+		goto err_free_mem;
+	}
+
+	ts->client = client;
+	ts->input = input_dev;
+
+	snprintf(ts->phys, sizeof(ts->phys),
+		 "%s/input0", dev_name(&client->dev));
+
+	input_dev->name = CY8CTMG110_DRIVER_NAME " Touchscreen";
+	input_dev->phys = ts->phys;
+	input_dev->id.bustype = BUS_I2C;
+	input_dev->dev.parent = &client->dev;
+
+	input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS);
+	input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH);
+
+	input_set_abs_params(input_dev, ABS_X,
+			CY8CTMG110_X_MIN, CY8CTMG110_X_MAX, 0, 0);
+	input_set_abs_params(input_dev, ABS_Y,
+			CY8CTMG110_Y_MIN, CY8CTMG110_Y_MAX, 0, 0);
+
+	if (ts->reset_pin) {
+		err = gpio_request(ts->reset_pin, NULL);
+		if (err) {
+			dev_err(&client->dev,
+				"Unable to request GPIO pin %d.\n",
+				ts->reset_pin);
+			goto err_free_mem;
+		}
+	}
+
+	cy8ctmg110_power(ts, true);
+	cy8ctmg110_set_sleepmode(ts, false);
+
+	err = gpio_request(ts->irq_pin, "touch_irq_key");
+	if (err < 0) {
+		dev_err(&client->dev,
+			"Failed to request GPIO %d, error %d\n",
+			ts->irq_pin, err);
+		goto err_shutoff_device;
+	}
+
+	err = gpio_direction_input(ts->irq_pin);
+	if (err < 0) {
+		dev_err(&client->dev,
+			"Failed to configure input direction for GPIO %d, error %d\n",
+			ts->irq_pin, err);
+		goto err_free_irq_gpio;
+	}
+
+	client->irq = gpio_to_irq(ts->irq_pin);
+	if (client->irq < 0) {
+		err = client->irq;
+		dev_err(&client->dev,
+			"Unable to get irq number for GPIO %d, error %d\n",
+			ts->irq_pin, err);
+		goto err_free_irq_gpio;
+	}
+
+	err = request_threaded_irq(client->irq, NULL, cy8ctmg110_irq_thread,
+				   IRQF_TRIGGER_RISING, "touch_reset_key", ts);
+	if (err < 0) {
+		dev_err(&client->dev,
+			"irq %d busy? error %d\n", client->irq, err);
+		goto err_free_irq_gpio;
+	}
+
+	err = input_register_device(input_dev);
+	if (err)
+		goto err_free_irq;
+
+	i2c_set_clientdata(client, ts);
+	device_init_wakeup(&client->dev, 1);
+	return 0;
+
+err_free_irq:
+	free_irq(client->irq, ts);
+err_free_irq_gpio:
+	gpio_free(ts->irq_pin);
+err_shutoff_device:
+	cy8ctmg110_set_sleepmode(ts, true);
+	cy8ctmg110_power(ts, false);
+	if (ts->reset_pin)
+		gpio_free(ts->reset_pin);
+err_free_mem:
+	input_free_device(input_dev);
+	kfree(ts);
+	return err;
+}
+
+#ifdef CONFIG_PM
+static int cy8ctmg110_suspend(struct i2c_client *client, pm_message_t mesg)
+{
+	struct cy8ctmg110 *ts = i2c_get_clientdata(client);
+
+	if (device_may_wakeup(&client->dev))
+		enable_irq_wake(client->irq);
+	else {
+		cy8ctmg110_set_sleepmode(ts, true);
+		cy8ctmg110_power(ts, false);
+	}
+	return 0;
+}
+
+static int cy8ctmg110_resume(struct i2c_client *client)
+{
+	struct cy8ctmg110 *ts = i2c_get_clientdata(client);
+
+	if (device_may_wakeup(&client->dev))
+		disable_irq_wake(client->irq);
+	else {
+		cy8ctmg110_power(ts, true);
+		cy8ctmg110_set_sleepmode(ts, false);
+	}
+	return 0;
+}
+#endif
+
+static int __devexit cy8ctmg110_remove(struct i2c_client *client)
+{
+	struct cy8ctmg110 *ts = i2c_get_clientdata(client);
+
+	cy8ctmg110_set_sleepmode(ts, true);
+	cy8ctmg110_power(ts, false);
+
+	free_irq(client->irq, ts);
+	input_unregister_device(ts->input);
+	gpio_free(ts->irq_pin);
+	if (ts->reset_pin)
+		gpio_free(ts->reset_pin);
+	kfree(ts);
+
+	return 0;
+}
+
+static struct i2c_device_id cy8ctmg110_idtable[] = {
+	{ CY8CTMG110_DRIVER_NAME, 1 },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(i2c, cy8ctmg110_idtable);
+
+static struct i2c_driver cy8ctmg110_driver = {
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= CY8CTMG110_DRIVER_NAME,
+	},
+	.id_table	= cy8ctmg110_idtable,
+	.probe		= cy8ctmg110_probe,
+	.remove		= __devexit_p(cy8ctmg110_remove),
+#ifdef CONFIG_PM
+	.suspend	= cy8ctmg110_suspend,
+	.resume		= cy8ctmg110_resume,
+#endif
+};
+
+static int __init cy8ctmg110_init(void)
+{
+	return i2c_add_driver(&cy8ctmg110_driver);
+}
+
+static void __exit cy8ctmg110_exit(void)
+{
+	i2c_del_driver(&cy8ctmg110_driver);
+}
+
+module_init(cy8ctmg110_init);
+module_exit(cy8ctmg110_exit);
+
+MODULE_AUTHOR("Samuli Konttila <samuli.konttila@aavamobile.com>");
+MODULE_DESCRIPTION("cy8ctmg110 TouchScreen Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/input/cy8ctmg110_pdata.h b/include/linux/input/cy8ctmg110_pdata.h
new file mode 100644
index 000000000000..09522cb59910
--- /dev/null
+++ b/include/linux/input/cy8ctmg110_pdata.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_CY8CTMG110_PDATA_H
+#define _LINUX_CY8CTMG110_PDATA_H
+
+struct cy8ctmg110_pdata
+{
+	int reset_pin;		/* Reset pin is wired to this GPIO (optional) */
+	int irq_pin;		/* IRQ pin is wired to this GPIO */
+};
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From e6cc11707661770ca2bd4db4b0256d28f48e7541 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 27 Jul 2010 07:14:28 +0200
Subject: padata: Rename padata_alloc functions

We rename padata_alloc to padata_alloc_possible because this
function allocates a padata_instance and uses the cpu_possible
mask for parallel and serial workers. Also we rename __padata_alloc
to padata_alloc to avoid to export underlined functions. Underlined
functions are considered to be private to padata. Users are updated
accordingly.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcrypt.c        |  2 +-
 include/linux/padata.h |  9 +++++----
 kernel/padata.c        | 24 ++++++++++++------------
 3 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 794c172b99f7..55460839624e 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -457,7 +457,7 @@ static int __pcrypt_init_instance(struct pcrypt_instance *pcrypt,
 	if (!pcrypt->wq)
 		goto err;
 
-	pcrypt->pinst = padata_alloc(pcrypt->wq);
+	pcrypt->pinst = padata_alloc_possible(pcrypt->wq);
 	if (!pcrypt->pinst)
 		goto err_destroy_workqueue;
 
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 293ad46ffced..71dfc9d1f856 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -165,10 +165,11 @@ struct padata_instance {
 #define	PADATA_INVALID	4
 };
 
-extern struct padata_instance *padata_alloc(struct workqueue_struct *wq);
-extern struct padata_instance *__padata_alloc(struct workqueue_struct *wq,
-					      const struct cpumask *pcpumask,
-					      const struct cpumask *cbcpumask);
+extern struct padata_instance *padata_alloc_possible(
+					struct workqueue_struct *wq);
+extern struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+					    const struct cpumask *pcpumask,
+					    const struct cpumask *cbcpumask);
 extern void padata_free(struct padata_instance *pinst);
 extern int padata_do_parallel(struct padata_instance *pinst,
 			      struct padata_priv *padata, int cb_cpu);
diff --git a/kernel/padata.c b/kernel/padata.c
index 7f895e2b4efb..12860bce6b78 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1060,29 +1060,29 @@ static struct kobj_type padata_attr_type = {
 };
 
 /**
- * padata_alloc - Allocate and initialize padata instance.
- *                Use default cpumask(cpu_possible_mask)
- *                for serial and parallel workes.
+ * padata_alloc_possible - Allocate and initialize padata instance.
+ *                         Use the cpu_possible_mask for serial and
+ *                         parallel workers.
  *
  * @wq: workqueue to use for the allocated padata instance
  */
-struct padata_instance *padata_alloc(struct workqueue_struct *wq)
+struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
 {
-	return __padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+	return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
 }
-EXPORT_SYMBOL(padata_alloc);
+EXPORT_SYMBOL(padata_alloc_possible);
 
 /**
- * __padata_alloc - allocate and initialize a padata instance
- *                  and specify cpumasks for serial and parallel workers.
+ * padata_alloc - allocate and initialize a padata instance and specify
+ *                cpumasks for serial and parallel workers.
  *
  * @wq: workqueue to use for the allocated padata instance
  * @pcpumask: cpumask that will be used for padata parallelization
  * @cbcpumask: cpumask that will be used for padata serialization
  */
-struct padata_instance *__padata_alloc(struct workqueue_struct *wq,
-				       const struct cpumask *pcpumask,
-				       const struct cpumask *cbcpumask)
+struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+				     const struct cpumask *pcpumask,
+				     const struct cpumask *cbcpumask)
 {
 	struct padata_instance *pinst;
 	struct parallel_data *pd = NULL;
@@ -1138,7 +1138,7 @@ err_free_inst:
 err:
 	return NULL;
 }
-EXPORT_SYMBOL(__padata_alloc);
+EXPORT_SYMBOL(padata_alloc);
 
 /**
  * padata_free - free a padata instance
-- 
cgit v1.2.3-59-g8ed1b


From 65ff577e6b6e482ee9de3569e058edebdc02f069 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 27 Jul 2010 07:15:06 +0200
Subject: padata: Rearrange set_cpumask functions

padata_set_cpumask needs to be protected by a lock. We make
__padata_set_cpumasks unlocked and static. So this function
can be used by the exported and locked padata_set_cpumask and
padata_set_cpumasks functions.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |   6 +--
 kernel/padata.c        | 117 ++++++++++++++++++++++++++++---------------------
 2 files changed, 70 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 71dfc9d1f856..bb0fc5dd0bbb 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -178,9 +178,9 @@ extern int padata_get_cpumask(struct padata_instance *pinst,
 			      int cpumask_type, struct cpumask *out_mask);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 			      cpumask_var_t cpumask);
-extern int __padata_set_cpumasks(struct padata_instance *pinst,
-				 cpumask_var_t pcpumask,
-				 cpumask_var_t cbcpumask);
+extern int padata_set_cpumasks(struct padata_instance *pinst,
+			       cpumask_var_t pcpumask,
+			       cpumask_var_t cbcpumask);
 extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
 extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
 extern int padata_start(struct padata_instance *pinst);
diff --git a/kernel/padata.c b/kernel/padata.c
index 12860bce6b78..4987203770bc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -623,6 +623,66 @@ int padata_get_cpumask(struct padata_instance *pinst,
 }
 EXPORT_SYMBOL(padata_get_cpumask);
 
+static int __padata_set_cpumasks(struct padata_instance *pinst,
+				 cpumask_var_t pcpumask,
+				 cpumask_var_t cbcpumask)
+{
+	int valid;
+	struct parallel_data *pd;
+
+	valid = padata_validate_cpumask(pinst, pcpumask);
+	if (!valid) {
+		__padata_stop(pinst);
+		goto out_replace;
+	}
+
+	valid = padata_validate_cpumask(pinst, cbcpumask);
+	if (!valid)
+		__padata_stop(pinst);
+
+out_replace:
+	pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
+	if (!pd)
+		return -ENOMEM;
+
+	cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+	cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+
+	padata_replace(pinst, pd);
+
+	if (valid)
+		__padata_start(pinst);
+
+	return 0;
+}
+
+/**
+ * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
+ *                       one is used by parallel workers and the second one
+ *                       by the wokers doing serialization.
+ *
+ * @pinst: padata instance
+ * @pcpumask: the cpumask to use for parallel workers
+ * @cbcpumask: the cpumsak to use for serial workers
+ */
+int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
+			cpumask_var_t cbcpumask)
+{
+	int err;
+
+	mutex_lock(&pinst->lock);
+	get_online_cpus();
+
+	err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
+
+	put_online_cpus();
+	mutex_unlock(&pinst->lock);
+
+	return err;
+
+}
+EXPORT_SYMBOL(padata_set_cpumasks);
+
 /**
  * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
  *                     equivalent to @cpumask.
@@ -636,6 +696,10 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 		       cpumask_var_t cpumask)
 {
 	struct cpumask *serial_mask, *parallel_mask;
+	int err = -EINVAL;
+
+	mutex_lock(&pinst->lock);
+	get_online_cpus();
 
 	switch (cpumask_type) {
 	case PADATA_CPU_PARALLEL:
@@ -647,65 +711,18 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 		serial_mask = cpumask;
 		break;
 	default:
-		return -EINVAL;
+		 goto out;
 	}
 
-	return __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
-}
-EXPORT_SYMBOL(padata_set_cpumask);
-
-/**
- * __padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- *                         one is used by parallel workers and the second one
- *                         by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int __padata_set_cpumasks(struct padata_instance *pinst,
-			  cpumask_var_t pcpumask, cpumask_var_t cbcpumask)
-{
-	int valid;
-	int err = 0;
-	struct parallel_data *pd = NULL;
-
-	mutex_lock(&pinst->lock);
-	get_online_cpus();
-
-	valid = padata_validate_cpumask(pinst, pcpumask);
-	if (!valid) {
-		__padata_stop(pinst);
-		goto out_replace;
-	}
-
-	valid = padata_validate_cpumask(pinst, cbcpumask);
-	if (!valid)
-		__padata_stop(pinst);
-
-out_replace:
-	pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
-	if (!pd) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	cpumask_copy(pinst->cpumask.pcpu, pcpumask);
-	cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
-
-	padata_replace(pinst, pd);
-
-	if (valid)
-		__padata_start(pinst);
+	err =  __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
 
 out:
 	put_online_cpus();
 	mutex_unlock(&pinst->lock);
 
 	return err;
-
 }
-EXPORT_SYMBOL(__padata_set_cpumasks);
+EXPORT_SYMBOL(padata_set_cpumask);
 
 static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
 {
-- 
cgit v1.2.3-59-g8ed1b


From c635696c7c0fbc720698dbec34bb83e53df6a967 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 27 Jul 2010 07:15:50 +0200
Subject: padata: Pass the padata cpumasks to the cpumask_change_notifier chain

We pass a pointer to the new padata cpumasks to the cpumask_change_notifier
chain. So users can access the cpumasks without the need of an extra
padata_get_cpumask function.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h | 40 +++++++++++++++++++++-------------------
 kernel/padata.c        |  3 ++-
 2 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index bb0fc5dd0bbb..43db792f44dd 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -98,6 +98,16 @@ struct padata_parallel_queue {
        int                   cpu_index;
 };
 
+/**
+ * struct padata_cpumask - The cpumasks for the parallel/serial workers
+ *
+ * @pcpu: cpumask for the parallel workers.
+ * @cbcpu: cpumask for the serial (callback) workers.
+ */
+struct padata_cpumask {
+	cpumask_var_t	pcpu;
+	cpumask_var_t	cbcpu;
+};
 
 /**
  * struct parallel_data - Internal control structure, covers everything
@@ -110,8 +120,7 @@ struct padata_parallel_queue {
  * @reorder_objects: Number of objects waiting in the reorder queues.
  * @refcnt: Number of objects holding a reference on this parallel_data.
  * @max_seq_nr:  Maximal used sequence number.
- * @cpumask: Contains two cpumasks: pcpu and cbcpu for
- *           parallel and serial workers respectively.
+ * @cpumask: The cpumasks in use for parallel and serial workers.
  * @lock: Reorder lock.
  * @processed: Number of already processed objects.
  * @timer: Reorder timer.
@@ -120,17 +129,14 @@ struct parallel_data {
 	struct padata_instance		*pinst;
 	struct padata_parallel_queue	*pqueue;
 	struct padata_serial_queue	*squeue;
-	atomic_t			 seq_nr;
-	atomic_t			 reorder_objects;
-	atomic_t			 refcnt;
-	unsigned int			 max_seq_nr;
-	struct {
-		cpumask_var_t		 pcpu;
-		cpumask_var_t		 cbcpu;
-	} cpumask;
-	spinlock_t                       lock ____cacheline_aligned;
-	unsigned int			 processed;
-	struct timer_list		 timer;
+	atomic_t			seq_nr;
+	atomic_t			reorder_objects;
+	atomic_t			refcnt;
+	unsigned int			max_seq_nr;
+	struct padata_cpumask		cpumask;
+	spinlock_t                      lock ____cacheline_aligned;
+	unsigned int			processed;
+	struct timer_list		timer;
 };
 
 /**
@@ -139,8 +145,7 @@ struct parallel_data {
  * @cpu_notifier: cpu hotplug notifier.
  * @wq: The workqueue in use.
  * @pd: The internal control structure.
- * @cpumask: User supplied cpumask. Contains two cpumasks: pcpu and
- *           cbcpu for parallel and serial works respectivly.
+ * @cpumask: User supplied cpumasks for parallel and serial works.
  * @cpumask_change_notifier: Notifiers chain for user-defined notify
  *            callbacks that will be called when either @pcpu or @cbcpu
  *            or both cpumasks change.
@@ -152,10 +157,7 @@ struct padata_instance {
 	struct notifier_block		 cpu_notifier;
 	struct workqueue_struct		*wq;
 	struct parallel_data		*pd;
-	struct {
-		cpumask_var_t		 pcpu;
-		cpumask_var_t		 cbcpu;
-	} cpumask;
+	struct padata_cpumask		cpumask;
 	struct blocking_notifier_head	 cpumask_change_notifier;
 	struct kobject                   kobj;
 	struct mutex			 lock;
diff --git a/kernel/padata.c b/kernel/padata.c
index 4987203770bc..1c8c1d1d301d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -538,7 +538,8 @@ static void padata_replace(struct padata_instance *pinst,
 
 	if (notification_mask)
 		blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
-					     notification_mask, pinst);
+					     notification_mask,
+					     &pd_new->cpumask);
 
 	pinst->flags &= ~PADATA_RESET;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 0500e9b3f11ce84fc6ee48a3e29909145e58ba48 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Tue, 27 Jul 2010 07:19:27 +0200
Subject: padata: Remove padata_get_cpumask

A function that copies the padata cpumasks to a user buffer
is a bit error prone. The cpumask can change any time so we
can't be sure to have the right cpumask when using this function.
A user who is interested in the padata cpumasks should register
to the padata cpumask notifier chain instead. Users of
padata_get_cpumask are already updated, so we can remove it.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |  2 --
 kernel/padata.c        | 35 -----------------------------------
 2 files changed, 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 43db792f44dd..bdcd1e9eacea 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -176,8 +176,6 @@ extern void padata_free(struct padata_instance *pinst);
 extern int padata_do_parallel(struct padata_instance *pinst,
 			      struct padata_priv *padata, int cb_cpu);
 extern void padata_do_serial(struct padata_priv *padata);
-extern int padata_get_cpumask(struct padata_instance *pinst,
-			      int cpumask_type, struct cpumask *out_mask);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 			      cpumask_var_t cpumask);
 extern int padata_set_cpumasks(struct padata_instance *pinst,
diff --git a/kernel/padata.c b/kernel/padata.c
index 1c8c1d1d301d..fd4679266ede 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -589,41 +589,6 @@ static bool padata_validate_cpumask(struct padata_instance *pinst,
 	return true;
 }
 
-/**
- * padata_get_cpumask: Fetch serial or parallel cpumask from the
- *                     given padata instance and copy it to @out_mask
- *
- * @pinst: A pointer to padata instance
- * @cpumask_type: Specifies which cpumask will be copied.
- *                Possible values are PADATA_CPU_SERIAL *or* PADATA_CPU_PARALLEL
- *                corresponding to serial and parallel cpumask respectively.
- * @out_mask: A pointer to cpumask structure where selected
- *            cpumask will be copied.
- */
-int padata_get_cpumask(struct padata_instance *pinst,
-		       int cpumask_type, struct cpumask *out_mask)
-{
-	struct parallel_data *pd;
-	int ret = 0;
-
-	rcu_read_lock_bh();
-	pd = rcu_dereference(pinst->pd);
-	switch (cpumask_type) {
-	case PADATA_CPU_SERIAL:
-		cpumask_copy(out_mask, pd->cpumask.cbcpu);
-		break;
-	case PADATA_CPU_PARALLEL:
-		cpumask_copy(out_mask, pd->cpumask.pcpu);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-	rcu_read_unlock_bh();
-	return ret;
-}
-EXPORT_SYMBOL(padata_get_cpumask);
-
 static int __padata_set_cpumasks(struct padata_instance *pinst,
 				 cpumask_var_t pcpumask,
 				 cpumask_var_t cbcpumask)
-- 
cgit v1.2.3-59-g8ed1b


From 7cfe249475fdd82ad3c2767a9b906cc775dab868 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Thu, 15 Jul 2010 10:47:14 +0100
Subject: ARM: AMBA: Add pclk support to AMBA bus infrastructure

Some platforms gate the pclk (APB - the bus - clock) to the peripherals
for power saving, along with the functional clock.  When devices are
accessed without pclk enabled, the kernel will oops.

This gives them two options:

1. Leave all clocks on all the time.
2. Attempt to gate pclk along with the functional clock.

(With some hardware, pclk and the functional clock are gated by a single
bit in a register.)

(1) has the disadvantage that it causes increased power usage, which is
bad news for battery operated devices.  (2) can lead to kernel oops if
registers are accessed without the functional clock being enabled.

So, introduce the apb_pclk signal in such a way existing drivers don't
need to be updated.  Essentially, this means we guarantee that:

1. pclk will be enabled whenever the driver is bound to a device -
   from probe() to remove() time.
2. pclk will also be enabled when reading the primecell IDs from the device.

In order to allow drivers to be incrementally updated to achieve greater
power savings, we provide two additional calls to allow drivers to
manage the pclk - amba_pclk_enable()/amba_pclk_disable().

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/amba/bus.c       | 88 +++++++++++++++++++++++++++++++++++++-----------
 include/linux/amba/bus.h | 11 ++++++
 2 files changed, 80 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index f60b2b6a0931..d31590e7011b 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -122,6 +122,31 @@ static int __init amba_init(void)
 
 postcore_initcall(amba_init);
 
+static int amba_get_enable_pclk(struct amba_device *pcdev)
+{
+	struct clk *pclk = clk_get(&pcdev->dev, "apb_pclk");
+	int ret;
+
+	pcdev->pclk = pclk;
+
+	if (IS_ERR(pclk))
+		return PTR_ERR(pclk);
+
+	ret = clk_enable(pclk);
+	if (ret)
+		clk_put(pclk);
+
+	return ret;
+}
+
+static void amba_put_disable_pclk(struct amba_device *pcdev)
+{
+	struct clk *pclk = pcdev->pclk;
+
+	clk_disable(pclk);
+	clk_put(pclk);
+}
+
 /*
  * These are the device model conversion veneers; they convert the
  * device model structures to our more specific structures.
@@ -130,17 +155,33 @@ static int amba_probe(struct device *dev)
 {
 	struct amba_device *pcdev = to_amba_device(dev);
 	struct amba_driver *pcdrv = to_amba_driver(dev->driver);
-	struct amba_id *id;
+	struct amba_id *id = amba_lookup(pcdrv->id_table, pcdev);
+	int ret;
 
-	id = amba_lookup(pcdrv->id_table, pcdev);
+	do {
+		ret = amba_get_enable_pclk(pcdev);
+		if (ret)
+			break;
+
+		ret = pcdrv->probe(pcdev, id);
+		if (ret == 0)
+			break;
 
-	return pcdrv->probe(pcdev, id);
+		amba_put_disable_pclk(pcdev);
+	} while (0);
+
+	return ret;
 }
 
 static int amba_remove(struct device *dev)
 {
+	struct amba_device *pcdev = to_amba_device(dev);
 	struct amba_driver *drv = to_amba_driver(dev->driver);
-	return drv->remove(to_amba_device(dev));
+	int ret = drv->remove(pcdev);
+
+	amba_put_disable_pclk(pcdev);
+
+	return ret;
 }
 
 static void amba_shutdown(struct device *dev)
@@ -203,7 +244,6 @@ static void amba_device_release(struct device *dev)
  */
 int amba_device_register(struct amba_device *dev, struct resource *parent)
 {
-	u32 pid, cid;
 	u32 size;
 	void __iomem *tmp;
 	int i, ret;
@@ -241,25 +281,35 @@ int amba_device_register(struct amba_device *dev, struct resource *parent)
 		goto err_release;
 	}
 
-	/*
-	 * Read pid and cid based on size of resource
-	 * they are located at end of region
-	 */
-	for (pid = 0, i = 0; i < 4; i++)
-		pid |= (readl(tmp + size - 0x20 + 4 * i) & 255) << (i * 8);
-	for (cid = 0, i = 0; i < 4; i++)
-		cid |= (readl(tmp + size - 0x10 + 4 * i) & 255) << (i * 8);
+	ret = amba_get_enable_pclk(dev);
+	if (ret == 0) {
+		u32 pid, cid;
 
-	iounmap(tmp);
+		/*
+		 * Read pid and cid based on size of resource
+		 * they are located at end of region
+		 */
+		for (pid = 0, i = 0; i < 4; i++)
+			pid |= (readl(tmp + size - 0x20 + 4 * i) & 255) <<
+				(i * 8);
+		for (cid = 0, i = 0; i < 4; i++)
+			cid |= (readl(tmp + size - 0x10 + 4 * i) & 255) <<
+				(i * 8);
 
-	if (cid == 0xb105f00d)
-		dev->periphid = pid;
+		amba_put_disable_pclk(dev);
 
-	if (!dev->periphid) {
-		ret = -ENODEV;
-		goto err_release;
+		if (cid == 0xb105f00d)
+			dev->periphid = pid;
+
+		if (!dev->periphid)
+			ret = -ENODEV;
 	}
 
+	iounmap(tmp);
+
+	if (ret)
+		goto err_release;
+
 	ret = device_add(&dev->dev);
 	if (ret)
 		goto err_release;
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 8b1038607831..b0c174012436 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -14,14 +14,19 @@
 #ifndef ASMARM_AMBA_H
 #define ASMARM_AMBA_H
 
+#include <linux/clk.h>
 #include <linux/device.h>
+#include <linux/err.h>
 #include <linux/resource.h>
 
 #define AMBA_NR_IRQS	2
 
+struct clk;
+
 struct amba_device {
 	struct device		dev;
 	struct resource		res;
+	struct clk		*pclk;
 	u64			dma_mask;
 	unsigned int		periphid;
 	unsigned int		irq[AMBA_NR_IRQS];
@@ -59,6 +64,12 @@ struct amba_device *amba_find_device(const char *, struct device *, unsigned int
 int amba_request_regions(struct amba_device *, const char *);
 void amba_release_regions(struct amba_device *);
 
+#define amba_pclk_enable(d)	\
+	(IS_ERR((d)->pclk) ? 0 : clk_enable((d)->pclk))
+
+#define amba_pclk_disable(d)	\
+	do { if (!IS_ERR((d)->pclk)) clk_disable((d)->pclk); } while (0)
+
 #define amba_config(d)	(((d)->periphid >> 24) & 0xff)
 #define amba_rev(d)	(((d)->periphid >> 20) & 0x0f)
 #define amba_manf(d)	(((d)->periphid >> 12) & 0xff)
-- 
cgit v1.2.3-59-g8ed1b


From bf998156d24bcb127318ad5bf531ac3bdfcd6449 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 31 May 2010 14:28:19 +0800
Subject: KVM: Avoid killing userspace through guest SRAO MCE on unmapped pages

In common cases, guest SRAO MCE will cause corresponding poisoned page
be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay
the MCE to guest OS.

But it is reported that if the poisoned page is accessed in guest
after unmapping and before MCE is relayed to guest OS, userspace will
be killed.

The reason is as follows. Because poisoned page has been un-mapped,
guest access will cause guest exit and kvm_mmu_page_fault will be
called. kvm_mmu_page_fault can not get the poisoned page for fault
address, so kernel and user space MMIO processing is tried in turn. In
user MMIO processing, poisoned page is accessed again, then userspace
is killed by force_sig_info.

To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM
and do not try kernel and user space MMIO processing for poisoned
page.

[xiao: fix warning introduced by avi]

Reported-by: Max Asbock <masbock@linux.vnet.ibm.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 34 ++++++++++++++++++++++++++--------
 arch/x86/kvm/paging_tmpl.h |  7 ++-----
 include/linux/kvm_host.h   |  1 +
 include/linux/mm.h         |  8 ++++++++
 mm/memory-failure.c        | 30 ++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c        | 30 ++++++++++++++++++++++++++++--
 6 files changed, 95 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a5913..b666d8d106a9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -32,6 +32,7 @@
 #include <linux/compiler.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -1960,6 +1961,27 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 	return pt_write;
 }
 
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+	char buf[1];
+	void __user *hva;
+	int r;
+
+	/* Touch the page, so send SIGBUS */
+	hva = (void __user *)gfn_to_hva(kvm, gfn);
+	r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+	kvm_release_pfn_clean(pfn);
+	if (is_hwpoison_pfn(pfn)) {
+		kvm_send_hwpoison_signal(kvm, gfn);
+		return 0;
+	}
+	return 1;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
@@ -1983,10 +2005,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2198,10 +2218,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b549..c7f27779c998 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -431,11 +431,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
 
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		pgprintk("gfn %lx is mmio\n", walker.gfn);
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7cb116afa1cd..a0e019769f5d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -266,6 +266,7 @@ extern pfn_t bad_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
+int is_hwpoison_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2b48041b910..7a9ab7db1975 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1465,6 +1465,14 @@ extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t mce_bad_pages;
 extern int soft_offline_page(struct page *page, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+int is_hwpoison_address(unsigned long addr);
+#else
+static inline int is_hwpoison_address(unsigned long addr)
+{
+	return 0;
+}
+#endif
 
 extern void dump_page(struct page *page);
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 620b0b461593..378b0f61fd3c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -45,6 +45,7 @@
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1296,3 +1297,32 @@ done:
 	/* keep elevated page count for bad page */
 	return ret;
 }
+
+int is_hwpoison_address(unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t pte, *ptep;
+	swp_entry_t entry;
+
+	pgdp = pgd_offset(current->mm, addr);
+	if (!pgd_present(*pgdp))
+		return 0;
+	pudp = pud_offset(pgdp, addr);
+	pud = *pudp;
+	if (!pud_present(pud) || pud_large(pud))
+		return 0;
+	pmdp = pmd_offset(pudp, addr);
+	pmd = *pmdp;
+	if (!pmd_present(pmd) || pmd_large(pmd))
+		return 0;
+	ptep = pte_offset_map(pmdp, addr);
+	pte = *ptep;
+	pte_unmap(ptep);
+	if (!is_swap_pte(pte))
+		return 0;
+	entry = pte_to_swp_entry(pte);
+	return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f032806a212f..187aa8d984a7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -92,6 +92,9 @@ static bool kvm_rebooting;
 
 static bool largepages_enabled = true;
 
+struct page *hwpoison_page;
+pfn_t hwpoison_pfn;
+
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
@@ -810,16 +813,22 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
 int is_error_page(struct page *page)
 {
-	return page == bad_page;
+	return page == bad_page || page == hwpoison_page;
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn;
+	return pfn == bad_pfn || pfn == hwpoison_pfn;
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
+int is_hwpoison_pfn(pfn_t pfn)
+{
+	return pfn == hwpoison_pfn;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
@@ -945,6 +954,11 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 	if (unlikely(npages != 1)) {
 		struct vm_area_struct *vma;
 
+		if (is_hwpoison_address(addr)) {
+			get_page(hwpoison_page);
+			return page_to_pfn(hwpoison_page);
+		}
+
 		down_read(&current->mm->mmap_sem);
 		vma = find_vma(current->mm, addr);
 
@@ -2197,6 +2211,15 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 	bad_pfn = page_to_pfn(bad_page);
 
+	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (hwpoison_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	hwpoison_pfn = page_to_pfn(hwpoison_page);
+
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
@@ -2269,6 +2292,8 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
+	if (hwpoison_page)
+		__free_page(hwpoison_page);
 	__free_page(bad_page);
 out:
 	kvm_arch_exit();
@@ -2290,6 +2315,7 @@ void kvm_exit(void)
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
+	__free_page(hwpoison_page);
 	__free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-- 
cgit v1.2.3-59-g8ed1b


From d94e1dc9af60e3431a586c3edfbe42d8a0d3932b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 3 May 2010 16:54:48 +0300
Subject: KVM: Get rid of KVM_REQ_KICK

KVM_REQ_KICK poisons vcpu->requests by having a bit set during normal
operation.  This causes the fast path check for a clear vcpu->requests
to fail all the time, triggering tons of atomic operations.

Fix by replacing KVM_REQ_KICK with a vcpu->guest_mode atomic.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c       | 17 ++++++++++-------
 include/linux/kvm_host.h |  1 +
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63c87adcec48..fc5611b4007f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4604,13 +4604,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (vcpu->fpu_active)
 		kvm_load_guest_fpu(vcpu);
 
-	local_irq_disable();
+	atomic_set(&vcpu->guest_mode, 1);
+	smp_wmb();
 
-	clear_bit(KVM_REQ_KICK, &vcpu->requests);
-	smp_mb__after_clear_bit();
+	local_irq_disable();
 
-	if (vcpu->requests || need_resched() || signal_pending(current)) {
-		set_bit(KVM_REQ_KICK, &vcpu->requests);
+	if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+	    || need_resched() || signal_pending(current)) {
+		atomic_set(&vcpu->guest_mode, 0);
+		smp_wmb();
 		local_irq_enable();
 		preempt_enable();
 		r = 1;
@@ -4655,7 +4657,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	set_bit(KVM_REQ_KICK, &vcpu->requests);
+	atomic_set(&vcpu->guest_mode, 0);
+	smp_wmb();
 	local_irq_enable();
 
 	++vcpu->stat.exits;
@@ -5580,7 +5583,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 
 	me = get_cpu();
 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+		if (atomic_xchg(&vcpu->guest_mode, 0))
 			smp_send_reschedule(cpu);
 	put_cpu();
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a0e019769f5d..2c62319727ef 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -81,6 +81,7 @@ struct kvm_vcpu {
 	int vcpu_id;
 	struct mutex mutex;
 	int   cpu;
+	atomic_t guest_mode;
 	struct kvm_run *run;
 	unsigned long requests;
 	unsigned long guest_debug;
-- 
cgit v1.2.3-59-g8ed1b


From 22ae782f86b726f9cea752c0f269ff6dcdf2f6e1 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 29 Jul 2010 11:49:01 -0600
Subject: of/address: Clean up function declarations

This patch moves the declaration of of_get_address(), of_get_pci_address(),
and of_pci_address_to_resource() out of arch code and into the common
linux/of_address header file.

This patch also fixes some of the asm/prom.h ordering issues.  It still
includes some header files that it ideally shouldn't be, but at least the
ordering is consistent now so that of_* overrides work.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/microblaze/include/asm/prom.h        | 33 +++++++--------------
 arch/powerpc/include/asm/prom.h           | 49 +++++++------------------------
 arch/powerpc/kernel/legacy_serial.c       |  1 +
 arch/powerpc/kernel/pci-common.c          |  1 +
 arch/powerpc/platforms/52xx/lite5200.c    |  1 +
 arch/powerpc/platforms/amigaone/setup.c   |  3 +-
 arch/powerpc/platforms/iseries/mf.c       |  1 +
 arch/powerpc/platforms/powermac/feature.c |  2 ++
 arch/powerpc/sysdev/bestcomm/sram.c       |  1 +
 arch/powerpc/sysdev/fsl_gtm.c             |  1 +
 drivers/char/bsr.c                        |  1 +
 drivers/net/fsl_pq_mdio.c                 |  1 +
 drivers/net/xilinx_emaclite.c             |  2 +-
 drivers/serial/uartlite.c                 |  1 +
 drivers/spi/mpc512x_psc_spi.c             |  1 +
 drivers/spi/mpc52xx_psc_spi.c             |  1 +
 drivers/spi/xilinx_spi_of.c               |  1 +
 drivers/usb/gadget/fsl_qe_udc.c           |  1 +
 drivers/video/controlfb.c                 |  2 ++
 drivers/video/offb.c                      |  3 +-
 include/linux/of_address.h                | 32 ++++++++++++++++++++
 21 files changed, 76 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index cb9c3dd9a23b..101fa098f62a 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -20,11 +20,6 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/of_fdt.h>
-#include <linux/proc_fs.h>
-#include <linux/platform_device.h>
 #include <asm/irq.h>
 #include <asm/atomic.h>
 
@@ -52,25 +47,9 @@ extern void pci_create_OF_bus_map(void);
  * OF address retreival & translation
  */
 
-/* Extract an address from a device, returns the region size and
- * the address space flags too. The PCI version uses a BAR number
- * instead of an absolute index
- */
-extern const u32 *of_get_address(struct device_node *dev, int index,
-			u64 *size, unsigned int *flags);
-extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no,
-			u64 *size, unsigned int *flags);
-
-extern int of_pci_address_to_resource(struct device_node *dev, int bar,
-				struct resource *r);
-
 #ifdef CONFIG_PCI
 extern unsigned long pci_address_to_pio(phys_addr_t address);
-#else
-static inline unsigned long pci_address_to_pio(phys_addr_t address)
-{
-	return (unsigned long)-1;
-}
+#define pci_address_to_pio pci_address_to_pio
 #endif	/* CONFIG_PCI */
 
 /* Parse the ibm,dma-window property of an OF node into the busno, phys and
@@ -99,8 +78,18 @@ extern const void *of_get_mac_address(struct device_node *np);
  * resolving using the OF tree walking.
  */
 struct pci_dev;
+struct of_irq;
 extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
 
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
+
+/* These includes are put at the bottom because they may contain things
+ * that are overridden by this file.  Ideally they shouldn't be included
+ * by this file, but there are a bunch of .c files that currently depend
+ * on it.  Eventually they will be cleaned up. */
+#include <linux/of_fdt.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+
 #endif /* _ASM_MICROBLAZE_PROM_H */
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 55bccc0a21c3..ae26f2efd089 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -17,11 +17,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/types.h>
-#include <linux/of_fdt.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/proc_fs.h>
-#include <linux/platform_device.h>
 #include <asm/irq.h>
 #include <asm/atomic.h>
 
@@ -49,41 +44,9 @@ extern void pci_create_OF_bus_map(void);
 extern u64 of_translate_dma_address(struct device_node *dev,
 				    const u32 *in_addr);
 
-/* Extract an address from a device, returns the region size and
- * the address space flags too. The PCI version uses a BAR number
- * instead of an absolute index
- */
-extern const u32 *of_get_address(struct device_node *dev, int index,
-			   u64 *size, unsigned int *flags);
-#ifdef CONFIG_PCI
-extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no,
-			       u64 *size, unsigned int *flags);
-#else
-static inline const u32 *of_get_pci_address(struct device_node *dev,
-		int bar_no, u64 *size, unsigned int *flags)
-{
-	return NULL;
-}
-#endif /* CONFIG_PCI */
-
-#ifdef CONFIG_PCI
-extern int of_pci_address_to_resource(struct device_node *dev, int bar,
-				      struct resource *r);
-#else
-static inline int of_pci_address_to_resource(struct device_node *dev, int bar,
-		struct resource *r)
-{
-	return -ENOSYS;
-}
-#endif /* CONFIG_PCI */
-
 #ifdef CONFIG_PCI
 extern unsigned long pci_address_to_pio(phys_addr_t address);
-#else
-static inline unsigned long pci_address_to_pio(phys_addr_t address)
-{
-	return (unsigned long)-1;
-}
+#define pci_address_to_pio pci_address_to_pio
 #endif	/* CONFIG_PCI */
 
 /* Parse the ibm,dma-window property of an OF node into the busno, phys and
@@ -122,9 +85,19 @@ static inline int of_node_to_nid(struct device_node *device) { return 0; }
  * resolving using the OF tree walking.
  */
 struct pci_dev;
+struct of_irq;
 extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
 
 extern void of_instantiate_rtc(void);
 
+/* These includes are put at the bottom because they may contain things
+ * that are overridden by this file.  Ideally they shouldn't be included
+ * by this file, but there are a bunch of .c files that currently depend
+ * on it.  Eventually they will be cleaned up. */
+#include <linux/of_fdt.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_PROM_H */
diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
index 035ada5443ee..c1fd0f9658fd 100644
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -4,6 +4,7 @@
 #include <linux/serial_core.h>
 #include <linux/console.h>
 #include <linux/pci.h>
+#include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <asm/io.h>
 #include <asm/mmu.h>
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 5b38f6ae2b29..9021c4ad4bbd 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -21,6 +21,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/bootmem.h>
+#include <linux/of_address.h>
 #include <linux/mm.h>
 #include <linux/list.h>
 #include <linux/syscalls.h>
diff --git a/arch/powerpc/platforms/52xx/lite5200.c b/arch/powerpc/platforms/52xx/lite5200.c
index 6d584f4e3c9a..de55bc0584b5 100644
--- a/arch/powerpc/platforms/52xx/lite5200.c
+++ b/arch/powerpc/platforms/52xx/lite5200.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/root_dev.h>
 #include <linux/initrd.h>
 #include <asm/time.h>
diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c
index fb4eb0df054c..03aabc0e16ac 100644
--- a/arch/powerpc/platforms/amigaone/setup.c
+++ b/arch/powerpc/platforms/amigaone/setup.c
@@ -13,12 +13,13 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/seq_file.h>
 #include <generated/utsrelease.h>
 
 #include <asm/machdep.h>
 #include <asm/cputable.h>
-#include <asm/prom.h>
 #include <asm/pci-bridge.h>
 #include <asm/i8259.h>
 #include <asm/time.h>
diff --git a/arch/powerpc/platforms/iseries/mf.c b/arch/powerpc/platforms/iseries/mf.c
index d2c1d497846e..33e5fc7334fc 100644
--- a/arch/powerpc/platforms/iseries/mf.c
+++ b/arch/powerpc/platforms/iseries/mf.c
@@ -30,6 +30,7 @@
 #include <linux/init.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
+#include <linux/proc_fs.h>
 #include <linux/dma-mapping.h>
 #include <linux/bcd.h>
 #include <linux/rtc.h>
diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c
index 9e1b9fd75206..75eec031e7a2 100644
--- a/arch/powerpc/platforms/powermac/feature.c
+++ b/arch/powerpc/platforms/powermac/feature.c
@@ -21,6 +21,8 @@
 #include <linux/delay.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/spinlock.h>
 #include <linux/adb.h>
 #include <linux/pmu.h>
diff --git a/arch/powerpc/sysdev/bestcomm/sram.c b/arch/powerpc/sysdev/bestcomm/sram.c
index 5d74ef7a651f..1225012a681a 100644
--- a/arch/powerpc/sysdev/bestcomm/sram.c
+++ b/arch/powerpc/sysdev/bestcomm/sram.c
@@ -11,6 +11,7 @@
  * kind, whether express or implied.
  */
 
+#include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/arch/powerpc/sysdev/fsl_gtm.c b/arch/powerpc/sysdev/fsl_gtm.c
index eca4545dd52e..7dd2885321ad 100644
--- a/arch/powerpc/sysdev/fsl_gtm.c
+++ b/arch/powerpc/sysdev/fsl_gtm.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/list.h>
 #include <linux/io.h>
diff --git a/drivers/char/bsr.c b/drivers/char/bsr.c
index 89d871ef8c2f..91917133ae0a 100644
--- a/drivers/char/bsr.c
+++ b/drivers/char/bsr.c
@@ -23,6 +23,7 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
+#include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/cdev.h>
 #include <linux/list.h>
diff --git a/drivers/net/fsl_pq_mdio.c b/drivers/net/fsl_pq_mdio.c
index b4c41d72c423..f53f850b6418 100644
--- a/drivers/net/fsl_pq_mdio.c
+++ b/drivers/net/fsl_pq_mdio.c
@@ -35,6 +35,7 @@
 #include <linux/mii.h>
 #include <linux/phy.h>
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/of_mdio.h>
 #include <linux/of_platform.h>
 
diff --git a/drivers/net/xilinx_emaclite.c b/drivers/net/xilinx_emaclite.c
index d04c5b262050..b2c2f391b29d 100644
--- a/drivers/net/xilinx_emaclite.c
+++ b/drivers/net/xilinx_emaclite.c
@@ -20,7 +20,7 @@
 #include <linux/skbuff.h>
 #include <linux/io.h>
 #include <linux/slab.h>
-
+#include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #include <linux/of_mdio.h>
diff --git a/drivers/serial/uartlite.c b/drivers/serial/uartlite.c
index 8acccd564378..caf085d3a76a 100644
--- a/drivers/serial/uartlite.c
+++ b/drivers/serial/uartlite.c
@@ -21,6 +21,7 @@
 #include <asm/io.h>
 #if defined(CONFIG_OF) && (defined(CONFIG_PPC32) || defined(CONFIG_MICROBLAZE))
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 
diff --git a/drivers/spi/mpc512x_psc_spi.c b/drivers/spi/mpc512x_psc_spi.c
index 1bb4315f5f84..10baac3f8ea5 100644
--- a/drivers/spi/mpc512x_psc_spi.c
+++ b/drivers/spi/mpc512x_psc_spi.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/of_address.h>
 #include <linux/of_platform.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c
index bd81ff90cfb3..66d170147dcc 100644
--- a/drivers/spi/mpc52xx_psc_spi.c
+++ b/drivers/spi/mpc52xx_psc_spi.c
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/of_address.h>
 #include <linux/of_platform.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
diff --git a/drivers/spi/xilinx_spi_of.c b/drivers/spi/xilinx_spi_of.c
index 87cda0956a83..f53d3f6b9f61 100644
--- a/drivers/spi/xilinx_spi_of.c
+++ b/drivers/spi/xilinx_spi_of.c
@@ -29,6 +29,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 
+#include <linux/of_address.h>
 #include <linux/of_platform.h>
 #include <linux/of_device.h>
 #include <linux/of_spi.h>
diff --git a/drivers/usb/gadget/fsl_qe_udc.c b/drivers/usb/gadget/fsl_qe_udc.c
index 82506ca297d5..9648b75f0283 100644
--- a/drivers/usb/gadget/fsl_qe_udc.c
+++ b/drivers/usb/gadget/fsl_qe_udc.c
@@ -32,6 +32,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/moduleparam.h>
+#include <linux/of_address.h>
 #include <linux/of_platform.h>
 #include <linux/dma-mapping.h>
 #include <linux/usb/ch9.h>
diff --git a/drivers/video/controlfb.c b/drivers/video/controlfb.c
index 49fcbe8f18ac..c225dcce89e7 100644
--- a/drivers/video/controlfb.c
+++ b/drivers/video/controlfb.c
@@ -40,6 +40,8 @@
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/fb.h>
 #include <linux/init.h>
 #include <linux/pci.h>
diff --git a/drivers/video/offb.c b/drivers/video/offb.c
index 46dda7d8aaee..cb163a5397be 100644
--- a/drivers/video/offb.c
+++ b/drivers/video/offb.c
@@ -19,13 +19,14 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/interrupt.h>
 #include <linux/fb.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
 #include <asm/io.h>
-#include <asm/prom.h>
 
 #ifdef CONFIG_PPC64
 #include <asm/pci-bridge.h>
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index cc567df9a00d..8aea06f0564c 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -8,5 +8,37 @@ extern int of_address_to_resource(struct device_node *dev, int index,
 				  struct resource *r);
 extern void __iomem *of_iomap(struct device_node *device, int index);
 
+/* Extract an address from a device, returns the region size and
+ * the address space flags too. The PCI version uses a BAR number
+ * instead of an absolute index
+ */
+extern const u32 *of_get_address(struct device_node *dev, int index,
+			   u64 *size, unsigned int *flags);
+
+#ifndef pci_address_to_pio
+static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; }
+#define pci_address_to_pio pci_address_to_pio
+#endif
+
+#ifdef CONFIG_PCI
+extern const u32 *of_get_pci_address(struct device_node *dev, int bar_no,
+			       u64 *size, unsigned int *flags);
+extern int of_pci_address_to_resource(struct device_node *dev, int bar,
+				      struct resource *r);
+#else /* CONFIG_PCI */
+static inline int of_pci_address_to_resource(struct device_node *dev, int bar,
+				             struct resource *r)
+{
+	return -ENOSYS;
+}
+
+static inline const u32 *of_get_pci_address(struct device_node *dev,
+		int bar_no, u64 *size, unsigned int *flags)
+{
+	return NULL;
+}
+#endif /* CONFIG_PCI */
+
+
 #endif /* __OF_ADDRESS_H */
 
-- 
cgit v1.2.3-59-g8ed1b


From 2acf923e38fb6a4ce0c57115decbb38d334902ac Mon Sep 17 00:00:00 2001
From: Dexuan Cui <dexuan.cui@intel.com>
Date: Thu, 10 Jun 2010 11:27:12 +0800
Subject: KVM: VMX: Enable XSAVE/XRSTOR for guest

This patch enable guest to use XSAVE/XRSTOR instructions.

We assume that host_xcr0 would use all possible bits that OS supported.

And we loaded xcr0 in the same way we handled fpu - do it as late as we can.

Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/include/asm/vmx.h      |   1 +
 arch/x86/kvm/kvm_cache_regs.h   |   6 ++
 arch/x86/kvm/vmx.c              |  13 ++++
 arch/x86/kvm/x86.c              | 130 +++++++++++++++++++++++++++++++++++++---
 include/linux/kvm_host.h        |   2 +-
 6 files changed, 146 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0cd0f2923af5..91631b8b2090 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -302,6 +302,7 @@ struct kvm_vcpu_arch {
 	} update_pte;
 
 	struct fpu guest_fpu;
+	u64 xcr0;
 
 	gva_t mmio_fault_cr2;
 	struct kvm_pio_request pio;
@@ -605,6 +606,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 96a5886d384e..9f0cbd987d50 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -267,6 +267,7 @@ enum vmcs_field {
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD		54
+#define EXIT_REASON_XSETBV		55
 
 /*
  * Interruption-information format
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index d2a98f8f9af5..6491ac8e755b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -71,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
 	return kvm_read_cr4_bits(vcpu, ~0UL);
 }
 
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+	return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+		| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
 #endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 26ba61d6af8c..864a1b6d155a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -37,6 +37,8 @@
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
 
 #include "trace.h"
 
@@ -3390,6 +3392,16 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
+{
+	u64 new_bv = kvm_read_edx_eax(vcpu);
+	u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+
+	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+		skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
 	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
@@ -3668,6 +3680,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
+	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b08c0052e332..b5e644701cc1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -65,6 +65,7 @@
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
+			  | X86_CR4_OSXSAVE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -150,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+	return 1 << (bitno & 31);
+}
+
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 	unsigned slot;
@@ -474,6 +482,61 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+	u64 xcr0;
+
+	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
+	if (index != XCR_XFEATURE_ENABLED_MASK)
+		return 1;
+	xcr0 = xcr;
+	if (kvm_x86_ops->get_cpl(vcpu) != 0)
+		return 1;
+	if (!(xcr0 & XSTATE_FP))
+		return 1;
+	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+		return 1;
+	if (xcr0 & ~host_xcr0)
+		return 1;
+	vcpu->arch.xcr0 = xcr0;
+	vcpu->guest_xcr0_loaded = 0;
+	return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+	if (__kvm_set_xcr(vcpu, index, xcr)) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (!best)
+		return;
+
+	/* Update OSXSAVE bit */
+	if (cpu_has_xsave && best->function == 0x1) {
+		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+			best->ecx |= bit(X86_FEATURE_OSXSAVE);
+	}
+}
+
 int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
@@ -482,6 +545,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if (cr4 & CR4_RESERVED_BITS)
 		return 1;
 
+	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+		return 1;
+
 	if (is_long_mode(vcpu)) {
 		if (!(cr4 & X86_CR4_PAE))
 			return 1;
@@ -498,6 +564,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if ((cr4 ^ old_cr4) & pdptr_bits)
 		kvm_mmu_reset_context(vcpu);
 
+	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+		update_cpuid(vcpu);
+
 	return 0;
 }
 
@@ -666,11 +735,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-static inline u32 bit(int bitno)
-{
-	return 1 << (bitno & 31);
-}
-
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -1814,6 +1878,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	r = 0;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
+	update_cpuid(vcpu);
 
 out_free:
 	vfree(cpuid_entries);
@@ -1837,6 +1902,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
+	update_cpuid(vcpu);
 	return 0;
 
 out:
@@ -1917,7 +1983,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved, XSAVE, OSXSAVE */;
+		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */;
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1932,7 +1998,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	switch (function) {
 	case 0:
-		entry->eax = min(entry->eax, (u32)0xb);
+		entry->eax = min(entry->eax, (u32)0xd);
 		break;
 	case 1:
 		entry->edx &= kvm_supported_word0_x86_features;
@@ -1990,6 +2056,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case 0xd: {
+		int i;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		for (i = 1; *nent < maxnent; ++i) {
+			if (entry[i - 1].eax == 0 && i != 2)
+				break;
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
 	case KVM_CPUID_SIGNATURE: {
 		char signature[12] = "KVMKVMKVM\0\0";
 		u32 *sigptr = (u32 *)signature;
@@ -4125,6 +4205,9 @@ int kvm_arch_init(void *opaque)
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
 
+	if (cpu_has_xsave)
+		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
 	return 0;
 
 out:
@@ -4523,6 +4606,25 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+			!vcpu->guest_xcr0_loaded) {
+		/* kvm_set_xcr() also depends on this */
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+		vcpu->guest_xcr0_loaded = 1;
+	}
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->guest_xcr0_loaded) {
+		if (vcpu->arch.xcr0 != host_xcr0)
+			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+		vcpu->guest_xcr0_loaded = 0;
+	}
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -4568,6 +4670,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->prepare_guest_switch(vcpu);
 	if (vcpu->fpu_active)
 		kvm_load_guest_fpu(vcpu);
+	kvm_load_guest_xcr0(vcpu);
 
 	atomic_set(&vcpu->guest_mode, 1);
 	smp_wmb();
@@ -5124,6 +5227,11 @@ int fx_init(struct kvm_vcpu *vcpu)
 
 	fpu_finit(&vcpu->arch.guest_fpu);
 
+	/*
+	 * Ensure guest xcr0 is valid for loading
+	 */
+	vcpu->arch.xcr0 = XSTATE_FP;
+
 	vcpu->arch.cr0 |= X86_CR0_ET;
 
 	return 0;
@@ -5140,6 +5248,12 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_fpu_loaded)
 		return;
 
+	/*
+	 * Restore all possible states in the guest,
+	 * and assume host would use all available bits.
+	 * Guest xcr0 would be loaded later.
+	 */
+	kvm_put_guest_xcr0(vcpu);
 	vcpu->guest_fpu_loaded = 1;
 	unlazy_fpu(current);
 	fpu_restore_checking(&vcpu->arch.guest_fpu);
@@ -5148,6 +5262,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
+	kvm_put_guest_xcr0(vcpu);
+
 	if (!vcpu->guest_fpu_loaded)
 		return;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2c62319727ef..2d96555cd4ed 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -88,7 +88,7 @@ struct kvm_vcpu {
 	int srcu_idx;
 
 	int fpu_active;
-	int guest_fpu_loaded;
+	int guest_fpu_loaded, guest_xcr0_loaded;
 	wait_queue_head_t wq;
 	int sigset_active;
 	sigset_t sigset;
-- 
cgit v1.2.3-59-g8ed1b


From 2d5b5a665508c60577c1088e0405850a965b6795 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Sun, 13 Jun 2010 17:29:39 +0800
Subject: KVM: x86: XSAVE/XRSTOR live migration support

This patch enable save/restore of xsave state.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/api.txt    |  74 +++++++++++++++++++++++
 arch/x86/include/asm/kvm.h   |  22 +++++++
 arch/x86/include/asm/xsave.h |   7 ++-
 arch/x86/kvm/x86.c           | 139 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm.h          |  12 ++++
 5 files changed, 252 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 159b4efe1b0e..ffba03f55bdf 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -922,6 +922,80 @@ Define which vcpu is the Bootstrap Processor (BSP).  Values are the same
 as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
 is vcpu 0.
 
+4.41 KVM_GET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+This ioctl would copy current vcpu's xsave struct to the userspace.
+
+4.42 KVM_SET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+This ioctl would copy userspace's xsave struct to the kernel.
+
+4.43 KVM_GET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
+This ioctl would copy current vcpu's xcrs to the userspace.
+
+4.44 KVM_SET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
+This ioctl would set vcpu's xcr to the value userspace specified.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0b..4d8dcbdfc120 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
 #define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
 	__u64 reserved[9];
 };
 
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+#define KVM_MAX_XCRS	16
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 29ee4e4c64cf..32c36668fa7b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,8 +13,11 @@
 
 #define FXSAVE_SIZE	512
 
-#define XSTATE_YMM_SIZE 256
-#define XSTATE_YMM_OFFSET (512 + 64)
+#define XSAVE_HDR_SIZE	    64
+#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE	    256
+#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
 
 /*
  * These are the features that the OS can handle currently.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 795999e1ac19..0c8dc9614e7d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1680,6 +1680,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
+	case KVM_CAP_XSAVE:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -1703,6 +1704,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		break;
+	case KVM_CAP_XCRS:
+		r = cpu_has_xsave;
+		break;
 	default:
 		r = 0;
 		break;
@@ -2355,6 +2359,77 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+					 struct kvm_xsave *guest_xsave)
+{
+	if (cpu_has_xsave)
+		memcpy(guest_xsave->region,
+			&vcpu->arch.guest_fpu.state->xsave,
+			sizeof(struct xsave_struct));
+	else {
+		memcpy(guest_xsave->region,
+			&vcpu->arch.guest_fpu.state->fxsave,
+			sizeof(struct i387_fxsave_struct));
+		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+			XSTATE_FPSSE;
+	}
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+					struct kvm_xsave *guest_xsave)
+{
+	u64 xstate_bv =
+		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+
+	if (cpu_has_xsave)
+		memcpy(&vcpu->arch.guest_fpu.state->xsave,
+			guest_xsave->region, sizeof(struct xsave_struct));
+	else {
+		if (xstate_bv & ~XSTATE_FPSSE)
+			return -EINVAL;
+		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+			guest_xsave->region, sizeof(struct i387_fxsave_struct));
+	}
+	return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+					struct kvm_xcrs *guest_xcrs)
+{
+	if (!cpu_has_xsave) {
+		guest_xcrs->nr_xcrs = 0;
+		return;
+	}
+
+	guest_xcrs->nr_xcrs = 1;
+	guest_xcrs->flags = 0;
+	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+				       struct kvm_xcrs *guest_xcrs)
+{
+	int i, r = 0;
+
+	if (!cpu_has_xsave)
+		return -EINVAL;
+
+	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+		return -EINVAL;
+
+	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+		/* Only support XCR0 currently */
+		if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+				guest_xcrs->xcrs[0].value);
+			break;
+		}
+	if (r)
+		r = -EINVAL;
+	return r;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -2556,6 +2631,70 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
 		break;
 	}
+	case KVM_GET_XSAVE: {
+		struct kvm_xsave *xsave;
+
+		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!xsave)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_XSAVE: {
+		struct kvm_xsave *xsave;
+
+		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!xsave)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave);
+		break;
+	}
+	case KVM_GET_XCRS: {
+		struct kvm_xcrs *xcrs;
+
+		xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!xcrs)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, xcrs,
+				 sizeof(struct kvm_xcrs)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_XCRS: {
+		struct kvm_xcrs *xcrs;
+
+		xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!xcrs)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(xcrs, argp,
+				   sizeof(struct kvm_xcrs)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 23ea02253900..6fd40f540a8e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -524,6 +524,12 @@ struct kvm_enable_cap {
 #define KVM_CAP_PPC_OSI 52
 #define KVM_CAP_PPC_UNSET_IRQ 53
 #define KVM_CAP_ENABLE_CAP 54
+#ifdef __KVM_HAVE_XSAVE
+#define KVM_CAP_XSAVE 55
+#endif
+#ifdef __KVM_HAVE_XCRS
+#define KVM_CAP_XCRS 56
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -714,6 +720,12 @@ struct kvm_clock_data {
 #define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
 #define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
 #define KVM_ENABLE_CAP            _IOW(KVMIO,  0xa3, struct kvm_enable_cap)
+/* Available with KVM_CAP_XSAVE */
+#define KVM_GET_XSAVE		  _IOR(KVMIO,  0xa4, struct kvm_xsave)
+#define KVM_SET_XSAVE		  _IOW(KVMIO,  0xa5, struct kvm_xsave)
+/* Available with KVM_CAP_XCRS */
+#define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
+#define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3-59-g8ed1b


From a1f4d39500ad8ed61825eff061debff42386ab5b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 21 Jun 2010 11:44:20 +0300
Subject: KVM: Remove memory alias support

As advertised in feature-removal-schedule.txt.  Equivalent support is provided
by overlapping memory regions.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/feature-removal-schedule.txt |  11 ---
 Documentation/kvm/api.txt                  |  12 +--
 arch/ia64/kvm/kvm-ia64.c                   |   5 --
 arch/powerpc/kvm/powerpc.c                 |   5 --
 arch/s390/kvm/kvm-s390.c                   |   5 --
 arch/x86/include/asm/kvm_host.h            |  21 -----
 arch/x86/kvm/mmu.c                         |  17 +---
 arch/x86/kvm/paging_tmpl.h                 |   3 +-
 arch/x86/kvm/x86.c                         | 125 -----------------------------
 arch/x86/kvm/x86.h                         |   7 --
 include/linux/kvm.h                        |   1 +
 include/linux/kvm_host.h                   |   6 --
 virt/kvm/kvm_main.c                        |  18 +----
 13 files changed, 11 insertions(+), 225 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 1571c0c83dba..ad1e90dd2780 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -538,17 +538,6 @@ Who:	Jan Kiszka <jan.kiszka@web.de>
 
 ----------------------------
 
-What:	KVM memory aliases support
-When:	July 2010
-Why:	Memory aliasing support is used for speeding up guest vga access
-	through the vga windows.
-
-	Modern userspace no longer uses this feature, so it's just bitrotted
-	code and can be removed with no impact.
-Who:	Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:	xtime, wall_to_monotonic
 When:	2.6.36+
 Files:	kernel/time/timekeeping.c include/linux/time.h
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index ffba03f55bdf..7e415943a11e 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -226,17 +226,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_alias (in)
 Returns: 0 (success), -1 (error)
 
-struct kvm_memory_alias {
-	__u32 slot;  /* this has a different namespace than memory slots */
-	__u32 flags;
-	__u64 guest_phys_addr;
-	__u64 memory_size;
-	__u64 target_phys_addr;
-};
-
-Defines a guest physical address space region as an alias to another
-region.  Useful for aliased address, for example the VGA low memory
-window. Should not be used with userspace memory.
+This ioctl is obsolete and has been removed.
 
 4.9 KVM_RUN
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 91760e80e268..bd510beb43af 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1946,11 +1946,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 	return vcpu->arch.timer_fired;
 }
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b5ebdfbed20b..72a4ad86ee91 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -36,11 +36,6 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 08a3b35d30be..4fe68650535c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -723,11 +723,6 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 {
 }
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 static int __init kvm_s390_init(void)
 {
 	int ret;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2ec2e27a403e..a57cdeacc4d2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -69,8 +69,6 @@
 
 #define IOPL_SHIFT 12
 
-#define KVM_ALIAS_SLOTS 4
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
@@ -362,24 +360,7 @@ struct kvm_vcpu_arch {
 	u64 hv_vapic;
 };
 
-struct kvm_mem_alias {
-	gfn_t base_gfn;
-	unsigned long npages;
-	gfn_t target_gfn;
-#define KVM_ALIAS_INVALID     1UL
-	unsigned long flags;
-};
-
-#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-
-struct kvm_mem_aliases {
-	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
-	int naliases;
-};
-
 struct kvm_arch {
-	struct kvm_mem_aliases *aliases;
-
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
@@ -655,8 +636,6 @@ void kvm_disable_tdp(void);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
-
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8c2f580956d9..c5501bc10106 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -434,9 +434,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 	int *write_count;
 	int i;
 
-	gfn = unalias_gfn(kvm, gfn);
-
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		write_count   = slot_largepage_idx(gfn, slot, i);
@@ -450,8 +448,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 	int *write_count;
 	int i;
 
-	gfn = unalias_gfn(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		write_count   = slot_largepage_idx(gfn, slot, i);
@@ -467,8 +464,7 @@ static int has_wrprotected_page(struct kvm *kvm,
 	struct kvm_memory_slot *slot;
 	int *largepage_idx;
 
-	gfn = unalias_gfn(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	if (slot) {
 		largepage_idx = slot_largepage_idx(gfn, slot, level);
 		return *largepage_idx;
@@ -521,7 +517,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 
 /*
  * Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
  */
 
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -561,7 +556,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	if (!is_rmap_spte(*spte))
 		return count;
-	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
 	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
@@ -698,7 +692,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	u64 *spte;
 	int i, write_protected = 0;
 
-	gfn = unalias_gfn(kvm, gfn);
 	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 
 	spte = rmap_next(kvm, rmapp, NULL);
@@ -885,7 +878,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	sp = page_header(__pa(spte));
 
-	gfn = unalias_gfn(vcpu->kvm, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
 	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -3510,8 +3502,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		if (sp->unsync)
 			continue;
 
-		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
-		slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
 		rmapp = &slot->rmap[gfn - slot->base_gfn];
 
 		spte = rmap_next(vcpu->kvm, rmapp, NULL);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 863920f649fb..a21a86ef9e20 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -576,7 +576,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
  * Using the cached information from sp->gfns is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
  *   can't change unless all sptes pointing to it are nuked first.
- * - Alias changes zap the entire shadow cache.
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			    bool clear_unsync)
@@ -611,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			return -EINVAL;
 
 		gfn = gpte_to_gfn(gpte);
-		if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] ||
+		if (gfn != sp->gfns[i] ||
 		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e60b6c9c0b0..62596d373a49 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2740,115 +2740,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 	return kvm->arch.n_alloc_mmu_pages;
 }
 
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-	struct kvm_mem_aliases *aliases;
-
-	aliases = kvm_aliases(kvm);
-
-	for (i = 0; i < aliases->naliases; ++i) {
-		alias = &aliases->aliases[i];
-		if (alias->flags & KVM_ALIAS_INVALID)
-			continue;
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-	struct kvm_mem_aliases *aliases;
-
-	aliases = kvm_aliases(kvm);
-
-	for (i = 0; i < aliases->naliases; ++i) {
-		alias = &aliases->aliases[i];
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-/*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-					 struct kvm_memory_alias *alias)
-{
-	int r, n;
-	struct kvm_mem_alias *p;
-	struct kvm_mem_aliases *aliases, *old_aliases;
-
-	r = -EINVAL;
-	/* General sanity checks */
-	if (alias->memory_size & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->slot >= KVM_ALIAS_SLOTS)
-		goto out;
-	if (alias->guest_phys_addr + alias->memory_size
-	    < alias->guest_phys_addr)
-		goto out;
-	if (alias->target_phys_addr + alias->memory_size
-	    < alias->target_phys_addr)
-		goto out;
-
-	r = -ENOMEM;
-	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!aliases)
-		goto out;
-
-	mutex_lock(&kvm->slots_lock);
-
-	/* invalidate any gfn reference in case of deletion/shrinking */
-	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-	aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
-	old_aliases = kvm->arch.aliases;
-	rcu_assign_pointer(kvm->arch.aliases, aliases);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kvm_mmu_zap_all(kvm);
-	kfree(old_aliases);
-
-	r = -ENOMEM;
-	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!aliases)
-		goto out_unlock;
-
-	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-
-	p = &aliases->aliases[alias->slot];
-	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-	p->npages = alias->memory_size >> PAGE_SHIFT;
-	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-	p->flags &= ~(KVM_ALIAS_INVALID);
-
-	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-		if (aliases->aliases[n - 1].npages)
-			break;
-	aliases->naliases = n;
-
-	old_aliases = kvm->arch.aliases;
-	rcu_assign_pointer(kvm->arch.aliases, aliases);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kfree(old_aliases);
-	r = 0;
-
-out_unlock:
-	mutex_unlock(&kvm->slots_lock);
-out:
-	return r;
-}
-
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 {
 	int r;
@@ -3056,7 +2947,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	union {
 		struct kvm_pit_state ps;
 		struct kvm_pit_state2 ps2;
-		struct kvm_memory_alias alias;
 		struct kvm_pit_config pit_config;
 	} u;
 
@@ -3101,14 +2991,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
 		break;
-	case KVM_SET_MEMORY_ALIAS:
-		r = -EFAULT;
-		if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
-			goto out;
-		r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
-		if (r)
-			goto out;
-		break;
 	case KVM_CREATE_IRQCHIP: {
 		struct kvm_pic *vpic;
 
@@ -5559,12 +5441,6 @@ struct  kvm *kvm_arch_create_vm(void)
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 
-	kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!kvm->arch.aliases) {
-		kfree(kvm);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
@@ -5622,7 +5498,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	if (kvm->arch.ept_identity_pagetable)
 		put_page(kvm->arch.ept_identity_pagetable);
 	cleanup_srcu_struct(&kvm->srcu);
-	kfree(kvm->arch.aliases);
 	kfree(kvm);
 }
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
-static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
-{
-	return rcu_dereference_check(kvm->arch.aliases,
-			srcu_read_lock_held(&kvm->srcu)
-			|| lockdep_is_held(&kvm->slots_lock));
-}
-
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6fd40f540a8e..636fc381c897 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -619,6 +619,7 @@ struct kvm_clock_data {
  */
 #define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
+/* KVM_SET_MEMORY_ALIAS is obsolete: */
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO,  0x43, struct kvm_memory_alias)
 #define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
 #define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2d96555cd4ed..240e460777bc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -286,8 +286,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				int user_alloc);
 void kvm_disable_largepages(void);
 void kvm_arch_flush_shadow(struct kvm *kvm);
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
@@ -564,10 +562,6 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
 }
 #endif
 
-#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-#define unalias_gfn_instantiation unalias_gfn
-#endif
-
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 
 #define KVM_MAX_IRQ_ROUTES 1024
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 84a090644d9d..65417e3d8462 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -841,7 +841,7 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
@@ -855,20 +855,13 @@ struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 	}
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
-
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
-	gfn = unalias_gfn(kvm, gfn);
-	return gfn_to_memslot_unaliased(kvm, gfn);
-}
+EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 
-	gfn = unalias_gfn_instantiation(kvm, gfn);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
 
@@ -913,7 +906,6 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memory_slot *memslot = NULL;
 
-	gfn = unalias_gfn(kvm, gfn);
 	for (i = 0; i < slots->nmemslots; ++i) {
 		memslot = &slots->memslots[i];
 
@@ -934,8 +926,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
 
-	gfn = unalias_gfn_instantiation(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return bad_hva();
 	return gfn_to_hva_memslot(slot, gfn);
@@ -1202,8 +1193,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot;
 
-	gfn = unalias_gfn(kvm, gfn);
-	memslot = gfn_to_memslot_unaliased(kvm, gfn);
+	memslot = gfn_to_memslot(kvm, gfn);
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
-- 
cgit v1.2.3-59-g8ed1b


From a8eeb04a44dd6dc4c8158953d9bae48849c9a188 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 10 May 2010 12:34:53 +0300
Subject: KVM: Add mini-API for vcpu->requests

Makes it a little more readable and hackable.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c     |  2 +-
 arch/x86/kvm/mmu.c       |  6 +++---
 arch/x86/kvm/svm.c       |  2 +-
 arch/x86/kvm/timer.c     |  2 +-
 arch/x86/kvm/vmx.c       |  2 +-
 arch/x86/kvm/x86.c       | 27 +++++++++++++--------------
 include/linux/kvm_host.h | 15 +++++++++++++++
 virt/kvm/kvm_main.c      |  4 ++--
 8 files changed, 37 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 49573c78c24b..77d8c0f4817d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -534,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	struct kvm_run *run = vcpu->run;
 
-	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+	kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
 	run->tpr_access.rip = kvm_rip_read(vcpu);
 	run->tpr_access.is_write = write;
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c5501bc10106..690a7fc58c17 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 		if (sp->unsync_children) {
-			set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
 			kvm_mmu_mark_parents_unsync(sp);
 		} else if (sp->unsync)
 			kvm_mmu_mark_parents_unsync(sp);
@@ -2131,7 +2131,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	int ret = 0;
 
 	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		ret = 1;
 	}
 
@@ -2329,7 +2329,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f7a6fdcf8ef3..587b99d37d44 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1494,7 +1494,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
 		 */
 		pr_err("KVM: Guest triggered AMD Erratum 383\n");
 
-		set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
 
 		return;
 	}
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 564548fbb3d6..e16a0dbe74d8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -32,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
 		/* FIXME: this code should not know anything about vcpus */
-		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 	}
 
 	if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 345a35470511..661c6e199b4a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -899,7 +899,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		unsigned long sysenter_esp;
 
 		kvm_migrate_timers(vcpu);
-		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
 		list_add(&vmx->local_vcpus_link,
 			 &per_cpu(vcpus_on_cpu, cpu));
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9be6e4e5e8ee..7ef44107a14a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -296,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	prev_nr = vcpu->arch.exception.nr;
 	if (prev_nr == DF_VECTOR) {
 		/* triple fault -> shutdown */
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
 	class1 = exception_class(prev_nr);
@@ -948,7 +948,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 
 	if (!vcpu->time_page)
 		return 0;
-	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
 	return 1;
 }
 
@@ -2253,7 +2253,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 			printk(KERN_DEBUG "kvm: set_mce: "
 			       "injects mce exception while "
 			       "previous one is in progress!\n");
-			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 			return 0;
 		}
 		if (banks[1] & MCI_STATUS_VAL)
@@ -4617,7 +4617,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		vcpu->run->request_interrupt_window;
 
 	if (vcpu->requests)
-		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 
 	r = kvm_mmu_reload(vcpu);
@@ -4625,26 +4625,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		goto out;
 
 	if (vcpu->requests) {
-		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
-		if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
 			kvm_write_guest_time(vcpu);
-		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
-		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 			kvm_x86_ops->tlb_flush(vcpu);
-		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
-				       &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
 			r = 0;
 			goto out;
 		}
-		if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
 			r = 0;
 			goto out;
 		}
-		if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
 			vcpu->fpu_active = 0;
 			kvm_x86_ops->fpu_deactivate(vcpu);
 		}
@@ -4773,7 +4772,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			kvm_vcpu_block(vcpu);
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
 			{
 				switch(vcpu->arch.mp_state) {
 				case KVM_MP_STATE_HALTED:
@@ -5255,7 +5254,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 	vcpu->guest_fpu_loaded = 0;
 	fpu_save_init(&vcpu->arch.guest_fpu);
 	++vcpu->stat.fpu_reload;
-	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
 	trace_kvm_fpu(0);
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 240e460777bc..c8a9d628898e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -624,5 +624,20 @@ static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 
 #endif
 
+static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
+{
+	set_bit(req, &vcpu->requests);
+}
+
+static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
+{
+	return test_and_set_bit(req, &vcpu->requests);
+}
+
+static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
+{
+	return test_and_clear_bit(req, &vcpu->requests);
+}
+
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 65417e3d8462..5bd2f34ba576 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -145,7 +145,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 	raw_spin_lock(&kvm->requests_lock);
 	me = smp_processor_id();
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (test_and_set_bit(req, &vcpu->requests))
+		if (kvm_make_check_request(req, vcpu))
 			continue;
 		cpu = vcpu->cpu;
 		if (cpus != NULL && cpu != -1 && cpu != me)
@@ -1212,7 +1212,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 		if (kvm_arch_vcpu_runnable(vcpu)) {
-			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_UNHALT, vcpu);
 			break;
 		}
 		if (kvm_cpu_has_pending_timer(vcpu))
-- 
cgit v1.2.3-59-g8ed1b


From 0719837c0832a7b305e42327caa7d330462360ea Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 10 May 2010 13:08:26 +0300
Subject: KVM: Reduce atomic operations on vcpu->requests

Usually the vcpu->requests bitmap is sparse, so a test_and_clear_bit() for
each request generates a large number of unneeded atomics if a bit is set.

Replace with a separate test/clear sequence.  This is safe since there is
no clear_bit() outside the vcpu thread.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c8a9d628898e..e820eb579108 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -636,7 +636,12 @@ static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
 
 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 {
-	return test_and_clear_bit(req, &vcpu->requests);
+	if (test_bit(req, &vcpu->requests)) {
+		clear_bit(req, &vcpu->requests);
+		return true;
+	} else {
+		return false;
+	}
 }
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From e36d96f7cfaa71870c407131eb4fbd38ea285c01 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 21 Jun 2010 10:56:36 +0300
Subject: KVM: Keep slot ID in memory slot structure

May be used for distinguishing between internal and user slots, or for sorting
slots in size order.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e820eb579108..e796326f3646 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -124,6 +124,7 @@ struct kvm_memory_slot {
 	} *lpage_info[KVM_NR_PAGE_SIZES - 1];
 	unsigned long userspace_addr;
 	int user_alloc;
+	int id;
 };
 
 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5bd2f34ba576..74f731920945 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -570,6 +570,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	new = old = *memslot;
 
+	new.id = mem->slot;
 	new.base_gfn = base_gfn;
 	new.npages = npages;
 	new.flags = mem->flags;
-- 
cgit v1.2.3-59-g8ed1b


From 6ee0578b4daaea01c96b172c6aacca43fd9807a6 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 30 Jul 2010 14:57:37 -0700
Subject: workqueue: mark init_workqueues() as early_initcall()

Mark init_workqueues() as early_initcall() and thus it will be initialized
before smp bringup. init_workqueues() registers for the hotcpu notifier
and thus it should cope with the processors that are brought online after
the workqueues are initialized.

x86 smp bringup code uses workqueues and uses a workaround for the
cold boot process (as the workqueues are initialized post smp_init()).
Marking init_workqueues() as early_initcall() will pave the way for
cleaning up this code.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/workqueue.h | 1 -
 init/main.c               | 2 --
 kernel/workqueue.c        | 4 +++-
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 5f76001c4e6d..51dc9a727e5e 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -327,7 +327,6 @@ extern int schedule_delayed_work_on(int cpu, struct delayed_work *work,
 extern int schedule_on_each_cpu(work_func_t func);
 extern int keventd_up(void);
 
-extern void init_workqueues(void);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
 extern int flush_work(struct work_struct *work);
diff --git a/init/main.c b/init/main.c
index 3bdb152f412f..5f2ec2cdd900 100644
--- a/init/main.c
+++ b/init/main.c
@@ -32,7 +32,6 @@
 #include <linux/start_kernel.h>
 #include <linux/security.h>
 #include <linux/smp.h>
-#include <linux/workqueue.h>
 #include <linux/profile.h>
 #include <linux/rcupdate.h>
 #include <linux/moduleparam.h>
@@ -786,7 +785,6 @@ static void __init do_initcalls(void)
  */
 static void __init do_basic_setup(void)
 {
-	init_workqueues();
 	cpuset_init_smp();
 	usermodehelper_init();
 	init_tmpfs();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1105c474073a..e2eb351d9152 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3507,7 +3507,7 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
-void __init init_workqueues(void)
+static int __init init_workqueues(void)
 {
 	unsigned int cpu;
 	int i;
@@ -3559,4 +3559,6 @@ void __init init_workqueues(void)
 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
 					    WQ_UNBOUND_MAX_ACTIVE);
 	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
+	return 0;
 }
+early_initcall(init_workqueues);
-- 
cgit v1.2.3-59-g8ed1b


From 0814a979a64a5ae61c7567496d090e204ecabd2b Mon Sep 17 00:00:00 2001
From: Anatolij Gustschin <agust@denx.de>
Date: Fri, 23 Jul 2010 04:00:36 +0000
Subject: powerpc/5121: move fsl-diu-fb.h to include/linux

Some DIU structures will be used in platform code in
subsequent MPC5121 DIU patch, so we move this header
to be able to include it elsewhere.

Signed-off-by: Anatolij Gustschin <agust@denx.de>
Acked-by: Timur Tabi <timur@freescale.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/video/fsl-diu-fb.c |   2 +-
 drivers/video/fsl-diu-fb.h | 223 ---------------------------------------------
 include/linux/fsl-diu-fb.h | 223 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 224 insertions(+), 224 deletions(-)
 delete mode 100644 drivers/video/fsl-diu-fb.h
 create mode 100644 include/linux/fsl-diu-fb.h

(limited to 'include/linux')

diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c
index 9b8c99111221..48905d5f4e8d 100644
--- a/drivers/video/fsl-diu-fb.c
+++ b/drivers/video/fsl-diu-fb.c
@@ -34,7 +34,7 @@
 #include <linux/of_platform.h>
 
 #include <sysdev/fsl_soc.h>
-#include "fsl-diu-fb.h"
+#include <linux/fsl-diu-fb.h>
 
 /*
  * These parameters give default parameters
diff --git a/drivers/video/fsl-diu-fb.h b/drivers/video/fsl-diu-fb.h
deleted file mode 100644
index fc295d7ea463..000000000000
--- a/drivers/video/fsl-diu-fb.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright 2008 Freescale Semiconductor, Inc. All Rights Reserved.
- *
- *  Freescale DIU Frame Buffer device driver
- *
- *  Authors: Hongjun Chen <hong-jun.chen@freescale.com>
- *           Paul Widmer <paul.widmer@freescale.com>
- *           Srikanth Srinivasan <srikanth.srinivasan@freescale.com>
- *           York Sun <yorksun@freescale.com>
- *
- *   Based on imxfb.c Copyright (C) 2004 S.Hauer, Pengutronix
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- *
- */
-
-#ifndef __FSL_DIU_FB_H__
-#define __FSL_DIU_FB_H__
-
-/* Arbitrary threshold to determine the allocation method
- * See mpc8610fb_set_par(), map_video_memory(), and unmap_video_memory()
- */
-#define MEM_ALLOC_THRESHOLD (1024*768*4+32)
-/* Minimum value that the pixel clock can be set to in pico seconds
- * This is determined by platform clock/3 where the minimum platform
- * clock is 533MHz. This gives 5629 pico seconds.
- */
-#define MIN_PIX_CLK 5629
-#define MAX_PIX_CLK 96096
-
-#include <linux/types.h>
-
-struct mfb_alpha {
-	int enable;
-	int alpha;
-};
-
-struct mfb_chroma_key {
-	int enable;
-	__u8  red_max;
-	__u8  green_max;
-	__u8  blue_max;
-	__u8  red_min;
-	__u8  green_min;
-	__u8  blue_min;
-};
-
-struct aoi_display_offset {
-	int x_aoi_d;
-	int y_aoi_d;
-};
-
-#define MFB_SET_CHROMA_KEY	_IOW('M', 1, struct mfb_chroma_key)
-#define MFB_WAIT_FOR_VSYNC	_IOW('F', 0x20, u_int32_t)
-#define MFB_SET_BRIGHTNESS	_IOW('M', 3, __u8)
-
-#define MFB_SET_ALPHA		0x80014d00
-#define MFB_GET_ALPHA		0x40014d00
-#define MFB_SET_AOID		0x80084d04
-#define MFB_GET_AOID		0x40084d04
-#define MFB_SET_PIXFMT		0x80014d08
-#define MFB_GET_PIXFMT		0x40014d08
-
-#define FBIOGET_GWINFO		0x46E0
-#define FBIOPUT_GWINFO		0x46E1
-
-#ifdef __KERNEL__
-#include <linux/spinlock.h>
-
-/*
- * These are the fields of area descriptor(in DDR memory) for every plane
- */
-struct diu_ad {
-	/* Word 0(32-bit) in DDR memory */
-/* 	__u16 comp; */
-/* 	__u16 pixel_s:2; */
-/* 	__u16 pallete:1; */
-/* 	__u16 red_c:2; */
-/* 	__u16 green_c:2; */
-/* 	__u16 blue_c:2; */
-/* 	__u16 alpha_c:3; */
-/* 	__u16 byte_f:1; */
-/* 	__u16 res0:3; */
-
-	__be32 pix_fmt; /* hard coding pixel format */
-
-	/* Word 1(32-bit) in DDR memory */
-	__le32 addr;
-
-	/* Word 2(32-bit) in DDR memory */
-/* 	__u32 delta_xs:11; */
-/* 	__u32 res1:1; */
-/* 	__u32 delta_ys:11; */
-/* 	__u32 res2:1; */
-/* 	__u32 g_alpha:8; */
-	__le32 src_size_g_alpha;
-
-	/* Word 3(32-bit) in DDR memory */
-/* 	__u32 delta_xi:11; */
-/* 	__u32 res3:5; */
-/* 	__u32 delta_yi:11; */
-/* 	__u32 res4:3; */
-/* 	__u32 flip:2; */
-	__le32 aoi_size;
-
-	/* Word 4(32-bit) in DDR memory */
-	/*__u32 offset_xi:11;
-	__u32 res5:5;
-	__u32 offset_yi:11;
-	__u32 res6:5;
-	*/
-	__le32 offset_xyi;
-
-	/* Word 5(32-bit) in DDR memory */
-	/*__u32 offset_xd:11;
-	__u32 res7:5;
-	__u32 offset_yd:11;
-	__u32 res8:5; */
-	__le32 offset_xyd;
-
-
-	/* Word 6(32-bit) in DDR memory */
-	__u8 ckmax_r;
-	__u8 ckmax_g;
-	__u8 ckmax_b;
-	__u8 res9;
-
-	/* Word 7(32-bit) in DDR memory */
-	__u8 ckmin_r;
-	__u8 ckmin_g;
-	__u8 ckmin_b;
-	__u8 res10;
-/* 	__u32 res10:8; */
-
-	/* Word 8(32-bit) in DDR memory */
-	__le32 next_ad;
-
-	/* Word 9(32-bit) in DDR memory, just for 64-bit aligned */
-	__u32 paddr;
-} __attribute__ ((packed));
-
-/* DIU register map */
-struct diu {
-	__be32 desc[3];
-	__be32 gamma;
-	__be32 pallete;
-	__be32 cursor;
-	__be32 curs_pos;
-	__be32 diu_mode;
-	__be32 bgnd;
-	__be32 bgnd_wb;
-	__be32 disp_size;
-	__be32 wb_size;
-	__be32 wb_mem_addr;
-	__be32 hsyn_para;
-	__be32 vsyn_para;
-	__be32 syn_pol;
-	__be32 thresholds;
-	__be32 int_status;
-	__be32 int_mask;
-	__be32 colorbar[8];
-	__be32 filling;
-	__be32 plut;
-} __attribute__ ((packed));
-
-struct diu_hw {
-	struct diu *diu_reg;
-	spinlock_t reg_lock;
-
-	__u32 mode;		/* DIU operation mode */
-};
-
-struct diu_addr {
-	__u8 __iomem *vaddr;	/* Virtual address */
-	dma_addr_t paddr;	/* Physical address */
-	__u32 	   offset;
-};
-
-struct diu_pool {
-	struct diu_addr ad;
-	struct diu_addr gamma;
-	struct diu_addr pallete;
-	struct diu_addr cursor;
-};
-
-#define FSL_DIU_BASE_OFFSET	0x2C000	/* Offset of DIU */
-#define INT_LCDC		64	/* DIU interrupt number */
-
-#define FSL_AOI_NUM	6	/* 5 AOIs and one dummy AOI */
-				/* 1 for plane 0, 2 for plane 1&2 each */
-
-/* Minimum X and Y resolutions */
-#define MIN_XRES	64
-#define MIN_YRES	64
-
-/* HW cursor parameters */
-#define MAX_CURS		32
-
-/* Modes of operation of DIU */
-#define MFB_MODE0	0	/* DIU off */
-#define MFB_MODE1	1	/* All three planes output to display */
-#define MFB_MODE2	2	/* Plane 1 to display, planes 2+3 written back*/
-#define MFB_MODE3	3	/* All three planes written back to memory */
-#define MFB_MODE4	4	/* Color bar generation */
-
-/* INT_STATUS/INT_MASK field descriptions */
-#define INT_VSYNC	0x01	/* Vsync interrupt  */
-#define INT_VSYNC_WB	0x02	/* Vsync interrupt for write back operation */
-#define INT_UNDRUN	0x04	/* Under run exception interrupt */
-#define INT_PARERR	0x08	/* Display parameters error interrupt */
-#define INT_LS_BF_VS	0x10	/* Lines before vsync. interrupt */
-
-/* Panels'operation modes */
-#define MFB_TYPE_OUTPUT	0	/* Panel output to display */
-#define MFB_TYPE_OFF	1	/* Panel off */
-#define MFB_TYPE_WB	2	/* Panel written back to memory */
-#define MFB_TYPE_TEST	3	/* Panel generate color bar */
-
-#endif /* __KERNEL__ */
-#endif /* __FSL_DIU_FB_H__ */
diff --git a/include/linux/fsl-diu-fb.h b/include/linux/fsl-diu-fb.h
new file mode 100644
index 000000000000..fc295d7ea463
--- /dev/null
+++ b/include/linux/fsl-diu-fb.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2008 Freescale Semiconductor, Inc. All Rights Reserved.
+ *
+ *  Freescale DIU Frame Buffer device driver
+ *
+ *  Authors: Hongjun Chen <hong-jun.chen@freescale.com>
+ *           Paul Widmer <paul.widmer@freescale.com>
+ *           Srikanth Srinivasan <srikanth.srinivasan@freescale.com>
+ *           York Sun <yorksun@freescale.com>
+ *
+ *   Based on imxfb.c Copyright (C) 2004 S.Hauer, Pengutronix
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ */
+
+#ifndef __FSL_DIU_FB_H__
+#define __FSL_DIU_FB_H__
+
+/* Arbitrary threshold to determine the allocation method
+ * See mpc8610fb_set_par(), map_video_memory(), and unmap_video_memory()
+ */
+#define MEM_ALLOC_THRESHOLD (1024*768*4+32)
+/* Minimum value that the pixel clock can be set to in pico seconds
+ * This is determined by platform clock/3 where the minimum platform
+ * clock is 533MHz. This gives 5629 pico seconds.
+ */
+#define MIN_PIX_CLK 5629
+#define MAX_PIX_CLK 96096
+
+#include <linux/types.h>
+
+struct mfb_alpha {
+	int enable;
+	int alpha;
+};
+
+struct mfb_chroma_key {
+	int enable;
+	__u8  red_max;
+	__u8  green_max;
+	__u8  blue_max;
+	__u8  red_min;
+	__u8  green_min;
+	__u8  blue_min;
+};
+
+struct aoi_display_offset {
+	int x_aoi_d;
+	int y_aoi_d;
+};
+
+#define MFB_SET_CHROMA_KEY	_IOW('M', 1, struct mfb_chroma_key)
+#define MFB_WAIT_FOR_VSYNC	_IOW('F', 0x20, u_int32_t)
+#define MFB_SET_BRIGHTNESS	_IOW('M', 3, __u8)
+
+#define MFB_SET_ALPHA		0x80014d00
+#define MFB_GET_ALPHA		0x40014d00
+#define MFB_SET_AOID		0x80084d04
+#define MFB_GET_AOID		0x40084d04
+#define MFB_SET_PIXFMT		0x80014d08
+#define MFB_GET_PIXFMT		0x40014d08
+
+#define FBIOGET_GWINFO		0x46E0
+#define FBIOPUT_GWINFO		0x46E1
+
+#ifdef __KERNEL__
+#include <linux/spinlock.h>
+
+/*
+ * These are the fields of area descriptor(in DDR memory) for every plane
+ */
+struct diu_ad {
+	/* Word 0(32-bit) in DDR memory */
+/* 	__u16 comp; */
+/* 	__u16 pixel_s:2; */
+/* 	__u16 pallete:1; */
+/* 	__u16 red_c:2; */
+/* 	__u16 green_c:2; */
+/* 	__u16 blue_c:2; */
+/* 	__u16 alpha_c:3; */
+/* 	__u16 byte_f:1; */
+/* 	__u16 res0:3; */
+
+	__be32 pix_fmt; /* hard coding pixel format */
+
+	/* Word 1(32-bit) in DDR memory */
+	__le32 addr;
+
+	/* Word 2(32-bit) in DDR memory */
+/* 	__u32 delta_xs:11; */
+/* 	__u32 res1:1; */
+/* 	__u32 delta_ys:11; */
+/* 	__u32 res2:1; */
+/* 	__u32 g_alpha:8; */
+	__le32 src_size_g_alpha;
+
+	/* Word 3(32-bit) in DDR memory */
+/* 	__u32 delta_xi:11; */
+/* 	__u32 res3:5; */
+/* 	__u32 delta_yi:11; */
+/* 	__u32 res4:3; */
+/* 	__u32 flip:2; */
+	__le32 aoi_size;
+
+	/* Word 4(32-bit) in DDR memory */
+	/*__u32 offset_xi:11;
+	__u32 res5:5;
+	__u32 offset_yi:11;
+	__u32 res6:5;
+	*/
+	__le32 offset_xyi;
+
+	/* Word 5(32-bit) in DDR memory */
+	/*__u32 offset_xd:11;
+	__u32 res7:5;
+	__u32 offset_yd:11;
+	__u32 res8:5; */
+	__le32 offset_xyd;
+
+
+	/* Word 6(32-bit) in DDR memory */
+	__u8 ckmax_r;
+	__u8 ckmax_g;
+	__u8 ckmax_b;
+	__u8 res9;
+
+	/* Word 7(32-bit) in DDR memory */
+	__u8 ckmin_r;
+	__u8 ckmin_g;
+	__u8 ckmin_b;
+	__u8 res10;
+/* 	__u32 res10:8; */
+
+	/* Word 8(32-bit) in DDR memory */
+	__le32 next_ad;
+
+	/* Word 9(32-bit) in DDR memory, just for 64-bit aligned */
+	__u32 paddr;
+} __attribute__ ((packed));
+
+/* DIU register map */
+struct diu {
+	__be32 desc[3];
+	__be32 gamma;
+	__be32 pallete;
+	__be32 cursor;
+	__be32 curs_pos;
+	__be32 diu_mode;
+	__be32 bgnd;
+	__be32 bgnd_wb;
+	__be32 disp_size;
+	__be32 wb_size;
+	__be32 wb_mem_addr;
+	__be32 hsyn_para;
+	__be32 vsyn_para;
+	__be32 syn_pol;
+	__be32 thresholds;
+	__be32 int_status;
+	__be32 int_mask;
+	__be32 colorbar[8];
+	__be32 filling;
+	__be32 plut;
+} __attribute__ ((packed));
+
+struct diu_hw {
+	struct diu *diu_reg;
+	spinlock_t reg_lock;
+
+	__u32 mode;		/* DIU operation mode */
+};
+
+struct diu_addr {
+	__u8 __iomem *vaddr;	/* Virtual address */
+	dma_addr_t paddr;	/* Physical address */
+	__u32 	   offset;
+};
+
+struct diu_pool {
+	struct diu_addr ad;
+	struct diu_addr gamma;
+	struct diu_addr pallete;
+	struct diu_addr cursor;
+};
+
+#define FSL_DIU_BASE_OFFSET	0x2C000	/* Offset of DIU */
+#define INT_LCDC		64	/* DIU interrupt number */
+
+#define FSL_AOI_NUM	6	/* 5 AOIs and one dummy AOI */
+				/* 1 for plane 0, 2 for plane 1&2 each */
+
+/* Minimum X and Y resolutions */
+#define MIN_XRES	64
+#define MIN_YRES	64
+
+/* HW cursor parameters */
+#define MAX_CURS		32
+
+/* Modes of operation of DIU */
+#define MFB_MODE0	0	/* DIU off */
+#define MFB_MODE1	1	/* All three planes output to display */
+#define MFB_MODE2	2	/* Plane 1 to display, planes 2+3 written back*/
+#define MFB_MODE3	3	/* All three planes written back to memory */
+#define MFB_MODE4	4	/* Color bar generation */
+
+/* INT_STATUS/INT_MASK field descriptions */
+#define INT_VSYNC	0x01	/* Vsync interrupt  */
+#define INT_VSYNC_WB	0x02	/* Vsync interrupt for write back operation */
+#define INT_UNDRUN	0x04	/* Under run exception interrupt */
+#define INT_PARERR	0x08	/* Display parameters error interrupt */
+#define INT_LS_BF_VS	0x10	/* Lines before vsync. interrupt */
+
+/* Panels'operation modes */
+#define MFB_TYPE_OUTPUT	0	/* Panel output to display */
+#define MFB_TYPE_OFF	1	/* Panel off */
+#define MFB_TYPE_WB	2	/* Panel written back to memory */
+#define MFB_TYPE_TEST	3	/* Panel generate color bar */
+
+#endif /* __KERNEL__ */
+#endif /* __FSL_DIU_FB_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 08354809d6c73eb73973e132502a0a4e53250971 Mon Sep 17 00:00:00 2001
From: Jassi Brar <jassi.brar@samsung.com>
Date: Fri, 25 Jun 2010 18:21:19 +0900
Subject: ahci_platform: Provide for vendor specific init

Some AHCI implementations may use Vendor Specific HBA[A0h, FFh]
and/or Port[70h, 7Fh] registers to 'prepare' for initialization.
For that, the platform needs memory mapped address of AHCI registers.

This patch adds the 'mmio' argument and reorders the call to
platform init function.

Signed-off-by: Jassi Brar <jassi.brar@samsung.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/ahci_platform.c   | 25 +++++++++++++++----------
 include/linux/ahci_platform.h |  4 +++-
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c
index 5e11b160f247..68ef6b563b7d 100644
--- a/drivers/ata/ahci_platform.c
+++ b/drivers/ata/ahci_platform.c
@@ -54,19 +54,13 @@ static int __init ahci_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	if (pdata && pdata->init) {
-		rc = pdata->init(dev);
-		if (rc)
-			return rc;
-	}
-
 	if (pdata && pdata->ata_port_info)
 		pi = *pdata->ata_port_info;
 
 	hpriv = devm_kzalloc(dev, sizeof(*hpriv), GFP_KERNEL);
 	if (!hpriv) {
-		rc = -ENOMEM;
-		goto err0;
+		dev_err(dev, "can't alloc ahci_host_priv\n");
+		return -ENOMEM;
 	}
 
 	hpriv->flags |= (unsigned long)pi.private_data;
@@ -74,8 +68,19 @@ static int __init ahci_probe(struct platform_device *pdev)
 	hpriv->mmio = devm_ioremap(dev, mem->start, resource_size(mem));
 	if (!hpriv->mmio) {
 		dev_err(dev, "can't map %pR\n", mem);
-		rc = -ENOMEM;
-		goto err0;
+		return -ENOMEM;
+	}
+
+	/*
+	 * Some platforms might need to prepare for mmio region access,
+	 * which could be done in the following init call. So, the mmio
+	 * region shouldn't be accessed before init (if provided) has
+	 * returned successfully.
+	 */
+	if (pdata && pdata->init) {
+		rc = pdata->init(dev, hpriv->mmio);
+		if (rc)
+			return rc;
 	}
 
 	ahci_save_initial_config(dev, hpriv,
diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h
index f7dd576dd5a4..be3d9a77d6ed 100644
--- a/include/linux/ahci_platform.h
+++ b/include/linux/ahci_platform.h
@@ -15,11 +15,13 @@
 #ifndef _AHCI_PLATFORM_H
 #define _AHCI_PLATFORM_H
 
+#include <linux/compiler.h>
+
 struct device;
 struct ata_port_info;
 
 struct ahci_platform_data {
-	int (*init)(struct device *dev);
+	int (*init)(struct device *dev, void __iomem *addr);
 	void (*exit)(struct device *dev);
 	const struct ata_port_info *ata_port_info;
 	unsigned int force_port_map;
-- 
cgit v1.2.3-59-g8ed1b


From 5b6ae5ba0c45c4d04721537308728688414c9e6b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 30 Jul 2010 11:42:42 +0200
Subject: libata: more PCI IDs for jmicron controllers

Add support for JMB364 and 369.

Patch-originally-from: Aries Lee <arieslee@jmicron.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/pci/quirks.c    | 6 ++++++
 include/linux/pci_ids.h | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 477345d41641..a0c20d9e8396 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1459,6 +1459,7 @@ static void quirk_jmicron_ata(struct pci_dev *pdev)
 	switch (pdev->device) {
 	case PCI_DEVICE_ID_JMICRON_JMB360: /* SATA single port */
 	case PCI_DEVICE_ID_JMICRON_JMB362: /* SATA dual ports */
+	case PCI_DEVICE_ID_JMICRON_JMB364: /* SATA dual ports */
 		/* The controller should be in single function ahci mode */
 		conf1 |= 0x0002A100; /* Set 8, 13, 15, 17 */
 		break;
@@ -1470,6 +1471,7 @@ static void quirk_jmicron_ata(struct pci_dev *pdev)
 		/* Fall through */
 	case PCI_DEVICE_ID_JMICRON_JMB361:
 	case PCI_DEVICE_ID_JMICRON_JMB363:
+	case PCI_DEVICE_ID_JMICRON_JMB369:
 		/* Enable dual function mode, AHCI on fn 0, IDE fn1 */
 		/* Set the class codes correctly and then direct IDE 0 */
 		conf1 |= 0x00C2A1B3; /* Set 0, 1, 4, 5, 7, 8, 13, 15, 17, 22, 23 */
@@ -1496,16 +1498,20 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB360, qui
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB361, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB362, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB363, quirk_jmicron_ata);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB364, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB365, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB366, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB368, quirk_jmicron_ata);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB369, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB360, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB361, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB362, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB363, quirk_jmicron_ata);
+DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB364, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB365, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB366, quirk_jmicron_ata);
 DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB368, quirk_jmicron_ata);
+DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB369, quirk_jmicron_ata);
 
 #endif
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3bedcc149c84..eb200e6beb65 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2324,9 +2324,11 @@
 #define PCI_DEVICE_ID_JMICRON_JMB361	0x2361
 #define PCI_DEVICE_ID_JMICRON_JMB362	0x2362
 #define PCI_DEVICE_ID_JMICRON_JMB363	0x2363
+#define PCI_DEVICE_ID_JMICRON_JMB364	0x2364
 #define PCI_DEVICE_ID_JMICRON_JMB365	0x2365
 #define PCI_DEVICE_ID_JMICRON_JMB366	0x2366
 #define PCI_DEVICE_ID_JMICRON_JMB368	0x2368
+#define PCI_DEVICE_ID_JMICRON_JMB369	0x2369
 #define PCI_DEVICE_ID_JMICRON_JMB38X_SD	0x2381
 #define PCI_DEVICE_ID_JMICRON_JMB38X_MMC 0x2382
 #define PCI_DEVICE_ID_JMICRON_JMB38X_MS	0x2383
-- 
cgit v1.2.3-59-g8ed1b


From 5689cc53fa9d09b5bf41b1b1a7c90bd6c112ab40 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 1 Jul 2010 16:00:12 +0200
Subject: KVM: Use u64 for frame data types

For 32bit machines where the physical address width is
larger than the virtual address width the frame number types
in KVM may overflow. Fix this by changing them to u64.

[sfr: fix build on 32-bit ppc]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/powerpc/kvm/44x_tlb.c | 3 ++-
 include/linux/kvm_types.h  | 4 ++--
 virt/kvm/iommu.c           | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 812312542e50..9b9b5cdea840 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -316,7 +316,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
 	gfn = gpaddr >> PAGE_SHIFT;
 	new_page = gfn_to_page(vcpu->kvm, gfn);
 	if (is_error_page(new_page)) {
-		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
+		printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
+			(unsigned long long)gfn);
 		kvm_release_page_clean(new_page);
 		return;
 	}
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index fb46efbeabec..7ac0d4eee430 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -32,11 +32,11 @@
 
 typedef unsigned long  gva_t;
 typedef u64            gpa_t;
-typedef unsigned long  gfn_t;
+typedef u64            gfn_t;
 
 typedef unsigned long  hva_t;
 typedef u64            hpa_t;
-typedef unsigned long  hfn_t;
+typedef u64            hfn_t;
 
 typedef hfn_t pfn_t;
 
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 779559552ce7..62a9caf0563c 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -108,7 +108,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 			      get_order(page_size), flags);
 		if (r) {
 			printk(KERN_ERR "kvm_iommu_map_address:"
-			       "iommu failed to map pfn=%lx\n", pfn);
+			       "iommu failed to map pfn=%llx\n", pfn);
 			goto unmap_pages;
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From edba23e51578f7cb6781461568489fc1825db4ac Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 7 Jul 2010 20:16:45 +0300
Subject: KVM: Return EFAULT from kvm ioctl when guest accesses bad area

Currently if guest access address that belongs to memory slot but is not
backed up by page or page is read only KVM treats it like MMIO access.
Remove that capability. It was never part of the interface and should
not be relied upon.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       |  4 +++-
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 28 ++++++++++++++++++++++++----
 3 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d8d48329cb82..89d7a2cae53b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2078,7 +2078,9 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	if (is_hwpoison_pfn(pfn)) {
 		kvm_send_hwpoison_signal(kvm, gfn);
 		return 0;
-	}
+	} else if (is_fault_pfn(pfn))
+		return -EFAULT;
+
 	return 1;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e796326f3646..8055067b6bec 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -269,6 +269,7 @@ extern pfn_t bad_pfn;
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
 int is_hwpoison_pfn(pfn_t pfn);
+int is_fault_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 630d1224f187..b78b794c1039 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -96,6 +96,9 @@ static bool largepages_enabled = true;
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
+static struct page *fault_page;
+static pfn_t fault_pfn;
+
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
@@ -815,13 +818,13 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
 int is_error_page(struct page *page)
 {
-	return page == bad_page || page == hwpoison_page;
+	return page == bad_page || page == hwpoison_page || page == fault_page;
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn || pfn == hwpoison_pfn;
+	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
@@ -831,6 +834,12 @@ int is_hwpoison_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
 
+int is_fault_pfn(pfn_t pfn)
+{
+	return pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_fault_pfn);
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
@@ -959,8 +968,8 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 		if (vma == NULL || addr < vma->vm_start ||
 		    !(vma->vm_flags & VM_PFNMAP)) {
 			up_read(&current->mm->mmap_sem);
-			get_page(bad_page);
-			return page_to_pfn(bad_page);
+			get_page(fault_page);
+			return page_to_pfn(fault_page);
 		}
 
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -2226,6 +2235,15 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 	hwpoison_pfn = page_to_pfn(hwpoison_page);
 
+	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (fault_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	fault_pfn = page_to_pfn(fault_page);
+
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
@@ -2298,6 +2316,8 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
+	if (fault_page)
+		__free_page(fault_page);
 	if (hwpoison_page)
 		__free_page(hwpoison_page);
 	__free_page(bad_page);
-- 
cgit v1.2.3-59-g8ed1b


From 4a994358b919c3b14de61be5e30d9edc9089ba3f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 11 Jul 2010 15:32:23 +0300
Subject: KVM: Convert mask notifiers to use irqchip/pin instead of gsi

Devices register mask notifier using gsi, but irqchip knows about
irqchip/pin, so conversion from irqchip/pin to gsi should be done before
looking for mask notifier to call.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/kvm_host.h |  3 ++-
 virt/kvm/ioapic.c        |  2 +-
 virt/kvm/irq_comm.c      | 12 ++++++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8055067b6bec..c13cc48697aa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -447,7 +447,8 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
 				    struct kvm_irq_mask_notifier *kimn);
 void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
-void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+			     bool mask);
 
 #ifdef __KVM_HAVE_IOAPIC
 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 1149c60b198f..0b9df8303dcf 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -152,7 +152,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 		update_handled_vectors(ioapic);
 		mask_after = e->fields.mask;
 		if (mask_before != mask_after)
-			kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
+			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
 		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
 		    && ioapic->irr & (1 << index))
 			ioapic_service(ioapic, index);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 06cf61e729d2..369e38010ad5 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -279,15 +279,19 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 	synchronize_rcu();
 }
 
-void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+			     bool mask)
 {
 	struct kvm_irq_mask_notifier *kimn;
 	struct hlist_node *n;
+	int gsi;
 
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
-		if (kimn->irq == irq)
-			kimn->func(kimn, mask);
+	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	if (gsi != -1)
+		hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
+			if (kimn->irq == gsi)
+				kimn->func(kimn, mask);
 	rcu_read_unlock();
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From ea0d3ab239fba48d6e998b19c28d78f765963007 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 2 Jun 2010 13:24:43 +0900
Subject: LSM: Remove unused arguments from security_path_truncate().

When commit be6d3e56a6b9b3a4ee44a0685e39e595073c6f0d "introduce new LSM hooks
where vfsmount is available." was proposed, regarding security_path_truncate(),
only "struct file *" argument (which AppArmor wanted to use) was removed.
But length and time_attrs arguments are not used by TOMOYO nor AppArmor.
Thus, let's remove these arguments.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/namei.c               |  3 +--
 fs/open.c                |  5 ++---
 include/linux/security.h | 11 +++--------
 security/capability.c    |  3 +--
 security/security.c      |  5 ++---
 security/tomoyo/tomoyo.c |  3 +--
 6 files changed, 10 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 868d0cb9d473..fe34c2b879f4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1484,8 +1484,7 @@ static int handle_truncate(struct path *path)
 	 */
 	error = locks_verify_locked(inode);
 	if (!error)
-		error = security_path_truncate(path, 0,
-				       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+		error = security_path_truncate(path);
 	if (!error) {
 		error = do_truncate(path->dentry, 0,
 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
diff --git a/fs/open.c b/fs/open.c
index 5463266db9e6..a54ed85209c1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -110,7 +110,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error)
-		error = security_path_truncate(&path, length, 0);
+		error = security_path_truncate(&path);
 	if (!error)
 		error = do_truncate(path.dentry, length, 0, NULL);
 
@@ -165,8 +165,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = security_path_truncate(&file->f_path, length,
-					       ATTR_MTIME|ATTR_CTIME);
+		error = security_path_truncate(&file->f_path);
 	if (!error)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
diff --git a/include/linux/security.h b/include/linux/security.h
index 0c8819170463..723a93df756a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -470,8 +470,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  * @path_truncate:
  *	Check permission before truncating a file.
  *	@path contains the path structure for the file.
- *	@length is the new length of the file.
- *	@time_attrs is the flags passed to do_truncate().
  *	Return 0 if permission is granted.
  * @inode_getattr:
  *	Check permission before obtaining file attributes.
@@ -1412,8 +1410,7 @@ struct security_operations {
 	int (*path_rmdir) (struct path *dir, struct dentry *dentry);
 	int (*path_mknod) (struct path *dir, struct dentry *dentry, int mode,
 			   unsigned int dev);
-	int (*path_truncate) (struct path *path, loff_t length,
-			      unsigned int time_attrs);
+	int (*path_truncate) (struct path *path);
 	int (*path_symlink) (struct path *dir, struct dentry *dentry,
 			     const char *old_name);
 	int (*path_link) (struct dentry *old_dentry, struct path *new_dir,
@@ -2806,8 +2803,7 @@ int security_path_mkdir(struct path *dir, struct dentry *dentry, int mode);
 int security_path_rmdir(struct path *dir, struct dentry *dentry);
 int security_path_mknod(struct path *dir, struct dentry *dentry, int mode,
 			unsigned int dev);
-int security_path_truncate(struct path *path, loff_t length,
-			   unsigned int time_attrs);
+int security_path_truncate(struct path *path);
 int security_path_symlink(struct path *dir, struct dentry *dentry,
 			  const char *old_name);
 int security_path_link(struct dentry *old_dentry, struct path *new_dir,
@@ -2841,8 +2837,7 @@ static inline int security_path_mknod(struct path *dir, struct dentry *dentry,
 	return 0;
 }
 
-static inline int security_path_truncate(struct path *path, loff_t length,
-					 unsigned int time_attrs)
+static inline int security_path_truncate(struct path *path)
 {
 	return 0;
 }
diff --git a/security/capability.c b/security/capability.c
index 8168e3ecd5bf..4aeb699da1b3 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -268,8 +268,7 @@ static int cap_path_rename(struct path *old_path, struct dentry *old_dentry,
 	return 0;
 }
 
-static int cap_path_truncate(struct path *path, loff_t length,
-			     unsigned int time_attrs)
+static int cap_path_truncate(struct path *path)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 351942a4ca0e..e8c87b8601b4 100644
--- a/security/security.c
+++ b/security/security.c
@@ -417,12 +417,11 @@ int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
 					 new_dentry);
 }
 
-int security_path_truncate(struct path *path, loff_t length,
-			   unsigned int time_attrs)
+int security_path_truncate(struct path *path)
 {
 	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
 		return 0;
-	return security_ops->path_truncate(path, length, time_attrs);
+	return security_ops->path_truncate(path);
 }
 
 int security_path_chmod(struct dentry *dentry, struct vfsmount *mnt,
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 57d442e7339b..7be732cadd47 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -93,8 +93,7 @@ static int tomoyo_bprm_check_security(struct linux_binprm *bprm)
 	return tomoyo_check_open_permission(domain, &bprm->file->f_path, O_RDONLY);
 }
 
-static int tomoyo_path_truncate(struct path *path, loff_t length,
-				unsigned int time_attrs)
+static int tomoyo_path_truncate(struct path *path)
 {
 	return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path);
 }
-- 
cgit v1.2.3-59-g8ed1b


From af4f136056c984b0aa67feed7d3170b958370b2f Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Thu, 1 Jul 2010 15:07:43 -0400
Subject: security: move LSM xattrnames to xattr.h

Make the security extended attributes names global. Updated to move
the remaining Smack xattrs.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/capability.h |  3 ---
 include/linux/xattr.h      | 14 ++++++++++++++
 security/selinux/hooks.c   |  3 ---
 security/smack/smack.h     | 10 ----------
 4 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 39e5ff512fbe..90012b9ddbf3 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -49,9 +49,6 @@ typedef struct __user_cap_data_struct {
 } __user *cap_user_data_t;
 
 
-#define XATTR_CAPS_SUFFIX "capability"
-#define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
-
 #define VFS_CAP_REVISION_MASK	0xFF000000
 #define VFS_CAP_REVISION_SHIFT	24
 #define VFS_CAP_FLAGS_MASK	~VFS_CAP_REVISION_MASK
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 0cfa1e9c4cc1..f1e5bde4b35a 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -33,6 +33,20 @@
 #define XATTR_USER_PREFIX "user."
 #define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1)
 
+/* Security namespace */
+#define XATTR_SELINUX_SUFFIX "selinux"
+#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
+
+#define XATTR_SMACK_SUFFIX "SMACK64"
+#define XATTR_SMACK_IPIN "SMACK64IPIN"
+#define XATTR_SMACK_IPOUT "SMACK64IPOUT"
+#define XATTR_NAME_SMACK XATTR_SECURITY_PREFIX XATTR_SMACK_SUFFIX
+#define XATTR_NAME_SMACKIPIN	XATTR_SECURITY_PREFIX XATTR_SMACK_IPIN
+#define XATTR_NAME_SMACKIPOUT	XATTR_SECURITY_PREFIX XATTR_SMACK_IPOUT
+
+#define XATTR_CAPS_SUFFIX "capability"
+#define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
+
 struct inode;
 struct dentry;
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 0f524b7d102e..85338f0c0481 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -87,9 +87,6 @@
 #include "netlabel.h"
 #include "audit.h"
 
-#define XATTR_SELINUX_SUFFIX "selinux"
-#define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
-
 #define NUM_SEL_MNT_OPTS 5
 
 extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm);
diff --git a/security/smack/smack.h b/security/smack/smack.h
index c6e9acae72e4..43ae747a5aa4 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -123,16 +123,6 @@ struct smack_known {
 #define SMK_FSHAT	"smackfshat="
 #define SMK_FSROOT	"smackfsroot="
 
-/*
- * xattr names
- */
-#define XATTR_SMACK_SUFFIX	"SMACK64"
-#define XATTR_SMACK_IPIN	"SMACK64IPIN"
-#define XATTR_SMACK_IPOUT	"SMACK64IPOUT"
-#define XATTR_NAME_SMACK	XATTR_SECURITY_PREFIX XATTR_SMACK_SUFFIX
-#define XATTR_NAME_SMACKIPIN	XATTR_SECURITY_PREFIX XATTR_SMACK_IPIN
-#define XATTR_NAME_SMACKIPOUT	XATTR_SECURITY_PREFIX XATTR_SMACK_IPOUT
-
 #define SMACK_CIPSO_OPTION 	"-CIPSO"
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 9cfcac810e8993fa7a5bfd24b1a21f1dbbb03a7b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 23 Jul 2010 11:43:51 -0400
Subject: vfs: re-introduce MAY_CHDIR

Currently MAY_ACCESS means that filesystems must check the permissions
right then and not rely on cached results or the results of future
operations on the object.  This can be because of a call to sys_access() or
because of a call to chdir() which needs to check search without relying on
any future operations inside that dir.  I plan to use MAY_ACCESS for other
purposes in the security system, so I split the MAY_ACCESS and the
MAY_CHDIR cases.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by:  Stephen D. Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/fuse/dir.c      | 2 +-
 fs/nfs/dir.c       | 2 +-
 fs/open.c          | 6 +++---
 include/linux/fs.h | 1 +
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3cdc5f78a406..431be0795b6b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask)
 		   exist.  So if permissions are revoked this won't be
 		   noticed immediately, only after the attribute
 		   timeout has expired */
-	} else if (mask & MAY_ACCESS) {
+	} else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
 		err = fuse_access(inode, mask);
 	} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		if (!(inode->i_mode & S_IXUGO)) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e60416d3f818..832e9e239324 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1953,7 +1953,7 @@ int nfs_permission(struct inode *inode, int mask)
 	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 		goto out;
 	/* Is this sys_access() ? */
-	if (mask & MAY_ACCESS)
+	if (mask & (MAY_ACCESS | MAY_CHDIR))
 		goto force_lookup;
 
 	switch (inode->i_mode & S_IFMT) {
diff --git a/fs/open.c b/fs/open.c
index a54ed85209c1..0d1fa3dc0efb 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -366,7 +366,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -395,7 +395,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 	if (!S_ISDIR(inode->i_mode))
 		goto out_putf;
 
-	error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 	if (!error)
 		set_fs_pwd(current->fs, &file->f_path);
 out_putf:
@@ -413,7 +413,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68ca1b0491af..7d94b72f0346 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -53,6 +53,7 @@ struct inodes_stat_t {
 #define MAY_APPEND 8
 #define MAY_ACCESS 16
 #define MAY_OPEN 32
+#define MAY_CHDIR 64
 
 /*
  * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
-- 
cgit v1.2.3-59-g8ed1b


From b782e0a68d17894d9a618ffea55b33639faa6bb4 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 23 Jul 2010 11:44:03 -0400
Subject: SELinux: special dontaudit for access checks

Currently there are a number of applications (nautilus being the main one) which
calls access() on files in order to determine how they should be displayed.  It
is normal and expected that nautilus will want to see if files are executable
or if they are really read/write-able.  access() should return the real
permission.  SELinux policy checks are done in access() and can result in lots
of AVC denials as policy denies RWX on files which DAC allows.  Currently
SELinux must dontaudit actual attempts to read/write/execute a file in
order to silence these messages (and not flood the logs.)  But dontaudit rules
like that can hide real attacks.  This patch addes a new common file
permission audit_access.  This permission is special in that it is meaningless
and should never show up in an allow rule.  Instead the only place this
permission has meaning is in a dontaudit rule like so:

dontaudit nautilus_t sbin_t:file audit_access

With such a rule if nautilus just checks access() we will still get denied and
thus userspace will still get the correct answer but we will not log the denial.
If nautilus attempted to actually perform one of the forbidden actions
(rather than just querying access(2) about it) we would still log a denial.
This type of dontaudit rule should be used sparingly, as it could be a
method for an attacker to probe the system permissions without detection.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by:  Stephen D. Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/lsm_audit.h           |  5 +++++
 security/selinux/avc.c              | 24 ++++++++++++++++++++++--
 security/selinux/hooks.c            | 20 +++++++++++++++-----
 security/selinux/include/classmap.h |  2 +-
 4 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 6907251d5200..788f0ab937aa 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -90,6 +90,11 @@ struct common_audit_data {
 			u32 requested;
 			u32 audited;
 			u32 denied;
+			/*
+			 * auditdeny is a bit tricky and unintuitive.  See the
+			 * comments in avc.c for it's meaning and usage.
+			 */
+			u32 auditdeny;
 			struct av_decision *avd;
 			int result;
 		} selinux_audit_data;
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 3662b0f15ec5..9da6420e2056 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -488,9 +488,29 @@ void avc_audit(u32 ssid, u32 tsid,
 	struct common_audit_data stack_data;
 	u32 denied, audited;
 	denied = requested & ~avd->allowed;
-	if (denied)
+	if (denied) {
 		audited = denied & avd->auditdeny;
-	else if (result)
+		/*
+		 * a->selinux_audit_data.auditdeny is TRICKY!  Setting a bit in
+		 * this field means that ANY denials should NOT be audited if
+		 * the policy contains an explicit dontaudit rule for that
+		 * permission.  Take notice that this is unrelated to the
+		 * actual permissions that were denied.  As an example lets
+		 * assume:
+		 *
+		 * denied == READ
+		 * avd.auditdeny & ACCESS == 0 (not set means explicit rule)
+		 * selinux_audit_data.auditdeny & ACCESS == 1
+		 *
+		 * We will NOT audit the denial even though the denied
+		 * permission was READ and the auditdeny checks were for
+		 * ACCESS
+		 */
+		if (a &&
+		    a->selinux_audit_data.auditdeny &&
+		    !(a->selinux_audit_data.auditdeny & avd->auditdeny))
+			audited = 0;
+	} else if (result)
 		audited = denied = requested;
 	else
 		audited = requested & avd->auditallow;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 0c98846f188d..650947a72a2b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2644,16 +2644,26 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct nameidata *na
 static int selinux_inode_permission(struct inode *inode, int mask)
 {
 	const struct cred *cred = current_cred();
+	struct common_audit_data ad;
+	u32 perms;
+	bool from_access;
 
+	from_access = mask & MAY_ACCESS;
 	mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);
 
-	if (!mask) {
-		/* No permission to check.  Existence test. */
+	/* No permission to check.  Existence test. */
+	if (!mask)
 		return 0;
-	}
 
-	return inode_has_perm(cred, inode,
-			      file_mask_to_av(inode->i_mode, mask), NULL);
+	COMMON_AUDIT_DATA_INIT(&ad, FS);
+	ad.u.fs.inode = inode;
+
+	if (from_access)
+		ad.selinux_audit_data.auditdeny |= FILE__AUDIT_ACCESS;
+
+	perms = file_mask_to_av(inode->i_mode, mask);
+
+	return inode_has_perm(cred, inode, perms, &ad);
 }
 
 static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr)
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 8b32e959bb2e..d64603e10dbe 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -2,7 +2,7 @@
     "getattr", "setattr", "lock", "relabelfrom", "relabelto", "append"
 
 #define COMMON_FILE_PERMS COMMON_FILE_SOCK_PERMS, "unlink", "link", \
-    "rename", "execute", "swapon", "quotaon", "mounton"
+    "rename", "execute", "swapon", "quotaon", "mounton", "audit_access"
 
 #define COMMON_SOCK_PERMS COMMON_FILE_SOCK_PERMS, "bind", "connect", \
     "listen", "accept", "getopt", "setopt", "shutdown", "recvfrom",  \
-- 
cgit v1.2.3-59-g8ed1b


From 67012e8209df95a8290d135753ff5145431a666e Mon Sep 17 00:00:00 2001
From: John Johansen <john.johansen@canonical.com>
Date: Thu, 29 Jul 2010 14:47:58 -0700
Subject: AppArmor: basic auditing infrastructure.

Update lsm_audit for AppArmor specific data, and add the core routines for
AppArmor uses for auditing.

Signed-off-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/lsm_audit.h         |  27 +++++
 security/apparmor/audit.c         | 215 ++++++++++++++++++++++++++++++++++++++
 security/apparmor/include/audit.h | 123 ++++++++++++++++++++++
 3 files changed, 365 insertions(+)
 create mode 100644 security/apparmor/audit.c
 create mode 100644 security/apparmor/include/audit.h

(limited to 'include/linux')

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 788f0ab937aa..112a55033352 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -98,6 +98,33 @@ struct common_audit_data {
 			struct av_decision *avd;
 			int result;
 		} selinux_audit_data;
+#endif
+#ifdef CONFIG_SECURITY_APPARMOR
+		struct {
+			int error;
+			int op;
+			int type;
+			void *profile;
+			const char *name;
+			const char *info;
+			union {
+				void *target;
+				struct {
+					long pos;
+					void *target;
+				} iface;
+				struct {
+					int rlim;
+					unsigned long max;
+				} rlim;
+				struct {
+					const char *target;
+					u32 request;
+					u32 denied;
+					uid_t ouid;
+				} fs;
+			};
+		} apparmor_audit_data;
 #endif
 	};
 	/* these callback will be implemented by a specific LSM */
diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c
new file mode 100644
index 000000000000..96502b22b268
--- /dev/null
+++ b/security/apparmor/audit.c
@@ -0,0 +1,215 @@
+/*
+ * AppArmor security module
+ *
+ * This file contains AppArmor auditing functions
+ *
+ * Copyright (C) 1998-2008 Novell/SUSE
+ * Copyright 2009-2010 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/audit.h>
+#include <linux/socket.h>
+
+#include "include/apparmor.h"
+#include "include/audit.h"
+#include "include/policy.h"
+
+const char *op_table[] = {
+	"null",
+
+	"sysctl",
+	"capable",
+
+	"unlink",
+	"mkdir",
+	"rmdir",
+	"mknod",
+	"truncate",
+	"link",
+	"symlink",
+	"rename_src",
+	"rename_dest",
+	"chmod",
+	"chown",
+	"getattr",
+	"open",
+
+	"file_perm",
+	"file_lock",
+	"file_mmap",
+	"file_mprotect",
+
+	"create",
+	"post_create",
+	"bind",
+	"connect",
+	"listen",
+	"accept",
+	"sendmsg",
+	"recvmsg",
+	"getsockname",
+	"getpeername",
+	"getsockopt",
+	"setsockopt",
+	"socket_shutdown",
+
+	"ptrace",
+
+	"exec",
+	"change_hat",
+	"change_profile",
+	"change_onexec",
+
+	"setprocattr",
+	"setrlimit",
+
+	"profile_replace",
+	"profile_load",
+	"profile_remove"
+};
+
+const char *audit_mode_names[] = {
+	"normal",
+	"quiet_denied",
+	"quiet",
+	"noquiet",
+	"all"
+};
+
+static char *aa_audit_type[] = {
+	"AUDIT",
+	"ALLOWED",
+	"DENIED",
+	"HINT",
+	"STATUS",
+	"ERROR",
+	"KILLED"
+};
+
+/*
+ * Currently AppArmor auditing is fed straight into the audit framework.
+ *
+ * TODO:
+ * netlink interface for complain mode
+ * user auditing, - send user auditing to netlink interface
+ * system control of whether user audit messages go to system log
+ */
+
+/**
+ * audit_base - core AppArmor function.
+ * @ab: audit buffer to fill (NOT NULL)
+ * @ca: audit structure containing data to audit (NOT NULL)
+ *
+ * Record common AppArmor audit data from @sa
+ */
+static void audit_pre(struct audit_buffer *ab, void *ca)
+{
+	struct common_audit_data *sa = ca;
+	struct task_struct *tsk = sa->tsk ? sa->tsk : current;
+
+	if (aa_g_audit_header) {
+		audit_log_format(ab, "apparmor=");
+		audit_log_string(ab, aa_audit_type[sa->aad.type]);
+	}
+
+	if (sa->aad.op) {
+		audit_log_format(ab, " operation=");
+		audit_log_string(ab, op_table[sa->aad.op]);
+	}
+
+	if (sa->aad.info) {
+		audit_log_format(ab, " info=");
+		audit_log_string(ab, sa->aad.info);
+		if (sa->aad.error)
+			audit_log_format(ab, " error=%d", sa->aad.error);
+	}
+
+	if (sa->aad.profile) {
+		struct aa_profile *profile = sa->aad.profile;
+		pid_t pid;
+		rcu_read_lock();
+		pid = tsk->real_parent->pid;
+		rcu_read_unlock();
+		audit_log_format(ab, " parent=%d", pid);
+		if (profile->ns != root_ns) {
+			audit_log_format(ab, " namespace=");
+			audit_log_untrustedstring(ab, profile->ns->base.hname);
+		}
+		audit_log_format(ab, " profile=");
+		audit_log_untrustedstring(ab, profile->base.hname);
+	}
+
+	if (sa->aad.name) {
+		audit_log_format(ab, " name=");
+		audit_log_untrustedstring(ab, sa->aad.name);
+	}
+}
+
+/**
+ * aa_audit_msg - Log a message to the audit subsystem
+ * @sa: audit event structure (NOT NULL)
+ * @cb: optional callback fn for type specific fields (MAYBE NULL)
+ */
+void aa_audit_msg(int type, struct common_audit_data *sa,
+		  void (*cb) (struct audit_buffer *, void *))
+{
+	sa->aad.type = type;
+	sa->lsm_pre_audit = audit_pre;
+	sa->lsm_post_audit = cb;
+	common_lsm_audit(sa);
+}
+
+/**
+ * aa_audit - Log a profile based audit event to the audit subsystem
+ * @type: audit type for the message
+ * @profile: profile to check against (NOT NULL)
+ * @gfp: allocation flags to use
+ * @sa: audit event (NOT NULL)
+ * @cb: optional callback fn for type specific fields (MAYBE NULL)
+ *
+ * Handle default message switching based off of audit mode flags
+ *
+ * Returns: error on failure
+ */
+int aa_audit(int type, struct aa_profile *profile, gfp_t gfp,
+	     struct common_audit_data *sa,
+	     void (*cb) (struct audit_buffer *, void *))
+{
+	BUG_ON(!profile);
+
+	if (type == AUDIT_APPARMOR_AUTO) {
+		if (likely(!sa->aad.error)) {
+			if (AUDIT_MODE(profile) != AUDIT_ALL)
+				return 0;
+			type = AUDIT_APPARMOR_AUDIT;
+		} else if (COMPLAIN_MODE(profile))
+			type = AUDIT_APPARMOR_ALLOWED;
+		else
+			type = AUDIT_APPARMOR_DENIED;
+	}
+	if (AUDIT_MODE(profile) == AUDIT_QUIET ||
+	    (type == AUDIT_APPARMOR_DENIED &&
+	     AUDIT_MODE(profile) == AUDIT_QUIET))
+		return sa->aad.error;
+
+	if (KILL_MODE(profile) && type == AUDIT_APPARMOR_DENIED)
+		type = AUDIT_APPARMOR_KILL;
+
+	if (!unconfined(profile))
+		sa->aad.profile = profile;
+
+	aa_audit_msg(type, sa, cb);
+
+	if (sa->aad.type == AUDIT_APPARMOR_KILL)
+		(void)send_sig_info(SIGKILL, NULL, sa->tsk ? sa->tsk : current);
+
+	if (sa->aad.type == AUDIT_APPARMOR_ALLOWED)
+		return complain_error(sa->aad.error);
+
+	return sa->aad.error;
+}
diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h
new file mode 100644
index 000000000000..1951786d32e9
--- /dev/null
+++ b/security/apparmor/include/audit.h
@@ -0,0 +1,123 @@
+/*
+ * AppArmor security module
+ *
+ * This file contains AppArmor auditing function definitions.
+ *
+ * Copyright (C) 1998-2008 Novell/SUSE
+ * Copyright 2009-2010 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#ifndef __AA_AUDIT_H
+#define __AA_AUDIT_H
+
+#include <linux/audit.h>
+#include <linux/fs.h>
+#include <linux/lsm_audit.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "file.h"
+
+struct aa_profile;
+
+extern const char *audit_mode_names[];
+#define AUDIT_MAX_INDEX 5
+
+#define AUDIT_APPARMOR_AUTO 0	/* auto choose audit message type */
+
+enum audit_mode {
+	AUDIT_NORMAL,		/* follow normal auditing of accesses */
+	AUDIT_QUIET_DENIED,	/* quiet all denied access messages */
+	AUDIT_QUIET,		/* quiet all messages */
+	AUDIT_NOQUIET,		/* do not quiet audit messages */
+	AUDIT_ALL		/* audit all accesses */
+};
+
+enum audit_type {
+	AUDIT_APPARMOR_AUDIT,
+	AUDIT_APPARMOR_ALLOWED,
+	AUDIT_APPARMOR_DENIED,
+	AUDIT_APPARMOR_HINT,
+	AUDIT_APPARMOR_STATUS,
+	AUDIT_APPARMOR_ERROR,
+	AUDIT_APPARMOR_KILL
+};
+
+extern const char *op_table[];
+enum aa_ops {
+	OP_NULL,
+
+	OP_SYSCTL,
+	OP_CAPABLE,
+
+	OP_UNLINK,
+	OP_MKDIR,
+	OP_RMDIR,
+	OP_MKNOD,
+	OP_TRUNC,
+	OP_LINK,
+	OP_SYMLINK,
+	OP_RENAME_SRC,
+	OP_RENAME_DEST,
+	OP_CHMOD,
+	OP_CHOWN,
+	OP_GETATTR,
+	OP_OPEN,
+
+	OP_FPERM,
+	OP_FLOCK,
+	OP_FMMAP,
+	OP_FMPROT,
+
+	OP_CREATE,
+	OP_POST_CREATE,
+	OP_BIND,
+	OP_CONNECT,
+	OP_LISTEN,
+	OP_ACCEPT,
+	OP_SENDMSG,
+	OP_RECVMSG,
+	OP_GETSOCKNAME,
+	OP_GETPEERNAME,
+	OP_GETSOCKOPT,
+	OP_SETSOCKOPT,
+	OP_SOCK_SHUTDOWN,
+
+	OP_PTRACE,
+
+	OP_EXEC,
+	OP_CHANGE_HAT,
+	OP_CHANGE_PROFILE,
+	OP_CHANGE_ONEXEC,
+
+	OP_SETPROCATTR,
+	OP_SETRLIMIT,
+
+	OP_PROF_REPL,
+	OP_PROF_LOAD,
+	OP_PROF_RM,
+};
+
+
+/* define a short hand for apparmor_audit_data portion of common_audit_data */
+#define aad apparmor_audit_data
+
+void aa_audit_msg(int type, struct common_audit_data *sa,
+		  void (*cb) (struct audit_buffer *, void *));
+int aa_audit(int type, struct aa_profile *profile, gfp_t gfp,
+	     struct common_audit_data *sa,
+	     void (*cb) (struct audit_buffer *, void *));
+
+static inline int complain_error(int error)
+{
+	if (error == -EPERM || error == -EACCES)
+		return 0;
+	return error;
+}
+
+#endif /* __AA_AUDIT_H */
-- 
cgit v1.2.3-59-g8ed1b


From 9938424f0c4d208883cbf32083ec2bfcc220f85b Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Mon, 14 Jun 2010 18:10:33 +0200
Subject: mtd: add an ioctl to query the lock status of a flash sector

This patchs adds a way for user space programs to find out whether a
flash sector is locked. An optional driver method in the mtd_info struct
provides the information.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/chips/cfi_cmdset_0001.c | 10 ++++++++++
 drivers/mtd/mtdchar.c               | 14 ++++++++++++++
 drivers/mtd/mtdpart.c               | 10 ++++++++++
 include/linux/mtd/mtd.h             |  1 +
 include/mtd/mtd-abi.h               |  1 +
 5 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 62f3ea9de848..2fadb0239ba3 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -63,6 +63,8 @@ static int cfi_intelext_erase_varsize(struct mtd_info *, struct erase_info *);
 static void cfi_intelext_sync (struct mtd_info *);
 static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
 static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
+static int cfi_intelext_is_locked(struct mtd_info *mtd, loff_t ofs,
+				  uint64_t len);
 #ifdef CONFIG_MTD_OTP
 static int cfi_intelext_read_fact_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
 static int cfi_intelext_read_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
@@ -448,6 +450,7 @@ struct mtd_info *cfi_cmdset_0001(struct map_info *map, int primary)
 	mtd->sync    = cfi_intelext_sync;
 	mtd->lock    = cfi_intelext_lock;
 	mtd->unlock  = cfi_intelext_unlock;
+	mtd->is_locked = cfi_intelext_is_locked;
 	mtd->suspend = cfi_intelext_suspend;
 	mtd->resume  = cfi_intelext_resume;
 	mtd->flags   = MTD_CAP_NORFLASH;
@@ -2139,6 +2142,13 @@ static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len)
 	return ret;
 }
 
+static int cfi_intelext_is_locked(struct mtd_info *mtd, loff_t ofs,
+				  uint64_t len)
+{
+	return cfi_varsize_frob(mtd, do_getlockstatus_oneblock,
+				ofs, len, NULL) ? 1 : 0;
+}
+
 #ifdef CONFIG_MTD_OTP
 
 typedef int (*otp_op_t)(struct map_info *map, struct flchip *chip,
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 8b223c0343ee..a8e69dd2b2e4 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -676,6 +676,20 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		break;
 	}
 
+	case MEMISLOCKED:
+	{
+		struct erase_info_user einfo;
+
+		if (copy_from_user(&einfo, argp, sizeof(einfo)))
+			return -EFAULT;
+
+		if (!mtd->is_locked)
+			ret = -EOPNOTSUPP;
+		else
+			ret = mtd->is_locked(mtd, einfo.start, einfo.length);
+		break;
+	}
+
 	/* Legacy interface */
 	case MEMGETOOBSEL:
 	{
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index b8043a9ba32d..4c539ded0b70 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -264,6 +264,14 @@ static int part_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len)
 	return part->master->unlock(part->master, ofs + part->offset, len);
 }
 
+static int part_is_locked(struct mtd_info *mtd, loff_t ofs, uint64_t len)
+{
+	struct mtd_part *part = PART(mtd);
+	if ((len + ofs) > mtd->size)
+		return -EINVAL;
+	return part->master->is_locked(part->master, ofs + part->offset, len);
+}
+
 static void part_sync(struct mtd_info *mtd)
 {
 	struct mtd_part *part = PART(mtd);
@@ -402,6 +410,8 @@ static struct mtd_part *add_one_partition(struct mtd_info *master,
 		slave->mtd.lock = part_lock;
 	if (master->unlock)
 		slave->mtd.unlock = part_unlock;
+	if (master->is_locked)
+		slave->mtd.is_locked = part_is_locked;
 	if (master->block_isbad)
 		slave->mtd.block_isbad = part_block_isbad;
 	if (master->block_markbad)
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 5326435a7571..43b7d72c6116 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -216,6 +216,7 @@ struct mtd_info {
 	/* Chip-supported device locking */
 	int (*lock) (struct mtd_info *mtd, loff_t ofs, uint64_t len);
 	int (*unlock) (struct mtd_info *mtd, loff_t ofs, uint64_t len);
+	int (*is_locked) (struct mtd_info *mtd, loff_t ofs, uint64_t len);
 
 	/* Power Management functions */
 	int (*suspend) (struct mtd_info *mtd);
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index be51ae2bd0ff..e12872e3c694 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -110,6 +110,7 @@ struct otp_info {
 #define MEMERASE64		_IOW('M', 20, struct erase_info_user64)
 #define MEMWRITEOOB64		_IOWR('M', 21, struct mtd_oob_buf64)
 #define MEMREADOOB64		_IOWR('M', 22, struct mtd_oob_buf64)
+#define MEMISLOCKED		_IOR('M', 23, struct erase_info_user)
 
 /*
  * Obsolete legacy interface. Keep it in order not to break userspace
-- 
cgit v1.2.3-59-g8ed1b


From 30fe8115b55223cb84530ce04c4a20ba9d6dcf0b Mon Sep 17 00:00:00 2001
From: Brian Norris <norris@broadcom.com>
Date: Wed, 23 Jun 2010 13:36:02 -0700
Subject: mtd: nand: edit macro flag for BBT scan of last page in block

NAND_BB_LAST_PAGE used to be in nand.h, but it pertained to bad block
management and so belongs next to NAND_BBT_SCAN2NDPAGE in bbm.h. Also,
its previous flag value (0x00000400) conflicted with NAND_BBT_SCANALLPAGES
so I changed its value to 0x00008000. All uses of the name were modified to
provide consistency with other "NAND_BBT_*" flags.

Signed-off-by: Brian Norris <norris@broadcom.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/nand_base.c | 6 +++---
 drivers/mtd/nand/nand_bbt.c  | 2 +-
 include/linux/mtd/bbm.h      | 2 ++
 include/linux/mtd/nand.h     | 2 --
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 4a7b86423ee9..e6cf9aefef13 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -347,7 +347,7 @@ static int nand_block_bad(struct mtd_info *mtd, loff_t ofs, int getchip)
 	struct nand_chip *chip = mtd->priv;
 	u16 bad;
 
-	if (chip->options & NAND_BB_LAST_PAGE)
+	if (chip->options & NAND_BBT_SCANLASTPAGE)
 		ofs += mtd->erasesize - mtd->writesize;
 
 	page = (int)(ofs >> chip->page_shift) & chip->pagemask;
@@ -399,7 +399,7 @@ static int nand_default_block_markbad(struct mtd_info *mtd, loff_t ofs)
 	uint8_t buf[2] = { 0, 0 };
 	int block, ret;
 
-	if (chip->options & NAND_BB_LAST_PAGE)
+	if (chip->options & NAND_BBT_SCANLASTPAGE)
 		ofs += mtd->erasesize - mtd->writesize;
 
 	/* Get block number */
@@ -2946,7 +2946,7 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd,
 	if ((chip->cellinfo & NAND_CI_CELLTYPE_MSK) &&
 			(*maf_id == NAND_MFR_SAMSUNG ||
 			 *maf_id == NAND_MFR_HYNIX))
-		chip->options |= NAND_BB_LAST_PAGE;
+		chip->options |= NAND_BBT_SCANLASTPAGE;
 
 	/* Check for AND chips with 4 page planes */
 	if (chip->options & NAND_4PAGE_ARRAY)
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index ad97c0ce73b2..71d83be24ff6 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -432,7 +432,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf,
 		from = (loff_t)startblock << (this->bbt_erase_shift - 1);
 	}
 
-	if (this->options & NAND_BB_LAST_PAGE)
+	if (this->options & NAND_BBT_SCANLASTPAGE)
 		from += mtd->erasesize - (mtd->writesize * len);
 
 	for (i = startblock; i < numblocks;) {
diff --git a/include/linux/mtd/bbm.h b/include/linux/mtd/bbm.h
index 9c3757c5759d..8ad0b8629c3f 100644
--- a/include/linux/mtd/bbm.h
+++ b/include/linux/mtd/bbm.h
@@ -82,6 +82,8 @@ struct nand_bbt_descr {
 #define NAND_BBT_SAVECONTENT	0x00002000
 /* Search good / bad pattern on the first and the second page */
 #define NAND_BBT_SCAN2NDPAGE	0x00004000
+/* Search good / bad pattern on the last page of the eraseblock */
+#define NAND_BBT_SCANLASTPAGE	0x00008000
 
 /* The maximum number of blocks to scan for a bbt */
 #define NAND_BBT_SCAN_MAXBLOCKS	4
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index a81b185e23a7..50f3aa00a452 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -181,8 +181,6 @@ typedef enum {
 #define NAND_NO_READRDY		0x00000100
 /* Chip does not allow subpage writes */
 #define NAND_NO_SUBPAGE_WRITE	0x00000200
-/* Chip stores bad block marker on the last page of the eraseblock */
-#define NAND_BB_LAST_PAGE	0x00000400
 
 /* Device is one of 'new' xD cards that expose fake nand command set */
 #define NAND_BROKEN_XD		0x00000400
-- 
cgit v1.2.3-59-g8ed1b


From 58373ff0afff4cc8ac40608872995f4d87eb72ec Mon Sep 17 00:00:00 2001
From: Brian Norris <norris@broadcom.com>
Date: Thu, 15 Jul 2010 12:15:44 -0700
Subject: mtd: nand: more BB Detection refactoring and dynamic scan options

This is a revision to PATCH 2/2 that I sent. Link:
http://lists.infradead.org/pipermail/linux-mtd/2010-July/030911.html

Added new flag for scanning of both bytes 1 and 6 of the OOB for
a BB marker (instead of simply one or the other).

The "check_pattern" and "check_short_pattern" functions were updated
to include support for scanning the two different locations in the OOB.

In order to handle increases in variety of necessary scanning patterns,
I implemented dynamic memory allocation of nand_bbt_descr structs
in new function 'nand_create_default_bbt_descr()'. This replaces
some increasingly-unwieldy, statically-declared descriptors. It can
replace several more (e.g. "flashbased" structs). However, I do not
test the flashbased options personally.

How this was tested:

I referenced 30+ data sheets (covering 100+ parts), and I tested a
selection of 10 different chips to varying degrees. Particularly, I
tested the creation of bad-block descriptors and basic BB scanning on
three parts:

ST NAND04GW3B2D, 2K page
ST NAND128W3A, 512B page
Samsung K9F1G08U0A, 2K page

To test these, I wrote some fake bad block markers to the flash (in OOB
bytes 1, 6, and elsewhere) to see if the scanning routine would detect
them properly. However, this method was somewhat limited because the
driver I am using has some bugs in its OOB write functionality.

Signed-off-by: Brian Norris <norris@broadcom.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/nand/nand_base.c |  14 +++++
 drivers/mtd/nand/nand_bbt.c  | 127 ++++++++++++++++++++++++-------------------
 include/linux/mtd/bbm.h      |   4 ++
 3 files changed, 90 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index bd697909db53..c2901bd126f9 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -2963,6 +2963,15 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd,
 			 *maf_id == NAND_MFR_MICRON))
 		chip->options |= NAND_BBT_SCAN2NDPAGE;
 
+	/*
+	 * Numonyx/ST 2K pages, x8 bus use BOTH byte 1 and 6
+	 */
+	if (!(busw & NAND_BUSWIDTH_16) &&
+			*maf_id == NAND_MFR_STMICRO &&
+			mtd->writesize == 2048) {
+		chip->options |= NAND_BBT_SCANBYTE1AND6;
+		chip->badblockpos = 0;
+	}
 
 	/* Check for AND chips with 4 page planes */
 	if (chip->options & NAND_4PAGE_ARRAY)
@@ -3322,6 +3331,11 @@ void nand_release(struct mtd_info *mtd)
 	kfree(chip->bbt);
 	if (!(chip->options & NAND_OWN_BUFFERS))
 		kfree(chip->buffers);
+
+	/* Free bad block descriptor memory */
+	if (chip->badblock_pattern && chip->badblock_pattern->options
+			& NAND_BBT_DYNAMICSTRUCT)
+		kfree(chip->badblock_pattern);
 }
 
 EXPORT_SYMBOL_GPL(nand_lock);
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index ec1700eaf198..469de17107e5 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -93,6 +93,28 @@ static int check_pattern(uint8_t *buf, int len, int paglen, struct nand_bbt_desc
 			return -1;
 	}
 
+	/* Check both positions 1 and 6 for pattern? */
+	if (td->options & NAND_BBT_SCANBYTE1AND6) {
+		if (td->options & NAND_BBT_SCANEMPTY) {
+			p += td->len;
+			end += NAND_SMALL_BADBLOCK_POS - td->offs;
+			/* Check region between positions 1 and 6 */
+			for (i = 0; i < NAND_SMALL_BADBLOCK_POS - td->offs - td->len;
+					i++) {
+				if (*p++ != 0xff)
+					return -1;
+			}
+		}
+		else {
+			p += NAND_SMALL_BADBLOCK_POS - td->offs;
+		}
+		/* Compare the pattern */
+		for (i = 0; i < td->len; i++) {
+			if (p[i] != td->pattern[i])
+				return -1;
+		}
+	}
+
 	if (td->options & NAND_BBT_SCANEMPTY) {
 		p += td->len;
 		end += td->len;
@@ -124,6 +146,13 @@ static int check_short_pattern(uint8_t *buf, struct nand_bbt_descr *td)
 		if (p[td->offs + i] != td->pattern[i])
 			return -1;
 	}
+	/* Need to check location 1 AND 6? */
+	if (td->options & NAND_BBT_SCANBYTE1AND6) {
+		for (i = 0; i < td->len; i++) {
+			if (p[NAND_SMALL_BADBLOCK_POS + i] != td->pattern[i])
+				return -1;
+		}
+	}
 	return 0;
 }
 
@@ -397,12 +426,10 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf,
 
 	if (bd->options & NAND_BBT_SCANALLPAGES)
 		len = 1 << (this->bbt_erase_shift - this->page_shift);
-	else {
-		if (bd->options & NAND_BBT_SCAN2NDPAGE)
-			len = 2;
-		else
-			len = 1;
-	}
+	else if (bd->options & NAND_BBT_SCAN2NDPAGE)
+		len = 2;
+	else
+		len = 1;
 
 	if (!(bd->options & NAND_BBT_SCANEMPTY)) {
 		/* We need only read few bytes from the OOB area */
@@ -1092,41 +1119,6 @@ int nand_update_bbt(struct mtd_info *mtd, loff_t offs)
  * while scanning a device for factory marked good / bad blocks. */
 static uint8_t scan_ff_pattern[] = { 0xff, 0xff };
 
-static struct nand_bbt_descr smallpage_memorybased = {
-	.options = 0,
-	.offs = NAND_SMALL_BADBLOCK_POS,
-	.len = 1,
-	.pattern = scan_ff_pattern
-};
-
-static struct nand_bbt_descr smallpage_scan2nd_memorybased = {
-	.options = NAND_BBT_SCAN2NDPAGE,
-	.offs = NAND_SMALL_BADBLOCK_POS,
-	.len = 2,
-	.pattern = scan_ff_pattern
-};
-
-static struct nand_bbt_descr largepage_memorybased = {
-	.options = 0,
-	.offs = NAND_LARGE_BADBLOCK_POS,
-	.len = 1,
-	.pattern = scan_ff_pattern
-};
-
-static struct nand_bbt_descr largepage_scan2nd_memorybased = {
-	.options = NAND_BBT_SCAN2NDPAGE,
-	.offs = NAND_LARGE_BADBLOCK_POS,
-	.len = 2,
-	.pattern = scan_ff_pattern
-};
-
-static struct nand_bbt_descr lastpage_memorybased = {
-	.options = NAND_BBT_SCANLASTPAGE,
-	.offs = 0,
-	.len = 1,
-	.pattern = scan_ff_pattern
-};
-
 static struct nand_bbt_descr smallpage_flashbased = {
 	.options = NAND_BBT_SCAN2NDPAGE,
 	.offs = NAND_SMALL_BADBLOCK_POS,
@@ -1175,6 +1167,43 @@ static struct nand_bbt_descr bbt_mirror_descr = {
 	.pattern = mirror_pattern
 };
 
+#define BBT_SCAN_OPTIONS (NAND_BBT_SCANLASTPAGE | NAND_BBT_SCAN2NDPAGE | \
+		NAND_BBT_SCANBYTE1AND6)
+/**
+ * nand_create_default_bbt_descr - [Internal] Creates a BBT descriptor structure
+ * @this:	NAND chip to create descriptor for
+ *
+ * This function allocates and initializes a nand_bbt_descr for BBM detection
+ * based on the properties of "this". The new descriptor is stored in
+ * this->badblock_pattern. Thus, this->badblock_pattern should be NULL when
+ * passed to this function.
+ *
+ * TODO: Handle other flags, replace other static structs
+ *        (e.g. handle NAND_BBT_FLASH for flash-based BBT,
+ *             replace smallpage_flashbased)
+ *
+ */
+static int nand_create_default_bbt_descr(struct nand_chip *this)
+{
+	struct nand_bbt_descr *bd;
+	if (this->badblock_pattern) {
+		printk(KERN_WARNING "BBT descr already allocated; not replacing.\n");
+		return -EINVAL;
+	}
+	bd = kzalloc(sizeof(*bd), GFP_KERNEL);
+	if (!bd) {
+		printk(KERN_ERR "nand_create_default_bbt_descr: Out of memory\n");
+		return -ENOMEM;
+	}
+	bd->options = this->options & BBT_SCAN_OPTIONS;
+	bd->offs = this->badblockpos;
+	bd->len = (this->options & NAND_BUSWIDTH_16) ? 2 : 1;
+	bd->pattern = scan_ff_pattern;
+	bd->options |= NAND_BBT_DYNAMICSTRUCT;
+	this->badblock_pattern = bd;
+	return 0;
+}
+
 /**
  * nand_default_bbt - [NAND Interface] Select a default bad block table for the device
  * @mtd:	MTD device structure
@@ -1217,20 +1246,8 @@ int nand_default_bbt(struct mtd_info *mtd)
 	} else {
 		this->bbt_td = NULL;
 		this->bbt_md = NULL;
-		if (!this->badblock_pattern) {
-			if (this->options & NAND_BBT_SCANLASTPAGE)
-				this->badblock_pattern = &lastpage_memorybased;
-			else if (this->options & NAND_BBT_SCAN2NDPAGE)
-				this->badblock_pattern = this->badblockpos ==
-					NAND_SMALL_BADBLOCK_POS ?
-					&smallpage_scan2nd_memorybased :
-					&largepage_scan2nd_memorybased;
-			else
-				this->badblock_pattern = this->badblockpos ==
-					NAND_SMALL_BADBLOCK_POS ?
-					&smallpage_memorybased :
-					&largepage_memorybased;
-		}
+		if (!this->badblock_pattern)
+			nand_create_default_bbt_descr(this);
 	}
 	return nand_scan_bbt(mtd, this->badblock_pattern);
 }
diff --git a/include/linux/mtd/bbm.h b/include/linux/mtd/bbm.h
index 8ad0b8629c3f..a04b962492a8 100644
--- a/include/linux/mtd/bbm.h
+++ b/include/linux/mtd/bbm.h
@@ -84,6 +84,10 @@ struct nand_bbt_descr {
 #define NAND_BBT_SCAN2NDPAGE	0x00004000
 /* Search good / bad pattern on the last page of the eraseblock */
 #define NAND_BBT_SCANLASTPAGE	0x00008000
+/* Chip stores bad block marker on BOTH 1st and 6th bytes of OOB */
+#define NAND_BBT_SCANBYTE1AND6 0x00100000
+/* The nand_bbt_descr was created dynamicaly and must be freed */
+#define NAND_BBT_DYNAMICSTRUCT 0x00200000
 
 /* The maximum number of blocks to scan for a bbt */
 #define NAND_BBT_SCAN_MAXBLOCKS	4
-- 
cgit v1.2.3-59-g8ed1b


From a51dca9cd3bb4ec5a05bfb6feabf024a5c808a37 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 2 Aug 2010 08:43:25 -0400
Subject: jbd2: Use atomic variables to avoid taking t_handle_lock in
 jbd2_journal_stop

By using an atomic_t for t_updates and t_outstanding credits, this
should allow us to not need to take transaction t_handle_lock in
jbd2_journal_stop().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c  |  2 +-
 fs/jbd2/commit.c      | 13 +++++-----
 fs/jbd2/transaction.c | 69 +++++++++++++++++++++++++++++----------------------
 include/linux/jbd2.h  |  8 +++---
 4 files changed, 51 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 076d1cc44f95..f8cdc02520f9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -775,7 +775,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
-	J_ASSERT(transaction->t_updates == 0);
+	J_ASSERT(atomic_read(&transaction->t_updates) == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index af056810acb6..fbd2c564e916 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -417,12 +417,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 					      stats.run.rs_locked);
 
 	spin_lock(&commit_transaction->t_handle_lock);
-	while (commit_transaction->t_updates) {
+	while (atomic_read(&commit_transaction->t_updates)) {
 		DEFINE_WAIT(wait);
 
 		prepare_to_wait(&journal->j_wait_updates, &wait,
 					TASK_UNINTERRUPTIBLE);
-		if (commit_transaction->t_updates) {
+		if (atomic_read(&commit_transaction->t_updates)) {
 			spin_unlock(&commit_transaction->t_handle_lock);
 			spin_unlock(&journal->j_state_lock);
 			schedule();
@@ -433,7 +433,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	}
 	spin_unlock(&commit_transaction->t_handle_lock);
 
-	J_ASSERT (commit_transaction->t_outstanding_credits <=
+	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 			journal->j_max_transaction_buffers);
 
 	/*
@@ -527,11 +527,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	stats.run.rs_logging = jiffies;
 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 					       stats.run.rs_logging);
-	stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
+	stats.run.rs_blocks =
+		atomic_read(&commit_transaction->t_outstanding_credits);
 	stats.run.rs_blocks_logged = 0;
 
 	J_ASSERT(commit_transaction->t_nr_buffers <=
-		 commit_transaction->t_outstanding_credits);
+		 atomic_read(&commit_transaction->t_outstanding_credits));
 
 	err = 0;
 	descriptor = NULL;
@@ -616,7 +617,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 * the free space in the log, but this counter is changed
 		 * by jbd2_journal_next_log_block() also.
 		 */
-		commit_transaction->t_outstanding_credits--;
+		atomic_dec(&commit_transaction->t_outstanding_credits);
 
 		/* Bump b_count to prevent truncate from stumbling over
                    the shadowed buffer!  @@@ This can go if we ever get
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 001e95fb0fe1..9c64c7ec48d4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -55,6 +55,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
+	atomic_set(&transaction->t_updates, 0);
+	atomic_set(&transaction->t_outstanding_credits, 0);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
 	INIT_LIST_HEAD(&transaction->t_private_list);
 
@@ -177,7 +179,7 @@ repeat_locked:
 	 * checkpoint to free some more log space.
 	 */
 	spin_lock(&transaction->t_handle_lock);
-	needed = transaction->t_outstanding_credits + nblocks;
+	needed = atomic_read(&transaction->t_outstanding_credits) + nblocks;
 
 	if (needed > journal->j_max_transaction_buffers) {
 		/*
@@ -240,11 +242,12 @@ repeat_locked:
 	}
 
 	handle->h_transaction = transaction;
-	transaction->t_outstanding_credits += nblocks;
-	transaction->t_updates++;
+	atomic_add(nblocks, &transaction->t_outstanding_credits);
+	atomic_inc(&transaction->t_updates);
 	transaction->t_handle_count++;
 	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-		  handle, nblocks, transaction->t_outstanding_credits,
+		  handle, nblocks,
+		  atomic_read(&transaction->t_outstanding_credits),
 		  __jbd2_log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
@@ -369,7 +372,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 	}
 
 	spin_lock(&transaction->t_handle_lock);
-	wanted = transaction->t_outstanding_credits + nblocks;
+	wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
 
 	if (wanted > journal->j_max_transaction_buffers) {
 		jbd_debug(3, "denied handle %p %d blocks: "
@@ -384,7 +387,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 	}
 
 	handle->h_buffer_credits += nblocks;
-	transaction->t_outstanding_credits += nblocks;
+	atomic_add(nblocks, &transaction->t_outstanding_credits);
 	result = 0;
 
 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -426,15 +429,14 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 	 * First unlink the handle from its current transaction, and start the
 	 * commit on that.
 	 */
-	J_ASSERT(transaction->t_updates > 0);
+	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 	J_ASSERT(journal_current_handle() == handle);
 
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
-	transaction->t_outstanding_credits -= handle->h_buffer_credits;
-	transaction->t_updates--;
-
-	if (!transaction->t_updates)
+	atomic_sub(handle->h_buffer_credits,
+		   &transaction->t_outstanding_credits);
+	if (atomic_dec_and_test(&transaction->t_updates))
 		wake_up(&journal->j_wait_updates);
 	spin_unlock(&transaction->t_handle_lock);
 
@@ -481,7 +483,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
 			break;
 
 		spin_lock(&transaction->t_handle_lock);
-		if (!transaction->t_updates) {
+		if (!atomic_read(&transaction->t_updates)) {
 			spin_unlock(&transaction->t_handle_lock);
 			break;
 		}
@@ -1258,7 +1260,8 @@ int jbd2_journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int err;
+	int err, wait_for_commit = 0;
+	tid_t tid;
 	pid_t pid;
 
 	J_ASSERT(journal_current_handle() == handle);
@@ -1266,7 +1269,7 @@ int jbd2_journal_stop(handle_t *handle)
 	if (is_handle_aborted(handle))
 		err = -EIO;
 	else {
-		J_ASSERT(transaction->t_updates > 0);
+		J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 		err = 0;
 	}
 
@@ -1334,14 +1337,8 @@ int jbd2_journal_stop(handle_t *handle)
 	if (handle->h_sync)
 		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
-	spin_lock(&transaction->t_handle_lock);
-	transaction->t_outstanding_credits -= handle->h_buffer_credits;
-	transaction->t_updates--;
-	if (!transaction->t_updates) {
-		wake_up(&journal->j_wait_updates);
-		if (journal->j_barrier_count)
-			wake_up(&journal->j_wait_transaction_locked);
-	}
+	atomic_sub(handle->h_buffer_credits,
+		   &transaction->t_outstanding_credits);
 
 	/*
 	 * If the handle is marked SYNC, we need to set another commit
@@ -1350,15 +1347,13 @@ int jbd2_journal_stop(handle_t *handle)
 	 * transaction is too old now.
 	 */
 	if (handle->h_sync ||
-			transaction->t_outstanding_credits >
-				journal->j_max_transaction_buffers ||
-			time_after_eq(jiffies, transaction->t_expires)) {
+	    (atomic_read(&transaction->t_outstanding_credits) >
+	     journal->j_max_transaction_buffers) ||
+	    time_after_eq(jiffies, transaction->t_expires)) {
 		/* Do this even for aborted journals: an abort still
 		 * completes the commit thread, it just doesn't write
 		 * anything to disk. */
-		tid_t tid = transaction->t_tid;
 
-		spin_unlock(&transaction->t_handle_lock);
 		jbd_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
@@ -1369,11 +1364,25 @@ int jbd2_journal_stop(handle_t *handle)
 		 * to wait for the commit to complete.
 		 */
 		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
-			err = jbd2_log_wait_commit(journal, tid);
-	} else {
-		spin_unlock(&transaction->t_handle_lock);
+			wait_for_commit = 1;
 	}
 
+	/*
+	 * Once we drop t_updates, if it goes to zero the transaction
+	 * could start commiting on us and eventually disappear.  So
+	 * once we do this, we must not dereference transaction
+	 * pointer again.
+	 */
+	tid = transaction->t_tid;
+	if (atomic_dec_and_test(&transaction->t_updates)) {
+		wake_up(&journal->j_wait_updates);
+		if (journal->j_barrier_count)
+			wake_up(&journal->j_wait_transaction_locked);
+	}
+
+	if (wait_for_commit)
+		err = jbd2_log_wait_commit(journal, tid);
+
 	lock_map_release(&handle->h_lockdep_map);
 
 	jbd2_free_handle(handle);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 5a72bc75b273..a72ce21de0e1 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -601,13 +601,13 @@ struct transaction_s
 	 * Number of outstanding updates running on this transaction
 	 * [t_handle_lock]
 	 */
-	int			t_updates;
+	atomic_t		t_updates;
 
 	/*
 	 * Number of buffers reserved for use by all handles in this transaction
 	 * handle but not yet modified. [t_handle_lock]
 	 */
-	int			t_outstanding_credits;
+	atomic_t		t_outstanding_credits;
 
 	/*
 	 * Forward and backward links for the circular list of all transactions
@@ -1258,8 +1258,8 @@ static inline int jbd_space_needed(journal_t *journal)
 {
 	int nblocks = journal->j_max_transaction_buffers;
 	if (journal->j_committing_transaction)
-		nblocks += journal->j_committing_transaction->
-					t_outstanding_credits;
+		nblocks += atomic_read(&journal->j_committing_transaction->
+				       t_outstanding_credits);
 	return nblocks;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From b126468e08d92aaeffa58ef04d70e417241dadc1 Mon Sep 17 00:00:00 2001
From: Fang Wenqi <anton.fang@gmail.com>
Date: Tue, 1 Jun 2010 02:43:06 +0000
Subject: virtio_9p.h needs <linux/types.h>

Found with makes headers_check:
include/linux/virtio_9p.h:15: found __[us]{8,16,32,64} type without #include <linux/types.h>

Signed-off-by: Fang Wenqi <antonf@turbolinux.com.cn>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 include/linux/virtio_9p.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h
index 5cf11765146b..395c38a47adb 100644
--- a/include/linux/virtio_9p.h
+++ b/include/linux/virtio_9p.h
@@ -4,6 +4,7 @@
  * compatible drivers/servers. */
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
+#include <linux/types.h>
 
 /* The feature bitmap for virtio 9P */
 
-- 
cgit v1.2.3-59-g8ed1b


From f6a20eb1a2d35660240cd1eb8dc2bd6504a0c6c5 Mon Sep 17 00:00:00 2001
From: Klaus Schmidinger <Klaus.Schmidinger@tvdr.de>
Date: Thu, 1 Jul 2010 01:37:34 -0300
Subject: V4L/DVB: Add FE_CAN_TURBO_FEC

Some (North American) providers use a non-standard mode called
"8psk turbo fec". Since there is no flag in the driver that
would allow an application to determine whether a particular
device can handle "turbo fec", the attached patch introduces
FE_CAN_TURBO_FEC.

Since there is no flag in the SI data that would indicate
that a transponder uses "turbo fec", VDR will assume that
all 8psk transponders on DVB-S use "turbo fec".

Tested-by: Derek Kelly <user.vdr@gmail.com>
Signed-off-by: Klaus Schmidinger <Klaus.Schmidinger@tvdr.de>
Signed-off-by: Douglas Schilling Landgraf <dougsland@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/dvb/dvb-usb/gp8psk-fe.c | 2 +-
 include/linux/dvb/frontend.h          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/media/dvb/dvb-usb/gp8psk-fe.c b/drivers/media/dvb/dvb-usb/gp8psk-fe.c
index 7a7f1b2b681c..dbdb5347b2a8 100644
--- a/drivers/media/dvb/dvb-usb/gp8psk-fe.c
+++ b/drivers/media/dvb/dvb-usb/gp8psk-fe.c
@@ -349,7 +349,7 @@ static struct dvb_frontend_ops gp8psk_fe_ops = {
 			 * FE_CAN_QAM_16 is for compatibility
 			 * (Myth incorrectly detects Turbo-QPSK as plain QAM-16)
 			 */
-			FE_CAN_QPSK | FE_CAN_QAM_16
+			FE_CAN_QPSK | FE_CAN_QAM_16 | FE_CAN_TURBO_FEC
 	},
 
 	.release = gp8psk_fe_release,
diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h
index b6cb5425cde3..493a2bf85f62 100644
--- a/include/linux/dvb/frontend.h
+++ b/include/linux/dvb/frontend.h
@@ -62,6 +62,7 @@ typedef enum fe_caps {
 	FE_CAN_8VSB			= 0x200000,
 	FE_CAN_16VSB			= 0x400000,
 	FE_HAS_EXTENDED_CAPS		= 0x800000,   /* We need more bitspace for newer APIs, indicate this. */
+	FE_CAN_TURBO_FEC		= 0x8000000,  /* frontend supports "turbo fec modulation" */
 	FE_CAN_2G_MODULATION		= 0x10000000, /* frontend supports "2nd generation modulation" (DVB-S2) */
 	FE_NEEDS_BENDING		= 0x20000000, /* not supported anymore, don't use (frontend requires frequency bending) */
 	FE_CAN_RECOVER			= 0x40000000, /* frontend can recover from a cable unplug automatically */
-- 
cgit v1.2.3-59-g8ed1b


From 1ece36097d0170a41fc129b8b1823a36ec2fb5c6 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Sat, 3 Jul 2010 18:06:13 -0300
Subject: V4L/DVB: Increment DVB API version

A new flag were added at the Frontend capabilities. Increment
API minor revision.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/dvb/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dvb/version.h b/include/linux/dvb/version.h
index 540b0583d9fb..5a7546c12688 100644
--- a/include/linux/dvb/version.h
+++ b/include/linux/dvb/version.h
@@ -24,6 +24,6 @@
 #define _DVBVERSION_H_
 
 #define DVB_API_VERSION 5
-#define DVB_API_VERSION_MINOR 1
+#define DVB_API_VERSION_MINOR 2
 
 #endif /*_DVBVERSION_H_*/
-- 
cgit v1.2.3-59-g8ed1b


From 1b4e21c4f62eae6bdcb3e7bfdfc52171a24f3689 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Thu, 17 Jun 2010 11:11:51 -0300
Subject: V4L/DVB: uvcvideo: Define control information bits using macros

Use the macros instead of hardcoding numerical constants for the
controls information bitfield.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/uvc/uvc_ctrl.c | 12 ++++++------
 include/linux/usb/video.h          |  7 +++++++
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/uvc/uvc_ctrl.c b/drivers/media/video/uvc/uvc_ctrl.c
index bd72100a21dd..fa06cf512ecf 100644
--- a/drivers/media/video/uvc/uvc_ctrl.c
+++ b/drivers/media/video/uvc/uvc_ctrl.c
@@ -1324,9 +1324,8 @@ static void uvc_ctrl_add_ctrl(struct uvc_device *dev,
 		/* Check if the device control information and length match
 		 * the user supplied information.
 		 */
-		__u32 flags;
 		__le16 size;
-		__u8 inf;
+		__u8 _info;
 
 		ret = uvc_query_ctrl(dev, UVC_GET_LEN, ctrl->entity->id,
 			dev->intfnum, info->selector, (__u8 *)&size, 2);
@@ -1345,7 +1344,7 @@ static void uvc_ctrl_add_ctrl(struct uvc_device *dev,
 		}
 
 		ret = uvc_query_ctrl(dev, UVC_GET_INFO, ctrl->entity->id,
-			dev->intfnum, info->selector, &inf, 1);
+				     dev->intfnum, info->selector, &_info, 1);
 		if (ret < 0) {
 			uvc_trace(UVC_TRACE_CONTROL,
 				"GET_INFO failed on control %pUl/%u (%d).\n",
@@ -1353,9 +1352,10 @@ static void uvc_ctrl_add_ctrl(struct uvc_device *dev,
 			return;
 		}
 
-		flags = info->flags;
-		if (((flags & UVC_CONTROL_GET_CUR) && !(inf & (1 << 0))) ||
-		    ((flags & UVC_CONTROL_SET_CUR) && !(inf & (1 << 1)))) {
+		if (((info->flags & UVC_CONTROL_GET_CUR) &&
+		    !(_info & UVC_CONTROL_CAP_GET)) ||
+		    ((info->flags & UVC_CONTROL_SET_CUR) &&
+		    !(_info & UVC_CONTROL_CAP_SET))) {
 			uvc_trace(UVC_TRACE_CONTROL, "Control %pUl/%u flags "
 				"don't match supported operations.\n",
 				info->entity, info->selector);
diff --git a/include/linux/usb/video.h b/include/linux/usb/video.h
index be436d9ee479..2d5b7fc6a265 100644
--- a/include/linux/usb/video.h
+++ b/include/linux/usb/video.h
@@ -160,5 +160,12 @@
 #define UVC_STATUS_TYPE_CONTROL				1
 #define UVC_STATUS_TYPE_STREAMING			2
 
+/* 4.1.2. Control Capabilities */
+#define UVC_CONTROL_CAP_GET				(1 << 0)
+#define UVC_CONTROL_CAP_SET				(1 << 1)
+#define UVC_CONTROL_CAP_DISABLED			(1 << 2)
+#define UVC_CONTROL_CAP_AUTOUPDATE			(1 << 3)
+#define UVC_CONTROL_CAP_ASYNCHRONOUS			(1 << 4)
+
 #endif /* __LINUX_USB_VIDEO_H */
 
-- 
cgit v1.2.3-59-g8ed1b


From bbafc0cb6c52c40647f561854db5fbac4d608186 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Sat, 10 Jul 2010 15:03:20 -0300
Subject: V4L/DVB: uvc: Move constants and structures definitions to
 linux/usb/video.h

The UVC host and gadget drivers both define constants and structures in
private header files. Move all those definitions to linux/usb/video.h
where they can be shared by the two drivers (and be available for
userspace applications).

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/uvc/uvcvideo.h |  19 --
 drivers/usb/gadget/f_uvc.c         |  16 +-
 drivers/usb/gadget/f_uvc.h         | 352 +-------------------------------
 drivers/usb/gadget/uvc.h           |  36 ----
 drivers/usb/gadget/webcam.c        |  24 +--
 include/linux/usb/video.h          | 397 +++++++++++++++++++++++++++++++++++++
 6 files changed, 418 insertions(+), 426 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/uvc/uvcvideo.h b/drivers/media/video/uvc/uvcvideo.h
index 47b20e7e3786..ac272456fbfd 100644
--- a/drivers/media/video/uvc/uvcvideo.h
+++ b/drivers/media/video/uvc/uvcvideo.h
@@ -196,25 +196,6 @@ struct uvc_device;
 /* TODO: Put the most frequently accessed fields at the beginning of
  * structures to maximize cache efficiency.
  */
-struct uvc_streaming_control {
-	__u16 bmHint;
-	__u8  bFormatIndex;
-	__u8  bFrameIndex;
-	__u32 dwFrameInterval;
-	__u16 wKeyFrameRate;
-	__u16 wPFrameRate;
-	__u16 wCompQuality;
-	__u16 wCompWindowSize;
-	__u16 wDelay;
-	__u32 dwMaxVideoFrameSize;
-	__u32 dwMaxPayloadTransferSize;
-	__u32 dwClockFrequency;
-	__u8  bmFramingInfo;
-	__u8  bPreferedVersion;
-	__u8  bMinVersion;
-	__u8  bMaxVersion;
-};
-
 struct uvc_control_info {
 	struct list_head list;
 	struct list_head mappings;
diff --git a/drivers/usb/gadget/f_uvc.c b/drivers/usb/gadget/f_uvc.c
index dbe6db0184fd..be446b7e7eaa 100644
--- a/drivers/usb/gadget/f_uvc.c
+++ b/drivers/usb/gadget/f_uvc.c
@@ -61,12 +61,12 @@ static struct usb_gadget_strings *uvc_function_strings[] = {
 #define UVC_INTF_VIDEO_STREAMING		1
 
 static struct usb_interface_assoc_descriptor uvc_iad __initdata = {
-	.bLength		= USB_DT_INTERFACE_ASSOCIATION_SIZE,
+	.bLength		= sizeof(uvc_iad),
 	.bDescriptorType	= USB_DT_INTERFACE_ASSOCIATION,
 	.bFirstInterface	= 0,
 	.bInterfaceCount	= 2,
 	.bFunctionClass		= USB_CLASS_VIDEO,
-	.bFunctionSubClass	= 0x03,
+	.bFunctionSubClass	= UVC_SC_VIDEO_INTERFACE_COLLECTION,
 	.bFunctionProtocol	= 0x00,
 	.iFunction		= 0,
 };
@@ -78,7 +78,7 @@ static struct usb_interface_descriptor uvc_control_intf __initdata = {
 	.bAlternateSetting	= 0,
 	.bNumEndpoints		= 1,
 	.bInterfaceClass	= USB_CLASS_VIDEO,
-	.bInterfaceSubClass	= 0x01,
+	.bInterfaceSubClass	= UVC_SC_VIDEOCONTROL,
 	.bInterfaceProtocol	= 0x00,
 	.iInterface		= 0,
 };
@@ -106,7 +106,7 @@ static struct usb_interface_descriptor uvc_streaming_intf_alt0 __initdata = {
 	.bAlternateSetting	= 0,
 	.bNumEndpoints		= 0,
 	.bInterfaceClass	= USB_CLASS_VIDEO,
-	.bInterfaceSubClass	= 0x02,
+	.bInterfaceSubClass	= UVC_SC_VIDEOSTREAMING,
 	.bInterfaceProtocol	= 0x00,
 	.iInterface		= 0,
 };
@@ -118,7 +118,7 @@ static struct usb_interface_descriptor uvc_streaming_intf_alt1 __initdata = {
 	.bAlternateSetting	= 1,
 	.bNumEndpoints		= 1,
 	.bInterfaceClass	= USB_CLASS_VIDEO,
-	.bInterfaceSubClass	= 0x02,
+	.bInterfaceSubClass	= UVC_SC_VIDEOSTREAMING,
 	.bInterfaceProtocol	= 0x00,
 	.iInterface		= 0,
 };
@@ -603,15 +603,15 @@ uvc_bind_config(struct usb_configuration *c,
 
 	/* Validate the descriptors. */
 	if (control == NULL || control[0] == NULL ||
-	    control[0]->bDescriptorSubType != UVC_DT_HEADER)
+	    control[0]->bDescriptorSubType != UVC_VC_HEADER)
 		goto error;
 
 	if (fs_streaming == NULL || fs_streaming[0] == NULL ||
-	    fs_streaming[0]->bDescriptorSubType != UVC_DT_INPUT_HEADER)
+	    fs_streaming[0]->bDescriptorSubType != UVC_VS_INPUT_HEADER)
 		goto error;
 
 	if (hs_streaming == NULL || hs_streaming[0] == NULL ||
-	    hs_streaming[0]->bDescriptorSubType != UVC_DT_INPUT_HEADER)
+	    hs_streaming[0]->bDescriptorSubType != UVC_VS_INPUT_HEADER)
 		goto error;
 
 	uvc->desc.control = control;
diff --git a/drivers/usb/gadget/f_uvc.h b/drivers/usb/gadget/f_uvc.h
index 8a5db7c4fe7c..e18a6636c283 100644
--- a/drivers/usb/gadget/f_uvc.h
+++ b/drivers/usb/gadget/f_uvc.h
@@ -15,357 +15,7 @@
 #define _F_UVC_H_
 
 #include <linux/usb/composite.h>
-
-#define USB_CLASS_VIDEO_CONTROL		1
-#define USB_CLASS_VIDEO_STREAMING	2
-
-struct uvc_descriptor_header {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-} __attribute__ ((packed));
-
-struct uvc_header_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u16 bcdUVC;
-	__u16 wTotalLength;
-	__u32 dwClockFrequency;
-	__u8  bInCollection;
-	__u8  baInterfaceNr[];
-} __attribute__((__packed__));
-
-#define UVC_HEADER_DESCRIPTOR(n)	uvc_header_descriptor_##n
-
-#define DECLARE_UVC_HEADER_DESCRIPTOR(n) 			\
-struct UVC_HEADER_DESCRIPTOR(n) {				\
-	__u8  bLength;						\
-	__u8  bDescriptorType;					\
-	__u8  bDescriptorSubType;				\
-	__u16 bcdUVC;						\
-	__u16 wTotalLength;					\
-	__u32 dwClockFrequency;					\
-	__u8  bInCollection;					\
-	__u8  baInterfaceNr[n];					\
-} __attribute__ ((packed))
-
-struct uvc_input_terminal_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bTerminalID;
-	__u16 wTerminalType;
-	__u8  bAssocTerminal;
-	__u8  iTerminal;
-} __attribute__((__packed__));
-
-struct uvc_output_terminal_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bTerminalID;
-	__u16 wTerminalType;
-	__u8  bAssocTerminal;
-	__u8  bSourceID;
-	__u8  iTerminal;
-} __attribute__((__packed__));
-
-struct uvc_camera_terminal_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bTerminalID;
-	__u16 wTerminalType;
-	__u8  bAssocTerminal;
-	__u8  iTerminal;
-	__u16 wObjectiveFocalLengthMin;
-	__u16 wObjectiveFocalLengthMax;
-	__u16 wOcularFocalLength;
-	__u8  bControlSize;
-	__u8  bmControls[3];
-} __attribute__((__packed__));
-
-struct uvc_selector_unit_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bUnitID;
-	__u8  bNrInPins;
-	__u8  baSourceID[0];
-	__u8  iSelector;
-} __attribute__((__packed__));
-
-#define UVC_SELECTOR_UNIT_DESCRIPTOR(n)	\
-	uvc_selector_unit_descriptor_##n
-
-#define DECLARE_UVC_SELECTOR_UNIT_DESCRIPTOR(n) 		\
-struct UVC_SELECTOR_UNIT_DESCRIPTOR(n) {			\
-	__u8  bLength;						\
-	__u8  bDescriptorType;					\
-	__u8  bDescriptorSubType;				\
-	__u8  bUnitID;						\
-	__u8  bNrInPins;					\
-	__u8  baSourceID[n];					\
-	__u8  iSelector;					\
-} __attribute__ ((packed))
-
-struct uvc_processing_unit_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bUnitID;
-	__u8  bSourceID;
-	__u16 wMaxMultiplier;
-	__u8  bControlSize;
-	__u8  bmControls[2];
-	__u8  iProcessing;
-} __attribute__((__packed__));
-
-struct uvc_extension_unit_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bUnitID;
-	__u8  guidExtensionCode[16];
-	__u8  bNumControls;
-	__u8  bNrInPins;
-	__u8  baSourceID[0];
-	__u8  bControlSize;
-	__u8  bmControls[0];
-	__u8  iExtension;
-} __attribute__((__packed__));
-
-#define UVC_EXTENSION_UNIT_DESCRIPTOR(p, n) \
-	uvc_extension_unit_descriptor_##p_##n
-
-#define DECLARE_UVC_EXTENSION_UNIT_DESCRIPTOR(p, n) 		\
-struct UVC_EXTENSION_UNIT_DESCRIPTOR(p, n) {			\
-	__u8  bLength;						\
-	__u8  bDescriptorType;					\
-	__u8  bDescriptorSubType;				\
-	__u8  bUnitID;						\
-	__u8  guidExtensionCode[16];				\
-	__u8  bNumControls;					\
-	__u8  bNrInPins;					\
-	__u8  baSourceID[p];					\
-	__u8  bControlSize;					\
-	__u8  bmControls[n];					\
-	__u8  iExtension;					\
-} __attribute__ ((packed))
-
-struct uvc_control_endpoint_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u16 wMaxTransferSize;
-} __attribute__((__packed__));
-
-#define UVC_DT_HEADER				1
-#define UVC_DT_INPUT_TERMINAL			2
-#define UVC_DT_OUTPUT_TERMINAL			3
-#define UVC_DT_SELECTOR_UNIT			4
-#define UVC_DT_PROCESSING_UNIT			5
-#define UVC_DT_EXTENSION_UNIT			6
-
-#define UVC_DT_HEADER_SIZE(n)			(12+(n))
-#define UVC_DT_INPUT_TERMINAL_SIZE		8
-#define UVC_DT_OUTPUT_TERMINAL_SIZE		9
-#define UVC_DT_CAMERA_TERMINAL_SIZE(n)		(15+(n))
-#define UVC_DT_SELECTOR_UNIT_SIZE(n)		(6+(n))
-#define UVC_DT_PROCESSING_UNIT_SIZE(n)		(9+(n))
-#define UVC_DT_EXTENSION_UNIT_SIZE(p,n)		(24+(p)+(n))
-#define UVC_DT_CONTROL_ENDPOINT_SIZE		5
-
-struct uvc_input_header_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bNumFormats;
-	__u16 wTotalLength;
-	__u8  bEndpointAddress;
-	__u8  bmInfo;
-	__u8  bTerminalLink;
-	__u8  bStillCaptureMethod;
-	__u8  bTriggerSupport;
-	__u8  bTriggerUsage;
-	__u8  bControlSize;
-	__u8  bmaControls[];
-} __attribute__((__packed__));
-
-#define UVC_INPUT_HEADER_DESCRIPTOR(n, p) \
-	uvc_input_header_descriptor_##n_##p
-
-#define DECLARE_UVC_INPUT_HEADER_DESCRIPTOR(n, p)		\
-struct UVC_INPUT_HEADER_DESCRIPTOR(n, p) {			\
-	__u8  bLength;						\
-	__u8  bDescriptorType;					\
-	__u8  bDescriptorSubType;				\
-	__u8  bNumFormats;					\
-	__u16 wTotalLength;					\
-	__u8  bEndpointAddress;					\
-	__u8  bmInfo;						\
-	__u8  bTerminalLink;					\
-	__u8  bStillCaptureMethod;				\
-	__u8  bTriggerSupport;					\
-	__u8  bTriggerUsage;					\
-	__u8  bControlSize;					\
-	__u8  bmaControls[p][n];				\
-} __attribute__ ((packed))
-
-struct uvc_output_header_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bNumFormats;
-	__u16 wTotalLength;
-	__u8  bEndpointAddress;
-	__u8  bTerminalLink;
-	__u8  bControlSize;
-	__u8  bmaControls[];
-} __attribute__((__packed__));
-
-#define UVC_OUTPUT_HEADER_DESCRIPTOR(n, p) \
-	uvc_output_header_descriptor_##n_##p
-
-#define DECLARE_UVC_OUTPUT_HEADER_DESCRIPTOR(n, p)		\
-struct UVC_OUTPUT_HEADER_DESCRIPTOR(n, p) {			\
-	__u8  bLength;						\
-	__u8  bDescriptorType;					\
-	__u8  bDescriptorSubType;				\
-	__u8  bNumFormats;					\
-	__u16 wTotalLength;					\
-	__u8  bEndpointAddress;					\
-	__u8  bTerminalLink;					\
-	__u8  bControlSize;					\
-	__u8  bmaControls[p][n];				\
-} __attribute__ ((packed))
-
-struct uvc_format_uncompressed {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bFormatIndex;
-	__u8  bNumFrameDescriptors;
-	__u8  guidFormat[16];
-	__u8  bBitsPerPixel;
-	__u8  bDefaultFrameIndex;
-	__u8  bAspectRatioX;
-	__u8  bAspectRatioY;
-	__u8  bmInterfaceFlags;
-	__u8  bCopyProtect;
-} __attribute__((__packed__));
-
-struct uvc_frame_uncompressed {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bFrameIndex;
-	__u8  bmCapabilities;
-	__u16 wWidth;
-	__u16 wHeight;
-	__u32 dwMinBitRate;
-	__u32 dwMaxBitRate;
-	__u32 dwMaxVideoFrameBufferSize;
-	__u32 dwDefaultFrameInterval;
-	__u8  bFrameIntervalType;
-	__u32 dwFrameInterval[];
-} __attribute__((__packed__));
-
-#define UVC_FRAME_UNCOMPRESSED(n) \
-	uvc_frame_uncompressed_##n
-
-#define DECLARE_UVC_FRAME_UNCOMPRESSED(n) 			\
-struct UVC_FRAME_UNCOMPRESSED(n) {				\
-	__u8  bLength;						\
-	__u8  bDescriptorType;					\
-	__u8  bDescriptorSubType;				\
-	__u8  bFrameIndex;					\
-	__u8  bmCapabilities;					\
-	__u16 wWidth;						\
-	__u16 wHeight;						\
-	__u32 dwMinBitRate;					\
-	__u32 dwMaxBitRate;					\
-	__u32 dwMaxVideoFrameBufferSize;			\
-	__u32 dwDefaultFrameInterval;				\
-	__u8  bFrameIntervalType;				\
-	__u32 dwFrameInterval[n];				\
-} __attribute__ ((packed))
-
-struct uvc_format_mjpeg {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bFormatIndex;
-	__u8  bNumFrameDescriptors;
-	__u8  bmFlags;
-	__u8  bDefaultFrameIndex;
-	__u8  bAspectRatioX;
-	__u8  bAspectRatioY;
-	__u8  bmInterfaceFlags;
-	__u8  bCopyProtect;
-} __attribute__((__packed__));
-
-struct uvc_frame_mjpeg {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bFrameIndex;
-	__u8  bmCapabilities;
-	__u16 wWidth;
-	__u16 wHeight;
-	__u32 dwMinBitRate;
-	__u32 dwMaxBitRate;
-	__u32 dwMaxVideoFrameBufferSize;
-	__u32 dwDefaultFrameInterval;
-	__u8  bFrameIntervalType;
-	__u32 dwFrameInterval[];
-} __attribute__((__packed__));
-
-#define UVC_FRAME_MJPEG(n) \
-	uvc_frame_mjpeg_##n
-
-#define DECLARE_UVC_FRAME_MJPEG(n) 				\
-struct UVC_FRAME_MJPEG(n) {					\
-	__u8  bLength;						\
-	__u8  bDescriptorType;					\
-	__u8  bDescriptorSubType;				\
-	__u8  bFrameIndex;					\
-	__u8  bmCapabilities;					\
-	__u16 wWidth;						\
-	__u16 wHeight;						\
-	__u32 dwMinBitRate;					\
-	__u32 dwMaxBitRate;					\
-	__u32 dwMaxVideoFrameBufferSize;			\
-	__u32 dwDefaultFrameInterval;				\
-	__u8  bFrameIntervalType;				\
-	__u32 dwFrameInterval[n];				\
-} __attribute__ ((packed))
-
-struct uvc_color_matching_descriptor {
-	__u8  bLength;
-	__u8  bDescriptorType;
-	__u8  bDescriptorSubType;
-	__u8  bColorPrimaries;
-	__u8  bTransferCharacteristics;
-	__u8  bMatrixCoefficients;
-} __attribute__((__packed__));
-
-#define UVC_DT_INPUT_HEADER			1
-#define UVC_DT_OUTPUT_HEADER			2
-#define UVC_DT_FORMAT_UNCOMPRESSED		4
-#define UVC_DT_FRAME_UNCOMPRESSED		5
-#define UVC_DT_FORMAT_MJPEG			6
-#define UVC_DT_FRAME_MJPEG			7
-#define UVC_DT_COLOR_MATCHING			13
-
-#define UVC_DT_INPUT_HEADER_SIZE(n, p)		(13+(n*p))
-#define UVC_DT_OUTPUT_HEADER_SIZE(n, p)		(9+(n*p))
-#define UVC_DT_FORMAT_UNCOMPRESSED_SIZE		27
-#define UVC_DT_FRAME_UNCOMPRESSED_SIZE(n)	(26+4*(n))
-#define UVC_DT_FORMAT_MJPEG_SIZE		11
-#define UVC_DT_FRAME_MJPEG_SIZE(n)		(26+4*(n))
-#define UVC_DT_COLOR_MATCHING_SIZE		6
+#include <linux/usb/video.h>
 
 extern int uvc_bind_config(struct usb_configuration *c,
 			   const struct uvc_descriptor_header * const *control,
diff --git a/drivers/usb/gadget/uvc.h b/drivers/usb/gadget/uvc.h
index e92454cddd7d..5b7919460fd2 100644
--- a/drivers/usb/gadget/uvc.h
+++ b/drivers/usb/gadget/uvc.h
@@ -47,39 +47,6 @@ struct uvc_event
 #define UVC_INTF_CONTROL		0
 #define UVC_INTF_STREAMING		1
 
-/* ------------------------------------------------------------------------
- * UVC constants & structures
- */
-
-/* Values for bmHeaderInfo (Video and Still Image Payload Headers, 2.4.3.3) */
-#define UVC_STREAM_EOH				(1 << 7)
-#define UVC_STREAM_ERR				(1 << 6)
-#define UVC_STREAM_STI				(1 << 5)
-#define UVC_STREAM_RES				(1 << 4)
-#define UVC_STREAM_SCR				(1 << 3)
-#define UVC_STREAM_PTS				(1 << 2)
-#define UVC_STREAM_EOF				(1 << 1)
-#define UVC_STREAM_FID				(1 << 0)
-
-struct uvc_streaming_control {
-	__u16 bmHint;
-	__u8  bFormatIndex;
-	__u8  bFrameIndex;
-	__u32 dwFrameInterval;
-	__u16 wKeyFrameRate;
-	__u16 wPFrameRate;
-	__u16 wCompQuality;
-	__u16 wCompWindowSize;
-	__u16 wDelay;
-	__u32 dwMaxVideoFrameSize;
-	__u32 dwMaxPayloadTransferSize;
-	__u32 dwClockFrequency;
-	__u8  bmFramingInfo;
-	__u8  bPreferedVersion;
-	__u8  bMinVersion;
-	__u8  bMaxVersion;
-} __attribute__((__packed__));
-
 /* ------------------------------------------------------------------------
  * Debugging, printing and logging
  */
@@ -137,9 +104,6 @@ extern unsigned int uvc_gadget_trace_param;
 #define UVC_MAX_REQUEST_SIZE			64
 #define UVC_MAX_EVENTS				4
 
-#define USB_DT_INTERFACE_ASSOCIATION_SIZE	8
-#define USB_CLASS_MISC				0xef
-
 /* ------------------------------------------------------------------------
  * Structures
  */
diff --git a/drivers/usb/gadget/webcam.c b/drivers/usb/gadget/webcam.c
index f5f3030cc416..288d21155abe 100644
--- a/drivers/usb/gadget/webcam.c
+++ b/drivers/usb/gadget/webcam.c
@@ -90,7 +90,7 @@ DECLARE_UVC_HEADER_DESCRIPTOR(1);
 static const struct UVC_HEADER_DESCRIPTOR(1) uvc_control_header = {
 	.bLength		= UVC_DT_HEADER_SIZE(1),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_HEADER,
+	.bDescriptorSubType	= UVC_VC_HEADER,
 	.bcdUVC			= cpu_to_le16(0x0100),
 	.wTotalLength		= 0, /* dynamic */
 	.dwClockFrequency	= cpu_to_le32(48000000),
@@ -101,7 +101,7 @@ static const struct UVC_HEADER_DESCRIPTOR(1) uvc_control_header = {
 static const struct uvc_camera_terminal_descriptor uvc_camera_terminal = {
 	.bLength		= UVC_DT_CAMERA_TERMINAL_SIZE(3),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_INPUT_TERMINAL,
+	.bDescriptorSubType	= UVC_VC_INPUT_TERMINAL,
 	.bTerminalID		= 1,
 	.wTerminalType		= cpu_to_le16(0x0201),
 	.bAssocTerminal		= 0,
@@ -118,7 +118,7 @@ static const struct uvc_camera_terminal_descriptor uvc_camera_terminal = {
 static const struct uvc_processing_unit_descriptor uvc_processing = {
 	.bLength		= UVC_DT_PROCESSING_UNIT_SIZE(2),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_PROCESSING_UNIT,
+	.bDescriptorSubType	= UVC_VC_PROCESSING_UNIT,
 	.bUnitID		= 2,
 	.bSourceID		= 1,
 	.wMaxMultiplier		= cpu_to_le16(16*1024),
@@ -131,7 +131,7 @@ static const struct uvc_processing_unit_descriptor uvc_processing = {
 static const struct uvc_output_terminal_descriptor uvc_output_terminal = {
 	.bLength		= UVC_DT_OUTPUT_TERMINAL_SIZE,
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_OUTPUT_TERMINAL,
+	.bDescriptorSubType	= UVC_VC_OUTPUT_TERMINAL,
 	.bTerminalID		= 3,
 	.wTerminalType		= cpu_to_le16(0x0101),
 	.bAssocTerminal		= 0,
@@ -144,7 +144,7 @@ DECLARE_UVC_INPUT_HEADER_DESCRIPTOR(1, 2);
 static const struct UVC_INPUT_HEADER_DESCRIPTOR(1, 2) uvc_input_header = {
 	.bLength		= UVC_DT_INPUT_HEADER_SIZE(1, 2),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_INPUT_HEADER,
+	.bDescriptorSubType	= UVC_VS_INPUT_HEADER,
 	.bNumFormats		= 2,
 	.wTotalLength		= 0, /* dynamic */
 	.bEndpointAddress	= 0, /* dynamic */
@@ -161,7 +161,7 @@ static const struct UVC_INPUT_HEADER_DESCRIPTOR(1, 2) uvc_input_header = {
 static const struct uvc_format_uncompressed uvc_format_yuv = {
 	.bLength		= UVC_DT_FORMAT_UNCOMPRESSED_SIZE,
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_FORMAT_UNCOMPRESSED,
+	.bDescriptorSubType	= UVC_VS_FORMAT_UNCOMPRESSED,
 	.bFormatIndex		= 1,
 	.bNumFrameDescriptors	= 2,
 	.guidFormat		=
@@ -181,7 +181,7 @@ DECLARE_UVC_FRAME_UNCOMPRESSED(3);
 static const struct UVC_FRAME_UNCOMPRESSED(3) uvc_frame_yuv_360p = {
 	.bLength		= UVC_DT_FRAME_UNCOMPRESSED_SIZE(3),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_FRAME_UNCOMPRESSED,
+	.bDescriptorSubType	= UVC_VS_FRAME_UNCOMPRESSED,
 	.bFrameIndex		= 1,
 	.bmCapabilities		= 0,
 	.wWidth			= cpu_to_le16(640),
@@ -199,7 +199,7 @@ static const struct UVC_FRAME_UNCOMPRESSED(3) uvc_frame_yuv_360p = {
 static const struct UVC_FRAME_UNCOMPRESSED(1) uvc_frame_yuv_720p = {
 	.bLength		= UVC_DT_FRAME_UNCOMPRESSED_SIZE(1),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_FRAME_UNCOMPRESSED,
+	.bDescriptorSubType	= UVC_VS_FRAME_UNCOMPRESSED,
 	.bFrameIndex		= 2,
 	.bmCapabilities		= 0,
 	.wWidth			= cpu_to_le16(1280),
@@ -215,7 +215,7 @@ static const struct UVC_FRAME_UNCOMPRESSED(1) uvc_frame_yuv_720p = {
 static const struct uvc_format_mjpeg uvc_format_mjpg = {
 	.bLength		= UVC_DT_FORMAT_MJPEG_SIZE,
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_FORMAT_MJPEG,
+	.bDescriptorSubType	= UVC_VS_FORMAT_MJPEG,
 	.bFormatIndex		= 2,
 	.bNumFrameDescriptors	= 2,
 	.bmFlags		= 0,
@@ -232,7 +232,7 @@ DECLARE_UVC_FRAME_MJPEG(3);
 static const struct UVC_FRAME_MJPEG(3) uvc_frame_mjpg_360p = {
 	.bLength		= UVC_DT_FRAME_MJPEG_SIZE(3),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_FRAME_MJPEG,
+	.bDescriptorSubType	= UVC_VS_FRAME_MJPEG,
 	.bFrameIndex		= 1,
 	.bmCapabilities		= 0,
 	.wWidth			= cpu_to_le16(640),
@@ -250,7 +250,7 @@ static const struct UVC_FRAME_MJPEG(3) uvc_frame_mjpg_360p = {
 static const struct UVC_FRAME_MJPEG(1) uvc_frame_mjpg_720p = {
 	.bLength		= UVC_DT_FRAME_MJPEG_SIZE(1),
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_FRAME_MJPEG,
+	.bDescriptorSubType	= UVC_VS_FRAME_MJPEG,
 	.bFrameIndex		= 2,
 	.bmCapabilities		= 0,
 	.wWidth			= cpu_to_le16(1280),
@@ -266,7 +266,7 @@ static const struct UVC_FRAME_MJPEG(1) uvc_frame_mjpg_720p = {
 static const struct uvc_color_matching_descriptor uvc_color_matching = {
 	.bLength		= UVC_DT_COLOR_MATCHING_SIZE,
 	.bDescriptorType	= USB_DT_CS_INTERFACE,
-	.bDescriptorSubType	= UVC_DT_COLOR_MATCHING,
+	.bDescriptorSubType	= UVC_VS_COLORFORMAT,
 	.bColorPrimaries	= 1,
 	.bTransferCharacteristics	= 1,
 	.bMatrixCoefficients	= 4,
diff --git a/include/linux/usb/video.h b/include/linux/usb/video.h
index 2d5b7fc6a265..3b3b95e01f71 100644
--- a/include/linux/usb/video.h
+++ b/include/linux/usb/video.h
@@ -160,6 +160,16 @@
 #define UVC_STATUS_TYPE_CONTROL				1
 #define UVC_STATUS_TYPE_STREAMING			2
 
+/* 2.4.3.3. Payload Header Information */
+#define UVC_STREAM_EOH					(1 << 7)
+#define UVC_STREAM_ERR					(1 << 6)
+#define UVC_STREAM_STI					(1 << 5)
+#define UVC_STREAM_RES					(1 << 4)
+#define UVC_STREAM_SCR					(1 << 3)
+#define UVC_STREAM_PTS					(1 << 2)
+#define UVC_STREAM_EOF					(1 << 1)
+#define UVC_STREAM_FID					(1 << 0)
+
 /* 4.1.2. Control Capabilities */
 #define UVC_CONTROL_CAP_GET				(1 << 0)
 #define UVC_CONTROL_CAP_SET				(1 << 1)
@@ -167,5 +177,392 @@
 #define UVC_CONTROL_CAP_AUTOUPDATE			(1 << 3)
 #define UVC_CONTROL_CAP_ASYNCHRONOUS			(1 << 4)
 
+/* ------------------------------------------------------------------------
+ * UVC structures
+ */
+
+/* All UVC descriptors have these 3 fields at the beginning */
+struct uvc_descriptor_header {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+} __attribute__((packed));
+
+/* 3.7.2. Video Control Interface Header Descriptor */
+struct uvc_header_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u16 bcdUVC;
+	__u16 wTotalLength;
+	__u32 dwClockFrequency;
+	__u8  bInCollection;
+	__u8  baInterfaceNr[];
+} __attribute__((__packed__));
+
+#define UVC_DT_HEADER_SIZE(n)				(12+(n))
+
+#define UVC_HEADER_DESCRIPTOR(n) \
+	uvc_header_descriptor_##n
+
+#define DECLARE_UVC_HEADER_DESCRIPTOR(n)		\
+struct UVC_HEADER_DESCRIPTOR(n) {			\
+	__u8  bLength;					\
+	__u8  bDescriptorType;				\
+	__u8  bDescriptorSubType;			\
+	__u16 bcdUVC;					\
+	__u16 wTotalLength;				\
+	__u32 dwClockFrequency;				\
+	__u8  bInCollection;				\
+	__u8  baInterfaceNr[n];				\
+} __attribute__ ((packed))
+
+/* 3.7.2.1. Input Terminal Descriptor */
+struct uvc_input_terminal_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bTerminalID;
+	__u16 wTerminalType;
+	__u8  bAssocTerminal;
+	__u8  iTerminal;
+} __attribute__((__packed__));
+
+#define UVC_DT_INPUT_TERMINAL_SIZE			8
+
+/* 3.7.2.2. Output Terminal Descriptor */
+struct uvc_output_terminal_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bTerminalID;
+	__u16 wTerminalType;
+	__u8  bAssocTerminal;
+	__u8  bSourceID;
+	__u8  iTerminal;
+} __attribute__((__packed__));
+
+#define UVC_DT_OUTPUT_TERMINAL_SIZE			9
+
+/* 3.7.2.3. Camera Terminal Descriptor */
+struct uvc_camera_terminal_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bTerminalID;
+	__u16 wTerminalType;
+	__u8  bAssocTerminal;
+	__u8  iTerminal;
+	__u16 wObjectiveFocalLengthMin;
+	__u16 wObjectiveFocalLengthMax;
+	__u16 wOcularFocalLength;
+	__u8  bControlSize;
+	__u8  bmControls[3];
+} __attribute__((__packed__));
+
+#define UVC_DT_CAMERA_TERMINAL_SIZE(n)			(15+(n))
+
+/* 3.7.2.4. Selector Unit Descriptor */
+struct uvc_selector_unit_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bUnitID;
+	__u8  bNrInPins;
+	__u8  baSourceID[0];
+	__u8  iSelector;
+} __attribute__((__packed__));
+
+#define UVC_DT_SELECTOR_UNIT_SIZE(n)			(6+(n))
+
+#define UVC_SELECTOR_UNIT_DESCRIPTOR(n)	\
+	uvc_selector_unit_descriptor_##n
+
+#define DECLARE_UVC_SELECTOR_UNIT_DESCRIPTOR(n)	\
+struct UVC_SELECTOR_UNIT_DESCRIPTOR(n) {		\
+	__u8  bLength;					\
+	__u8  bDescriptorType;				\
+	__u8  bDescriptorSubType;			\
+	__u8  bUnitID;					\
+	__u8  bNrInPins;				\
+	__u8  baSourceID[n];				\
+	__u8  iSelector;				\
+} __attribute__ ((packed))
+
+/* 3.7.2.5. Processing Unit Descriptor */
+struct uvc_processing_unit_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bUnitID;
+	__u8  bSourceID;
+	__u16 wMaxMultiplier;
+	__u8  bControlSize;
+	__u8  bmControls[2];
+	__u8  iProcessing;
+} __attribute__((__packed__));
+
+#define UVC_DT_PROCESSING_UNIT_SIZE(n)			(9+(n))
+
+/* 3.7.2.6. Extension Unit Descriptor */
+struct uvc_extension_unit_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bUnitID;
+	__u8  guidExtensionCode[16];
+	__u8  bNumControls;
+	__u8  bNrInPins;
+	__u8  baSourceID[0];
+	__u8  bControlSize;
+	__u8  bmControls[0];
+	__u8  iExtension;
+} __attribute__((__packed__));
+
+#define UVC_DT_EXTENSION_UNIT_SIZE(p, n)		(24+(p)+(n))
+
+#define UVC_EXTENSION_UNIT_DESCRIPTOR(p, n) \
+	uvc_extension_unit_descriptor_##p_##n
+
+#define DECLARE_UVC_EXTENSION_UNIT_DESCRIPTOR(p, n)	\
+struct UVC_EXTENSION_UNIT_DESCRIPTOR(p, n) {		\
+	__u8  bLength;					\
+	__u8  bDescriptorType;				\
+	__u8  bDescriptorSubType;			\
+	__u8  bUnitID;					\
+	__u8  guidExtensionCode[16];			\
+	__u8  bNumControls;				\
+	__u8  bNrInPins;				\
+	__u8  baSourceID[p];				\
+	__u8  bControlSize;				\
+	__u8  bmControls[n];				\
+	__u8  iExtension;				\
+} __attribute__ ((packed))
+
+/* 3.8.2.2. Video Control Interrupt Endpoint Descriptor */
+struct uvc_control_endpoint_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u16 wMaxTransferSize;
+} __attribute__((__packed__));
+
+#define UVC_DT_CONTROL_ENDPOINT_SIZE			5
+
+/* 3.9.2.1. Input Header Descriptor */
+struct uvc_input_header_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bNumFormats;
+	__u16 wTotalLength;
+	__u8  bEndpointAddress;
+	__u8  bmInfo;
+	__u8  bTerminalLink;
+	__u8  bStillCaptureMethod;
+	__u8  bTriggerSupport;
+	__u8  bTriggerUsage;
+	__u8  bControlSize;
+	__u8  bmaControls[];
+} __attribute__((__packed__));
+
+#define UVC_DT_INPUT_HEADER_SIZE(n, p)			(13+(n*p))
+
+#define UVC_INPUT_HEADER_DESCRIPTOR(n, p) \
+	uvc_input_header_descriptor_##n_##p
+
+#define DECLARE_UVC_INPUT_HEADER_DESCRIPTOR(n, p)	\
+struct UVC_INPUT_HEADER_DESCRIPTOR(n, p) {		\
+	__u8  bLength;					\
+	__u8  bDescriptorType;				\
+	__u8  bDescriptorSubType;			\
+	__u8  bNumFormats;				\
+	__u16 wTotalLength;				\
+	__u8  bEndpointAddress;				\
+	__u8  bmInfo;					\
+	__u8  bTerminalLink;				\
+	__u8  bStillCaptureMethod;			\
+	__u8  bTriggerSupport;				\
+	__u8  bTriggerUsage;				\
+	__u8  bControlSize;				\
+	__u8  bmaControls[p][n];			\
+} __attribute__ ((packed))
+
+/* 3.9.2.2. Output Header Descriptor */
+struct uvc_output_header_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bNumFormats;
+	__u16 wTotalLength;
+	__u8  bEndpointAddress;
+	__u8  bTerminalLink;
+	__u8  bControlSize;
+	__u8  bmaControls[];
+} __attribute__((__packed__));
+
+#define UVC_DT_OUTPUT_HEADER_SIZE(n, p)			(9+(n*p))
+
+#define UVC_OUTPUT_HEADER_DESCRIPTOR(n, p) \
+	uvc_output_header_descriptor_##n_##p
+
+#define DECLARE_UVC_OUTPUT_HEADER_DESCRIPTOR(n, p)	\
+struct UVC_OUTPUT_HEADER_DESCRIPTOR(n, p) {		\
+	__u8  bLength;					\
+	__u8  bDescriptorType;				\
+	__u8  bDescriptorSubType;			\
+	__u8  bNumFormats;				\
+	__u16 wTotalLength;				\
+	__u8  bEndpointAddress;				\
+	__u8  bTerminalLink;				\
+	__u8  bControlSize;				\
+	__u8  bmaControls[p][n];			\
+} __attribute__ ((packed))
+
+/* 3.9.2.6. Color matching descriptor */
+struct uvc_color_matching_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bColorPrimaries;
+	__u8  bTransferCharacteristics;
+	__u8  bMatrixCoefficients;
+} __attribute__((__packed__));
+
+#define UVC_DT_COLOR_MATCHING_SIZE			6
+
+/* 4.3.1.1. Video Probe and Commit Controls */
+struct uvc_streaming_control {
+	__u16 bmHint;
+	__u8  bFormatIndex;
+	__u8  bFrameIndex;
+	__u32 dwFrameInterval;
+	__u16 wKeyFrameRate;
+	__u16 wPFrameRate;
+	__u16 wCompQuality;
+	__u16 wCompWindowSize;
+	__u16 wDelay;
+	__u32 dwMaxVideoFrameSize;
+	__u32 dwMaxPayloadTransferSize;
+	__u32 dwClockFrequency;
+	__u8  bmFramingInfo;
+	__u8  bPreferedVersion;
+	__u8  bMinVersion;
+	__u8  bMaxVersion;
+} __attribute__((__packed__));
+
+/* Uncompressed Payload - 3.1.1. Uncompressed Video Format Descriptor */
+struct uvc_format_uncompressed {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bFormatIndex;
+	__u8  bNumFrameDescriptors;
+	__u8  guidFormat[16];
+	__u8  bBitsPerPixel;
+	__u8  bDefaultFrameIndex;
+	__u8  bAspectRatioX;
+	__u8  bAspectRatioY;
+	__u8  bmInterfaceFlags;
+	__u8  bCopyProtect;
+} __attribute__((__packed__));
+
+#define UVC_DT_FORMAT_UNCOMPRESSED_SIZE			27
+
+/* Uncompressed Payload - 3.1.2. Uncompressed Video Frame Descriptor */
+struct uvc_frame_uncompressed {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bFrameIndex;
+	__u8  bmCapabilities;
+	__u16 wWidth;
+	__u16 wHeight;
+	__u32 dwMinBitRate;
+	__u32 dwMaxBitRate;
+	__u32 dwMaxVideoFrameBufferSize;
+	__u32 dwDefaultFrameInterval;
+	__u8  bFrameIntervalType;
+	__u32 dwFrameInterval[];
+} __attribute__((__packed__));
+
+#define UVC_DT_FRAME_UNCOMPRESSED_SIZE(n)		(26+4*(n))
+
+#define UVC_FRAME_UNCOMPRESSED(n) \
+	uvc_frame_uncompressed_##n
+
+#define DECLARE_UVC_FRAME_UNCOMPRESSED(n)		\
+struct UVC_FRAME_UNCOMPRESSED(n) {			\
+	__u8  bLength;					\
+	__u8  bDescriptorType;				\
+	__u8  bDescriptorSubType;			\
+	__u8  bFrameIndex;				\
+	__u8  bmCapabilities;				\
+	__u16 wWidth;					\
+	__u16 wHeight;					\
+	__u32 dwMinBitRate;				\
+	__u32 dwMaxBitRate;				\
+	__u32 dwMaxVideoFrameBufferSize;		\
+	__u32 dwDefaultFrameInterval;			\
+	__u8  bFrameIntervalType;			\
+	__u32 dwFrameInterval[n];			\
+} __attribute__ ((packed))
+
+/* MJPEG Payload - 3.1.1. MJPEG Video Format Descriptor */
+struct uvc_format_mjpeg {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bFormatIndex;
+	__u8  bNumFrameDescriptors;
+	__u8  bmFlags;
+	__u8  bDefaultFrameIndex;
+	__u8  bAspectRatioX;
+	__u8  bAspectRatioY;
+	__u8  bmInterfaceFlags;
+	__u8  bCopyProtect;
+} __attribute__((__packed__));
+
+#define UVC_DT_FORMAT_MJPEG_SIZE			11
+
+/* MJPEG Payload - 3.1.2. MJPEG Video Frame Descriptor */
+struct uvc_frame_mjpeg {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bFrameIndex;
+	__u8  bmCapabilities;
+	__u16 wWidth;
+	__u16 wHeight;
+	__u32 dwMinBitRate;
+	__u32 dwMaxBitRate;
+	__u32 dwMaxVideoFrameBufferSize;
+	__u32 dwDefaultFrameInterval;
+	__u8  bFrameIntervalType;
+	__u32 dwFrameInterval[];
+} __attribute__((__packed__));
+
+#define UVC_DT_FRAME_MJPEG_SIZE(n)			(26+4*(n))
+
+#define UVC_FRAME_MJPEG(n) \
+	uvc_frame_mjpeg_##n
+
+#define DECLARE_UVC_FRAME_MJPEG(n)			\
+struct UVC_FRAME_MJPEG(n) {				\
+	__u8  bLength;					\
+	__u8  bDescriptorType;				\
+	__u8  bDescriptorSubType;			\
+	__u8  bFrameIndex;				\
+	__u8  bmCapabilities;				\
+	__u16 wWidth;					\
+	__u16 wHeight;					\
+	__u32 dwMinBitRate;				\
+	__u32 dwMaxBitRate;				\
+	__u32 dwMaxVideoFrameBufferSize;		\
+	__u32 dwDefaultFrameInterval;			\
+	__u8  bFrameIntervalType;			\
+	__u32 dwFrameInterval[n];			\
+} __attribute__ ((packed))
+
 #endif /* __LINUX_USB_VIDEO_H */
 
-- 
cgit v1.2.3-59-g8ed1b


From 67b284d476bcb3d100e946da23d6cf9acfd0465c Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Mon, 2 Aug 2010 11:26:02 +0000
Subject: tg3: Remove 5720, 5750, and 5750M

These devices were never released to the public.

Reviewed-by: Benjamin Li <benli@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c       | 3 ---
 include/linux/pci_ids.h | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index a52f52fbb477..32e3a3de4c6d 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -221,12 +221,9 @@ static DEFINE_PCI_DEVICE_TABLE(tg3_pci_tbl) = {
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5901_2)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5704S_2)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5705F)},
-	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5720)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5721)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5722)},
-	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5750)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5751)},
-	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5750M)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5751M)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5751F)},
 	{PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5752)},
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index ae66851870be..9ac60dabb6ff 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2053,7 +2053,6 @@
 #define PCI_DEVICE_ID_NX2_57711E	0x1650
 #define PCI_DEVICE_ID_TIGON3_5705	0x1653
 #define PCI_DEVICE_ID_TIGON3_5705_2	0x1654
-#define PCI_DEVICE_ID_TIGON3_5720	0x1658
 #define PCI_DEVICE_ID_TIGON3_5721	0x1659
 #define PCI_DEVICE_ID_TIGON3_5722	0x165a
 #define PCI_DEVICE_ID_TIGON3_5723	0x165b
@@ -2067,13 +2066,11 @@
 #define PCI_DEVICE_ID_TIGON3_5754M	0x1672
 #define PCI_DEVICE_ID_TIGON3_5755M	0x1673
 #define PCI_DEVICE_ID_TIGON3_5756	0x1674
-#define PCI_DEVICE_ID_TIGON3_5750	0x1676
 #define PCI_DEVICE_ID_TIGON3_5751	0x1677
 #define PCI_DEVICE_ID_TIGON3_5715	0x1678
 #define PCI_DEVICE_ID_TIGON3_5715S	0x1679
 #define PCI_DEVICE_ID_TIGON3_5754	0x167a
 #define PCI_DEVICE_ID_TIGON3_5755	0x167b
-#define PCI_DEVICE_ID_TIGON3_5750M	0x167c
 #define PCI_DEVICE_ID_TIGON3_5751M	0x167d
 #define PCI_DEVICE_ID_TIGON3_5751F	0x167e
 #define PCI_DEVICE_ID_TIGON3_5787F	0x167f
-- 
cgit v1.2.3-59-g8ed1b


From 9292d8f20ff3c034c99c2adfe27496957b3defe3 Mon Sep 17 00:00:00 2001
From: Krzysztof Hałasa <khc@pm.waw.pl>
Date: Mon, 2 Aug 2010 16:03:29 -0700
Subject: Tulip: don't initialize SBE xT3E3 WAN ports.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SBE 2T3E3 cards use DECchips 21143 but they need a different driver.
Don't even try to use a normal tulip driver with them.

Signed-off-by: Krzysztof Hałasa <khc@pm.waw.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tulip/tulip_core.c | 6 ++++++
 include/linux/pci_ids.h        | 3 +++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/tulip/tulip_core.c b/drivers/net/tulip/tulip_core.c
index 14e5312e906e..3a8d7efa2acf 100644
--- a/drivers/net/tulip/tulip_core.c
+++ b/drivers/net/tulip/tulip_core.c
@@ -1341,6 +1341,12 @@ static int __devinit tulip_init_one (struct pci_dev *pdev,
         if (pdev->subsystem_vendor == PCI_VENDOR_ID_LMC) {
 		pr_err(PFX "skipping LMC card\n");
 		return -ENODEV;
+	} else if (pdev->subsystem_vendor == PCI_VENDOR_ID_SBE &&
+		   (pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_T3E3 ||
+		    pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_2T3E3_P0 ||
+		    pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_2T3E3_P1)) {
+		pr_err(PFX "skipping SBE T3E3 port\n");
+		return -ENODEV;
 	}
 
 	/*
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 9ac60dabb6ff..384c2a25db1f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1493,6 +1493,9 @@
 #define PCI_DEVICE_ID_SBE_WANXL100	0x0301
 #define PCI_DEVICE_ID_SBE_WANXL200	0x0302
 #define PCI_DEVICE_ID_SBE_WANXL400	0x0104
+#define PCI_SUBDEVICE_ID_SBE_T3E3	0x0009
+#define PCI_SUBDEVICE_ID_SBE_2T3E3_P0	0x0901
+#define PCI_SUBDEVICE_ID_SBE_2T3E3_P1	0x0902
 
 #define PCI_VENDOR_ID_TOSHIBA		0x1179
 #define PCI_DEVICE_ID_TOSHIBA_PICCOLO_1	0x0101
-- 
cgit v1.2.3-59-g8ed1b


From 7957e9c4d175cc065f4277211fcb7d784fcee860 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Mon, 2 Aug 2010 19:33:51 -0700
Subject: Input: add static inline accessors for ABS properties

In preparation for dynamically allocated ABS axis, introduce a number of
static inline access helpers. This should make the transition less
painful.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/input.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index 339d043ccb53..4a5531161de1 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1469,6 +1469,36 @@ static inline void input_set_abs_params(struct input_dev *dev, int axis, int min
 	dev->absbit[BIT_WORD(axis)] |= BIT_MASK(axis);
 }
 
+#define INPUT_GENERATE_ABS_ACCESSORS(_suffix, _item)			\
+static inline int input_abs_get_##_suffix(struct input_dev *dev,	\
+					  unsigned int axis)		\
+{									\
+	return dev->abs##_item[axis];					\
+}									\
+									\
+static inline void input_abs_set_##_suffix(struct input_dev *dev,	\
+					   unsigned int axis, int val)	\
+{									\
+	dev->abs##_item[axis] = val;					\
+}
+
+INPUT_GENERATE_ABS_ACCESSORS(min, min)
+INPUT_GENERATE_ABS_ACCESSORS(max, max)
+INPUT_GENERATE_ABS_ACCESSORS(fuzz, fuzz)
+INPUT_GENERATE_ABS_ACCESSORS(flat, flat)
+INPUT_GENERATE_ABS_ACCESSORS(res, res)
+
+static inline int input_abs_get_val(struct input_dev *dev, unsigned int axis)
+{
+	return dev->abs[axis];
+}
+
+static inline void input_abs_set_val(struct input_dev *dev,
+				     unsigned int axis, int val)
+{
+	dev->abs[axis] = val;
+}
+
 int input_get_keycode(struct input_dev *dev,
 		      unsigned int scancode, unsigned int *keycode);
 int input_set_keycode(struct input_dev *dev,
-- 
cgit v1.2.3-59-g8ed1b


From d31b2865a4e8a9dd02f39e56c8fadb824c5e187b Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Mon, 2 Aug 2010 20:18:21 -0700
Subject: Input: dynamically allocate ABS information

As all callers are now changed to only use the input_abs_*() access
helpers, switching over to dynamically allocated ABS information is
easy. This reduces size of struct input_dev from 3152 to 1640 on
64 bit architectures.

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/evdev.c | 21 +++++--------------
 drivers/input/input.c | 42 +++++++++++++++++++++++++++++++++++--
 include/linux/input.h | 57 ++++++++++++++++-----------------------------------
 3 files changed, 63 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 9807c8ff6a84..08f48c03eec4 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -649,13 +649,7 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
 			if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) {
 
 				t = _IOC_NR(cmd) & ABS_MAX;
-
-				abs.value = input_abs_get_val(dev, t);
-				abs.minimum = input_abs_get_min(dev, t);
-				abs.maximum = input_abs_get_max(dev, t);
-				abs.fuzz = input_abs_get_fuzz(dev, t);
-				abs.flat = input_abs_get_flat(dev, t);
-				abs.resolution = input_abs_get_res(dev, t);
+				abs = dev->absinfo[t];
 
 				if (copy_to_user(p, &abs, min_t(size_t,
 								_IOC_SIZE(cmd),
@@ -691,6 +685,9 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
 								  sizeof(struct input_absinfo))))
 					return -EFAULT;
 
+				if (_IOC_SIZE(cmd) < sizeof(struct input_absinfo))
+					abs.resolution = 0;
+
 				/* We can't change number of reserved MT slots */
 				if (t == ABS_MT_SLOT)
 					return -EINVAL;
@@ -701,15 +698,7 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
 				 * of event.
 				 */
 				spin_lock_irq(&dev->event_lock);
-
-				input_abs_set_val(dev, t, abs.value);
-				input_abs_set_min(dev, t, abs.minimum);
-				input_abs_set_max(dev, t, abs.maximum);
-				input_abs_set_fuzz(dev, t, abs.fuzz);
-				input_abs_set_flat(dev, t, abs.flat);
-				input_abs_set_res(dev, t, _IOC_SIZE(cmd) < sizeof(struct input_absinfo) ?
-								0 : abs.resolution);
-
+				dev->absinfo[t] = abs;
 				spin_unlock_irq(&dev->event_lock);
 
 				return 0;
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 7259adb8619d..a9b025f4147a 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -182,7 +182,7 @@ static int input_handle_abs_event(struct input_dev *dev,
 	is_mt_event = code >= ABS_MT_FIRST && code <= ABS_MT_LAST;
 
 	if (!is_mt_event) {
-		pold = &dev->abs[code];
+		pold = &dev->absinfo[code].value;
 	} else if (dev->mt) {
 		struct input_mt_slot *mtslot = &dev->mt[dev->slot];
 		pold = &mtslot->abs[code - ABS_MT_FIRST];
@@ -196,7 +196,7 @@ static int input_handle_abs_event(struct input_dev *dev,
 
 	if (pold) {
 		*pval = input_defuzz_abs_event(*pval, *pold,
-						dev->absfuzz[code]);
+						dev->absinfo[code].fuzz);
 		if (*pold == *pval)
 			return INPUT_IGNORE_EVENT;
 
@@ -390,6 +390,43 @@ void input_inject_event(struct input_handle *handle,
 }
 EXPORT_SYMBOL(input_inject_event);
 
+/**
+ * input_alloc_absinfo - allocates array of input_absinfo structs
+ * @dev: the input device emitting absolute events
+ *
+ * If the absinfo struct the caller asked for is already allocated, this
+ * functions will not do anything.
+ */
+void input_alloc_absinfo(struct input_dev *dev)
+{
+	if (!dev->absinfo)
+		dev->absinfo = kcalloc(ABS_CNT, sizeof(struct input_absinfo),
+					GFP_KERNEL);
+
+	WARN(!dev->absinfo, "%s(): kcalloc() failed?\n", __func__);
+}
+EXPORT_SYMBOL(input_alloc_absinfo);
+
+void input_set_abs_params(struct input_dev *dev, unsigned int axis,
+			  int min, int max, int fuzz, int flat)
+{
+	struct input_absinfo *absinfo;
+
+	input_alloc_absinfo(dev);
+	if (!dev->absinfo)
+		return;
+
+	absinfo = &dev->absinfo[axis];
+	absinfo->minimum = min;
+	absinfo->maximum = max;
+	absinfo->fuzz = fuzz;
+	absinfo->flat = flat;
+
+	dev->absbit[BIT_WORD(axis)] |= BIT_MASK(axis);
+}
+EXPORT_SYMBOL(input_set_abs_params);
+
+
 /**
  * input_grab_device - grabs device for exclusive use
  * @handle: input handle that wants to own the device
@@ -1308,6 +1345,7 @@ static void input_dev_release(struct device *device)
 
 	input_ff_destroy(dev);
 	input_mt_destroy_slots(dev);
+	kfree(dev->absinfo);
 	kfree(dev);
 
 	module_put(THIS_MODULE);
diff --git a/include/linux/input.h b/include/linux/input.h
index 4a5531161de1..896a92227bc4 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -776,6 +776,7 @@ struct input_absinfo {
 #define REP_DELAY		0x00
 #define REP_PERIOD		0x01
 #define REP_MAX			0x01
+#define REP_CNT			(REP_MAX+1)
 
 /*
  * Sounds
@@ -1099,21 +1100,18 @@ struct input_mt_slot {
  * @repeat_key: stores key code of the last key pressed; used to implement
  *	software autorepeat
  * @timer: timer for software autorepeat
- * @abs: current values for reports from absolute axes
  * @rep: current values for autorepeat parameters (delay, rate)
  * @mt: pointer to array of struct input_mt_slot holding current values
  *	of tracked contacts
  * @mtsize: number of MT slots the device uses
  * @slot: MT slot currently being transmitted
+ * @absinfo: array of &struct absinfo elements holding information
+ *	about absolute axes (current value, min, max, flat, fuzz,
+ *	resolution)
  * @key: reflects current state of device's keys/buttons
  * @led: reflects current state of device's LEDs
  * @snd: reflects current state of sound effects
  * @sw: reflects current state of device's switches
- * @absmax: maximum values for events coming from absolute axes
- * @absmin: minimum values for events coming from absolute axes
- * @absfuzz: describes noisiness for axes
- * @absflat: size of the center flat position (used by joydev)
- * @absres: resolution used for events coming form absolute axes
  * @open: this method is called when the very first user calls
  *	input_open_device(). The driver must prepare the device
  *	to start generating events (start polling thread,
@@ -1180,24 +1178,19 @@ struct input_dev {
 	unsigned int repeat_key;
 	struct timer_list timer;
 
-	int abs[ABS_CNT];
-	int rep[REP_MAX + 1];
+	int rep[REP_CNT];
 
 	struct input_mt_slot *mt;
 	int mtsize;
 	int slot;
 
+	struct input_absinfo *absinfo;
+
 	unsigned long key[BITS_TO_LONGS(KEY_CNT)];
 	unsigned long led[BITS_TO_LONGS(LED_CNT)];
 	unsigned long snd[BITS_TO_LONGS(SND_CNT)];
 	unsigned long sw[BITS_TO_LONGS(SW_CNT)];
 
-	int absmax[ABS_CNT];
-	int absmin[ABS_CNT];
-	int absfuzz[ABS_CNT];
-	int absflat[ABS_CNT];
-	int absres[ABS_CNT];
-
 	int (*open)(struct input_dev *dev);
 	void (*close)(struct input_dev *dev);
 	int (*flush)(struct input_dev *dev, struct file *file);
@@ -1459,45 +1452,31 @@ static inline void input_set_events_per_packet(struct input_dev *dev, int n_even
 	dev->hint_events_per_packet = n_events;
 }
 
-static inline void input_set_abs_params(struct input_dev *dev, int axis, int min, int max, int fuzz, int flat)
-{
-	dev->absmin[axis] = min;
-	dev->absmax[axis] = max;
-	dev->absfuzz[axis] = fuzz;
-	dev->absflat[axis] = flat;
-
-	dev->absbit[BIT_WORD(axis)] |= BIT_MASK(axis);
-}
+void input_alloc_absinfo(struct input_dev *dev);
+void input_set_abs_params(struct input_dev *dev, unsigned int axis,
+			  int min, int max, int fuzz, int flat);
 
 #define INPUT_GENERATE_ABS_ACCESSORS(_suffix, _item)			\
 static inline int input_abs_get_##_suffix(struct input_dev *dev,	\
 					  unsigned int axis)		\
 {									\
-	return dev->abs##_item[axis];					\
+	return dev->absinfo ? dev->absinfo[axis]._item : 0;		\
 }									\
 									\
 static inline void input_abs_set_##_suffix(struct input_dev *dev,	\
 					   unsigned int axis, int val)	\
 {									\
-	dev->abs##_item[axis] = val;					\
+	input_alloc_absinfo(dev);					\
+	if (dev->absinfo)						\
+		dev->absinfo[axis]._item = val;				\
 }
 
-INPUT_GENERATE_ABS_ACCESSORS(min, min)
-INPUT_GENERATE_ABS_ACCESSORS(max, max)
+INPUT_GENERATE_ABS_ACCESSORS(val, value)
+INPUT_GENERATE_ABS_ACCESSORS(min, minimum)
+INPUT_GENERATE_ABS_ACCESSORS(max, maximum)
 INPUT_GENERATE_ABS_ACCESSORS(fuzz, fuzz)
 INPUT_GENERATE_ABS_ACCESSORS(flat, flat)
-INPUT_GENERATE_ABS_ACCESSORS(res, res)
-
-static inline int input_abs_get_val(struct input_dev *dev, unsigned int axis)
-{
-	return dev->abs[axis];
-}
-
-static inline void input_abs_set_val(struct input_dev *dev,
-				     unsigned int axis, int val)
-{
-	dev->abs[axis] = val;
-}
+INPUT_GENERATE_ABS_ACCESSORS(res, resolution)
 
 int input_get_keycode(struct input_dev *dev,
 		      unsigned int scancode, unsigned int *keycode);
-- 
cgit v1.2.3-59-g8ed1b


From cff0d6e6edac7672b3f915bb4fb59f279243b7f9 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Tue, 3 Aug 2010 00:31:48 -0700
Subject: can-raw: Fix skb_orphan_try handling

Commit fc6055a5ba31e2c14e36e8939f9bf2b6d586a7f5 (net: Introduce
skb_orphan_try()) allows an early orphan of the skb and takes care on
tx timestamping, which needs the sk-reference in the skb on driver level.
So does the can-raw socket, which has not been taken into account here.

The patch below adds a 'prevent_sk_orphan' bit in the skb tx shared info,
which fixes the problem discovered by Matthias Fuchs here:

      http://marc.info/?t=128030411900003&r=1&w=2

Even if it's not a primary tx timestamp topic it fits well into some skb
shared tx context. Or should be find a different place for the information to
protect the sk reference until it reaches the driver level?

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 +++-
 net/can/raw.c          | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d89876b806a0..d20d9e7a9bbd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -169,6 +169,7 @@ struct skb_shared_hwtstamps {
  * @software:		generate software time stamp
  * @in_progress:	device driver is going to provide
  *			hardware time stamp
+ * @prevent_sk_orphan:	make sk reference available on driver level
  * @flags:		all shared_tx flags
  *
  * These flags are attached to packets as part of the
@@ -178,7 +179,8 @@ union skb_shared_tx {
 	struct {
 		__u8	hardware:1,
 			software:1,
-			in_progress:1;
+			in_progress:1,
+			prevent_sk_orphan:1;
 	};
 	__u8 flags;
 };
diff --git a/net/can/raw.c b/net/can/raw.c
index ccfe633eec8e..a10e3338f084 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -650,6 +650,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
 	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
 	if (err < 0)
 		goto free_skb;
+
+	/* to be able to check the received tx sock reference in raw_rcv() */
+	skb_tx(skb)->prevent_sk_orphan = 1;
+
 	skb->dev = dev;
 	skb->sk  = sk;
 
-- 
cgit v1.2.3-59-g8ed1b


From 4565956dc0847985c0403c9ebbf274b6a122e1e2 Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Tue, 3 Aug 2010 00:42:17 -0700
Subject: l2tp: fix export of header file for userspace

The header file l2tp.h should be exported to the installed include/linux/
tree for userspace programs.

This patch fixes compilation errors in L2TP userspace apps which want to
use the new L2TP support introduced in 2.6.35.

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/Kbuild | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 2fc8e14cc24a..9aa9bcadf869 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -276,6 +276,7 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \
 		  $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),)
 unifdef-y += kvm_para.h
 endif
+unifdef-y += l2tp.h
 unifdef-y += llc.h
 unifdef-y += loop.h
 unifdef-y += lp.h
-- 
cgit v1.2.3-59-g8ed1b


From 078ff546a806b2c2ab74c25c8edd4c6d4680656a Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@nokia.com>
Date: Wed, 17 Mar 2010 20:36:51 +0200
Subject: OMAP: DSS2: OMAPFB: Add support for switching memory regions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Separate the memory region from the framebuffer device a little bit.
It's now possible to select the memory region used by the framebuffer
device using the new mem_idx parameter of omapfb_plane_info. If the
mem_idx is specified it will be interpreted as an index into the
memory regions array, if it's not specified the framebuffer's index is
used instead. So by default each framebuffer keeps using it's own
memory region which preserves backwards compatibility.

This allows cloning the same memory region to several overlays and yet
each overlay can be controlled independently since they can be
associated with separate framebuffer devices.

Signed-off-by: Ville Syrjälä <ville.syrjala@nokia.com>
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@nokia.com>
---
 drivers/video/omap2/omapfb/omapfb-ioctl.c | 125 ++++++++++++++++++++++++------
 drivers/video/omap2/omapfb/omapfb-main.c  |  71 ++++++++++-------
 drivers/video/omap2/omapfb/omapfb-sysfs.c |  37 ++++++---
 drivers/video/omap2/omapfb/omapfb.h       |   9 ++-
 include/linux/omapfb.h                    |   5 +-
 5 files changed, 183 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/omap2/omapfb/omapfb-ioctl.c b/drivers/video/omap2/omapfb/omapfb-ioctl.c
index 9c7361871d78..6635bd75affa 100644
--- a/drivers/video/omap2/omapfb/omapfb-ioctl.c
+++ b/drivers/video/omap2/omapfb/omapfb-ioctl.c
@@ -34,12 +34,37 @@
 
 #include "omapfb.h"
 
+static u8 get_mem_idx(struct omapfb_info *ofbi)
+{
+	if (ofbi->id == ofbi->region->id)
+		return 0;
+
+	return OMAPFB_MEM_IDX_ENABLED | ofbi->region->id;
+}
+
+static struct omapfb2_mem_region *get_mem_region(struct omapfb_info *ofbi,
+						 u8 mem_idx)
+{
+	struct omapfb2_device *fbdev = ofbi->fbdev;
+
+	if (mem_idx & OMAPFB_MEM_IDX_ENABLED)
+		mem_idx &= OMAPFB_MEM_IDX_MASK;
+	else
+		mem_idx = ofbi->id;
+
+	if (mem_idx >= fbdev->num_fbs)
+		return NULL;
+
+	return &fbdev->regions[mem_idx];
+}
+
 static int omapfb_setup_plane(struct fb_info *fbi, struct omapfb_plane_info *pi)
 {
 	struct omapfb_info *ofbi = FB2OFB(fbi);
 	struct omapfb2_device *fbdev = ofbi->fbdev;
 	struct omap_overlay *ovl;
-	struct omap_overlay_info info;
+	struct omap_overlay_info old_info;
+	struct omapfb2_mem_region *old_rg, *new_rg;
 	int r = 0;
 
 	DBG("omapfb_setup_plane\n");
@@ -52,7 +77,14 @@ static int omapfb_setup_plane(struct fb_info *fbi, struct omapfb_plane_info *pi)
 	/* XXX uses only the first overlay */
 	ovl = ofbi->overlays[0];
 
-	if (pi->enabled && !ofbi->region.size) {
+	old_rg = ofbi->region;
+	new_rg = get_mem_region(ofbi, pi->mem_idx);
+	if (!new_rg) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (pi->enabled && !new_rg->size) {
 		/*
 		 * This plane's memory was freed, can't enable it
 		 * until it's reallocated.
@@ -61,27 +93,60 @@ static int omapfb_setup_plane(struct fb_info *fbi, struct omapfb_plane_info *pi)
 		goto out;
 	}
 
-	ovl->get_overlay_info(ovl, &info);
+	ovl->get_overlay_info(ovl, &old_info);
 
-	info.pos_x = pi->pos_x;
-	info.pos_y = pi->pos_y;
-	info.out_width = pi->out_width;
-	info.out_height = pi->out_height;
-	info.enabled = pi->enabled;
+	if (old_rg != new_rg) {
+		ofbi->region = new_rg;
+		set_fb_fix(fbi);
+	}
 
-	r = ovl->set_overlay_info(ovl, &info);
-	if (r)
-		goto out;
+	if (pi->enabled) {
+		struct omap_overlay_info info;
+
+		r = omapfb_setup_overlay(fbi, ovl, pi->pos_x, pi->pos_y,
+			pi->out_width, pi->out_height);
+		if (r)
+			goto undo;
 
-	if (ovl->manager) {
-		r = ovl->manager->apply(ovl->manager);
+		ovl->get_overlay_info(ovl, &info);
+
+		if (!info.enabled) {
+			info.enabled = pi->enabled;
+			r = ovl->set_overlay_info(ovl, &info);
+			if (r)
+				goto undo;
+		}
+	} else {
+		struct omap_overlay_info info;
+
+		ovl->get_overlay_info(ovl, &info);
+
+		info.enabled = pi->enabled;
+		info.pos_x = pi->pos_x;
+		info.pos_y = pi->pos_y;
+		info.out_width = pi->out_width;
+		info.out_height = pi->out_height;
+
+		r = ovl->set_overlay_info(ovl, &info);
 		if (r)
-			goto out;
+			goto undo;
 	}
 
-out:
-	if (r)
-		dev_err(fbdev->dev, "setup_plane failed\n");
+	if (ovl->manager)
+		ovl->manager->apply(ovl->manager);
+
+	return 0;
+
+ undo:
+	if (old_rg != new_rg) {
+		ofbi->region = old_rg;
+		set_fb_fix(fbi);
+	}
+
+	ovl->set_overlay_info(ovl, &old_info);
+ out:
+	dev_err(fbdev->dev, "setup_plane failed\n");
+
 	return r;
 }
 
@@ -92,8 +157,8 @@ static int omapfb_query_plane(struct fb_info *fbi, struct omapfb_plane_info *pi)
 	if (ofbi->num_overlays != 1) {
 		memset(pi, 0, sizeof(*pi));
 	} else {
-		struct omap_overlay_info *ovli;
 		struct omap_overlay *ovl;
+		struct omap_overlay_info *ovli;
 
 		ovl = ofbi->overlays[0];
 		ovli = &ovl->info;
@@ -103,6 +168,7 @@ static int omapfb_query_plane(struct fb_info *fbi, struct omapfb_plane_info *pi)
 		pi->enabled = ovli->enabled;
 		pi->channel_out = 0; /* xxx */
 		pi->mirror = 0;
+		pi->mem_idx = get_mem_idx(ofbi);
 		pi->out_width = ovli->out_width;
 		pi->out_height = ovli->out_height;
 	}
@@ -123,11 +189,24 @@ static int omapfb_setup_mem(struct fb_info *fbi, struct omapfb_mem_info *mi)
 
 	size = PAGE_ALIGN(mi->size);
 
-	rg = &ofbi->region;
+	rg = ofbi->region;
 
-	for (i = 0; i < ofbi->num_overlays; i++) {
-		if (ofbi->overlays[i]->info.enabled)
-			return -EBUSY;
+	if (atomic_read(&rg->map_count))
+		return -EBUSY;
+
+	for (i = 0; i < fbdev->num_fbs; i++) {
+		struct omapfb_info *ofbi2 = FB2OFB(fbdev->fbs[i]);
+		int j;
+
+		if (ofbi2->region != rg)
+			continue;
+
+		for (j = 0; j < ofbi2->num_overlays; j++) {
+			if (ofbi2->overlays[j]->info.enabled) {
+				r = -EBUSY;
+				return r;
+			}
+		}
 	}
 
 	if (rg->size != size || rg->type != mi->type) {
@@ -146,7 +225,7 @@ static int omapfb_query_mem(struct fb_info *fbi, struct omapfb_mem_info *mi)
 	struct omapfb_info *ofbi = FB2OFB(fbi);
 	struct omapfb2_mem_region *rg;
 
-	rg = &ofbi->region;
+	rg = ofbi->region;
 	memset(mi, 0, sizeof(*mi));
 
 	mi->size = rg->size;
diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c
index ccccf3d71a39..4a0588022b33 100644
--- a/drivers/video/omap2/omapfb/omapfb-main.c
+++ b/drivers/video/omap2/omapfb/omapfb-main.c
@@ -157,7 +157,7 @@ static void fill_fb(struct fb_info *fbi)
 
 static unsigned omapfb_get_vrfb_offset(const struct omapfb_info *ofbi, int rot)
 {
-	const struct vrfb *vrfb = &ofbi->region.vrfb;
+	const struct vrfb *vrfb = &ofbi->region->vrfb;
 	unsigned offset;
 
 	switch (rot) {
@@ -185,27 +185,27 @@ static unsigned omapfb_get_vrfb_offset(const struct omapfb_info *ofbi, int rot)
 static u32 omapfb_get_region_rot_paddr(const struct omapfb_info *ofbi, int rot)
 {
 	if (ofbi->rotation_type == OMAP_DSS_ROT_VRFB) {
-		return ofbi->region.vrfb.paddr[rot]
+		return ofbi->region->vrfb.paddr[rot]
 			+ omapfb_get_vrfb_offset(ofbi, rot);
 	} else {
-		return ofbi->region.paddr;
+		return ofbi->region->paddr;
 	}
 }
 
 static u32 omapfb_get_region_paddr(const struct omapfb_info *ofbi)
 {
 	if (ofbi->rotation_type == OMAP_DSS_ROT_VRFB)
-		return ofbi->region.vrfb.paddr[0];
+		return ofbi->region->vrfb.paddr[0];
 	else
-		return ofbi->region.paddr;
+		return ofbi->region->paddr;
 }
 
 static void __iomem *omapfb_get_region_vaddr(const struct omapfb_info *ofbi)
 {
 	if (ofbi->rotation_type == OMAP_DSS_ROT_VRFB)
-		return ofbi->region.vrfb.vaddr[0];
+		return ofbi->region->vrfb.vaddr[0];
 	else
-		return ofbi->region.vaddr;
+		return ofbi->region->vaddr;
 }
 
 static struct omapfb_colormode omapfb_colormodes[] = {
@@ -450,7 +450,7 @@ static int check_vrfb_fb_size(unsigned long region_size,
 static int check_fb_size(const struct omapfb_info *ofbi,
 		struct fb_var_screeninfo *var)
 {
-	unsigned long max_frame_size = ofbi->region.size;
+	unsigned long max_frame_size = ofbi->region->size;
 	int bytespp = var->bits_per_pixel >> 3;
 	unsigned long line_size = var->xres_virtual * bytespp;
 
@@ -497,7 +497,7 @@ static int check_fb_size(const struct omapfb_info *ofbi,
 static int setup_vrfb_rotation(struct fb_info *fbi)
 {
 	struct omapfb_info *ofbi = FB2OFB(fbi);
-	struct omapfb2_mem_region *rg = &ofbi->region;
+	struct omapfb2_mem_region *rg = ofbi->region;
 	struct vrfb *vrfb = &rg->vrfb;
 	struct fb_var_screeninfo *var = &fbi->var;
 	struct fb_fix_screeninfo *fix = &fbi->fix;
@@ -558,9 +558,9 @@ static int setup_vrfb_rotation(struct fb_info *fbi)
 		return r;
 
 	/* used by open/write in fbmem.c */
-	fbi->screen_base = ofbi->region.vrfb.vaddr[0];
+	fbi->screen_base = ofbi->region->vrfb.vaddr[0];
 
-	fix->smem_start = ofbi->region.vrfb.paddr[0];
+	fix->smem_start = ofbi->region->vrfb.paddr[0];
 
 	switch (var->nonstd) {
 	case OMAPFB_COLOR_YUV422:
@@ -599,7 +599,7 @@ void set_fb_fix(struct fb_info *fbi)
 	struct fb_fix_screeninfo *fix = &fbi->fix;
 	struct fb_var_screeninfo *var = &fbi->var;
 	struct omapfb_info *ofbi = FB2OFB(fbi);
-	struct omapfb2_mem_region *rg = &ofbi->region;
+	struct omapfb2_mem_region *rg = ofbi->region;
 
 	DBG("set_fb_fix\n");
 
@@ -688,7 +688,7 @@ int check_fb_var(struct fb_info *fbi, struct fb_var_screeninfo *var)
 		return -EINVAL;
 
 	/* When no memory is allocated ignore the size check */
-	if (ofbi->region.size != 0 && check_fb_size(ofbi, var))
+	if (ofbi->region->size != 0 && check_fb_size(ofbi, var))
 		return -EINVAL;
 
 	if (var->xres + var->xoffset > var->xres_virtual)
@@ -856,7 +856,7 @@ static void omapfb_calc_addr(const struct omapfb_info *ofbi,
 }
 
 /* setup overlay according to the fb */
-static int omapfb_setup_overlay(struct fb_info *fbi, struct omap_overlay *ovl,
+int omapfb_setup_overlay(struct fb_info *fbi, struct omap_overlay *ovl,
 		u16 posx, u16 posy, u16 outw, u16 outh)
 {
 	int r = 0;
@@ -892,7 +892,7 @@ static int omapfb_setup_overlay(struct fb_info *fbi, struct omap_overlay *ovl,
 		yres = var->yres;
 	}
 
-	if (ofbi->region.size)
+	if (ofbi->region->size)
 		omapfb_calc_addr(ofbi, var, fix, rotation,
 				 &data_start_p, &data_start_v);
 
@@ -971,7 +971,7 @@ int omapfb_apply_changes(struct fb_info *fbi, int init)
 
 		DBG("apply_changes, fb %d, ovl %d\n", ofbi->id, ovl->id);
 
-		if (ofbi->region.size == 0) {
+		if (ofbi->region->size == 0) {
 			/* the fb is not available. disable the overlay */
 			omapfb_overlay_enable(ovl, 0);
 			if (!init && ovl->manager)
@@ -1071,16 +1071,16 @@ static int omapfb_pan_display(struct fb_var_screeninfo *var,
 
 static void mmap_user_open(struct vm_area_struct *vma)
 {
-	struct omapfb_info *ofbi = (struct omapfb_info *)vma->vm_private_data;
+	struct omapfb2_mem_region *rg = vma->vm_private_data;
 
-	atomic_inc(&ofbi->map_count);
+	atomic_inc(&rg->map_count);
 }
 
 static void mmap_user_close(struct vm_area_struct *vma)
 {
-	struct omapfb_info *ofbi = (struct omapfb_info *)vma->vm_private_data;
+	struct omapfb2_mem_region *rg = vma->vm_private_data;
 
-	atomic_dec(&ofbi->map_count);
+	atomic_dec(&rg->map_count);
 }
 
 static struct vm_operations_struct mmap_user_ops = {
@@ -1092,6 +1092,7 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma)
 {
 	struct omapfb_info *ofbi = FB2OFB(fbi);
 	struct fb_fix_screeninfo *fix = &fbi->fix;
+	struct omapfb2_mem_region *rg;
 	unsigned long off;
 	unsigned long start;
 	u32 len;
@@ -1102,6 +1103,8 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma)
 		return -EINVAL;
 	off = vma->vm_pgoff << PAGE_SHIFT;
 
+	rg = ofbi->region;
+
 	start = omapfb_get_region_paddr(ofbi);
 	len = fix->smem_len;
 	if (off >= len)
@@ -1117,12 +1120,12 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma)
 	vma->vm_flags |= VM_IO | VM_RESERVED;
 	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 	vma->vm_ops = &mmap_user_ops;
-	vma->vm_private_data = ofbi;
+	vma->vm_private_data = rg;
 	if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT,
 			     vma->vm_end - vma->vm_start, vma->vm_page_prot))
 		return -EAGAIN;
 	/* vm_ops.open won't be called for mmap itself. */
-	atomic_inc(&ofbi->map_count);
+	atomic_inc(&rg->map_count);
 	return 0;
 }
 
@@ -1312,7 +1315,9 @@ static void omapfb_free_fbmem(struct fb_info *fbi)
 	struct omapfb2_device *fbdev = ofbi->fbdev;
 	struct omapfb2_mem_region *rg;
 
-	rg = &ofbi->region;
+	rg = ofbi->region;
+
+	WARN_ON(atomic_read(&rg->map_count));
 
 	if (rg->paddr)
 		if (omap_vram_free(rg->paddr, rg->size))
@@ -1367,8 +1372,15 @@ static int omapfb_alloc_fbmem(struct fb_info *fbi, unsigned long size,
 	void __iomem *vaddr;
 	int r;
 
-	rg = &ofbi->region;
-	memset(rg, 0, sizeof(*rg));
+	rg = ofbi->region;
+
+	rg->paddr = 0;
+	rg->vaddr = NULL;
+	memset(&rg->vrfb, 0, sizeof rg->vrfb);
+	rg->size = 0;
+	rg->type = 0;
+	rg->alloc = false;
+	rg->map = false;
 
 	size = PAGE_ALIGN(size);
 
@@ -1621,7 +1633,7 @@ static int omapfb_allocate_all_fbs(struct omapfb2_device *fbdev)
 	for (i = 0; i < fbdev->num_fbs; i++) {
 		struct omapfb_info *ofbi = FB2OFB(fbdev->fbs[i]);
 		struct omapfb2_mem_region *rg;
-		rg = &ofbi->region;
+		rg = ofbi->region;
 
 		DBG("region%d phys %08x virt %p size=%lu\n",
 				i,
@@ -1638,7 +1650,7 @@ int omapfb_realloc_fbmem(struct fb_info *fbi, unsigned long size, int type)
 	struct omapfb_info *ofbi = FB2OFB(fbi);
 	struct omapfb2_device *fbdev = ofbi->fbdev;
 	struct omap_dss_device *display = fb2display(fbi);
-	struct omapfb2_mem_region *rg = &ofbi->region;
+	struct omapfb2_mem_region *rg = ofbi->region;
 	unsigned long old_size = rg->size;
 	unsigned long old_paddr = rg->paddr;
 	int old_type = rg->type;
@@ -1721,7 +1733,7 @@ static int omapfb_fb_init(struct omapfb2_device *fbdev, struct fb_info *fbi)
 	fbi->flags = FBINFO_FLAG_DEFAULT;
 	fbi->pseudo_palette = fbdev->pseudo_palette;
 
-	if (ofbi->region.size == 0) {
+	if (ofbi->region->size == 0) {
 		clear_fb_info(fbi);
 		return 0;
 	}
@@ -1883,6 +1895,9 @@ static int omapfb_create_framebuffers(struct omapfb2_device *fbdev)
 		ofbi->fbdev = fbdev;
 		ofbi->id = i;
 
+		ofbi->region = &fbdev->regions[i];
+		ofbi->region->id = i;
+
 		/* assign these early, so that fb alloc can use them */
 		ofbi->rotation_type = def_vrfb ? OMAP_DSS_ROT_VRFB :
 			OMAP_DSS_ROT_DMA;
diff --git a/drivers/video/omap2/omapfb/omapfb-sysfs.c b/drivers/video/omap2/omapfb/omapfb-sysfs.c
index 5179219128bd..dea1aa46a7db 100644
--- a/drivers/video/omap2/omapfb/omapfb-sysfs.c
+++ b/drivers/video/omap2/omapfb/omapfb-sysfs.c
@@ -64,7 +64,7 @@ static ssize_t store_rotate_type(struct device *dev,
 	if (rot_type == ofbi->rotation_type)
 		goto out;
 
-	if (ofbi->region.size) {
+	if (ofbi->region->size) {
 		r = -EBUSY;
 		goto out;
 	}
@@ -408,7 +408,7 @@ static ssize_t show_size(struct device *dev,
 	struct fb_info *fbi = dev_get_drvdata(dev);
 	struct omapfb_info *ofbi = FB2OFB(fbi);
 
-	return snprintf(buf, PAGE_SIZE, "%lu\n", ofbi->region.size);
+	return snprintf(buf, PAGE_SIZE, "%lu\n", ofbi->region->size);
 }
 
 static ssize_t store_size(struct device *dev, struct device_attribute *attr,
@@ -416,6 +416,8 @@ static ssize_t store_size(struct device *dev, struct device_attribute *attr,
 {
 	struct fb_info *fbi = dev_get_drvdata(dev);
 	struct omapfb_info *ofbi = FB2OFB(fbi);
+	struct omapfb2_device *fbdev = ofbi->fbdev;
+	struct omapfb2_mem_region *rg;
 	unsigned long size;
 	int r;
 	int i;
@@ -425,15 +427,30 @@ static ssize_t store_size(struct device *dev, struct device_attribute *attr,
 	if (!lock_fb_info(fbi))
 		return -ENODEV;
 
-	for (i = 0; i < ofbi->num_overlays; i++) {
-		if (ofbi->overlays[i]->info.enabled) {
-			r = -EBUSY;
-			goto out;
+	rg = ofbi->region;
+
+	if (atomic_read(&rg->map_count)) {
+		r = -EBUSY;
+		goto out;
+	}
+
+	for (i = 0; i < fbdev->num_fbs; i++) {
+		struct omapfb_info *ofbi2 = FB2OFB(fbdev->fbs[i]);
+		int j;
+
+		if (ofbi2->region != rg)
+			continue;
+
+		for (j = 0; j < ofbi2->num_overlays; j++) {
+			if (ofbi2->overlays[j]->info.enabled) {
+				r = -EBUSY;
+				goto out;
+			}
 		}
 	}
 
-	if (size != ofbi->region.size) {
-		r = omapfb_realloc_fbmem(fbi, size, ofbi->region.type);
+	if (size != ofbi->region->size) {
+		r = omapfb_realloc_fbmem(fbi, size, ofbi->region->type);
 		if (r) {
 			dev_err(dev, "realloc fbmem failed\n");
 			goto out;
@@ -453,7 +470,7 @@ static ssize_t show_phys(struct device *dev,
 	struct fb_info *fbi = dev_get_drvdata(dev);
 	struct omapfb_info *ofbi = FB2OFB(fbi);
 
-	return snprintf(buf, PAGE_SIZE, "%0x\n", ofbi->region.paddr);
+	return snprintf(buf, PAGE_SIZE, "%0x\n", ofbi->region->paddr);
 }
 
 static ssize_t show_virt(struct device *dev,
@@ -462,7 +479,7 @@ static ssize_t show_virt(struct device *dev,
 	struct fb_info *fbi = dev_get_drvdata(dev);
 	struct omapfb_info *ofbi = FB2OFB(fbi);
 
-	return snprintf(buf, PAGE_SIZE, "%p\n", ofbi->region.vaddr);
+	return snprintf(buf, PAGE_SIZE, "%p\n", ofbi->region->vaddr);
 }
 
 static struct device_attribute omapfb_attrs[] = {
diff --git a/drivers/video/omap2/omapfb/omapfb.h b/drivers/video/omap2/omapfb/omapfb.h
index c9866be0460a..02f1ba9b228c 100644
--- a/drivers/video/omap2/omapfb/omapfb.h
+++ b/drivers/video/omap2/omapfb/omapfb.h
@@ -44,6 +44,7 @@ extern unsigned int omapfb_debug;
 #define OMAPFB_MAX_OVL_PER_FB 3
 
 struct omapfb2_mem_region {
+	int             id;
 	u32		paddr;
 	void __iomem	*vaddr;
 	struct vrfb	vrfb;
@@ -51,13 +52,13 @@ struct omapfb2_mem_region {
 	u8		type;		/* OMAPFB_PLANE_MEM_* */
 	bool		alloc;		/* allocated by the driver */
 	bool		map;		/* kernel mapped by the driver */
+	atomic_t	map_count;
 };
 
 /* appended to fb_info */
 struct omapfb_info {
 	int id;
-	struct omapfb2_mem_region region;
-	atomic_t map_count;
+	struct omapfb2_mem_region *region;
 	int num_overlays;
 	struct omap_overlay *overlays[OMAPFB_MAX_OVL_PER_FB];
 	struct omapfb2_device *fbdev;
@@ -76,6 +77,7 @@ struct omapfb2_device {
 
 	unsigned num_fbs;
 	struct fb_info *fbs[10];
+	struct omapfb2_mem_region regions[10];
 
 	unsigned num_displays;
 	struct omap_dss_device *displays[10];
@@ -117,6 +119,9 @@ int omapfb_update_window(struct fb_info *fbi,
 int dss_mode_to_fb_mode(enum omap_color_mode dssmode,
 			struct fb_var_screeninfo *var);
 
+int omapfb_setup_overlay(struct fb_info *fbi, struct omap_overlay *ovl,
+		u16 posx, u16 posy, u16 outw, u16 outh);
+
 /* find the display connected to this fb, if any */
 static inline struct omap_dss_device *fb2display(struct fb_info *fbi)
 {
diff --git a/include/linux/omapfb.h b/include/linux/omapfb.h
index 9bdd91486b49..0ecf7311c1ae 100644
--- a/include/linux/omapfb.h
+++ b/include/linux/omapfb.h
@@ -85,6 +85,9 @@
 #define OMAPFB_MEMTYPE_SRAM		1
 #define OMAPFB_MEMTYPE_MAX		1
 
+#define OMAPFB_MEM_IDX_ENABLED	0x80
+#define OMAPFB_MEM_IDX_MASK	0x7f
+
 enum omapfb_color_format {
 	OMAPFB_COLOR_RGB565 = 0,
 	OMAPFB_COLOR_YUV422,
@@ -136,7 +139,7 @@ struct omapfb_plane_info {
 	__u8  enabled;
 	__u8  channel_out;
 	__u8  mirror;
-	__u8  reserved1;
+	__u8  mem_idx;
 	__u32 out_width;
 	__u32 out_height;
 	__u32 reserved2[12];
-- 
cgit v1.2.3-59-g8ed1b


From 8cadd2831bf3abc94f4530e7fdbab7bb39b6b27d Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Mon, 10 May 2010 14:26:20 -0700
Subject: timer: add on-stack deferrable timer interfaces

In some cases (for instance with kernel threads) it may be desireable to
use on-stack deferrable timers to get their power saving benefits.  Add
interfaces to support this for the IPS driver.

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 include/linux/timer.h | 15 +++++++++++++++
 kernel/timer.c        | 13 +++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index ea965b857a50..38cf093ef62c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -100,6 +100,13 @@ void init_timer_deferrable_key(struct timer_list *timer,
 		setup_timer_on_stack_key((timer), #timer, &__key,	\
 					 (fn), (data));			\
 	} while (0)
+#define setup_deferrable_timer_on_stack(timer, fn, data)		\
+	do {								\
+		static struct lock_class_key __key;			\
+		setup_deferrable_timer_on_stack_key((timer), #timer,	\
+						    &__key, (fn),	\
+						    (data));		\
+	} while (0)
 #else
 #define init_timer(timer)\
 	init_timer_key((timer), NULL, NULL)
@@ -111,6 +118,8 @@ void init_timer_deferrable_key(struct timer_list *timer,
 	setup_timer_key((timer), NULL, NULL, (fn), (data))
 #define setup_timer_on_stack(timer, fn, data)\
 	setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data))
+#define setup_deferrable_timer_on_stack(timer, fn, data)\
+	setup_deferrable_timer_on_stack_key((timer), NULL, NULL, (fn), (data))
 #endif
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
@@ -150,6 +159,12 @@ static inline void setup_timer_on_stack_key(struct timer_list *timer,
 	init_timer_on_stack_key(timer, name, key);
 }
 
+extern void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
+						const char *name,
+						struct lock_class_key *key,
+						void (*function)(unsigned long),
+						unsigned long data);
+
 /**
  * timer_pending - is a timer pending?
  * @timer: the timer in question
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..efde11e197c4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -577,6 +577,19 @@ static void __init_timer(struct timer_list *timer,
 	lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
 
+void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
+					 const char *name,
+					 struct lock_class_key *key,
+					 void (*function)(unsigned long),
+					 unsigned long data)
+{
+	timer->function = function;
+	timer->data = data;
+	init_timer_on_stack_key(timer, name, key);
+	timer_set_deferrable(timer);
+}
+EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
+
 /**
  * init_timer_key - initialize a timer
  * @timer: the timer to be initialized
-- 
cgit v1.2.3-59-g8ed1b


From c715a38bb7fc22fb8018b916c8a9f7ff017a8ad7 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Fri, 18 Jun 2010 14:05:52 +0100
Subject: rar: Move the RAR driver into the right place as its now clean

We exit staging rar! rar! rar!...

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/platform/x86/Kconfig                |  22 +
 drivers/platform/x86/Makefile               |   1 +
 drivers/platform/x86/intel_rar_register.c   | 671 +++++++++++++++++++++++++++
 drivers/staging/Kconfig                     |   2 -
 drivers/staging/Makefile                    |   1 -
 drivers/staging/memrar/memrar_handler.c     |   3 +-
 drivers/staging/rar_register/Kconfig        |  30 --
 drivers/staging/rar_register/Makefile       |   2 -
 drivers/staging/rar_register/rar_register.c | 675 ----------------------------
 drivers/staging/rar_register/rar_register.h |  44 --
 include/linux/rar_register.h                |  44 ++
 11 files changed, 739 insertions(+), 756 deletions(-)
 create mode 100644 drivers/platform/x86/intel_rar_register.c
 delete mode 100644 drivers/staging/rar_register/Kconfig
 delete mode 100644 drivers/staging/rar_register/Makefile
 delete mode 100644 drivers/staging/rar_register/rar_register.c
 delete mode 100644 drivers/staging/rar_register/rar_register.h
 create mode 100644 include/linux/rar_register.h

(limited to 'include/linux')

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 724b2ed1a3cb..2f173bc0ff05 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -539,6 +539,28 @@ config INTEL_SCU_IPC
 	  some embedded Intel x86 platforms. This is not needed for PC-type
 	  machines.
 
+config RAR_REGISTER
+	bool "Restricted Access Region Register Driver"
+	depends on PCI && X86_MRST
+	default n
+	---help---
+	  This driver allows other kernel drivers access to the
+	  contents of the restricted access region control registers.
+
+	  The restricted access region control registers
+	  (rar_registers) are used to pass address and
+	  locking information on restricted access regions
+	  to other drivers that use restricted access regions.
+
+	  The restricted access regions are regions of memory
+	  on the Intel MID Platform that are not accessible to
+	  the x86 processor, but are accessible to dedicated
+	  processors on board peripheral devices.
+
+	  The purpose of the restricted access regions is to
+	  protect sensitive data from compromise by unauthorized
+	  programs running on the x86 processor.
+
 config INTEL_IPS
 	tristate "Intel Intelligent Power Sharing"
 	depends on ACPI
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 7318fc2c1629..ed50eca1b556 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -26,4 +26,5 @@ obj-$(CONFIG_TOPSTAR_LAPTOP)	+= topstar-laptop.o
 obj-$(CONFIG_ACPI_TOSHIBA)	+= toshiba_acpi.o
 obj-$(CONFIG_TOSHIBA_BT_RFKILL)	+= toshiba_bluetooth.o
 obj-$(CONFIG_INTEL_SCU_IPC)	+= intel_scu_ipc.o
+obj-$(CONFIG_RAR_REGISTER)	+= intel_rar_register.o
 obj-$(CONFIG_INTEL_IPS)		+= intel_ips.o
diff --git a/drivers/platform/x86/intel_rar_register.c b/drivers/platform/x86/intel_rar_register.c
new file mode 100644
index 000000000000..73f8e6d72669
--- /dev/null
+++ b/drivers/platform/x86/intel_rar_register.c
@@ -0,0 +1,671 @@
+/*
+ *  rar_register.c - An Intel Restricted Access Region register driver
+ *
+ *  Copyright(c) 2009 Intel Corporation. All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ *  02111-1307, USA.
+ *
+ * -------------------------------------------------------------------
+ *  20091204 Mark Allyn <mark.a.allyn@intel.com>
+ *	     Ossama Othman <ossama.othman@intel.com>
+ *	Cleanup per feedback from Alan Cox and Arjan Van De Ven
+ *
+ *  20090806 Ossama Othman <ossama.othman@intel.com>
+ *      Return zero high address if upper 22 bits is zero.
+ *      Cleaned up checkpatch errors.
+ *      Clarified that driver is dealing with bus addresses.
+ *
+ *  20090702 Ossama Othman <ossama.othman@intel.com>
+ *      Removed unnecessary include directives
+ *      Cleaned up spinlocks.
+ *      Cleaned up logging.
+ *      Improved invalid parameter checks.
+ *      Fixed and simplified RAR address retrieval and RAR locking
+ *      code.
+ *
+ *  20090626 Mark Allyn <mark.a.allyn@intel.com>
+ *      Initial publish
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/rar_register.h>
+
+/* === Lincroft Message Bus Interface === */
+#define LNC_MCR_OFFSET		0xD0	/* Message Control Register */
+#define LNC_MDR_OFFSET		0xD4	/* Message Data Register */
+
+/* Message Opcodes */
+#define LNC_MESSAGE_READ_OPCODE	0xD0
+#define LNC_MESSAGE_WRITE_OPCODE 0xE0
+
+/* Message Write Byte Enables */
+#define LNC_MESSAGE_BYTE_WRITE_ENABLES	0xF
+
+/* B-unit Port */
+#define LNC_BUNIT_PORT	0x3
+
+/* === Lincroft B-Unit Registers - Programmed by IA32 firmware === */
+#define LNC_BRAR0L	0x10
+#define LNC_BRAR0H	0x11
+#define LNC_BRAR1L	0x12
+#define LNC_BRAR1H	0x13
+/* Reserved for SeP */
+#define LNC_BRAR2L	0x14
+#define LNC_BRAR2H	0x15
+
+/* Moorestown supports three restricted access regions. */
+#define MRST_NUM_RAR 3
+
+/* RAR Bus Address Range */
+struct rar_addr {
+	dma_addr_t low;
+	dma_addr_t high;
+};
+
+/*
+ *	We create one of these for each RAR
+ */
+struct client {
+	int (*callback)(unsigned long data);
+	unsigned long driver_priv;
+	bool busy;
+};
+
+static DEFINE_MUTEX(rar_mutex);
+static DEFINE_MUTEX(lnc_reg_mutex);
+
+/*
+ *	One per RAR device (currently only one device)
+ */
+struct rar_device {
+	struct rar_addr rar_addr[MRST_NUM_RAR];
+	struct pci_dev *rar_dev;
+	bool registered;
+	bool allocated;
+	struct client client[MRST_NUM_RAR];
+};
+
+/* Current platforms have only one rar_device for 3 rar regions */
+static struct rar_device my_rar_device;
+
+/*
+ *	Abstract out multiple device support. Current platforms only
+ *	have a single RAR device.
+ */
+
+/**
+ *	alloc_rar_device	-	return a new RAR structure
+ *
+ *	Return a new (but not yet ready) RAR device object
+ */
+static struct rar_device *alloc_rar_device(void)
+{
+	if (my_rar_device.allocated)
+		return NULL;
+	my_rar_device.allocated = 1;
+	return &my_rar_device;
+}
+
+/**
+ *	free_rar_device		-	free a RAR object
+ *	@rar: the RAR device being freed
+ *
+ *	Release a RAR object and any attached resources
+ */
+static void free_rar_device(struct rar_device *rar)
+{
+	pci_dev_put(rar->rar_dev);
+	rar->allocated = 0;
+}
+
+/**
+ *	_rar_to_device		-	return the device handling this RAR
+ *	@rar: RAR number
+ *	@off: returned offset
+ *
+ *	Internal helper for looking up RAR devices. This and alloc are the
+ *	two functions that need touching to go to multiple RAR devices.
+ */
+static struct rar_device *_rar_to_device(int rar, int *off)
+{
+	if (rar >= 0 && rar <= 3) {
+		*off = rar;
+		return &my_rar_device;
+	}
+	return NULL;
+}
+
+/**
+ *	rar_to_device		-	return the device handling this RAR
+ *	@rar: RAR number
+ *	@off: returned offset
+ *
+ *	Return the device this RAR maps to if one is present, otherwise
+ *	returns NULL. Reports the offset relative to the base of this
+ *	RAR device in off.
+ */
+static struct rar_device *rar_to_device(int rar, int *off)
+{
+	struct rar_device *rar_dev = _rar_to_device(rar, off);
+	if (rar_dev == NULL || !rar_dev->registered)
+		return NULL;
+	return rar_dev;
+}
+
+/**
+ *	rar_to_client		-	return the client handling this RAR
+ *	@rar: RAR number
+ *
+ *	Return the client this RAR maps to if a mapping is known, otherwise
+ *	returns NULL.
+ */
+static struct client *rar_to_client(int rar)
+{
+	int idx;
+	struct rar_device *r = _rar_to_device(rar, &idx);
+	if (r != NULL)
+		return &r->client[idx];
+	return NULL;
+}
+
+/**
+ *	rar_read_addr		-	retrieve a RAR mapping
+ *	@pdev: PCI device for the RAR
+ *	@offset: offset for message
+ *	@addr: returned address
+ *
+ *	Reads the address of a given RAR register. Returns 0 on success
+ *	or an error code on failure.
+ */
+static int rar_read_addr(struct pci_dev *pdev, int offset, dma_addr_t *addr)
+{
+	/*
+	 * ======== The Lincroft Message Bus Interface ========
+	 * Lincroft registers may be obtained via PCI from
+	 * the host bridge using the Lincroft Message Bus
+	 * Interface.  That message bus interface is generally
+	 * comprised of two registers: a control register (MCR, 0xDO)
+	 * and a data register (MDR, 0xD4).
+	 *
+	 * The MCR (message control register) format is the following:
+	 *   1.  [31:24]: Opcode
+	 *   2.  [23:16]: Port
+	 *   3.  [15:8]: Register Offset
+	 *   4.  [7:4]: Byte Enables (use 0xF to set all of these bits
+	 *              to 1)
+	 *   5.  [3:0]: reserved
+	 *
+	 *  Read (0xD0) and write (0xE0) opcodes are written to the
+	 *  control register when reading and writing to Lincroft
+	 *  registers, respectively.
+	 *
+	 *  We're interested in registers found in the Lincroft
+	 *  B-unit.  The B-unit port is 0x3.
+	 *
+	 *  The six B-unit RAR register offsets we use are listed
+	 *  earlier in this file.
+	 *
+	 *  Lastly writing to the MCR register requires the "Byte
+	 *  enables" bits to be set to 1.  This may be achieved by
+	 *  writing 0xF at bit 4.
+	 *
+	 * The MDR (message data register) format is the following:
+	 *   1. [31:0]: Read/Write Data
+	 *
+	 *  Data being read from this register is only available after
+	 *  writing the appropriate control message to the MCR
+	 *  register.
+	 *
+	 *  Data being written to this register must be written before
+	 *  writing the appropriate control message to the MCR
+	 *  register.
+	*/
+
+	int result;
+	u32 addr32;
+
+	/* Construct control message */
+	u32 const message =
+		 (LNC_MESSAGE_READ_OPCODE << 24)
+		 | (LNC_BUNIT_PORT << 16)
+		 | (offset << 8)
+		 | (LNC_MESSAGE_BYTE_WRITE_ENABLES << 4);
+
+	dev_dbg(&pdev->dev, "Offset for 'get' LNC MSG is %x\n", offset);
+
+	/*
+	* We synchronize access to the Lincroft MCR and MDR registers
+	* until BOTH the command is issued through the MCR register
+	* and the corresponding data is read from the MDR register.
+	* Otherwise a race condition would exist between accesses to
+	* both registers.
+	*/
+
+	mutex_lock(&lnc_reg_mutex);
+
+	/* Send the control message */
+	result = pci_write_config_dword(pdev, LNC_MCR_OFFSET, message);
+	if (!result) {
+		/* Read back the address as a 32bit value */
+		result = pci_read_config_dword(pdev, LNC_MDR_OFFSET, &addr32);
+		*addr = (dma_addr_t)addr32;
+	}
+	mutex_unlock(&lnc_reg_mutex);
+	return result;
+}
+
+/**
+ *	rar_set_addr		-	Set a RAR mapping
+ *	@pdev: PCI device for the RAR
+ *	@offset: offset for message
+ *	@addr: address to set
+ *
+ *	Sets the address of a given RAR register. Returns 0 on success
+ *	or an error code on failure.
+ */
+static int rar_set_addr(struct pci_dev *pdev,
+	int offset,
+	dma_addr_t addr)
+{
+	/*
+	* Data being written to this register must be written before
+	* writing the appropriate control message to the MCR
+	* register.
+	* See rar_get_addrs() for a description of the
+	* message bus interface being used here.
+	*/
+
+	int result;
+
+	/* Construct control message */
+	u32 const message = (LNC_MESSAGE_WRITE_OPCODE << 24)
+		| (LNC_BUNIT_PORT << 16)
+		| (offset << 8)
+		| (LNC_MESSAGE_BYTE_WRITE_ENABLES << 4);
+
+	/*
+	* We synchronize access to the Lincroft MCR and MDR registers
+	* until BOTH the command is issued through the MCR register
+	* and the corresponding data is read from the MDR register.
+	* Otherwise a race condition would exist between accesses to
+	* both registers.
+	*/
+
+	mutex_lock(&lnc_reg_mutex);
+
+	/* Send the control message */
+	result = pci_write_config_dword(pdev, LNC_MDR_OFFSET, addr);
+	if (!result)
+		/* And address */
+		result = pci_write_config_dword(pdev, LNC_MCR_OFFSET, message);
+
+	mutex_unlock(&lnc_reg_mutex);
+	return result;
+}
+
+/*
+ *	rar_init_params		-	Initialize RAR parameters
+ *	@rar: RAR device to initialise
+ *
+ *	Initialize RAR parameters, such as bus addresses, etc. Returns 0
+ *	on success, or an error code on failure.
+ */
+static int init_rar_params(struct rar_device *rar)
+{
+	struct pci_dev *pdev = rar->rar_dev;
+	unsigned int i;
+	int result = 0;
+	int offset = 0x10;	/* RAR 0 to 2 in order low/high/low/high/... */
+
+	/* Retrieve RAR start and end bus addresses.
+	* Access the RAR registers through the Lincroft Message Bus
+	* Interface on PCI device: 00:00.0 Host bridge.
+	*/
+
+	for (i = 0; i < MRST_NUM_RAR; ++i) {
+		struct rar_addr *addr = &rar->rar_addr[i];
+
+		result = rar_read_addr(pdev, offset++, &addr->low);
+		if (result != 0)
+			return result;
+
+		result = rar_read_addr(pdev, offset++, &addr->high);
+		if (result != 0)
+			return result;
+
+
+		/*
+		* Only the upper 22 bits of the RAR addresses are
+		* stored in their corresponding RAR registers so we
+		* must set the lower 10 bits accordingly.
+
+		* The low address has its lower 10 bits cleared, and
+		* the high address has all its lower 10 bits set,
+		* e.g.:
+		* low = 0x2ffffc00
+		*/
+
+		addr->low &= (dma_addr_t)0xfffffc00u;
+
+		/*
+		* Set bits 9:0 on uppser address if bits 31:10 are non
+		* zero; otherwize clear all bits
+		*/
+
+		if ((addr->high & 0xfffffc00u) == 0)
+			addr->high = 0;
+		else
+			addr->high |= 0x3ffu;
+	}
+	/* Done accessing the device. */
+
+	if (result == 0) {
+		for (i = 0; i != MRST_NUM_RAR; ++i) {
+			/*
+			* "BRAR" refers to the RAR registers in the
+			* Lincroft B-unit.
+			*/
+			dev_info(&pdev->dev, "BRAR[%u] bus address range = "
+			  "[%lx, %lx]\n", i,
+			  (unsigned long)rar->rar_addr[i].low,
+			  (unsigned long)rar->rar_addr[i].high);
+		}
+	}
+	return result;
+}
+
+/**
+ *	rar_get_address		-	get the bus address in a RAR
+ *	@start: return value of start address of block
+ *	@end: return value of end address of block
+ *
+ *	The rar_get_address function is used by other device drivers
+ *	to obtain RAR address information on a RAR. It takes three
+ *	parameters:
+ *
+ *	The function returns a 0 upon success or an error if there is no RAR
+ *	facility on this system.
+ */
+int rar_get_address(int rar_index, dma_addr_t *start, dma_addr_t *end)
+{
+	int idx;
+	struct rar_device *rar = rar_to_device(rar_index, &idx);
+
+	if (rar == NULL) {
+		WARN_ON(1);
+		return -ENODEV;
+	}
+
+	*start = rar->rar_addr[idx].low;
+	*end = rar->rar_addr[idx].high;
+	return 0;
+}
+EXPORT_SYMBOL(rar_get_address);
+
+/**
+ *	rar_lock	-	lock a RAR register
+ *	@rar_index: RAR to lock (0-2)
+ *
+ *	The rar_lock function is ued by other device drivers to lock an RAR.
+ *	once a RAR is locked, it stays locked until the next system reboot.
+ *
+ *	The function returns a 0 upon success or an error if there is no RAR
+ *	facility on this system, or the locking fails
+ */
+int rar_lock(int rar_index)
+{
+	struct rar_device *rar;
+	int result;
+	int idx;
+	dma_addr_t low, high;
+
+	rar = rar_to_device(rar_index, &idx);
+
+	if (rar == NULL) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	low = rar->rar_addr[idx].low & 0xfffffc00u;
+	high = rar->rar_addr[idx].high & 0xfffffc00u;
+
+	/*
+	* Only allow I/O from the graphics and Langwell;
+	* not from the x86 processor
+	*/
+
+	if (rar_index == RAR_TYPE_VIDEO) {
+		low |= 0x00000009;
+		high |= 0x00000015;
+	} else if (rar_index == RAR_TYPE_AUDIO) {
+		/* Only allow I/O from Langwell; nothing from x86 */
+		low |= 0x00000008;
+		high |= 0x00000018;
+	} else
+		/* Read-only from all agents */
+		high |= 0x00000018;
+
+	/*
+	* Now program the register using the Lincroft message
+	* bus interface.
+	*/
+	result = rar_set_addr(rar->rar_dev,
+				2 * idx, low);
+
+	if (result == 0)
+		result = rar_set_addr(rar->rar_dev,
+				2 * idx + 1, high);
+
+	return result;
+}
+EXPORT_SYMBOL(rar_lock);
+
+/**
+ *	register_rar		-	register a RAR handler
+ *	@num: RAR we wish to register for
+ *	@callback: function to call when RAR support is available
+ *	@data: data to pass to this function
+ *
+ *	The register_rar function is to used by other device drivers
+ *	to ensure that this driver is ready. As we cannot be sure of
+ *	the compile/execute order of drivers in ther kernel, it is
+ *	best to give this driver a callback function to call when
+ *	it is ready to give out addresses. The callback function
+ *	would have those steps that continue the initialization of
+ *	a driver that do require a valid RAR address. One of those
+ *	steps would be to call rar_get_address()
+ *
+ *	This function return 0 on success or an error code on failure.
+ */
+int register_rar(int num, int (*callback)(unsigned long data),
+							unsigned long data)
+{
+	/* For now we hardcode a single RAR device */
+	struct rar_device *rar;
+	struct client *c;
+	int idx;
+	int retval = 0;
+
+	mutex_lock(&rar_mutex);
+
+	/* Do we have a client mapping for this RAR number ? */
+	c = rar_to_client(num);
+	if (c == NULL) {
+		retval = -ERANGE;
+		goto done;
+	}
+	/* Is it claimed ? */
+	if (c->busy) {
+		retval = -EBUSY;
+		goto done;
+	}
+	c->busy = 1;
+
+	/* See if we have a handler for this RAR yet, if we do then fire it */
+	rar = rar_to_device(num, &idx);
+
+	if (rar) {
+		/*
+		* if the driver already registered, then we can simply
+		* call the callback right now
+		*/
+		(*callback)(data);
+		goto done;
+	}
+
+	/* Arrange to be called back when the hardware is found */
+	c->callback = callback;
+	c->driver_priv = data;
+done:
+	mutex_unlock(&rar_mutex);
+	return retval;
+}
+EXPORT_SYMBOL(register_rar);
+
+/**
+ *	unregister_rar	-	release a RAR allocation
+ *	@num: RAR number
+ *
+ *	Releases a RAR allocation, or pending allocation. If a callback is
+ *	pending then this function will either complete before the unregister
+ *	returns or not at all.
+ */
+
+void unregister_rar(int num)
+{
+	struct client *c;
+
+	mutex_lock(&rar_mutex);
+	c = rar_to_client(num);
+	if (c == NULL || !c->busy)
+		WARN_ON(1);
+	else
+		c->busy = 0;
+	mutex_unlock(&rar_mutex);
+}
+EXPORT_SYMBOL(unregister_rar);
+
+/**
+ *	rar_callback		-	Process callbacks
+ *	@rar: new RAR device
+ *
+ *	Process the callbacks for a newly found RAR device.
+ */
+
+static void rar_callback(struct rar_device *rar)
+{
+	struct client *c = &rar->client[0];
+	int i;
+
+	mutex_lock(&rar_mutex);
+
+	rar->registered = 1;	/* Ensure no more callbacks queue */
+
+	for (i = 0; i < MRST_NUM_RAR; i++) {
+		if (c->callback && c->busy) {
+			c->callback(c->driver_priv);
+			c->callback = NULL;
+		}
+		c++;
+	}
+	mutex_unlock(&rar_mutex);
+}
+
+/**
+ *	rar_probe		-	PCI probe callback
+ *	@dev: PCI device
+ *	@id: matching entry in the match table
+ *
+ *	A RAR device has been discovered. Initialise it and if successful
+ *	process any pending callbacks that can now be completed.
+ */
+static int rar_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	int error;
+	struct rar_device *rar;
+
+	dev_dbg(&dev->dev, "PCI probe starting\n");
+
+	rar = alloc_rar_device();
+	if (rar == NULL)
+		return -EBUSY;
+
+	/* Enable the device */
+	error = pci_enable_device(dev);
+	if (error) {
+		dev_err(&dev->dev,
+			"Error enabling RAR register PCI device\n");
+		goto end_function;
+	}
+
+	/* Fill in the rar_device structure */
+	rar->rar_dev = pci_dev_get(dev);
+	pci_set_drvdata(dev, rar);
+
+	/*
+	 * Initialize the RAR parameters, which have to be retrieved
+	 * via the message bus interface.
+	 */
+	error = init_rar_params(rar);
+	if (error) {
+		pci_disable_device(dev);
+		dev_err(&dev->dev, "Error retrieving RAR addresses\n");
+		goto end_function;
+	}
+	/* now call anyone who has registered (using callbacks) */
+	rar_callback(rar);
+	return 0;
+end_function:
+	free_rar_device(rar);
+	return error;
+}
+
+const struct pci_device_id rar_pci_id_tbl[] = {
+	{ PCI_VDEVICE(INTEL, 0x4110) },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, rar_pci_id_tbl);
+
+const struct pci_device_id *my_id_table = rar_pci_id_tbl;
+
+/* field for registering driver to PCI device */
+static struct pci_driver rar_pci_driver = {
+	.name = "rar_register_driver",
+	.id_table = rar_pci_id_tbl,
+	.probe = rar_probe,
+	/* Cannot be unplugged - no remove */
+};
+
+static int __init rar_init_handler(void)
+{
+	return pci_register_driver(&rar_pci_driver);
+}
+
+static void __exit rar_exit_handler(void)
+{
+	pci_unregister_driver(&rar_pci_driver);
+}
+
+module_init(rar_init_handler);
+module_exit(rar_exit_handler);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Intel Restricted Access Region Register Driver");
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 984a75440710..9dfef8a59974 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -109,8 +109,6 @@ source "drivers/staging/hv/Kconfig"
 
 source "drivers/staging/vme/Kconfig"
 
-source "drivers/staging/rar_register/Kconfig"
-
 source "drivers/staging/memrar/Kconfig"
 
 source "drivers/staging/sep/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 9fa25133874a..3dbf681ca64e 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -35,7 +35,6 @@ obj-$(CONFIG_VT6656)		+= vt6656/
 obj-$(CONFIG_FB_UDL)		+= udlfb/
 obj-$(CONFIG_HYPERV)		+= hv/
 obj-$(CONFIG_VME_BUS)		+= vme/
-obj-$(CONFIG_RAR_REGISTER)	+= rar_register/
 obj-$(CONFIG_MRST_RAR_HANDLER)	+= memrar/
 obj-$(CONFIG_DX_SEP)		+= sep/
 obj-$(CONFIG_IIO)		+= iio/
diff --git a/drivers/staging/memrar/memrar_handler.c b/drivers/staging/memrar/memrar_handler.c
index efa7fd62d390..41876f2b0e54 100644
--- a/drivers/staging/memrar/memrar_handler.c
+++ b/drivers/staging/memrar/memrar_handler.c
@@ -47,8 +47,7 @@
 #include <linux/mm.h>
 #include <linux/ioport.h>
 #include <linux/io.h>
-
-#include "../rar_register/rar_register.h"
+#include <linux/rar_register.h>
 
 #include "memrar.h"
 #include "memrar_allocator.h"
diff --git a/drivers/staging/rar_register/Kconfig b/drivers/staging/rar_register/Kconfig
deleted file mode 100644
index e9c27738199b..000000000000
--- a/drivers/staging/rar_register/Kconfig
+++ /dev/null
@@ -1,30 +0,0 @@
-#
-# RAR device configuration
-#
-
-menu "RAR Register Driver"
-#
-#	Restricted Access Register Manager
-#
-config RAR_REGISTER
-	tristate "Restricted Access Region Register Driver"
-	depends on PCI
-	default n
-	---help---
-	  This driver allows other kernel drivers access to the
-	  contents of the restricted access region control registers.
-
-	  The restricted access region control registers
-	  (rar_registers) are used to pass address and
-	  locking information on restricted access regions
-	  to other drivers that use restricted access regions.
-
-	  The restricted access regions are regions of memory
-	  on the Intel MID Platform that are not accessible to
-	  the x86 processor, but are accessible to dedicated
-	  processors on board peripheral devices.
-
-	  The purpose of the restricted access regions is to
-	  protect sensitive data from compromise by unauthorized
-	  programs running on the x86 processor.
-endmenu
diff --git a/drivers/staging/rar_register/Makefile b/drivers/staging/rar_register/Makefile
deleted file mode 100644
index d5954ccc16c9..000000000000
--- a/drivers/staging/rar_register/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-EXTRA_CFLAGS += -DLITTLE__ENDIAN
-obj-$(CONFIG_RAR_REGISTER) += rar_register.o
diff --git a/drivers/staging/rar_register/rar_register.c b/drivers/staging/rar_register/rar_register.c
deleted file mode 100644
index 618503f422ef..000000000000
--- a/drivers/staging/rar_register/rar_register.c
+++ /dev/null
@@ -1,675 +0,0 @@
-/*
- *  rar_register.c - An Intel Restricted Access Region register driver
- *
- *  Copyright(c) 2009 Intel Corporation. All rights reserved.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License as
- *  published by the Free Software Foundation; either version 2 of the
- *  License, or (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- *  02111-1307, USA.
- *
- * -------------------------------------------------------------------
- *  20091204 Mark Allyn <mark.a.allyn@intel.com>
- *	     Ossama Othman <ossama.othman@intel.com>
- *	Cleanup per feedback from Alan Cox and Arjan Van De Ven
- *
- *  20090806 Ossama Othman <ossama.othman@intel.com>
- *      Return zero high address if upper 22 bits is zero.
- *      Cleaned up checkpatch errors.
- *      Clarified that driver is dealing with bus addresses.
- *
- *  20090702 Ossama Othman <ossama.othman@intel.com>
- *      Removed unnecessary include directives
- *      Cleaned up spinlocks.
- *      Cleaned up logging.
- *      Improved invalid parameter checks.
- *      Fixed and simplified RAR address retrieval and RAR locking
- *      code.
- *
- *  20090626 Mark Allyn <mark.a.allyn@intel.com>
- *      Initial publish
- */
-
-#define DEBUG 1
-
-#include "rar_register.h"
-
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/spinlock.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
-
-/* === Lincroft Message Bus Interface === */
-#define LNC_MCR_OFFSET		0xD0	/* Message Control Register */
-#define LNC_MDR_OFFSET		0xD4	/* Message Data Register */
-
-/* Message Opcodes */
-#define LNC_MESSAGE_READ_OPCODE	0xD0
-#define LNC_MESSAGE_WRITE_OPCODE 0xE0
-
-/* Message Write Byte Enables */
-#define LNC_MESSAGE_BYTE_WRITE_ENABLES	0xF
-
-/* B-unit Port */
-#define LNC_BUNIT_PORT	0x3
-
-/* === Lincroft B-Unit Registers - Programmed by IA32 firmware === */
-#define LNC_BRAR0L	0x10
-#define LNC_BRAR0H	0x11
-#define LNC_BRAR1L	0x12
-#define LNC_BRAR1H	0x13
-/* Reserved for SeP */
-#define LNC_BRAR2L	0x14
-#define LNC_BRAR2H	0x15
-
-/* Moorestown supports three restricted access regions. */
-#define MRST_NUM_RAR 3
-
-/* RAR Bus Address Range */
-struct rar_addr {
-	dma_addr_t low;
-	dma_addr_t high;
-};
-
-/*
- *	We create one of these for each RAR
- */
-struct client {
-	int (*callback)(unsigned long data);
-	unsigned long driver_priv;
-	bool busy;
-};
-
-static DEFINE_MUTEX(rar_mutex);
-static DEFINE_MUTEX(lnc_reg_mutex);
-
-/*
- *	One per RAR device (currently only one device)
- */
-struct rar_device {
-	struct rar_addr rar_addr[MRST_NUM_RAR];
-	struct pci_dev *rar_dev;
-	bool registered;
-	bool allocated;
-	struct client client[MRST_NUM_RAR];
-};
-
-/* Current platforms have only one rar_device for 3 rar regions */
-static struct rar_device my_rar_device;
-
-/*
- *	Abstract out multiple device support. Current platforms only
- *	have a single RAR device.
- */
-
-/**
- *	alloc_rar_device	-	return a new RAR structure
- *
- *	Return a new (but not yet ready) RAR device object
- */
-static struct rar_device *alloc_rar_device(void)
-{
-	if (my_rar_device.allocated)
-		return NULL;
-	my_rar_device.allocated = 1;
-	return &my_rar_device;
-}
-
-/**
- *	free_rar_device		-	free a RAR object
- *	@rar: the RAR device being freed
- *
- *	Release a RAR object and any attached resources
- */
-static void free_rar_device(struct rar_device *rar)
-{
-	pci_dev_put(rar->rar_dev);
-	rar->allocated = 0;
-}
-
-/**
- *	_rar_to_device		-	return the device handling this RAR
- *	@rar: RAR number
- *	@off: returned offset
- *
- *	Internal helper for looking up RAR devices. This and alloc are the
- *	two functions that need touching to go to multiple RAR devices.
- */
-static struct rar_device *_rar_to_device(int rar, int *off)
-{
-	if (rar >= 0 && rar <= 3) {
-		*off = rar;
-		return &my_rar_device;
-	}
-	return NULL;
-}
-
-
-/**
- *	rar_to_device		-	return the device handling this RAR
- *	@rar: RAR number
- *	@off: returned offset
- *
- *	Return the device this RAR maps to if one is present, otherwise
- *	returns NULL. Reports the offset relative to the base of this
- *	RAR device in off.
- */
-static struct rar_device *rar_to_device(int rar, int *off)
-{
-	struct rar_device *rar_dev = _rar_to_device(rar, off);
-	if (rar_dev == NULL || !rar_dev->registered)
-		return NULL;
-	return rar_dev;
-}
-
-/**
- *	rar_to_client		-	return the client handling this RAR
- *	@rar: RAR number
- *
- *	Return the client this RAR maps to if a mapping is known, otherwise
- *	returns NULL.
- */
-static struct client *rar_to_client(int rar)
-{
-	int idx;
-	struct rar_device *r = _rar_to_device(rar, &idx);
-	if (r != NULL)
-		return &r->client[idx];
-	return NULL;
-}
-
-/**
- *	rar_read_addr		-	retrieve a RAR mapping
- *	@pdev: PCI device for the RAR
- *	@offset: offset for message
- *	@addr: returned address
- *
- *	Reads the address of a given RAR register. Returns 0 on success
- *	or an error code on failure.
- */
-static int rar_read_addr(struct pci_dev *pdev, int offset, dma_addr_t *addr)
-{
-	/*
-	 * ======== The Lincroft Message Bus Interface ========
-	 * Lincroft registers may be obtained via PCI from
-	 * the host bridge using the Lincroft Message Bus
-	 * Interface.  That message bus interface is generally
-	 * comprised of two registers: a control register (MCR, 0xDO)
-	 * and a data register (MDR, 0xD4).
-	 *
-	 * The MCR (message control register) format is the following:
-	 *   1.  [31:24]: Opcode
-	 *   2.  [23:16]: Port
-	 *   3.  [15:8]: Register Offset
-	 *   4.  [7:4]: Byte Enables (use 0xF to set all of these bits
-	 *              to 1)
-	 *   5.  [3:0]: reserved
-	 *
-	 *  Read (0xD0) and write (0xE0) opcodes are written to the
-	 *  control register when reading and writing to Lincroft
-	 *  registers, respectively.
-	 *
-	 *  We're interested in registers found in the Lincroft
-	 *  B-unit.  The B-unit port is 0x3.
-	 *
-	 *  The six B-unit RAR register offsets we use are listed
-	 *  earlier in this file.
-	 *
-	 *  Lastly writing to the MCR register requires the "Byte
-	 *  enables" bits to be set to 1.  This may be achieved by
-	 *  writing 0xF at bit 4.
-	 *
-	 * The MDR (message data register) format is the following:
-	 *   1. [31:0]: Read/Write Data
-	 *
-	 *  Data being read from this register is only available after
-	 *  writing the appropriate control message to the MCR
-	 *  register.
-	 *
-	 *  Data being written to this register must be written before
-	 *  writing the appropriate control message to the MCR
-	 *  register.
-	*/
-
-	int result;
-	u32 addr32;
-
-	/* Construct control message */
-	u32 const message =
-		 (LNC_MESSAGE_READ_OPCODE << 24)
-		 | (LNC_BUNIT_PORT << 16)
-		 | (offset << 8)
-		 | (LNC_MESSAGE_BYTE_WRITE_ENABLES << 4);
-
-	dev_dbg(&pdev->dev, "Offset for 'get' LNC MSG is %x\n", offset);
-
-	/*
-	* We synchronize access to the Lincroft MCR and MDR registers
-	* until BOTH the command is issued through the MCR register
-	* and the corresponding data is read from the MDR register.
-	* Otherwise a race condition would exist between accesses to
-	* both registers.
-	*/
-
-	mutex_lock(&lnc_reg_mutex);
-
-	/* Send the control message */
-	result = pci_write_config_dword(pdev, LNC_MCR_OFFSET, message);
-	if (!result) {
-		/* Read back the address as a 32bit value */
-		result = pci_read_config_dword(pdev, LNC_MDR_OFFSET, &addr32);
-		*addr = (dma_addr_t)addr32;
-	}
-	mutex_unlock(&lnc_reg_mutex);
-	return result;
-}
-
-/**
- *	rar_set_addr		-	Set a RAR mapping
- *	@pdev: PCI device for the RAR
- *	@offset: offset for message
- *	@addr: address to set
- *
- *	Sets the address of a given RAR register. Returns 0 on success
- *	or an error code on failure.
- */
-static int rar_set_addr(struct pci_dev *pdev,
-	int offset,
-	dma_addr_t addr)
-{
-	/*
-	* Data being written to this register must be written before
-	* writing the appropriate control message to the MCR
-	* register.
-	* See rar_get_addrs() for a description of the
-	* message bus interface being used here.
-	*/
-
-	int result;
-
-	/* Construct control message */
-	u32 const message = (LNC_MESSAGE_WRITE_OPCODE << 24)
-		| (LNC_BUNIT_PORT << 16)
-		| (offset << 8)
-		| (LNC_MESSAGE_BYTE_WRITE_ENABLES << 4);
-
-	/*
-	* We synchronize access to the Lincroft MCR and MDR registers
-	* until BOTH the command is issued through the MCR register
-	* and the corresponding data is read from the MDR register.
-	* Otherwise a race condition would exist between accesses to
-	* both registers.
-	*/
-
-	mutex_lock(&lnc_reg_mutex);
-
-	/* Send the control message */
-	result = pci_write_config_dword(pdev, LNC_MDR_OFFSET, addr);
-	if (!result)
-		/* And address */
-		result = pci_write_config_dword(pdev, LNC_MCR_OFFSET, message);
-
-	mutex_unlock(&lnc_reg_mutex);
-	return result;
-}
-
-/*
- *	rar_init_params		-	Initialize RAR parameters
- *	@rar: RAR device to initialise
- *
- *	Initialize RAR parameters, such as bus addresses, etc. Returns 0
- *	on success, or an error code on failure.
- */
-static int init_rar_params(struct rar_device *rar)
-{
-	struct pci_dev *pdev = rar->rar_dev;
-	unsigned int i;
-	int result = 0;
-	int offset = 0x10;	/* RAR 0 to 2 in order low/high/low/high/... */
-
-	/* Retrieve RAR start and end bus addresses.
-	* Access the RAR registers through the Lincroft Message Bus
-	* Interface on PCI device: 00:00.0 Host bridge.
-	*/
-
-	for (i = 0; i < MRST_NUM_RAR; ++i) {
-		struct rar_addr *addr = &rar->rar_addr[i];
-
-		result = rar_read_addr(pdev, offset++, &addr->low);
-		if (result != 0)
-			return result;
-
-		result = rar_read_addr(pdev, offset++, &addr->high);
-		if (result != 0)
-			return result;
-
-
-		/*
-		* Only the upper 22 bits of the RAR addresses are
-		* stored in their corresponding RAR registers so we
-		* must set the lower 10 bits accordingly.
-
-		* The low address has its lower 10 bits cleared, and
-		* the high address has all its lower 10 bits set,
-		* e.g.:
-		* low = 0x2ffffc00
-		*/
-
-		addr->low &= (dma_addr_t)0xfffffc00u;
-
-		/*
-		* Set bits 9:0 on uppser address if bits 31:10 are non
-		* zero; otherwize clear all bits
-		*/
-
-		if ((addr->high & 0xfffffc00u) == 0)
-			addr->high = 0;
-		else
-			addr->high |= 0x3ffu;
-	}
-	/* Done accessing the device. */
-
-	if (result == 0) {
-		for (i = 0; i != MRST_NUM_RAR; ++i) {
-			/*
-			* "BRAR" refers to the RAR registers in the
-			* Lincroft B-unit.
-			*/
-			dev_info(&pdev->dev, "BRAR[%u] bus address range = "
-			  "[%lx, %lx]\n", i,
-			  (unsigned long)rar->rar_addr[i].low,
-			  (unsigned long)rar->rar_addr[i].high);
-		}
-	}
-	return result;
-}
-
-/**
- *	rar_get_address		-	get the bus address in a RAR
- *	@start: return value of start address of block
- *	@end: return value of end address of block
- *
- *	The rar_get_address function is used by other device drivers
- *	to obtain RAR address information on a RAR. It takes three
- *	parameters:
- *
- *	The function returns a 0 upon success or an error if there is no RAR
- *	facility on this system.
- */
-int rar_get_address(int rar_index, dma_addr_t *start, dma_addr_t *end)
-{
-	int idx;
-	struct rar_device *rar = rar_to_device(rar_index, &idx);
-
-	if (rar == NULL) {
-		WARN_ON(1);
-		return -ENODEV;
-	}
-
-	*start = rar->rar_addr[idx].low;
-	*end = rar->rar_addr[idx].high;
-	return 0;
-}
-EXPORT_SYMBOL(rar_get_address);
-
-/**
- *	rar_lock	-	lock a RAR register
- *	@rar_index: RAR to lock (0-2)
- *
- *	The rar_lock function is ued by other device drivers to lock an RAR.
- *	once a RAR is locked, it stays locked until the next system reboot.
- *
- *	The function returns a 0 upon success or an error if there is no RAR
- *	facility on this system, or the locking fails
- */
-int rar_lock(int rar_index)
-{
-	struct rar_device *rar;
-	int result;
-	int idx;
-	dma_addr_t low, high;
-
-	rar = rar_to_device(rar_index, &idx);
-
-	if (rar == NULL) {
-		WARN_ON(1);
-		return -EINVAL;
-	}
-
-	low = rar->rar_addr[idx].low & 0xfffffc00u;
-	high = rar->rar_addr[idx].high & 0xfffffc00u;
-
-	/*
-	* Only allow I/O from the graphics and Langwell;
-	* not from the x86 processor
-	*/
-
-	if (rar_index == RAR_TYPE_VIDEO) {
-		low |= 0x00000009;
-		high |= 0x00000015;
-	} else if (rar_index == RAR_TYPE_AUDIO) {
-		/* Only allow I/O from Langwell; nothing from x86 */
-		low |= 0x00000008;
-		high |= 0x00000018;
-	} else
-		/* Read-only from all agents */
-		high |= 0x00000018;
-
-	/*
-	* Now program the register using the Lincroft message
-	* bus interface.
-	*/
-	result = rar_set_addr(rar->rar_dev,
-				2 * idx, low);
-
-	if (result == 0)
-		result = rar_set_addr(rar->rar_dev,
-				2 * idx + 1, high);
-
-	return result;
-}
-EXPORT_SYMBOL(rar_lock);
-
-/**
- *	register_rar		-	register a RAR handler
- *	@num: RAR we wish to register for
- *	@callback: function to call when RAR support is available
- *	@data: data to pass to this function
- *
- *	The register_rar function is to used by other device drivers
- *	to ensure that this driver is ready. As we cannot be sure of
- *	the compile/execute order of drivers in ther kernel, it is
- *	best to give this driver a callback function to call when
- *	it is ready to give out addresses. The callback function
- *	would have those steps that continue the initialization of
- *	a driver that do require a valid RAR address. One of those
- *	steps would be to call rar_get_address()
- *
- *	This function return 0 on success an error code on failure.
- */
-int register_rar(int num, int (*callback)(unsigned long data),
-							unsigned long data)
-{
-	/* For now we hardcode a single RAR device */
-	struct rar_device *rar;
-	struct client *c;
-	int idx;
-	int retval = 0;
-
-	mutex_lock(&rar_mutex);
-
-	/* Do we have a client mapping for this RAR number ? */
-	c = rar_to_client(num);
-	if (c == NULL) {
-		retval = -ERANGE;
-		goto done;
-	}
-	/* Is it claimed ? */
-	if (c->busy) {
-		retval = -EBUSY;
-		goto done;
-	}
-	c->busy = 1;
-
-	/* See if we have a handler for this RAR yet, if we do then fire it */
-	rar = rar_to_device(num, &idx);
-
-	if (rar) {
-		/*
-		* if the driver already registered, then we can simply
-		* call the callback right now
-		*/
-		(*callback)(data);
-		goto done;
-	}
-
-	/* Arrange to be called back when the hardware is found */
-	c->callback = callback;
-	c->driver_priv = data;
-done:
-	mutex_unlock(&rar_mutex);
-	return retval;
-}
-EXPORT_SYMBOL(register_rar);
-
-/**
- *	unregister_rar	-	release a RAR allocation
- *	@num: RAR number
- *
- *	Releases a RAR allocation, or pending allocation. If a callback is
- *	pending then this function will either complete before the unregister
- *	returns or not at all.
- */
-
-void unregister_rar(int num)
-{
-	struct client *c;
-
-	mutex_lock(&rar_mutex);
-	c = rar_to_client(num);
-	if (c == NULL || !c->busy)
-		WARN_ON(1);
-	else
-		c->busy = 0;
-	mutex_unlock(&rar_mutex);
-}
-EXPORT_SYMBOL(unregister_rar);
-
-/**
- *	rar_callback		-	Process callbacks
- *	@rar: new RAR device
- *
- *	Process the callbacks for a newly found RAR device.
- */
-
-static void rar_callback(struct rar_device *rar)
-{
-	struct client *c = &rar->client[0];
-	int i;
-
-	mutex_lock(&rar_mutex);
-
-	rar->registered = 1;	/* Ensure no more callbacks queue */
-
-	for (i = 0; i < MRST_NUM_RAR; i++) {
-		if (c->callback && c->busy) {
-			c->callback(c->driver_priv);
-			c->callback = NULL;
-		}
-		c++;
-	}
-	mutex_unlock(&rar_mutex);
-}
-
-/**
- *	rar_probe		-	PCI probe callback
- *	@dev: PCI device
- *	@id: matching entry in the match table
- *
- *	A RAR device has been discovered. Initialise it and if successful
- *	process any pending callbacks that can now be completed.
- */
-static int rar_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	int error;
-	struct rar_device *rar;
-
-	dev_dbg(&dev->dev, "PCI probe starting\n");
-
-	rar = alloc_rar_device();
-	if (rar == NULL)
-		return -EBUSY;
-
-	/* Enable the device */
-	error = pci_enable_device(dev);
-	if (error) {
-		dev_err(&dev->dev,
-			"Error enabling RAR register PCI device\n");
-		goto end_function;
-	}
-
-	/* Fill in the rar_device structure */
-	rar->rar_dev = pci_dev_get(dev);
-	pci_set_drvdata(dev, rar);
-
-	/*
-	 * Initialize the RAR parameters, which have to be retrieved
-	 * via the message bus interface.
-	 */
-	error = init_rar_params(rar);
-	if (error) {
-		pci_disable_device(dev);
-		dev_err(&dev->dev, "Error retrieving RAR addresses\n");
-		goto end_function;
-	}
-	/* now call anyone who has registered (using callbacks) */
-	rar_callback(rar);
-	return 0;
-end_function:
-	free_rar_device(rar);
-	return error;
-}
-
-const struct pci_device_id rar_pci_id_tbl[] = {
-	{ PCI_VDEVICE(INTEL, 0x4110) },
-	{ 0 }
-};
-
-MODULE_DEVICE_TABLE(pci, rar_pci_id_tbl);
-
-const struct pci_device_id *my_id_table = rar_pci_id_tbl;
-
-/* field for registering driver to PCI device */
-static struct pci_driver rar_pci_driver = {
-	.name = "rar_register_driver",
-	.id_table = rar_pci_id_tbl,
-	.probe = rar_probe,
-	/* Cannot be unplugged - no remove */
-};
-
-static int __init rar_init_handler(void)
-{
-	return pci_register_driver(&rar_pci_driver);
-}
-
-static void __exit rar_exit_handler(void)
-{
-	pci_unregister_driver(&rar_pci_driver);
-}
-
-module_init(rar_init_handler);
-module_exit(rar_exit_handler);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Intel Restricted Access Region Register Driver");
diff --git a/drivers/staging/rar_register/rar_register.h b/drivers/staging/rar_register/rar_register.h
deleted file mode 100644
index ffa805780f85..000000000000
--- a/drivers/staging/rar_register/rar_register.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2010 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General
- * Public License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied
- * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- * PURPOSE.  See the GNU General Public License for more details.
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the Free
- * Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA  02111-1307, USA.
- * The full GNU General Public License is included in this
- * distribution in the file called COPYING.
- */
-
-
-#ifndef _RAR_REGISTER_H
-#define _RAR_REGISTER_H
-
-#include <linux/types.h>
-
-/* following are used both in drivers as well as user space apps */
-
-#define	RAR_TYPE_VIDEO	0
-#define	RAR_TYPE_AUDIO	1
-#define	RAR_TYPE_IMAGE	2
-#define	RAR_TYPE_DATA	3
-
-#ifdef __KERNEL__
-
-struct rar_device;
-
-int register_rar(int num,
-		int (*callback)(unsigned long data), unsigned long data);
-void unregister_rar(int num);
-int rar_get_address(int rar_index, dma_addr_t *start, dma_addr_t *end);
-int rar_lock(int rar_index);
-
-#endif  /* __KERNEL__ */
-#endif  /* _RAR_REGISTER_H */
diff --git a/include/linux/rar_register.h b/include/linux/rar_register.h
new file mode 100644
index 000000000000..ffa805780f85
--- /dev/null
+++ b/include/linux/rar_register.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2010 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE.  See the GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA  02111-1307, USA.
+ * The full GNU General Public License is included in this
+ * distribution in the file called COPYING.
+ */
+
+
+#ifndef _RAR_REGISTER_H
+#define _RAR_REGISTER_H
+
+#include <linux/types.h>
+
+/* following are used both in drivers as well as user space apps */
+
+#define	RAR_TYPE_VIDEO	0
+#define	RAR_TYPE_AUDIO	1
+#define	RAR_TYPE_IMAGE	2
+#define	RAR_TYPE_DATA	3
+
+#ifdef __KERNEL__
+
+struct rar_device;
+
+int register_rar(int num,
+		int (*callback)(unsigned long data), unsigned long data);
+void unregister_rar(int num);
+int rar_get_address(int rar_index, dma_addr_t *start, dma_addr_t *end);
+int rar_lock(int rar_index);
+
+#endif  /* __KERNEL__ */
+#endif  /* _RAR_REGISTER_H */
-- 
cgit v1.2.3-59-g8ed1b


From 8950778704cf8483cc5cc0140f557adf0d3f45a5 Mon Sep 17 00:00:00 2001
From: Alek Du <alek.du@intel.com>
Date: Tue, 13 Jul 2010 10:56:25 +0100
Subject: gpio: Add PMIC GPIO block support

Moorestown has PMIC chip which contains GPIO blocks. The PMIC chip is
connected to Langwell by SPI interface. So this GPIO driver will be regarded
as SPI GPIO expander though the actual GPIO access is through IPC and SRAM.
The SPI master contoller will probe this device driver by parsing SPIB table.

Cleaned up for new IPC, GPE removed and some printk and other tidying by
Alan Cox. Fixes for points noted by Matthew Garrett

Signed-off-by: Alek Du <alek.du@intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Matthew Garrett <mjg@redhat.com>
---
 drivers/platform/x86/Kconfig           |   7 +
 drivers/platform/x86/Makefile          |   2 +
 drivers/platform/x86/intel_pmic_gpio.c | 340 +++++++++++++++++++++++++++++++++
 include/linux/intel_pmic_gpio.h        |  15 ++
 4 files changed, 364 insertions(+)
 create mode 100644 drivers/platform/x86/intel_pmic_gpio.c
 create mode 100644 include/linux/intel_pmic_gpio.h

(limited to 'include/linux')

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 2189565d9e0e..ca30e561859e 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -540,6 +540,13 @@ config INTEL_SCU_IPC
 	  some embedded Intel x86 platforms. This is not needed for PC-type
 	  machines.
 
+config GPIO_INTEL_PMIC
+	bool "Intel PMIC GPIO support"
+	depends on INTEL_SCU_IPC && GPIOLIB
+	---help---
+	  Say Y here to support GPIO via the SCU IPC interface
+	  on Intel MID platforms.
+
 config RAR_REGISTER
 	bool "Restricted Access Region Register Driver"
 	depends on PCI && X86_MRST
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index ed50eca1b556..4744c7744ffa 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -28,3 +28,5 @@ obj-$(CONFIG_TOSHIBA_BT_RFKILL)	+= toshiba_bluetooth.o
 obj-$(CONFIG_INTEL_SCU_IPC)	+= intel_scu_ipc.o
 obj-$(CONFIG_RAR_REGISTER)	+= intel_rar_register.o
 obj-$(CONFIG_INTEL_IPS)		+= intel_ips.o
+obj-$(CONFIG_GPIO_INTEL_PMIC)	+= intel_pmic_gpio.o
+
diff --git a/drivers/platform/x86/intel_pmic_gpio.c b/drivers/platform/x86/intel_pmic_gpio.c
new file mode 100644
index 000000000000..5cdcff653918
--- /dev/null
+++ b/drivers/platform/x86/intel_pmic_gpio.c
@@ -0,0 +1,340 @@
+/* Moorestown PMIC GPIO (access through IPC) driver
+ * Copyright (c) 2008 - 2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Moorestown platform PMIC chip
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/gpio.h>
+#include <linux/interrupt.h>
+#include <asm/intel_scu_ipc.h>
+#include <linux/device.h>
+#include <linux/intel_pmic_gpio.h>
+#include <linux/platform_device.h>
+
+#define DRIVER_NAME "pmic_gpio"
+
+/* register offset that IPC driver should use
+ * 8 GPIO + 8 GPOSW (6 controllable) + 8GPO
+ */
+enum pmic_gpio_register {
+	GPIO0		= 0xE0,
+	GPIO7		= 0xE7,
+	GPIOINT		= 0xE8,
+	GPOSWCTL0	= 0xEC,
+	GPOSWCTL5	= 0xF1,
+	GPO		= 0xF4,
+};
+
+/* bits definition for GPIO & GPOSW */
+#define GPIO_DRV 0x01
+#define GPIO_DIR 0x02
+#define GPIO_DIN 0x04
+#define GPIO_DOU 0x08
+#define GPIO_INTCTL 0x30
+#define GPIO_DBC 0xc0
+
+#define GPOSW_DRV 0x01
+#define GPOSW_DOU 0x08
+#define GPOSW_RDRV 0x30
+
+
+#define NUM_GPIO 24
+
+struct pmic_gpio_irq {
+	spinlock_t lock;
+	u32 trigger[NUM_GPIO];
+	u32 dirty;
+	struct work_struct work;
+};
+
+
+struct pmic_gpio {
+	struct gpio_chip	chip;
+	struct pmic_gpio_irq	irqtypes;
+	void			*gpiointr;
+	int			irq;
+	unsigned		irq_base;
+};
+
+static void pmic_program_irqtype(int gpio, int type)
+{
+	if (type & IRQ_TYPE_EDGE_RISING)
+		intel_scu_ipc_update_register(GPIO0 + gpio, 0x20, 0x20);
+	else
+		intel_scu_ipc_update_register(GPIO0 + gpio, 0x00, 0x20);
+
+	if (type & IRQ_TYPE_EDGE_FALLING)
+		intel_scu_ipc_update_register(GPIO0 + gpio, 0x10, 0x10);
+	else
+		intel_scu_ipc_update_register(GPIO0 + gpio, 0x00, 0x10);
+};
+
+static void pmic_irqtype_work(struct work_struct *work)
+{
+	struct pmic_gpio_irq *t =
+		container_of(work, struct pmic_gpio_irq, work);
+	unsigned long flags;
+	int i;
+	u16 type;
+
+	spin_lock_irqsave(&t->lock, flags);
+	/* As we drop the lock, we may need multiple scans if we race the
+	   pmic_irq_type function */
+	while (t->dirty) {
+		/*
+		 *	For each pin that has the dirty bit set send an IPC
+		 *	message to configure the hardware via the PMIC
+		 */
+		for (i = 0; i < NUM_GPIO; i++) {
+			if (!(t->dirty & (1 << i)))
+				continue;
+			t->dirty &= ~(1 << i);
+			/* We can't trust the array entry or dirty
+			   once the lock is dropped */
+			type = t->trigger[i];
+			spin_unlock_irqrestore(&t->lock, flags);
+			pmic_program_irqtype(i, type);
+			spin_lock_irqsave(&t->lock, flags);
+		}
+	}
+	spin_unlock_irqrestore(&t->lock, flags);
+}
+
+static int pmic_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
+{
+	if (offset > 8) {
+		printk(KERN_ERR
+			"%s: only pin 0-7 support input\n", __func__);
+		return -1;/* we only have 8 GPIO can use as input */
+	}
+	return intel_scu_ipc_update_register(GPIO0 + offset,
+							GPIO_DIR, GPIO_DIR);
+}
+
+static int pmic_gpio_direction_output(struct gpio_chip *chip,
+			unsigned offset, int value)
+{
+	int rc = 0;
+
+	if (offset < 8)/* it is GPIO */
+		rc = intel_scu_ipc_update_register(GPIO0 + offset,
+				GPIO_DRV | GPIO_DOU | GPIO_DIR,
+				GPIO_DRV | (value ? GPIO_DOU : 0));
+	else if (offset < 16)/* it is GPOSW */
+		rc = intel_scu_ipc_update_register(GPOSWCTL0 + offset - 8,
+				GPOSW_DRV | GPOSW_DOU | GPOSW_RDRV,
+				GPOSW_DRV | (value ? GPOSW_DOU : 0));
+	else if (offset > 15 && offset < 24)/* it is GPO */
+		rc = intel_scu_ipc_update_register(GPO,
+				1 << (offset - 16),
+				value ? 1 << (offset - 16) : 0);
+	else {
+		printk(KERN_ERR
+			"%s: invalid PMIC GPIO pin %d!\n", __func__, offset);
+		WARN_ON(1);
+	}
+
+	return rc;
+}
+
+static int pmic_gpio_get(struct gpio_chip *chip, unsigned offset)
+{
+	u8 r;
+	int ret;
+
+	/* we only have 8 GPIO pins we can use as input */
+	if (offset > 8)
+		return -EOPNOTSUPP;
+	ret = intel_scu_ipc_ioread8(GPIO0 + offset, &r);
+	if (ret < 0)
+		return ret;
+	return r & GPIO_DIN;
+}
+
+static void pmic_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	if (offset < 8)/* it is GPIO */
+		intel_scu_ipc_update_register(GPIO0 + offset,
+			GPIO_DRV | GPIO_DOU,
+			GPIO_DRV | (value ? GPIO_DOU : 0));
+	else if (offset < 16)/* it is GPOSW */
+		intel_scu_ipc_update_register(GPOSWCTL0 + offset - 8,
+			GPOSW_DRV | GPOSW_DOU | GPOSW_RDRV,
+			GPOSW_DRV | (value ? GPOSW_DOU : 0));
+	else if (offset > 15 && offset < 24) /* it is GPO */
+		intel_scu_ipc_update_register(GPO,
+			1 << (offset - 16),
+			value ? 1 << (offset - 16) : 0);
+}
+
+static int pmic_irq_type(unsigned irq, unsigned type)
+{
+	struct pmic_gpio *pg = get_irq_chip_data(irq);
+	u32 gpio = irq - pg->irq_base;
+	unsigned long flags;
+
+	if (gpio > pg->chip.ngpio)
+		return -EINVAL;
+
+	spin_lock_irqsave(&pg->irqtypes.lock, flags);
+	pg->irqtypes.trigger[gpio] = type;
+	pg->irqtypes.dirty |=  (1 << gpio);
+	spin_unlock_irqrestore(&pg->irqtypes.lock, flags);
+	schedule_work(&pg->irqtypes.work);
+	return 0;
+}
+
+
+
+static int pmic_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
+{
+	struct pmic_gpio *pg = container_of(chip, struct pmic_gpio, chip);
+
+	return pg->irq_base + offset;
+}
+
+/* the gpiointr register is read-clear, so just do nothing. */
+static void pmic_irq_unmask(unsigned irq)
+{
+};
+
+static void pmic_irq_mask(unsigned irq)
+{
+};
+
+static struct irq_chip pmic_irqchip = {
+	.name		= "PMIC-GPIO",
+	.mask		= pmic_irq_mask,
+	.unmask		= pmic_irq_unmask,
+	.set_type	= pmic_irq_type,
+};
+
+static void pmic_irq_handler(unsigned irq, struct irq_desc *desc)
+{
+	struct pmic_gpio *pg = (struct pmic_gpio *)get_irq_data(irq);
+	u8 intsts = *((u8 *)pg->gpiointr + 4);
+	int gpio;
+
+	for (gpio = 0; gpio < 8; gpio++) {
+		if (intsts & (1 << gpio)) {
+			pr_debug("pmic pin %d triggered\n", gpio);
+			generic_handle_irq(pg->irq_base + gpio);
+		}
+	}
+	desc->chip->eoi(irq);
+}
+
+static int __devinit platform_pmic_gpio_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	int irq = platform_get_irq(pdev, 0);
+	struct intel_pmic_gpio_platform_data *pdata = dev->platform_data;
+
+	struct pmic_gpio *pg;
+	int retval;
+	int i;
+
+	if (irq < 0) {
+		dev_dbg(dev, "no IRQ line\n");
+		return -EINVAL;
+	}
+
+	if (!pdata || !pdata->gpio_base || !pdata->irq_base) {
+		dev_dbg(dev, "incorrect or missing platform data\n");
+		return -EINVAL;
+	}
+
+	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
+	if (!pg)
+		return -ENOMEM;
+
+	dev_set_drvdata(dev, pg);
+
+	pg->irq = irq;
+	/* setting up SRAM mapping for GPIOINT register */
+	pg->gpiointr = ioremap_nocache(pdata->gpiointr, 8);
+	if (!pg->gpiointr) {
+		printk(KERN_ERR "%s: Can not map GPIOINT.\n", __func__);
+		retval = -EINVAL;
+		goto err2;
+	}
+	pg->irq_base = pdata->irq_base;
+	pg->chip.label = "intel_pmic";
+	pg->chip.direction_input = pmic_gpio_direction_input;
+	pg->chip.direction_output = pmic_gpio_direction_output;
+	pg->chip.get = pmic_gpio_get;
+	pg->chip.set = pmic_gpio_set;
+	pg->chip.to_irq = pmic_gpio_to_irq;
+	pg->chip.base = pdata->gpio_base;
+	pg->chip.ngpio = NUM_GPIO;
+	pg->chip.can_sleep = 1;
+	pg->chip.dev = dev;
+
+	INIT_WORK(&pg->irqtypes.work, pmic_irqtype_work);
+	spin_lock_init(&pg->irqtypes.lock);
+
+	pg->chip.dev = dev;
+	retval = gpiochip_add(&pg->chip);
+	if (retval) {
+		printk(KERN_ERR "%s: Can not add pmic gpio chip.\n", __func__);
+		goto err;
+	}
+	set_irq_data(pg->irq, pg);
+	set_irq_chained_handler(pg->irq, pmic_irq_handler);
+	for (i = 0; i < 8; i++) {
+		set_irq_chip_and_handler_name(i + pg->irq_base, &pmic_irqchip,
+					handle_simple_irq, "demux");
+		set_irq_chip_data(i + pg->irq_base, pg);
+	}
+	return 0;
+err:
+	iounmap(pg->gpiointr);
+err2:
+	kfree(pg);
+	return retval;
+}
+
+/* at the same time, register a platform driver
+ * this supports the sfi 0.81 fw */
+static struct platform_driver platform_pmic_gpio_driver = {
+	.driver = {
+		.name		= DRIVER_NAME,
+		.owner		= THIS_MODULE,
+	},
+	.probe		= platform_pmic_gpio_probe,
+};
+
+static int __init platform_pmic_gpio_init(void)
+{
+	return platform_driver_register(&platform_pmic_gpio_driver);
+}
+
+subsys_initcall(platform_pmic_gpio_init);
+
+MODULE_AUTHOR("Alek Du <alek.du@intel.com>");
+MODULE_DESCRIPTION("Intel Moorestown PMIC GPIO driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/intel_pmic_gpio.h b/include/linux/intel_pmic_gpio.h
new file mode 100644
index 000000000000..920109a29191
--- /dev/null
+++ b/include/linux/intel_pmic_gpio.h
@@ -0,0 +1,15 @@
+#ifndef LINUX_INTEL_PMIC_H
+#define LINUX_INTEL_PMIC_H
+
+struct intel_pmic_gpio_platform_data {
+	/* the first IRQ of the chip */
+	unsigned	irq_base;
+	/* number assigned to the first GPIO */
+	unsigned	gpio_base;
+	/* sram address for gpiointr register, the langwell chip will map
+	 * the PMIC spi GPIO expander's GPIOINTR register in sram.
+	 */
+	unsigned	gpiointr;
+};
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 226528c6100e4191842e61997110c8ace40605f7 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Thu, 4 Mar 2010 03:23:36 -0500
Subject: [CPUFREQ] unexport (un)lock_policy_rwsem* functions

lock_policy_rwsem_* and unlock_policy_rwsem_* functions are scheduled
to be unexported when 2.6.33. Now there are no other callers of them
out of cpufreq.c, unexport them and make them static.

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 Documentation/feature-removal-schedule.txt | 10 ----------
 drivers/cpufreq/cpufreq.c                  | 10 +++-------
 include/linux/cpufreq.h                    |  5 -----
 3 files changed, 3 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 1571c0c83dba..182bbe49429b 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -377,16 +377,6 @@ Who:	Eric Paris <eparis@redhat.com>
 
 ----------------------------
 
-What:	lock_policy_rwsem_* and unlock_policy_rwsem_* will not be
-	exported interface anymore.
-When:	2.6.33
-Why:	cpu_policy_rwsem has a new cleaner definition making it local to
-	cpufreq core and contained inside cpufreq.c. Other dependent
-	drivers should not use it in order to safely avoid lockdep issues.
-Who:	Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
-
-----------------------------
-
 What:	sound-slot/service-* module aliases and related clutters in
 	sound/sound_core.c
 When:	August 2010
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 938b74ea9ffb..40877d219081 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -68,7 +68,7 @@ static DEFINE_PER_CPU(int, cpufreq_policy_cpu);
 static DEFINE_PER_CPU(struct rw_semaphore, cpu_policy_rwsem);
 
 #define lock_policy_rwsem(mode, cpu)					\
-int lock_policy_rwsem_##mode						\
+static int lock_policy_rwsem_##mode					\
 (int cpu)								\
 {									\
 	int policy_cpu = per_cpu(cpufreq_policy_cpu, cpu);		\
@@ -83,26 +83,22 @@ int lock_policy_rwsem_##mode						\
 }
 
 lock_policy_rwsem(read, cpu);
-EXPORT_SYMBOL_GPL(lock_policy_rwsem_read);
 
 lock_policy_rwsem(write, cpu);
-EXPORT_SYMBOL_GPL(lock_policy_rwsem_write);
 
-void unlock_policy_rwsem_read(int cpu)
+static void unlock_policy_rwsem_read(int cpu)
 {
 	int policy_cpu = per_cpu(cpufreq_policy_cpu, cpu);
 	BUG_ON(policy_cpu == -1);
 	up_read(&per_cpu(cpu_policy_rwsem, policy_cpu));
 }
-EXPORT_SYMBOL_GPL(unlock_policy_rwsem_read);
 
-void unlock_policy_rwsem_write(int cpu)
+static void unlock_policy_rwsem_write(int cpu)
 {
 	int policy_cpu = per_cpu(cpufreq_policy_cpu, cpu);
 	BUG_ON(policy_cpu == -1);
 	up_write(&per_cpu(cpu_policy_rwsem, policy_cpu));
 }
-EXPORT_SYMBOL_GPL(unlock_policy_rwsem_write);
 
 
 /* internal prototypes */
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 9f15150ce8d6..c3e9de8321c6 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -196,11 +196,6 @@ extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy,
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 
-int lock_policy_rwsem_read(int cpu);
-int lock_policy_rwsem_write(int cpu);
-void unlock_policy_rwsem_read(int cpu);
-void unlock_policy_rwsem_write(int cpu);
-
 
 /*********************************************************************
  *                      CPUFREQ DRIVER INTERFACE                     *
-- 
cgit v1.2.3-59-g8ed1b


From a931da6ac9331a6c80dd91c199105806f2336188 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 3 Aug 2010 21:35:12 -0400
Subject: jbd2: Change j_state_lock to be a rwlock_t

Lockstat reports have shown that j_state_lock is a major source of
lock contention, especially on systems with more than 4 CPU cores.  So
change it to be a read/write spinlock.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c       |  4 +--
 fs/ext4/super.c       |  4 +--
 fs/jbd2/checkpoint.c  | 16 ++++-----
 fs/jbd2/commit.c      | 26 +++++++-------
 fs/jbd2/journal.c     | 94 +++++++++++++++++++++++++--------------------------
 fs/jbd2/transaction.c | 74 +++++++++++++++++++++-------------------
 fs/ocfs2/journal.c    |  4 +--
 include/linux/jbd2.h  |  2 +-
 8 files changed, 114 insertions(+), 110 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 533b607f9cb5..ab2247d642c6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5066,7 +5066,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		transaction_t *transaction;
 		tid_t tid;
 
-		spin_lock(&journal->j_state_lock);
+		read_lock(&journal->j_state_lock);
 		if (journal->j_running_transaction)
 			transaction = journal->j_running_transaction;
 		else
@@ -5075,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			tid = transaction->t_tid;
 		else
 			tid = journal->j_commit_sequence;
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		ei->i_sync_tid = tid;
 		ei->i_datasync_tid = tid;
 	}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3fd65eb66ccd..81cb3fc1218e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3232,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 	journal->j_min_batch_time = sbi->s_min_batch_time;
 	journal->j_max_batch_time = sbi->s_max_batch_time;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
 		journal->j_flags |= JBD2_BARRIER;
 	else
@@ -3241,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
 	else
 		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 static journal_t *ext4_get_journal(struct super_block *sb,
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index f8cdc02520f9..1c23a0f4e8a3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 void __jbd2_log_wait_for_space(journal_t *journal)
 {
 	int nblocks, space_left;
-	assert_spin_locked(&journal->j_state_lock);
+	/* assert_spin_locked(&journal->j_state_lock); */
 
 	nblocks = jbd_space_needed(journal);
 	while (__jbd2_log_space_left(journal) < nblocks) {
 		if (journal->j_flags & JBD2_ABORT)
 			return;
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		mutex_lock(&journal->j_checkpoint_mutex);
 
 		/*
@@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 		 * filesystem, so abort the journal and leave a stack
 		 * trace for forensic evidence.
 		 */
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
 		space_left = __jbd2_log_space_left(journal);
@@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 			if (journal->j_committing_transaction)
 				tid = journal->j_committing_transaction->t_tid;
 			spin_unlock(&journal->j_list_lock);
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			if (chkpt) {
 				jbd2_log_do_checkpoint(journal);
 			} else if (jbd2_cleanup_journal_tail(journal) == 0) {
@@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 				WARN_ON(1);
 				jbd2_journal_abort(journal, 0);
 			}
-			spin_lock(&journal->j_state_lock);
+			write_lock(&journal->j_state_lock);
 		} else {
 			spin_unlock(&journal->j_list_lock);
 		}
@@ -474,7 +474,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 * next transaction ID we will write, and where it will
 	 * start. */
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	transaction = journal->j_checkpoint_transactions;
 	if (transaction) {
@@ -496,7 +496,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	/* If the oldest pinned transaction is at the tail of the log
            already then there's not much we can do right now. */
 	if (journal->j_tail_sequence == first_tid) {
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		return 1;
 	}
 
@@ -516,7 +516,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	journal->j_free += freed;
 	journal->j_tail_sequence = first_tid;
 	journal->j_tail = blocknr;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	/*
 	 * If there is an external journal, we need to make sure that
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index fbd2c564e916..67bb0a2f35e5 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -152,9 +152,9 @@ static int journal_submit_commit_record(journal_t *journal,
 		printk(KERN_WARNING
 		       "JBD2: Disabling barriers on %s, "
 		       "not supported by device\n", journal->j_devname);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 		journal->j_flags &= ~JBD2_BARRIER;
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 
 		/* And try again, without the barrier */
 		lock_buffer(bh);
@@ -182,9 +182,9 @@ retry:
 		printk(KERN_WARNING
 		       "JBD2: %s: disabling barries on %s - not supported "
 		       "by device\n", __func__, journal->j_devname);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 		journal->j_flags &= ~JBD2_BARRIER;
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 
 		lock_buffer(bh);
 		clear_buffer_dirty(bh);
@@ -400,7 +400,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
 	/*
@@ -424,9 +424,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 					TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&commit_transaction->t_updates)) {
 			spin_unlock(&commit_transaction->t_handle_lock);
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			schedule();
-			spin_lock(&journal->j_state_lock);
+			write_lock(&journal->j_state_lock);
 			spin_lock(&commit_transaction->t_handle_lock);
 		}
 		finish_wait(&journal->j_wait_updates, &wait);
@@ -497,7 +497,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	start_time = ktime_get();
 	commit_transaction->t_log_start = journal->j_head;
 	wake_up(&journal->j_wait_transaction_locked);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	jbd_debug (3, "JBD: commit phase 2\n");
 
@@ -519,9 +519,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * transaction!  Now comes the tricky part: we need to write out
 	 * metadata.  Loop over the transaction's entire buffer list:
 	 */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_COMMIT;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	trace_jbd2_commit_logging(journal, commit_transaction);
 	stats.run.rs_logging = jiffies;
@@ -978,7 +978,7 @@ restart_loop:
 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
 	 * other checkpointing code processing the transaction...
 	 */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	/*
 	 * Now recheck if some buffers did not get attached to the transaction
@@ -986,7 +986,7 @@ restart_loop:
 	 */
 	if (commit_transaction->t_forget) {
 		spin_unlock(&journal->j_list_lock);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		goto restart_loop;
 	}
 
@@ -1038,7 +1038,7 @@ restart_loop:
 				journal->j_average_commit_time*3) / 4;
 	else
 		journal->j_average_commit_time = commit_time;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	if (commit_transaction->t_checkpoint_list == NULL &&
 	    commit_transaction->t_checkpoint_io_list == NULL) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a79d3345b55a..e7bf0fd9cec7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -142,7 +142,7 @@ static int kjournald2(void *arg)
 	/*
 	 * And now, wait forever for commit wakeup events.
 	 */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 
 loop:
 	if (journal->j_flags & JBD2_UNMOUNT)
@@ -153,10 +153,10 @@ loop:
 
 	if (journal->j_commit_sequence != journal->j_commit_request) {
 		jbd_debug(1, "OK, requests differ\n");
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		del_timer_sync(&journal->j_commit_timer);
 		jbd2_journal_commit_transaction(journal);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 		goto loop;
 	}
 
@@ -168,9 +168,9 @@ loop:
 		 * be already stopped.
 		 */
 		jbd_debug(1, "Now suspending kjournald2\n");
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		refrigerator();
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 	} else {
 		/*
 		 * We assume on resume that commits are already there,
@@ -190,9 +190,9 @@ loop:
 		if (journal->j_flags & JBD2_UNMOUNT)
 			should_sleep = 0;
 		if (should_sleep) {
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			schedule();
-			spin_lock(&journal->j_state_lock);
+			write_lock(&journal->j_state_lock);
 		}
 		finish_wait(&journal->j_wait_commit, &wait);
 	}
@@ -210,7 +210,7 @@ loop:
 	goto loop;
 
 end_loop:
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	del_timer_sync(&journal->j_commit_timer);
 	journal->j_task = NULL;
 	wake_up(&journal->j_wait_done_commit);
@@ -233,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal)
 
 static void journal_kill_thread(journal_t *journal)
 {
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_UNMOUNT;
 
 	while (journal->j_task) {
 		wake_up(&journal->j_wait_commit);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 /*
@@ -452,7 +452,7 @@ int __jbd2_log_space_left(journal_t *journal)
 {
 	int left = journal->j_free;
 
-	assert_spin_locked(&journal->j_state_lock);
+	/* assert_spin_locked(&journal->j_state_lock); */
 
 	/*
 	 * Be pessimistic here about the number of those free blocks which
@@ -497,9 +497,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
 {
 	int ret;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	ret = __jbd2_log_start_commit(journal, tid);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return ret;
 }
 
@@ -518,7 +518,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 	transaction_t *transaction = NULL;
 	tid_t tid;
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	if (journal->j_running_transaction && !current->journal_info) {
 		transaction = journal->j_running_transaction;
 		__jbd2_log_start_commit(journal, transaction->t_tid);
@@ -526,12 +526,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 		transaction = journal->j_committing_transaction;
 
 	if (!transaction) {
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		return 0;	/* Nothing to retry */
 	}
 
 	tid = transaction->t_tid;
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 	jbd2_log_wait_commit(journal, tid);
 	return 1;
 }
@@ -545,7 +545,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
 	int ret = 0;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;
 
@@ -564,7 +564,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 			*ptid = journal->j_committing_transaction->t_tid;
 		ret = 1;
 	}
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return ret;
 }
 
@@ -576,26 +576,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 {
 	int err = 0;
 
+	read_lock(&journal->j_state_lock);
 #ifdef CONFIG_JBD2_DEBUG
-	spin_lock(&journal->j_state_lock);
 	if (!tid_geq(journal->j_commit_request, tid)) {
 		printk(KERN_EMERG
 		       "%s: error: j_commit_request=%d, tid=%d\n",
 		       __func__, journal->j_commit_request, tid);
 	}
-	spin_unlock(&journal->j_state_lock);
 #endif
-	spin_lock(&journal->j_state_lock);
 	while (tid_gt(tid, journal->j_commit_sequence)) {
 		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
 				  tid, journal->j_commit_sequence);
 		wake_up(&journal->j_wait_commit);
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_done_commit,
 				!tid_gt(tid, journal->j_commit_sequence));
-		spin_lock(&journal->j_state_lock);
+		read_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	if (unlikely(is_journal_aborted(journal))) {
 		printk(KERN_EMERG "journal commit I/O error\n");
@@ -612,7 +610,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 {
 	unsigned long blocknr;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	J_ASSERT(journal->j_free > 1);
 
 	blocknr = journal->j_head;
@@ -620,7 +618,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 	journal->j_free--;
 	if (journal->j_head == journal->j_last)
 		journal->j_head = journal->j_first;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return jbd2_journal_bmap(journal, blocknr, retp);
 }
 
@@ -840,7 +838,7 @@ static journal_t * journal_init_common (void)
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
 	spin_lock_init(&journal->j_list_lock);
-	spin_lock_init(&journal->j_state_lock);
+	rwlock_init(&journal->j_state_lock);
 
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
 	journal->j_min_batch_time = 0;
@@ -1106,14 +1104,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 		set_buffer_uptodate(bh);
 	}
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
 	sb->s_start    = cpu_to_be32(journal->j_tail);
 	sb->s_errno    = cpu_to_be32(journal->j_errno);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	BUFFER_TRACE(bh, "marking dirty");
 	mark_buffer_dirty(bh);
@@ -1134,12 +1132,12 @@ out:
 	 * any future commit will have to be careful to update the
 	 * superblock again to re-record the true start of the log. */
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (sb->s_start)
 		journal->j_flags &= ~JBD2_FLUSHED;
 	else
 		journal->j_flags |= JBD2_FLUSHED;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 /*
@@ -1551,7 +1549,7 @@ int jbd2_journal_flush(journal_t *journal)
 	transaction_t *transaction = NULL;
 	unsigned long old_tail;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 
 	/* Force everything buffered to the log... */
 	if (journal->j_running_transaction) {
@@ -1564,10 +1562,10 @@ int jbd2_journal_flush(journal_t *journal)
 	if (transaction) {
 		tid_t tid = transaction->t_tid;
 
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		jbd2_log_wait_commit(journal, tid);
 	} else {
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 	}
 
 	/* ...and flush everything in the log out to disk. */
@@ -1591,12 +1589,12 @@ int jbd2_journal_flush(journal_t *journal)
 	 * the magic code for a fully-recovered superblock.  Any future
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	old_tail = journal->j_tail;
 	journal->j_tail = 0;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	jbd2_journal_update_superblock(journal, 1);
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_tail = old_tail;
 
 	J_ASSERT(!journal->j_running_transaction);
@@ -1604,7 +1602,7 @@ int jbd2_journal_flush(journal_t *journal)
 	J_ASSERT(!journal->j_checkpoint_transactions);
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return 0;
 }
 
@@ -1668,12 +1666,12 @@ void __jbd2_journal_abort_hard(journal_t *journal)
 	printk(KERN_ERR "Aborting journal on device %s.\n",
 	       journal->j_devname);
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_ABORT;
 	transaction = journal->j_running_transaction;
 	if (transaction)
 		__jbd2_log_start_commit(journal, transaction->t_tid);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 /* Soft abort: record the abort error status in the journal superblock,
@@ -1758,12 +1756,12 @@ int jbd2_journal_errno(journal_t *journal)
 {
 	int err;
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	if (journal->j_flags & JBD2_ABORT)
 		err = -EROFS;
 	else
 		err = journal->j_errno;
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 	return err;
 }
 
@@ -1778,12 +1776,12 @@ int jbd2_journal_clear_err(journal_t *journal)
 {
 	int err = 0;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (journal->j_flags & JBD2_ABORT)
 		err = -EROFS;
 	else
 		journal->j_errno = 0;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return err;
 }
 
@@ -1796,10 +1794,10 @@ int jbd2_journal_clear_err(journal_t *journal)
  */
 void jbd2_journal_ack_err(journal_t *journal)
 {
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (journal->j_errno)
 		journal->j_flags |= JBD2_ACK_ERR;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 int jbd2_journal_blocks_per_page(struct inode *inode)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 9c64c7ec48d4..663065142b42 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -124,36 +124,38 @@ alloc_transaction:
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
-repeat:
-
 	/*
 	 * We need to hold j_state_lock until t_updates has been incremented,
 	 * for proper journal barrier handling
 	 */
-	spin_lock(&journal->j_state_lock);
-repeat_locked:
+repeat:
+	read_lock(&journal->j_state_lock);
 	if (is_journal_aborted(journal) ||
 	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		kfree(new_transaction);
 		return -EROFS;
 	}
 
 	/* Wait on the journal's transaction barrier if necessary */
 	if (journal->j_barrier_count) {
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_transaction_locked,
 				journal->j_barrier_count == 0);
 		goto repeat;
 	}
 
 	if (!journal->j_running_transaction) {
-		if (!new_transaction) {
-			spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
+		if (!new_transaction)
 			goto alloc_transaction;
+		write_lock(&journal->j_state_lock);
+		if (!journal->j_running_transaction) {
+			jbd2_get_transaction(journal, new_transaction);
+			new_transaction = NULL;
 		}
-		jbd2_get_transaction(journal, new_transaction);
-		new_transaction = NULL;
+		write_unlock(&journal->j_state_lock);
+		goto repeat;
 	}
 
 	transaction = journal->j_running_transaction;
@@ -167,7 +169,7 @@ repeat_locked:
 
 		prepare_to_wait(&journal->j_wait_transaction_locked,
 					&wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
@@ -194,7 +196,7 @@ repeat_locked:
 		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
 				TASK_UNINTERRUPTIBLE);
 		__jbd2_log_start_commit(journal, transaction->t_tid);
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
@@ -228,8 +230,12 @@ repeat_locked:
 	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
 		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
 		spin_unlock(&transaction->t_handle_lock);
-		__jbd2_log_wait_for_space(journal);
-		goto repeat_locked;
+		read_unlock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
+		if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
+			__jbd2_log_wait_for_space(journal);
+		write_unlock(&journal->j_state_lock);
+		goto repeat;
 	}
 
 	/* OK, account for the buffers that this operation expects to
@@ -250,7 +256,7 @@ repeat_locked:
 		  atomic_read(&transaction->t_outstanding_credits),
 		  __jbd2_log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	lock_map_acquire(&handle->h_lockdep_map);
 	kfree(new_transaction);
@@ -362,7 +368,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 
 	result = 1;
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 
 	/* Don't extend a locked-down transaction! */
 	if (handle->h_transaction->t_state != T_RUNNING) {
@@ -394,7 +400,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 unlock:
 	spin_unlock(&transaction->t_handle_lock);
 error_out:
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 out:
 	return result;
 }
@@ -432,7 +438,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 	J_ASSERT(journal_current_handle() == handle);
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
 	atomic_sub(handle->h_buffer_credits,
 		   &transaction->t_outstanding_credits);
@@ -442,7 +448,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 
 	jbd_debug(2, "restarting handle %p\n", handle);
 	__jbd2_log_start_commit(journal, transaction->t_tid);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
@@ -472,7 +478,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
 {
 	DEFINE_WAIT(wait);
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	++journal->j_barrier_count;
 
 	/* Wait until there are no running updates */
@@ -490,12 +496,12 @@ void jbd2_journal_lock_updates(journal_t *journal)
 		prepare_to_wait(&journal->j_wait_updates, &wait,
 				TASK_UNINTERRUPTIBLE);
 		spin_unlock(&transaction->t_handle_lock);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_updates, &wait);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	/*
 	 * We have now established a barrier against other normal updates, but
@@ -519,9 +525,9 @@ void jbd2_journal_unlock_updates (journal_t *journal)
 	J_ASSERT(journal->j_barrier_count != 0);
 
 	mutex_unlock(&journal->j_barrier);
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	--journal->j_barrier_count;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	wake_up(&journal->j_wait_transaction_locked);
 }
 
@@ -1314,9 +1320,9 @@ int jbd2_journal_stop(handle_t *handle)
 
 		journal->j_last_sync_writer = pid;
 
-		spin_lock(&journal->j_state_lock);
+		read_lock(&journal->j_state_lock);
 		commit_time = journal->j_average_commit_time;
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 
 		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
 						   transaction->t_start_time));
@@ -1748,7 +1754,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		goto zap_buffer_unlocked;
 
 	/* OK, we have data buffer in journaled mode */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
 
@@ -1801,7 +1807,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 			jbd2_journal_put_journal_head(jh);
 			spin_unlock(&journal->j_list_lock);
 			jbd_unlock_bh_state(bh);
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			return ret;
 		} else {
 			/* There is no currently-running transaction. So the
@@ -1815,7 +1821,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 				jbd2_journal_put_journal_head(jh);
 				spin_unlock(&journal->j_list_lock);
 				jbd_unlock_bh_state(bh);
-				spin_unlock(&journal->j_state_lock);
+				write_unlock(&journal->j_state_lock);
 				return ret;
 			} else {
 				/* The orphan record's transaction has
@@ -1839,7 +1845,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		jbd2_journal_put_journal_head(jh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		return 0;
 	} else {
 		/* Good, the buffer belongs to the running transaction.
@@ -1858,7 +1864,7 @@ zap_buffer:
 zap_buffer_no_jh:
 	spin_unlock(&journal->j_list_lock);
 	jbd_unlock_bh_state(bh);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 zap_buffer_unlocked:
 	clear_buffer_dirty(bh);
 	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2165,9 +2171,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
 	/* Locks are here just to force reading of recent values, it is
 	 * enough that the transaction was not committing before we started
 	 * a transaction adding the inode to orphan list */
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	commit_trans = journal->j_committing_transaction;
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	inode_trans = jinode->i_transaction;
 	spin_unlock(&journal->j_list_lock);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 47878cf16418..9c1b92ebeb94 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -760,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
 	if (osb->osb_commit_interval)
 		commit_interval = osb->osb_commit_interval;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_commit_interval = commit_interval;
 	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
 		journal->j_flags |= JBD2_BARRIER;
 	else
 		journal->j_flags &= ~JBD2_BARRIER;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a72ce21de0e1..15d5743ccfbb 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -764,7 +764,7 @@ struct journal_s
 	/*
 	 * Protect the various scalars in the journal
 	 */
-	spinlock_t		j_state_lock;
+	rwlock_t		j_state_lock;
 
 	/*
 	 * Number of processes waiting to create a barrier lock [j_state_lock]
-- 
cgit v1.2.3-59-g8ed1b


From 8dd420466c7bfc459fa04680bd5690bfc41a4553 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 3 Aug 2010 21:38:29 -0400
Subject: jbd2: Remove t_handle_lock from start_this_handle()

This should remove the last exclusive lock from start_this_handle(),
so that we should now be able to start multiple transactions at the
same time on large SMP systems.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c      |  3 ++-
 fs/jbd2/transaction.c | 33 ++++++++++++++++++++++-----------
 include/linux/jbd2.h  |  2 +-
 3 files changed, 25 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 67bb0a2f35e5..f52e5e8049f1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -1004,7 +1004,8 @@ restart_loop:
 	 * File the transaction statistics
 	 */
 	stats.ts_tid = commit_transaction->t_tid;
-	stats.run.rs_handle_count = commit_transaction->t_handle_count;
+	stats.run.rs_handle_count =
+		atomic_read(&commit_transaction->t_handle_count);
 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
 			     commit_transaction->t_tid, &stats.run);
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 663065142b42..0752bcda535f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,6 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	spin_lock_init(&transaction->t_handle_lock);
 	atomic_set(&transaction->t_updates, 0);
 	atomic_set(&transaction->t_outstanding_credits, 0);
+	atomic_set(&transaction->t_handle_count, 0);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
 	INIT_LIST_HEAD(&transaction->t_private_list);
 
@@ -180,8 +181,8 @@ repeat:
 	 * buffers requested by this operation, we need to stall pending a log
 	 * checkpoint to free some more log space.
 	 */
-	spin_lock(&transaction->t_handle_lock);
-	needed = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+	needed = atomic_add_return(nblocks,
+				   &transaction->t_outstanding_credits);
 
 	if (needed > journal->j_max_transaction_buffers) {
 		/*
@@ -192,7 +193,7 @@ repeat:
 		DEFINE_WAIT(wait);
 
 		jbd_debug(2, "Handle %p starting new commit...\n", handle);
-		spin_unlock(&transaction->t_handle_lock);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
 				TASK_UNINTERRUPTIBLE);
 		__jbd2_log_start_commit(journal, transaction->t_tid);
@@ -229,7 +230,7 @@ repeat:
 	 */
 	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
 		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-		spin_unlock(&transaction->t_handle_lock);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		read_unlock(&journal->j_state_lock);
 		write_lock(&journal->j_state_lock);
 		if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
@@ -239,23 +240,33 @@ repeat:
 	}
 
 	/* OK, account for the buffers that this operation expects to
-	 * use and add the handle to the running transaction. */
-
-	if (time_after(transaction->t_start, ts)) {
+	 * use and add the handle to the running transaction. 
+	 *
+	 * In order for t_max_wait to be reliable, it must be
+	 * protected by a lock.  But doing so will mean that
+	 * start_this_handle() can not be run in parallel on SMP
+	 * systems, which limits our scalability.  So we only enable
+	 * it when debugging is enabled.  We may want to use a
+	 * separate flag, eventually, so we can enable this
+	 * independently of debugging.
+	 */
+#ifdef CONFIG_JBD2_DEBUG
+	if (jbd2_journal_enable_debug &&
+	    time_after(transaction->t_start, ts)) {
 		ts = jbd2_time_diff(ts, transaction->t_start);
+		spin_lock(&transaction->t_handle_lock);
 		if (ts > transaction->t_max_wait)
 			transaction->t_max_wait = ts;
+		spin_unlock(&transaction->t_handle_lock);
 	}
-
+#endif
 	handle->h_transaction = transaction;
-	atomic_add(nblocks, &transaction->t_outstanding_credits);
 	atomic_inc(&transaction->t_updates);
-	transaction->t_handle_count++;
+	atomic_inc(&transaction->t_handle_count);
 	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
 		  handle, nblocks,
 		  atomic_read(&transaction->t_outstanding_credits),
 		  __jbd2_log_space_left(journal));
-	spin_unlock(&transaction->t_handle_lock);
 	read_unlock(&journal->j_state_lock);
 
 	lock_map_acquire(&handle->h_lockdep_map);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 15d5743ccfbb..01743b5446ff 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -629,7 +629,7 @@ struct transaction_s
 	/*
 	 * How many handles used this transaction? [t_handle_lock]
 	 */
-	int t_handle_count;
+	atomic_t		t_handle_count;
 
 	/*
 	 * This transaction is being forced and some process is
-- 
cgit v1.2.3-59-g8ed1b


From f1f88fc7e818c6678c6799a2edb8f1aeccc124aa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 31 Jul 2010 14:29:07 -0400
Subject: SUNRPC: The function rpc_restart_call() should return success/failure

Both rpc_restart_call_prepare() and rpc_restart_call() test for the
RPC_TASK_KILLED flag, and fail to restart the RPC call if that flag is set.

This patch allows callers to know whether or not the restart was
successful, so that they can perform cleanups etc in case of failure.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  4 ++--
 net/sunrpc/clnt.c           | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 8ed9642a5a76..debe75532199 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -148,8 +148,8 @@ int		rpc_call_sync(struct rpc_clnt *clnt,
 			      const struct rpc_message *msg, int flags);
 struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred,
 			       int flags);
-void		rpc_restart_call_prepare(struct rpc_task *);
-void		rpc_restart_call(struct rpc_task *);
+int		rpc_restart_call_prepare(struct rpc_task *);
+int		rpc_restart_call(struct rpc_task *);
 void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 size_t		rpc_max_payload(struct rpc_clnt *);
 void		rpc_force_rebind(struct rpc_clnt *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 756fc324db9e..234c40c15f69 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -756,12 +756,13 @@ EXPORT_SYMBOL_GPL(rpc_force_rebind);
  * Restart an (async) RPC call from the call_prepare state.
  * Usually called from within the exit handler.
  */
-void
+int
 rpc_restart_call_prepare(struct rpc_task *task)
 {
 	if (RPC_ASSASSINATED(task))
-		return;
+		return 0;
 	task->tk_action = rpc_prepare_task;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
 
@@ -769,13 +770,13 @@ EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
  * Restart an (async) RPC call. Usually called from within the
  * exit handler.
  */
-void
+int
 rpc_restart_call(struct rpc_task *task)
 {
 	if (RPC_ASSASSINATED(task))
-		return;
-
+		return 0;
 	task->tk_action = call_start;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(rpc_restart_call);
 
-- 
cgit v1.2.3-59-g8ed1b


From 173bdd746b128241d3d6d202142820692e7dd530 Mon Sep 17 00:00:00 2001
From: Shubhrajyoti D <shubhrajyoti@ti.com>
Date: Tue, 3 Aug 2010 19:44:40 -0700
Subject: Input: gpio_keys - add hooks to enable/disable device

Allow platform code to specify callbcks that will be invoked when
input device is opened or closed, allowing, for example, to enable
the device.

Signed-off-by: Shubhrajyoti D <shubhrajyoti@ti.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/keyboard/gpio_keys.c | 22 ++++++++++++++++++++++
 include/linux/gpio_keys.h          |  2 ++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c
index a9fd147f2ba7..6069abe31e42 100644
--- a/drivers/input/keyboard/gpio_keys.c
+++ b/drivers/input/keyboard/gpio_keys.c
@@ -39,6 +39,8 @@ struct gpio_keys_drvdata {
 	struct input_dev *input;
 	struct mutex disable_lock;
 	unsigned int n_buttons;
+	int (*enable)(struct device *dev);
+	void (*disable)(struct device *dev);
 	struct gpio_button_data data[0];
 };
 
@@ -423,6 +425,21 @@ fail2:
 	return error;
 }
 
+static int gpio_keys_open(struct input_dev *input)
+{
+	struct gpio_keys_drvdata *ddata = input_get_drvdata(input);
+
+	return ddata->enable ? ddata->enable(input->dev.parent) : 0;
+}
+
+static void gpio_keys_close(struct input_dev *input)
+{
+	struct gpio_keys_drvdata *ddata = input_get_drvdata(input);
+
+	if (ddata->disable)
+		ddata->disable(input->dev.parent);
+}
+
 static int __devinit gpio_keys_probe(struct platform_device *pdev)
 {
 	struct gpio_keys_platform_data *pdata = pdev->dev.platform_data;
@@ -444,13 +461,18 @@ static int __devinit gpio_keys_probe(struct platform_device *pdev)
 
 	ddata->input = input;
 	ddata->n_buttons = pdata->nbuttons;
+	ddata->enable = pdata->enable;
+	ddata->disable = pdata->disable;
 	mutex_init(&ddata->disable_lock);
 
 	platform_set_drvdata(pdev, ddata);
+	input_set_drvdata(input, ddata);
 
 	input->name = pdev->name;
 	input->phys = "gpio-keys/input0";
 	input->dev.parent = &pdev->dev;
+	input->open = gpio_keys_open;
+	input->close = gpio_keys_close;
 
 	input->id.bustype = BUS_HOST;
 	input->id.vendor = 0x0001;
diff --git a/include/linux/gpio_keys.h b/include/linux/gpio_keys.h
index cd0b3f30f48e..ce73a30113b4 100644
--- a/include/linux/gpio_keys.h
+++ b/include/linux/gpio_keys.h
@@ -17,6 +17,8 @@ struct gpio_keys_platform_data {
 	struct gpio_keys_button *buttons;
 	int nbuttons;
 	unsigned int rep:1;		/* enable input subsystem auto repeat */
+	int (*enable)(struct device *dev);
+	void (*disable)(struct device *dev);
 };
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From b5272b509a8570bb559156001e74ee162c5cb96a Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Wed, 21 Jul 2010 10:13:06 +0000
Subject: sh: add a list of parent configurations to struct clk

Many system clocks can select a parent by writing a value to a specific field
in the configuration register. Add a list of parents and location and width of
the source selection field in the clock configuration register to struct clk to
assist in clk_set_parent() implementation.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Acked-by: Magnus Damm <damm@opensource.se>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 include/linux/sh_clk.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sh_clk.h b/include/linux/sh_clk.h
index 1636d1e2a5f1..08a07b9a894f 100644
--- a/include/linux/sh_clk.h
+++ b/include/linux/sh_clk.h
@@ -25,6 +25,10 @@ struct clk {
 	int			id;
 
 	struct clk		*parent;
+	struct clk		**parent_table;	/* list of parents to */
+	unsigned short		parent_num;	/* choose between */
+	unsigned char		src_shift;	/* source clock field in the */
+	unsigned char		src_width;	/* configuration register */
 	struct clk_ops		*ops;
 
 	struct list_head	children;
-- 
cgit v1.2.3-59-g8ed1b


From b3dd51a8a6ce2e618e8a1be8fa0e7d3d4733c300 Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Wed, 21 Jul 2010 10:13:10 +0000
Subject: sh: add a reparent function to DIV6 clocks

Add support for reparenting of div6 clocks on SuperH and SH-Mobile SoCs.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Acked-by: Magnus Damm <damm@opensource.se>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/sh/clk-cpg.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/sh_clk.h | 19 ++++++++++++-----
 2 files changed, 70 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/sh/clk-cpg.c b/drivers/sh/clk-cpg.c
index f5c80ba9ab1c..8c024b984ed8 100644
--- a/drivers/sh/clk-cpg.c
+++ b/drivers/sh/clk-cpg.c
@@ -68,6 +68,39 @@ static unsigned long sh_clk_div6_recalc(struct clk *clk)
 	return clk->freq_table[idx].frequency;
 }
 
+static int sh_clk_div6_set_parent(struct clk *clk, struct clk *parent)
+{
+	struct clk_div_mult_table *table = &sh_clk_div6_table;
+	u32 value;
+	int ret, i;
+
+	if (!clk->parent_table || !clk->parent_num)
+		return -EINVAL;
+
+	/* Search the parent */
+	for (i = 0; i < clk->parent_num; i++)
+		if (clk->parent_table[i] == parent)
+			break;
+
+	if (i == clk->parent_num)
+		return -ENODEV;
+
+	ret = clk_reparent(clk, parent);
+	if (ret < 0)
+		return ret;
+
+	value = __raw_readl(clk->enable_reg) &
+		~(((1 << clk->src_width) - 1) << clk->src_shift);
+
+	__raw_writel(value | (i << clk->src_shift), clk->enable_reg);
+
+	/* Rebuild the frequency table */
+	clk_rate_table_build(clk, clk->freq_table, table->nr_divisors,
+			     table, &clk->arch_flags);
+
+	return 0;
+}
+
 static int sh_clk_div6_set_rate(struct clk *clk,
 				unsigned long rate, int algo_id)
 {
@@ -117,7 +150,17 @@ static struct clk_ops sh_clk_div6_clk_ops = {
 	.disable	= sh_clk_div6_disable,
 };
 
-int __init sh_clk_div6_register(struct clk *clks, int nr)
+static struct clk_ops sh_clk_div6_reparent_clk_ops = {
+	.recalc		= sh_clk_div6_recalc,
+	.round_rate	= sh_clk_div_round_rate,
+	.set_rate	= sh_clk_div6_set_rate,
+	.enable		= sh_clk_div6_enable,
+	.disable	= sh_clk_div6_disable,
+	.set_parent	= sh_clk_div6_set_parent,
+};
+
+static int __init sh_clk_div6_register_ops(struct clk *clks, int nr,
+					   struct clk_ops *ops)
 {
 	struct clk *clkp;
 	void *freq_table;
@@ -136,7 +179,7 @@ int __init sh_clk_div6_register(struct clk *clks, int nr)
 	for (k = 0; !ret && (k < nr); k++) {
 		clkp = clks + k;
 
-		clkp->ops = &sh_clk_div6_clk_ops;
+		clkp->ops = ops;
 		clkp->id = -1;
 		clkp->freq_table = freq_table + (k * freq_table_size);
 		clkp->freq_table[nr_divs].frequency = CPUFREQ_TABLE_END;
@@ -147,6 +190,17 @@ int __init sh_clk_div6_register(struct clk *clks, int nr)
 	return ret;
 }
 
+int __init sh_clk_div6_register(struct clk *clks, int nr)
+{
+	return sh_clk_div6_register_ops(clks, nr, &sh_clk_div6_clk_ops);
+}
+
+int __init sh_clk_div6_reparent_register(struct clk *clks, int nr)
+{
+	return sh_clk_div6_register_ops(clks, nr,
+					&sh_clk_div6_reparent_clk_ops);
+}
+
 static unsigned long sh_clk_div4_recalc(struct clk *clk)
 {
 	struct clk_div4_table *d4t = clk->priv;
diff --git a/include/linux/sh_clk.h b/include/linux/sh_clk.h
index 08a07b9a894f..875ce50719a9 100644
--- a/include/linux/sh_clk.h
+++ b/include/linux/sh_clk.h
@@ -142,13 +142,22 @@ int sh_clk_div4_enable_register(struct clk *clks, int nr,
 int sh_clk_div4_reparent_register(struct clk *clks, int nr,
 			 struct clk_div4_table *table);
 
-#define SH_CLK_DIV6(_parent, _reg, _flags)	\
-{						\
-	.parent = _parent,			\
-	.enable_reg = (void __iomem *)_reg,	\
-	.flags = _flags,			\
+#define SH_CLK_DIV6_EXT(_parent, _reg, _flags, _parents,	\
+			_num_parents, _src_shift, _src_width)	\
+{								\
+	.parent = _parent,					\
+	.enable_reg = (void __iomem *)_reg,			\
+	.flags = _flags,					\
+	.parent_table = _parents,				\
+	.parent_num = _num_parents,				\
+	.src_shift = _src_shift,				\
+	.src_width = _src_width,				\
 }
 
+#define SH_CLK_DIV6(_parent, _reg, _flags)			\
+	SH_CLK_DIV6_EXT(_parent, _reg, _flags, NULL, 0, 0, 0)
+
 int sh_clk_div6_register(struct clk *clks, int nr);
+int sh_clk_div6_reparent_register(struct clk *clks, int nr);
 
 #endif /* __SH_CLOCK_H */
-- 
cgit v1.2.3-59-g8ed1b


From e1b004c3ef9c59db5f013528628b51c8653155ec Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 4 Aug 2010 10:53:00 +0200
Subject: Revert "timer: Added usleep[_range] timer"

This reverts commit 22b8f15c2f7130bb0386f548428df2ffd4e81903 to merge
an advanced version.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/delay.h |  6 ------
 kernel/timer.c        | 22 ----------------------
 2 files changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index 0e303d1aacd8..fd832c6d419e 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -45,12 +45,6 @@ extern unsigned long lpj_fine;
 void calibrate_delay(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
-void usleep_range(unsigned long min, unsigned long max);
-
-static inline void usleep(unsigned long usecs)
-{
-	usleep_range(usecs, usecs);
-}
 
 static inline void ssleep(unsigned int seconds)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index f110f241ab66..ce98685cd1cb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1755,25 +1755,3 @@ unsigned long msleep_interruptible(unsigned int msecs)
 }
 
 EXPORT_SYMBOL(msleep_interruptible);
-
-static int __sched do_usleep_range(unsigned long min, unsigned long max)
-{
-	ktime_t kmin;
-	unsigned long delta;
-
-	kmin = ktime_set(0, min * NSEC_PER_USEC);
-	delta = max - min;
-	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
-}
-
-/**
- * usleep_range - Drop in replacement for udelay where wakeup is flexible
- * @min: Minimum time in usecs to sleep
- * @max: Maximum time in usecs to sleep
- */
-void usleep_range(unsigned long min, unsigned long max)
-{
-	__set_current_state(TASK_UNINTERRUPTIBLE);
-	do_usleep_range(min, max);
-}
-EXPORT_SYMBOL(usleep_range);
-- 
cgit v1.2.3-59-g8ed1b


From 5e7f5a178bba45c5aca3448fddecabd4e28f1f6b Mon Sep 17 00:00:00 2001
From: Patrick Pannuto <ppannuto@codeaurora.org>
Date: Mon, 2 Aug 2010 15:01:04 -0700
Subject: timer: Added usleep_range timer

usleep_range is a finer precision implementations of msleep
and is designed to be a drop-in replacement for udelay where
a precise sleep / busy-wait is unnecessary.

Since an easy interface to hrtimers could lead to an undesired
proliferation of interrupts, we provide only a "range" API,
forcing the caller to think about an acceptable tolerance on
both ends and hopefully avoiding introducing another interrupt.

INTRO

As discussed here ( http://lkml.org/lkml/2007/8/3/250 ), msleep(1) is not
precise enough for many drivers (yes, sleep precision is an unfair notion,
but consistently sleeping for ~an order of magnitude greater than requested
is worth fixing). This patch adds a usleep API so that udelay does not have
to be used. Obviously not every udelay can be replaced (those in atomic
contexts or being used for simple bitbanging come to mind), but there are
many, many examples of

mydriver_write(...)
/* Wait for hardware to latch */
udelay(100)

in various drivers where a busy-wait loop is neither beneficial nor
necessary, but msleep simply does not provide enough precision and people
are using a busy-wait loop instead.

CONCERNS FROM THE RFC

Why is udelay a problem / necessary? Most callers of udelay are in device/
driver initialization code, which is serial...

	As I see it, there is only benefit to sleeping over a delay; the
	notion of "refactoring" areas that use udelay was presented, but
	I see usleep as the refactoring. Consider i2c, if the bus is busy,
	you need to wait a bit (say 100us) before trying again, your
	current options are:

		* udelay(100)
		* msleep(1) <-- As noted above, actually as high as ~20ms
				on some platforms, so not really an option
		* Manually set up an hrtimer to try again in 100us (which
		  is what usleep does anyway...)

	People choose the udelay route because it is EASY; we need to
	provide a better easy route.

	Device / driver / boot code is *currently* serial, but every few
	months someone makes noise about parallelizing boot, and IMHO, a
	little forward-thinking now is one less thing to worry about
	if/when that ever happens

udelay's could be preempted

	Sure, but if udelay plans on looping 1000 times, and it gets
	preempted on loop 200, whenever it's scheduled again, it is
	going to do the next 800 loops.

Is the interruptible case needed?

	Probably not, but I see usleep as a very logical parallel to msleep,
	so it made sense to include the "full" API. Processors are getting
	faster (albeit not as quickly as they are becoming more parallel),
	so if someone wanted to be interruptible for a few usecs, why not
	let them? If this is a contentious point, I'm happy to remove it.

OTHER THOUGHTS

I believe there is also value in exposing the usleep_range option; it gives
the scheduler a lot more flexibility and allows the programmer to express
his intent much more clearly; it's something I would hope future driver
writers will take advantage of.

To get the results in the NUMBERS section below, I literally s/udelay/usleep
the kernel tree; I had to go in and undo the changes to the USB drivers, but
everything else booted successfully; I find that extremely telling in and
of itself -- many people are using a delay API where a sleep will suit them
just fine.

SOME ATTEMPTS AT NUMBERS

It turns out that calculating quantifiable benefit on this is challenging,
so instead I will simply present the current state of things, and I hope
this to be sufficient:

How many udelay calls are there in 2.6.35-rc5?

	udealy(ARG) >=	| COUNT
	1000		| 319
	500		| 414
	100		| 1146
	20		| 1832

I am working on Android, so that is my focus for this. The following table
is a modified usleep that simply printk's the amount of time requested to
sleep; these tests were run on a kernel with udelay >= 20 --> usleep

"boot" is power-on to lock screen
"power collapse" is when the power button is pushed and the device suspends
"resume" is when the power button is pushed and the lock screen is displayed
         (no touchscreen events or anything, just turning on the display)
"use device" is from the unlock swipe to clicking around a bit; there is no
	sd card in this phone, so fail loading music, video, camera

	ACTION		| TOTAL NUMBER OF USLEEP CALLS	| NET TIME (us)
	boot		| 22				| 1250
	power-collapse	| 9				| 1200
	resume		| 5				| 500
	use device	| 59				| 7700

The most interesting category to me is the "use device" field; 7700us of
busy-wait time that could be put towards better responsiveness, or at the
least less power usage.

Signed-off-by: Patrick Pannuto <ppannuto@codeaurora.org>
Cc: apw@canonical.com
Cc: corbet@lwn.net
Cc: arjan@linux.intel.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/delay.h |  1 +
 kernel/timer.c        | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/delay.h b/include/linux/delay.h
index fd832c6d419e..a6ecb34cf547 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -45,6 +45,7 @@ extern unsigned long lpj_fine;
 void calibrate_delay(void);
 void msleep(unsigned int msecs);
 unsigned long msleep_interruptible(unsigned int msecs);
+void usleep_range(unsigned long min, unsigned long max);
 
 static inline void ssleep(unsigned int seconds)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index ce98685cd1cb..723a62e86dcb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1755,3 +1755,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
 }
 
 EXPORT_SYMBOL(msleep_interruptible);
+
+static int __sched do_usleep_range(unsigned long min, unsigned long max)
+{
+	ktime_t kmin;
+	unsigned long delta;
+
+	kmin = ktime_set(0, min * NSEC_PER_USEC);
+	delta = (max - min) * NSEC_PER_USEC;
+	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+}
+
+/**
+ * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ */
+void usleep_range(unsigned long min, unsigned long max)
+{
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	do_usleep_range(min, max);
+}
+EXPORT_SYMBOL(usleep_range);
-- 
cgit v1.2.3-59-g8ed1b


From ad0d363b8fb7559a410483635349e22de6727988 Mon Sep 17 00:00:00 2001
From: Kyungmin Park <kmpark@infradead.org>
Date: Fri, 28 May 2010 11:03:11 +0900
Subject: mtd: OneNAND: Introduce chip_probe function

Samsung SoCs use the own OneNAND controler and detect OneNAND chip at power on.
To use this feature, introduce the chip_probe function.

Also remove workaround for Samsung SoCs.

Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/onenand/onenand_base.c | 42 +++++++++++++++++++++++++++-----------
 include/linux/mtd/onenand.h        |  2 ++
 2 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index f749935f3cb5..a2bb520286f8 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -3733,17 +3733,16 @@ out:
 }
 
 /**
- * onenand_probe - [OneNAND Interface] Probe the OneNAND device
+ * onenand_chip_probe - [OneNAND Interface] The generic chip probe
  * @param mtd		MTD device structure
  *
  * OneNAND detection method:
  *   Compare the values from command with ones from register
  */
-static int onenand_probe(struct mtd_info *mtd)
+static int onenand_chip_probe(struct mtd_info *mtd)
 {
 	struct onenand_chip *this = mtd->priv;
-	int bram_maf_id, bram_dev_id, maf_id, dev_id, ver_id;
-	int density;
+	int bram_maf_id, bram_dev_id, maf_id, dev_id;
 	int syscfg;
 
 	/* Save system configuration 1 */
@@ -3766,12 +3765,6 @@ static int onenand_probe(struct mtd_info *mtd)
 	/* Restore system configuration 1 */
 	this->write_word(syscfg, this->base + ONENAND_REG_SYS_CFG1);
 
-	/* Workaround */
-	if (syscfg & ONENAND_SYS_CFG1_SYNC_WRITE) {
-		bram_maf_id = this->read_word(this->base + ONENAND_REG_MANUFACTURER_ID);
-		bram_dev_id = this->read_word(this->base + ONENAND_REG_DEVICE_ID);
-	}
-
 	/* Check manufacturer ID */
 	if (onenand_check_maf(bram_maf_id))
 		return -ENXIO;
@@ -3779,13 +3772,35 @@ static int onenand_probe(struct mtd_info *mtd)
 	/* Read manufacturer and device IDs from Register */
 	maf_id = this->read_word(this->base + ONENAND_REG_MANUFACTURER_ID);
 	dev_id = this->read_word(this->base + ONENAND_REG_DEVICE_ID);
-	ver_id = this->read_word(this->base + ONENAND_REG_VERSION_ID);
-	this->technology = this->read_word(this->base + ONENAND_REG_TECHNOLOGY);
 
 	/* Check OneNAND device */
 	if (maf_id != bram_maf_id || dev_id != bram_dev_id)
 		return -ENXIO;
 
+	return 0;
+}
+
+/**
+ * onenand_probe - [OneNAND Interface] Probe the OneNAND device
+ * @param mtd		MTD device structure
+ */
+static int onenand_probe(struct mtd_info *mtd)
+{
+	struct onenand_chip *this = mtd->priv;
+	int maf_id, dev_id, ver_id;
+	int density;
+	int ret;
+
+	ret = this->chip_probe(mtd);
+	if (ret)
+		return ret;
+
+	/* Read manufacturer and device IDs from Register */
+	maf_id = this->read_word(this->base + ONENAND_REG_MANUFACTURER_ID);
+	dev_id = this->read_word(this->base + ONENAND_REG_DEVICE_ID);
+	ver_id = this->read_word(this->base + ONENAND_REG_VERSION_ID);
+	this->technology = this->read_word(this->base + ONENAND_REG_TECHNOLOGY);
+
 	/* Flash device information */
 	onenand_print_device_info(dev_id, ver_id);
 	this->device_id = dev_id;
@@ -3912,6 +3927,9 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
 	if (!this->unlock_all)
 		this->unlock_all = onenand_unlock_all;
 
+	if (!this->chip_probe)
+		this->chip_probe = onenand_chip_probe;
+
 	if (!this->read_bufferram)
 		this->read_bufferram = onenand_read_bufferram;
 	if (!this->write_bufferram)
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index c26ff86ad08a..0c8815bfae1c 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -68,6 +68,7 @@ struct onenand_bufferram {
  * @write_word:		[REPLACEABLE] hardware specific function for write
  *			register of OneNAND
  * @mmcontrol:		sync burst read function
+ * @chip_probe:		[REPLACEABLE] hardware specific function for chip probe
  * @block_markbad:	function to mark a block as bad
  * @scan_bbt:		[REPLACEALBE] hardware specific function for scanning
  *			Bad block Table
@@ -114,6 +115,7 @@ struct onenand_chip {
 	unsigned short (*read_word)(void __iomem *addr);
 	void (*write_word)(unsigned short value, void __iomem *addr);
 	void (*mmcontrol)(struct mtd_info *mtd, int sync_read);
+	int (*chip_probe)(struct mtd_info *mtd);
 	int (*block_markbad)(struct mtd_info *mtd, loff_t ofs);
 	int (*scan_bbt)(struct mtd_info *mtd);
 
-- 
cgit v1.2.3-59-g8ed1b


From 5d8d9a4d9ff74c55901642b4e2ac5124830ddafe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 31 Jul 2010 14:29:07 -0400
Subject: NFS: Ensure the AUTH_UNIX credcache is allocated dynamically

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |  7 ++++---
 net/sunrpc/auth.c           | 19 ++++++++++++++++---
 net/sunrpc/auth_generic.c   | 12 +++---------
 net/sunrpc/auth_unix.c      | 15 +++++++--------
 net/sunrpc/sunrpc_syms.c    | 15 ++++++++++-----
 5 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 87d7ec0bf779..784e78c73ec5 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -125,11 +125,12 @@ struct rpc_credops {
 extern const struct rpc_authops	authunix_ops;
 extern const struct rpc_authops	authnull_ops;
 
-void __init		rpc_init_authunix(void);
-void __init		rpc_init_generic_auth(void);
-void __init		rpcauth_init_module(void);
+int __init		rpc_init_authunix(void);
+int __init		rpc_init_generic_auth(void);
+int __init		rpcauth_init_module(void);
 void __exit		rpcauth_remove_module(void);
 void __exit		rpc_destroy_generic_auth(void);
+void 			rpc_destroy_authunix(void);
 
 struct rpc_cred *	rpc_lookup_cred(void);
 struct rpc_cred *	rpc_lookup_machine_cred(void);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 73affb8624fa..db135543d21e 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -587,14 +587,27 @@ static struct shrinker rpc_cred_shrinker = {
 	.seeks = DEFAULT_SEEKS,
 };
 
-void __init rpcauth_init_module(void)
+int __init rpcauth_init_module(void)
 {
-	rpc_init_authunix();
-	rpc_init_generic_auth();
+	int err;
+
+	err = rpc_init_authunix();
+	if (err < 0)
+		goto out1;
+	err = rpc_init_generic_auth();
+	if (err < 0)
+		goto out2;
 	register_shrinker(&rpc_cred_shrinker);
+	return 0;
+out2:
+	rpc_destroy_authunix();
+out1:
+	return err;
 }
 
 void __exit rpcauth_remove_module(void)
 {
+	rpc_destroy_authunix();
+	rpc_destroy_generic_auth();
 	unregister_shrinker(&rpc_cred_shrinker);
 }
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 8f623b0f03dd..8bae33b36cc6 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -27,7 +27,6 @@ struct generic_cred {
 };
 
 static struct rpc_auth generic_auth;
-static struct rpc_cred_cache generic_cred_cache;
 static const struct rpc_credops generic_credops;
 
 /*
@@ -159,20 +158,16 @@ out_nomatch:
 	return 0;
 }
 
-void __init rpc_init_generic_auth(void)
+int __init rpc_init_generic_auth(void)
 {
-	spin_lock_init(&generic_cred_cache.lock);
+	return rpcauth_init_credcache(&generic_auth);
 }
 
 void __exit rpc_destroy_generic_auth(void)
 {
-	rpcauth_clear_credcache(&generic_cred_cache);
+	rpcauth_destroy_credcache(&generic_auth);
 }
 
-static struct rpc_cred_cache generic_cred_cache = {
-	{{ NULL, },},
-};
-
 static const struct rpc_authops generic_auth_ops = {
 	.owner = THIS_MODULE,
 	.au_name = "Generic",
@@ -183,7 +178,6 @@ static const struct rpc_authops generic_auth_ops = {
 static struct rpc_auth generic_auth = {
 	.au_ops = &generic_auth_ops,
 	.au_count = ATOMIC_INIT(0),
-	.au_credcache = &generic_cred_cache,
 };
 
 static const struct rpc_credops generic_credops = {
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index aac2f8b4ee21..d5e37dbf207b 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -29,7 +29,6 @@ struct unx_cred {
 #endif
 
 static struct rpc_auth		unix_auth;
-static struct rpc_cred_cache	unix_cred_cache;
 static const struct rpc_credops	unix_credops;
 
 static struct rpc_auth *
@@ -203,9 +202,14 @@ unx_validate(struct rpc_task *task, __be32 *p)
 	return p;
 }
 
-void __init rpc_init_authunix(void)
+int __init rpc_init_authunix(void)
 {
-	spin_lock_init(&unix_cred_cache.lock);
+	return rpcauth_init_credcache(&unix_auth);
+}
+
+void rpc_destroy_authunix(void)
+{
+	rpcauth_destroy_credcache(&unix_auth);
 }
 
 const struct rpc_authops authunix_ops = {
@@ -218,10 +222,6 @@ const struct rpc_authops authunix_ops = {
 	.crcreate	= unx_create_cred,
 };
 
-static
-struct rpc_cred_cache	unix_cred_cache = {
-};
-
 static
 struct rpc_auth		unix_auth = {
 	.au_cslack	= UNX_WRITESLACK,
@@ -229,7 +229,6 @@ struct rpc_auth		unix_auth = {
 	.au_ops		= &authunix_ops,
 	.au_flavor	= RPC_AUTH_UNIX,
 	.au_count	= ATOMIC_INIT(0),
-	.au_credcache	= &unix_cred_cache,
 };
 
 static
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index f438347d817b..34b58f9e704a 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -33,10 +33,11 @@ init_sunrpc(void)
 	if (err)
 		goto out;
 	err = rpc_init_mempool();
-	if (err) {
-		unregister_rpc_pipefs();
-		goto out;
-	}
+	if (err)
+		goto out2;
+	err = rpcauth_init_module();
+	if (err)
+		goto out3;
 #ifdef RPC_DEBUG
 	rpc_register_sysctl();
 #endif
@@ -47,7 +48,11 @@ init_sunrpc(void)
 	cache_register(&unix_gid_cache);
 	svc_init_xprt_sock();	/* svc sock transport */
 	init_socket_xprt();	/* clnt sock transport */
-	rpcauth_init_module();
+	return 0;
+out3:
+	rpc_destroy_mempool();
+out2:
+	unregister_rpc_pipefs();
 out:
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 988664a0f6bbfc356e6ce55f7a87b8594050012f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 31 Jul 2010 14:29:07 -0400
Subject: SUNRPC: Store the hashtable size in struct rpc_cred_cache

Cleanup in preparation for allowing the user to determine the maximum hash
table size.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |  1 +
 net/sunrpc/auth.c           | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 784e78c73ec5..84d64b6926a9 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -65,6 +65,7 @@ struct rpc_cred {
 #define RPC_CREDCACHE_NR	(1 << RPC_CREDCACHE_HASHBITS)
 struct rpc_cred_cache {
 	struct hlist_head	hashtable[RPC_CREDCACHE_NR];
+	unsigned int		hashbits;
 	spinlock_t		lock;
 };
 
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index db135543d21e..eef76a1f1dd6 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -145,12 +145,15 @@ int
 rpcauth_init_credcache(struct rpc_auth *auth)
 {
 	struct rpc_cred_cache *new;
+	unsigned int hashsize;
 	int i;
 
 	new = kmalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
-	for (i = 0; i < RPC_CREDCACHE_NR; i++)
+	new->hashbits = RPC_CREDCACHE_HASHBITS;
+	hashsize = 1U << new->hashbits;
+	for (i = 0; i < hashsize; i++)
 		INIT_HLIST_HEAD(&new->hashtable[i]);
 	spin_lock_init(&new->lock);
 	auth->au_credcache = new;
@@ -183,11 +186,12 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache)
 	LIST_HEAD(free);
 	struct hlist_head *head;
 	struct rpc_cred	*cred;
+	unsigned int hashsize = 1U << cache->hashbits;
 	int		i;
 
 	spin_lock(&rpc_credcache_lock);
 	spin_lock(&cache->lock);
-	for (i = 0; i < RPC_CREDCACHE_NR; i++) {
+	for (i = 0; i < hashsize; i++) {
 		head = &cache->hashtable[i];
 		while (!hlist_empty(head)) {
 			cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
@@ -297,7 +301,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 			*entry, *new;
 	unsigned int nr;
 
-	nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS);
+	nr = hash_long(acred->uid, cache->hashbits);
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) {
-- 
cgit v1.2.3-59-g8ed1b


From 241269bd0b580faae71575443d9ab38df7469126 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 31 Jul 2010 14:29:08 -0400
Subject: SUNRPC: Make the credential cache hashtable size configurable

This patch allows the user to configure the credential cache hashtable size
using a new module parameter: auth_hashtable_size
When set, this parameter will be rounded up to the nearest power of two,
with a maximum allowed value of 1024 elements.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |  9 +------
 net/sunrpc/auth.c           | 60 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 56 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 84d64b6926a9..d2737625a247 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -61,14 +61,7 @@ struct rpc_cred {
 /*
  * Client authentication handle
  */
-#define RPC_CREDCACHE_HASHBITS	4
-#define RPC_CREDCACHE_NR	(1 << RPC_CREDCACHE_HASHBITS)
-struct rpc_cred_cache {
-	struct hlist_head	hashtable[RPC_CREDCACHE_NR];
-	unsigned int		hashbits;
-	spinlock_t		lock;
-};
-
+struct rpc_cred_cache;
 struct rpc_authops;
 struct rpc_auth {
 	unsigned int		au_cslack;	/* call cred size estimate */
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index eef76a1f1dd6..d80f01725fc2 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -19,6 +19,15 @@
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
+#define RPC_CREDCACHE_DEFAULT_HASHBITS	(4)
+struct rpc_cred_cache {
+	struct hlist_head	*hashtable;
+	unsigned int		hashbits;
+	spinlock_t		lock;
+};
+
+static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS;
+
 static DEFINE_SPINLOCK(rpc_authflavor_lock);
 static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
 	&authnull_ops,		/* AUTH_NULL */
@@ -29,6 +38,42 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
 static LIST_HEAD(cred_unused);
 static unsigned long number_cred_unused;
 
+#define MAX_HASHTABLE_BITS (10) 
+static int param_set_hashtbl_sz(const char *val, struct kernel_param *kp)
+{
+	unsigned long num;
+	unsigned int nbits;
+	int ret;
+
+	if (!val)
+		goto out_inval;
+	ret = strict_strtoul(val, 0, &num);
+	if (ret == -EINVAL)
+		goto out_inval;
+	nbits = fls(num);
+	if (num > (1U << nbits))
+		nbits++;
+	if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
+		goto out_inval;
+	*(unsigned int *)kp->arg = nbits;
+	return 0;
+out_inval:
+	return -EINVAL;
+}
+
+static int param_get_hashtbl_sz(char *buffer, struct kernel_param *kp)
+{
+	unsigned int nbits;
+
+	nbits = *(unsigned int *)kp->arg;
+	return sprintf(buffer, "%u", 1U << nbits);
+}
+
+#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
+
+module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
+MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
+
 static u32
 pseudoflavor_to_flavor(u32 flavor) {
 	if (flavor >= RPC_AUTH_MAXFLAVOR)
@@ -146,18 +191,22 @@ rpcauth_init_credcache(struct rpc_auth *auth)
 {
 	struct rpc_cred_cache *new;
 	unsigned int hashsize;
-	int i;
 
 	new = kmalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
-		return -ENOMEM;
-	new->hashbits = RPC_CREDCACHE_HASHBITS;
+		goto out_nocache;
+	new->hashbits = auth_hashbits;
 	hashsize = 1U << new->hashbits;
-	for (i = 0; i < hashsize; i++)
-		INIT_HLIST_HEAD(&new->hashtable[i]);
+	new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL);
+	if (!new->hashtable)
+		goto out_nohashtbl;
 	spin_lock_init(&new->lock);
 	auth->au_credcache = new;
 	return 0;
+out_nohashtbl:
+	kfree(new);
+out_nocache:
+	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_credcache);
 
@@ -220,6 +269,7 @@ rpcauth_destroy_credcache(struct rpc_auth *auth)
 	if (cache) {
 		auth->au_credcache = NULL;
 		rpcauth_clear_credcache(cache);
+		kfree(cache->hashtable);
 		kfree(cache);
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From d9b6cd94601e1d17273f93a326a135fbf487a918 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 31 Jul 2010 14:29:08 -0400
Subject: SUNRPC: Ensure that rpc_exit() always wakes up a sleeping task

Make rpc_exit() non-inline, and ensure that it always wakes up a task that
has been queued.

Kill off the now unused rpc_wake_up_task().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  7 +------
 net/sunrpc/sched.c           | 20 +++++++++-----------
 2 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 7be4f3a6d246..88513fd8e208 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -213,6 +213,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
 				const struct rpc_call_ops *ops);
 void		rpc_put_task(struct rpc_task *);
 void		rpc_exit_task(struct rpc_task *);
+void		rpc_exit(struct rpc_task *, int);
 void		rpc_release_calldata(const struct rpc_call_ops *, void *);
 void		rpc_killall_tasks(struct rpc_clnt *);
 void		rpc_execute(struct rpc_task *);
@@ -241,12 +242,6 @@ void		rpc_destroy_mempool(void);
 extern struct workqueue_struct *rpciod_workqueue;
 void		rpc_prepare_task(struct rpc_task *task);
 
-static inline void rpc_exit(struct rpc_task *task, int status)
-{
-	task->tk_status = status;
-	task->tk_action = rpc_exit_task;
-}
-
 static inline int rpc_wait_for_completion_task(struct rpc_task *task)
 {
 	return __rpc_wait_for_completion_task(task, NULL);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 4a843b883b89..37452762af70 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -405,14 +405,6 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
 
-/*
- * Wake up the specified task
- */
-static void rpc_wake_up_task(struct rpc_task *task)
-{
-	rpc_wake_up_queued_task(task->tk_waitqueue, task);
-}
-
 /*
  * Wake up the next task on a priority queue.
  */
@@ -600,7 +592,15 @@ void rpc_exit_task(struct rpc_task *task)
 		}
 	}
 }
-EXPORT_SYMBOL_GPL(rpc_exit_task);
+
+void rpc_exit(struct rpc_task *task, int status)
+{
+	task->tk_status = status;
+	task->tk_action = rpc_exit_task;
+	if (RPC_IS_QUEUED(task))
+		rpc_wake_up_queued_task(task->tk_waitqueue, task);
+}
+EXPORT_SYMBOL_GPL(rpc_exit);
 
 void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
 {
@@ -690,7 +690,6 @@ static void __rpc_execute(struct rpc_task *task)
 			dprintk("RPC: %5u got signal\n", task->tk_pid);
 			task->tk_flags |= RPC_TASK_KILLED;
 			rpc_exit(task, -ERESTARTSYS);
-			rpc_wake_up_task(task);
 		}
 		rpc_set_running(task);
 		dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
@@ -950,7 +949,6 @@ void rpc_killall_tasks(struct rpc_clnt *clnt)
 		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
 			rovr->tk_flags |= RPC_TASK_KILLED;
 			rpc_exit(rovr, -EIO);
-			rpc_wake_up_task(rovr);
 		}
 	}
 	spin_unlock(&clnt->cl_lock);
-- 
cgit v1.2.3-59-g8ed1b


From 58f9612c6ea858f532021a0ce42ec53cb0a493b3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 31 Jul 2010 14:29:08 -0400
Subject: SUNRPC: Move remaining RPC client related task initialisation into
 clnt.c

Now that rpc_run_task() is the sole entry point for RPC calls, we can move
the remaining rpc_client-related initialisation of struct rpc_task from
sched.c into clnt.c.

Also move rpc_killall_tasks() into the same file, since that too is
relative to the rpc_clnt.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 84 +++++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/sched.c          | 77 +++--------------------------------------
 3 files changed, 89 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index debe75532199..569dc722a600 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -131,6 +131,7 @@ struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
+void		rpc_task_release_client(struct rpc_task *);
 
 int		rpcb_register(u32, u32, int, unsigned short);
 int		rpcb_v4_register(const u32 program, const u32 version,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 234c40c15f69..3647c81fd689 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -413,6 +413,35 @@ out_no_clnt:
 }
 EXPORT_SYMBOL_GPL(rpc_clone_client);
 
+/*
+ * Kill all tasks for the given client.
+ * XXX: kill their descendants as well?
+ */
+void rpc_killall_tasks(struct rpc_clnt *clnt)
+{
+	struct rpc_task	*rovr;
+
+
+	if (list_empty(&clnt->cl_tasks))
+		return;
+	dprintk("RPC:       killing all tasks for client %p\n", clnt);
+	/*
+	 * Spin lock all_tasks to prevent changes...
+	 */
+	spin_lock(&clnt->cl_lock);
+	list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) {
+		if (!RPC_IS_ACTIVATED(rovr))
+			continue;
+		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
+			rovr->tk_flags |= RPC_TASK_KILLED;
+			rpc_exit(rovr, -EIO);
+			rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr);
+		}
+	}
+	spin_unlock(&clnt->cl_lock);
+}
+EXPORT_SYMBOL_GPL(rpc_killall_tasks);
+
 /*
  * Properly shut down an RPC client, terminating all outstanding
  * requests.
@@ -538,6 +567,49 @@ out:
 }
 EXPORT_SYMBOL_GPL(rpc_bind_new_program);
 
+void rpc_task_release_client(struct rpc_task *task)
+{
+	struct rpc_clnt *clnt = task->tk_client;
+
+	if (clnt != NULL) {
+		/* Remove from client task list */
+		spin_lock(&clnt->cl_lock);
+		list_del(&task->tk_task);
+		spin_unlock(&clnt->cl_lock);
+		task->tk_client = NULL;
+
+		rpc_release_client(clnt);
+	}
+}
+
+static
+void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+	if (clnt != NULL) {
+		rpc_task_release_client(task);
+		task->tk_client = clnt;
+		kref_get(&clnt->cl_kref);
+		if (clnt->cl_softrtry)
+			task->tk_flags |= RPC_TASK_SOFT;
+		/* Add to the client's list of all tasks */
+		spin_lock(&clnt->cl_lock);
+		list_add_tail(&task->tk_task, &clnt->cl_tasks);
+		spin_unlock(&clnt->cl_lock);
+	}
+}
+
+static void
+rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
+{
+	if (msg != NULL) {
+		task->tk_msg.rpc_proc = msg->rpc_proc;
+		task->tk_msg.rpc_argp = msg->rpc_argp;
+		task->tk_msg.rpc_resp = msg->rpc_resp;
+		/* Bind the user cred */
+		rpcauth_bindcred(task, msg->rpc_cred, task->tk_flags);
+	}
+}
+
 /*
  * Default callback for async RPC calls
  */
@@ -562,6 +634,18 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
 	if (IS_ERR(task))
 		goto out;
 
+	rpc_task_set_client(task, task_setup_data->rpc_client);
+	rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
+
+	if (task->tk_status != 0) {
+		int ret = task->tk_status;
+		rpc_put_task(task);
+		return ERR_PTR(ret);
+	}
+
+	if (task->tk_action == NULL)
+		rpc_call_start(task);
+
 	atomic_inc(&task->tk_count);
 	rpc_execute(task);
 out:
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 37452762af70..a42296db2ecd 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -246,17 +246,8 @@ static inline void rpc_task_set_debuginfo(struct rpc_task *task)
 
 static void rpc_set_active(struct rpc_task *task)
 {
-	struct rpc_clnt *clnt;
-	if (test_and_set_bit(RPC_TASK_ACTIVE, &task->tk_runstate) != 0)
-		return;
 	rpc_task_set_debuginfo(task);
-	/* Add to global list of all tasks */
-	clnt = task->tk_client;
-	if (clnt != NULL) {
-		spin_lock(&clnt->cl_lock);
-		list_add_tail(&task->tk_task, &clnt->cl_tasks);
-		spin_unlock(&clnt->cl_lock);
-	}
+	set_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
 }
 
 /*
@@ -319,11 +310,6 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 	dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
 			task->tk_pid, rpc_qname(q), jiffies);
 
-	if (!RPC_IS_ASYNC(task) && !RPC_IS_ACTIVATED(task)) {
-		printk(KERN_ERR "RPC: Inactive synchronous task put to sleep!\n");
-		return;
-	}
-
 	__rpc_add_wait_queue(q, task);
 
 	BUG_ON(task->tk_callback != NULL);
@@ -334,8 +320,8 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 				rpc_action action)
 {
-	/* Mark the task as being activated if so needed */
-	rpc_set_active(task);
+	/* We shouldn't ever put an inactive task to sleep */
+	BUG_ON(!RPC_IS_ACTIVATED(task));
 
 	/*
 	 * Protect the queue operations.
@@ -807,26 +793,9 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	/* Initialize workqueue for async tasks */
 	task->tk_workqueue = task_setup_data->workqueue;
 
-	task->tk_client = task_setup_data->rpc_client;
-	if (task->tk_client != NULL) {
-		kref_get(&task->tk_client->cl_kref);
-		if (task->tk_client->cl_softrtry)
-			task->tk_flags |= RPC_TASK_SOFT;
-	}
-
 	if (task->tk_ops->rpc_call_prepare != NULL)
 		task->tk_action = rpc_prepare_task;
 
-	if (task_setup_data->rpc_message != NULL) {
-		task->tk_msg.rpc_proc = task_setup_data->rpc_message->rpc_proc;
-		task->tk_msg.rpc_argp = task_setup_data->rpc_message->rpc_argp;
-		task->tk_msg.rpc_resp = task_setup_data->rpc_message->rpc_resp;
-		/* Bind the user cred */
-		rpcauth_bindcred(task, task_setup_data->rpc_message->rpc_cred, task_setup_data->flags);
-		if (task->tk_action == NULL)
-			rpc_call_start(task);
-	}
-
 	/* starting timestamp */
 	task->tk_start = ktime_get();
 
@@ -896,10 +865,7 @@ void rpc_put_task(struct rpc_task *task)
 		xprt_release(task);
 	if (task->tk_msg.rpc_cred)
 		rpcauth_unbindcred(task);
-	if (task->tk_client) {
-		rpc_release_client(task->tk_client);
-		task->tk_client = NULL;
-	}
+	rpc_task_release_client(task);
 	if (task->tk_workqueue != NULL) {
 		INIT_WORK(&task->u.tk_work, rpc_async_release);
 		queue_work(task->tk_workqueue, &task->u.tk_work);
@@ -912,13 +878,6 @@ static void rpc_release_task(struct rpc_task *task)
 {
 	dprintk("RPC: %5u release task\n", task->tk_pid);
 
-	if (!list_empty(&task->tk_task)) {
-		struct rpc_clnt *clnt = task->tk_client;
-		/* Remove from client task list */
-		spin_lock(&clnt->cl_lock);
-		list_del(&task->tk_task);
-		spin_unlock(&clnt->cl_lock);
-	}
 	BUG_ON (RPC_IS_QUEUED(task));
 
 	/* Wake up anyone who is waiting for task completion */
@@ -927,34 +886,6 @@ static void rpc_release_task(struct rpc_task *task)
 	rpc_put_task(task);
 }
 
-/*
- * Kill all tasks for the given client.
- * XXX: kill their descendants as well?
- */
-void rpc_killall_tasks(struct rpc_clnt *clnt)
-{
-	struct rpc_task	*rovr;
-
-
-	if (list_empty(&clnt->cl_tasks))
-		return;
-	dprintk("RPC:       killing all tasks for client %p\n", clnt);
-	/*
-	 * Spin lock all_tasks to prevent changes...
-	 */
-	spin_lock(&clnt->cl_lock);
-	list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) {
-		if (! RPC_IS_ACTIVATED(rovr))
-			continue;
-		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
-			rovr->tk_flags |= RPC_TASK_KILLED;
-			rpc_exit(rovr, -EIO);
-		}
-	}
-	spin_unlock(&clnt->cl_lock);
-}
-EXPORT_SYMBOL_GPL(rpc_killall_tasks);
-
 int rpciod_up(void)
 {
 	return try_module_get(THIS_MODULE) ? 0 : -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 8572b8e2e3c5f3d990122348c4d2c64dad338611 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 31 Jul 2010 14:29:08 -0400
Subject: SUNRPC: Clean up of rpc_bindcred()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |  6 +++---
 net/sunrpc/auth.c           | 37 +++++++++++++++++--------------------
 net/sunrpc/auth_generic.c   | 11 +++--------
 net/sunrpc/clnt.c           |  2 +-
 4 files changed, 24 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index d2737625a247..90e4c3827ac0 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -106,7 +106,7 @@ struct rpc_credops {
 	void			(*crdestroy)(struct rpc_cred *);
 
 	int			(*crmatch)(struct auth_cred *, struct rpc_cred *, int);
-	void			(*crbind)(struct rpc_task *, struct rpc_cred *, int);
+	struct rpc_cred *	(*crbind)(struct rpc_task *, struct rpc_cred *, int);
 	__be32 *		(*crmarshal)(struct rpc_task *, __be32 *);
 	int			(*crrefresh)(struct rpc_task *);
 	__be32 *		(*crvalidate)(struct rpc_task *, __be32 *);
@@ -135,8 +135,8 @@ void			rpcauth_release(struct rpc_auth *);
 struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
 void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
-void			rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int);
-void			rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int);
+int			rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int);
+struct rpc_cred *	rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int);
 void			put_rpccred(struct rpc_cred *);
 void			rpcauth_unbindcred(struct rpc_task *);
 __be32 *		rpcauth_marshcred(struct rpc_task *, __be32 *);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index d80f01725fc2..d8968faf5ccf 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -444,16 +444,16 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_cred);
 
-void
+struct rpc_cred *
 rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
 {
-	task->tk_msg.rpc_cred = get_rpccred(cred);
 	dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
 			cred->cr_auth->au_ops->au_name, cred);
+	return get_rpccred(cred);
 }
 EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred);
 
-static void
+static struct rpc_cred *
 rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
@@ -461,45 +461,42 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
 		.uid = 0,
 		.gid = 0,
 	};
-	struct rpc_cred *ret;
 
 	dprintk("RPC: %5u looking up %s cred\n",
 		task->tk_pid, task->tk_client->cl_auth->au_ops->au_name);
-	ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags);
-	if (!IS_ERR(ret))
-		task->tk_msg.rpc_cred = ret;
-	else
-		task->tk_status = PTR_ERR(ret);
+	return auth->au_ops->lookup_cred(auth, &acred, lookupflags);
 }
 
-static void
+static struct rpc_cred *
 rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
-	struct rpc_cred *ret;
 
 	dprintk("RPC: %5u looking up %s cred\n",
 		task->tk_pid, auth->au_ops->au_name);
-	ret = rpcauth_lookupcred(auth, lookupflags);
-	if (!IS_ERR(ret))
-		task->tk_msg.rpc_cred = ret;
-	else
-		task->tk_status = PTR_ERR(ret);
+	return rpcauth_lookupcred(auth, lookupflags);
 }
 
-void
+int
 rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 {
+	struct rpc_cred *new;
 	int lookupflags = 0;
 
 	if (flags & RPC_TASK_ASYNC)
 		lookupflags |= RPCAUTH_LOOKUP_NEW;
 	if (cred != NULL)
-		cred->cr_ops->crbind(task, cred, lookupflags);
+		new = cred->cr_ops->crbind(task, cred, lookupflags);
 	else if (flags & RPC_TASK_ROOTCREDS)
-		rpcauth_bind_root_cred(task, lookupflags);
+		new = rpcauth_bind_root_cred(task, lookupflags);
 	else
-		rpcauth_bind_new_cred(task, lookupflags);
+		new = rpcauth_bind_new_cred(task, lookupflags);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+	if (task->tk_msg.rpc_cred != NULL)
+		put_rpccred(task->tk_msg.rpc_cred);
+	task->tk_msg.rpc_cred = new;
+	return 0;
 }
 
 void
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 8bae33b36cc6..43162bb3b78f 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -54,18 +54,13 @@ struct rpc_cred *rpc_lookup_machine_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred);
 
-static void
-generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
+static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
+		struct rpc_cred *cred, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
 	struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred;
-	struct rpc_cred *ret;
 
-	ret = auth->au_ops->lookup_cred(auth, acred, lookupflags);
-	if (!IS_ERR(ret))
-		task->tk_msg.rpc_cred = ret;
-	else
-		task->tk_status = PTR_ERR(ret);
+	return auth->au_ops->lookup_cred(auth, acred, lookupflags);
 }
 
 /*
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 3647c81fd689..f34b5e3823c0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -606,7 +606,7 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
 		task->tk_msg.rpc_argp = msg->rpc_argp;
 		task->tk_msg.rpc_resp = msg->rpc_resp;
 		/* Bind the user cred */
-		rpcauth_bindcred(task, msg->rpc_cred, task->tk_flags);
+		task->tk_status = rpcauth_bindcred(task, msg->rpc_cred, task->tk_flags);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a17c2153d2e271b0cbacae9bed83b0eaa41db7e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 31 Jul 2010 14:29:08 -0400
Subject: SUNRPC: Move the bound cred to struct rpc_rqst

This will allow us to save the original generic cred in rpc_message, so
that if we migrate from one server to another, we can generate a new bound
cred without having to punt back to the NFS layer.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c               |  7 ++--
 fs/nfs/nfs3xdr.c               |  8 ++--
 fs/nfs/nfs4xdr.c               |  2 +-
 include/linux/sunrpc/auth.h    |  2 -
 include/linux/sunrpc/xprt.h    |  1 +
 net/sunrpc/auth.c              | 43 ++++++++++----------
 net/sunrpc/auth_gss/auth_gss.c | 22 +++++-----
 net/sunrpc/auth_null.c         |  2 +-
 net/sunrpc/auth_unix.c         |  6 +--
 net/sunrpc/clnt.c              | 91 +++++++++++++++++++++---------------------
 net/sunrpc/sched.c             |  2 +-
 net/sunrpc/xprt.c              |  2 +
 12 files changed, 92 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 81cf14257916..db8846a0e82e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs
 static int
 nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 	u32 offset = (u32)args->offset;
 	u32 count = args->count;
@@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg
 static int
 nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
 {
-	struct rpc_task	*task = req->rq_task;
-	struct rpc_auth	*auth = task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -575,7 +574,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
 static int
 nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 75dcfc7da365..9769704f8ce6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -330,7 +330,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
 static int
 nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -471,7 +471,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
 static int
 nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -675,7 +675,7 @@ static int
 nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
 		    struct nfs3_getaclargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
@@ -802,7 +802,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
 static int
 nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth	*auth = req->rq_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 257c1811feb4..08ef91291132 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -758,7 +758,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 				struct compound_hdr *hdr)
 {
 	__be32 *p;
-	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+	struct rpc_auth *auth = req->rq_cred->cr_auth;
 
 	/* initialize running count of expected bytes in reply.
 	 * NOTE: the replied tag SHOULD be the same is the one sent,
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 90e4c3827ac0..5bbc447175dc 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -135,10 +135,8 @@ void			rpcauth_release(struct rpc_auth *);
 struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
 void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
-int			rpcauth_bindcred(struct rpc_task *, struct rpc_cred *, int);
 struct rpc_cred *	rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int);
 void			put_rpccred(struct rpc_cred *);
-void			rpcauth_unbindcred(struct rpc_task *);
 __be32 *		rpcauth_marshcred(struct rpc_task *, __be32 *);
 __be32 *		rpcauth_checkverf(struct rpc_task *, __be32 *);
 int			rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, __be32 *data, void *obj);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index b51470302399..ff5a77b28c50 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -64,6 +64,7 @@ struct rpc_rqst {
 	 * This is the private part
 	 */
 	struct rpc_task *	rq_task;	/* RPC task data */
+	struct rpc_cred *	rq_cred;	/* Bound cred */
 	__be32			rq_xid;		/* request XID */
 	int			rq_cong;	/* has incremented xprt->cong */
 	u32			rq_seqno;	/* gss seq no. used on req. */
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index d8968faf5ccf..95721426296d 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -477,9 +477,10 @@ rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags)
 	return rpcauth_lookupcred(auth, lookupflags);
 }
 
-int
+static int
 rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 {
+	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_cred *new;
 	int lookupflags = 0;
 
@@ -493,9 +494,9 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 		new = rpcauth_bind_new_cred(task, lookupflags);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
-	if (task->tk_msg.rpc_cred != NULL)
-		put_rpccred(task->tk_msg.rpc_cred);
-	task->tk_msg.rpc_cred = new;
+	if (req->rq_cred != NULL)
+		put_rpccred(req->rq_cred);
+	req->rq_cred = new;
 	return 0;
 }
 
@@ -535,22 +536,10 @@ out_nodestroy:
 }
 EXPORT_SYMBOL_GPL(put_rpccred);
 
-void
-rpcauth_unbindcred(struct rpc_task *task)
-{
-	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
-
-	dprintk("RPC: %5u releasing %s cred %p\n",
-		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
-
-	put_rpccred(cred);
-	task->tk_msg.rpc_cred = NULL;
-}
-
 __be32 *
 rpcauth_marshcred(struct rpc_task *task, __be32 *p)
 {
-	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
+	struct rpc_cred	*cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u marshaling %s cred %p\n",
 		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
@@ -561,7 +550,7 @@ rpcauth_marshcred(struct rpc_task *task, __be32 *p)
 __be32 *
 rpcauth_checkverf(struct rpc_task *task, __be32 *p)
 {
-	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
+	struct rpc_cred	*cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u validating %s cred %p\n",
 		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
@@ -573,7 +562,7 @@ int
 rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp,
 		__be32 *data, void *obj)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u using %s cred %p to wrap rpc data\n",
 			task->tk_pid, cred->cr_ops->cr_name, cred);
@@ -587,7 +576,7 @@ int
 rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 		__be32 *data, void *obj)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n",
 			task->tk_pid, cred->cr_ops->cr_name, cred);
@@ -601,13 +590,21 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 int
 rpcauth_refreshcred(struct rpc_task *task)
 {
-	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
+	struct rpc_cred	*cred = task->tk_rqstp->rq_cred;
 	int err;
 
+	cred = task->tk_rqstp->rq_cred;
+	if (cred == NULL) {
+		err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags);
+		if (err < 0)
+			goto out;
+		cred = task->tk_rqstp->rq_cred;
+	};
 	dprintk("RPC: %5u refreshing %s cred %p\n",
 		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
 
 	err = cred->cr_ops->crrefresh(task);
+out:
 	if (err < 0)
 		task->tk_status = err;
 	return err;
@@ -616,7 +613,7 @@ rpcauth_refreshcred(struct rpc_task *task)
 void
 rpcauth_invalcred(struct rpc_task *task)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u invalidating %s cred %p\n",
 		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
@@ -627,7 +624,7 @@ rpcauth_invalcred(struct rpc_task *task)
 int
 rpcauth_uptodatecred(struct rpc_task *task)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 
 	return cred == NULL ||
 		test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 8da2a0e68574..096e1260bc67 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -373,7 +373,7 @@ gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss
 static void
 gss_upcall_callback(struct rpc_task *task)
 {
-	struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred,
+	struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred,
 			struct gss_cred, gc_base);
 	struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall;
 	struct inode *inode = &gss_msg->inode->vfs_inode;
@@ -502,7 +502,7 @@ static void warn_gssd(void)
 static inline int
 gss_refresh_upcall(struct rpc_task *task)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_auth *gss_auth = container_of(cred->cr_auth,
 			struct gss_auth, rpc_auth);
 	struct gss_cred *gss_cred = container_of(cred,
@@ -1064,12 +1064,12 @@ out:
 static __be32 *
 gss_marshal(struct rpc_task *task, __be32 *p)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_cred *cred = req->rq_cred;
 	struct gss_cred	*gss_cred = container_of(cred, struct gss_cred,
 						 gc_base);
 	struct gss_cl_ctx	*ctx = gss_cred_get_ctx(cred);
 	__be32		*cred_len;
-	struct rpc_rqst *req = task->tk_rqstp;
 	u32             maj_stat = 0;
 	struct xdr_netobj mic;
 	struct kvec	iov;
@@ -1119,7 +1119,7 @@ out_put_ctx:
 
 static int gss_renew_cred(struct rpc_task *task)
 {
-	struct rpc_cred *oldcred = task->tk_msg.rpc_cred;
+	struct rpc_cred *oldcred = task->tk_rqstp->rq_cred;
 	struct gss_cred *gss_cred = container_of(oldcred,
 						 struct gss_cred,
 						 gc_base);
@@ -1133,7 +1133,7 @@ static int gss_renew_cred(struct rpc_task *task)
 	new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
-	task->tk_msg.rpc_cred = new;
+	task->tk_rqstp->rq_cred = new;
 	put_rpccred(oldcred);
 	return 0;
 }
@@ -1161,7 +1161,7 @@ static int gss_cred_is_negative_entry(struct rpc_cred *cred)
 static int
 gss_refresh(struct rpc_task *task)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	int ret = 0;
 
 	if (gss_cred_is_negative_entry(cred))
@@ -1172,7 +1172,7 @@ gss_refresh(struct rpc_task *task)
 		ret = gss_renew_cred(task);
 		if (ret < 0)
 			goto out;
-		cred = task->tk_msg.rpc_cred;
+		cred = task->tk_rqstp->rq_cred;
 	}
 
 	if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags))
@@ -1191,7 +1191,7 @@ gss_refresh_null(struct rpc_task *task)
 static __be32 *
 gss_validate(struct rpc_task *task, __be32 *p)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
 	__be32		seq;
 	struct kvec	iov;
@@ -1400,7 +1400,7 @@ static int
 gss_wrap_req(struct rpc_task *task,
 	     kxdrproc_t encode, void *rqstp, __be32 *p, void *obj)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_cred	*gss_cred = container_of(cred, struct gss_cred,
 			gc_base);
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
@@ -1503,7 +1503,7 @@ static int
 gss_unwrap_resp(struct rpc_task *task,
 		kxdrproc_t decode, void *rqstp, __be32 *p, void *obj)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
 			gc_base);
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 1db618f56ecb..a5c36c01707b 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -75,7 +75,7 @@ nul_marshal(struct rpc_task *task, __be32 *p)
 static int
 nul_refresh(struct rpc_task *task)
 {
-	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags);
+	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags);
 	return 0;
 }
 
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index d5e37dbf207b..4cb70dc6e7ad 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -140,7 +140,7 @@ static __be32 *
 unx_marshal(struct rpc_task *task, __be32 *p)
 {
 	struct rpc_clnt	*clnt = task->tk_client;
-	struct unx_cred	*cred = container_of(task->tk_msg.rpc_cred, struct unx_cred, uc_base);
+	struct unx_cred	*cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base);
 	__be32		*base, *hold;
 	int		i;
 
@@ -173,7 +173,7 @@ unx_marshal(struct rpc_task *task, __be32 *p)
 static int
 unx_refresh(struct rpc_task *task)
 {
-	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags);
+	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags);
 	return 0;
 }
 
@@ -196,7 +196,7 @@ unx_validate(struct rpc_task *task, __be32 *p)
 		printk("RPC: giant verf size: %u\n", size);
 		return NULL;
 	}
-	task->tk_msg.rpc_cred->cr_auth->au_rslack = (size >> 2) + 2;
+	task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2;
 	p += (size >> 2);
 
 	return p;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f34b5e3823c0..2388d83b68ff 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -605,8 +605,8 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
 		task->tk_msg.rpc_proc = msg->rpc_proc;
 		task->tk_msg.rpc_argp = msg->rpc_argp;
 		task->tk_msg.rpc_resp = msg->rpc_resp;
-		/* Bind the user cred */
-		task->tk_status = rpcauth_bindcred(task, msg->rpc_cred, task->tk_flags);
+		if (msg->rpc_cred != NULL)
+			task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred);
 	}
 }
 
@@ -909,11 +909,6 @@ call_reserve(struct rpc_task *task)
 {
 	dprint_status(task);
 
-	if (!rpcauth_uptodatecred(task)) {
-		task->tk_action = call_refresh;
-		return;
-	}
-
 	task->tk_status  = 0;
 	task->tk_action  = call_reserveresult;
 	xprt_reserve(task);
@@ -977,7 +972,7 @@ call_reserveresult(struct rpc_task *task)
 static void
 call_allocate(struct rpc_task *task)
 {
-	unsigned int slack = task->tk_msg.rpc_cred->cr_auth->au_cslack;
+	unsigned int slack = task->tk_client->cl_auth->au_cslack;
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = task->tk_xprt;
 	struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
@@ -985,7 +980,7 @@ call_allocate(struct rpc_task *task)
 	dprint_status(task);
 
 	task->tk_status = 0;
-	task->tk_action = call_bind;
+	task->tk_action = call_refresh;
 
 	if (req->rq_buffer)
 		return;
@@ -1022,6 +1017,47 @@ call_allocate(struct rpc_task *task)
 	rpc_exit(task, -ERESTARTSYS);
 }
 
+/*
+ * 2a.	Bind and/or refresh the credentials
+ */
+static void
+call_refresh(struct rpc_task *task)
+{
+	dprint_status(task);
+
+	task->tk_action = call_refreshresult;
+	task->tk_status = 0;
+	task->tk_client->cl_stats->rpcauthrefresh++;
+	rpcauth_refreshcred(task);
+}
+
+/*
+ * 2b.	Process the results of a credential refresh
+ */
+static void
+call_refreshresult(struct rpc_task *task)
+{
+	int status = task->tk_status;
+
+	dprint_status(task);
+
+	task->tk_status = 0;
+	task->tk_action = call_bind;
+	if (status >= 0 && rpcauth_uptodatecred(task))
+		return;
+	switch (status) {
+	case -EACCES:
+		rpc_exit(task, -EACCES);
+		return;
+	case -ENOMEM:
+		rpc_exit(task, -ENOMEM);
+		return;
+	case -ETIMEDOUT:
+		rpc_delay(task, 3*HZ);
+	}
+	task->tk_action = call_refresh;
+}
+
 static inline int
 rpc_task_need_encode(struct rpc_task *task)
 {
@@ -1557,43 +1593,6 @@ out_retry:
 	}
 }
 
-/*
- * 8.	Refresh the credentials if rejected by the server
- */
-static void
-call_refresh(struct rpc_task *task)
-{
-	dprint_status(task);
-
-	task->tk_action = call_refreshresult;
-	task->tk_status = 0;
-	task->tk_client->cl_stats->rpcauthrefresh++;
-	rpcauth_refreshcred(task);
-}
-
-/*
- * 8a.	Process the results of a credential refresh
- */
-static void
-call_refreshresult(struct rpc_task *task)
-{
-	int status = task->tk_status;
-
-	dprint_status(task);
-
-	task->tk_status = 0;
-	task->tk_action = call_reserve;
-	if (status >= 0 && rpcauth_uptodatecred(task))
-		return;
-	if (status == -EACCES) {
-		rpc_exit(task, -EACCES);
-		return;
-	}
-	task->tk_action = call_refresh;
-	if (status != -ETIMEDOUT)
-		rpc_delay(task, 3*HZ);
-}
-
 static __be32 *
 rpc_encode_header(struct rpc_task *task)
 {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index a42296db2ecd..f6db6131fb2e 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -864,7 +864,7 @@ void rpc_put_task(struct rpc_task *task)
 	if (task->tk_rqstp)
 		xprt_release(task);
 	if (task->tk_msg.rpc_cred)
-		rpcauth_unbindcred(task);
+		put_rpccred(task->tk_msg.rpc_cred);
 	rpc_task_release_client(task);
 	if (task->tk_workqueue != NULL) {
 		INIT_WORK(&task->u.tk_work, rpc_async_release);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index dcd0132396ba..70297836a191 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1032,6 +1032,8 @@ void xprt_release(struct rpc_task *task)
 	spin_unlock_bh(&xprt->transport_lock);
 	if (req->rq_buffer)
 		xprt->ops->buf_free(req->rq_buffer);
+	if (req->rq_cred != NULL)
+		put_rpccred(req->rq_cred);
 	task->tk_rqstp = NULL;
 	if (req->rq_release_snd_buf)
 		req->rq_release_snd_buf(req);
-- 
cgit v1.2.3-59-g8ed1b


From e3b5e0d552b34d65e15b20610273b200555eea53 Mon Sep 17 00:00:00 2001
From: Ilya Yanok <yanok@emcraft.com>
Date: Thu, 8 Jul 2010 10:10:38 +0000
Subject: powerpc/fsl_pci: add quirk for mpc8308 pcie bridge

This patch adds the quirk for PCIE controller found on Freescale MPC8308.
The quirk is the same as for other MPC83xx processors.

Signed-off-by: Ilya Yanok <yanok@emcraft.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/sysdev/fsl_pci.c | 1 +
 include/linux/pci_ids.h       | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index a14760fe513a..7e900ec988f6 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -412,6 +412,7 @@ DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4080, quirk_fsl_pcie_header);
 #endif /* CONFIG_FSL_SOC_BOOKE || CONFIG_PPC_86xx */
 
 #if defined(CONFIG_PPC_83xx) || defined(CONFIG_PPC_MPC512x)
+DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8308, quirk_fsl_pcie_header);
 DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8314E, quirk_fsl_pcie_header);
 DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8314, quirk_fsl_pcie_header);
 DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8315E, quirk_fsl_pcie_header);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3bedcc149c84..79bb11f35c4b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2264,6 +2264,7 @@
 #define PCI_DEVICE_ID_TDI_EHCI          0x0101
 
 #define PCI_VENDOR_ID_FREESCALE		0x1957
+#define PCI_DEVICE_ID_MPC8308		0xc006
 #define PCI_DEVICE_ID_MPC8315E		0x00b4
 #define PCI_DEVICE_ID_MPC8315		0x00b5
 #define PCI_DEVICE_ID_MPC8314E		0x00b6
-- 
cgit v1.2.3-59-g8ed1b


From 0c42bd0e425e9c8ddb7019fc446f7d915e36c5f6 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Fri, 30 Jul 2010 16:23:03 +0800
Subject: dmaengine: Driver for Topcliff PCH DMA controller

Topcliff PCH is the platform controller hub that is going to
be used in Intel's upcoming general embedded platforms. This
adds the driver for Topcliff PCH DMA controller. The DMA
channels are strictly for device to host or host to device
transfers and cannot be used for generic memcpy.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
[kill GFP_ATOMIC, kill __raw_{read|write}l, locking fixlet]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/Kconfig     |   7 +
 drivers/dma/Makefile    |   1 +
 drivers/dma/pch_dma.c   | 957 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pch_dma.h |  37 ++
 4 files changed, 1002 insertions(+)
 create mode 100644 drivers/dma/pch_dma.c
 create mode 100644 include/linux/pch_dma.h

(limited to 'include/linux')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 6a3e5bfa6742..fed57634b6c1 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -188,6 +188,13 @@ config PL330_DMA
 	  You need to provide platform specific settings via
 	  platform_data for a dma-pl330 device.
 
+config PCH_DMA
+	tristate "Topcliff PCH DMA support"
+	depends on PCI && X86
+	select DMA_ENGINE
+	help
+	  Enable support for the Topcliff PCH DMA engine.
+
 config DMA_ENGINE
 	bool
 
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 27b6c69ae639..72bd70384d8a 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/
 obj-$(CONFIG_TIMB_DMA) += timb_dma.o
 obj-$(CONFIG_STE_DMA40) += ste_dma40.o ste_dma40_ll.o
 obj-$(CONFIG_PL330_DMA) += pl330.o
+obj-$(CONFIG_PCH_DMA) += pch_dma.o
diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c
new file mode 100644
index 000000000000..37139ca496ec
--- /dev/null
+++ b/drivers/dma/pch_dma.c
@@ -0,0 +1,957 @@
+/*
+ * Topcliff PCH DMA controller driver
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pch_dma.h>
+
+#define DRV_NAME "pch-dma"
+
+#define DMA_CTL0_DISABLE		0x0
+#define DMA_CTL0_SG			0x1
+#define DMA_CTL0_ONESHOT		0x2
+#define DMA_CTL0_MODE_MASK_BITS		0x3
+#define DMA_CTL0_DIR_SHIFT_BITS		2
+#define DMA_CTL0_BITS_PER_CH		4
+
+#define DMA_CTL2_START_SHIFT_BITS	8
+#define DMA_CTL2_IRQ_ENABLE_MASK	((1UL << DMA_CTL2_START_SHIFT_BITS) - 1)
+
+#define DMA_STATUS_IDLE			0x0
+#define DMA_STATUS_DESC_READ		0x1
+#define DMA_STATUS_WAIT			0x2
+#define DMA_STATUS_ACCESS		0x3
+#define DMA_STATUS_BITS_PER_CH		2
+#define DMA_STATUS_MASK_BITS		0x3
+#define DMA_STATUS_SHIFT_BITS		16
+#define DMA_STATUS_IRQ(x)		(0x1 << (x))
+#define DMA_STATUS_ERR(x)		(0x1 << ((x) + 8))
+
+#define DMA_DESC_WIDTH_SHIFT_BITS	12
+#define DMA_DESC_WIDTH_1_BYTE		(0x3 << DMA_DESC_WIDTH_SHIFT_BITS)
+#define DMA_DESC_WIDTH_2_BYTES		(0x2 << DMA_DESC_WIDTH_SHIFT_BITS)
+#define DMA_DESC_WIDTH_4_BYTES		(0x0 << DMA_DESC_WIDTH_SHIFT_BITS)
+#define DMA_DESC_MAX_COUNT_1_BYTE	0x3FF
+#define DMA_DESC_MAX_COUNT_2_BYTES	0x3FF
+#define DMA_DESC_MAX_COUNT_4_BYTES	0x7FF
+#define DMA_DESC_END_WITHOUT_IRQ	0x0
+#define DMA_DESC_END_WITH_IRQ		0x1
+#define DMA_DESC_FOLLOW_WITHOUT_IRQ	0x2
+#define DMA_DESC_FOLLOW_WITH_IRQ	0x3
+
+#define MAX_CHAN_NR			8
+
+static unsigned int init_nr_desc_per_channel = 64;
+module_param(init_nr_desc_per_channel, uint, 0644);
+MODULE_PARM_DESC(init_nr_desc_per_channel,
+		 "initial descriptors per channel (default: 64)");
+
+struct pch_dma_desc_regs {
+	u32	dev_addr;
+	u32	mem_addr;
+	u32	size;
+	u32	next;
+};
+
+struct pch_dma_regs {
+	u32	dma_ctl0;
+	u32	dma_ctl1;
+	u32	dma_ctl2;
+	u32	reserved1;
+	u32	dma_sts0;
+	u32	dma_sts1;
+	u32	reserved2;
+	u32	reserved3;
+	struct pch_dma_desc_regs desc[0];
+};
+
+struct pch_dma_desc {
+	struct pch_dma_desc_regs regs;
+	struct dma_async_tx_descriptor txd;
+	struct list_head	desc_node;
+	struct list_head	tx_list;
+};
+
+struct pch_dma_chan {
+	struct dma_chan		chan;
+	void __iomem *membase;
+	enum dma_data_direction	dir;
+	struct tasklet_struct	tasklet;
+	unsigned long		err_status;
+
+	spinlock_t		lock;
+
+	dma_cookie_t		completed_cookie;
+	struct list_head	active_list;
+	struct list_head	queue;
+	struct list_head	free_list;
+	unsigned int		descs_allocated;
+};
+
+#define PDC_DEV_ADDR	0x00
+#define PDC_MEM_ADDR	0x04
+#define PDC_SIZE	0x08
+#define PDC_NEXT	0x0C
+
+#define channel_readl(pdc, name) \
+	readl((pdc)->membase + PDC_##name)
+#define channel_writel(pdc, name, val) \
+	writel((val), (pdc)->membase + PDC_##name)
+
+struct pch_dma {
+	struct dma_device	dma;
+	void __iomem *membase;
+	struct pci_pool		*pool;
+	struct pch_dma_regs	regs;
+	struct pch_dma_desc_regs ch_regs[MAX_CHAN_NR];
+	struct pch_dma_chan	channels[0];
+};
+
+#define PCH_DMA_CTL0	0x00
+#define PCH_DMA_CTL1	0x04
+#define PCH_DMA_CTL2	0x08
+#define PCH_DMA_STS0	0x10
+#define PCH_DMA_STS1	0x14
+
+#define dma_readl(pd, name) \
+	__raw_readl((pd)->membase + PCH_DMA_##name)
+#define dma_writel(pd, name, val) \
+	__raw_writel((val), (pd)->membase + PCH_DMA_##name)
+
+static inline struct pch_dma_desc *to_pd_desc(struct dma_async_tx_descriptor *txd)
+{
+	return container_of(txd, struct pch_dma_desc, txd);
+}
+
+static inline struct pch_dma_chan *to_pd_chan(struct dma_chan *chan)
+{
+	return container_of(chan, struct pch_dma_chan, chan);
+}
+
+static inline struct pch_dma *to_pd(struct dma_device *ddev)
+{
+	return container_of(ddev, struct pch_dma, dma);
+}
+
+static inline struct device *chan2dev(struct dma_chan *chan)
+{
+	return &chan->dev->device;
+}
+
+static inline struct device *chan2parent(struct dma_chan *chan)
+{
+	return chan->dev->device.parent;
+}
+
+static inline struct pch_dma_desc *pdc_first_active(struct pch_dma_chan *pd_chan)
+{
+	return list_first_entry(&pd_chan->active_list,
+				struct pch_dma_desc, desc_node);
+}
+
+static inline struct pch_dma_desc *pdc_first_queued(struct pch_dma_chan *pd_chan)
+{
+	return list_first_entry(&pd_chan->queue,
+				struct pch_dma_desc, desc_node);
+}
+
+static void pdc_enable_irq(struct dma_chan *chan, int enable)
+{
+	struct pch_dma *pd = to_pd(chan->device);
+	u32 val;
+
+	val = dma_readl(pd, CTL2);
+
+	if (enable)
+		val |= 0x1 << chan->chan_id;
+	else
+		val &= ~(0x1 << chan->chan_id);
+
+	dma_writel(pd, CTL2, val);
+
+	dev_dbg(chan2dev(chan), "pdc_enable_irq: chan %d -> %x\n",
+		chan->chan_id, val);
+}
+
+static void pdc_set_dir(struct dma_chan *chan)
+{
+	struct pch_dma_chan *pd_chan = to_pd_chan(chan);
+	struct pch_dma *pd = to_pd(chan->device);
+	u32 val;
+
+	val = dma_readl(pd, CTL0);
+
+	if (pd_chan->dir == DMA_TO_DEVICE)
+		val |= 0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id +
+			       DMA_CTL0_DIR_SHIFT_BITS);
+	else
+		val &= ~(0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id +
+				 DMA_CTL0_DIR_SHIFT_BITS));
+
+	dma_writel(pd, CTL0, val);
+
+	dev_dbg(chan2dev(chan), "pdc_set_dir: chan %d -> %x\n",
+		chan->chan_id, val);
+}
+
+static void pdc_set_mode(struct dma_chan *chan, u32 mode)
+{
+	struct pch_dma *pd = to_pd(chan->device);
+	u32 val;
+
+	val = dma_readl(pd, CTL0);
+
+	val &= ~(DMA_CTL0_MODE_MASK_BITS <<
+		(DMA_CTL0_BITS_PER_CH * chan->chan_id));
+	val |= mode << (DMA_CTL0_BITS_PER_CH * chan->chan_id);
+
+	dma_writel(pd, CTL0, val);
+
+	dev_dbg(chan2dev(chan), "pdc_set_mode: chan %d -> %x\n",
+		chan->chan_id, val);
+}
+
+static u32 pdc_get_status(struct pch_dma_chan *pd_chan)
+{
+	struct pch_dma *pd = to_pd(pd_chan->chan.device);
+	u32 val;
+
+	val = dma_readl(pd, STS0);
+	return DMA_STATUS_MASK_BITS & (val >> (DMA_STATUS_SHIFT_BITS +
+			DMA_STATUS_BITS_PER_CH * pd_chan->chan.chan_id));
+}
+
+static bool pdc_is_idle(struct pch_dma_chan *pd_chan)
+{
+	if (pdc_get_status(pd_chan) == DMA_STATUS_IDLE)
+		return true;
+	else
+		return false;
+}
+
+static void pdc_dostart(struct pch_dma_chan *pd_chan, struct pch_dma_desc* desc)
+{
+	struct pch_dma *pd = to_pd(pd_chan->chan.device);
+	u32 val;
+
+	if (!pdc_is_idle(pd_chan)) {
+		dev_err(chan2dev(&pd_chan->chan),
+			"BUG: Attempt to start non-idle channel\n");
+		return;
+	}
+
+	channel_writel(pd_chan, DEV_ADDR, desc->regs.dev_addr);
+	channel_writel(pd_chan, MEM_ADDR, desc->regs.mem_addr);
+	channel_writel(pd_chan, SIZE, desc->regs.size);
+	channel_writel(pd_chan, NEXT, desc->regs.next);
+
+	dev_dbg(chan2dev(&pd_chan->chan), "chan %d -> dev_addr: %x\n",
+		pd_chan->chan.chan_id, desc->regs.dev_addr);
+	dev_dbg(chan2dev(&pd_chan->chan), "chan %d -> mem_addr: %x\n",
+		pd_chan->chan.chan_id, desc->regs.mem_addr);
+	dev_dbg(chan2dev(&pd_chan->chan), "chan %d -> size: %x\n",
+		pd_chan->chan.chan_id, desc->regs.size);
+	dev_dbg(chan2dev(&pd_chan->chan), "chan %d -> next: %x\n",
+		pd_chan->chan.chan_id, desc->regs.next);
+
+	if (list_empty(&desc->tx_list))
+		pdc_set_mode(&pd_chan->chan, DMA_CTL0_ONESHOT);
+	else
+		pdc_set_mode(&pd_chan->chan, DMA_CTL0_SG);
+
+	val = dma_readl(pd, CTL2);
+	val |= 1 << (DMA_CTL2_START_SHIFT_BITS + pd_chan->chan.chan_id);
+	dma_writel(pd, CTL2, val);
+}
+
+static void pdc_chain_complete(struct pch_dma_chan *pd_chan,
+			       struct pch_dma_desc *desc)
+{
+	struct dma_async_tx_descriptor *txd = &desc->txd;
+	dma_async_tx_callback callback = txd->callback;
+	void *param = txd->callback_param;
+
+	list_splice_init(&desc->tx_list, &pd_chan->free_list);
+	list_move(&desc->desc_node, &pd_chan->free_list);
+
+	if (callback)
+		callback(param);
+}
+
+static void pdc_complete_all(struct pch_dma_chan *pd_chan)
+{
+	struct pch_dma_desc *desc, *_d;
+	LIST_HEAD(list);
+
+	BUG_ON(!pdc_is_idle(pd_chan));
+
+	if (!list_empty(&pd_chan->queue))
+		pdc_dostart(pd_chan, pdc_first_queued(pd_chan));
+
+	list_splice_init(&pd_chan->active_list, &list);
+	list_splice_init(&pd_chan->queue, &pd_chan->active_list);
+
+	list_for_each_entry_safe(desc, _d, &list, desc_node)
+		pdc_chain_complete(pd_chan, desc);
+}
+
+static void pdc_handle_error(struct pch_dma_chan *pd_chan)
+{
+	struct pch_dma_desc *bad_desc;
+
+	bad_desc = pdc_first_active(pd_chan);
+	list_del(&bad_desc->desc_node);
+
+	list_splice_init(&pd_chan->queue, pd_chan->active_list.prev);
+
+	if (!list_empty(&pd_chan->active_list))
+		pdc_dostart(pd_chan, pdc_first_active(pd_chan));
+
+	dev_crit(chan2dev(&pd_chan->chan), "Bad descriptor submitted\n");
+	dev_crit(chan2dev(&pd_chan->chan), "descriptor cookie: %d\n",
+		 bad_desc->txd.cookie);
+
+	pdc_chain_complete(pd_chan, bad_desc);
+}
+
+static void pdc_advance_work(struct pch_dma_chan *pd_chan)
+{
+	if (list_empty(&pd_chan->active_list) ||
+		list_is_singular(&pd_chan->active_list)) {
+		pdc_complete_all(pd_chan);
+	} else {
+		pdc_chain_complete(pd_chan, pdc_first_active(pd_chan));
+		pdc_dostart(pd_chan, pdc_first_active(pd_chan));
+	}
+}
+
+static dma_cookie_t pdc_assign_cookie(struct pch_dma_chan *pd_chan,
+				      struct pch_dma_desc *desc)
+{
+	dma_cookie_t cookie = pd_chan->chan.cookie;
+
+	if (++cookie < 0)
+		cookie = 1;
+
+	pd_chan->chan.cookie = cookie;
+	desc->txd.cookie = cookie;
+
+	return cookie;
+}
+
+static dma_cookie_t pd_tx_submit(struct dma_async_tx_descriptor *txd)
+{
+	struct pch_dma_desc *desc = to_pd_desc(txd);
+	struct pch_dma_chan *pd_chan = to_pd_chan(txd->chan);
+	dma_cookie_t cookie;
+
+	spin_lock_bh(&pd_chan->lock);
+	cookie = pdc_assign_cookie(pd_chan, desc);
+
+	if (list_empty(&pd_chan->active_list)) {
+		list_add_tail(&desc->desc_node, &pd_chan->active_list);
+		pdc_dostart(pd_chan, desc);
+	} else {
+		list_add_tail(&desc->desc_node, &pd_chan->queue);
+	}
+
+	spin_unlock_bh(&pd_chan->lock);
+	return 0;
+}
+
+static struct pch_dma_desc *pdc_alloc_desc(struct dma_chan *chan, gfp_t flags)
+{
+	struct pch_dma_desc *desc = NULL;
+	struct pch_dma *pd = to_pd(chan->device);
+	dma_addr_t addr;
+
+	desc = pci_pool_alloc(pd->pool, GFP_KERNEL, &addr);
+	if (desc) {
+		memset(desc, 0, sizeof(struct pch_dma_desc));
+		INIT_LIST_HEAD(&desc->tx_list);
+		dma_async_tx_descriptor_init(&desc->txd, chan);
+		desc->txd.tx_submit = pd_tx_submit;
+		desc->txd.flags = DMA_CTRL_ACK;
+		desc->txd.phys = addr;
+	}
+
+	return desc;
+}
+
+static struct pch_dma_desc *pdc_desc_get(struct pch_dma_chan *pd_chan)
+{
+	struct pch_dma_desc *desc, *_d;
+	struct pch_dma_desc *ret = NULL;
+	int i;
+
+	spin_lock_bh(&pd_chan->lock);
+	list_for_each_entry_safe(desc, _d, &pd_chan->free_list, desc_node) {
+		i++;
+		if (async_tx_test_ack(&desc->txd)) {
+			list_del(&desc->desc_node);
+			ret = desc;
+			break;
+		}
+		dev_dbg(chan2dev(&pd_chan->chan), "desc %p not ACKed\n", desc);
+	}
+	spin_unlock_bh(&pd_chan->lock);
+	dev_dbg(chan2dev(&pd_chan->chan), "scanned %d descriptors\n", i);
+
+	if (!ret) {
+		ret = pdc_alloc_desc(&pd_chan->chan, GFP_NOIO);
+		if (ret) {
+			spin_lock_bh(&pd_chan->lock);
+			pd_chan->descs_allocated++;
+			spin_unlock_bh(&pd_chan->lock);
+		} else {
+			dev_err(chan2dev(&pd_chan->chan),
+				"failed to alloc desc\n");
+		}
+	}
+
+	return ret;
+}
+
+static void pdc_desc_put(struct pch_dma_chan *pd_chan,
+			 struct pch_dma_desc *desc)
+{
+	if (desc) {
+		spin_lock_bh(&pd_chan->lock);
+		list_splice_init(&desc->tx_list, &pd_chan->free_list);
+		list_add(&desc->desc_node, &pd_chan->free_list);
+		spin_unlock_bh(&pd_chan->lock);
+	}
+}
+
+static int pd_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct pch_dma_chan *pd_chan = to_pd_chan(chan);
+	struct pch_dma_desc *desc;
+	LIST_HEAD(tmp_list);
+	int i;
+
+	if (!pdc_is_idle(pd_chan)) {
+		dev_dbg(chan2dev(chan), "DMA channel not idle ?\n");
+		return -EIO;
+	}
+
+	if (!list_empty(&pd_chan->free_list))
+		return pd_chan->descs_allocated;
+
+	for (i = 0; i < init_nr_desc_per_channel; i++) {
+		desc = pdc_alloc_desc(chan, GFP_KERNEL);
+
+		if (!desc) {
+			dev_warn(chan2dev(chan),
+				"Only allocated %d initial descriptors\n", i);
+			break;
+		}
+
+		list_add_tail(&desc->desc_node, &tmp_list);
+	}
+
+	spin_lock_bh(&pd_chan->lock);
+	list_splice(&tmp_list, &pd_chan->free_list);
+	pd_chan->descs_allocated = i;
+	pd_chan->completed_cookie = chan->cookie = 1;
+	spin_unlock_bh(&pd_chan->lock);
+
+	pdc_enable_irq(chan, 1);
+	pdc_set_dir(chan);
+
+	return pd_chan->descs_allocated;
+}
+
+static void pd_free_chan_resources(struct dma_chan *chan)
+{
+	struct pch_dma_chan *pd_chan = to_pd_chan(chan);
+	struct pch_dma *pd = to_pd(chan->device);
+	struct pch_dma_desc *desc, *_d;
+	LIST_HEAD(tmp_list);
+
+	BUG_ON(!pdc_is_idle(pd_chan));
+	BUG_ON(!list_empty(&pd_chan->active_list));
+	BUG_ON(!list_empty(&pd_chan->queue));
+
+	spin_lock_bh(&pd_chan->lock);
+	list_splice_init(&pd_chan->free_list, &tmp_list);
+	pd_chan->descs_allocated = 0;
+	spin_unlock_bh(&pd_chan->lock);
+
+	list_for_each_entry_safe(desc, _d, &tmp_list, desc_node)
+		pci_pool_free(pd->pool, desc, desc->txd.phys);
+
+	pdc_enable_irq(chan, 0);
+}
+
+static enum dma_status pd_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
+				    struct dma_tx_state *txstate)
+{
+	struct pch_dma_chan *pd_chan = to_pd_chan(chan);
+	dma_cookie_t last_used;
+	dma_cookie_t last_completed;
+	int ret;
+
+	spin_lock_bh(&pd_chan->lock);
+	last_completed = pd_chan->completed_cookie;
+	last_used = chan->cookie;
+	spin_unlock_bh(&pd_chan->lock);
+
+	ret = dma_async_is_complete(cookie, last_completed, last_used);
+
+	dma_set_tx_state(txstate, last_completed, last_used, 0);
+
+	return ret;
+}
+
+static void pd_issue_pending(struct dma_chan *chan)
+{
+	struct pch_dma_chan *pd_chan = to_pd_chan(chan);
+
+	if (pdc_is_idle(pd_chan)) {
+		spin_lock_bh(&pd_chan->lock);
+		pdc_advance_work(pd_chan);
+		spin_unlock_bh(&pd_chan->lock);
+	}
+}
+
+static struct dma_async_tx_descriptor *pd_prep_slave_sg(struct dma_chan *chan,
+			struct scatterlist *sgl, unsigned int sg_len,
+			enum dma_data_direction direction, unsigned long flags)
+{
+	struct pch_dma_chan *pd_chan = to_pd_chan(chan);
+	struct pch_dma_slave *pd_slave = chan->private;
+	struct pch_dma_desc *first = NULL;
+	struct pch_dma_desc *prev = NULL;
+	struct pch_dma_desc *desc = NULL;
+	struct scatterlist *sg;
+	dma_addr_t reg;
+	int i;
+
+	if (unlikely(!sg_len)) {
+		dev_info(chan2dev(chan), "prep_slave_sg: length is zero!\n");
+		return NULL;
+	}
+
+	if (direction == DMA_FROM_DEVICE)
+		reg = pd_slave->rx_reg;
+	else if (direction == DMA_TO_DEVICE)
+		reg = pd_slave->tx_reg;
+	else
+		return NULL;
+
+	for_each_sg(sgl, sg, sg_len, i) {
+		desc = pdc_desc_get(pd_chan);
+
+		if (!desc)
+			goto err_desc_get;
+
+		desc->regs.dev_addr = reg;
+		desc->regs.mem_addr = sg_phys(sg);
+		desc->regs.size = sg_dma_len(sg);
+		desc->regs.next = DMA_DESC_FOLLOW_WITHOUT_IRQ;
+
+		switch (pd_slave->width) {
+		case PCH_DMA_WIDTH_1_BYTE:
+			if (desc->regs.size > DMA_DESC_MAX_COUNT_1_BYTE)
+				goto err_desc_get;
+			desc->regs.size |= DMA_DESC_WIDTH_1_BYTE;
+			break;
+		case PCH_DMA_WIDTH_2_BYTES:
+			if (desc->regs.size > DMA_DESC_MAX_COUNT_2_BYTES)
+				goto err_desc_get;
+			desc->regs.size |= DMA_DESC_WIDTH_2_BYTES;
+			break;
+		case PCH_DMA_WIDTH_4_BYTES:
+			if (desc->regs.size > DMA_DESC_MAX_COUNT_4_BYTES)
+				goto err_desc_get;
+			desc->regs.size |= DMA_DESC_WIDTH_4_BYTES;
+			break;
+		default:
+			goto err_desc_get;
+		}
+
+
+		if (!first) {
+			first = desc;
+		} else {
+			prev->regs.next |= desc->txd.phys;
+			list_add_tail(&desc->desc_node, &first->tx_list);
+		}
+
+		prev = desc;
+	}
+
+	if (flags & DMA_PREP_INTERRUPT)
+		desc->regs.next = DMA_DESC_END_WITH_IRQ;
+	else
+		desc->regs.next = DMA_DESC_END_WITHOUT_IRQ;
+
+	first->txd.cookie = -EBUSY;
+	desc->txd.flags = flags;
+
+	return &first->txd;
+
+err_desc_get:
+	dev_err(chan2dev(chan), "failed to get desc or wrong parameters\n");
+	pdc_desc_put(pd_chan, first);
+	return NULL;
+}
+
+static int pd_device_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
+			     unsigned long arg)
+{
+	struct pch_dma_chan *pd_chan = to_pd_chan(chan);
+	struct pch_dma_desc *desc, *_d;
+	LIST_HEAD(list);
+
+	if (cmd != DMA_TERMINATE_ALL)
+		return -ENXIO;
+
+	spin_lock_bh(&pd_chan->lock);
+
+	pdc_set_mode(&pd_chan->chan, DMA_CTL0_DISABLE);
+
+	list_splice_init(&pd_chan->active_list, &list);
+	list_splice_init(&pd_chan->queue, &list);
+
+	list_for_each_entry_safe(desc, _d, &list, desc_node)
+		pdc_chain_complete(pd_chan, desc);
+
+	spin_unlock_bh(&pd_chan->lock);
+
+
+	return 0;
+}
+
+static void pdc_tasklet(unsigned long data)
+{
+	struct pch_dma_chan *pd_chan = (struct pch_dma_chan *)data;
+
+	if (!pdc_is_idle(pd_chan)) {
+		dev_err(chan2dev(&pd_chan->chan),
+			"BUG: handle non-idle channel in tasklet\n");
+		return;
+	}
+
+	spin_lock_bh(&pd_chan->lock);
+	if (test_and_clear_bit(0, &pd_chan->err_status))
+		pdc_handle_error(pd_chan);
+	else
+		pdc_advance_work(pd_chan);
+	spin_unlock_bh(&pd_chan->lock);
+}
+
+static irqreturn_t pd_irq(int irq, void *devid)
+{
+	struct pch_dma *pd = (struct pch_dma *)devid;
+	struct pch_dma_chan *pd_chan;
+	u32 sts0;
+	int i;
+	int ret = IRQ_NONE;
+
+	sts0 = dma_readl(pd, STS0);
+
+	dev_dbg(pd->dma.dev, "pd_irq sts0: %x\n", sts0);
+
+	for (i = 0; i < pd->dma.chancnt; i++) {
+		pd_chan = &pd->channels[i];
+
+		if (sts0 & DMA_STATUS_IRQ(i)) {
+			if (sts0 & DMA_STATUS_ERR(i))
+				set_bit(0, &pd_chan->err_status);
+
+			tasklet_schedule(&pd_chan->tasklet);
+			ret = IRQ_HANDLED;
+		}
+
+	}
+
+	/* clear interrupt bits in status register */
+	dma_writel(pd, STS0, sts0);
+
+	return ret;
+}
+
+static void pch_dma_save_regs(struct pch_dma *pd)
+{
+	struct pch_dma_chan *pd_chan;
+	struct dma_chan *chan, *_c;
+	int i = 0;
+
+	pd->regs.dma_ctl0 = dma_readl(pd, CTL0);
+	pd->regs.dma_ctl1 = dma_readl(pd, CTL1);
+	pd->regs.dma_ctl2 = dma_readl(pd, CTL2);
+
+	list_for_each_entry_safe(chan, _c, &pd->dma.channels, device_node) {
+		pd_chan = to_pd_chan(chan);
+
+		pd->ch_regs[i].dev_addr = channel_readl(pd_chan, DEV_ADDR);
+		pd->ch_regs[i].mem_addr = channel_readl(pd_chan, MEM_ADDR);
+		pd->ch_regs[i].size = channel_readl(pd_chan, SIZE);
+		pd->ch_regs[i].next = channel_readl(pd_chan, NEXT);
+
+		i++;
+	}
+}
+
+static void pch_dma_restore_regs(struct pch_dma *pd)
+{
+	struct pch_dma_chan *pd_chan;
+	struct dma_chan *chan, *_c;
+	int i = 0;
+
+	dma_writel(pd, CTL0, pd->regs.dma_ctl0);
+	dma_writel(pd, CTL1, pd->regs.dma_ctl1);
+	dma_writel(pd, CTL2, pd->regs.dma_ctl2);
+
+	list_for_each_entry_safe(chan, _c, &pd->dma.channels, device_node) {
+		pd_chan = to_pd_chan(chan);
+
+		channel_writel(pd_chan, DEV_ADDR, pd->ch_regs[i].dev_addr);
+		channel_writel(pd_chan, MEM_ADDR, pd->ch_regs[i].mem_addr);
+		channel_writel(pd_chan, SIZE, pd->ch_regs[i].size);
+		channel_writel(pd_chan, NEXT, pd->ch_regs[i].next);
+
+		i++;
+	}
+}
+
+static int pch_dma_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	struct pch_dma *pd = pci_get_drvdata(pdev);
+
+	if (pd)
+		pch_dma_save_regs(pd);
+
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
+
+	return 0;
+}
+
+static int pch_dma_resume(struct pci_dev *pdev)
+{
+	struct pch_dma *pd = pci_get_drvdata(pdev);
+	int err;
+
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_dbg(&pdev->dev, "failed to enable device\n");
+		return err;
+	}
+
+	if (pd)
+		pch_dma_restore_regs(pd);
+
+	return 0;
+}
+
+static int __devinit pch_dma_probe(struct pci_dev *pdev,
+				   const struct pci_device_id *id)
+{
+	struct pch_dma *pd;
+	struct pch_dma_regs *regs;
+	unsigned int nr_channels;
+	int err;
+	int i;
+
+	nr_channels = id->driver_data;
+	pd = kzalloc(sizeof(struct pch_dma)+
+		sizeof(struct pch_dma_chan) * nr_channels, GFP_KERNEL);
+	if (!pd)
+		return -ENOMEM;
+
+	pci_set_drvdata(pdev, pd);
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device\n");
+		goto err_free_mem;
+	}
+
+	if (!(pci_resource_flags(pdev, 1) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Cannot find proper base address\n");
+		goto err_disable_pdev;
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources\n");
+		goto err_disable_pdev;
+	}
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+	if (err) {
+		dev_err(&pdev->dev, "Cannot set proper DMA config\n");
+		goto err_free_res;
+	}
+
+	regs = pd->membase = pci_iomap(pdev, 1, 0);
+	if (!pd->membase) {
+		dev_err(&pdev->dev, "Cannot map MMIO registers\n");
+		err = -ENOMEM;
+		goto err_free_res;
+	}
+
+	pci_set_master(pdev);
+
+	err = request_irq(pdev->irq, pd_irq, IRQF_SHARED, DRV_NAME, pd);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to request IRQ\n");
+		goto err_iounmap;
+	}
+
+	pd->pool = pci_pool_create("pch_dma_desc_pool", pdev,
+				   sizeof(struct pch_dma_desc), 4, 0);
+	if (!pd->pool) {
+		dev_err(&pdev->dev, "Failed to alloc DMA descriptors\n");
+		err = -ENOMEM;
+		goto err_free_irq;
+	}
+
+	pd->dma.dev = &pdev->dev;
+	pd->dma.chancnt = nr_channels;
+
+	INIT_LIST_HEAD(&pd->dma.channels);
+
+	for (i = 0; i < nr_channels; i++) {
+		struct pch_dma_chan *pd_chan = &pd->channels[i];
+
+		pd_chan->chan.device = &pd->dma;
+		pd_chan->chan.cookie = 1;
+		pd_chan->chan.chan_id = i;
+
+		pd_chan->membase = &regs->desc[i];
+
+		pd_chan->dir = (i % 2) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+		spin_lock_init(&pd_chan->lock);
+
+		INIT_LIST_HEAD(&pd_chan->active_list);
+		INIT_LIST_HEAD(&pd_chan->queue);
+		INIT_LIST_HEAD(&pd_chan->free_list);
+
+		tasklet_init(&pd_chan->tasklet, pdc_tasklet,
+			     (unsigned long)pd_chan);
+		list_add_tail(&pd_chan->chan.device_node, &pd->dma.channels);
+	}
+
+	dma_cap_zero(pd->dma.cap_mask);
+	dma_cap_set(DMA_PRIVATE, pd->dma.cap_mask);
+	dma_cap_set(DMA_SLAVE, pd->dma.cap_mask);
+
+	pd->dma.device_alloc_chan_resources = pd_alloc_chan_resources;
+	pd->dma.device_free_chan_resources = pd_free_chan_resources;
+	pd->dma.device_tx_status = pd_tx_status;
+	pd->dma.device_issue_pending = pd_issue_pending;
+	pd->dma.device_prep_slave_sg = pd_prep_slave_sg;
+	pd->dma.device_control = pd_device_control;
+
+	err = dma_async_device_register(&pd->dma);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to register DMA device\n");
+		goto err_free_pool;
+	}
+
+	return 0;
+
+err_free_pool:
+	pci_pool_destroy(pd->pool);
+err_free_irq:
+	free_irq(pdev->irq, pd);
+err_iounmap:
+	pci_iounmap(pdev, pd->membase);
+err_free_res:
+	pci_release_regions(pdev);
+err_disable_pdev:
+	pci_disable_device(pdev);
+err_free_mem:
+	return err;
+}
+
+static void __devexit pch_dma_remove(struct pci_dev *pdev)
+{
+	struct pch_dma *pd = pci_get_drvdata(pdev);
+	struct pch_dma_chan *pd_chan;
+	struct dma_chan *chan, *_c;
+
+	if (pd) {
+		dma_async_device_unregister(&pd->dma);
+
+		list_for_each_entry_safe(chan, _c, &pd->dma.channels,
+					 device_node) {
+			pd_chan = to_pd_chan(chan);
+
+			tasklet_disable(&pd_chan->tasklet);
+			tasklet_kill(&pd_chan->tasklet);
+		}
+
+		pci_pool_destroy(pd->pool);
+		free_irq(pdev->irq, pd);
+		pci_iounmap(pdev, pd->membase);
+		pci_release_regions(pdev);
+		pci_disable_device(pdev);
+		kfree(pd);
+	}
+}
+
+/* PCI Device ID of DMA device */
+#define PCI_DEVICE_ID_PCH_DMA_8CH        0x8810
+#define PCI_DEVICE_ID_PCH_DMA_4CH        0x8815
+
+static const struct pci_device_id pch_dma_id_table[] = {
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PCH_DMA_8CH), 8 },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PCH_DMA_4CH), 4 },
+};
+
+static struct pci_driver pch_dma_driver = {
+	.name		= DRV_NAME,
+	.id_table	= pch_dma_id_table,
+	.probe		= pch_dma_probe,
+	.remove		= __devexit_p(pch_dma_remove),
+#ifdef CONFIG_PM
+	.suspend	= pch_dma_suspend,
+	.resume		= pch_dma_resume,
+#endif
+};
+
+static int __init pch_dma_init(void)
+{
+	return pci_register_driver(&pch_dma_driver);
+}
+
+static void __exit pch_dma_exit(void)
+{
+	pci_unregister_driver(&pch_dma_driver);
+}
+
+module_init(pch_dma_init);
+module_exit(pch_dma_exit);
+
+MODULE_DESCRIPTION("Topcliff PCH DMA controller driver");
+MODULE_AUTHOR("Yong Wang <yong.y.wang@intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/pch_dma.h b/include/linux/pch_dma.h
new file mode 100644
index 000000000000..fdafe529ef8a
--- /dev/null
+++ b/include/linux/pch_dma.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef PCH_DMA_H
+#define PCH_DMA_H
+
+#include <linux/dmaengine.h>
+
+enum pch_dma_width {
+	PCH_DMA_WIDTH_1_BYTE,
+	PCH_DMA_WIDTH_2_BYTES,
+	PCH_DMA_WIDTH_4_BYTES,
+};
+
+struct pch_dma_slave {
+	struct device		*dma_dev;
+	unsigned int		chan_id;
+	dma_addr_t		tx_reg;
+	dma_addr_t		rx_reg;
+	enum pch_dma_width	width;
+};
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From c156d0a5b0c667999e06d0bb52e3d1376faec8bf Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@stericsson.com>
Date: Wed, 4 Aug 2010 13:37:33 +0200
Subject: DMAENGINE: generic slave channel control v3

This adds an interface to the DMAengine to make it possible to
reconfigure a slave channel at runtime. We add a few foreseen
config parameters to the passed struct, with a void * pointer
for custom per-device or per-platform runtime slave data.

Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dmaengine.h | 71 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 5204f018931b..c61d4ca27bcc 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -114,11 +114,17 @@ enum dma_ctrl_flags {
  * @DMA_TERMINATE_ALL: terminate all ongoing transfers
  * @DMA_PAUSE: pause ongoing transfers
  * @DMA_RESUME: resume paused transfer
+ * @DMA_SLAVE_CONFIG: this command is only implemented by DMA controllers
+ * that need to runtime reconfigure the slave channels (as opposed to passing
+ * configuration data in statically from the platform). An additional
+ * argument of struct dma_slave_config must be passed in with this
+ * command.
  */
 enum dma_ctrl_cmd {
 	DMA_TERMINATE_ALL,
 	DMA_PAUSE,
 	DMA_RESUME,
+	DMA_SLAVE_CONFIG,
 };
 
 /**
@@ -199,6 +205,71 @@ struct dma_chan_dev {
 	atomic_t *idr_ref;
 };
 
+/**
+ * enum dma_slave_buswidth - defines bus with of the DMA slave
+ * device, source or target buses
+ */
+enum dma_slave_buswidth {
+	DMA_SLAVE_BUSWIDTH_UNDEFINED = 0,
+	DMA_SLAVE_BUSWIDTH_1_BYTE = 1,
+	DMA_SLAVE_BUSWIDTH_2_BYTES = 2,
+	DMA_SLAVE_BUSWIDTH_4_BYTES = 4,
+	DMA_SLAVE_BUSWIDTH_8_BYTES = 8,
+};
+
+/**
+ * struct dma_slave_config - dma slave channel runtime config
+ * @direction: whether the data shall go in or out on this slave
+ * channel, right now. DMA_TO_DEVICE and DMA_FROM_DEVICE are
+ * legal values, DMA_BIDIRECTIONAL is not acceptable since we
+ * need to differentiate source and target addresses.
+ * @src_addr: this is the physical address where DMA slave data
+ * should be read (RX), if the source is memory this argument is
+ * ignored.
+ * @dst_addr: this is the physical address where DMA slave data
+ * should be written (TX), if the source is memory this argument
+ * is ignored.
+ * @src_addr_width: this is the width in bytes of the source (RX)
+ * register where DMA data shall be read. If the source
+ * is memory this may be ignored depending on architecture.
+ * Legal values: 1, 2, 4, 8.
+ * @dst_addr_width: same as src_addr_width but for destination
+ * target (TX) mutatis mutandis.
+ * @src_maxburst: the maximum number of words (note: words, as in
+ * units of the src_addr_width member, not bytes) that can be sent
+ * in one burst to the device. Typically something like half the
+ * FIFO depth on I/O peripherals so you don't overflow it. This
+ * may or may not be applicable on memory sources.
+ * @dst_maxburst: same as src_maxburst but for destination target
+ * mutatis mutandis.
+ *
+ * This struct is passed in as configuration data to a DMA engine
+ * in order to set up a certain channel for DMA transport at runtime.
+ * The DMA device/engine has to provide support for an additional
+ * command in the channel config interface, DMA_SLAVE_CONFIG
+ * and this struct will then be passed in as an argument to the
+ * DMA engine device_control() function.
+ *
+ * The rationale for adding configuration information to this struct
+ * is as follows: if it is likely that most DMA slave controllers in
+ * the world will support the configuration option, then make it
+ * generic. If not: if it is fixed so that it be sent in static from
+ * the platform data, then prefer to do that. Else, if it is neither
+ * fixed at runtime, nor generic enough (such as bus mastership on
+ * some CPU family and whatnot) then create a custom slave config
+ * struct and pass that, then make this config a member of that
+ * struct, if applicable.
+ */
+struct dma_slave_config {
+	enum dma_data_direction direction;
+	dma_addr_t src_addr;
+	dma_addr_t dst_addr;
+	enum dma_slave_buswidth src_addr_width;
+	enum dma_slave_buswidth dst_addr_width;
+	u32 src_maxburst;
+	u32 dst_maxburst;
+};
+
 static inline const char *dma_chan_name(struct dma_chan *chan)
 {
 	return dev_name(&chan->dev->device);
-- 
cgit v1.2.3-59-g8ed1b


From fca3ec01e0b40cab82cac7745e154b01969e6219 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 4 Aug 2010 14:34:24 +0100
Subject: drm,io-mapping: Specify slot to use for atomic mappings

This is required should we ever attempt to use an io-mapping where
KM_USER0 is verboten, such as inside an IRQ context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Eric Anholt <eric@anholt.net>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/i915/i915_gem.c        |  9 +++++----
 drivers/gpu/drm/i915/intel_overlay.c   |  5 +++--
 drivers/gpu/drm/nouveau/nouveau_bios.c |  8 ++++----
 include/linux/io-mapping.h             | 16 ++++++++++------
 4 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4efd4fd3b340..2a4ed7ca8b4e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -496,10 +496,10 @@ fast_user_write(struct io_mapping *mapping,
 	char *vaddr_atomic;
 	unsigned long unwritten;
 
-	vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base);
+	vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base, KM_USER0);
 	unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset,
 						      user_data, length);
-	io_mapping_unmap_atomic(vaddr_atomic);
+	io_mapping_unmap_atomic(vaddr_atomic, KM_USER0);
 	if (unwritten)
 		return -EFAULT;
 	return 0;
@@ -3487,7 +3487,8 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
 		reloc_offset = obj_priv->gtt_offset + reloc->offset;
 		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
 						      (reloc_offset &
-						       ~(PAGE_SIZE - 1)));
+						       ~(PAGE_SIZE - 1)),
+						      KM_USER0);
 		reloc_entry = (uint32_t __iomem *)(reloc_page +
 						   (reloc_offset & (PAGE_SIZE - 1)));
 		reloc_val = target_obj_priv->gtt_offset + reloc->delta;
@@ -3498,7 +3499,7 @@ i915_gem_object_pin_and_relocate(struct drm_gem_object *obj,
 			  readl(reloc_entry), reloc_val);
 #endif
 		writel(reloc_val, reloc_entry);
-		io_mapping_unmap_atomic(reloc_page);
+		io_mapping_unmap_atomic(reloc_page, KM_USER0);
 
 		/* The updated presumed offset for this entry will be
 		 * copied back out to the user.
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index f26ec2f27d36..d39aea24eabe 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -185,7 +185,8 @@ static struct overlay_registers *intel_overlay_map_regs_atomic(struct intel_over
 
 	if (OVERLAY_NONPHYSICAL(overlay->dev)) {
 		regs = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
-				overlay->reg_bo->gtt_offset);
+						overlay->reg_bo->gtt_offset,
+						KM_USER0);
 
 		if (!regs) {
 			DRM_ERROR("failed to map overlay regs in GTT\n");
@@ -200,7 +201,7 @@ static struct overlay_registers *intel_overlay_map_regs_atomic(struct intel_over
 static void intel_overlay_unmap_regs_atomic(struct intel_overlay *overlay)
 {
 	if (OVERLAY_NONPHYSICAL(overlay->dev))
-		io_mapping_unmap_atomic(overlay->virt_addr);
+		io_mapping_unmap_atomic(overlay->virt_addr, KM_USER0);
 
 	overlay->virt_addr = NULL;
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c
index b59f348f14fc..7369b5e73649 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bios.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bios.c
@@ -2083,11 +2083,11 @@ peek_fb(struct drm_device *dev, struct io_mapping *fb,
 	uint32_t val = 0;
 
 	if (off < pci_resource_len(dev->pdev, 1)) {
-		uint32_t __iomem *p = io_mapping_map_atomic_wc(fb, off);
+		uint32_t __iomem *p = io_mapping_map_atomic_wc(fb, off, KM_USER0);
 
 		val = ioread32(p);
 
-		io_mapping_unmap_atomic(p);
+		io_mapping_unmap_atomic(p, KM_USER0);
 	}
 
 	return val;
@@ -2098,12 +2098,12 @@ poke_fb(struct drm_device *dev, struct io_mapping *fb,
 	uint32_t off, uint32_t val)
 {
 	if (off < pci_resource_len(dev->pdev, 1)) {
-		uint32_t __iomem *p = io_mapping_map_atomic_wc(fb, off);
+		uint32_t __iomem *p = io_mapping_map_atomic_wc(fb, off, KM_USER0);
 
 		iowrite32(val, p);
 		wmb();
 
-		io_mapping_unmap_atomic(p);
+		io_mapping_unmap_atomic(p, KM_USER0);
 	}
 }
 
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 25085ddd955f..e0ea40f6c515 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -79,7 +79,9 @@ io_mapping_free(struct io_mapping *mapping)
 
 /* Atomic map/unmap */
 static inline void *
-io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
+io_mapping_map_atomic_wc(struct io_mapping *mapping,
+			 unsigned long offset,
+			 int slot)
 {
 	resource_size_t phys_addr;
 	unsigned long pfn;
@@ -87,13 +89,13 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
 	BUG_ON(offset >= mapping->size);
 	phys_addr = mapping->base + offset;
 	pfn = (unsigned long) (phys_addr >> PAGE_SHIFT);
-	return iomap_atomic_prot_pfn(pfn, KM_USER0, mapping->prot);
+	return iomap_atomic_prot_pfn(pfn, slot, mapping->prot);
 }
 
 static inline void
-io_mapping_unmap_atomic(void *vaddr)
+io_mapping_unmap_atomic(void *vaddr, int slot)
 {
-	iounmap_atomic(vaddr, KM_USER0);
+	iounmap_atomic(vaddr, slot);
 }
 
 static inline void *
@@ -133,13 +135,15 @@ io_mapping_free(struct io_mapping *mapping)
 
 /* Atomic map/unmap */
 static inline void *
-io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
+io_mapping_map_atomic_wc(struct io_mapping *mapping,
+			 unsigned long offset,
+			 int slot)
 {
 	return ((char *) mapping) + offset;
 }
 
 static inline void
-io_mapping_unmap_atomic(void *vaddr)
+io_mapping_unmap_atomic(void *vaddr, int slot)
 {
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From f9599ce11192f24dbb0f4fdb70121a05edc58342 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 4 Aug 2010 04:43:44 +0000
Subject: sk_buff: introduce pskb_network_may_pull()

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d20d9e7a9bbd..77eb60d2b496 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1379,6 +1379,11 @@ static inline int skb_network_offset(const struct sk_buff *skb)
 	return skb_network_header(skb) - skb->data;
 }
 
+static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
+{
+	return pskb_may_pull(skb, skb_network_offset(skb) + len);
+}
+
 /*
  * CPUs often take a performance hit when accessing unaligned memory
  * locations. The actual performance hit varies, it can be small if the
-- 
cgit v1.2.3-59-g8ed1b


From d7100da026317fcf07411f765fe1cdb044053917 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Wed, 4 Aug 2010 07:34:36 +0000
Subject: ppp: make channel_ops const

The PPP channel ops structure should be const.
Cleanup the declarations to use standard C99 format.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/char/pcmcia/ipwireless/network.c | 2 +-
 drivers/net/ppp_async.c                  | 6 +++---
 drivers/net/ppp_synctty.c                | 6 +++---
 drivers/net/pppoe.c                      | 4 ++--
 include/linux/ppp_channel.h              | 2 +-
 net/atm/pppoatm.c                        | 2 +-
 net/irda/irnet/irnet_ppp.c               | 2 +-
 net/l2tp/l2tp_ppp.c                      | 5 ++++-
 8 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/pcmcia/ipwireless/network.c b/drivers/char/pcmcia/ipwireless/network.c
index 65920163f53d..9fe538347932 100644
--- a/drivers/char/pcmcia/ipwireless/network.c
+++ b/drivers/char/pcmcia/ipwireless/network.c
@@ -239,7 +239,7 @@ static int ipwireless_ppp_ioctl(struct ppp_channel *ppp_channel,
 	return err;
 }
 
-static struct ppp_channel_ops ipwireless_ppp_channel_ops = {
+static const struct ppp_channel_ops ipwireless_ppp_channel_ops = {
 	.start_xmit = ipwireless_ppp_start_xmit,
 	.ioctl      = ipwireless_ppp_ioctl
 };
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
index 6c2e8fa0ca31..af50a530daee 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -108,9 +108,9 @@ static void ppp_async_process(unsigned long arg);
 static void async_lcp_peek(struct asyncppp *ap, unsigned char *data,
 			   int len, int inbound);
 
-static struct ppp_channel_ops async_ops = {
-	ppp_async_send,
-	ppp_async_ioctl
+static const struct ppp_channel_ops async_ops = {
+	.start_xmit = ppp_async_send,
+	.ioctl      = ppp_async_ioctl,
 };
 
 /*
diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c
index 52938da1e542..4c95ec3fb8d4 100644
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -97,9 +97,9 @@ static void ppp_sync_flush_output(struct syncppp *ap);
 static void ppp_sync_input(struct syncppp *ap, const unsigned char *buf,
 			   char *flags, int count);
 
-static struct ppp_channel_ops sync_ops = {
-	ppp_sync_send,
-	ppp_sync_ioctl
+static const struct ppp_channel_ops sync_ops = {
+	.start_xmit = ppp_sync_send,
+	.ioctl      = ppp_sync_ioctl,
 };
 
 /*
diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index 344ef330e123..c07de359dc07 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -92,7 +92,7 @@
 static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb);
 
 static const struct proto_ops pppoe_ops;
-static struct ppp_channel_ops pppoe_chan_ops;
+static const struct ppp_channel_ops pppoe_chan_ops;
 
 /* per-net private data for this module */
 static int pppoe_net_id __read_mostly;
@@ -963,7 +963,7 @@ static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	return __pppoe_xmit(sk, skb);
 }
 
-static struct ppp_channel_ops pppoe_chan_ops = {
+static const struct ppp_channel_ops pppoe_chan_ops = {
 	.start_xmit = pppoe_xmit,
 };
 
diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h
index bff98ec1bfed..5d87f810a3b7 100644
--- a/include/linux/ppp_channel.h
+++ b/include/linux/ppp_channel.h
@@ -36,7 +36,7 @@ struct ppp_channel_ops {
 
 struct ppp_channel {
 	void		*private;	/* channel private data */
-	struct ppp_channel_ops *ops;	/* operations for this channel */
+	const struct ppp_channel_ops *ops; /* operations for this channel */
 	int		mtu;		/* max transmit packet size */
 	int		hdrlen;		/* amount of headroom channel needs */
 	void		*ppp;		/* opaque to channel */
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index e49bb6d948a1..e9aced0ec56b 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -260,7 +260,7 @@ static int pppoatm_devppp_ioctl(struct ppp_channel *chan, unsigned int cmd,
 	return -ENOTTY;
 }
 
-static /*const*/ struct ppp_channel_ops pppoatm_ops = {
+static const struct ppp_channel_ops pppoatm_ops = {
 	.start_xmit = pppoatm_send,
 	.ioctl = pppoatm_devppp_ioctl,
 };
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 800bc53b7f63..dfe7b38dd4af 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -20,7 +20,7 @@
 /* Please put other headers in irnet.h - Thanks */
 
 /* Generic PPP callbacks (to call us) */
-static struct ppp_channel_ops irnet_ppp_ops = {
+static const struct ppp_channel_ops irnet_ppp_ops = {
 	.start_xmit = ppp_irnet_send,
 	.ioctl = ppp_irnet_ioctl
 };
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 90d82b3f2889..ff954b3e94b6 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -135,7 +135,10 @@ struct pppol2tp_session {
 
 static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
 
-static struct ppp_channel_ops pppol2tp_chan_ops = { pppol2tp_xmit , NULL };
+static const struct ppp_channel_ops pppol2tp_chan_ops = {
+	.start_xmit =  pppol2tp_xmit,
+};
+
 static const struct proto_ops pppol2tp_ops;
 
 /* Helpers to obtain tunnel/session contexts from sockets.
-- 
cgit v1.2.3-59-g8ed1b


From 5aa9a2cbc037b2ec34b76619d233852d2d17b502 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marek.vasut@gmail.com>
Date: Fri, 4 Jun 2010 03:07:34 +0200
Subject: [ARM] wm97xx_batt: remove now useless header file

Signed-off-by: Marek Vasut <marek.vasut@gmail.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Eric Miao <eric.y.miao@gmail.com>
---
 drivers/power/wm97xx_battery.c | 16 ----------------
 include/linux/wm97xx_batt.h    | 16 ----------------
 2 files changed, 32 deletions(-)
 delete mode 100644 include/linux/wm97xx_batt.h

(limited to 'include/linux')

diff --git a/drivers/power/wm97xx_battery.c b/drivers/power/wm97xx_battery.c
index 4e8afce0c818..5071d85ec12d 100644
--- a/drivers/power/wm97xx_battery.c
+++ b/drivers/power/wm97xx_battery.c
@@ -29,7 +29,6 @@ static DEFINE_MUTEX(bat_lock);
 static struct work_struct bat_work;
 static struct mutex work_lock;
 static int bat_status = POWER_SUPPLY_STATUS_UNKNOWN;
-static struct wm97xx_batt_info *gpdata;
 static enum power_supply_property *prop;
 
 static unsigned long wm97xx_read_bat(struct power_supply *bat_ps)
@@ -172,12 +171,6 @@ static int __devinit wm97xx_bat_probe(struct platform_device *dev)
 	struct wm97xx_pdata *wmdata = dev->dev.platform_data;
 	struct wm97xx_batt_pdata *pdata;
 
-	if (gpdata) {
-		dev_err(&dev->dev, "Do not pass platform_data through "
-			"wm97xx_bat_set_pdata!\n");
-		return -EINVAL;
-	}
-
 	if (!wmdata) {
 		dev_err(&dev->dev, "No platform data supplied\n");
 		return -EINVAL;
@@ -308,15 +301,6 @@ static void __exit wm97xx_bat_exit(void)
 	platform_driver_unregister(&wm97xx_bat_driver);
 }
 
-/* The interface is deprecated, as well as linux/wm97xx_batt.h */
-void wm97xx_bat_set_pdata(struct wm97xx_batt_info *data);
-
-void wm97xx_bat_set_pdata(struct wm97xx_batt_info *data)
-{
-	gpdata = data;
-}
-EXPORT_SYMBOL_GPL(wm97xx_bat_set_pdata);
-
 module_init(wm97xx_bat_init);
 module_exit(wm97xx_bat_exit);
 
diff --git a/include/linux/wm97xx_batt.h b/include/linux/wm97xx_batt.h
deleted file mode 100644
index a1d6419c2ff8..000000000000
--- a/include/linux/wm97xx_batt.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _LINUX_WM97XX_BAT_H
-#define _LINUX_WM97XX_BAT_H
-
-#include <linux/wm97xx.h>
-
-#warning This file will be removed soon, use wm97xx.h instead!
-
-#define wm97xx_batt_info wm97xx_batt_pdata
-
-#ifdef CONFIG_BATTERY_WM97XX
-void wm97xx_bat_set_pdata(struct wm97xx_batt_info *data);
-#else
-static inline void wm97xx_bat_set_pdata(struct wm97xx_batt_info *data) {}
-#endif
-
-#endif
-- 
cgit v1.2.3-59-g8ed1b


From f6a21388bd255773cc80d4423afb4c69d4daa173 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Sat, 19 Jun 2010 04:08:29 +0000
Subject: POWER: Add JZ4740 battery driver.

Add support for the battery voltage measurement part of the JZ4740 ADC unit.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Acked-by: Anton Vorontsov <cbouatmailru@gmail.com>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/1416/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 drivers/power/Kconfig                |  11 +
 drivers/power/Makefile               |   1 +
 drivers/power/jz4740-battery.c       | 445 +++++++++++++++++++++++++++++++++++
 include/linux/power/jz4740-battery.h |  24 ++
 4 files changed, 481 insertions(+)
 create mode 100644 drivers/power/jz4740-battery.c
 create mode 100644 include/linux/power/jz4740-battery.h

(limited to 'include/linux')

diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index 8e9ba177d817..1e5506be39b4 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -142,4 +142,15 @@ config CHARGER_PCF50633
 	help
 	 Say Y to include support for NXP PCF50633 Main Battery Charger.
 
+config BATTERY_JZ4740
+	tristate "Ingenic JZ4740 battery"
+	depends on MACH_JZ4740
+	depends on MFD_JZ4740_ADC
+	help
+	  Say Y to enable support for the battery on Ingenic JZ4740 based
+	  boards.
+
+	  This driver can be build as a module. If so, the module will be
+	  called jz4740-battery.
+
 endif # POWER_SUPPLY
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index 00050809a6c7..cf95009d9bcd 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -34,3 +34,4 @@ obj-$(CONFIG_BATTERY_DA9030)	+= da9030_battery.o
 obj-$(CONFIG_BATTERY_MAX17040)	+= max17040_battery.o
 obj-$(CONFIG_BATTERY_Z2)	+= z2_battery.o
 obj-$(CONFIG_CHARGER_PCF50633)	+= pcf50633-charger.o
+obj-$(CONFIG_BATTERY_JZ4740)	+= jz4740-battery.o
diff --git a/drivers/power/jz4740-battery.c b/drivers/power/jz4740-battery.c
new file mode 100644
index 000000000000..20c4b952e9bd
--- /dev/null
+++ b/drivers/power/jz4740-battery.c
@@ -0,0 +1,445 @@
+/*
+ * Battery measurement code for Ingenic JZ SOC.
+ *
+ * Copyright (C) 2009 Jiejing Zhang <kzjeef@gmail.com>
+ * Copyright (C) 2010, Lars-Peter Clausen <lars@metafoo.de>
+ *
+ * based on tosa_battery.c
+ *
+ * Copyright (C) 2008 Marek Vasut <marek.vasut@gmail.com>
+*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/mfd/core.h>
+#include <linux/power_supply.h>
+
+#include <linux/power/jz4740-battery.h>
+#include <linux/jz4740-adc.h>
+
+struct jz_battery {
+	struct jz_battery_platform_data *pdata;
+	struct platform_device *pdev;
+
+	struct resource *mem;
+	void __iomem *base;
+
+	int irq;
+	int charge_irq;
+
+	struct mfd_cell *cell;
+
+	int status;
+	long voltage;
+
+	struct completion read_completion;
+
+	struct power_supply battery;
+	struct delayed_work work;
+};
+
+static inline struct jz_battery *psy_to_jz_battery(struct power_supply *psy)
+{
+	return container_of(psy, struct jz_battery, battery);
+}
+
+static irqreturn_t jz_battery_irq_handler(int irq, void *devid)
+{
+	struct jz_battery *battery = devid;
+
+	complete(&battery->read_completion);
+	return IRQ_HANDLED;
+}
+
+static long jz_battery_read_voltage(struct jz_battery *battery)
+{
+	unsigned long t;
+	unsigned long val;
+	long voltage;
+
+	INIT_COMPLETION(battery->read_completion);
+
+	enable_irq(battery->irq);
+	battery->cell->enable(battery->pdev);
+
+	t = wait_for_completion_interruptible_timeout(&battery->read_completion,
+		HZ);
+
+	if (t > 0) {
+		val = readw(battery->base) & 0xfff;
+
+		if (battery->pdata->info.voltage_max_design <= 2500000)
+			val = (val * 78125UL) >> 7UL;
+		else
+			val = ((val * 924375UL) >> 9UL) + 33000;
+		voltage = (long)val;
+	} else {
+		voltage = t ? t : -ETIMEDOUT;
+	}
+
+	battery->cell->disable(battery->pdev);
+	disable_irq(battery->irq);
+
+	return voltage;
+}
+
+static int jz_battery_get_capacity(struct power_supply *psy)
+{
+	struct jz_battery *jz_battery = psy_to_jz_battery(psy);
+	struct power_supply_info *info = &jz_battery->pdata->info;
+	long voltage;
+	int ret;
+	int voltage_span;
+
+	voltage = jz_battery_read_voltage(jz_battery);
+
+	if (voltage < 0)
+		return voltage;
+
+	voltage_span = info->voltage_max_design - info->voltage_min_design;
+	ret = ((voltage - info->voltage_min_design) * 100) / voltage_span;
+
+	if (ret > 100)
+		ret = 100;
+	else if (ret < 0)
+		ret = 0;
+
+	return ret;
+}
+
+static int jz_battery_get_property(struct power_supply *psy,
+	enum power_supply_property psp, union power_supply_propval *val)
+{
+	struct jz_battery *jz_battery = psy_to_jz_battery(psy);
+	struct power_supply_info *info = &jz_battery->pdata->info;
+	long voltage;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_STATUS:
+		val->intval = jz_battery->status;
+		break;
+	case POWER_SUPPLY_PROP_TECHNOLOGY:
+		val->intval = jz_battery->pdata->info.technology;
+		break;
+	case POWER_SUPPLY_PROP_HEALTH:
+		voltage = jz_battery_read_voltage(jz_battery);
+		if (voltage < info->voltage_min_design)
+			val->intval = POWER_SUPPLY_HEALTH_DEAD;
+		else
+			val->intval = POWER_SUPPLY_HEALTH_GOOD;
+		break;
+	case POWER_SUPPLY_PROP_CAPACITY:
+		val->intval = jz_battery_get_capacity(psy);
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+		val->intval = jz_battery_read_voltage(jz_battery);
+		if (val->intval < 0)
+			return val->intval;
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
+		val->intval = info->voltage_max_design;
+		break;
+	case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
+		val->intval = info->voltage_min_design;
+		break;
+	case POWER_SUPPLY_PROP_PRESENT:
+		val->intval = 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void jz_battery_external_power_changed(struct power_supply *psy)
+{
+	struct jz_battery *jz_battery = psy_to_jz_battery(psy);
+
+	cancel_delayed_work(&jz_battery->work);
+	schedule_delayed_work(&jz_battery->work, 0);
+}
+
+static irqreturn_t jz_battery_charge_irq(int irq, void *data)
+{
+	struct jz_battery *jz_battery = data;
+
+	cancel_delayed_work(&jz_battery->work);
+	schedule_delayed_work(&jz_battery->work, 0);
+
+	return IRQ_HANDLED;
+}
+
+static void jz_battery_update(struct jz_battery *jz_battery)
+{
+	int status;
+	long voltage;
+	bool has_changed = false;
+	int is_charging;
+
+	if (gpio_is_valid(jz_battery->pdata->gpio_charge)) {
+		is_charging = gpio_get_value(jz_battery->pdata->gpio_charge);
+		is_charging ^= jz_battery->pdata->gpio_charge_active_low;
+		if (is_charging)
+			status = POWER_SUPPLY_STATUS_CHARGING;
+		else
+			status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+
+		if (status != jz_battery->status) {
+			jz_battery->status = status;
+			has_changed = true;
+		}
+	}
+
+	voltage = jz_battery_read_voltage(jz_battery);
+	if (abs(voltage - jz_battery->voltage) < 50000) {
+		jz_battery->voltage = voltage;
+		has_changed = true;
+	}
+
+	if (has_changed)
+		power_supply_changed(&jz_battery->battery);
+}
+
+static enum power_supply_property jz_battery_properties[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
+	POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
+	POWER_SUPPLY_PROP_PRESENT,
+};
+
+static void jz_battery_work(struct work_struct *work)
+{
+	/* Too small interval will increase system workload */
+	const int interval = HZ * 30;
+	struct jz_battery *jz_battery = container_of(work, struct jz_battery,
+					    work.work);
+
+	jz_battery_update(jz_battery);
+	schedule_delayed_work(&jz_battery->work, interval);
+}
+
+static int __devinit jz_battery_probe(struct platform_device *pdev)
+{
+	int ret = 0;
+	struct jz_battery_platform_data *pdata = pdev->dev.parent->platform_data;
+	struct jz_battery *jz_battery;
+	struct power_supply *battery;
+
+	jz_battery = kzalloc(sizeof(*jz_battery), GFP_KERNEL);
+	if (!jz_battery) {
+		dev_err(&pdev->dev, "Failed to allocate driver structure\n");
+		return -ENOMEM;
+	}
+
+	jz_battery->cell = pdev->dev.platform_data;
+
+	jz_battery->irq = platform_get_irq(pdev, 0);
+	if (jz_battery->irq < 0) {
+		ret = jz_battery->irq;
+		dev_err(&pdev->dev, "Failed to get platform irq: %d\n", ret);
+		goto err_free;
+	}
+
+	jz_battery->mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!jz_battery->mem) {
+		ret = -ENOENT;
+		dev_err(&pdev->dev, "Failed to get platform mmio resource\n");
+		goto err_free;
+	}
+
+	jz_battery->mem = request_mem_region(jz_battery->mem->start,
+				resource_size(jz_battery->mem),	pdev->name);
+	if (!jz_battery->mem) {
+		ret = -EBUSY;
+		dev_err(&pdev->dev, "Failed to request mmio memory region\n");
+		goto err_free;
+	}
+
+	jz_battery->base = ioremap_nocache(jz_battery->mem->start,
+				resource_size(jz_battery->mem));
+	if (!jz_battery->base) {
+		ret = -EBUSY;
+		dev_err(&pdev->dev, "Failed to ioremap mmio memory\n");
+		goto err_release_mem_region;
+	}
+
+	battery = &jz_battery->battery;
+	battery->name = pdata->info.name;
+	battery->type = POWER_SUPPLY_TYPE_BATTERY;
+	battery->properties	= jz_battery_properties;
+	battery->num_properties	= ARRAY_SIZE(jz_battery_properties);
+	battery->get_property = jz_battery_get_property;
+	battery->external_power_changed = jz_battery_external_power_changed;
+	battery->use_for_apm = 1;
+
+	jz_battery->pdata = pdata;
+	jz_battery->pdev = pdev;
+
+	init_completion(&jz_battery->read_completion);
+
+	INIT_DELAYED_WORK(&jz_battery->work, jz_battery_work);
+
+	ret = request_irq(jz_battery->irq, jz_battery_irq_handler, 0, pdev->name,
+			jz_battery);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to request irq %d\n", ret);
+		goto err_iounmap;
+	}
+	disable_irq(jz_battery->irq);
+
+	if (gpio_is_valid(pdata->gpio_charge)) {
+		ret = gpio_request(pdata->gpio_charge, dev_name(&pdev->dev));
+		if (ret) {
+			dev_err(&pdev->dev, "charger state gpio request failed.\n");
+			goto err_free_irq;
+		}
+		ret = gpio_direction_input(pdata->gpio_charge);
+		if (ret) {
+			dev_err(&pdev->dev, "charger state gpio set direction failed.\n");
+			goto err_free_gpio;
+		}
+
+		jz_battery->charge_irq = gpio_to_irq(pdata->gpio_charge);
+
+		if (jz_battery->charge_irq >= 0) {
+			ret = request_irq(jz_battery->charge_irq,
+				    jz_battery_charge_irq,
+				    IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
+				    dev_name(&pdev->dev), jz_battery);
+			if (ret) {
+				dev_err(&pdev->dev, "Failed to request charge irq: %d\n", ret);
+				goto err_free_gpio;
+			}
+		}
+	} else {
+		jz_battery->charge_irq = -1;
+	}
+
+	if (jz_battery->pdata->info.voltage_max_design <= 2500000)
+		jz4740_adc_set_config(pdev->dev.parent, JZ_ADC_CONFIG_BAT_MB,
+			JZ_ADC_CONFIG_BAT_MB);
+	else
+		jz4740_adc_set_config(pdev->dev.parent, JZ_ADC_CONFIG_BAT_MB, 0);
+
+	ret = power_supply_register(&pdev->dev, &jz_battery->battery);
+	if (ret) {
+		dev_err(&pdev->dev, "power supply battery register failed.\n");
+		goto err_free_charge_irq;
+	}
+
+	platform_set_drvdata(pdev, jz_battery);
+	schedule_delayed_work(&jz_battery->work, 0);
+
+	return 0;
+
+err_free_charge_irq:
+	if (jz_battery->charge_irq >= 0)
+		free_irq(jz_battery->charge_irq, jz_battery);
+err_free_gpio:
+	if (gpio_is_valid(pdata->gpio_charge))
+		gpio_free(jz_battery->pdata->gpio_charge);
+err_free_irq:
+	free_irq(jz_battery->irq, jz_battery);
+err_iounmap:
+	platform_set_drvdata(pdev, NULL);
+	iounmap(jz_battery->base);
+err_release_mem_region:
+	release_mem_region(jz_battery->mem->start, resource_size(jz_battery->mem));
+err_free:
+	kfree(jz_battery);
+	return ret;
+}
+
+static int __devexit jz_battery_remove(struct platform_device *pdev)
+{
+	struct jz_battery *jz_battery = platform_get_drvdata(pdev);
+
+	cancel_delayed_work_sync(&jz_battery->work);
+
+	if (gpio_is_valid(jz_battery->pdata->gpio_charge)) {
+		if (jz_battery->charge_irq >= 0)
+			free_irq(jz_battery->charge_irq, jz_battery);
+		gpio_free(jz_battery->pdata->gpio_charge);
+	}
+
+	power_supply_unregister(&jz_battery->battery);
+
+	free_irq(jz_battery->irq, jz_battery);
+
+	iounmap(jz_battery->base);
+	release_mem_region(jz_battery->mem->start, resource_size(jz_battery->mem));
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int jz_battery_suspend(struct device *dev)
+{
+	struct jz_battery *jz_battery = dev_get_drvdata(dev);
+
+	cancel_delayed_work_sync(&jz_battery->work);
+	jz_battery->status = POWER_SUPPLY_STATUS_UNKNOWN;
+
+	return 0;
+}
+
+static int jz_battery_resume(struct device *dev)
+{
+	struct jz_battery *jz_battery = dev_get_drvdata(dev);
+
+	schedule_delayed_work(&jz_battery->work, 0);
+
+	return 0;
+}
+
+static const struct dev_pm_ops jz_battery_pm_ops = {
+	.suspend	= jz_battery_suspend,
+	.resume		= jz_battery_resume,
+};
+
+#define JZ_BATTERY_PM_OPS (&jz_battery_pm_ops)
+#else
+#define JZ_BATTERY_PM_OPS NULL
+#endif
+
+static struct platform_driver jz_battery_driver = {
+	.probe		= jz_battery_probe,
+	.remove		= __devexit_p(jz_battery_remove),
+	.driver = {
+		.name = "jz4740-battery",
+		.owner = THIS_MODULE,
+		.pm = JZ_BATTERY_PM_OPS,
+	},
+};
+
+static int __init jz_battery_init(void)
+{
+	return platform_driver_register(&jz_battery_driver);
+}
+module_init(jz_battery_init);
+
+static void __exit jz_battery_exit(void)
+{
+	platform_driver_unregister(&jz_battery_driver);
+}
+module_exit(jz_battery_exit);
+
+MODULE_ALIAS("platform:jz4740-battery");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
+MODULE_DESCRIPTION("JZ4740 SoC battery driver");
diff --git a/include/linux/power/jz4740-battery.h b/include/linux/power/jz4740-battery.h
new file mode 100644
index 000000000000..19c9610c720a
--- /dev/null
+++ b/include/linux/power/jz4740-battery.h
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (C) 2009, Jiejing Zhang <kzjeef@gmail.com>
+ *
+ *  This program is free software; you can redistribute	 it and/or modify it
+ *  under  the terms of	 the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the	License, or (at your
+ *  option) any later version.
+ *
+ *  You should have received a copy of the  GNU General Public License along
+ *  with this program; if not, write  to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __JZ4740_BATTERY_H
+#define __JZ4740_BATTERY_H
+
+struct jz_battery_platform_data {
+	struct power_supply_info info;
+	int gpio_charge;	/* GPIO port of Charger state */
+	int gpio_charge_active_low;
+};
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 534af1082329392bc29f6badf815e69ae2ae0f4c Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 5 Aug 2010 09:22:20 -0500
Subject: kgdb,kdb: individual register set and and get API

The kdb shell specification includes the ability to get and set
architecture specific registers by name.

For the time being individual register get and set will be implemented
on a per architecture basis.  If an architecture defines
DBG_MAX_REG_NUM > 0 then kdb and the gdbstub will use the capability
for individually getting and setting architecture specific registers.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 include/linux/kgdb.h        |  13 +++++
 kernel/debug/gdbstub.c      |  26 +++++++++
 kernel/debug/kdb/kdb_main.c | 132 ++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 159 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 9340f34d1bb5..d5eb882e01f3 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -90,6 +90,19 @@ struct kgdb_bkpt {
 	enum kgdb_bpstate	state;
 };
 
+struct dbg_reg_def_t {
+	char *name;
+	int size;
+	int offset;
+};
+
+#ifndef DBG_MAX_REG_NUM
+#define DBG_MAX_REG_NUM 0
+#else
+extern struct dbg_reg_def_t dbg_reg_def[];
+extern char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs);
+extern int dbg_set_reg(int regno, void *mem, struct pt_regs *regs);
+#endif
 #ifndef KGDB_MAX_BREAKPOINTS
 # define KGDB_MAX_BREAKPOINTS	1000
 #endif
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index e117cfd75887..006bad8905d3 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -328,6 +328,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
 	return probe_kernel_write(mem, c, size);
 }
 
+#if DBG_MAX_REG_NUM > 0
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+	int i;
+	int idx = 0;
+	char *ptr = (char *)gdb_regs;
+
+	for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+		dbg_get_reg(i, ptr + idx, regs);
+		idx += dbg_reg_def[i].size;
+	}
+}
+
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+	int i;
+	int idx = 0;
+	char *ptr = (char *)gdb_regs;
+
+	for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+		dbg_set_reg(i, ptr + idx, regs);
+		idx += dbg_reg_def[i].size;
+	}
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
 /* Write memory due to an 'M' or 'X' packet. */
 static int write_mem_msg(int binary)
 {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebe4a287419e..8577e45a9a58 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value)
 
 	if (endp == arg) {
 		/*
-		 * Try base 16, for us folks too lazy to type the
+		 * Also try base 16, for us folks too lazy to type the
 		 * leading 0x...
 		 */
 		val = simple_strtoul(arg, &endp, 16);
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value)
 	return 0;
 }
 
+int kdbgetu64arg(const char *arg, u64 *value)
+{
+	char *endp;
+	u64 val;
+
+	val = simple_strtoull(arg, &endp, 0);
+
+	if (endp == arg) {
+
+		val = simple_strtoull(arg, &endp, 16);
+		if (endp == arg)
+			return KDB_BADINT;
+	}
+
+	*value = val;
+
+	return 0;
+}
+
 /*
  * kdb_set - This function implements the 'set' command.  Alter an
  *	existing environment variable or create a new one.
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv)
  */
 static int kdb_rd(int argc, const char **argv)
 {
-	int diag = kdb_check_regs();
-	if (diag)
-		return diag;
+	int len = kdb_check_regs();
+#if DBG_MAX_REG_NUM > 0
+	int i;
+	char *rname;
+	int rsize;
+	u64 reg64;
+	u32 reg32;
+	u16 reg16;
+	u8 reg8;
+
+	if (len)
+		return len;
+
+	for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+		rsize = dbg_reg_def[i].size * 2;
+		if (rsize > 16)
+			rsize = 2;
+		if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
+			len = 0;
+			kdb_printf("\n");
+		}
+		if (len)
+			len += kdb_printf("  ");
+		switch(dbg_reg_def[i].size * 8) {
+		case 8:
+			rname = dbg_get_reg(i, &reg8, kdb_current_regs);
+			if (!rname)
+				break;
+			len += kdb_printf("%s: %02x", rname, reg8);
+			break;
+		case 16:
+			rname = dbg_get_reg(i, &reg16, kdb_current_regs);
+			if (!rname)
+				break;
+			len += kdb_printf("%s: %04x", rname, reg16);
+			break;
+		case 32:
+			rname = dbg_get_reg(i, &reg32, kdb_current_regs);
+			if (!rname)
+				break;
+			len += kdb_printf("%s: %08x", rname, reg32);
+			break;
+		case 64:
+			rname = dbg_get_reg(i, &reg64, kdb_current_regs);
+			if (!rname)
+				break;
+			len += kdb_printf("%s: %016llx", rname, reg64);
+			break;
+		default:
+			len += kdb_printf("%s: ??", dbg_reg_def[i].name);
+		}
+	}
+	kdb_printf("\n");
+#else
+	if (len)
+		return len;
 
 	kdb_dumpregs(kdb_current_regs);
+#endif
 	return 0;
 }
 
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv)
  * kdb_rm - This function implements the 'rm' (register modify)  command.
  *	rm register-name new-contents
  * Remarks:
- *	Currently doesn't allow modification of control or
- *	debug registers.
+ *	Allows register modification with the same restrictions as gdb
  */
 static int kdb_rm(int argc, const char **argv)
 {
+#if DBG_MAX_REG_NUM > 0
 	int diag;
-	int ind = 0;
-	unsigned long contents;
+	const char *rname;
+	int i;
+	u64 reg64;
+	u32 reg32;
+	u16 reg16;
+	u8 reg8;
 
 	if (argc != 2)
 		return KDB_ARGCOUNT;
 	/*
 	 * Allow presence or absence of leading '%' symbol.
 	 */
-	if (argv[1][0] == '%')
-		ind = 1;
+	rname = argv[1];
+	if (*rname == '%')
+		rname++;
 
-	diag = kdbgetularg(argv[2], &contents);
+	diag = kdbgetu64arg(argv[2], &reg64);
 	if (diag)
 		return diag;
 
 	diag = kdb_check_regs();
 	if (diag)
 		return diag;
+
+	diag = KDB_BADREG;
+	for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+		if (strcmp(rname, dbg_reg_def[i].name) == 0) {
+			diag = 0;
+			break;
+		}
+	}
+	if (!diag) {
+		switch(dbg_reg_def[i].size * 8) {
+		case 8:
+			reg8 = reg64;
+			dbg_set_reg(i, &reg8, kdb_current_regs);
+			break;
+		case 16:
+			reg16 = reg64;
+			dbg_set_reg(i, &reg16, kdb_current_regs);
+			break;
+		case 32:
+			reg32 = reg64;
+			dbg_set_reg(i, &reg32, kdb_current_regs);
+			break;
+		case 64:
+			dbg_set_reg(i, &reg64, kdb_current_regs);
+			break;
+		}
+	}
+	return diag;
+#else
 	kdb_printf("ERROR: Register set currently not implemented\n");
-	return 0;
+    return 0;
+#endif
 }
 
 #if defined(CONFIG_MAGIC_SYSRQ)
-- 
cgit v1.2.3-59-g8ed1b


From 55751145dc1e08e16df418cdd101661f5c6ac991 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 5 Aug 2010 09:22:21 -0500
Subject: gdbstub: Implement gdbserial 'p' and 'P' packets

The gdbserial 'p' and 'P' packets allow gdb to individually get and
set registers instead of querying for all the available registers.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 include/linux/kgdb.h   |  2 +-
 kernel/debug/gdbstub.c | 97 +++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 78 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index d5eb882e01f3..cc96f0f23e04 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -294,7 +294,7 @@ extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops);
 extern struct kgdb_io *dbg_io_ops;
 
 extern int kgdb_hex2long(char **ptr, unsigned long *long_val);
-extern int kgdb_mem2hex(char *mem, char *buf, int count);
+extern char *kgdb_mem2hex(char *mem, char *buf, int count);
 extern int kgdb_hex2mem(char *buf, char *mem, int count);
 
 extern int kgdb_isremovedbreak(unsigned long addr);
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 006bad8905d3..4ef9dddf4588 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -225,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len)
  * buf.  Return a pointer to the last char put in buf (null). May
  * return an error.
  */
-int kgdb_mem2hex(char *mem, char *buf, int count)
+char *kgdb_mem2hex(char *mem, char *buf, int count)
 {
 	char *tmp;
 	int err;
@@ -237,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
 	tmp = buf + count;
 
 	err = probe_kernel_read(tmp, mem, count);
-	if (!err) {
-		while (count > 0) {
-			buf = pack_hex_byte(buf, *tmp);
-			tmp++;
-			count--;
-		}
-
-		*buf = 0;
+	if (err)
+		return NULL;
+	while (count > 0) {
+		buf = pack_hex_byte(buf, *tmp);
+		tmp++;
+		count--;
 	}
+	*buf = 0;
 
-	return err;
+	return buf;
 }
 
 /*
@@ -481,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
 	pack_hex_byte(&remcom_out_buffer[1], ks->signo);
 }
 
-/* Handle the 'g' get registers request */
-static void gdb_cmd_getregs(struct kgdb_state *ks)
+static void gdb_get_regs_helper(struct kgdb_state *ks)
 {
 	struct task_struct *thread;
 	void *local_debuggerinfo;
@@ -523,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
 		 */
 		sleeping_thread_to_gdb_regs(gdb_regs, thread);
 	}
+}
+
+/* Handle the 'g' get registers request */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+	gdb_get_regs_helper(ks);
 	kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
 }
 
@@ -545,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks)
 	char *ptr = &remcom_in_buffer[1];
 	unsigned long length;
 	unsigned long addr;
-	int err;
+	char *err;
 
 	if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
 					kgdb_hex2long(&ptr, &length) > 0) {
 		err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
-		if (err)
-			error_packet(remcom_out_buffer, err);
+		if (!err)
+			error_packet(remcom_out_buffer, -EINVAL);
 	} else {
 		error_packet(remcom_out_buffer, -EINVAL);
 	}
@@ -568,6 +572,52 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
 		strcpy(remcom_out_buffer, "OK");
 }
 
+#if DBG_MAX_REG_NUM > 0
+static char *gdb_hex_reg_helper(int regnum, char *out)
+{
+	int i;
+	int offset = 0;
+
+	for (i = 0; i < regnum; i++)
+		offset += dbg_reg_def[i].size;
+	return kgdb_mem2hex((char *)gdb_regs + offset, out,
+			    dbg_reg_def[i].size);
+}
+
+/* Handle the 'p' individual regster get */
+static void gdb_cmd_reg_get(struct kgdb_state *ks)
+{
+	unsigned long regnum;
+	char *ptr = &remcom_in_buffer[1];
+
+	kgdb_hex2long(&ptr, &regnum);
+	if (regnum >= DBG_MAX_REG_NUM) {
+		error_packet(remcom_out_buffer, -EINVAL);
+		return;
+	}
+	gdb_get_regs_helper(ks);
+	gdb_hex_reg_helper(regnum, remcom_out_buffer);
+}
+
+/* Handle the 'P' individual regster set */
+static void gdb_cmd_reg_set(struct kgdb_state *ks)
+{
+	unsigned long regnum;
+	char *ptr = &remcom_in_buffer[1];
+
+	kgdb_hex2long(&ptr, &regnum);
+	if (*ptr++ != '=' ||
+	    !(!kgdb_usethread || kgdb_usethread == current) ||
+	    !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
+		error_packet(remcom_out_buffer, -EINVAL);
+		return;
+	}
+	kgdb_hex2mem(ptr, (char *)gdb_regs, dbg_reg_def[regnum].size);
+	dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
+	strcpy(remcom_out_buffer, "OK");
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
 /* Handle the 'X' memory binary write bytes */
 static void gdb_cmd_binwrite(struct kgdb_state *ks)
 {
@@ -874,8 +924,11 @@ int gdb_serial_stub(struct kgdb_state *ks)
 	int error = 0;
 	int tmp;
 
-	/* Clear the out buffer. */
+	/* Initialize comm buffer and globals. */
 	memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+	kgdb_usethread = kgdb_info[ks->cpu].task;
+	ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+	ks->pass_exception = 0;
 
 	if (kgdb_connected) {
 		unsigned char thref[BUF_THREAD_ID_SIZE];
@@ -892,10 +945,6 @@ int gdb_serial_stub(struct kgdb_state *ks)
 		put_packet(remcom_out_buffer);
 	}
 
-	kgdb_usethread = kgdb_info[ks->cpu].task;
-	ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
-	ks->pass_exception = 0;
-
 	while (1) {
 		error = 0;
 
@@ -920,6 +969,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
 		case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
 			gdb_cmd_memwrite(ks);
 			break;
+#if DBG_MAX_REG_NUM > 0
+		case 'p': /* pXX Return gdb register XX (in hex) */
+			gdb_cmd_reg_get(ks);
+			break;
+		case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
+			gdb_cmd_reg_set(ks);
+			break;
+#endif /* DBG_MAX_REG_NUM > 0 */
 		case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
 			gdb_cmd_binwrite(ks);
 			break;
-- 
cgit v1.2.3-59-g8ed1b


From b45cfba4e9005d64d419718e7ff7f7cab44c1994 Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Thu, 5 Aug 2010 09:22:30 -0500
Subject: vt,console,kdb: implement atomic console enter/leave functions

These functions allow the kernel debugger to save and restore the
state of the system console.

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
CC: David Airlie <airlied@linux.ie>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/char/vt.c       | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/console.h | 13 +++++++++++
 2 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 7cdb6ee569cd..117ce99115dc 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -187,10 +187,15 @@ static DECLARE_WORK(console_work, console_callback);
  * fg_console is the current virtual console,
  * last_console is the last used one,
  * want_console is the console we want to switch to,
+ * saved_* variants are for save/restore around kernel debugger enter/leave
  */
 int fg_console;
 int last_console;
 int want_console = -1;
+int saved_fg_console;
+int saved_last_console;
+int saved_want_console;
+int saved_vc_mode;
 
 /*
  * For each existing display, we have a pointer to console currently visible
@@ -3413,6 +3418,62 @@ int con_is_bound(const struct consw *csw)
 }
 EXPORT_SYMBOL(con_is_bound);
 
+/**
+ * con_debug_enter - prepare the console for the kernel debugger
+ * @sw: console driver
+ *
+ * Called when the console is taken over by the kernel debugger, this
+ * function needs to save the current console state, then put the console
+ * into a state suitable for the kernel debugger.
+ *
+ * RETURNS:
+ * Zero on success, nonzero if a failure occurred when trying to prepare
+ * the console for the debugger.
+ */
+int con_debug_enter(struct vc_data *vc)
+{
+	int ret = 0;
+
+	saved_fg_console = fg_console;
+	saved_last_console = last_console;
+	saved_want_console = want_console;
+	saved_vc_mode = vc->vc_mode;
+	vc->vc_mode = KD_TEXT;
+	console_blanked = 0;
+	if (vc->vc_sw->con_debug_enter)
+		ret = vc->vc_sw->con_debug_enter(vc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(con_debug_enter);
+
+/**
+ * con_debug_leave - restore console state
+ * @sw: console driver
+ *
+ * Restore the console state to what it was before the kernel debugger
+ * was invoked.
+ *
+ * RETURNS:
+ * Zero on success, nonzero if a failure occurred when trying to restore
+ * the console.
+ */
+int con_debug_leave(void)
+{
+	struct vc_data *vc;
+	int ret = 0;
+
+	fg_console = saved_fg_console;
+	last_console = saved_last_console;
+	want_console = saved_want_console;
+	vc_cons[fg_console].d->vc_mode = saved_vc_mode;
+
+	vc = vc_cons[fg_console].d;
+	if (vc->vc_sw->con_debug_leave)
+		ret = vc->vc_sw->con_debug_leave(vc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(con_debug_leave);
+
 /**
  * register_con_driver - register console driver to console layer
  * @csw: console driver
diff --git a/include/linux/console.h b/include/linux/console.h
index dcca5339ceb3..f76fc297322d 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -55,6 +55,16 @@ struct consw {
 	void	(*con_invert_region)(struct vc_data *, u16 *, int);
 	u16    *(*con_screen_pos)(struct vc_data *, int);
 	unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *);
+	/*
+	 * Prepare the console for the debugger.  This includes, but is not
+	 * limited to, unblanking the console, loading an appropriate
+	 * palette, and allowing debugger generated output.
+	 */
+	int	(*con_debug_enter)(struct vc_data *);
+	/*
+	 * Restore the console to its pre-debug state as closely as possible.
+	 */
+	int	(*con_debug_leave)(struct vc_data *);
 };
 
 extern const struct consw *conswitchp;
@@ -69,6 +79,9 @@ int register_con_driver(const struct consw *csw, int first, int last);
 int unregister_con_driver(const struct consw *csw);
 int take_over_console(const struct consw *sw, int first, int last, int deflt);
 void give_up_console(const struct consw *sw);
+int con_debug_enter(struct vc_data *vc);
+int con_debug_leave(void);
+
 /* scroll */
 #define SM_UP       (1)
 #define SM_DOWN     (2)
-- 
cgit v1.2.3-59-g8ed1b


From 81d4450732c68aa728f2c86c0c2993c6cfc3d032 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 5 Aug 2010 09:22:30 -0500
Subject: vt,console,kdb: automatically set kdb LINES variable

The kernel console interface stores the number of lines it is
configured to use. The kdb debugger can greatly benefit by knowing how
many lines there are on the console for the pager functionality
without having the end user compile in the setting or have to
repeatedly change it at run time.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
CC: David Airlie <airlied@linux.ie>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/char/vt.c              | 17 +++++++++++++++++
 include/linux/kdb.h            |  4 ++++
 kernel/debug/kdb/kdb_private.h |  2 --
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 117ce99115dc..4a9eb3044e52 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -104,6 +104,7 @@
 #include <linux/io.h>
 #include <asm/system.h>
 #include <linux/uaccess.h>
+#include <linux/kdb.h>
 
 #define MAX_NR_CON_DRIVER 16
 
@@ -3442,6 +3443,22 @@ int con_debug_enter(struct vc_data *vc)
 	console_blanked = 0;
 	if (vc->vc_sw->con_debug_enter)
 		ret = vc->vc_sw->con_debug_enter(vc);
+#ifdef CONFIG_KGDB_KDB
+	/* Set the initial LINES variable if it is not already set */
+	if (vc->vc_rows < 999) {
+		int linecount;
+		char lns[4];
+		const char *setargs[3] = {
+			"set",
+			"LINES",
+			lns,
+		};
+		if (kdbgetintenv(setargs[0], &linecount)) {
+			snprintf(lns, 4, "%i", vc->vc_rows);
+			kdb_set(2, setargs);
+		}
+	}
+#endif /* CONFIG_KGDB_KDB */
 	return ret;
 }
 EXPORT_SYMBOL_GPL(con_debug_enter);
diff --git a/include/linux/kdb.h b/include/linux/kdb.h
index ccb2b3ec0fe8..ea6e5244ed3f 100644
--- a/include/linux/kdb.h
+++ b/include/linux/kdb.h
@@ -114,4 +114,8 @@ enum {
 	KDB_INIT_EARLY,
 	KDB_INIT_FULL,
 };
+
+extern int kdbgetintenv(const char *, int *);
+extern int kdb_set(int, const char **);
+
 #endif	/* !_KDB_H */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 97d3ba69775d..c438f545a321 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
 extern int kdb_putword(unsigned long, unsigned long, size_t);
 
 extern int kdbgetularg(const char *, unsigned long *);
-extern int kdb_set(int, const char **);
 extern char *kdbgetenv(const char *);
-extern int kdbgetintenv(const char *, int *);
 extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
 			 long *, char **);
 extern int kdbgetsymval(const char *, kdb_symtab_t *);
-- 
cgit v1.2.3-59-g8ed1b


From d219adc1228a3887486b58a430e736b0831f192c Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Mon, 2 Aug 2010 12:05:41 -0700
Subject: fb: add hooks to handle KDB enter/exit

Add fb ops to handle enter/exit of the kernel debugger.  If present, the
fb core will register them with KGDB and they'll be called when the
debugger is entered and exited.  The new functions are responsible for
switching to an appropriate debug framebuffer and restoring the
interrupted state at exit time.

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 drivers/video/console/fbcon.c | 26 ++++++++++++++++++++++++++
 drivers/video/console/fbcon.h |  1 +
 include/linux/fb.h            | 13 +++++++++++++
 3 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index b0a3fa00706d..3b3f5749af92 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -2342,6 +2342,30 @@ static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch)
 	return 0;
 }
 
+static int fbcon_debug_enter(struct vc_data *vc)
+{
+	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
+	struct fbcon_ops *ops = info->fbcon_par;
+
+	ops->save_graphics = ops->graphics;
+	ops->graphics = 0;
+	if (info->fbops->fb_debug_enter)
+		info->fbops->fb_debug_enter(info);
+	fbcon_set_palette(vc, color_table);
+	return 0;
+}
+
+static int fbcon_debug_leave(struct vc_data *vc)
+{
+	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
+	struct fbcon_ops *ops = info->fbcon_par;
+
+	ops->graphics = ops->save_graphics;
+	if (info->fbops->fb_debug_leave)
+		info->fbops->fb_debug_leave(info);
+	return 0;
+}
+
 static int fbcon_get_font(struct vc_data *vc, struct console_font *font)
 {
 	u8 *fontdata = vc->vc_font.data;
@@ -3276,6 +3300,8 @@ static const struct consw fb_con = {
 	.con_screen_pos 	= fbcon_screen_pos,
 	.con_getxy 		= fbcon_getxy,
 	.con_resize             = fbcon_resize,
+	.con_debug_enter	= fbcon_debug_enter,
+	.con_debug_leave	= fbcon_debug_leave,
 };
 
 static struct notifier_block fbcon_event_notifier = {
diff --git a/drivers/video/console/fbcon.h b/drivers/video/console/fbcon.h
index 89a346880ec0..6bd2e0c7f209 100644
--- a/drivers/video/console/fbcon.h
+++ b/drivers/video/console/fbcon.h
@@ -74,6 +74,7 @@ struct fbcon_ops {
 	int    cursor_reset;
 	int    blank_state;
 	int    graphics;
+	int    save_graphics; /* for debug enter/leave */
 	int    flags;
 	int    rotate;
 	int    cur_rotate;
diff --git a/include/linux/fb.h b/include/linux/fb.h
index e7445df44d6c..0c5659c41b01 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -3,6 +3,9 @@
 
 #include <linux/types.h>
 #include <linux/i2c.h>
+#ifdef __KERNEL__
+#include <linux/kgdb.h>
+#endif /* __KERNEL__ */
 
 /* Definitions of frame buffers						*/
 
@@ -607,6 +610,12 @@ struct fb_deferred_io {
  * LOCKING NOTE: those functions must _ALL_ be called with the console
  * semaphore held, this is the only suitable locking mechanism we have
  * in 2.6. Some may be called at interrupt time at this point though.
+ *
+ * The exception to this is the debug related hooks.  Putting the fb
+ * into a debug state (e.g. flipping to the kernel console) and restoring
+ * it must be done in a lock-free manner, so low level drivers should
+ * keep track of the initial console (if applicable) and may need to
+ * perform direct, unlocked hardware writes in these hooks.
  */
 
 struct fb_ops {
@@ -676,6 +685,10 @@ struct fb_ops {
 
 	/* teardown any resources to do with this framebuffer */
 	void (*fb_destroy)(struct fb_info *info);
+
+	/* called at KDB enter and leave time to prepare the console */
+	int (*fb_debug_enter)(struct fb_info *info);
+	int (*fb_debug_leave)(struct fb_info *info);
 };
 
 #ifdef CONFIG_FB_TILEBLITTING
-- 
cgit v1.2.3-59-g8ed1b


From 1a4240f4764ac78adbf4b0ebb49b3bd8c72ffa11 Mon Sep 17 00:00:00 2001
From: Wang Lei <wang840925@gmail.com>
Date: Wed, 4 Aug 2010 15:16:33 +0100
Subject: DNS: Separate out CIFS DNS Resolver code

Separate out the DNS resolver key type from the CIFS filesystem into its own
module so that it can be made available for general use, including the AFS
filesystem module.

This facility makes it possible for the kernel to upcall to userspace to have
it issue DNS requests, package up the replies and present them to the kernel
in a useful form.  The kernel is then able to cache the DNS replies as keys
can be retained in keyrings.

Resolver keys are of type "dns_resolver" and have a case-insensitive
description that is of the form "[<type>:]<domain_name>".  The optional <type>
indicates the particular DNS lookup and packaging that's required.  The
<domain_name> is the query to be made.

If <type> isn't given, a basic hostname to IP address lookup is made, and the
result is stored in the key in the form of a printable string consisting of a
comma-separated list of IPv4 and IPv6 addresses.

This key type is supported by userspace helpers driven from /sbin/request-key
and configured through /etc/request-key.conf.  The cifs.upcall utility is
invoked for UNC path server name to IP address resolution.

The CIFS functionality is encapsulated by the dns_resolve_unc_to_ip() function,
which is used to resolve a UNC path to an IP address for CIFS filesystem.  This
part remains in the CIFS module for now.

See the added Documentation/networking/dns_resolver.txt for more information.

Signed-off-by: Wang Lei <wang840925@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 Documentation/networking/dns_resolver.txt | 146 +++++++++++++++++++
 fs/cifs/Kconfig                           |  17 +--
 fs/cifs/cifsfs.c                          |  13 +-
 fs/cifs/dns_resolve.c                     | 229 ++++++------------------------
 fs/cifs/dns_resolve.h                     |   2 -
 include/keys/dns_resolver-type.h          |  23 +++
 include/linux/dns_resolver.h              |  34 +++++
 net/Kconfig                               |   1 +
 net/Makefile                              |   1 +
 net/dns_resolver/Kconfig                  |  27 ++++
 net/dns_resolver/Makefile                 |   7 +
 net/dns_resolver/dns_key.c                | 210 +++++++++++++++++++++++++++
 net/dns_resolver/dns_query.c              | 159 +++++++++++++++++++++
 net/dns_resolver/internal.h               |  44 ++++++
 14 files changed, 708 insertions(+), 205 deletions(-)
 create mode 100644 Documentation/networking/dns_resolver.txt
 create mode 100644 include/keys/dns_resolver-type.h
 create mode 100644 include/linux/dns_resolver.h
 create mode 100644 net/dns_resolver/Kconfig
 create mode 100644 net/dns_resolver/Makefile
 create mode 100644 net/dns_resolver/dns_key.c
 create mode 100644 net/dns_resolver/dns_query.c
 create mode 100644 net/dns_resolver/internal.h

(limited to 'include/linux')

diff --git a/Documentation/networking/dns_resolver.txt b/Documentation/networking/dns_resolver.txt
new file mode 100644
index 000000000000..d8e0ce1d38c7
--- /dev/null
+++ b/Documentation/networking/dns_resolver.txt
@@ -0,0 +1,146 @@
+			     ===================
+			     DNS Resolver Module
+			     ===================
+
+Contents:
+
+ - Overview.
+ - Compilation.
+ - Setting up.
+ - Usage.
+ - Mechanism.
+ - Debugging.
+
+
+========
+OVERVIEW
+========
+
+The DNS resolver module provides a way for kernel services to make DNS queries
+by way of requesting a key of key type dns_resolver.  These queries are
+upcalled to userspace through /sbin/request-key.
+
+These routines must be supported by userspace tools dns.upcall, cifs.upcall and
+request-key.  It is under development and does not yet provide the full feature
+set.  The features it does support include:
+
+ (*) Implements the dns_resolver key_type to contact userspace.
+
+It does not yet support the following AFS features:
+
+ (*) Dns query support for AFSDB resource record.
+
+This code is extracted from the CIFS filesystem.
+
+
+===========
+COMPILATION
+===========
+
+The module should be enabled by turning on the kernel configuration options:
+
+	CONFIG_DNS_RESOLVER	- tristate "DNS Resolver support"
+
+
+==========
+SETTING UP
+==========
+
+To set up this facility, the /etc/request-key.conf file must be altered so that
+/sbin/request-key can appropriately direct the upcalls.  For example, to handle
+basic dname to IPv4/IPv6 address resolution, the following line should be
+added:
+
+	#OP	TYPE		DESC	CO-INFO	PROGRAM ARG1 ARG2 ARG3 ...
+	#======	============	=======	=======	==========================
+	create	dns_resolver  	*	*	/usr/sbin/cifs.upcall %k
+
+To direct a query for query type 'foo', a line of the following should be added
+before the more general line given above as the first match is the one taken.
+
+	create	dns_resolver  	foo:*	*	/usr/sbin/dns.foo %k
+
+
+
+=====
+USAGE
+=====
+
+To make use of this facility, one of the following functions that are
+implemented in the module can be called after doing:
+
+	#include <linux/dns_resolver.h>
+
+ (1) int dns_query(const char *type, const char *name, size_t namelen,
+		   const char *options, char **_result, time_t *_expiry);
+
+     This is the basic access function.  It looks for a cached DNS query and if
+     it doesn't find it, it upcalls to userspace to make a new DNS query, which
+     may then be cached.  The key description is constructed as a string of the
+     form:
+
+		[<type>:]<name>
+
+     where <type> optionally specifies the particular upcall program to invoke,
+     and thus the type of query to do, and <name> specifies the string to be
+     looked up.  The default query type is a straight hostname to IP address
+     set lookup.
+
+     The name parameter is not required to be a NUL-terminated string, and its
+     length should be given by the namelen argument.
+
+     The options parameter may be NULL or it may be a set of options
+     appropriate to the query type.
+
+     The return value is a string appropriate to the query type.  For instance,
+     for the default query type it is just a list of comma-separated IPv4 and
+     IPv6 addresses.  The caller must free the result.
+
+     The length of the result string is returned on success, and a -ve error
+     code is returned otherwise.  -EKEYREJECTED will be returned if the DNS
+     lookup failed.
+
+     If _expiry is non-NULL, the expiry time (TTL) of the result will be
+     returned also.
+
+
+=========
+MECHANISM
+=========
+
+The dnsresolver module registers a key type called "dns_resolver".  Keys of
+this type are used to transport and cache DNS lookup results from userspace.
+
+When dns_query() is invoked, it calls request_key() to search the local
+keyrings for a cached DNS result.  If that fails to find one, it upcalls to
+userspace to get a new result.
+
+Upcalls to userspace are made through the request_key() upcall vector, and are
+directed by means of configuration lines in /etc/request-key.conf that tell
+/sbin/request-key what program to run to instantiate the key.
+
+The upcall handler program is responsible for querying the DNS, processing the
+result into a form suitable for passing to the keyctl_instantiate_key()
+routine.  This then passes the data to dns_resolver_instantiate() which strips
+off and processes any options included in the data, and then attaches the
+remainder of the string to the key as its payload.
+
+The upcall handler program should set the expiry time on the key to that of the
+lowest TTL of all the records it has extracted a result from.  This means that
+the key will be discarded and recreated when the data it holds has expired.
+
+dns_query() returns a copy of the value attached to the key, or an error if
+that is indicated instead.
+
+See <file:Documentation/keys-request-key.txt> for further information about
+request-key function.
+
+
+=========
+DEBUGGING
+=========
+
+Debugging messages can be turned on dynamically by writing a 1 into the
+following file:
+
+        /sys/module/dnsresolver/parameters/debug
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 5739fd7f88b4..57f0aa9f141f 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -71,14 +71,14 @@ config CIFS_WEAK_PW_HASH
 	  If unsure, say N.
 
 config CIFS_UPCALL
-	  bool "Kerberos/SPNEGO advanced session setup"
-	  depends on CIFS && KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which accesses
-	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-	    Kerberos tickets which are needed to mount to certain secure servers
-	    (for which more secure Kerberos authentication is required). If
-	    unsure, say N.
+	bool "Kerberos/SPNEGO advanced session setup"
+	depends on CIFS && KEYS
+	select DNS_RESOLVER
+	help
+	  Enables an upcall mechanism for CIFS which accesses userspace helper
+	  utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
+	  which are needed to mount to certain secure servers (for which more
+	  secure Kerberos authentication is required). If unsure, say N.
 
 config CIFS_XATTR
         bool "CIFS extended attributes"
@@ -122,6 +122,7 @@ config CIFS_DEBUG2
 config CIFS_DFS_UPCALL
 	  bool "DFS feature support"
 	  depends on CIFS && KEYS
+	  select DNS_RESOLVER
 	  help
 	    Distributed File System (DFS) support is used to access shares
 	    transparently in an enterprise name space, even if the share
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8a2cf129e535..2a0c892959f4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -45,7 +45,6 @@
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
 #include <linux/key-type.h>
-#include "dns_resolve.h"
 #include "cifs_spnego.h"
 #include "fscache.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
@@ -933,23 +932,14 @@ init_cifs(void)
 	rc = register_key_type(&cifs_spnego_key_type);
 	if (rc)
 		goto out_unregister_filesystem;
-#endif
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	rc = cifs_init_dns_resolver();
-	if (rc)
-		goto out_unregister_key_type;
 #endif
 	rc = slow_work_register_user(THIS_MODULE);
 	if (rc)
-		goto out_unregister_resolver_key;
+		goto out_unregister_key_type;
 
 	return 0;
 
- out_unregister_resolver_key:
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	cifs_exit_dns_resolver();
  out_unregister_key_type:
-#endif
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
  out_unregister_filesystem:
@@ -976,7 +966,6 @@ exit_cifs(void)
 	cifs_fscache_unregister();
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	cifs_dfs_release_automount_timer();
-	cifs_exit_dns_resolver();
 #endif
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index aa967e7917f8..0eb87026cad3 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -4,6 +4,8 @@
  *   Copyright (c) 2007 Igor Mammedov
  *   Author(s): Igor Mammedov (niallain@gmail.com)
  *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
  *
  *   Contains the CIFS DFS upcall routines used for hostname to
  *   IP address translation.
@@ -24,212 +26,73 @@
  */
 
 #include <linux/slab.h>
-#include <linux/keyctl.h>
-#include <linux/key-type.h>
-#include <keys/user-type.h>
+#include <linux/dns_resolver.h>
 #include "dns_resolve.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
-static const struct cred *dns_resolver_cache;
-
-/* Checks if supplied name is IP address
- * returns:
- * 		1 - name is IP
- * 		0 - name is not IP
- */
-static int
-is_ip(const char *name, int len)
-{
-	struct sockaddr_storage ss;
-
-	return cifs_convert_address((struct sockaddr *)&ss, name, len);
-}
-
-static int
-dns_resolver_instantiate(struct key *key, const void *data,
-		size_t datalen)
-{
-	int rc = 0;
-	char *ip;
-
-	/* make sure this looks like an address */
-	if (!is_ip(data, datalen))
-		return -EINVAL;
-
-	ip = kmalloc(datalen + 1, GFP_KERNEL);
-	if (!ip)
-		return -ENOMEM;
-
-	memcpy(ip, data, datalen);
-	ip[datalen] = '\0';
-
-	key->type_data.x[0] = datalen;
-	key->payload.data = ip;
-
-	return rc;
-}
-
-static void
-dns_resolver_destroy(struct key *key)
-{
-	kfree(key->payload.data);
-}
-
-struct key_type key_type_dns_resolver = {
-	.name        = "dns_resolver",
-	.def_datalen = sizeof(struct in_addr),
-	.describe    = user_describe,
-	.instantiate = dns_resolver_instantiate,
-	.destroy     = dns_resolver_destroy,
-	.match       = user_match,
-};
-
-/* Resolves server name to ip address.
- * input:
- * 	unc - server UNC
- * output:
- * 	*ip_addr - pointer to server ip, caller responcible for freeing it.
- * return the length of the returned string on success
+/**
+ * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
+ * @unc: UNC path specifying the server
+ * @ip_addr: Where to return the IP address.
+ *
+ * The IP address will be returned in string form, and the caller is
+ * responsible for freeing it.
+ *
+ * Returns length of result on success, -ve on error.
  */
 int
 dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 {
-	const struct cred *saved_cred;
-	int rc = -EAGAIN;
-	struct key *rkey = ERR_PTR(-EAGAIN);
+	struct sockaddr_storage ss;
+	const char *hostname, *sep;
 	char *name;
-	char *data = NULL;
-	int len;
+	int len, rc;
 
 	if (!ip_addr || !unc)
 		return -EINVAL;
 
-	/* search for server name delimiter */
 	len = strlen(unc);
 	if (len < 3) {
 		cFYI(1, "%s: unc is too short: %s", __func__, unc);
 		return -EINVAL;
 	}
-	len -= 2;
-	name = memchr(unc+2, '\\', len);
-	if (!name) {
-		cFYI(1, "%s: probably server name is whole unc: %s",
-					__func__, unc);
-	} else {
-		len = (name - unc) - 2/* leading // */;
-	}
-
-	name = kmalloc(len+1, GFP_KERNEL);
-	if (!name) {
-		rc = -ENOMEM;
-		return rc;
-	}
-	memcpy(name, unc+2, len);
-	name[len] = 0;
-
-	if (is_ip(name, len)) {
-		cFYI(1, "%s: it is IP, skipping dns upcall: %s",
-					__func__, name);
-		data = name;
-		goto skip_upcall;
-	}
 
-	saved_cred = override_creds(dns_resolver_cache);
-	rkey = request_key(&key_type_dns_resolver, name, "");
-	revert_creds(saved_cred);
-	if (!IS_ERR(rkey)) {
-		if (!(rkey->perm & KEY_USR_VIEW)) {
-			down_read(&rkey->sem);
-			rkey->perm |= KEY_USR_VIEW;
-			up_read(&rkey->sem);
-		}
-		len = rkey->type_data.x[0];
-		data = rkey->payload.data;
-	} else {
-		cERROR(1, "%s: unable to resolve: %s", __func__, name);
-		goto out;
-	}
-
-skip_upcall:
-	if (data) {
-		*ip_addr = kmalloc(len + 1, GFP_KERNEL);
-		if (*ip_addr) {
-			memcpy(*ip_addr, data, len + 1);
-			if (!IS_ERR(rkey))
-				cFYI(1, "%s: resolved: %s to %s", __func__,
-							name,
-							*ip_addr
-					);
-			rc = len;
-		} else {
-			rc = -ENOMEM;
-		}
-		if (!IS_ERR(rkey))
-			key_put(rkey);
-	}
+	/* Discount leading slashes for cifs */
+	len -= 2;
+	hostname = unc + 2;
 
-out:
-	kfree(name);
+	/* Search for server name delimiter */
+	sep = memchr(hostname, '\\', len);
+	if (sep)
+		len = sep - unc;
+	else
+		cFYI(1, "%s: probably server name is whole unc: %s",
+		     __func__, unc);
+
+	/* Try to interpret hostname as an IPv4 or IPv6 address */
+	rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
+	if (rc > 0)
+		goto name_is_IP_address;
+
+	/* Perform the upcall */
+	rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
+	if (rc < 0)
+		cERROR(1, "%s: unable to resolve: %*.*s",
+		       __func__, len, len, hostname);
+	else
+		cFYI(1, "%s: resolved: %*.*s to %s",
+		     __func__, len, len, hostname, *ip_addr);
 	return rc;
-}
 
-int __init cifs_init_dns_resolver(void)
-{
-	struct cred *cred;
-	struct key *keyring;
-	int ret;
-
-	printk(KERN_NOTICE "Registering the %s key type\n",
-	       key_type_dns_resolver.name);
-
-	/* create an override credential set with a special thread keyring in
-	 * which DNS requests are cached
-	 *
-	 * this is used to prevent malicious redirections from being installed
-	 * with add_key().
-	 */
-	cred = prepare_kernel_cred(NULL);
-	if (!cred)
+name_is_IP_address:
+	name = kmalloc(len + 1, GFP_KERNEL);
+	if (!name)
 		return -ENOMEM;
-
-	keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
-			    (KEY_POS_ALL & ~KEY_POS_SETATTR) |
-			    KEY_USR_VIEW | KEY_USR_READ,
-			    KEY_ALLOC_NOT_IN_QUOTA);
-	if (IS_ERR(keyring)) {
-		ret = PTR_ERR(keyring);
-		goto failed_put_cred;
-	}
-
-	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
-	if (ret < 0)
-		goto failed_put_key;
-
-	ret = register_key_type(&key_type_dns_resolver);
-	if (ret < 0)
-		goto failed_put_key;
-
-	/* instruct request_key() to use this special keyring as a cache for
-	 * the results it looks up */
-	cred->thread_keyring = keyring;
-	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
-	dns_resolver_cache = cred;
+	memcpy(name, hostname, len);
+	name[len] = 0;
+	cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
+	*ip_addr = name;
 	return 0;
-
-failed_put_key:
-	key_put(keyring);
-failed_put_cred:
-	put_cred(cred);
-	return ret;
-}
-
-void cifs_exit_dns_resolver(void)
-{
-	key_revoke(dns_resolver_cache->thread_keyring);
-	unregister_key_type(&key_type_dns_resolver);
-	put_cred(dns_resolver_cache);
-	printk(KERN_NOTICE "Unregistered %s key type\n",
-	       key_type_dns_resolver.name);
 }
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 5d7f291df162..d3f5d27f4d06 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,6 @@
 #define _DNS_RESOLVE_H
 
 #ifdef __KERNEL__
-extern int __init cifs_init_dns_resolver(void);
-extern void cifs_exit_dns_resolver(void);
 extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
 #endif /* KERNEL */
 
diff --git a/include/keys/dns_resolver-type.h b/include/keys/dns_resolver-type.h
new file mode 100644
index 000000000000..9284a19393aa
--- /dev/null
+++ b/include/keys/dns_resolver-type.h
@@ -0,0 +1,23 @@
+/* DNS resolver key type
+ *
+ * Copyright (C) 2010 Wang Lei. All Rights Reserved.
+ * Written by Wang Lei (wang840925@gmail.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _KEYS_DNS_RESOLVER_TYPE_H
+#define _KEYS_DNS_RESOLVER_TYPE_H
+
+#include <linux/key-type.h>
+
+extern struct key_type key_type_dns_resolver;
+
+extern int request_dns_resolver_key(const char *description,
+				    const char *callout_info,
+				    char **data);
+
+#endif /* _KEYS_DNS_RESOLVER_TYPE_H */
diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h
new file mode 100644
index 000000000000..cc92268af89a
--- /dev/null
+++ b/include/linux/dns_resolver.h
@@ -0,0 +1,34 @@
+/*
+ *   DNS Resolver upcall management for CIFS DFS and AFS
+ *   Handles host name to IP address resolution and DNS query for AFSDB RR.
+ *
+ *   Copyright (c) International Business Machines  Corp., 2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_DNS_RESOLVER_H
+#define _LINUX_DNS_RESOLVER_H
+
+#ifdef __KERNEL__
+
+extern int dns_query(const char *type, const char *name, size_t namelen,
+		     const char *options, char **_result, time_t *_expiry);
+
+#endif /* KERNEL */
+
+#endif /* _LINUX_DNS_RESOLVER_H */
diff --git a/net/Kconfig b/net/Kconfig
index e24fa0873f32..e330594d3709 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -213,6 +213,7 @@ source "net/phonet/Kconfig"
 source "net/ieee802154/Kconfig"
 source "net/sched/Kconfig"
 source "net/dcb/Kconfig"
+source "net/dns_resolver/Kconfig"
 
 config RPS
 	boolean
diff --git a/net/Makefile b/net/Makefile
index 41d420070a38..ea60fbce9b1b 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -67,3 +67,4 @@ ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
 endif
 obj-$(CONFIG_WIMAX)		+= wimax/
+obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig
new file mode 100644
index 000000000000..2ec47cb5d0d6
--- /dev/null
+++ b/net/dns_resolver/Kconfig
@@ -0,0 +1,27 @@
+#
+# Configuration for DNS Resolver
+#
+config DNS_RESOLVER
+	tristate "DNS Resolver support"
+	depends on NET && KEYS
+	help
+	  Saying Y here will include support for the DNS Resolver key type
+	  which can be used to make upcalls to perform DNS lookups in
+	  userspace.
+
+	  DNS Resolver is used to query DNS server for information.  Examples
+	  being resolving a UNC hostname element to an IP address for CIFS or
+	  performing a DNS query for AFSDB records so that AFS can locate a
+	  cell's volume location database servers.
+
+	  DNS Resolver is used by the CIFS and AFS modules, and would support
+	  samba4 later.  DNS Resolver is supported by the userspace upcall
+	  helper "/sbin/dns.resolver" via /etc/request-key.conf.
+
+	  See <file:Documentation/networking/dns_resolver.txt> for further
+	  information.
+
+	  To compile this as a module, choose M here: the module will be called
+	  dnsresolver.
+
+	  If unsure, say N.
diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile
new file mode 100644
index 000000000000..c0ef4e71dc49
--- /dev/null
+++ b/net/dns_resolver/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux DNS Resolver.
+#
+
+obj-$(CONFIG_DNS_RESOLVER) += dns_resolver.o
+
+dns_resolver-objs :=  dns_key.o dns_query.o
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
new file mode 100644
index 000000000000..1b1b411adcf1
--- /dev/null
+++ b/net/dns_resolver/dns_key.c
@@ -0,0 +1,210 @@
+/* Key type used to cache DNS lookups made by the kernel
+ *
+ * See Documentation/networking/dns_resolver.txt
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/keyctl.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("DNS Resolver");
+MODULE_AUTHOR("Wang Lei");
+MODULE_LICENSE("GPL");
+
+unsigned dns_resolver_debug;
+module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "DNS Resolver debugging mask");
+
+const struct cred *dns_resolver_cache;
+
+/*
+ * Instantiate a user defined key for dns_resolver.
+ *
+ * The data must be a NUL-terminated string, with the NUL char accounted in
+ * datalen.
+ *
+ * If the data contains a '#' characters, then we take the clause after each
+ * one to be an option of the form 'key=value'.  The actual data of interest is
+ * the string leading up to the first '#'.  For instance:
+ *
+ *        "ip1,ip2,...#foo=bar"
+ */
+static int
+dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen)
+{
+	struct user_key_payload *upayload;
+	int ret;
+	size_t result_len = 0;
+	const char *data = _data, *opt;
+
+	kenter("%%%d,%s,'%s',%zu",
+	       key->serial, key->description, data, datalen);
+
+	if (datalen <= 1 || !data || data[datalen - 1] != '\0')
+		return -EINVAL;
+	datalen--;
+
+	/* deal with any options embedded in the data */
+	opt = memchr(data, '#', datalen);
+	if (!opt) {
+		kdebug("no options currently supported");
+		return -EINVAL;
+	}
+
+	result_len = datalen;
+	ret = key_payload_reserve(key, result_len);
+	if (ret < 0)
+		return -EINVAL;
+
+	upayload = kmalloc(sizeof(*upayload) + result_len + 1, GFP_KERNEL);
+	if (!upayload) {
+		kleave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	upayload->datalen = result_len;
+	memcpy(upayload->data, data, result_len);
+	upayload->data[result_len] = '\0';
+	rcu_assign_pointer(key->payload.data, upayload);
+
+	kleave(" = 0");
+	return 0;
+}
+
+/*
+ * The description is of the form "[<type>:]<domain_name>"
+ *
+ * The domain name may be a simple name or an absolute domain name (which
+ * should end with a period).  The domain name is case-independent.
+ */
+static int
+dns_resolver_match(const struct key *key, const void *description)
+{
+	int slen, dlen, ret = 0;
+	const char *src = key->description, *dsp = description;
+
+	kenter("%s,%s", src, dsp);
+
+	if (!src || !dsp)
+		goto no_match;
+
+	if (strcasecmp(src, dsp) == 0)
+		goto matched;
+
+	slen = strlen(src);
+	dlen = strlen(dsp);
+	if (slen <= 0 || dlen <= 0)
+		goto no_match;
+	if (src[slen - 1] == '.')
+		slen--;
+	if (dsp[dlen - 1] == '.')
+		dlen--;
+	if (slen != dlen || strncasecmp(src, dsp, slen) != 0)
+		goto no_match;
+
+matched:
+	ret = 1;
+no_match:
+	kleave(" = %d", ret);
+	return ret;
+}
+
+struct key_type key_type_dns_resolver = {
+	.name		= "dns_resolver",
+	.instantiate	= dns_resolver_instantiate,
+	.match		= dns_resolver_match,
+	.revoke		= user_revoke,
+	.destroy	= user_destroy,
+	.describe	= user_describe,
+	.read		= user_read,
+};
+
+static int __init init_dns_resolver(void)
+{
+	struct cred *cred;
+	struct key *keyring;
+	int ret;
+
+	printk(KERN_NOTICE "Registering the %s key type\n",
+	       key_type_dns_resolver.name);
+
+	/* create an override credential set with a special thread keyring in
+	 * which DNS requests are cached
+	 *
+	 * this is used to prevent malicious redirections from being installed
+	 * with add_key().
+	 */
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
+			    (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			    KEY_USR_VIEW | KEY_USR_READ,
+			    KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto failed_put_cred;
+	}
+
+	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+	if (ret < 0)
+		goto failed_put_key;
+
+	ret = register_key_type(&key_type_dns_resolver);
+	if (ret < 0)
+		goto failed_put_key;
+
+	/* instruct request_key() to use this special keyring as a cache for
+	 * the results it looks up */
+	cred->thread_keyring = keyring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	dns_resolver_cache = cred;
+
+	kdebug("DNS resolver keyring: %d\n", key_serial(keyring));
+	return 0;
+
+failed_put_key:
+	key_put(keyring);
+failed_put_cred:
+	put_cred(cred);
+	return ret;
+}
+
+static void __exit exit_dns_resolver(void)
+{
+	key_revoke(dns_resolver_cache->thread_keyring);
+	unregister_key_type(&key_type_dns_resolver);
+	put_cred(dns_resolver_cache);
+	printk(KERN_NOTICE "Unregistered %s key type\n",
+	       key_type_dns_resolver.name);
+}
+
+module_init(init_dns_resolver)
+module_exit(exit_dns_resolver)
+MODULE_LICENSE("GPL");
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
new file mode 100644
index 000000000000..6c0cf31ea00d
--- /dev/null
+++ b/net/dns_resolver/dns_query.c
@@ -0,0 +1,159 @@
+/* Upcall routine, designed to work as a key type and working through
+ * /sbin/request-key to contact userspace when handling DNS queries.
+ *
+ * See Documentation/networking/dns_resolver.txt
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
+ *
+ *   The upcall wrapper used to make an arbitrary DNS query.
+ *
+ *   This function requires the appropriate userspace tool dns.upcall to be
+ *   installed and something like the following lines should be added to the
+ *   /etc/request-key.conf file:
+ *
+ *	create dns_resolver * * /sbin/dns.upcall %k
+ *
+ *   For example to use this module to query AFSDB RR:
+ *
+ *	create dns_resolver afsdb:* * /sbin/dns.afsdb %k
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dns_resolver.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+
+#include "internal.h"
+
+/*
+ * dns_query - Query the DNS
+ * @type: Query type (or NULL for straight host->IP lookup)
+ * @name: Name to look up
+ * @namelen: Length of name
+ * @options: Request options (or NULL if no options)
+ * @_result: Where to place the returned data.
+ * @_expiry: Where to store the result expiry time (or NULL)
+ *
+ * The data will be returned in the pointer at *result, and the caller is
+ * responsible for freeing it.
+ *
+ * The description should be of the form "[<query_type>:]<domain_name>", and
+ * the options need to be appropriate for the query type requested.  If no
+ * query_type is given, then the query is a straight hostname to IP address
+ * lookup.
+ *
+ * The DNS resolution lookup is performed by upcalling to userspace by way of
+ * requesting a key of type dns_resolver.
+ *
+ * Returns the size of the result on success, -ve error code otherwise.
+ */
+int dns_query(const char *type, const char *name, size_t namelen,
+	      const char *options, char **_result, time_t *_expiry)
+{
+	struct key *rkey;
+	struct user_key_payload *upayload;
+	const struct cred *saved_cred;
+	size_t typelen, desclen;
+	char *desc, *cp;
+	int ret, len;
+
+	kenter("%s,%*.*s,%zu,%s",
+	       type, (int)namelen, (int)namelen, name, namelen, options);
+
+	if (!name || namelen == 0 || !_result)
+		return -EINVAL;
+
+	/* construct the query key description as "[<type>:]<name>" */
+	typelen = 0;
+	desclen = 0;
+	if (type) {
+		typelen = strlen(type);
+		if (typelen < 1)
+			return -EINVAL;
+		desclen += typelen + 1;
+	}
+
+	if (!namelen)
+		namelen = strlen(name);
+	if (namelen < 3)
+		return -EINVAL;
+	desclen += namelen + 1;
+
+	desc = kmalloc(desclen, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	cp = desc;
+	if (type) {
+		memcpy(cp, type, typelen);
+		cp += typelen;
+		*cp++ = ':';
+	}
+	memcpy(cp, name, namelen);
+	cp += namelen;
+	*cp = '\0';
+
+	if (!options)
+		options = "";
+	kdebug("call request_key(,%s,%s)", desc, options);
+
+	/* make the upcall, using special credentials to prevent the use of
+	 * add_key() to preinstall malicious redirections
+	 */
+	saved_cred = override_creds(dns_resolver_cache);
+	rkey = request_key(&key_type_dns_resolver, desc, options);
+	revert_creds(saved_cred);
+	kfree(desc);
+	if (IS_ERR(rkey)) {
+		ret = PTR_ERR(rkey);
+		goto out;
+	}
+
+	down_read(&rkey->sem);
+	rkey->perm |= KEY_USR_VIEW;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto put;
+
+	upayload = rcu_dereference_protected(rkey->payload.data,
+					     lockdep_is_held(&rkey->sem));
+	len = upayload->datalen;
+
+	ret = -ENOMEM;
+	*_result = kmalloc(len + 1, GFP_KERNEL);
+	if (!*_result)
+		goto put;
+
+	memcpy(*_result, upayload->data, len + 1);
+	if (_expiry)
+		*_expiry = rkey->expiry;
+
+	ret = len;
+put:
+	up_read(&rkey->sem);
+	key_put(rkey);
+out:
+	kleave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(dns_query);
diff --git a/net/dns_resolver/internal.h b/net/dns_resolver/internal.h
new file mode 100644
index 000000000000..189ca9e9b785
--- /dev/null
+++ b/net/dns_resolver/internal.h
@@ -0,0 +1,44 @@
+/*
+ *   Copyright (c) 2010 Wang Lei
+ *   Author(s): Wang Lei (wang840925@gmail.com). All Rights Reserved.
+ *
+ *   Internal DNS Rsolver stuff
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+/*
+ * dns_key.c
+ */
+extern const struct cred *dns_resolver_cache;
+
+/*
+ * debug tracing
+ */
+extern unsigned dns_resolver_debug;
+
+#define	kdebug(FMT, ...)				\
+do {							\
+	if (unlikely(dns_resolver_debug))		\
+		printk(KERN_DEBUG "[%-6.6s] "FMT"\n",	\
+		       current->comm, ##__VA_ARGS__);	\
+} while (0)
+
+#define kenter(FMT, ...) kdebug("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) kdebug("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-- 
cgit v1.2.3-59-g8ed1b


From cc7447a5fa92759b0856d6a83ba2539c6a94e67e Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 16 Jun 2010 11:44:18 +0200
Subject: Driver core: Drop __must_check from bus_for_each_drv()

There is little rationale for marking bus_for_each_drv() __must_check.
It is more of an iteration helper than a real function. You don't know
in advance which callback it will be used on, so you have no clue how
important it can be to check the returned value. In practice, this
helper function can be used for best-effort tasks.

As a matter of fact, bus_for_each_dev() is not marked __must_check.
So remove it from bus_for_each_drv() as well. This is the same that
was done back in October 2006 by Russell King for
device_for_each_child(), for exactly the same reasons.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/device.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 6a8276f683b6..ddffdf7da393 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -84,9 +84,8 @@ struct device *bus_find_device_by_name(struct bus_type *bus,
 				       struct device *start,
 				       const char *name);
 
-int __must_check bus_for_each_drv(struct bus_type *bus,
-				  struct device_driver *start, void *data,
-				  int (*fn)(struct device_driver *, void *));
+int bus_for_each_drv(struct bus_type *bus, struct device_driver *start,
+		     void *data, int (*fn)(struct device_driver *, void *));
 
 void bus_sort_breadthfirst(struct bus_type *bus,
 			   int (*compare)(const struct device *a,
-- 
cgit v1.2.3-59-g8ed1b


From 44f28bdea09415d40b4d73a7668db5961362ec53 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 21 Jun 2010 16:11:44 +0200
Subject: Driver core: reduce duplicated code for platform_device creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This makes the two similar functions platform_device_register_simple
and platform_device_register_data one line inline functions using a new
generic function platform_device_register_resndata.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/DocBook/device-drivers.tmpl |   1 +
 drivers/base/platform.c                   | 104 ++++++++----------------------
 include/linux/platform_device.h           |  62 ++++++++++++++++--
 3 files changed, 85 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl
index 1b2dd4fc3db2..ecd35e9d4410 100644
--- a/Documentation/DocBook/device-drivers.tmpl
+++ b/Documentation/DocBook/device-drivers.tmpl
@@ -111,6 +111,7 @@ X!Edrivers/base/attribute_container.c
 <!--
 X!Edrivers/base/interface.c
 -->
+!Iinclude/linux/platform_device.h
 !Edrivers/base/platform.c
 !Edrivers/base/bus.c
      </sect1>
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 26eb69d88eb6..ffcfd73fe8a6 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -344,108 +344,56 @@ void platform_device_unregister(struct platform_device *pdev)
 EXPORT_SYMBOL_GPL(platform_device_unregister);
 
 /**
- * platform_device_register_simple - add a platform-level device and its resources
- * @name: base name of the device we're adding
- * @id: instance id
- * @res: set of resources that needs to be allocated for the device
- * @num: number of resources
- *
- * This function creates a simple platform device that requires minimal
- * resource and memory management. Canned release function freeing memory
- * allocated for the device allows drivers using such devices to be
- * unloaded without waiting for the last reference to the device to be
- * dropped.
+ * platform_device_register_resndata - add a platform-level device with
+ * resources and platform-specific data
  *
- * This interface is primarily intended for use with legacy drivers which
- * probe hardware directly.  Because such drivers create sysfs device nodes
- * themselves, rather than letting system infrastructure handle such device
- * enumeration tasks, they don't fully conform to the Linux driver model.
- * In particular, when such drivers are built as modules, they can't be
- * "hotplugged".
- *
- * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
- */
-struct platform_device *platform_device_register_simple(const char *name,
-							int id,
-							const struct resource *res,
-							unsigned int num)
-{
-	struct platform_device *pdev;
-	int retval;
-
-	pdev = platform_device_alloc(name, id);
-	if (!pdev) {
-		retval = -ENOMEM;
-		goto error;
-	}
-
-	if (num) {
-		retval = platform_device_add_resources(pdev, res, num);
-		if (retval)
-			goto error;
-	}
-
-	retval = platform_device_add(pdev);
-	if (retval)
-		goto error;
-
-	return pdev;
-
-error:
-	platform_device_put(pdev);
-	return ERR_PTR(retval);
-}
-EXPORT_SYMBOL_GPL(platform_device_register_simple);
-
-/**
- * platform_device_register_data - add a platform-level device with platform-specific data
  * @parent: parent device for the device we're adding
  * @name: base name of the device we're adding
  * @id: instance id
+ * @res: set of resources that needs to be allocated for the device
+ * @num: number of resources
  * @data: platform specific data for this platform device
  * @size: size of platform specific data
  *
- * This function creates a simple platform device that requires minimal
- * resource and memory management. Canned release function freeing memory
- * allocated for the device allows drivers using such devices to be
- * unloaded without waiting for the last reference to the device to be
- * dropped.
- *
  * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
  */
-struct platform_device *platform_device_register_data(
+struct platform_device *platform_device_register_resndata(
 		struct device *parent,
 		const char *name, int id,
+		const struct resource *res, unsigned int num,
 		const void *data, size_t size)
 {
+	int ret = -ENOMEM;
 	struct platform_device *pdev;
-	int retval;
 
 	pdev = platform_device_alloc(name, id);
-	if (!pdev) {
-		retval = -ENOMEM;
-		goto error;
-	}
+	if (!pdev)
+		goto err;
 
 	pdev->dev.parent = parent;
 
-	if (size) {
-		retval = platform_device_add_data(pdev, data, size);
-		if (retval)
-			goto error;
+	if (res) {
+		ret = platform_device_add_resources(pdev, res, num);
+		if (ret)
+			goto err;
 	}
 
-	retval = platform_device_add(pdev);
-	if (retval)
-		goto error;
+	if (data) {
+		ret = platform_device_add_data(pdev, data, size);
+		if (ret)
+			goto err;
+	}
 
-	return pdev;
+	ret = platform_device_add(pdev);
+	if (ret) {
+err:
+		platform_device_put(pdev);
+		return ERR_PTR(ret);
+	}
 
-error:
-	platform_device_put(pdev);
-	return ERR_PTR(retval);
+	return pdev;
 }
-EXPORT_SYMBOL_GPL(platform_device_register_data);
+EXPORT_SYMBOL_GPL(platform_device_register_resndata);
 
 static int platform_drv_probe(struct device *_dev)
 {
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 5417944d3687..d7ecad0093bb 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -43,10 +43,64 @@ extern struct resource *platform_get_resource_byname(struct platform_device *, u
 extern int platform_get_irq_byname(struct platform_device *, const char *);
 extern int platform_add_devices(struct platform_device **, int);
 
-extern struct platform_device *platform_device_register_simple(const char *, int id,
-					const struct resource *, unsigned int);
-extern struct platform_device *platform_device_register_data(struct device *,
-		const char *, int, const void *, size_t);
+extern struct platform_device *platform_device_register_resndata(
+		struct device *parent, const char *name, int id,
+		const struct resource *res, unsigned int num,
+		const void *data, size_t size);
+
+/**
+ * platform_device_register_simple - add a platform-level device and its resources
+ * @name: base name of the device we're adding
+ * @id: instance id
+ * @res: set of resources that needs to be allocated for the device
+ * @num: number of resources
+ *
+ * This function creates a simple platform device that requires minimal
+ * resource and memory management. Canned release function freeing memory
+ * allocated for the device allows drivers using such devices to be
+ * unloaded without waiting for the last reference to the device to be
+ * dropped.
+ *
+ * This interface is primarily intended for use with legacy drivers which
+ * probe hardware directly.  Because such drivers create sysfs device nodes
+ * themselves, rather than letting system infrastructure handle such device
+ * enumeration tasks, they don't fully conform to the Linux driver model.
+ * In particular, when such drivers are built as modules, they can't be
+ * "hotplugged".
+ *
+ * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
+ */
+static inline struct platform_device *platform_device_register_simple(
+		const char *name, int id,
+		const struct resource *res, unsigned int num)
+{
+	return platform_device_register_resndata(NULL, name, id,
+			res, num, NULL, 0);
+}
+
+/**
+ * platform_device_register_data - add a platform-level device with platform-specific data
+ * @parent: parent device for the device we're adding
+ * @name: base name of the device we're adding
+ * @id: instance id
+ * @data: platform specific data for this platform device
+ * @size: size of platform specific data
+ *
+ * This function creates a simple platform device that requires minimal
+ * resource and memory management. Canned release function freeing memory
+ * allocated for the device allows drivers using such devices to be
+ * unloaded without waiting for the last reference to the device to be
+ * dropped.
+ *
+ * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
+ */
+static inline struct platform_device *platform_device_register_data(
+		struct device *parent, const char *name, int id,
+		const void *data, size_t size)
+{
+	return platform_device_register_resndata(parent, name, id,
+			NULL, 0, data, size);
+}
 
 extern struct platform_device *platform_device_alloc(const char *name, int id);
 extern int platform_device_add_resources(struct platform_device *pdev,
-- 
cgit v1.2.3-59-g8ed1b


From 49c19400f60bbe362202d7e7b3e68cc66040d0fa Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Fri, 2 Jul 2010 16:54:05 +0200
Subject: sysfs: sysfs_chmod_file's attr can be const

sysfs_chmod_file doesn't change the attribute it operates on, so this
attribute can be marked const.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c       | 3 ++-
 include/linux/sysfs.h | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1beaa739d0a6..1b27b5688f62 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -593,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
  * @mode: file permissions.
  *
  */
-int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
+int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
+		     mode_t mode)
 {
 	struct sysfs_dirent *sd;
 	struct iattr newattrs;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index f2694eb4dd3d..8bf06b64487c 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -136,8 +136,8 @@ int __must_check sysfs_create_file(struct kobject *kobj,
 				   const struct attribute *attr);
 int __must_check sysfs_create_files(struct kobject *kobj,
 				   const struct attribute **attr);
-int __must_check sysfs_chmod_file(struct kobject *kobj, struct attribute *attr,
-				  mode_t mode);
+int __must_check sysfs_chmod_file(struct kobject *kobj,
+				  const struct attribute *attr, mode_t mode);
 void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr);
 void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr);
 
@@ -225,7 +225,7 @@ static inline int sysfs_create_files(struct kobject *kobj,
 }
 
 static inline int sysfs_chmod_file(struct kobject *kobj,
-				   struct attribute *attr, mode_t mode)
+				   const struct attribute *attr, mode_t mode)
 {
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 45daef0fdcc44f6af86fdebc4fc7eb7c79375398 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@opensource.se>
Date: Fri, 23 Jul 2010 19:56:18 +0900
Subject: Driver core: Add BUS_NOTIFY_BIND_DRIVER

Add BUS_NOTIFY_BIND_DRIVER as a bus notifier event.

For driver binding/unbinding we with this in
place have the following bus notifier events:
 - BUS_NOTIFY_BIND_DRIVER - before ->probe()
 - BUS_NOTIFY_BOUND_DRIVER - after ->probe()
 - BUS_NOTIFY_UNBIND_DRIVER - before ->remove()
 - BUS_NOTIFY_UNBOUND_DRIVER - after ->remove()

The event BUS_NOTIFY_BIND_DRIVER allows bus code
to be notified that ->probe() is about to be called.

Useful for bus code that needs to setup hardware before
the driver gets to run. With this in place platform
drivers can be loaded and unloaded as modules and the
new BIND event allows bus code to control for instance
device clocks that must be enabled before the driver
can be executed.

Without this patch there is no way for the bus code to
get notified that a modular driver is about to be probed.

Signed-off-by: Magnus Damm <damm@opensource.se>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/dd.c      | 4 ++++
 include/linux/device.h | 8 +++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 503c2620bbcc..da57ee9d63fe 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -51,6 +51,10 @@ static int driver_sysfs_add(struct device *dev)
 {
 	int ret;
 
+	if (dev->bus)
+		blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
+					     BUS_NOTIFY_BIND_DRIVER, dev);
+
 	ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj,
 			  kobject_name(&dev->kobj));
 	if (ret == 0) {
diff --git a/include/linux/device.h b/include/linux/device.h
index ddffdf7da393..0ca24e93304f 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -109,10 +109,12 @@ extern int bus_unregister_notifier(struct bus_type *bus,
  */
 #define BUS_NOTIFY_ADD_DEVICE		0x00000001 /* device added */
 #define BUS_NOTIFY_DEL_DEVICE		0x00000002 /* device removed */
-#define BUS_NOTIFY_BOUND_DRIVER		0x00000003 /* driver bound to device */
-#define BUS_NOTIFY_UNBIND_DRIVER	0x00000004 /* driver about to be
+#define BUS_NOTIFY_BIND_DRIVER		0x00000003 /* driver about to be
+						      bound */
+#define BUS_NOTIFY_BOUND_DRIVER		0x00000004 /* driver bound to device */
+#define BUS_NOTIFY_UNBIND_DRIVER	0x00000005 /* driver about to be
 						      unbound */
-#define BUS_NOTIFY_UNBOUND_DRIVER	0x00000005 /* driver is unbound
+#define BUS_NOTIFY_UNBOUND_DRIVER	0x00000006 /* driver is unbound
 						      from the device */
 
 extern struct kset *bus_get_kset(struct bus_type *bus);
-- 
cgit v1.2.3-59-g8ed1b


From 6fd69dc578fa0b1bbc3aad70ae3af9a137211707 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <guenter.roeck@ericsson.com>
Date: Wed, 28 Jul 2010 22:09:26 -0700
Subject: sysfs: Remove owner field from sysfs struct attribute

Signed-off-by: Guenter Roeck <guenter.roeck@ericsson.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/sysfs.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 8bf06b64487c..3c92121ba9af 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -22,14 +22,8 @@ struct kobject;
 struct module;
 enum kobj_ns_type;
 
-/* FIXME
- * The *owner field is no longer used.
- * x86 tree has been cleaned up. The owner
- * attribute is still left for other arches.
- */
 struct attribute {
 	const char		*name;
-	struct module		*owner;
 	mode_t			mode;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lock_class_key	*key;
-- 
cgit v1.2.3-59-g8ed1b


From 6937e8f8c0135f2325194c372ada6dc655499992 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 5 Aug 2010 17:38:18 +0200
Subject: driver core: device_rename's new_name can be const

The new_name argument to device_rename() can be
const as kobject_rename's new_name argument is.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/core.c    | 2 +-
 include/linux/device.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index f8e72724dd4b..d1b2c9adc271 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1599,7 +1599,7 @@ EXPORT_SYMBOL_GPL(device_destroy);
  * on the same device to ensure that new_name is valid and
  * won't conflict with other devices.
  */
-int device_rename(struct device *dev, char *new_name)
+int device_rename(struct device *dev, const char *new_name)
 {
 	char *old_class_name = NULL;
 	char *new_class_name = NULL;
diff --git a/include/linux/device.h b/include/linux/device.h
index 0ca24e93304f..516fecacf27b 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -552,7 +552,7 @@ extern int device_for_each_child(struct device *dev, void *data,
 		     int (*fn)(struct device *dev, void *data));
 extern struct device *device_find_child(struct device *dev, void *data,
 				int (*match)(struct device *dev, void *data));
-extern int device_rename(struct device *dev, char *new_name);
+extern int device_rename(struct device *dev, const char *new_name);
 extern int device_move(struct device *dev, struct device *new_parent,
 		       enum dpm_order dpm_order);
 extern const char *device_get_devnode(struct device *dev,
-- 
cgit v1.2.3-59-g8ed1b


From 8ae664184c45def51ff0b61d4bd6c6671db6cb4f Mon Sep 17 00:00:00 2001
From: Stefani Seibold <stefani@seibold.net>
Date: Thu, 5 Aug 2010 09:19:26 +0200
Subject: mtd: change struct flchip_shared spinlock locking into mutex

This patch prevent to schedule while atomic by changing the
flchip_shared spinlock into a mutex. This should be save since no atomic
path will use this lock.

It was suggested by Arnd Bergmann and Vasiliy Kulikov.

Signed-off-by: Stefani Seibold <stefani@seibold.net>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/chips/cfi_cmdset_0001.c | 20 ++++++++++----------
 drivers/mtd/lpddr/lpddr_cmds.c      | 20 ++++++++++----------
 include/linux/mtd/flashchip.h       |  2 +-
 3 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 2fadb0239ba3..97d5546f9ea4 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -720,7 +720,7 @@ static int cfi_intelext_partition_fixup(struct mtd_info *mtd,
 		chip = &newcfi->chips[0];
 		for (i = 0; i < cfi->numchips; i++) {
 			shared[i].writing = shared[i].erasing = NULL;
-			spin_lock_init(&shared[i].lock);
+			mutex_init(&shared[i].lock);
 			for (j = 0; j < numparts; j++) {
 				*chip = cfi->chips[i];
 				chip->start += j << partshift;
@@ -889,7 +889,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 		 */
 		struct flchip_shared *shared = chip->priv;
 		struct flchip *contender;
-		spin_lock(&shared->lock);
+		mutex_lock(&shared->lock);
 		contender = shared->writing;
 		if (contender && contender != chip) {
 			/*
@@ -902,7 +902,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 			 * get_chip returns success we're clear to go ahead.
 			 */
 			ret = mutex_trylock(&contender->mutex);
-			spin_unlock(&shared->lock);
+			mutex_unlock(&shared->lock);
 			if (!ret)
 				goto retry;
 			mutex_unlock(&chip->mutex);
@@ -917,7 +917,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 				mutex_unlock(&contender->mutex);
 				return ret;
 			}
-			spin_lock(&shared->lock);
+			mutex_lock(&shared->lock);
 
 			/* We should not own chip if it is already
 			 * in FL_SYNCING state. Put contender and retry. */
@@ -933,7 +933,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 		 * on this chip. Sleep. */
 		if (mode == FL_ERASING && shared->erasing
 		    && shared->erasing->oldstate == FL_ERASING) {
-			spin_unlock(&shared->lock);
+			mutex_unlock(&shared->lock);
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			add_wait_queue(&chip->wq, &wait);
 			mutex_unlock(&chip->mutex);
@@ -947,7 +947,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 		shared->writing = chip;
 		if (mode == FL_ERASING)
 			shared->erasing = chip;
-		spin_unlock(&shared->lock);
+		mutex_unlock(&shared->lock);
 	}
 	ret = chip_ready(map, chip, adr, mode);
 	if (ret == -EAGAIN)
@@ -962,7 +962,7 @@ static void put_chip(struct map_info *map, struct flchip *chip, unsigned long ad
 
 	if (chip->priv) {
 		struct flchip_shared *shared = chip->priv;
-		spin_lock(&shared->lock);
+		mutex_lock(&shared->lock);
 		if (shared->writing == chip && chip->oldstate == FL_READY) {
 			/* We own the ability to write, but we're done */
 			shared->writing = shared->erasing;
@@ -970,7 +970,7 @@ static void put_chip(struct map_info *map, struct flchip *chip, unsigned long ad
 				/* give back ownership to who we loaned it from */
 				struct flchip *loaner = shared->writing;
 				mutex_lock(&loaner->mutex);
-				spin_unlock(&shared->lock);
+				mutex_unlock(&shared->lock);
 				mutex_unlock(&chip->mutex);
 				put_chip(map, loaner, loaner->start);
 				mutex_lock(&chip->mutex);
@@ -988,11 +988,11 @@ static void put_chip(struct map_info *map, struct flchip *chip, unsigned long ad
 			 * Don't let the switch below mess things up since
 			 * we don't have ownership to resume anything.
 			 */
-			spin_unlock(&shared->lock);
+			mutex_unlock(&shared->lock);
 			wake_up(&chip->wq);
 			return;
 		}
-		spin_unlock(&shared->lock);
+		mutex_unlock(&shared->lock);
 	}
 
 	switch(chip->oldstate) {
diff --git a/drivers/mtd/lpddr/lpddr_cmds.c b/drivers/mtd/lpddr/lpddr_cmds.c
index fece5be58715..04fdfcca93f7 100644
--- a/drivers/mtd/lpddr/lpddr_cmds.c
+++ b/drivers/mtd/lpddr/lpddr_cmds.c
@@ -98,7 +98,7 @@ struct mtd_info *lpddr_cmdset(struct map_info *map)
 	numchips = lpddr->numchips / lpddr->qinfo->HWPartsNum;
 	for (i = 0; i < numchips; i++) {
 		shared[i].writing = shared[i].erasing = NULL;
-		spin_lock_init(&shared[i].lock);
+		mutex_init(&shared[i].lock);
 		for (j = 0; j < lpddr->qinfo->HWPartsNum; j++) {
 			*chip = lpddr->chips[i];
 			chip->start += j << lpddr->chipshift;
@@ -217,7 +217,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, int mode)
 		 */
 		struct flchip_shared *shared = chip->priv;
 		struct flchip *contender;
-		spin_lock(&shared->lock);
+		mutex_lock(&shared->lock);
 		contender = shared->writing;
 		if (contender && contender != chip) {
 			/*
@@ -230,7 +230,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, int mode)
 			 * get_chip returns success we're clear to go ahead.
 			 */
 			ret = mutex_trylock(&contender->mutex);
-			spin_unlock(&shared->lock);
+			mutex_unlock(&shared->lock);
 			if (!ret)
 				goto retry;
 			mutex_unlock(&chip->mutex);
@@ -245,7 +245,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, int mode)
 				mutex_unlock(&contender->mutex);
 				return ret;
 			}
-			spin_lock(&shared->lock);
+			mutex_lock(&shared->lock);
 
 			/* We should not own chip if it is already in FL_SYNCING
 			 * state. Put contender and retry. */
@@ -261,7 +261,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, int mode)
 		   Must sleep in such a case. */
 		if (mode == FL_ERASING && shared->erasing
 		    && shared->erasing->oldstate == FL_ERASING) {
-			spin_unlock(&shared->lock);
+			mutex_unlock(&shared->lock);
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			add_wait_queue(&chip->wq, &wait);
 			mutex_unlock(&chip->mutex);
@@ -275,7 +275,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, int mode)
 		shared->writing = chip;
 		if (mode == FL_ERASING)
 			shared->erasing = chip;
-		spin_unlock(&shared->lock);
+		mutex_unlock(&shared->lock);
 	}
 
 	ret = chip_ready(map, chip, mode);
@@ -348,7 +348,7 @@ static void put_chip(struct map_info *map, struct flchip *chip)
 {
 	if (chip->priv) {
 		struct flchip_shared *shared = chip->priv;
-		spin_lock(&shared->lock);
+		mutex_lock(&shared->lock);
 		if (shared->writing == chip && chip->oldstate == FL_READY) {
 			/* We own the ability to write, but we're done */
 			shared->writing = shared->erasing;
@@ -356,7 +356,7 @@ static void put_chip(struct map_info *map, struct flchip *chip)
 				/* give back the ownership */
 				struct flchip *loaner = shared->writing;
 				mutex_lock(&loaner->mutex);
-				spin_unlock(&shared->lock);
+				mutex_unlock(&shared->lock);
 				mutex_unlock(&chip->mutex);
 				put_chip(map, loaner);
 				mutex_lock(&chip->mutex);
@@ -374,11 +374,11 @@ static void put_chip(struct map_info *map, struct flchip *chip)
 			 * Don't let the switch below mess things up since
 			 * we don't have ownership to resume anything.
 			 */
-			spin_unlock(&shared->lock);
+			mutex_unlock(&shared->lock);
 			wake_up(&chip->wq);
 			return;
 		}
-		spin_unlock(&shared->lock);
+		mutex_unlock(&shared->lock);
 	}
 
 	switch (chip->oldstate) {
diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h
index f43e9b49b751..23cc10f8e343 100644
--- a/include/linux/mtd/flashchip.h
+++ b/include/linux/mtd/flashchip.h
@@ -92,7 +92,7 @@ struct flchip {
 /* This is used to handle contention on write/erase operations
    between partitions of the same physical chip. */
 struct flchip_shared {
-	spinlock_t lock;
+	struct mutex lock;
 	struct flchip *writing;
 	struct flchip *erasing;
 };
-- 
cgit v1.2.3-59-g8ed1b


From 2dc11581376829303b98eadb2de253bee065a56a Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Fri, 6 Aug 2010 09:25:50 -0600
Subject: of/device: Replace struct of_device with struct platform_device

of_device is just an alias for platform_device, so remove it entirely.  Also
replace to_of_device() with to_platform_device() and update comment blocks.

This patch was initially generated from the following semantic patch, and then
edited by hand to pick up the bits that coccinelle didn't catch.

@@
@@
-struct of_device
+struct platform_device

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Reviewed-by: David S. Miller <davem@davemloft.net>
---
 arch/powerpc/platforms/83xx/mpc837x_mds.c     |  2 +-
 arch/powerpc/platforms/83xx/mpc837x_rdb.c     |  2 +-
 arch/powerpc/sysdev/fsl_rio.c                 |  2 +-
 drivers/ata/pata_mpc52xx.c                    |  8 ++---
 drivers/ata/pata_of_platform.c                |  4 +--
 drivers/ata/sata_fsl.c                        |  8 ++---
 drivers/atm/fore200e.c                        | 26 ++++++++--------
 drivers/block/xsysace.c                       |  4 +--
 drivers/char/hw_random/n2-drv.c               |  4 +--
 drivers/char/hw_random/n2rng.h                |  2 +-
 drivers/char/hw_random/pasemi-rng.c           |  4 +--
 drivers/char/ipmi/ipmi_si_intf.c              |  4 +--
 drivers/char/rtc.c                            |  2 +-
 drivers/char/xilinx_hwicap/xilinx_hwicap.c    |  4 +--
 drivers/crypto/amcc/crypto4xx_core.c          |  4 +--
 drivers/crypto/amcc/crypto4xx_core.h          |  2 +-
 drivers/crypto/n2_core.c                      | 22 +++++++-------
 drivers/crypto/talitos.c                      |  6 ++--
 drivers/dma/fsldma.c                          |  4 +--
 drivers/dma/mpc512x_dma.c                     |  4 +--
 drivers/dma/ppc4xx/adma.c                     |  8 ++---
 drivers/edac/mpc85xx_edac.c                   | 12 ++++----
 drivers/edac/ppc4xx_edac.c                    | 12 ++++----
 drivers/hwmon/ams/ams.h                       |  2 +-
 drivers/hwmon/ultra45_env.c                   |  4 +--
 drivers/i2c/busses/i2c-cpm.c                  |  8 ++---
 drivers/i2c/busses/i2c-ibm_iic.c              |  6 ++--
 drivers/i2c/busses/i2c-mpc.c                  |  4 +--
 drivers/infiniband/hw/ehca/ehca_classes.h     |  2 +-
 drivers/infiniband/hw/ehca/ehca_main.c        |  4 +--
 drivers/input/misc/sparcspkr.c                | 10 +++---
 drivers/input/serio/i8042-sparcio.h           |  8 ++---
 drivers/input/serio/xilinx_ps2.c              |  4 +--
 drivers/leds/leds-gpio.c                      |  4 +--
 drivers/macintosh/macio_sysfs.c               |  6 ++--
 drivers/macintosh/smu.c                       |  6 ++--
 drivers/macintosh/therm_adt746x.c             |  2 +-
 drivers/macintosh/therm_pm72.c                |  6 ++--
 drivers/macintosh/therm_windtunnel.c          |  6 ++--
 drivers/media/video/fsl-viu.c                 |  8 ++---
 drivers/mmc/host/sdhci-of-core.c              |  8 ++---
 drivers/mtd/maps/physmap_of.c                 |  8 ++---
 drivers/mtd/maps/sun_uflash.c                 |  6 ++--
 drivers/mtd/nand/fsl_elbc_nand.c              |  4 +--
 drivers/mtd/nand/fsl_upm.c                    |  4 +--
 drivers/mtd/nand/mpc5121_nfc.c                |  4 +--
 drivers/mtd/nand/ndfc.c                       |  6 ++--
 drivers/mtd/nand/pasemi_nand.c                |  4 +--
 drivers/mtd/nand/socrates_nand.c              |  4 +--
 drivers/net/can/mscan/mpc5xxx_can.c           | 18 +++++------
 drivers/net/can/sja1000/sja1000_of_platform.c |  4 +--
 drivers/net/ehea/ehea.h                       |  4 +--
 drivers/net/ehea/ehea_main.c                  | 12 ++++----
 drivers/net/fec_mpc52xx.c                     |  8 ++---
 drivers/net/fec_mpc52xx_phy.c                 |  4 +--
 drivers/net/fs_enet/fs_enet-main.c            |  4 +--
 drivers/net/fs_enet/mac-fcc.c                 |  2 +-
 drivers/net/fs_enet/mac-fec.c                 |  2 +-
 drivers/net/fs_enet/mac-scc.c                 |  2 +-
 drivers/net/fs_enet/mii-bitbang.c             |  4 +--
 drivers/net/fs_enet/mii-fec.c                 |  4 +--
 drivers/net/fsl_pq_mdio.c                     |  4 +--
 drivers/net/gianfar.c                         | 10 +++---
 drivers/net/gianfar.h                         |  2 +-
 drivers/net/greth.c                           |  4 +--
 drivers/net/greth.h                           |  2 +-
 drivers/net/ibm_newemac/core.c                |  6 ++--
 drivers/net/ibm_newemac/core.h                | 12 ++++----
 drivers/net/ibm_newemac/mal.c                 |  4 +--
 drivers/net/ibm_newemac/mal.h                 |  2 +-
 drivers/net/ibm_newemac/rgmii.c               | 18 +++++------
 drivers/net/ibm_newemac/rgmii.h               | 16 +++++-----
 drivers/net/ibm_newemac/tah.c                 | 14 ++++-----
 drivers/net/ibm_newemac/tah.h                 | 12 ++++----
 drivers/net/ibm_newemac/zmii.c                | 18 +++++------
 drivers/net/ibm_newemac/zmii.h                | 16 +++++-----
 drivers/net/ll_temac_main.c                   |  8 ++---
 drivers/net/myri_sbus.c                       |  4 +--
 drivers/net/myri_sbus.h                       |  2 +-
 drivers/net/niu.c                             |  8 ++---
 drivers/net/phy/mdio-gpio.c                   |  4 +--
 drivers/net/sunbmac.c                         | 18 +++++------
 drivers/net/sunbmac.h                         |  4 +--
 drivers/net/sunhme.c                          | 22 +++++++-------
 drivers/net/sunhme.h                          |  2 +-
 drivers/net/sunlance.c                        | 20 ++++++------
 drivers/net/sunqe.c                           | 16 +++++-----
 drivers/net/sunqe.h                           |  4 +--
 drivers/net/ucc_geth.c                        |  8 ++---
 drivers/net/xilinx_emaclite.c                 |  6 ++--
 drivers/of/device.c                           |  2 +-
 drivers/parport/parport_sunbpp.c              |  4 +--
 drivers/pcmcia/electra_cf.c                   |  6 ++--
 drivers/pcmcia/m8xx_pcmcia.c                  |  4 +--
 drivers/rtc/rtc-mpc5121.c                     |  4 +--
 drivers/sbus/char/bbc_envctrl.c               |  6 ++--
 drivers/sbus/char/bbc_i2c.c                   | 18 +++++------
 drivers/sbus/char/bbc_i2c.h                   | 10 +++---
 drivers/sbus/char/display7seg.c               |  4 +--
 drivers/sbus/char/envctrl.c                   |  4 +--
 drivers/sbus/char/flash.c                     |  4 +--
 drivers/sbus/char/uctrl.c                     |  4 +--
 drivers/scsi/qlogicpti.c                      | 14 ++++-----
 drivers/scsi/qlogicpti.h                      |  2 +-
 drivers/scsi/sun_esp.c                        | 44 +++++++++++++--------------
 drivers/serial/apbuart.c                      |  2 +-
 drivers/serial/cpm_uart/cpm_uart_core.c       |  4 +--
 drivers/serial/mpc52xx_uart.c                 |  8 ++---
 drivers/serial/nwpserial.c                    |  2 +-
 drivers/serial/of_serial.c                    |  6 ++--
 drivers/serial/sunhv.c                        |  4 +--
 drivers/serial/sunsab.c                       |  8 ++---
 drivers/serial/sunsu.c                        |  8 ++---
 drivers/serial/sunzilog.c                     |  6 ++--
 drivers/serial/uartlite.c                     |  4 +--
 drivers/serial/ucc_uart.c                     |  4 +--
 drivers/spi/mpc512x_psc_spi.c                 |  4 +--
 drivers/spi/mpc52xx_psc_spi.c                 |  4 +--
 drivers/spi/mpc52xx_spi.c                     |  4 +--
 drivers/spi/spi_mpc8xxx.c                     |  4 +--
 drivers/spi/spi_ppc4xx.c                      |  6 ++--
 drivers/spi/xilinx_spi_of.c                   |  6 ++--
 drivers/usb/gadget/fsl_qe_udc.c               | 10 +++---
 drivers/usb/host/ehci-ppc-of.c                |  6 ++--
 drivers/usb/host/ehci-xilinx-of.c             | 12 ++++----
 drivers/usb/host/fhci-hcd.c                   |  4 +--
 drivers/usb/host/isp1760-if.c                 |  4 +--
 drivers/usb/host/ohci-ppc-of.c                |  6 ++--
 drivers/video/bw2.c                           |  4 +--
 drivers/video/cg14.c                          |  6 ++--
 drivers/video/cg3.c                           |  4 +--
 drivers/video/cg6.c                           |  6 ++--
 drivers/video/ffb.c                           |  4 +--
 drivers/video/fsl-diu-fb.c                    |  8 ++---
 drivers/video/leo.c                           |  6 ++--
 drivers/video/mb862xx/mb862xxfb.c             |  4 +--
 drivers/video/p9100.c                         |  4 +--
 drivers/video/platinumfb.c                    |  4 +--
 drivers/video/sunxvr1000.c                    |  4 +--
 drivers/video/tcx.c                           |  6 ++--
 drivers/video/xilinxfb.c                      |  4 +--
 drivers/watchdog/cpwd.c                       |  4 +--
 drivers/watchdog/gef_wdt.c                    |  2 +-
 drivers/watchdog/mpc8xxx_wdt.c                |  4 +--
 drivers/watchdog/riowd.c                      |  4 +--
 include/linux/of_device.h                     | 16 ----------
 include/linux/of_platform.h                   | 14 +++++++--
 sound/aoa/soundbus/core.c                     |  2 +-
 sound/aoa/soundbus/soundbus.h                 |  2 +-
 sound/aoa/soundbus/sysfs.c                    |  2 +-
 sound/soc/fsl/mpc5200_dma.c                   |  4 +--
 sound/soc/fsl/mpc5200_dma.h                   |  4 +--
 sound/soc/fsl/mpc5200_psc_ac97.c              |  4 +--
 sound/soc/fsl/mpc5200_psc_i2s.c               |  4 +--
 sound/soc/fsl/mpc8610_hpcd.c                  |  4 +--
 sound/sparc/amd7930.c                         |  8 ++---
 sound/sparc/cs4231.c                          | 18 +++++------
 sound/sparc/dbri.c                            |  8 ++---
 158 files changed, 519 insertions(+), 527 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/83xx/mpc837x_mds.c b/arch/powerpc/platforms/83xx/mpc837x_mds.c
index 51df7e754698..f9751c8905be 100644
--- a/arch/powerpc/platforms/83xx/mpc837x_mds.c
+++ b/arch/powerpc/platforms/83xx/mpc837x_mds.c
@@ -102,7 +102,7 @@ static struct of_device_id mpc837x_ids[] = {
 
 static int __init mpc837x_declare_of_platform_devices(void)
 {
-	/* Publish of_device */
+	/* Publish platform_device */
 	of_platform_bus_probe(NULL, mpc837x_ids, NULL);
 
 	return 0;
diff --git a/arch/powerpc/platforms/83xx/mpc837x_rdb.c b/arch/powerpc/platforms/83xx/mpc837x_rdb.c
index e00801c42540..910caa6b5810 100644
--- a/arch/powerpc/platforms/83xx/mpc837x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc837x_rdb.c
@@ -78,7 +78,7 @@ static struct of_device_id mpc837x_ids[] = {
 
 static int __init mpc837x_declare_of_platform_devices(void)
 {
-	/* Publish of_device */
+	/* Publish platform_device */
 	of_platform_bus_probe(NULL, mpc837x_ids, NULL);
 
 	return 0;
diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c
index 8bd86530ee25..6425abe5b7db 100644
--- a/arch/powerpc/sysdev/fsl_rio.c
+++ b/arch/powerpc/sysdev/fsl_rio.c
@@ -1332,7 +1332,7 @@ static inline void fsl_rio_info(struct device *dev, u32 ccsr)
 
 /**
  * fsl_rio_setup - Setup Freescale PowerPC RapidIO interface
- * @dev: of_device pointer
+ * @dev: platform_device pointer
  *
  * Initializes MPC85xx RapidIO hardware interface, configures
  * master port with system-specific info, and registers the
diff --git a/drivers/ata/pata_mpc52xx.c b/drivers/ata/pata_mpc52xx.c
index f087ab55b1df..8cc536e49a0a 100644
--- a/drivers/ata/pata_mpc52xx.c
+++ b/drivers/ata/pata_mpc52xx.c
@@ -680,7 +680,7 @@ mpc52xx_ata_remove_one(struct device *dev)
 /* ======================================================================== */
 
 static int __devinit
-mpc52xx_ata_probe(struct of_device *op, const struct of_device_id *match)
+mpc52xx_ata_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	unsigned int ipb_freq;
 	struct resource res_mem;
@@ -821,7 +821,7 @@ mpc52xx_ata_probe(struct of_device *op, const struct of_device_id *match)
 }
 
 static int
-mpc52xx_ata_remove(struct of_device *op)
+mpc52xx_ata_remove(struct platform_device *op)
 {
 	struct mpc52xx_ata_priv *priv;
 	int task_irq;
@@ -848,7 +848,7 @@ mpc52xx_ata_remove(struct of_device *op)
 #ifdef CONFIG_PM
 
 static int
-mpc52xx_ata_suspend(struct of_device *op, pm_message_t state)
+mpc52xx_ata_suspend(struct platform_device *op, pm_message_t state)
 {
 	struct ata_host *host = dev_get_drvdata(&op->dev);
 
@@ -856,7 +856,7 @@ mpc52xx_ata_suspend(struct of_device *op, pm_message_t state)
 }
 
 static int
-mpc52xx_ata_resume(struct of_device *op)
+mpc52xx_ata_resume(struct platform_device *op)
 {
 	struct ata_host *host = dev_get_drvdata(&op->dev);
 	struct mpc52xx_ata_priv *priv = host->private_data;
diff --git a/drivers/ata/pata_of_platform.c b/drivers/ata/pata_of_platform.c
index 5a1b82c08be9..480e043ce6b8 100644
--- a/drivers/ata/pata_of_platform.c
+++ b/drivers/ata/pata_of_platform.c
@@ -14,7 +14,7 @@
 #include <linux/of_platform.h>
 #include <linux/ata_platform.h>
 
-static int __devinit pata_of_platform_probe(struct of_device *ofdev,
+static int __devinit pata_of_platform_probe(struct platform_device *ofdev,
 					    const struct of_device_id *match)
 {
 	int ret;
@@ -78,7 +78,7 @@ static int __devinit pata_of_platform_probe(struct of_device *ofdev,
 				     reg_shift, pio_mask);
 }
 
-static int __devexit pata_of_platform_remove(struct of_device *ofdev)
+static int __devexit pata_of_platform_remove(struct platform_device *ofdev)
 {
 	return __pata_platform_remove(&ofdev->dev);
 }
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index 61c89b54ea23..4e4d42f1186a 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -1296,7 +1296,7 @@ static const struct ata_port_info sata_fsl_port_info[] = {
 	 },
 };
 
-static int sata_fsl_probe(struct of_device *ofdev,
+static int sata_fsl_probe(struct platform_device *ofdev,
 			const struct of_device_id *match)
 {
 	int retval = -ENXIO;
@@ -1370,7 +1370,7 @@ error_exit_with_cleanup:
 	return retval;
 }
 
-static int sata_fsl_remove(struct of_device *ofdev)
+static int sata_fsl_remove(struct platform_device *ofdev)
 {
 	struct ata_host *host = dev_get_drvdata(&ofdev->dev);
 	struct sata_fsl_host_priv *host_priv = host->private_data;
@@ -1387,13 +1387,13 @@ static int sata_fsl_remove(struct of_device *ofdev)
 }
 
 #ifdef CONFIG_PM
-static int sata_fsl_suspend(struct of_device *op, pm_message_t state)
+static int sata_fsl_suspend(struct platform_device *op, pm_message_t state)
 {
 	struct ata_host *host = dev_get_drvdata(&op->dev);
 	return ata_host_suspend(host, state);
 }
 
-static int sata_fsl_resume(struct of_device *op)
+static int sata_fsl_resume(struct platform_device *op)
 {
 	struct ata_host *host = dev_get_drvdata(&op->dev);
 	struct sata_fsl_host_priv *host_priv = host->private_data;
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index b7385e077717..c8fc69c85a06 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -674,7 +674,7 @@ static void fore200e_sba_write(u32 val, volatile u32 __iomem *addr)
 
 static u32 fore200e_sba_dma_map(struct fore200e *fore200e, void* virt_addr, int size, int direction)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 	u32 dma_addr;
 
 	dma_addr = dma_map_single(&op->dev, virt_addr, size, direction);
@@ -687,7 +687,7 @@ static u32 fore200e_sba_dma_map(struct fore200e *fore200e, void* virt_addr, int
 
 static void fore200e_sba_dma_unmap(struct fore200e *fore200e, u32 dma_addr, int size, int direction)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 
 	DPRINTK(3, "SBUS DVMA unmapping: dma_addr = 0x%08x, size = %d, direction = %d,\n",
 		dma_addr, size, direction);
@@ -697,7 +697,7 @@ static void fore200e_sba_dma_unmap(struct fore200e *fore200e, u32 dma_addr, int
 
 static void fore200e_sba_dma_sync_for_cpu(struct fore200e *fore200e, u32 dma_addr, int size, int direction)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 
 	DPRINTK(3, "SBUS DVMA sync: dma_addr = 0x%08x, size = %d, direction = %d\n", dma_addr, size, direction);
     
@@ -706,7 +706,7 @@ static void fore200e_sba_dma_sync_for_cpu(struct fore200e *fore200e, u32 dma_add
 
 static void fore200e_sba_dma_sync_for_device(struct fore200e *fore200e, u32 dma_addr, int size, int direction)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 
 	DPRINTK(3, "SBUS DVMA sync: dma_addr = 0x%08x, size = %d, direction = %d\n", dma_addr, size, direction);
 
@@ -719,7 +719,7 @@ static void fore200e_sba_dma_sync_for_device(struct fore200e *fore200e, u32 dma_
 static int fore200e_sba_dma_chunk_alloc(struct fore200e *fore200e, struct chunk *chunk,
 					int size, int nbr, int alignment)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 
 	chunk->alloc_size = chunk->align_size = size * nbr;
 
@@ -738,7 +738,7 @@ static int fore200e_sba_dma_chunk_alloc(struct fore200e *fore200e, struct chunk
 /* free a DVMA consistent chunk of memory */
 static void fore200e_sba_dma_chunk_free(struct fore200e *fore200e, struct chunk *chunk)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 
 	dma_free_coherent(&op->dev, chunk->alloc_size,
 			  chunk->alloc_addr, chunk->dma_addr);
@@ -770,7 +770,7 @@ static void fore200e_sba_reset(struct fore200e *fore200e)
 
 static int __init fore200e_sba_map(struct fore200e *fore200e)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 	unsigned int bursts;
 
 	/* gain access to the SBA specific registers  */
@@ -800,7 +800,7 @@ static int __init fore200e_sba_map(struct fore200e *fore200e)
 
 static void fore200e_sba_unmap(struct fore200e *fore200e)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 
 	of_iounmap(&op->resource[0], fore200e->regs.sba.hcr, SBA200E_HCR_LENGTH);
 	of_iounmap(&op->resource[1], fore200e->regs.sba.bsr, SBA200E_BSR_LENGTH);
@@ -816,7 +816,7 @@ static int __init fore200e_sba_configure(struct fore200e *fore200e)
 
 static int __init fore200e_sba_prom_read(struct fore200e *fore200e, struct prom_data *prom)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 	const u8 *prop;
 	int len;
 
@@ -840,7 +840,7 @@ static int __init fore200e_sba_prom_read(struct fore200e *fore200e, struct prom_
 
 static int fore200e_sba_proc_read(struct fore200e *fore200e, char *page)
 {
-	struct of_device *op = fore200e->bus_dev;
+	struct platform_device *op = fore200e->bus_dev;
 	const struct linux_prom_registers *regs;
 
 	regs = of_get_property(op->dev.of_node, "reg", NULL);
@@ -2513,7 +2513,7 @@ fore200e_load_and_start_fw(struct fore200e* fore200e)
 	device = &((struct pci_dev *) fore200e->bus_dev)->dev;
 #ifdef CONFIG_SBUS
     else if (strcmp(fore200e->bus->model_name, "SBA-200E") == 0)
-	device = &((struct of_device *) fore200e->bus_dev)->dev;
+	device = &((struct platform_device *) fore200e->bus_dev)->dev;
 #endif
     else
 	return err;
@@ -2643,7 +2643,7 @@ fore200e_init(struct fore200e* fore200e)
 }
 
 #ifdef CONFIG_SBUS
-static int __devinit fore200e_sba_probe(struct of_device *op,
+static int __devinit fore200e_sba_probe(struct platform_device *op,
 					const struct of_device_id *match)
 {
 	const struct fore200e_bus *bus = match->data;
@@ -2675,7 +2675,7 @@ static int __devinit fore200e_sba_probe(struct of_device *op,
 	return 0;
 }
 
-static int __devexit fore200e_sba_remove(struct of_device *op)
+static int __devexit fore200e_sba_remove(struct platform_device *op)
 {
 	struct fore200e *fore200e = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index a7b83c0a7eb5..d7cb69db3067 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -1188,7 +1188,7 @@ static struct platform_driver ace_platform_driver = {
 
 #if defined(CONFIG_OF)
 static int __devinit
-ace_of_probe(struct of_device *op, const struct of_device_id *match)
+ace_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct resource res;
 	resource_size_t physaddr;
@@ -1220,7 +1220,7 @@ ace_of_probe(struct of_device *op, const struct of_device_id *match)
 	return ace_alloc(&op->dev, id ? *id : 0, physaddr, irq, bus_width);
 }
 
-static int __devexit ace_of_remove(struct of_device *op)
+static int __devexit ace_of_remove(struct platform_device *op)
 {
 	ace_free(&op->dev);
 	return 0;
diff --git a/drivers/char/hw_random/n2-drv.c b/drivers/char/hw_random/n2-drv.c
index 7a4f080f8356..1acdb2509511 100644
--- a/drivers/char/hw_random/n2-drv.c
+++ b/drivers/char/hw_random/n2-drv.c
@@ -619,7 +619,7 @@ static void __devinit n2rng_driver_version(void)
 		pr_info("%s", version);
 }
 
-static int __devinit n2rng_probe(struct of_device *op,
+static int __devinit n2rng_probe(struct platform_device *op,
 				 const struct of_device_id *match)
 {
 	int victoria_falls = (match->data != NULL);
@@ -714,7 +714,7 @@ out:
 	return err;
 }
 
-static int __devexit n2rng_remove(struct of_device *op)
+static int __devexit n2rng_remove(struct platform_device *op)
 {
 	struct n2rng *np = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/char/hw_random/n2rng.h b/drivers/char/hw_random/n2rng.h
index a2b81e7bfc18..4bea07f30978 100644
--- a/drivers/char/hw_random/n2rng.h
+++ b/drivers/char/hw_random/n2rng.h
@@ -65,7 +65,7 @@ struct n2rng_unit {
 };
 
 struct n2rng {
-	struct of_device	*op;
+	struct platform_device	*op;
 
 	unsigned long		flags;
 #define N2RNG_FLAG_VF		0x00000001 /* Victoria Falls RNG, else N2 */
diff --git a/drivers/char/hw_random/pasemi-rng.c b/drivers/char/hw_random/pasemi-rng.c
index 261ba8f22b8b..a31c830ca8cd 100644
--- a/drivers/char/hw_random/pasemi-rng.c
+++ b/drivers/char/hw_random/pasemi-rng.c
@@ -94,7 +94,7 @@ static struct hwrng pasemi_rng = {
 	.data_read	= pasemi_rng_data_read,
 };
 
-static int __devinit rng_probe(struct of_device *ofdev,
+static int __devinit rng_probe(struct platform_device *ofdev,
 			       const struct of_device_id *match)
 {
 	void __iomem *rng_regs;
@@ -123,7 +123,7 @@ static int __devinit rng_probe(struct of_device *ofdev,
 	return err;
 }
 
-static int __devexit rng_remove(struct of_device *dev)
+static int __devexit rng_remove(struct platform_device *dev)
 {
 	void __iomem *rng_regs = (void __iomem *)pasemi_rng.priv;
 
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 094bdc355b1f..b532d613fb5b 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -2502,7 +2502,7 @@ static struct pci_driver ipmi_pci_driver = {
 
 
 #ifdef CONFIG_PPC_OF
-static int __devinit ipmi_of_probe(struct of_device *dev,
+static int __devinit ipmi_of_probe(struct platform_device *dev,
 			 const struct of_device_id *match)
 {
 	struct smi_info *info;
@@ -2576,7 +2576,7 @@ static int __devinit ipmi_of_probe(struct of_device *dev,
 	return add_smi(info);
 }
 
-static int __devexit ipmi_of_remove(struct of_device *dev)
+static int __devexit ipmi_of_remove(struct platform_device *dev)
 {
 	cleanup_one_si(dev_get_drvdata(&dev->dev));
 	return 0;
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 95acb8c880f4..dfa8b3062fda 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -961,7 +961,7 @@ static int __init rtc_init(void)
 #endif
 #ifdef CONFIG_SPARC32
 	struct device_node *ebus_dp;
-	struct of_device *op;
+	struct platform_device *op;
 #else
 	void *r;
 #ifdef RTC_IRQ
diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c
index ed8a9cec2a05..0ed763cd2e77 100644
--- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c
+++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c
@@ -761,7 +761,7 @@ static struct platform_driver hwicap_platform_driver = {
 
 #if defined(CONFIG_OF)
 static int __devinit
-hwicap_of_probe(struct of_device *op, const struct of_device_id *match)
+hwicap_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct resource res;
 	const unsigned int *id;
@@ -798,7 +798,7 @@ hwicap_of_probe(struct of_device *op, const struct of_device_id *match)
 			regs);
 }
 
-static int __devexit hwicap_of_remove(struct of_device *op)
+static int __devexit hwicap_of_remove(struct platform_device *op)
 {
 	return hwicap_remove(&op->dev);
 }
diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c
index 983530ba04a7..2b1baee525bc 100644
--- a/drivers/crypto/amcc/crypto4xx_core.c
+++ b/drivers/crypto/amcc/crypto4xx_core.c
@@ -1150,7 +1150,7 @@ struct crypto4xx_alg_common crypto4xx_alg[] = {
 /**
  * Module Initialization Routine
  */
-static int __init crypto4xx_probe(struct of_device *ofdev,
+static int __init crypto4xx_probe(struct platform_device *ofdev,
 				  const struct of_device_id *match)
 {
 	int rc;
@@ -1258,7 +1258,7 @@ err_alloc_dev:
 	return rc;
 }
 
-static int __exit crypto4xx_remove(struct of_device *ofdev)
+static int __exit crypto4xx_remove(struct platform_device *ofdev)
 {
 	struct device *dev = &ofdev->dev;
 	struct crypto4xx_core_device *core_dev = dev_get_drvdata(dev);
diff --git a/drivers/crypto/amcc/crypto4xx_core.h b/drivers/crypto/amcc/crypto4xx_core.h
index da9cbe3b9fc3..bac0bdeb4b5f 100644
--- a/drivers/crypto/amcc/crypto4xx_core.h
+++ b/drivers/crypto/amcc/crypto4xx_core.h
@@ -104,7 +104,7 @@ struct crypto4xx_device {
 
 struct crypto4xx_core_device {
 	struct device *device;
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 	struct crypto4xx_device *dev;
 	u32 int_status;
 	u32 irq;
diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c
index 26af2dd5d831..931fdd471d6d 100644
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -1552,7 +1552,7 @@ static void __exit n2_unregister_algs(void)
 
 /* To map CWQ queues to interrupt sources, the hypervisor API provides
  * a devino.  This isn't very useful to us because all of the
- * interrupts listed in the of_device node have been translated to
+ * interrupts listed in the device_node have been translated to
  * Linux virtual IRQ cookie numbers.
  *
  * So we have to back-translate, going through the 'intr' and 'ino'
@@ -1560,7 +1560,7 @@ static void __exit n2_unregister_algs(void)
  * 'interrupts' property entries, in order to to figure out which
  * devino goes to which already-translated IRQ.
  */
-static int find_devino_index(struct of_device *dev, struct spu_mdesc_info *ip,
+static int find_devino_index(struct platform_device *dev, struct spu_mdesc_info *ip,
 			     unsigned long dev_ino)
 {
 	const unsigned int *dev_intrs;
@@ -1588,7 +1588,7 @@ static int find_devino_index(struct of_device *dev, struct spu_mdesc_info *ip,
 	return -ENODEV;
 }
 
-static int spu_map_ino(struct of_device *dev, struct spu_mdesc_info *ip,
+static int spu_map_ino(struct platform_device *dev, struct spu_mdesc_info *ip,
 		       const char *irq_name, struct spu_queue *p,
 		       irq_handler_t handler)
 {
@@ -1736,7 +1736,7 @@ static void spu_list_destroy(struct list_head *list)
  * gathering cpu membership information.
  */
 static int spu_mdesc_walk_arcs(struct mdesc_handle *mdesc,
-			       struct of_device *dev,
+			       struct platform_device *dev,
 			       u64 node, struct spu_queue *p,
 			       struct spu_queue **table)
 {
@@ -1763,7 +1763,7 @@ static int spu_mdesc_walk_arcs(struct mdesc_handle *mdesc,
 
 /* Process an 'exec-unit' MDESC node of type 'cwq'.  */
 static int handle_exec_unit(struct spu_mdesc_info *ip, struct list_head *list,
-			    struct of_device *dev, struct mdesc_handle *mdesc,
+			    struct platform_device *dev, struct mdesc_handle *mdesc,
 			    u64 node, const char *iname, unsigned long q_type,
 			    irq_handler_t handler, struct spu_queue **table)
 {
@@ -1794,7 +1794,7 @@ static int handle_exec_unit(struct spu_mdesc_info *ip, struct list_head *list,
 	return spu_map_ino(dev, ip, iname, p, handler);
 }
 
-static int spu_mdesc_scan(struct mdesc_handle *mdesc, struct of_device *dev,
+static int spu_mdesc_scan(struct mdesc_handle *mdesc, struct platform_device *dev,
 			  struct spu_mdesc_info *ip, struct list_head *list,
 			  const char *exec_name, unsigned long q_type,
 			  irq_handler_t handler, struct spu_queue **table)
@@ -1855,7 +1855,7 @@ static int __devinit get_irq_props(struct mdesc_handle *mdesc, u64 node,
 }
 
 static int __devinit grab_mdesc_irq_props(struct mdesc_handle *mdesc,
-					  struct of_device *dev,
+					  struct platform_device *dev,
 					  struct spu_mdesc_info *ip,
 					  const char *node_name)
 {
@@ -2004,7 +2004,7 @@ static void __devinit n2_spu_driver_version(void)
 		pr_info("%s", version);
 }
 
-static int __devinit n2_crypto_probe(struct of_device *dev,
+static int __devinit n2_crypto_probe(struct platform_device *dev,
 				     const struct of_device_id *match)
 {
 	struct mdesc_handle *mdesc;
@@ -2081,7 +2081,7 @@ out_free_n2cp:
 	return err;
 }
 
-static int __devexit n2_crypto_remove(struct of_device *dev)
+static int __devexit n2_crypto_remove(struct platform_device *dev)
 {
 	struct n2_crypto *np = dev_get_drvdata(&dev->dev);
 
@@ -2116,7 +2116,7 @@ static void free_ncp(struct n2_mau *mp)
 	kfree(mp);
 }
 
-static int __devinit n2_mau_probe(struct of_device *dev,
+static int __devinit n2_mau_probe(struct platform_device *dev,
 				     const struct of_device_id *match)
 {
 	struct mdesc_handle *mdesc;
@@ -2184,7 +2184,7 @@ out_free_ncp:
 	return err;
 }
 
-static int __devexit n2_mau_remove(struct of_device *dev)
+static int __devexit n2_mau_remove(struct platform_device *dev)
 {
 	struct n2_mau *mp = dev_get_drvdata(&dev->dev);
 
diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 97f4af1d8a64..4bcd825b5739 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -118,7 +118,7 @@ struct talitos_channel {
 
 struct talitos_private {
 	struct device *dev;
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 	void __iomem *reg;
 	int irq;
 
@@ -2308,7 +2308,7 @@ static int hw_supports(struct device *dev, __be32 desc_hdr_template)
 	return ret;
 }
 
-static int talitos_remove(struct of_device *ofdev)
+static int talitos_remove(struct platform_device *ofdev)
 {
 	struct device *dev = &ofdev->dev;
 	struct talitos_private *priv = dev_get_drvdata(dev);
@@ -2401,7 +2401,7 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev,
 	return t_alg;
 }
 
-static int talitos_probe(struct of_device *ofdev,
+static int talitos_probe(struct platform_device *ofdev,
 			 const struct of_device_id *match)
 {
 	struct device *dev = &ofdev->dev;
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index f0fd6db6063c..cea08bed9cf9 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -1297,7 +1297,7 @@ static void fsl_dma_chan_remove(struct fsldma_chan *chan)
 	kfree(chan);
 }
 
-static int __devinit fsldma_of_probe(struct of_device *op,
+static int __devinit fsldma_of_probe(struct platform_device *op,
 			const struct of_device_id *match)
 {
 	struct fsldma_device *fdev;
@@ -1382,7 +1382,7 @@ out_return:
 	return err;
 }
 
-static int fsldma_of_remove(struct of_device *op)
+static int fsldma_of_remove(struct platform_device *op)
 {
 	struct fsldma_device *fdev;
 	unsigned int i;
diff --git a/drivers/dma/mpc512x_dma.c b/drivers/dma/mpc512x_dma.c
index 14a8c0f1698e..4e9cbf300594 100644
--- a/drivers/dma/mpc512x_dma.c
+++ b/drivers/dma/mpc512x_dma.c
@@ -627,7 +627,7 @@ mpc_dma_prep_memcpy(struct dma_chan *chan, dma_addr_t dst, dma_addr_t src,
 	return &mdesc->desc;
 }
 
-static int __devinit mpc_dma_probe(struct of_device *op,
+static int __devinit mpc_dma_probe(struct platform_device *op,
 					const struct of_device_id *match)
 {
 	struct device_node *dn = op->dev.of_node;
@@ -753,7 +753,7 @@ static int __devinit mpc_dma_probe(struct of_device *op,
 	return retval;
 }
 
-static int __devexit mpc_dma_remove(struct of_device *op)
+static int __devexit mpc_dma_remove(struct platform_device *op)
 {
 	struct device *dev = &op->dev;
 	struct mpc_dma *mdma = dev_get_drvdata(dev);
diff --git a/drivers/dma/ppc4xx/adma.c b/drivers/dma/ppc4xx/adma.c
index 7c3747902a37..0d58a4a4487f 100644
--- a/drivers/dma/ppc4xx/adma.c
+++ b/drivers/dma/ppc4xx/adma.c
@@ -4257,11 +4257,11 @@ static int ppc440spe_adma_setup_irqs(struct ppc440spe_adma_device *adev,
 				     struct ppc440spe_adma_chan *chan,
 				     int *initcode)
 {
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 	struct device_node *np;
 	int ret;
 
-	ofdev = container_of(adev->dev, struct of_device, dev);
+	ofdev = container_of(adev->dev, struct platform_device, dev);
 	np = ofdev->dev.of_node;
 	if (adev->id != PPC440SPE_XOR_ID) {
 		adev->err_irq = irq_of_parse_and_map(np, 1);
@@ -4393,7 +4393,7 @@ static void ppc440spe_adma_release_irqs(struct ppc440spe_adma_device *adev,
 /**
  * ppc440spe_adma_probe - probe the asynch device
  */
-static int __devinit ppc440spe_adma_probe(struct of_device *ofdev,
+static int __devinit ppc440spe_adma_probe(struct platform_device *ofdev,
 					  const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -4625,7 +4625,7 @@ out:
 /**
  * ppc440spe_adma_remove - remove the asynch device
  */
-static int __devexit ppc440spe_adma_remove(struct of_device *ofdev)
+static int __devexit ppc440spe_adma_remove(struct platform_device *ofdev)
 {
 	struct ppc440spe_adma_device *adev = dev_get_drvdata(&ofdev->dev);
 	struct device_node *np = ofdev->dev.of_node;
diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
index 1052340e6802..8ea07b019543 100644
--- a/drivers/edac/mpc85xx_edac.c
+++ b/drivers/edac/mpc85xx_edac.c
@@ -200,7 +200,7 @@ static irqreturn_t mpc85xx_pci_isr(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int __devinit mpc85xx_pci_err_probe(struct of_device *op,
+static int __devinit mpc85xx_pci_err_probe(struct platform_device *op,
 					   const struct of_device_id *match)
 {
 	struct edac_pci_ctl_info *pci;
@@ -305,7 +305,7 @@ err:
 	return res;
 }
 
-static int mpc85xx_pci_err_remove(struct of_device *op)
+static int mpc85xx_pci_err_remove(struct platform_device *op)
 {
 	struct edac_pci_ctl_info *pci = dev_get_drvdata(&op->dev);
 	struct mpc85xx_pci_pdata *pdata = pci->pvt_info;
@@ -503,7 +503,7 @@ static irqreturn_t mpc85xx_l2_isr(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int __devinit mpc85xx_l2_err_probe(struct of_device *op,
+static int __devinit mpc85xx_l2_err_probe(struct platform_device *op,
 					  const struct of_device_id *match)
 {
 	struct edac_device_ctl_info *edac_dev;
@@ -613,7 +613,7 @@ err:
 	return res;
 }
 
-static int mpc85xx_l2_err_remove(struct of_device *op)
+static int mpc85xx_l2_err_remove(struct platform_device *op)
 {
 	struct edac_device_ctl_info *edac_dev = dev_get_drvdata(&op->dev);
 	struct mpc85xx_l2_pdata *pdata = edac_dev->pvt_info;
@@ -953,7 +953,7 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci)
 	}
 }
 
-static int __devinit mpc85xx_mc_err_probe(struct of_device *op,
+static int __devinit mpc85xx_mc_err_probe(struct platform_device *op,
 					  const struct of_device_id *match)
 {
 	struct mem_ctl_info *mci;
@@ -1085,7 +1085,7 @@ err:
 	return res;
 }
 
-static int mpc85xx_mc_err_remove(struct of_device *op)
+static int mpc85xx_mc_err_remove(struct platform_device *op)
 {
 	struct mem_ctl_info *mci = dev_get_drvdata(&op->dev);
 	struct mpc85xx_mc_pdata *pdata = mci->pvt_info;
diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c
index e78839e89a06..070cea41b661 100644
--- a/drivers/edac/ppc4xx_edac.c
+++ b/drivers/edac/ppc4xx_edac.c
@@ -184,9 +184,9 @@ struct ppc4xx_ecc_status {
 
 /* Function Prototypes */
 
-static int ppc4xx_edac_probe(struct of_device *device,
+static int ppc4xx_edac_probe(struct platform_device *device,
 			     const struct of_device_id *device_id);
-static int ppc4xx_edac_remove(struct of_device *device);
+static int ppc4xx_edac_remove(struct platform_device *device);
 
 /* Global Variables */
 
@@ -1014,7 +1014,7 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1)
  */
 static int __devinit
 ppc4xx_edac_mc_init(struct mem_ctl_info *mci,
-		    struct of_device *op,
+		    struct platform_device *op,
 		    const struct of_device_id *match,
 		    const dcr_host_t *dcr_host,
 		    u32 mcopt1)
@@ -1108,7 +1108,7 @@ ppc4xx_edac_mc_init(struct mem_ctl_info *mci,
  * mapped and assigned.
  */
 static int __devinit
-ppc4xx_edac_register_irq(struct of_device *op, struct mem_ctl_info *mci)
+ppc4xx_edac_register_irq(struct platform_device *op, struct mem_ctl_info *mci)
 {
 	int status = 0;
 	int ded_irq, sec_irq;
@@ -1238,7 +1238,7 @@ ppc4xx_edac_map_dcrs(const struct device_node *np, dcr_host_t *dcr_host)
  * driver; otherwise, < 0 on error.
  */
 static int __devinit
-ppc4xx_edac_probe(struct of_device *op, const struct of_device_id *match)
+ppc4xx_edac_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	int status = 0;
 	u32 mcopt1, memcheck;
@@ -1359,7 +1359,7 @@ ppc4xx_edac_probe(struct of_device *op, const struct of_device_id *match)
  * Unconditionally returns 0.
  */
 static int
-ppc4xx_edac_remove(struct of_device *op)
+ppc4xx_edac_remove(struct platform_device *op)
 {
 	struct mem_ctl_info *mci = dev_get_drvdata(&op->dev);
 	struct ppc4xx_edac_pdata *pdata = mci->pvt_info;
diff --git a/drivers/hwmon/ams/ams.h b/drivers/hwmon/ams/ams.h
index b28d7e27a031..90f094d45450 100644
--- a/drivers/hwmon/ams/ams.h
+++ b/drivers/hwmon/ams/ams.h
@@ -23,7 +23,7 @@ struct ams {
 
 	/* General properties */
 	struct device_node *of_node;
-	struct of_device *of_dev;
+	struct platform_device *of_dev;
 	char has_device;
 	char vflag;
 	u32 orient1;
diff --git a/drivers/hwmon/ultra45_env.c b/drivers/hwmon/ultra45_env.c
index 89643261ccdb..d863e13a50b8 100644
--- a/drivers/hwmon/ultra45_env.c
+++ b/drivers/hwmon/ultra45_env.c
@@ -234,7 +234,7 @@ static const struct attribute_group env_group = {
 	.attrs = env_attributes,
 };
 
-static int __devinit env_probe(struct of_device *op,
+static int __devinit env_probe(struct platform_device *op,
 			       const struct of_device_id *match)
 {
 	struct env *p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -276,7 +276,7 @@ out_free:
 	goto out;
 }
 
-static int __devexit env_remove(struct of_device *op)
+static int __devexit env_remove(struct platform_device *op)
 {
 	struct env *p = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c
index e591de1bc704..f7bd2613cecc 100644
--- a/drivers/i2c/busses/i2c-cpm.c
+++ b/drivers/i2c/busses/i2c-cpm.c
@@ -105,7 +105,7 @@ struct i2c_reg {
 
 struct cpm_i2c {
 	char *base;
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 	struct i2c_adapter adap;
 	uint dp_addr;
 	int version; /* CPM1=1, CPM2=2 */
@@ -428,7 +428,7 @@ static const struct i2c_adapter cpm_ops = {
 
 static int __devinit cpm_i2c_setup(struct cpm_i2c *cpm)
 {
-	struct of_device *ofdev = cpm->ofdev;
+	struct platform_device *ofdev = cpm->ofdev;
 	const u32 *data;
 	int len, ret, i;
 	void __iomem *i2c_base;
@@ -634,7 +634,7 @@ static void cpm_i2c_shutdown(struct cpm_i2c *cpm)
 		cpm_muram_free(cpm->i2c_addr);
 }
 
-static int __devinit cpm_i2c_probe(struct of_device *ofdev,
+static int __devinit cpm_i2c_probe(struct platform_device *ofdev,
 			 const struct of_device_id *match)
 {
 	int result, len;
@@ -687,7 +687,7 @@ out_free:
 	return result;
 }
 
-static int __devexit cpm_i2c_remove(struct of_device *ofdev)
+static int __devexit cpm_i2c_remove(struct platform_device *ofdev)
 {
 	struct cpm_i2c *cpm = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c
index 1168d61418c9..43ca32fddde2 100644
--- a/drivers/i2c/busses/i2c-ibm_iic.c
+++ b/drivers/i2c/busses/i2c-ibm_iic.c
@@ -661,7 +661,7 @@ static inline u8 iic_clckdiv(unsigned int opb)
 	return (u8)((opb + 9) / 10 - 1);
 }
 
-static int __devinit iic_request_irq(struct of_device *ofdev,
+static int __devinit iic_request_irq(struct platform_device *ofdev,
 				     struct ibm_iic_private *dev)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -692,7 +692,7 @@ static int __devinit iic_request_irq(struct of_device *ofdev,
 /*
  * Register single IIC interface
  */
-static int __devinit iic_probe(struct of_device *ofdev,
+static int __devinit iic_probe(struct platform_device *ofdev,
 			       const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -780,7 +780,7 @@ error_cleanup:
 /*
  * Cleanup initialized IIC interface
  */
-static int __devexit iic_remove(struct of_device *ofdev)
+static int __devexit iic_remove(struct platform_device *ofdev)
 {
 	struct ibm_iic_private *dev = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c
index 6545d1c99b61..a1c419a716af 100644
--- a/drivers/i2c/busses/i2c-mpc.c
+++ b/drivers/i2c/busses/i2c-mpc.c
@@ -560,7 +560,7 @@ static struct i2c_adapter mpc_ops = {
 	.timeout = HZ,
 };
 
-static int __devinit fsl_i2c_probe(struct of_device *op,
+static int __devinit fsl_i2c_probe(struct platform_device *op,
 				   const struct of_device_id *match)
 {
 	struct mpc_i2c *i2c;
@@ -646,7 +646,7 @@ static int __devinit fsl_i2c_probe(struct of_device *op,
 	return result;
 };
 
-static int __devexit fsl_i2c_remove(struct of_device *op)
+static int __devexit fsl_i2c_remove(struct platform_device *op)
 {
 	struct mpc_i2c *i2c = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index 0136abd50dd4..aaf6023a4835 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -112,7 +112,7 @@ struct ehca_sport {
 
 struct ehca_shca {
 	struct ib_device ib_device;
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 	u8 num_ports;
 	int hw_level;
 	struct list_head shca_list;
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index ecb51b396c42..67c4534fb941 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -712,7 +712,7 @@ static struct attribute_group ehca_dev_attr_grp = {
 	.attrs = ehca_dev_attrs
 };
 
-static int __devinit ehca_probe(struct of_device *dev,
+static int __devinit ehca_probe(struct platform_device *dev,
 				const struct of_device_id *id)
 {
 	struct ehca_shca *shca;
@@ -878,7 +878,7 @@ probe1:
 	return -EINVAL;
 }
 
-static int __devexit ehca_remove(struct of_device *dev)
+static int __devexit ehca_remove(struct platform_device *dev)
 {
 	struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
 	unsigned long flags;
diff --git a/drivers/input/misc/sparcspkr.c b/drivers/input/misc/sparcspkr.c
index f3bb92e9755f..8e130bf7d32b 100644
--- a/drivers/input/misc/sparcspkr.c
+++ b/drivers/input/misc/sparcspkr.c
@@ -173,7 +173,7 @@ static int __devinit sparcspkr_probe(struct device *dev)
 	return 0;
 }
 
-static int sparcspkr_shutdown(struct of_device *dev)
+static int sparcspkr_shutdown(struct platform_device *dev)
 {
 	struct sparcspkr_state *state = dev_get_drvdata(&dev->dev);
 	struct input_dev *input_dev = state->input_dev;
@@ -184,7 +184,7 @@ static int sparcspkr_shutdown(struct of_device *dev)
 	return 0;
 }
 
-static int __devinit bbc_beep_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit bbc_beep_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct sparcspkr_state *state;
 	struct bbc_beep_info *info;
@@ -231,7 +231,7 @@ out_err:
 	return err;
 }
 
-static int __devexit bbc_remove(struct of_device *op)
+static int __devexit bbc_remove(struct platform_device *op)
 {
 	struct sparcspkr_state *state = dev_get_drvdata(&op->dev);
 	struct input_dev *input_dev = state->input_dev;
@@ -269,7 +269,7 @@ static struct of_platform_driver bbc_beep_driver = {
 	.shutdown	= sparcspkr_shutdown,
 };
 
-static int __devinit grover_beep_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit grover_beep_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct sparcspkr_state *state;
 	struct grover_beep_info *info;
@@ -312,7 +312,7 @@ out_err:
 	return err;
 }
 
-static int __devexit grover_remove(struct of_device *op)
+static int __devexit grover_remove(struct platform_device *op)
 {
 	struct sparcspkr_state *state = dev_get_drvdata(&op->dev);
 	struct grover_beep_info *info = &state->u.grover;
diff --git a/drivers/input/serio/i8042-sparcio.h b/drivers/input/serio/i8042-sparcio.h
index cb2a24b94746..c5cc4508d6df 100644
--- a/drivers/input/serio/i8042-sparcio.h
+++ b/drivers/input/serio/i8042-sparcio.h
@@ -49,7 +49,7 @@ static inline void i8042_write_command(int val)
 #define OBP_PS2MS_NAME1		"kdmouse"
 #define OBP_PS2MS_NAME2		"mouse"
 
-static int __devinit sparc_i8042_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit sparc_i8042_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
 
@@ -57,7 +57,7 @@ static int __devinit sparc_i8042_probe(struct of_device *op, const struct of_dev
 	while (dp) {
 		if (!strcmp(dp->name, OBP_PS2KBD_NAME1) ||
 		    !strcmp(dp->name, OBP_PS2KBD_NAME2)) {
-			struct of_device *kbd = of_find_device_by_node(dp);
+			struct platform_device *kbd = of_find_device_by_node(dp);
 			unsigned int irq = kbd->archdata.irqs[0];
 			if (irq == 0xffffffff)
 				irq = op->archdata.irqs[0];
@@ -67,7 +67,7 @@ static int __devinit sparc_i8042_probe(struct of_device *op, const struct of_dev
 			kbd_res = &kbd->resource[0];
 		} else if (!strcmp(dp->name, OBP_PS2MS_NAME1) ||
 			   !strcmp(dp->name, OBP_PS2MS_NAME2)) {
-			struct of_device *ms = of_find_device_by_node(dp);
+			struct platform_device *ms = of_find_device_by_node(dp);
 			unsigned int irq = ms->archdata.irqs[0];
 			if (irq == 0xffffffff)
 				irq = op->archdata.irqs[0];
@@ -80,7 +80,7 @@ static int __devinit sparc_i8042_probe(struct of_device *op, const struct of_dev
 	return 0;
 }
 
-static int __devexit sparc_i8042_remove(struct of_device *op)
+static int __devexit sparc_i8042_remove(struct platform_device *op)
 {
 	of_iounmap(kbd_res, kbd_iobase, 8);
 
diff --git a/drivers/input/serio/xilinx_ps2.c b/drivers/input/serio/xilinx_ps2.c
index e2c028d2638f..bb14449fb022 100644
--- a/drivers/input/serio/xilinx_ps2.c
+++ b/drivers/input/serio/xilinx_ps2.c
@@ -232,7 +232,7 @@ static void sxps2_close(struct serio *pserio)
  * It returns 0, if the driver is bound to the PS/2 device, or a negative
  * value if there is an error.
  */
-static int __devinit xps2_of_probe(struct of_device *ofdev,
+static int __devinit xps2_of_probe(struct platform_device *ofdev,
 				   const struct of_device_id *match)
 {
 	struct resource r_irq; /* Interrupt resources */
@@ -332,7 +332,7 @@ failed1:
  * if the driver module is being unloaded. It frees any resources allocated to
  * the device.
  */
-static int __devexit xps2_of_remove(struct of_device *of_dev)
+static int __devexit xps2_of_remove(struct platform_device *of_dev)
 {
 	struct device *dev = &of_dev->dev;
 	struct xps2data *drvdata = dev_get_drvdata(dev);
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
index cc22eeefa10b..ea57e05d08f3 100644
--- a/drivers/leds/leds-gpio.c
+++ b/drivers/leds/leds-gpio.c
@@ -224,7 +224,7 @@ struct gpio_led_of_platform_data {
 	struct gpio_led_data led_data[];
 };
 
-static int __devinit of_gpio_leds_probe(struct of_device *ofdev,
+static int __devinit of_gpio_leds_probe(struct platform_device *ofdev,
 					const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node, *child;
@@ -283,7 +283,7 @@ err:
 	return ret;
 }
 
-static int __devexit of_gpio_leds_remove(struct of_device *ofdev)
+static int __devexit of_gpio_leds_remove(struct platform_device *ofdev)
 {
 	struct gpio_led_of_platform_data *pdata = dev_get_drvdata(&ofdev->dev);
 	int i;
diff --git a/drivers/macintosh/macio_sysfs.c b/drivers/macintosh/macio_sysfs.c
index 6024038a5b9d..8eb40afbd0f5 100644
--- a/drivers/macintosh/macio_sysfs.c
+++ b/drivers/macintosh/macio_sysfs.c
@@ -15,7 +15,7 @@ field##_show (struct device *dev, struct device_attribute *attr,	\
 static ssize_t
 compatible_show (struct device *dev, struct device_attribute *attr, char *buf)
 {
-	struct of_device *of;
+	struct platform_device *of;
 	const char *compat;
 	int cplen;
 	int length = 0;
@@ -52,9 +52,9 @@ static ssize_t modalias_show (struct device *dev, struct device_attribute *attr,
 static ssize_t devspec_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 
-	ofdev = to_of_device(dev);
+	ofdev = to_platform_device(dev);
 	return sprintf(buf, "%s\n", ofdev->dev.of_node->full_name);
 }
 
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 2506c957712e..e58c3d33e035 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -75,7 +75,7 @@ struct smu_cmd_buf {
 struct smu_device {
 	spinlock_t		lock;
 	struct device_node	*of_node;
-	struct of_device	*of_dev;
+	struct platform_device	*of_dev;
 	int			doorbell;	/* doorbell gpio */
 	u32 __iomem		*db_buf;	/* doorbell buffer */
 	struct device_node	*db_node;
@@ -645,7 +645,7 @@ static void smu_expose_childs(struct work_struct *unused)
 
 static DECLARE_WORK(smu_expose_childs_work, smu_expose_childs);
 
-static int smu_platform_probe(struct of_device* dev,
+static int smu_platform_probe(struct platform_device* dev,
 			      const struct of_device_id *match)
 {
 	if (!smu)
@@ -695,7 +695,7 @@ static int __init smu_init_sysfs(void)
 
 device_initcall(smu_init_sysfs);
 
-struct of_device *smu_get_ofdev(void)
+struct platform_device *smu_get_ofdev(void)
 {
 	if (!smu)
 		return NULL;
diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index c42eeb43042d..d0d221332db0 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -84,7 +84,7 @@ struct thermostat {
 
 static enum {ADT7460, ADT7467} therm_type;
 static int therm_bus, therm_address;
-static struct of_device * of_dev;
+static struct platform_device * of_dev;
 static struct thermostat* thermostat;
 static struct task_struct *thread_therm = NULL;
 
diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
index e60605bd0ea9..44549272333c 100644
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -148,7 +148,7 @@
  * Driver statics
  */
 
-static struct of_device *		of_dev;
+static struct platform_device *		of_dev;
 static struct i2c_adapter *		u3_0;
 static struct i2c_adapter *		u3_1;
 static struct i2c_adapter *		k2;
@@ -2210,7 +2210,7 @@ static void fcu_lookup_fans(struct device_node *fcu_node)
 	}
 }
 
-static int fcu_of_probe(struct of_device* dev, const struct of_device_id *match)
+static int fcu_of_probe(struct platform_device* dev, const struct of_device_id *match)
 {
 	state = state_detached;
 
@@ -2221,7 +2221,7 @@ static int fcu_of_probe(struct of_device* dev, const struct of_device_id *match)
 	return i2c_add_driver(&therm_pm72_driver);
 }
 
-static int fcu_of_remove(struct of_device* dev)
+static int fcu_of_remove(struct platform_device* dev)
 {
 	i2c_del_driver(&therm_pm72_driver);
 
diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c
index 5c9367acf0cf..133f195de1fd 100644
--- a/drivers/macintosh/therm_windtunnel.c
+++ b/drivers/macintosh/therm_windtunnel.c
@@ -52,7 +52,7 @@ static struct {
 	struct task_struct	*poll_task;
 	
 	struct mutex	 	lock;
-	struct of_device	*of_dev;
+	struct platform_device	*of_dev;
 	
 	struct i2c_client	*thermostat;
 	struct i2c_client	*fan;
@@ -444,13 +444,13 @@ static struct i2c_driver g4fan_driver = {
 /************************************************************************/
 
 static int
-therm_of_probe( struct of_device *dev, const struct of_device_id *match )
+therm_of_probe( struct platform_device *dev, const struct of_device_id *match )
 {
 	return i2c_add_driver( &g4fan_driver );
 }
 
 static int
-therm_of_remove( struct of_device *dev )
+therm_of_remove( struct platform_device *dev )
 {
 	i2c_del_driver( &g4fan_driver );
 	return 0;
diff --git a/drivers/media/video/fsl-viu.c b/drivers/media/video/fsl-viu.c
index 8f1c94f7e00c..43d208f1f586 100644
--- a/drivers/media/video/fsl-viu.c
+++ b/drivers/media/video/fsl-viu.c
@@ -1418,7 +1418,7 @@ static struct video_device viu_template = {
 	.current_norm   = V4L2_STD_NTSC_M,
 };
 
-static int __devinit viu_of_probe(struct of_device *op,
+static int __devinit viu_of_probe(struct platform_device *op,
 				  const struct of_device_id *match)
 {
 	struct viu_dev *viu_dev;
@@ -1549,7 +1549,7 @@ err:
 	return ret;
 }
 
-static int __devexit viu_of_remove(struct of_device *op)
+static int __devexit viu_of_remove(struct platform_device *op)
 {
 	struct v4l2_device *v4l2_dev = dev_get_drvdata(&op->dev);
 	struct viu_dev *dev = container_of(v4l2_dev, struct viu_dev, v4l2_dev);
@@ -1570,7 +1570,7 @@ static int __devexit viu_of_remove(struct of_device *op)
 }
 
 #ifdef CONFIG_PM
-static int viu_suspend(struct of_device *op, pm_message_t state)
+static int viu_suspend(struct platform_device *op, pm_message_t state)
 {
 	struct v4l2_device *v4l2_dev = dev_get_drvdata(&op->dev);
 	struct viu_dev *dev = container_of(v4l2_dev, struct viu_dev, v4l2_dev);
@@ -1579,7 +1579,7 @@ static int viu_suspend(struct of_device *op, pm_message_t state)
 	return 0;
 }
 
-static int viu_resume(struct of_device *op)
+static int viu_resume(struct platform_device *op)
 {
 	struct v4l2_device *v4l2_dev = dev_get_drvdata(&op->dev);
 	struct viu_dev *dev = container_of(v4l2_dev, struct viu_dev, v4l2_dev);
diff --git a/drivers/mmc/host/sdhci-of-core.c b/drivers/mmc/host/sdhci-of-core.c
index a2e9820cd42f..a6bd448a3b46 100644
--- a/drivers/mmc/host/sdhci-of-core.c
+++ b/drivers/mmc/host/sdhci-of-core.c
@@ -85,14 +85,14 @@ void sdhci_be32bs_writeb(struct sdhci_host *host, u8 val, int reg)
 
 #ifdef CONFIG_PM
 
-static int sdhci_of_suspend(struct of_device *ofdev, pm_message_t state)
+static int sdhci_of_suspend(struct platform_device *ofdev, pm_message_t state)
 {
 	struct sdhci_host *host = dev_get_drvdata(&ofdev->dev);
 
 	return mmc_suspend_host(host->mmc);
 }
 
-static int sdhci_of_resume(struct of_device *ofdev)
+static int sdhci_of_resume(struct platform_device *ofdev)
 {
 	struct sdhci_host *host = dev_get_drvdata(&ofdev->dev);
 
@@ -115,7 +115,7 @@ static bool __devinit sdhci_of_wp_inverted(struct device_node *np)
 	return machine_is(mpc837x_rdb) || machine_is(mpc837x_mds);
 }
 
-static int __devinit sdhci_of_probe(struct of_device *ofdev,
+static int __devinit sdhci_of_probe(struct platform_device *ofdev,
 				 const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -179,7 +179,7 @@ err_addr_map:
 	return ret;
 }
 
-static int __devexit sdhci_of_remove(struct of_device *ofdev)
+static int __devexit sdhci_of_remove(struct platform_device *ofdev)
 {
 	struct sdhci_host *host = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c
index ba124baa646d..8f9bab8dc8da 100644
--- a/drivers/mtd/maps/physmap_of.c
+++ b/drivers/mtd/maps/physmap_of.c
@@ -43,7 +43,7 @@ struct of_flash {
 #ifdef CONFIG_MTD_PARTITIONS
 #define OF_FLASH_PARTS(info)	((info)->parts)
 
-static int parse_obsolete_partitions(struct of_device *dev,
+static int parse_obsolete_partitions(struct platform_device *dev,
 				     struct of_flash *info,
 				     struct device_node *dp)
 {
@@ -93,7 +93,7 @@ static int parse_obsolete_partitions(struct of_device *dev,
 #define parse_partitions(info, dev)	(0)
 #endif /* MTD_PARTITIONS */
 
-static int of_flash_remove(struct of_device *dev)
+static int of_flash_remove(struct platform_device *dev)
 {
 	struct of_flash *info;
 	int i;
@@ -140,7 +140,7 @@ static int of_flash_remove(struct of_device *dev)
 /* Helper function to handle probing of the obsolete "direct-mapped"
  * compatible binding, which has an extra "probe-type" property
  * describing the type of flash probe necessary. */
-static struct mtd_info * __devinit obsolete_probe(struct of_device *dev,
+static struct mtd_info * __devinit obsolete_probe(struct platform_device *dev,
 						  struct map_info *map)
 {
 	struct device_node *dp = dev->dev.of_node;
@@ -215,7 +215,7 @@ static void __devinit of_free_probes(const char **probes)
 }
 #endif
 
-static int __devinit of_flash_probe(struct of_device *dev,
+static int __devinit of_flash_probe(struct platform_device *dev,
 				    const struct of_device_id *match)
 {
 #ifdef CONFIG_MTD_PARTITIONS
diff --git a/drivers/mtd/maps/sun_uflash.c b/drivers/mtd/maps/sun_uflash.c
index 8984236a8d0a..3582ba1f9b09 100644
--- a/drivers/mtd/maps/sun_uflash.c
+++ b/drivers/mtd/maps/sun_uflash.c
@@ -48,7 +48,7 @@ struct map_info uflash_map_templ = {
 	.bankwidth =	UFLASH_BUSWIDTH,
 };
 
-int uflash_devinit(struct of_device *op, struct device_node *dp)
+int uflash_devinit(struct platform_device *op, struct device_node *dp)
 {
 	struct uflash_dev *up;
 
@@ -108,7 +108,7 @@ int uflash_devinit(struct of_device *op, struct device_node *dp)
 	return 0;
 }
 
-static int __devinit uflash_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit uflash_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
 
@@ -121,7 +121,7 @@ static int __devinit uflash_probe(struct of_device *op, const struct of_device_i
 	return uflash_devinit(op, dp);
 }
 
-static int __devexit uflash_remove(struct of_device *op)
+static int __devexit uflash_remove(struct platform_device *op)
 {
 	struct uflash_dev *up = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c
index 5084cc517944..80de0bff6c3a 100644
--- a/drivers/mtd/nand/fsl_elbc_nand.c
+++ b/drivers/mtd/nand/fsl_elbc_nand.c
@@ -958,7 +958,7 @@ static int __devinit fsl_elbc_ctrl_init(struct fsl_elbc_ctrl *ctrl)
 	return 0;
 }
 
-static int fsl_elbc_ctrl_remove(struct of_device *ofdev)
+static int fsl_elbc_ctrl_remove(struct platform_device *ofdev)
 {
 	struct fsl_elbc_ctrl *ctrl = dev_get_drvdata(&ofdev->dev);
 	int i;
@@ -1013,7 +1013,7 @@ static irqreturn_t fsl_elbc_ctrl_irq(int irqno, void *data)
  * in the chip probe function.
 */
 
-static int __devinit fsl_elbc_ctrl_probe(struct of_device *ofdev,
+static int __devinit fsl_elbc_ctrl_probe(struct platform_device *ofdev,
                                          const struct of_device_id *match)
 {
 	struct device_node *child;
diff --git a/drivers/mtd/nand/fsl_upm.c b/drivers/mtd/nand/fsl_upm.c
index 1312eda57ba6..4eff8b25e5af 100644
--- a/drivers/mtd/nand/fsl_upm.c
+++ b/drivers/mtd/nand/fsl_upm.c
@@ -217,7 +217,7 @@ err:
 	return ret;
 }
 
-static int __devinit fun_probe(struct of_device *ofdev,
+static int __devinit fun_probe(struct platform_device *ofdev,
 			       const struct of_device_id *ofid)
 {
 	struct fsl_upm_nand *fun;
@@ -335,7 +335,7 @@ err1:
 	return ret;
 }
 
-static int __devexit fun_remove(struct of_device *ofdev)
+static int __devexit fun_remove(struct platform_device *ofdev)
 {
 	struct fsl_upm_nand *fun = dev_get_drvdata(&ofdev->dev);
 	int i;
diff --git a/drivers/mtd/nand/mpc5121_nfc.c b/drivers/mtd/nand/mpc5121_nfc.c
index 0a130dcaa129..df0c1da4ff49 100644
--- a/drivers/mtd/nand/mpc5121_nfc.c
+++ b/drivers/mtd/nand/mpc5121_nfc.c
@@ -647,7 +647,7 @@ static void mpc5121_nfc_free(struct device *dev, struct mtd_info *mtd)
 		iounmap(prv->csreg);
 }
 
-static int __devinit mpc5121_nfc_probe(struct of_device *op,
+static int __devinit mpc5121_nfc_probe(struct platform_device *op,
 					const struct of_device_id *match)
 {
 	struct device_node *rootnode, *dn = op->dev.of_node;
@@ -869,7 +869,7 @@ error:
 	return retval;
 }
 
-static int __devexit mpc5121_nfc_remove(struct of_device *op)
+static int __devexit mpc5121_nfc_remove(struct platform_device *op)
 {
 	struct device *dev = &op->dev;
 	struct mtd_info *mtd = dev_get_drvdata(dev);
diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c
index 98fd2bdf8be1..510554e6c115 100644
--- a/drivers/mtd/nand/ndfc.c
+++ b/drivers/mtd/nand/ndfc.c
@@ -35,7 +35,7 @@
 
 
 struct ndfc_controller {
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 	void __iomem *ndfcbase;
 	struct mtd_info mtd;
 	struct nand_chip chip;
@@ -225,7 +225,7 @@ err:
 	return ret;
 }
 
-static int __devinit ndfc_probe(struct of_device *ofdev,
+static int __devinit ndfc_probe(struct platform_device *ofdev,
 				const struct of_device_id *match)
 {
 	struct ndfc_controller *ndfc = &ndfc_ctrl;
@@ -277,7 +277,7 @@ static int __devinit ndfc_probe(struct of_device *ofdev,
 	return 0;
 }
 
-static int __devexit ndfc_remove(struct of_device *ofdev)
+static int __devexit ndfc_remove(struct platform_device *ofdev)
 {
 	struct ndfc_controller *ndfc = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/mtd/nand/pasemi_nand.c b/drivers/mtd/nand/pasemi_nand.c
index f02af24d033a..6ddb2461d740 100644
--- a/drivers/mtd/nand/pasemi_nand.c
+++ b/drivers/mtd/nand/pasemi_nand.c
@@ -89,7 +89,7 @@ int pasemi_device_ready(struct mtd_info *mtd)
 	return !!(inl(lpcctl) & LBICTRL_LPCCTL_NR);
 }
 
-static int __devinit pasemi_nand_probe(struct of_device *ofdev,
+static int __devinit pasemi_nand_probe(struct platform_device *ofdev,
 				      const struct of_device_id *match)
 {
 	struct pci_dev *pdev;
@@ -185,7 +185,7 @@ static int __devinit pasemi_nand_probe(struct of_device *ofdev,
 	return err;
 }
 
-static int __devexit pasemi_nand_remove(struct of_device *ofdev)
+static int __devexit pasemi_nand_remove(struct platform_device *ofdev)
 {
 	struct nand_chip *chip;
 
diff --git a/drivers/mtd/nand/socrates_nand.c b/drivers/mtd/nand/socrates_nand.c
index cc728b12de82..a8e403eebedb 100644
--- a/drivers/mtd/nand/socrates_nand.c
+++ b/drivers/mtd/nand/socrates_nand.c
@@ -162,7 +162,7 @@ static const char *part_probes[] = { "cmdlinepart", NULL };
 /*
  * Probe for the NAND device.
  */
-static int __devinit socrates_nand_probe(struct of_device *ofdev,
+static int __devinit socrates_nand_probe(struct platform_device *ofdev,
 					 const struct of_device_id *ofid)
 {
 	struct socrates_nand_host *host;
@@ -276,7 +276,7 @@ out:
 /*
  * Remove a NAND device.
  */
-static int __devexit socrates_nand_remove(struct of_device *ofdev)
+static int __devexit socrates_nand_remove(struct platform_device *ofdev)
 {
 	struct socrates_nand_host *host = dev_get_drvdata(&ofdev->dev);
 	struct mtd_info *mtd = &host->mtd;
diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c
index af753936e835..b1bdc909090f 100644
--- a/drivers/net/can/mscan/mpc5xxx_can.c
+++ b/drivers/net/can/mscan/mpc5xxx_can.c
@@ -38,7 +38,7 @@
 
 struct mpc5xxx_can_data {
 	unsigned int type;
-	u32 (*get_clock)(struct of_device *ofdev, const char *clock_name,
+	u32 (*get_clock)(struct platform_device *ofdev, const char *clock_name,
 			 int *mscan_clksrc);
 };
 
@@ -48,7 +48,7 @@ static struct of_device_id __devinitdata mpc52xx_cdm_ids[] = {
 	{}
 };
 
-static u32 __devinit mpc52xx_can_get_clock(struct of_device *ofdev,
+static u32 __devinit mpc52xx_can_get_clock(struct platform_device *ofdev,
 					   const char *clock_name,
 					   int *mscan_clksrc)
 {
@@ -101,7 +101,7 @@ static u32 __devinit mpc52xx_can_get_clock(struct of_device *ofdev,
 	return freq;
 }
 #else /* !CONFIG_PPC_MPC52xx */
-static u32 __devinit mpc52xx_can_get_clock(struct of_device *ofdev,
+static u32 __devinit mpc52xx_can_get_clock(struct platform_device *ofdev,
 					   const char *clock_name,
 					   int *mscan_clksrc)
 {
@@ -129,7 +129,7 @@ static struct of_device_id __devinitdata mpc512x_clock_ids[] = {
 	{}
 };
 
-static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
+static u32 __devinit mpc512x_can_get_clock(struct platform_device *ofdev,
 					   const char *clock_name,
 					   int *mscan_clksrc)
 {
@@ -239,7 +239,7 @@ exit_unmap:
 	return freq;
 }
 #else /* !CONFIG_PPC_MPC512x */
-static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
+static u32 __devinit mpc512x_can_get_clock(struct platform_device *ofdev,
 					   const char *clock_name,
 					   int *mscan_clksrc)
 {
@@ -247,7 +247,7 @@ static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
 }
 #endif /* CONFIG_PPC_MPC512x */
 
-static int __devinit mpc5xxx_can_probe(struct of_device *ofdev,
+static int __devinit mpc5xxx_can_probe(struct platform_device *ofdev,
 				       const struct of_device_id *id)
 {
 	struct mpc5xxx_can_data *data = (struct mpc5xxx_can_data *)id->data;
@@ -317,7 +317,7 @@ exit_unmap_mem:
 	return err;
 }
 
-static int __devexit mpc5xxx_can_remove(struct of_device *ofdev)
+static int __devexit mpc5xxx_can_remove(struct platform_device *ofdev)
 {
 	struct net_device *dev = dev_get_drvdata(&ofdev->dev);
 	struct mscan_priv *priv = netdev_priv(dev);
@@ -334,7 +334,7 @@ static int __devexit mpc5xxx_can_remove(struct of_device *ofdev)
 
 #ifdef CONFIG_PM
 static struct mscan_regs saved_regs;
-static int mpc5xxx_can_suspend(struct of_device *ofdev, pm_message_t state)
+static int mpc5xxx_can_suspend(struct platform_device *ofdev, pm_message_t state)
 {
 	struct net_device *dev = dev_get_drvdata(&ofdev->dev);
 	struct mscan_priv *priv = netdev_priv(dev);
@@ -345,7 +345,7 @@ static int mpc5xxx_can_suspend(struct of_device *ofdev, pm_message_t state)
 	return 0;
 }
 
-static int mpc5xxx_can_resume(struct of_device *ofdev)
+static int mpc5xxx_can_resume(struct platform_device *ofdev)
 {
 	struct net_device *dev = dev_get_drvdata(&ofdev->dev);
 	struct mscan_priv *priv = netdev_priv(dev);
diff --git a/drivers/net/can/sja1000/sja1000_of_platform.c b/drivers/net/can/sja1000/sja1000_of_platform.c
index ac1a83d7c204..5bfccfdf3bbb 100644
--- a/drivers/net/can/sja1000/sja1000_of_platform.c
+++ b/drivers/net/can/sja1000/sja1000_of_platform.c
@@ -67,7 +67,7 @@ static void sja1000_ofp_write_reg(const struct sja1000_priv *priv,
 	out_8(priv->reg_base + reg, val);
 }
 
-static int __devexit sja1000_ofp_remove(struct of_device *ofdev)
+static int __devexit sja1000_ofp_remove(struct platform_device *ofdev)
 {
 	struct net_device *dev = dev_get_drvdata(&ofdev->dev);
 	struct sja1000_priv *priv = netdev_priv(dev);
@@ -87,7 +87,7 @@ static int __devexit sja1000_ofp_remove(struct of_device *ofdev)
 	return 0;
 }
 
-static int __devinit sja1000_ofp_probe(struct of_device *ofdev,
+static int __devinit sja1000_ofp_probe(struct platform_device *ofdev,
 				       const struct of_device_id *id)
 {
 	struct device_node *np = ofdev->dev.of_node;
diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index 0060e422f171..99a929964e3c 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -413,7 +413,7 @@ struct ehea_port_res {
 
 struct ehea_adapter {
 	u64 handle;
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 	struct ehea_port *port[EHEA_MAX_PORTS];
 	struct ehea_eq *neq;       /* notification event queue */
 	struct tasklet_struct neq_tasklet;
@@ -465,7 +465,7 @@ struct ehea_port {
 	struct net_device *netdev;
 	struct net_device_stats stats;
 	struct ehea_port_res port_res[EHEA_MAX_PORT_RES];
-	struct of_device  ofdev; /* Open Firmware Device */
+	struct platform_device  ofdev; /* Open Firmware Device */
 	struct ehea_mc_list *mc_list;	 /* Multicast MAC addresses */
 	struct vlan_group *vgrp;
 	struct ehea_eq *qp_eq;
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index 3beba70b7dea..897719b49f96 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -107,10 +107,10 @@ struct ehea_fw_handle_array ehea_fw_handles;
 struct ehea_bcmc_reg_array ehea_bcmc_regs;
 
 
-static int __devinit ehea_probe_adapter(struct of_device *dev,
+static int __devinit ehea_probe_adapter(struct platform_device *dev,
 					const struct of_device_id *id);
 
-static int __devexit ehea_remove(struct of_device *dev);
+static int __devexit ehea_remove(struct platform_device *dev);
 
 static struct of_device_id ehea_device_table[] = {
 	{
@@ -3376,7 +3376,7 @@ static ssize_t ehea_remove_port(struct device *dev,
 static DEVICE_ATTR(probe_port, S_IWUSR, NULL, ehea_probe_port);
 static DEVICE_ATTR(remove_port, S_IWUSR, NULL, ehea_remove_port);
 
-int ehea_create_device_sysfs(struct of_device *dev)
+int ehea_create_device_sysfs(struct platform_device *dev)
 {
 	int ret = device_create_file(&dev->dev, &dev_attr_probe_port);
 	if (ret)
@@ -3387,13 +3387,13 @@ out:
 	return ret;
 }
 
-void ehea_remove_device_sysfs(struct of_device *dev)
+void ehea_remove_device_sysfs(struct platform_device *dev)
 {
 	device_remove_file(&dev->dev, &dev_attr_probe_port);
 	device_remove_file(&dev->dev, &dev_attr_remove_port);
 }
 
-static int __devinit ehea_probe_adapter(struct of_device *dev,
+static int __devinit ehea_probe_adapter(struct platform_device *dev,
 					const struct of_device_id *id)
 {
 	struct ehea_adapter *adapter;
@@ -3492,7 +3492,7 @@ out:
 	return ret;
 }
 
-static int __devexit ehea_remove(struct of_device *dev)
+static int __devexit ehea_remove(struct platform_device *dev)
 {
 	struct ehea_adapter *adapter = dev_get_drvdata(&dev->dev);
 	int i;
diff --git a/drivers/net/fec_mpc52xx.c b/drivers/net/fec_mpc52xx.c
index d1a5b17b2a95..e3e10b4add9c 100644
--- a/drivers/net/fec_mpc52xx.c
+++ b/drivers/net/fec_mpc52xx.c
@@ -850,7 +850,7 @@ static const struct net_device_ops mpc52xx_fec_netdev_ops = {
 /* ======================================================================== */
 
 static int __devinit
-mpc52xx_fec_probe(struct of_device *op, const struct of_device_id *match)
+mpc52xx_fec_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	int rv;
 	struct net_device *ndev;
@@ -995,7 +995,7 @@ err_netdev:
 }
 
 static int
-mpc52xx_fec_remove(struct of_device *op)
+mpc52xx_fec_remove(struct platform_device *op)
 {
 	struct net_device *ndev;
 	struct mpc52xx_fec_priv *priv;
@@ -1025,7 +1025,7 @@ mpc52xx_fec_remove(struct of_device *op)
 }
 
 #ifdef CONFIG_PM
-static int mpc52xx_fec_of_suspend(struct of_device *op, pm_message_t state)
+static int mpc52xx_fec_of_suspend(struct platform_device *op, pm_message_t state)
 {
 	struct net_device *dev = dev_get_drvdata(&op->dev);
 
@@ -1035,7 +1035,7 @@ static int mpc52xx_fec_of_suspend(struct of_device *op, pm_message_t state)
 	return 0;
 }
 
-static int mpc52xx_fec_of_resume(struct of_device *op)
+static int mpc52xx_fec_of_resume(struct platform_device *op)
 {
 	struct net_device *dev = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/net/fec_mpc52xx_phy.c b/drivers/net/fec_mpc52xx_phy.c
index dbaf72cbb233..0b4cb6f15984 100644
--- a/drivers/net/fec_mpc52xx_phy.c
+++ b/drivers/net/fec_mpc52xx_phy.c
@@ -61,7 +61,7 @@ static int mpc52xx_fec_mdio_write(struct mii_bus *bus, int phy_id, int reg,
 		data | FEC_MII_WRITE_FRAME);
 }
 
-static int mpc52xx_fec_mdio_probe(struct of_device *of,
+static int mpc52xx_fec_mdio_probe(struct platform_device *of,
 		const struct of_device_id *match)
 {
 	struct device *dev = &of->dev;
@@ -122,7 +122,7 @@ static int mpc52xx_fec_mdio_probe(struct of_device *of,
 	return err;
 }
 
-static int mpc52xx_fec_mdio_remove(struct of_device *of)
+static int mpc52xx_fec_mdio_remove(struct platform_device *of)
 {
 	struct device *dev = &of->dev;
 	struct mii_bus *bus = dev_get_drvdata(dev);
diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c
index f08cff9020bd..d6e3111959ab 100644
--- a/drivers/net/fs_enet/fs_enet-main.c
+++ b/drivers/net/fs_enet/fs_enet-main.c
@@ -997,7 +997,7 @@ static const struct net_device_ops fs_enet_netdev_ops = {
 #endif
 };
 
-static int __devinit fs_enet_probe(struct of_device *ofdev,
+static int __devinit fs_enet_probe(struct platform_device *ofdev,
                                    const struct of_device_id *match)
 {
 	struct net_device *ndev;
@@ -1105,7 +1105,7 @@ out_free_fpi:
 	return ret;
 }
 
-static int fs_enet_remove(struct of_device *ofdev)
+static int fs_enet_remove(struct platform_device *ofdev)
 {
 	struct net_device *ndev = dev_get_drvdata(&ofdev->dev);
 	struct fs_enet_private *fep = netdev_priv(ndev);
diff --git a/drivers/net/fs_enet/mac-fcc.c b/drivers/net/fs_enet/mac-fcc.c
index 48e91b6242ce..7a84e45487e8 100644
--- a/drivers/net/fs_enet/mac-fcc.c
+++ b/drivers/net/fs_enet/mac-fcc.c
@@ -84,7 +84,7 @@ static inline int fcc_cr_cmd(struct fs_enet_private *fep, u32 op)
 
 static int do_pd_setup(struct fs_enet_private *fep)
 {
-	struct of_device *ofdev = to_of_device(fep->dev);
+	struct platform_device *ofdev = to_platform_device(fep->dev);
 	struct fs_platform_info *fpi = fep->fpi;
 	int ret = -EINVAL;
 
diff --git a/drivers/net/fs_enet/mac-fec.c b/drivers/net/fs_enet/mac-fec.c
index 7ca1642276d0..61035fc5599b 100644
--- a/drivers/net/fs_enet/mac-fec.c
+++ b/drivers/net/fs_enet/mac-fec.c
@@ -96,7 +96,7 @@ static int whack_reset(struct fec __iomem *fecp)
 
 static int do_pd_setup(struct fs_enet_private *fep)
 {
-	struct of_device *ofdev = to_of_device(fep->dev);
+	struct platform_device *ofdev = to_platform_device(fep->dev);
 
 	fep->interrupt = of_irq_to_resource(ofdev->dev.of_node, 0, NULL);
 	if (fep->interrupt == NO_IRQ)
diff --git a/drivers/net/fs_enet/mac-scc.c b/drivers/net/fs_enet/mac-scc.c
index a3c44544846d..22a02a767069 100644
--- a/drivers/net/fs_enet/mac-scc.c
+++ b/drivers/net/fs_enet/mac-scc.c
@@ -96,7 +96,7 @@ static inline int scc_cr_cmd(struct fs_enet_private *fep, u32 op)
 
 static int do_pd_setup(struct fs_enet_private *fep)
 {
-	struct of_device *ofdev = to_of_device(fep->dev);
+	struct platform_device *ofdev = to_platform_device(fep->dev);
 
 	fep->interrupt = of_irq_to_resource(ofdev->dev.of_node, 0, NULL);
 	if (fep->interrupt == NO_IRQ)
diff --git a/drivers/net/fs_enet/mii-bitbang.c b/drivers/net/fs_enet/mii-bitbang.c
index 3607340f3da7..3cda2b515471 100644
--- a/drivers/net/fs_enet/mii-bitbang.c
+++ b/drivers/net/fs_enet/mii-bitbang.c
@@ -150,7 +150,7 @@ static int __devinit fs_mii_bitbang_init(struct mii_bus *bus,
 	return 0;
 }
 
-static int __devinit fs_enet_mdio_probe(struct of_device *ofdev,
+static int __devinit fs_enet_mdio_probe(struct platform_device *ofdev,
                                         const struct of_device_id *match)
 {
 	struct mii_bus *new_bus;
@@ -200,7 +200,7 @@ out:
 	return ret;
 }
 
-static int fs_enet_mdio_remove(struct of_device *ofdev)
+static int fs_enet_mdio_remove(struct platform_device *ofdev)
 {
 	struct mii_bus *bus = dev_get_drvdata(&ofdev->dev);
 	struct bb_info *bitbang = bus->priv;
diff --git a/drivers/net/fs_enet/mii-fec.c b/drivers/net/fs_enet/mii-fec.c
index bddffd169b93..dbb9c48623df 100644
--- a/drivers/net/fs_enet/mii-fec.c
+++ b/drivers/net/fs_enet/mii-fec.c
@@ -101,7 +101,7 @@ static int fs_enet_fec_mii_reset(struct mii_bus *bus)
 	return 0;
 }
 
-static int __devinit fs_enet_mdio_probe(struct of_device *ofdev,
+static int __devinit fs_enet_mdio_probe(struct platform_device *ofdev,
                                         const struct of_device_id *match)
 {
 	struct resource res;
@@ -192,7 +192,7 @@ out:
 	return ret;
 }
 
-static int fs_enet_mdio_remove(struct of_device *ofdev)
+static int fs_enet_mdio_remove(struct platform_device *ofdev)
 {
 	struct mii_bus *bus = dev_get_drvdata(&ofdev->dev);
 	struct fec_info *fec = bus->priv;
diff --git a/drivers/net/fsl_pq_mdio.c b/drivers/net/fsl_pq_mdio.c
index f53f850b6418..d4bf91aac25f 100644
--- a/drivers/net/fsl_pq_mdio.c
+++ b/drivers/net/fsl_pq_mdio.c
@@ -265,7 +265,7 @@ static int get_ucc_id_for_range(u64 start, u64 end, u32 *ucc_id)
 #endif
 
 
-static int fsl_pq_mdio_probe(struct of_device *ofdev,
+static int fsl_pq_mdio_probe(struct platform_device *ofdev,
 		const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -425,7 +425,7 @@ err_free_priv:
 }
 
 
-static int fsl_pq_mdio_remove(struct of_device *ofdev)
+static int fsl_pq_mdio_remove(struct platform_device *ofdev)
 {
 	struct device *device = &ofdev->dev;
 	struct mii_bus *bus = dev_get_drvdata(device);
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index a1b6301bc674..4f7c3f3ca234 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -122,9 +122,9 @@ static irqreturn_t gfar_interrupt(int irq, void *dev_id);
 static void adjust_link(struct net_device *dev);
 static void init_registers(struct net_device *dev);
 static int init_phy(struct net_device *dev);
-static int gfar_probe(struct of_device *ofdev,
+static int gfar_probe(struct platform_device *ofdev,
 		const struct of_device_id *match);
-static int gfar_remove(struct of_device *ofdev);
+static int gfar_remove(struct platform_device *ofdev);
 static void free_skb_resources(struct gfar_private *priv);
 static void gfar_set_multi(struct net_device *dev);
 static void gfar_set_hash_for_addr(struct net_device *dev, u8 *addr);
@@ -605,7 +605,7 @@ static int gfar_parse_group(struct device_node *np,
 	return 0;
 }
 
-static int gfar_of_init(struct of_device *ofdev, struct net_device **pdev)
+static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 {
 	const char *model;
 	const char *ctype;
@@ -959,7 +959,7 @@ static void gfar_detect_errata(struct gfar_private *priv)
 
 /* Set up the ethernet device structure, private data,
  * and anything else we need before we start */
-static int gfar_probe(struct of_device *ofdev,
+static int gfar_probe(struct platform_device *ofdev,
 		const struct of_device_id *match)
 {
 	u32 tempval;
@@ -1238,7 +1238,7 @@ register_fail:
 	return err;
 }
 
-static int gfar_remove(struct of_device *ofdev)
+static int gfar_remove(struct platform_device *ofdev)
 {
 	struct gfar_private *priv = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 710810e2adb4..68984eb88ae0 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -1054,7 +1054,7 @@ struct gfar_private {
 
 	struct device_node *node;
 	struct net_device *ndev;
-	struct of_device *ofdev;
+	struct platform_device *ofdev;
 	enum gfar_errata errata;
 
 	struct gfar_priv_grp gfargrp[MAXGROUPS];
diff --git a/drivers/net/greth.c b/drivers/net/greth.c
index 4d09eab3548e..9ec4b7f28ca4 100644
--- a/drivers/net/greth.c
+++ b/drivers/net/greth.c
@@ -1373,7 +1373,7 @@ error:
 }
 
 /* Initialize the GRETH MAC */
-static int __devinit greth_of_probe(struct of_device *ofdev, const struct of_device_id *match)
+static int __devinit greth_of_probe(struct platform_device *ofdev, const struct of_device_id *match)
 {
 	struct net_device *dev;
 	struct greth_private *greth;
@@ -1572,7 +1572,7 @@ error1:
 	return err;
 }
 
-static int __devexit greth_of_remove(struct of_device *of_dev)
+static int __devexit greth_of_remove(struct platform_device *of_dev)
 {
 	struct net_device *ndev = dev_get_drvdata(&of_dev->dev);
 	struct greth_private *greth = netdev_priv(ndev);
diff --git a/drivers/net/greth.h b/drivers/net/greth.h
index 973388d6abca..03ad903cd676 100644
--- a/drivers/net/greth.h
+++ b/drivers/net/greth.h
@@ -118,7 +118,7 @@ struct greth_private {
 
 	int irq;
 
-	struct device *dev;	        /* Pointer to of_device->dev */
+	struct device *dev;	        /* Pointer to platform_device->dev */
 	struct net_device *netdev;
 	struct napi_struct napi;
 	spinlock_t devlock;
diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index eeec7bc2ce74..3506fd6ad726 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -2245,7 +2245,7 @@ static int emac_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
 struct emac_depentry {
 	u32			phandle;
 	struct device_node	*node;
-	struct of_device	*ofdev;
+	struct platform_device	*ofdev;
 	void			*drvdata;
 };
 
@@ -2719,7 +2719,7 @@ static const struct net_device_ops emac_gige_netdev_ops = {
 	.ndo_change_mtu		= emac_change_mtu,
 };
 
-static int __devinit emac_probe(struct of_device *ofdev,
+static int __devinit emac_probe(struct platform_device *ofdev,
 				const struct of_device_id *match)
 {
 	struct net_device *ndev;
@@ -2941,7 +2941,7 @@ static int __devinit emac_probe(struct of_device *ofdev,
 	return err;
 }
 
-static int __devexit emac_remove(struct of_device *ofdev)
+static int __devexit emac_remove(struct platform_device *ofdev)
 {
 	struct emac_instance *dev = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/net/ibm_newemac/core.h b/drivers/net/ibm_newemac/core.h
index b1cbe6fdfc7a..9e37e3d9c51d 100644
--- a/drivers/net/ibm_newemac/core.h
+++ b/drivers/net/ibm_newemac/core.h
@@ -170,12 +170,12 @@ struct emac_instance {
 	struct net_device		*ndev;
 	struct resource			rsrc_regs;
 	struct emac_regs		__iomem *emacp;
-	struct of_device		*ofdev;
+	struct platform_device		*ofdev;
 	struct device_node		**blist; /* bootlist entry */
 
 	/* MAL linkage */
 	u32				mal_ph;
-	struct of_device		*mal_dev;
+	struct platform_device		*mal_dev;
 	u32				mal_rx_chan;
 	u32				mal_tx_chan;
 	struct mal_instance		*mal;
@@ -196,24 +196,24 @@ struct emac_instance {
 
 	/* Shared MDIO if any */
 	u32				mdio_ph;
-	struct of_device		*mdio_dev;
+	struct platform_device		*mdio_dev;
 	struct emac_instance		*mdio_instance;
 	struct mutex			mdio_lock;
 
 	/* ZMII infos if any */
 	u32				zmii_ph;
 	u32				zmii_port;
-	struct of_device		*zmii_dev;
+	struct platform_device		*zmii_dev;
 
 	/* RGMII infos if any */
 	u32				rgmii_ph;
 	u32				rgmii_port;
-	struct of_device		*rgmii_dev;
+	struct platform_device		*rgmii_dev;
 
 	/* TAH infos if any */
 	u32				tah_ph;
 	u32				tah_port;
-	struct of_device		*tah_dev;
+	struct platform_device		*tah_dev;
 
 	/* IRQs */
 	int				wol_irq;
diff --git a/drivers/net/ibm_newemac/mal.c b/drivers/net/ibm_newemac/mal.c
index fcff9e0bd382..d5717e2123e1 100644
--- a/drivers/net/ibm_newemac/mal.c
+++ b/drivers/net/ibm_newemac/mal.c
@@ -517,7 +517,7 @@ void *mal_dump_regs(struct mal_instance *mal, void *buf)
 	return regs + 1;
 }
 
-static int __devinit mal_probe(struct of_device *ofdev,
+static int __devinit mal_probe(struct platform_device *ofdev,
 			       const struct of_device_id *match)
 {
 	struct mal_instance *mal;
@@ -730,7 +730,7 @@ static int __devinit mal_probe(struct of_device *ofdev,
 	return err;
 }
 
-static int __devexit mal_remove(struct of_device *ofdev)
+static int __devexit mal_remove(struct platform_device *ofdev)
 {
 	struct mal_instance *mal = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/net/ibm_newemac/mal.h b/drivers/net/ibm_newemac/mal.h
index 9ededfbf0726..66084214bf45 100644
--- a/drivers/net/ibm_newemac/mal.h
+++ b/drivers/net/ibm_newemac/mal.h
@@ -210,7 +210,7 @@ struct mal_instance {
 	dma_addr_t		bd_dma;
 	struct mal_descriptor	*bd_virt;
 
-	struct of_device	*ofdev;
+	struct platform_device	*ofdev;
 	int			index;
 	spinlock_t		lock;
 
diff --git a/drivers/net/ibm_newemac/rgmii.c b/drivers/net/ibm_newemac/rgmii.c
index 108919bcdf13..dd61798897ac 100644
--- a/drivers/net/ibm_newemac/rgmii.c
+++ b/drivers/net/ibm_newemac/rgmii.c
@@ -93,7 +93,7 @@ static inline u32 rgmii_mode_mask(int mode, int input)
 	}
 }
 
-int __devinit rgmii_attach(struct of_device *ofdev, int input, int mode)
+int __devinit rgmii_attach(struct platform_device *ofdev, int input, int mode)
 {
 	struct rgmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct rgmii_regs __iomem *p = dev->base;
@@ -122,7 +122,7 @@ int __devinit rgmii_attach(struct of_device *ofdev, int input, int mode)
 	return 0;
 }
 
-void rgmii_set_speed(struct of_device *ofdev, int input, int speed)
+void rgmii_set_speed(struct platform_device *ofdev, int input, int speed)
 {
 	struct rgmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct rgmii_regs __iomem *p = dev->base;
@@ -144,7 +144,7 @@ void rgmii_set_speed(struct of_device *ofdev, int input, int speed)
 	mutex_unlock(&dev->lock);
 }
 
-void rgmii_get_mdio(struct of_device *ofdev, int input)
+void rgmii_get_mdio(struct platform_device *ofdev, int input)
 {
 	struct rgmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct rgmii_regs __iomem *p = dev->base;
@@ -165,7 +165,7 @@ void rgmii_get_mdio(struct of_device *ofdev, int input)
 	DBG2(dev, " fer = 0x%08x\n", fer);
 }
 
-void rgmii_put_mdio(struct of_device *ofdev, int input)
+void rgmii_put_mdio(struct platform_device *ofdev, int input)
 {
 	struct rgmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct rgmii_regs __iomem *p = dev->base;
@@ -186,7 +186,7 @@ void rgmii_put_mdio(struct of_device *ofdev, int input)
 	mutex_unlock(&dev->lock);
 }
 
-void rgmii_detach(struct of_device *ofdev, int input)
+void rgmii_detach(struct platform_device *ofdev, int input)
 {
 	struct rgmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct rgmii_regs __iomem *p;
@@ -206,13 +206,13 @@ void rgmii_detach(struct of_device *ofdev, int input)
 	mutex_unlock(&dev->lock);
 }
 
-int rgmii_get_regs_len(struct of_device *ofdev)
+int rgmii_get_regs_len(struct platform_device *ofdev)
 {
 	return sizeof(struct emac_ethtool_regs_subhdr) +
 		sizeof(struct rgmii_regs);
 }
 
-void *rgmii_dump_regs(struct of_device *ofdev, void *buf)
+void *rgmii_dump_regs(struct platform_device *ofdev, void *buf)
 {
 	struct rgmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct emac_ethtool_regs_subhdr *hdr = buf;
@@ -228,7 +228,7 @@ void *rgmii_dump_regs(struct of_device *ofdev, void *buf)
 }
 
 
-static int __devinit rgmii_probe(struct of_device *ofdev,
+static int __devinit rgmii_probe(struct platform_device *ofdev,
 				 const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -293,7 +293,7 @@ static int __devinit rgmii_probe(struct of_device *ofdev,
 	return rc;
 }
 
-static int __devexit rgmii_remove(struct of_device *ofdev)
+static int __devexit rgmii_remove(struct platform_device *ofdev)
 {
 	struct rgmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/net/ibm_newemac/rgmii.h b/drivers/net/ibm_newemac/rgmii.h
index c4a4b358a270..d69799049865 100644
--- a/drivers/net/ibm_newemac/rgmii.h
+++ b/drivers/net/ibm_newemac/rgmii.h
@@ -51,20 +51,20 @@ struct rgmii_instance {
 	int				users;
 
 	/* OF device instance */
-	struct of_device		*ofdev;
+	struct platform_device		*ofdev;
 };
 
 #ifdef CONFIG_IBM_NEW_EMAC_RGMII
 
 extern int rgmii_init(void);
 extern void rgmii_exit(void);
-extern int rgmii_attach(struct of_device *ofdev, int input, int mode);
-extern void rgmii_detach(struct of_device *ofdev, int input);
-extern void rgmii_get_mdio(struct of_device *ofdev, int input);
-extern void rgmii_put_mdio(struct of_device *ofdev, int input);
-extern void rgmii_set_speed(struct of_device *ofdev, int input, int speed);
-extern int rgmii_get_regs_len(struct of_device *ofdev);
-extern void *rgmii_dump_regs(struct of_device *ofdev, void *buf);
+extern int rgmii_attach(struct platform_device *ofdev, int input, int mode);
+extern void rgmii_detach(struct platform_device *ofdev, int input);
+extern void rgmii_get_mdio(struct platform_device *ofdev, int input);
+extern void rgmii_put_mdio(struct platform_device *ofdev, int input);
+extern void rgmii_set_speed(struct platform_device *ofdev, int input, int speed);
+extern int rgmii_get_regs_len(struct platform_device *ofdev);
+extern void *rgmii_dump_regs(struct platform_device *ofdev, void *buf);
 
 #else
 
diff --git a/drivers/net/ibm_newemac/tah.c b/drivers/net/ibm_newemac/tah.c
index 044637144c43..299aa49490c0 100644
--- a/drivers/net/ibm_newemac/tah.c
+++ b/drivers/net/ibm_newemac/tah.c
@@ -23,7 +23,7 @@
 #include "emac.h"
 #include "core.h"
 
-int __devinit tah_attach(struct of_device *ofdev, int channel)
+int __devinit tah_attach(struct platform_device *ofdev, int channel)
 {
 	struct tah_instance *dev = dev_get_drvdata(&ofdev->dev);
 
@@ -35,7 +35,7 @@ int __devinit tah_attach(struct of_device *ofdev, int channel)
 	return 0;
 }
 
-void tah_detach(struct of_device *ofdev, int channel)
+void tah_detach(struct platform_device *ofdev, int channel)
 {
 	struct tah_instance *dev = dev_get_drvdata(&ofdev->dev);
 
@@ -44,7 +44,7 @@ void tah_detach(struct of_device *ofdev, int channel)
 	mutex_unlock(&dev->lock);
 }
 
-void tah_reset(struct of_device *ofdev)
+void tah_reset(struct platform_device *ofdev)
 {
 	struct tah_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct tah_regs __iomem *p = dev->base;
@@ -66,13 +66,13 @@ void tah_reset(struct of_device *ofdev)
 		 TAH_MR_DIG);
 }
 
-int tah_get_regs_len(struct of_device *ofdev)
+int tah_get_regs_len(struct platform_device *ofdev)
 {
 	return sizeof(struct emac_ethtool_regs_subhdr) +
 		sizeof(struct tah_regs);
 }
 
-void *tah_dump_regs(struct of_device *ofdev, void *buf)
+void *tah_dump_regs(struct platform_device *ofdev, void *buf)
 {
 	struct tah_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct emac_ethtool_regs_subhdr *hdr = buf;
@@ -87,7 +87,7 @@ void *tah_dump_regs(struct of_device *ofdev, void *buf)
 	return regs + 1;
 }
 
-static int __devinit tah_probe(struct of_device *ofdev,
+static int __devinit tah_probe(struct platform_device *ofdev,
 			       const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -139,7 +139,7 @@ static int __devinit tah_probe(struct of_device *ofdev,
 	return rc;
 }
 
-static int __devexit tah_remove(struct of_device *ofdev)
+static int __devexit tah_remove(struct platform_device *ofdev)
 {
 	struct tah_instance *dev = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/net/ibm_newemac/tah.h b/drivers/net/ibm_newemac/tah.h
index a068b5658dad..61dbeca006d1 100644
--- a/drivers/net/ibm_newemac/tah.h
+++ b/drivers/net/ibm_newemac/tah.h
@@ -48,7 +48,7 @@ struct tah_instance {
 	int				users;
 
 	/* OF device instance */
-	struct of_device		*ofdev;
+	struct platform_device		*ofdev;
 };
 
 
@@ -74,11 +74,11 @@ struct tah_instance {
 
 extern int tah_init(void);
 extern void tah_exit(void);
-extern int tah_attach(struct of_device *ofdev, int channel);
-extern void tah_detach(struct of_device *ofdev, int channel);
-extern void tah_reset(struct of_device *ofdev);
-extern int tah_get_regs_len(struct of_device *ofdev);
-extern void *tah_dump_regs(struct of_device *ofdev, void *buf);
+extern int tah_attach(struct platform_device *ofdev, int channel);
+extern void tah_detach(struct platform_device *ofdev, int channel);
+extern void tah_reset(struct platform_device *ofdev);
+extern int tah_get_regs_len(struct platform_device *ofdev);
+extern void *tah_dump_regs(struct platform_device *ofdev, void *buf);
 
 #else
 
diff --git a/drivers/net/ibm_newemac/zmii.c b/drivers/net/ibm_newemac/zmii.c
index 046dcd069c45..34ed6ee8ca8a 100644
--- a/drivers/net/ibm_newemac/zmii.c
+++ b/drivers/net/ibm_newemac/zmii.c
@@ -82,7 +82,7 @@ static inline u32 zmii_mode_mask(int mode, int input)
 	}
 }
 
-int __devinit zmii_attach(struct of_device *ofdev, int input, int *mode)
+int __devinit zmii_attach(struct platform_device *ofdev, int input, int *mode)
 {
 	struct zmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct zmii_regs __iomem *p = dev->base;
@@ -148,7 +148,7 @@ int __devinit zmii_attach(struct of_device *ofdev, int input, int *mode)
 	return 0;
 }
 
-void zmii_get_mdio(struct of_device *ofdev, int input)
+void zmii_get_mdio(struct platform_device *ofdev, int input)
 {
 	struct zmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	u32 fer;
@@ -161,7 +161,7 @@ void zmii_get_mdio(struct of_device *ofdev, int input)
 	out_be32(&dev->base->fer, fer | ZMII_FER_MDI(input));
 }
 
-void zmii_put_mdio(struct of_device *ofdev, int input)
+void zmii_put_mdio(struct platform_device *ofdev, int input)
 {
 	struct zmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 
@@ -170,7 +170,7 @@ void zmii_put_mdio(struct of_device *ofdev, int input)
 }
 
 
-void zmii_set_speed(struct of_device *ofdev, int input, int speed)
+void zmii_set_speed(struct platform_device *ofdev, int input, int speed)
 {
 	struct zmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	u32 ssr;
@@ -191,7 +191,7 @@ void zmii_set_speed(struct of_device *ofdev, int input, int speed)
 	mutex_unlock(&dev->lock);
 }
 
-void zmii_detach(struct of_device *ofdev, int input)
+void zmii_detach(struct platform_device *ofdev, int input)
 {
 	struct zmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 
@@ -210,13 +210,13 @@ void zmii_detach(struct of_device *ofdev, int input)
 	mutex_unlock(&dev->lock);
 }
 
-int zmii_get_regs_len(struct of_device *ofdev)
+int zmii_get_regs_len(struct platform_device *ofdev)
 {
 	return sizeof(struct emac_ethtool_regs_subhdr) +
 		sizeof(struct zmii_regs);
 }
 
-void *zmii_dump_regs(struct of_device *ofdev, void *buf)
+void *zmii_dump_regs(struct platform_device *ofdev, void *buf)
 {
 	struct zmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 	struct emac_ethtool_regs_subhdr *hdr = buf;
@@ -231,7 +231,7 @@ void *zmii_dump_regs(struct of_device *ofdev, void *buf)
 	return regs + 1;
 }
 
-static int __devinit zmii_probe(struct of_device *ofdev,
+static int __devinit zmii_probe(struct platform_device *ofdev,
 				const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -286,7 +286,7 @@ static int __devinit zmii_probe(struct of_device *ofdev,
 	return rc;
 }
 
-static int __devexit zmii_remove(struct of_device *ofdev)
+static int __devexit zmii_remove(struct platform_device *ofdev)
 {
 	struct zmii_instance *dev = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/net/ibm_newemac/zmii.h b/drivers/net/ibm_newemac/zmii.h
index 6c9beba0c4b6..1333fa2b2781 100644
--- a/drivers/net/ibm_newemac/zmii.h
+++ b/drivers/net/ibm_newemac/zmii.h
@@ -48,20 +48,20 @@ struct zmii_instance {
 	u32				fer_save;
 
 	/* OF device instance */
-	struct of_device		*ofdev;
+	struct platform_device		*ofdev;
 };
 
 #ifdef CONFIG_IBM_NEW_EMAC_ZMII
 
 extern int zmii_init(void);
 extern void zmii_exit(void);
-extern int zmii_attach(struct of_device *ofdev, int input, int *mode);
-extern void zmii_detach(struct of_device *ofdev, int input);
-extern void zmii_get_mdio(struct of_device *ofdev, int input);
-extern void zmii_put_mdio(struct of_device *ofdev, int input);
-extern void zmii_set_speed(struct of_device *ofdev, int input, int speed);
-extern int zmii_get_regs_len(struct of_device *ocpdev);
-extern void *zmii_dump_regs(struct of_device *ofdev, void *buf);
+extern int zmii_attach(struct platform_device *ofdev, int input, int *mode);
+extern void zmii_detach(struct platform_device *ofdev, int input);
+extern void zmii_get_mdio(struct platform_device *ofdev, int input);
+extern void zmii_put_mdio(struct platform_device *ofdev, int input);
+extern void zmii_set_speed(struct platform_device *ofdev, int input, int speed);
+extern int zmii_get_regs_len(struct platform_device *ocpdev);
+extern void *zmii_dump_regs(struct platform_device *ofdev, void *buf);
 
 #else
 # define zmii_init()		0
diff --git a/drivers/net/ll_temac_main.c b/drivers/net/ll_temac_main.c
index 4eea3f70c5cf..c7b624711f5e 100644
--- a/drivers/net/ll_temac_main.c
+++ b/drivers/net/ll_temac_main.c
@@ -159,7 +159,7 @@ static void temac_dma_dcr_out(struct temac_local *lp, int reg, u32 value)
  * temac_dcr_setup - If the DMA is DCR based, then setup the address and
  * I/O  functions
  */
-static int temac_dcr_setup(struct temac_local *lp, struct of_device *op,
+static int temac_dcr_setup(struct temac_local *lp, struct platform_device *op,
 				struct device_node *np)
 {
 	unsigned int dcrs;
@@ -184,7 +184,7 @@ static int temac_dcr_setup(struct temac_local *lp, struct of_device *op,
  * temac_dcr_setup - This is a stub for when DCR is not supported,
  * such as with MicroBlaze
  */
-static int temac_dcr_setup(struct temac_local *lp, struct of_device *op,
+static int temac_dcr_setup(struct temac_local *lp, struct platform_device *op,
 				struct device_node *np)
 {
 	return -1;
@@ -952,7 +952,7 @@ static const struct attribute_group temac_attr_group = {
 };
 
 static int __init
-temac_of_probe(struct of_device *op, const struct of_device_id *match)
+temac_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *np;
 	struct temac_local *lp;
@@ -1094,7 +1094,7 @@ temac_of_probe(struct of_device *op, const struct of_device_id *match)
 	return rc;
 }
 
-static int __devexit temac_of_remove(struct of_device *op)
+static int __devexit temac_of_remove(struct platform_device *op)
 {
 	struct net_device *ndev = dev_get_drvdata(&op->dev);
 	struct temac_local *lp = netdev_priv(ndev);
diff --git a/drivers/net/myri_sbus.c b/drivers/net/myri_sbus.c
index 04e552aa14ec..617f898ba5f0 100644
--- a/drivers/net/myri_sbus.c
+++ b/drivers/net/myri_sbus.c
@@ -926,7 +926,7 @@ static const struct net_device_ops myri_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
-static int __devinit myri_sbus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit myri_sbus_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
 	static unsigned version_printed;
@@ -1124,7 +1124,7 @@ err:
 	return -ENODEV;
 }
 
-static int __devexit myri_sbus_remove(struct of_device *op)
+static int __devexit myri_sbus_remove(struct platform_device *op)
 {
 	struct myri_eth *mp = dev_get_drvdata(&op->dev);
 	struct net_device *net_dev = mp->dev;
diff --git a/drivers/net/myri_sbus.h b/drivers/net/myri_sbus.h
index ff363e95d9cf..80a2fa5cf757 100644
--- a/drivers/net/myri_sbus.h
+++ b/drivers/net/myri_sbus.h
@@ -288,7 +288,7 @@ struct myri_eth {
 	struct myri_eeprom		eeprom;		/* Local copy of EEPROM.      */
 	unsigned int			reg_size;	/* Size of register space.    */
 	unsigned int			shmem_base;	/* Offset to shared ram.      */
-	struct of_device		*myri_op;	/* Our OF device struct.    */
+	struct platform_device		*myri_op;	/* Our OF device struct.    */
 };
 
 /* We use this to acquire receive skb's that we can DMA directly into. */
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index 404f2d552888..bc695d53cdcc 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -9103,7 +9103,7 @@ retry:
 static int __devinit niu_n2_irq_init(struct niu *np, u8 *ldg_num_map)
 {
 #ifdef CONFIG_SPARC64
-	struct of_device *op = np->op;
+	struct platform_device *op = np->op;
 	const u32 *int_prop;
 	int i;
 
@@ -9688,7 +9688,7 @@ static void __devinit niu_driver_version(void)
 
 static struct net_device * __devinit niu_alloc_and_init(
 	struct device *gen_dev, struct pci_dev *pdev,
-	struct of_device *op, const struct niu_ops *ops,
+	struct platform_device *op, const struct niu_ops *ops,
 	u8 port)
 {
 	struct net_device *dev;
@@ -10064,7 +10064,7 @@ static const struct niu_ops niu_phys_ops = {
 	.unmap_single	= niu_phys_unmap_single,
 };
 
-static int __devinit niu_of_probe(struct of_device *op,
+static int __devinit niu_of_probe(struct platform_device *op,
 				  const struct of_device_id *match)
 {
 	union niu_parent_id parent_id;
@@ -10179,7 +10179,7 @@ err_out:
 	return err;
 }
 
-static int __devexit niu_of_remove(struct of_device *op)
+static int __devexit niu_of_remove(struct platform_device *op)
 {
 	struct net_device *dev = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index fc5fef2a8175..f62c7b717bc8 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -188,7 +188,7 @@ static int __devexit mdio_gpio_remove(struct platform_device *pdev)
 
 #ifdef CONFIG_OF_GPIO
 
-static int __devinit mdio_ofgpio_probe(struct of_device *ofdev,
+static int __devinit mdio_ofgpio_probe(struct platform_device *ofdev,
                                         const struct of_device_id *match)
 {
 	struct mdio_gpio_platform_data *pdata;
@@ -224,7 +224,7 @@ out_free:
 	return -ENODEV;
 }
 
-static int __devexit mdio_ofgpio_remove(struct of_device *ofdev)
+static int __devexit mdio_ofgpio_remove(struct platform_device *ofdev)
 {
 	mdio_gpio_bus_destroy(&ofdev->dev);
 	kfree(ofdev->dev.platform_data);
diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c
index 09c071bd6ad4..618643e3ca3e 100644
--- a/drivers/net/sunbmac.c
+++ b/drivers/net/sunbmac.c
@@ -97,7 +97,7 @@ static int qec_global_reset(void __iomem *gregs)
 
 static void qec_init(struct bigmac *bp)
 {
-	struct of_device *qec_op = bp->qec_op;
+	struct platform_device *qec_op = bp->qec_op;
 	void __iomem *gregs = bp->gregs;
 	u8 bsizes = bp->bigmac_bursts;
 	u32 regval;
@@ -1083,8 +1083,8 @@ static const struct net_device_ops bigmac_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
-static int __devinit bigmac_ether_init(struct of_device *op,
-				       struct of_device *qec_op)
+static int __devinit bigmac_ether_init(struct platform_device *op,
+				       struct platform_device *qec_op)
 {
 	static int version_printed;
 	struct net_device *dev;
@@ -1242,25 +1242,25 @@ fail_and_cleanup:
 /* QEC can be the parent of either QuadEthernet or a BigMAC.  We want
  * the latter.
  */
-static int __devinit bigmac_sbus_probe(struct of_device *op,
+static int __devinit bigmac_sbus_probe(struct platform_device *op,
 				       const struct of_device_id *match)
 {
 	struct device *parent = op->dev.parent;
-	struct of_device *qec_op;
+	struct platform_device *qec_op;
 
-	qec_op = to_of_device(parent);
+	qec_op = to_platform_device(parent);
 
 	return bigmac_ether_init(op, qec_op);
 }
 
-static int __devexit bigmac_sbus_remove(struct of_device *op)
+static int __devexit bigmac_sbus_remove(struct platform_device *op)
 {
 	struct bigmac *bp = dev_get_drvdata(&op->dev);
 	struct device *parent = op->dev.parent;
 	struct net_device *net_dev = bp->dev;
-	struct of_device *qec_op;
+	struct platform_device *qec_op;
 
-	qec_op = to_of_device(parent);
+	qec_op = to_platform_device(parent);
 
 	unregister_netdev(net_dev);
 
diff --git a/drivers/net/sunbmac.h b/drivers/net/sunbmac.h
index 8840bc0b840b..8db88945b889 100644
--- a/drivers/net/sunbmac.h
+++ b/drivers/net/sunbmac.h
@@ -329,8 +329,8 @@ struct bigmac {
 	unsigned int		timer_ticks;
 
 	struct net_device_stats	enet_stats;
-	struct of_device	*qec_op;
-	struct of_device	*bigmac_op;
+	struct platform_device	*qec_op;
+	struct platform_device	*bigmac_op;
 	struct net_device	*dev;
 };
 
diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c
index eec443f64079..bd0df1c14955 100644
--- a/drivers/net/sunhme.c
+++ b/drivers/net/sunhme.c
@@ -1591,7 +1591,7 @@ static int happy_meal_init(struct happy_meal *hp)
 		 */
 #ifdef CONFIG_SBUS
 		if ((hp->happy_flags & HFLAG_PCI) == 0) {
-			struct of_device *op = hp->happy_dev;
+			struct platform_device *op = hp->happy_dev;
 			if (sbus_can_dma_64bit()) {
 				sbus_set_sbus64(&op->dev,
 						hp->happy_bursts);
@@ -2480,7 +2480,7 @@ static void hme_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info
 #ifdef CONFIG_SBUS
 	else {
 		const struct linux_prom_registers *regs;
-		struct of_device *op = hp->happy_dev;
+		struct platform_device *op = hp->happy_dev;
 		regs = of_get_property(op->dev.of_node, "regs", NULL);
 		if (regs)
 			sprintf(info->bus_info, "SBUS:%d",
@@ -2515,13 +2515,13 @@ static int hme_version_printed;
  *
  * Return NULL on failure.
  */
-static struct quattro * __devinit quattro_sbus_find(struct of_device *child)
+static struct quattro * __devinit quattro_sbus_find(struct platform_device *child)
 {
 	struct device *parent = child->dev.parent;
-	struct of_device *op;
+	struct platform_device *op;
 	struct quattro *qp;
 
-	op = to_of_device(parent);
+	op = to_platform_device(parent);
 	qp = dev_get_drvdata(&op->dev);
 	if (qp)
 		return qp;
@@ -2551,7 +2551,7 @@ static int __init quattro_sbus_register_irqs(void)
 	struct quattro *qp;
 
 	for (qp = qfe_sbus_list; qp != NULL; qp = qp->next) {
-		struct of_device *op = qp->quattro_dev;
+		struct platform_device *op = qp->quattro_dev;
 		int err, qfe_slot, skip = 0;
 
 		for (qfe_slot = 0; qfe_slot < 4; qfe_slot++) {
@@ -2580,7 +2580,7 @@ static void quattro_sbus_free_irqs(void)
 	struct quattro *qp;
 
 	for (qp = qfe_sbus_list; qp != NULL; qp = qp->next) {
-		struct of_device *op = qp->quattro_dev;
+		struct platform_device *op = qp->quattro_dev;
 		int qfe_slot, skip = 0;
 
 		for (qfe_slot = 0; qfe_slot < 4; qfe_slot++) {
@@ -2639,7 +2639,7 @@ static const struct net_device_ops hme_netdev_ops = {
 };
 
 #ifdef CONFIG_SBUS
-static int __devinit happy_meal_sbus_probe_one(struct of_device *op, int is_qfe)
+static int __devinit happy_meal_sbus_probe_one(struct platform_device *op, int is_qfe)
 {
 	struct device_node *dp = op->dev.of_node, *sbus_dp;
 	struct quattro *qp = NULL;
@@ -2648,7 +2648,7 @@ static int __devinit happy_meal_sbus_probe_one(struct of_device *op, int is_qfe)
 	int i, qfe_slot = -1;
 	int err = -ENODEV;
 
-	sbus_dp = to_of_device(op->dev.parent)->dev.of_node;
+	sbus_dp = op->dev.parent->of_node;
 
 	/* We can match PCI devices too, do not accept those here. */
 	if (strcmp(sbus_dp->name, "sbus"))
@@ -3235,7 +3235,7 @@ static void happy_meal_pci_exit(void)
 #endif
 
 #ifdef CONFIG_SBUS
-static int __devinit hme_sbus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit hme_sbus_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
 	const char *model = of_get_property(dp, "model", NULL);
@@ -3247,7 +3247,7 @@ static int __devinit hme_sbus_probe(struct of_device *op, const struct of_device
 	return happy_meal_sbus_probe_one(op, is_qfe);
 }
 
-static int __devexit hme_sbus_remove(struct of_device *op)
+static int __devexit hme_sbus_remove(struct platform_device *op)
 {
 	struct happy_meal *hp = dev_get_drvdata(&op->dev);
 	struct net_device *net_dev = hp->dev;
diff --git a/drivers/net/sunhme.h b/drivers/net/sunhme.h
index efd2ca0fcad3..756b5bf3aa89 100644
--- a/drivers/net/sunhme.h
+++ b/drivers/net/sunhme.h
@@ -407,7 +407,7 @@ struct happy_meal {
 	void (*write_rxd)(struct happy_meal_rxd *, u32, u32);
 #endif
 
-	/* This is either an of_device or a pci_dev. */
+	/* This is either an platform_device or a pci_dev. */
 	void			  *happy_dev;
 	struct device		  *dma_dev;
 
diff --git a/drivers/net/sunlance.c b/drivers/net/sunlance.c
index ee364fa75634..8dcb858f2168 100644
--- a/drivers/net/sunlance.c
+++ b/drivers/net/sunlance.c
@@ -250,7 +250,7 @@ struct lance_private {
 	int		rx_new, tx_new;
 	int		rx_old, tx_old;
 
-	struct of_device *ledma;	/* If set this points to ledma	*/
+	struct platform_device *ledma;	/* If set this points to ledma	*/
 	char		tpe;		/* cable-selection is TPE	*/
 	char		auto_select;	/* cable-selection by carrier	*/
 	char		burst_sizes;	/* ledma SBus burst sizes	*/
@@ -265,8 +265,8 @@ struct lance_private {
 	char	       	       *name;
 	dma_addr_t		init_block_dvma;
 	struct net_device      *dev;		  /* Backpointer	*/
-	struct of_device       *op;
-	struct of_device       *lebuffer;
+	struct platform_device       *op;
+	struct platform_device       *lebuffer;
 	struct timer_list       multicast_timer;
 };
 
@@ -1272,7 +1272,7 @@ static void lance_free_hwresources(struct lance_private *lp)
 	if (lp->lregs)
 		of_iounmap(&lp->op->resource[0], lp->lregs, LANCE_REG_SIZE);
 	if (lp->dregs) {
-		struct of_device *ledma = lp->ledma;
+		struct platform_device *ledma = lp->ledma;
 
 		of_iounmap(&ledma->resource[0], lp->dregs,
 			   resource_size(&ledma->resource[0]));
@@ -1319,9 +1319,9 @@ static const struct net_device_ops sparc_lance_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
-static int __devinit sparc_lance_probe_one(struct of_device *op,
-					   struct of_device *ledma,
-					   struct of_device *lebuffer)
+static int __devinit sparc_lance_probe_one(struct platform_device *op,
+					   struct platform_device *ledma,
+					   struct platform_device *lebuffer)
 {
 	struct device_node *dp = op->dev.of_node;
 	static unsigned version_printed;
@@ -1503,9 +1503,9 @@ fail:
 	return -ENODEV;
 }
 
-static int __devinit sunlance_sbus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit sunlance_sbus_probe(struct platform_device *op, const struct of_device_id *match)
 {
-	struct of_device *parent = to_of_device(op->dev.parent);
+	struct platform_device *parent = to_platform_device(op->dev.parent);
 	struct device_node *parent_dp = parent->dev.of_node;
 	int err;
 
@@ -1519,7 +1519,7 @@ static int __devinit sunlance_sbus_probe(struct of_device *op, const struct of_d
 	return err;
 }
 
-static int __devexit sunlance_sbus_remove(struct of_device *op)
+static int __devexit sunlance_sbus_remove(struct platform_device *op)
 {
 	struct lance_private *lp = dev_get_drvdata(&op->dev);
 	struct net_device *net_dev = lp->dev;
diff --git a/drivers/net/sunqe.c b/drivers/net/sunqe.c
index 5f84a5dadedd..72e65d4666ef 100644
--- a/drivers/net/sunqe.c
+++ b/drivers/net/sunqe.c
@@ -689,7 +689,7 @@ static void qe_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
 {
 	const struct linux_prom_registers *regs;
 	struct sunqe *qep = netdev_priv(dev);
-	struct of_device *op;
+	struct platform_device *op;
 
 	strcpy(info->driver, "sunqe");
 	strcpy(info->version, "3.0");
@@ -720,7 +720,7 @@ static const struct ethtool_ops qe_ethtool_ops = {
 };
 
 /* This is only called once at boot time for each card probed. */
-static void qec_init_once(struct sunqec *qecp, struct of_device *op)
+static void qec_init_once(struct sunqec *qecp, struct platform_device *op)
 {
 	u8 bsizes = qecp->qec_bursts;
 
@@ -770,9 +770,9 @@ static u8 __devinit qec_get_burst(struct device_node *dp)
 	return bsizes;
 }
 
-static struct sunqec * __devinit get_qec(struct of_device *child)
+static struct sunqec * __devinit get_qec(struct platform_device *child)
 {
-	struct of_device *op = to_of_device(child->dev.parent);
+	struct platform_device *op = to_platform_device(child->dev.parent);
 	struct sunqec *qecp;
 
 	qecp = dev_get_drvdata(&op->dev);
@@ -836,7 +836,7 @@ static const struct net_device_ops qec_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
-static int __devinit qec_ether_init(struct of_device *op)
+static int __devinit qec_ether_init(struct platform_device *op)
 {
 	static unsigned version_printed;
 	struct net_device *dev;
@@ -941,12 +941,12 @@ fail:
 	return res;
 }
 
-static int __devinit qec_sbus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit qec_sbus_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	return qec_ether_init(op);
 }
 
-static int __devexit qec_sbus_remove(struct of_device *op)
+static int __devexit qec_sbus_remove(struct platform_device *op)
 {
 	struct sunqe *qp = dev_get_drvdata(&op->dev);
 	struct net_device *net_dev = qp->dev;
@@ -997,7 +997,7 @@ static void __exit qec_exit(void)
 
 	while (root_qec_dev) {
 		struct sunqec *next = root_qec_dev->next_module;
-		struct of_device *op = root_qec_dev->op;
+		struct platform_device *op = root_qec_dev->op;
 
 		free_irq(op->archdata.irqs[0], (void *) root_qec_dev);
 		of_iounmap(&op->resource[0], root_qec_dev->gregs,
diff --git a/drivers/net/sunqe.h b/drivers/net/sunqe.h
index 5813a7b2faa5..581781b6b2fa 100644
--- a/drivers/net/sunqe.h
+++ b/drivers/net/sunqe.h
@@ -314,7 +314,7 @@ struct sunqec {
 	void __iomem		*gregs;		/* QEC Global Registers         */
 	struct sunqe		*qes[4];	/* Each child MACE              */
 	unsigned int            qec_bursts;	/* Support burst sizes          */
-	struct of_device	*op;		/* QEC's OF device              */
+	struct platform_device	*op;		/* QEC's OF device              */
 	struct sunqec		*next_module;	/* List of all QECs in system   */
 };
 
@@ -342,7 +342,7 @@ struct sunqe {
 	__u32				buffers_dvma;	/* DVMA visible address.       */
 	struct sunqec			*parent;
 	u8				mconfig;	/* Base MACE mconfig value     */
-	struct of_device		*op;		/* QE's OF device struct       */
+	struct platform_device		*op;		/* QE's OF device struct       */
 	struct net_device		*dev;		/* QE's netdevice struct       */
 	int				channel;	/* Who am I?                   */
 };
diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
index 8d532f9b50d0..a4c3f5708246 100644
--- a/drivers/net/ucc_geth.c
+++ b/drivers/net/ucc_geth.c
@@ -3601,7 +3601,7 @@ static void ucc_geth_timeout(struct net_device *dev)
 
 #ifdef CONFIG_PM
 
-static int ucc_geth_suspend(struct of_device *ofdev, pm_message_t state)
+static int ucc_geth_suspend(struct platform_device *ofdev, pm_message_t state)
 {
 	struct net_device *ndev = dev_get_drvdata(&ofdev->dev);
 	struct ucc_geth_private *ugeth = netdev_priv(ndev);
@@ -3629,7 +3629,7 @@ static int ucc_geth_suspend(struct of_device *ofdev, pm_message_t state)
 	return 0;
 }
 
-static int ucc_geth_resume(struct of_device *ofdev)
+static int ucc_geth_resume(struct platform_device *ofdev)
 {
 	struct net_device *ndev = dev_get_drvdata(&ofdev->dev);
 	struct ucc_geth_private *ugeth = netdev_priv(ndev);
@@ -3732,7 +3732,7 @@ static const struct net_device_ops ucc_geth_netdev_ops = {
 #endif
 };
 
-static int ucc_geth_probe(struct of_device* ofdev, const struct of_device_id *match)
+static int ucc_geth_probe(struct platform_device* ofdev, const struct of_device_id *match)
 {
 	struct device *device = &ofdev->dev;
 	struct device_node *np = ofdev->dev.of_node;
@@ -3954,7 +3954,7 @@ static int ucc_geth_probe(struct of_device* ofdev, const struct of_device_id *ma
 	return 0;
 }
 
-static int ucc_geth_remove(struct of_device* ofdev)
+static int ucc_geth_remove(struct platform_device* ofdev)
 {
 	struct device *device = &ofdev->dev;
 	struct net_device *dev = dev_get_drvdata(device);
diff --git a/drivers/net/xilinx_emaclite.c b/drivers/net/xilinx_emaclite.c
index b2c2f391b29d..ecbbb688eba0 100644
--- a/drivers/net/xilinx_emaclite.c
+++ b/drivers/net/xilinx_emaclite.c
@@ -1086,7 +1086,7 @@ static void xemaclite_remove_ndev(struct net_device *ndev)
  *
  * Return:	Value of the parameter if the parameter is found, or 0 otherwise
  */
-static bool get_bool(struct of_device *ofdev, const char *s)
+static bool get_bool(struct platform_device *ofdev, const char *s)
 {
 	u32 *p = (u32 *)of_get_property(ofdev->dev.of_node, s, NULL);
 
@@ -1115,7 +1115,7 @@ static struct net_device_ops xemaclite_netdev_ops;
  * Return:	0, if the driver is bound to the Emaclite device, or
  *		a negative error if there is failure.
  */
-static int __devinit xemaclite_of_probe(struct of_device *ofdev,
+static int __devinit xemaclite_of_probe(struct platform_device *ofdev,
 					const struct of_device_id *match)
 {
 	struct resource r_irq; /* Interrupt resources */
@@ -1240,7 +1240,7 @@ error2:
  *
  * Return:	0, always.
  */
-static int __devexit xemaclite_of_remove(struct of_device *of_dev)
+static int __devexit xemaclite_of_remove(struct platform_device *of_dev)
 {
 	struct device *dev = &of_dev->dev;
 	struct net_device *ndev = dev_get_drvdata(dev);
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 0d8a0644f540..92de0eb74aea 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -14,7 +14,7 @@
  * @ids: array of of device match structures to search in
  * @dev: the of device structure to match against
  *
- * Used by a driver to check whether an of_device present in the
+ * Used by a driver to check whether an platform_device present in the
  * system is in its list of supported devices.
  */
 const struct of_device_id *of_match_device(const struct of_device_id *matches,
diff --git a/drivers/parport/parport_sunbpp.c b/drivers/parport/parport_sunbpp.c
index 210a6441a066..55ba118f1cf1 100644
--- a/drivers/parport/parport_sunbpp.c
+++ b/drivers/parport/parport_sunbpp.c
@@ -286,7 +286,7 @@ static struct parport_operations parport_sunbpp_ops =
 	.owner		= THIS_MODULE,
 };
 
-static int __devinit bpp_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit bpp_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct parport_operations *ops;
 	struct bpp_regs __iomem *regs;
@@ -351,7 +351,7 @@ out_unmap:
 	return err;
 }
 
-static int __devexit bpp_remove(struct of_device *op)
+static int __devexit bpp_remove(struct platform_device *op)
 {
 	struct parport *p = dev_get_drvdata(&op->dev);
 	struct parport_operations *ops = p->ops;
diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c
index f94d8281cfb0..546d3024b6f0 100644
--- a/drivers/pcmcia/electra_cf.c
+++ b/drivers/pcmcia/electra_cf.c
@@ -44,7 +44,7 @@ struct electra_cf_socket {
 	unsigned		present:1;
 	unsigned		active:1;
 
-	struct of_device	*ofdev;
+	struct platform_device	*ofdev;
 	unsigned long		mem_phys;
 	void __iomem *		mem_base;
 	unsigned long		mem_size;
@@ -181,7 +181,7 @@ static struct pccard_operations electra_cf_ops = {
 	.set_mem_map		= electra_cf_set_mem_map,
 };
 
-static int __devinit electra_cf_probe(struct of_device *ofdev,
+static int __devinit electra_cf_probe(struct platform_device *ofdev,
 				      const struct of_device_id *match)
 {
 	struct device *device = &ofdev->dev;
@@ -325,7 +325,7 @@ fail1:
 
 }
 
-static int __devexit electra_cf_remove(struct of_device *ofdev)
+static int __devexit electra_cf_remove(struct platform_device *ofdev)
 {
 	struct device *device = &ofdev->dev;
 	struct electra_cf_socket *cf;
diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c
index 25e5e30a18af..84a70a59ef95 100644
--- a/drivers/pcmcia/m8xx_pcmcia.c
+++ b/drivers/pcmcia/m8xx_pcmcia.c
@@ -1150,7 +1150,7 @@ static struct pccard_operations m8xx_services = {
 	.set_mem_map = m8xx_set_mem_map,
 };
 
-static int __init m8xx_probe(struct of_device *ofdev,
+static int __init m8xx_probe(struct platform_device *ofdev,
 			     const struct of_device_id *match)
 {
 	struct pcmcia_win *w;
@@ -1250,7 +1250,7 @@ static int __init m8xx_probe(struct of_device *ofdev,
 	return 0;
 }
 
-static int m8xx_remove(struct of_device *ofdev)
+static int m8xx_remove(struct platform_device *ofdev)
 {
 	u32 m, i;
 	struct pcmcia_win *w;
diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c
index db5d8c416d26..dfcdf0901d21 100644
--- a/drivers/rtc/rtc-mpc5121.c
+++ b/drivers/rtc/rtc-mpc5121.c
@@ -268,7 +268,7 @@ static const struct rtc_class_ops mpc5121_rtc_ops = {
 	.update_irq_enable = mpc5121_rtc_update_irq_enable,
 };
 
-static int __devinit mpc5121_rtc_probe(struct of_device *op,
+static int __devinit mpc5121_rtc_probe(struct platform_device *op,
 					const struct of_device_id *match)
 {
 	struct mpc5121_rtc_data *rtc;
@@ -338,7 +338,7 @@ out_free:
 	return err;
 }
 
-static int __devexit mpc5121_rtc_remove(struct of_device *op)
+static int __devexit mpc5121_rtc_remove(struct platform_device *op)
 {
 	struct mpc5121_rtc_data *rtc = dev_get_drvdata(&op->dev);
 	struct mpc5121_rtc_regs __iomem *regs = rtc->regs;
diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c
index 103fdf6b0b89..160e7510aca6 100644
--- a/drivers/sbus/char/bbc_envctrl.c
+++ b/drivers/sbus/char/bbc_envctrl.c
@@ -443,7 +443,7 @@ static int kenvctrld(void *__unused)
 	return 0;
 }
 
-static void attach_one_temp(struct bbc_i2c_bus *bp, struct of_device *op,
+static void attach_one_temp(struct bbc_i2c_bus *bp, struct platform_device *op,
 			    int temp_idx)
 {
 	struct bbc_cpu_temperature *tp;
@@ -488,7 +488,7 @@ static void attach_one_temp(struct bbc_i2c_bus *bp, struct of_device *op,
 	tp->fan_todo[FAN_CPU] = FAN_SAME;
 }
 
-static void attach_one_fan(struct bbc_i2c_bus *bp, struct of_device *op,
+static void attach_one_fan(struct bbc_i2c_bus *bp, struct platform_device *op,
 			   int fan_idx)
 {
 	struct bbc_fan_control *fp;
@@ -559,7 +559,7 @@ static void destroy_all_fans(struct bbc_i2c_bus *bp)
 
 int bbc_envctrl_init(struct bbc_i2c_bus *bp)
 {
-	struct of_device *op;
+	struct platform_device *op;
 	int temp_index = 0;
 	int fan_index = 0;
 	int devidx = 0;
diff --git a/drivers/sbus/char/bbc_i2c.c b/drivers/sbus/char/bbc_i2c.c
index 3e89c313e98d..614a5e114a19 100644
--- a/drivers/sbus/char/bbc_i2c.c
+++ b/drivers/sbus/char/bbc_i2c.c
@@ -51,7 +51,7 @@
  * The second controller also connects to the smartcard reader, if present.
  */
 
-static void set_device_claimage(struct bbc_i2c_bus *bp, struct of_device *op, int val)
+static void set_device_claimage(struct bbc_i2c_bus *bp, struct platform_device *op, int val)
 {
 	int i;
 
@@ -66,9 +66,9 @@ static void set_device_claimage(struct bbc_i2c_bus *bp, struct of_device *op, in
 #define claim_device(BP,ECHILD)		set_device_claimage(BP,ECHILD,1)
 #define release_device(BP,ECHILD)	set_device_claimage(BP,ECHILD,0)
 
-struct of_device *bbc_i2c_getdev(struct bbc_i2c_bus *bp, int index)
+struct platform_device *bbc_i2c_getdev(struct bbc_i2c_bus *bp, int index)
 {
-	struct of_device *op = NULL;
+	struct platform_device *op = NULL;
 	int curidx = 0, i;
 
 	for (i = 0; i < NUM_CHILDREN; i++) {
@@ -86,7 +86,7 @@ out:
 	return NULL;
 }
 
-struct bbc_i2c_client *bbc_i2c_attach(struct bbc_i2c_bus *bp, struct of_device *op)
+struct bbc_i2c_client *bbc_i2c_attach(struct bbc_i2c_bus *bp, struct platform_device *op)
 {
 	struct bbc_i2c_client *client;
 	const u32 *reg;
@@ -114,7 +114,7 @@ struct bbc_i2c_client *bbc_i2c_attach(struct bbc_i2c_bus *bp, struct of_device *
 void bbc_i2c_detach(struct bbc_i2c_client *client)
 {
 	struct bbc_i2c_bus *bp = client->bp;
-	struct of_device *op = client->op;
+	struct platform_device *op = client->op;
 
 	release_device(bp, op);
 	kfree(client);
@@ -297,7 +297,7 @@ static void __init reset_one_i2c(struct bbc_i2c_bus *bp)
 	writeb(I2C_PCF_IDLE, bp->i2c_control_regs + 0x0);
 }
 
-static struct bbc_i2c_bus * __init attach_one_i2c(struct of_device *op, int index)
+static struct bbc_i2c_bus * __init attach_one_i2c(struct platform_device *op, int index)
 {
 	struct bbc_i2c_bus *bp;
 	struct device_node *dp;
@@ -330,7 +330,7 @@ static struct bbc_i2c_bus * __init attach_one_i2c(struct of_device *op, int inde
 	for (dp = op->dev.of_node->child;
 	     dp && entry < 8;
 	     dp = dp->sibling, entry++) {
-		struct of_device *child_op;
+		struct platform_device *child_op;
 
 		child_op = of_find_device_by_node(dp);
 		bp->devs[entry].device = child_op;
@@ -361,7 +361,7 @@ fail:
 extern int bbc_envctrl_init(struct bbc_i2c_bus *bp);
 extern void bbc_envctrl_cleanup(struct bbc_i2c_bus *bp);
 
-static int __devinit bbc_i2c_probe(struct of_device *op,
+static int __devinit bbc_i2c_probe(struct platform_device *op,
 				   const struct of_device_id *match)
 {
 	struct bbc_i2c_bus *bp;
@@ -386,7 +386,7 @@ static int __devinit bbc_i2c_probe(struct of_device *op,
 	return err;
 }
 
-static int __devexit bbc_i2c_remove(struct of_device *op)
+static int __devexit bbc_i2c_remove(struct platform_device *op)
 {
 	struct bbc_i2c_bus *bp = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/sbus/char/bbc_i2c.h b/drivers/sbus/char/bbc_i2c.h
index 83c4811b7b5e..4b4531066e75 100644
--- a/drivers/sbus/char/bbc_i2c.h
+++ b/drivers/sbus/char/bbc_i2c.h
@@ -7,7 +7,7 @@
 
 struct bbc_i2c_client {
 	struct bbc_i2c_bus		*bp;
-	struct of_device		*op;
+	struct platform_device		*op;
 	int				bus;
 	int				address;
 };
@@ -64,16 +64,16 @@ struct bbc_i2c_bus {
 	struct list_head		temps;
 	struct list_head		fans;
 
-	struct of_device		*op;
+	struct platform_device		*op;
 	struct {
-		struct of_device	*device;
+		struct platform_device	*device;
 		int			client_claimed;
 	} devs[NUM_CHILDREN];
 };
 
 /* Probing and attachment. */
-extern struct of_device *bbc_i2c_getdev(struct bbc_i2c_bus *, int);
-extern struct bbc_i2c_client *bbc_i2c_attach(struct bbc_i2c_bus *bp, struct of_device *);
+extern struct platform_device *bbc_i2c_getdev(struct bbc_i2c_bus *, int);
+extern struct bbc_i2c_client *bbc_i2c_attach(struct bbc_i2c_bus *bp, struct platform_device *);
 extern void bbc_i2c_detach(struct bbc_i2c_client *);
 
 /* Register read/write.  NOTE: Blocking! */
diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c
index 47db97583ea7..1690e53fb84a 100644
--- a/drivers/sbus/char/display7seg.c
+++ b/drivers/sbus/char/display7seg.c
@@ -170,7 +170,7 @@ static struct miscdevice d7s_miscdev = {
 	.fops		= &d7s_fops
 };
 
-static int __devinit d7s_probe(struct of_device *op,
+static int __devinit d7s_probe(struct platform_device *op,
 			       const struct of_device_id *match)
 {
 	struct device_node *opts;
@@ -236,7 +236,7 @@ out_free:
 	goto out;
 }
 
-static int __devexit d7s_remove(struct of_device *op)
+static int __devexit d7s_remove(struct platform_device *op)
 {
 	struct d7s *p = dev_get_drvdata(&op->dev);
 	u8 regs = readb(p->regs);
diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c
index 3c27f45e2b6d..078e5f4520ef 100644
--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -1027,7 +1027,7 @@ static int kenvctrld(void *__unused)
 	return 0;
 }
 
-static int __devinit envctrl_probe(struct of_device *op,
+static int __devinit envctrl_probe(struct platform_device *op,
 				   const struct of_device_id *match)
 {
 	struct device_node *dp;
@@ -1104,7 +1104,7 @@ out_iounmap:
 	return err;
 }
 
-static int __devexit envctrl_remove(struct of_device *op)
+static int __devexit envctrl_remove(struct platform_device *op)
 {
 	int index;
 
diff --git a/drivers/sbus/char/flash.c b/drivers/sbus/char/flash.c
index 8bb31c584b64..2b4b4b613c48 100644
--- a/drivers/sbus/char/flash.c
+++ b/drivers/sbus/char/flash.c
@@ -160,7 +160,7 @@ static const struct file_operations flash_fops = {
 
 static struct miscdevice flash_dev = { FLASH_MINOR, "flash", &flash_fops };
 
-static int __devinit flash_probe(struct of_device *op,
+static int __devinit flash_probe(struct platform_device *op,
 				 const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
@@ -192,7 +192,7 @@ static int __devinit flash_probe(struct of_device *op,
 	return misc_register(&flash_dev);
 }
 
-static int __devexit flash_remove(struct of_device *op)
+static int __devexit flash_remove(struct platform_device *op)
 {
 	misc_deregister(&flash_dev);
 
diff --git a/drivers/sbus/char/uctrl.c b/drivers/sbus/char/uctrl.c
index 41eb6725ff5f..1b345be5cc02 100644
--- a/drivers/sbus/char/uctrl.c
+++ b/drivers/sbus/char/uctrl.c
@@ -348,7 +348,7 @@ static void uctrl_get_external_status(struct uctrl_driver *driver)
 	
 }
 
-static int __devinit uctrl_probe(struct of_device *op,
+static int __devinit uctrl_probe(struct platform_device *op,
 				 const struct of_device_id *match)
 {
 	struct uctrl_driver *p;
@@ -404,7 +404,7 @@ out_free:
 	goto out;
 }
 
-static int __devexit uctrl_remove(struct of_device *op)
+static int __devexit uctrl_remove(struct platform_device *op)
 {
 	struct uctrl_driver *p = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/scsi/qlogicpti.c b/drivers/scsi/qlogicpti.c
index 53d7ed0dc169..f8c561cf751e 100644
--- a/drivers/scsi/qlogicpti.c
+++ b/drivers/scsi/qlogicpti.c
@@ -704,7 +704,7 @@ static void __devexit qpti_chain_del(struct qlogicpti *qpti)
 
 static int __devinit qpti_map_regs(struct qlogicpti *qpti)
 {
-	struct of_device *op = qpti->op;
+	struct platform_device *op = qpti->op;
 
 	qpti->qregs = of_ioremap(&op->resource[0], 0,
 				 resource_size(&op->resource[0]),
@@ -727,7 +727,7 @@ static int __devinit qpti_map_regs(struct qlogicpti *qpti)
 
 static int __devinit qpti_register_irq(struct qlogicpti *qpti)
 {
-	struct of_device *op = qpti->op;
+	struct platform_device *op = qpti->op;
 
 	qpti->qhost->irq = qpti->irq = op->archdata.irqs[0];
 
@@ -752,7 +752,7 @@ fail:
 
 static void __devinit qpti_get_scsi_id(struct qlogicpti *qpti)
 {
-	struct of_device *op = qpti->op;
+	struct platform_device *op = qpti->op;
 	struct device_node *dp;
 
 	dp = op->dev.of_node;
@@ -773,7 +773,7 @@ static void __devinit qpti_get_scsi_id(struct qlogicpti *qpti)
 
 static void qpti_get_bursts(struct qlogicpti *qpti)
 {
-	struct of_device *op = qpti->op;
+	struct platform_device *op = qpti->op;
 	u8 bursts, bmask;
 
 	bursts = of_getintprop_default(op->dev.of_node, "burst-sizes", 0xff);
@@ -806,7 +806,7 @@ static void qpti_get_clock(struct qlogicpti *qpti)
  */
 static int __devinit qpti_map_queues(struct qlogicpti *qpti)
 {
-	struct of_device *op = qpti->op;
+	struct platform_device *op = qpti->op;
 
 #define QSIZE(entries)	(((entries) + 1) * QUEUE_ENTRY_LEN)
 	qpti->res_cpu = dma_alloc_coherent(&op->dev,
@@ -1290,7 +1290,7 @@ static struct scsi_host_template qpti_template = {
 	.use_clustering		= ENABLE_CLUSTERING,
 };
 
-static int __devinit qpti_sbus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit qpti_sbus_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct scsi_host_template *tpnt = match->data;
 	struct device_node *dp = op->dev.of_node;
@@ -1401,7 +1401,7 @@ fail_unlink:
 	return -ENODEV;
 }
 
-static int __devexit qpti_sbus_remove(struct of_device *op)
+static int __devexit qpti_sbus_remove(struct platform_device *op)
 {
 	struct qlogicpti *qpti = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/scsi/qlogicpti.h b/drivers/scsi/qlogicpti.h
index e3c74d1ee2db..4377e87ee79c 100644
--- a/drivers/scsi/qlogicpti.h
+++ b/drivers/scsi/qlogicpti.h
@@ -342,7 +342,7 @@ struct qlogicpti {
 	u_int	                  req_in_ptr;		/* index of next request slot */
 	u_int	                  res_out_ptr;		/* index of next result slot  */
 	long	                  send_marker;		/* must we send a marker?     */
-	struct of_device	 *op;
+	struct platform_device	 *op;
 	unsigned long		  __pad;
 
 	int                       cmd_count[MAX_TARGETS];
diff --git a/drivers/scsi/sun_esp.c b/drivers/scsi/sun_esp.c
index 89ba6fe02f80..193b37ba1834 100644
--- a/drivers/scsi/sun_esp.c
+++ b/drivers/scsi/sun_esp.c
@@ -44,7 +44,7 @@ enum dvma_rev {
 };
 
 static int __devinit esp_sbus_setup_dma(struct esp *esp,
-					struct of_device *dma_of)
+					struct platform_device *dma_of)
 {
 	esp->dma = dma_of;
 
@@ -81,7 +81,7 @@ static int __devinit esp_sbus_setup_dma(struct esp *esp,
 
 static int __devinit esp_sbus_map_regs(struct esp *esp, int hme)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 	struct resource *res;
 
 	/* On HME, two reg sets exist, first is DVMA,
@@ -101,7 +101,7 @@ static int __devinit esp_sbus_map_regs(struct esp *esp, int hme)
 
 static int __devinit esp_sbus_map_command_block(struct esp *esp)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 
 	esp->command_block = dma_alloc_coherent(&op->dev, 16,
 						&esp->command_block_dma,
@@ -114,15 +114,15 @@ static int __devinit esp_sbus_map_command_block(struct esp *esp)
 static int __devinit esp_sbus_register_irq(struct esp *esp)
 {
 	struct Scsi_Host *host = esp->host;
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 
 	host->irq = op->archdata.irqs[0];
 	return request_irq(host->irq, scsi_esp_intr, IRQF_SHARED, "ESP", esp);
 }
 
-static void __devinit esp_get_scsi_id(struct esp *esp, struct of_device *espdma)
+static void __devinit esp_get_scsi_id(struct esp *esp, struct platform_device *espdma)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 	struct device_node *dp;
 
 	dp = op->dev.of_node;
@@ -144,7 +144,7 @@ done:
 
 static void __devinit esp_get_differential(struct esp *esp)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 	struct device_node *dp;
 
 	dp = op->dev.of_node;
@@ -156,7 +156,7 @@ static void __devinit esp_get_differential(struct esp *esp)
 
 static void __devinit esp_get_clock_params(struct esp *esp)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 	struct device_node *bus_dp, *dp;
 	int fmhz;
 
@@ -170,10 +170,10 @@ static void __devinit esp_get_clock_params(struct esp *esp)
 	esp->cfreq = fmhz;
 }
 
-static void __devinit esp_get_bursts(struct esp *esp, struct of_device *dma_of)
+static void __devinit esp_get_bursts(struct esp *esp, struct platform_device *dma_of)
 {
 	struct device_node *dma_dp = dma_of->dev.of_node;
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 	struct device_node *dp;
 	u8 bursts, val;
 
@@ -195,7 +195,7 @@ static void __devinit esp_get_bursts(struct esp *esp, struct of_device *dma_of)
 	esp->bursts = bursts;
 }
 
-static void __devinit esp_sbus_get_props(struct esp *esp, struct of_device *espdma)
+static void __devinit esp_sbus_get_props(struct esp *esp, struct platform_device *espdma)
 {
 	esp_get_scsi_id(esp, espdma);
 	esp_get_differential(esp);
@@ -216,7 +216,7 @@ static u8 sbus_esp_read8(struct esp *esp, unsigned long reg)
 static dma_addr_t sbus_esp_map_single(struct esp *esp, void *buf,
 				      size_t sz, int dir)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 
 	return dma_map_single(&op->dev, buf, sz, dir);
 }
@@ -224,7 +224,7 @@ static dma_addr_t sbus_esp_map_single(struct esp *esp, void *buf,
 static int sbus_esp_map_sg(struct esp *esp, struct scatterlist *sg,
 				  int num_sg, int dir)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 
 	return dma_map_sg(&op->dev, sg, num_sg, dir);
 }
@@ -232,7 +232,7 @@ static int sbus_esp_map_sg(struct esp *esp, struct scatterlist *sg,
 static void sbus_esp_unmap_single(struct esp *esp, dma_addr_t addr,
 				  size_t sz, int dir)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 
 	dma_unmap_single(&op->dev, addr, sz, dir);
 }
@@ -240,7 +240,7 @@ static void sbus_esp_unmap_single(struct esp *esp, dma_addr_t addr,
 static void sbus_esp_unmap_sg(struct esp *esp, struct scatterlist *sg,
 			      int num_sg, int dir)
 {
-	struct of_device *op = esp->dev;
+	struct platform_device *op = esp->dev;
 
 	dma_unmap_sg(&op->dev, sg, num_sg, dir);
 }
@@ -256,7 +256,7 @@ static void sbus_esp_reset_dma(struct esp *esp)
 {
 	int can_do_burst16, can_do_burst32, can_do_burst64;
 	int can_do_sbus64, lim;
-	struct of_device *op;
+	struct platform_device *op;
 	u32 val;
 
 	can_do_burst16 = (esp->bursts & DMA_BURST16) != 0;
@@ -487,8 +487,8 @@ static const struct esp_driver_ops sbus_esp_ops = {
 	.dma_error	=	sbus_esp_dma_error,
 };
 
-static int __devinit esp_sbus_probe_one(struct of_device *op,
-					struct of_device *espdma,
+static int __devinit esp_sbus_probe_one(struct platform_device *op,
+					struct platform_device *espdma,
 					int hme)
 {
 	struct scsi_host_template *tpnt = &scsi_esp_template;
@@ -562,11 +562,11 @@ fail:
 	return err;
 }
 
-static int __devinit esp_sbus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit esp_sbus_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dma_node = NULL;
 	struct device_node *dp = op->dev.of_node;
-	struct of_device *dma_of = NULL;
+	struct platform_device *dma_of = NULL;
 	int hme = 0;
 
 	if (dp->parent &&
@@ -585,10 +585,10 @@ static int __devinit esp_sbus_probe(struct of_device *op, const struct of_device
 	return esp_sbus_probe_one(op, dma_of, hme);
 }
 
-static int __devexit esp_sbus_remove(struct of_device *op)
+static int __devexit esp_sbus_remove(struct platform_device *op)
 {
 	struct esp *esp = dev_get_drvdata(&op->dev);
-	struct of_device *dma_of = esp->dma;
+	struct platform_device *dma_of = esp->dma;
 	unsigned int irq = esp->host->irq;
 	bool is_hme;
 	u32 val;
diff --git a/drivers/serial/apbuart.c b/drivers/serial/apbuart.c
index 0099b8692b60..cc01c650a144 100644
--- a/drivers/serial/apbuart.c
+++ b/drivers/serial/apbuart.c
@@ -551,7 +551,7 @@ static struct uart_driver grlib_apbuart_driver = {
 /* OF Platform Driver                                                       */
 /* ======================================================================== */
 
-static int __devinit apbuart_probe(struct of_device *op,
+static int __devinit apbuart_probe(struct platform_device *op,
 				   const struct of_device_id *match)
 {
 	int i = -1;
diff --git a/drivers/serial/cpm_uart/cpm_uart_core.c b/drivers/serial/cpm_uart/cpm_uart_core.c
index 6016179db533..f2b8adcc6c92 100644
--- a/drivers/serial/cpm_uart/cpm_uart_core.c
+++ b/drivers/serial/cpm_uart/cpm_uart_core.c
@@ -1340,7 +1340,7 @@ static struct uart_driver cpm_reg = {
 
 static int probe_index;
 
-static int __devinit cpm_uart_probe(struct of_device *ofdev,
+static int __devinit cpm_uart_probe(struct platform_device *ofdev,
                                     const struct of_device_id *match)
 {
 	int index = probe_index++;
@@ -1364,7 +1364,7 @@ static int __devinit cpm_uart_probe(struct of_device *ofdev,
 	return uart_add_one_port(&cpm_reg, &pinfo->port);
 }
 
-static int __devexit cpm_uart_remove(struct of_device *ofdev)
+static int __devexit cpm_uart_remove(struct platform_device *ofdev)
 {
 	struct uart_cpm_port *pinfo = dev_get_drvdata(&ofdev->dev);
 	return uart_remove_one_port(&cpm_reg, &pinfo->port);
diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index 1a88b363005c..8dedb266f143 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -1298,7 +1298,7 @@ static struct of_device_id mpc52xx_uart_of_match[] = {
 };
 
 static int __devinit
-mpc52xx_uart_of_probe(struct of_device *op, const struct of_device_id *match)
+mpc52xx_uart_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	int idx = -1;
 	unsigned int uartclk;
@@ -1369,7 +1369,7 @@ mpc52xx_uart_of_probe(struct of_device *op, const struct of_device_id *match)
 }
 
 static int
-mpc52xx_uart_of_remove(struct of_device *op)
+mpc52xx_uart_of_remove(struct platform_device *op)
 {
 	struct uart_port *port = dev_get_drvdata(&op->dev);
 	dev_set_drvdata(&op->dev, NULL);
@@ -1382,7 +1382,7 @@ mpc52xx_uart_of_remove(struct of_device *op)
 
 #ifdef CONFIG_PM
 static int
-mpc52xx_uart_of_suspend(struct of_device *op, pm_message_t state)
+mpc52xx_uart_of_suspend(struct platform_device *op, pm_message_t state)
 {
 	struct uart_port *port = (struct uart_port *) dev_get_drvdata(&op->dev);
 
@@ -1393,7 +1393,7 @@ mpc52xx_uart_of_suspend(struct of_device *op, pm_message_t state)
 }
 
 static int
-mpc52xx_uart_of_resume(struct of_device *op)
+mpc52xx_uart_of_resume(struct platform_device *op)
 {
 	struct uart_port *port = (struct uart_port *) dev_get_drvdata(&op->dev);
 
diff --git a/drivers/serial/nwpserial.c b/drivers/serial/nwpserial.c
index e65b0d9202a5..de173671e3d0 100644
--- a/drivers/serial/nwpserial.c
+++ b/drivers/serial/nwpserial.c
@@ -344,7 +344,7 @@ int nwpserial_register_port(struct uart_port *port)
 
 	mutex_lock(&nwpserial_mutex);
 
-	dn = to_of_device(port->dev)->dev.of_node;
+	dn = port->dev->of_node;
 	if (dn == NULL)
 		goto out;
 
diff --git a/drivers/serial/of_serial.c b/drivers/serial/of_serial.c
index a48d9080f552..659a695bdad6 100644
--- a/drivers/serial/of_serial.c
+++ b/drivers/serial/of_serial.c
@@ -27,7 +27,7 @@ struct of_serial_info {
 /*
  * Fill a struct uart_port for a given device node
  */
-static int __devinit of_platform_serial_setup(struct of_device *ofdev,
+static int __devinit of_platform_serial_setup(struct platform_device *ofdev,
 					int type, struct uart_port *port)
 {
 	struct resource resource;
@@ -80,7 +80,7 @@ static int __devinit of_platform_serial_setup(struct of_device *ofdev,
 /*
  * Try to register a serial port
  */
-static int __devinit of_platform_serial_probe(struct of_device *ofdev,
+static int __devinit of_platform_serial_probe(struct platform_device *ofdev,
 						const struct of_device_id *id)
 {
 	struct of_serial_info *info;
@@ -134,7 +134,7 @@ out:
 /*
  * Release a line
  */
-static int of_platform_serial_remove(struct of_device *ofdev)
+static int of_platform_serial_remove(struct platform_device *ofdev)
 {
 	struct of_serial_info *info = dev_get_drvdata(&ofdev->dev);
 	switch (info->type) {
diff --git a/drivers/serial/sunhv.c b/drivers/serial/sunhv.c
index a779e22d213e..c9014868297d 100644
--- a/drivers/serial/sunhv.c
+++ b/drivers/serial/sunhv.c
@@ -519,7 +519,7 @@ static struct console sunhv_console = {
 	.data	=	&sunhv_reg,
 };
 
-static int __devinit hv_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit hv_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct uart_port *port;
 	unsigned long minor;
@@ -598,7 +598,7 @@ out_free_port:
 	return err;
 }
 
-static int __devexit hv_remove(struct of_device *dev)
+static int __devexit hv_remove(struct platform_device *dev)
 {
 	struct uart_port *port = dev_get_drvdata(&dev->dev);
 
diff --git a/drivers/serial/sunsab.c b/drivers/serial/sunsab.c
index 9845fb1cfb1f..5b246b18f42f 100644
--- a/drivers/serial/sunsab.c
+++ b/drivers/serial/sunsab.c
@@ -883,7 +883,7 @@ static int sunsab_console_setup(struct console *con, char *options)
 	printk("Console: ttyS%d (SAB82532)\n",
 	       (sunsab_reg.minor - 64) + con->index);
 
-	sunserial_console_termios(con, to_of_device(up->port.dev)->dev.of_node);
+	sunserial_console_termios(con, up->port.dev->of_node);
 
 	switch (con->cflag & CBAUD) {
 	case B150: baud = 150; break;
@@ -954,7 +954,7 @@ static inline struct console *SUNSAB_CONSOLE(void)
 #endif
 
 static int __devinit sunsab_init_one(struct uart_sunsab_port *up,
-				     struct of_device *op,
+				     struct platform_device *op,
 				     unsigned long offset,
 				     int line)
 {
@@ -1006,7 +1006,7 @@ static int __devinit sunsab_init_one(struct uart_sunsab_port *up,
 	return 0;
 }
 
-static int __devinit sab_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit sab_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	static int inst;
 	struct uart_sunsab_port *up;
@@ -1062,7 +1062,7 @@ out:
 	return err;
 }
 
-static int __devexit sab_remove(struct of_device *op)
+static int __devexit sab_remove(struct platform_device *op)
 {
 	struct uart_sunsab_port *up = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 3cdf74822db5..551ebfe3ccbb 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -1200,7 +1200,7 @@ static int __devinit sunsu_kbd_ms_init(struct uart_sunsu_port *up)
 		return -ENODEV;
 
 	printk("%s: %s port at %llx, irq %u\n",
-	       to_of_device(up->port.dev)->dev.of_node->full_name,
+	       up->port.dev->of_node->full_name,
 	       (up->su_type == SU_PORT_KBD) ? "Keyboard" : "Mouse",
 	       (unsigned long long) up->port.mapbase,
 	       up->port.irq);
@@ -1352,7 +1352,7 @@ static int __init sunsu_console_setup(struct console *co, char *options)
 	spin_lock_init(&port->lock);
 
 	/* Get firmware console settings.  */
-	sunserial_console_termios(co, to_of_device(port->dev)->dev.of_node);
+	sunserial_console_termios(co, port->dev->of_node);
 
 	memset(&termios, 0, sizeof(struct ktermios));
 	termios.c_cflag = co->cflag;
@@ -1406,7 +1406,7 @@ static enum su_type __devinit su_get_type(struct device_node *dp)
 	return SU_PORT_PORT;
 }
 
-static int __devinit su_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit su_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	static int inst;
 	struct device_node *dp = op->dev.of_node;
@@ -1497,7 +1497,7 @@ out_unmap:
 	return err;
 }
 
-static int __devexit su_remove(struct of_device *op)
+static int __devexit su_remove(struct platform_device *op)
 {
 	struct uart_sunsu_port *up = dev_get_drvdata(&op->dev);
 	bool kbdms = false;
diff --git a/drivers/serial/sunzilog.c b/drivers/serial/sunzilog.c
index d1e6bcb59546..c1967ac1c07f 100644
--- a/drivers/serial/sunzilog.c
+++ b/drivers/serial/sunzilog.c
@@ -1230,7 +1230,7 @@ static int __init sunzilog_console_setup(struct console *con, char *options)
 	       (sunzilog_reg.minor - 64) + con->index, con->index);
 
 	/* Get firmware console settings.  */
-	sunserial_console_termios(con, to_of_device(up->port.dev)->dev.of_node);
+	sunserial_console_termios(con, up->port.dev->of_node);
 
 	/* Firmware console speed is limited to 150-->38400 baud so
 	 * this hackish cflag thing is OK.
@@ -1399,7 +1399,7 @@ static void __devinit sunzilog_init_hw(struct uart_sunzilog_port *up)
 
 static int zilog_irq = -1;
 
-static int __devinit zs_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit zs_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	static int kbm_inst, uart_inst;
 	int inst;
@@ -1516,7 +1516,7 @@ static void __devexit zs_remove_one(struct uart_sunzilog_port *up)
 		uart_remove_one_port(&sunzilog_reg, &up->port);
 }
 
-static int __devexit zs_remove(struct of_device *op)
+static int __devexit zs_remove(struct platform_device *op)
 {
 	struct uart_sunzilog_port *up = dev_get_drvdata(&op->dev);
 	struct zilog_layout __iomem *regs;
diff --git a/drivers/serial/uartlite.c b/drivers/serial/uartlite.c
index caf085d3a76a..9b03d7b3e456 100644
--- a/drivers/serial/uartlite.c
+++ b/drivers/serial/uartlite.c
@@ -584,7 +584,7 @@ static struct platform_driver ulite_platform_driver = {
  */
 #if defined(CONFIG_OF) && (defined(CONFIG_PPC32) || defined(CONFIG_MICROBLAZE))
 static int __devinit
-ulite_of_probe(struct of_device *op, const struct of_device_id *match)
+ulite_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct resource res;
 	const unsigned int *id;
@@ -605,7 +605,7 @@ ulite_of_probe(struct of_device *op, const struct of_device_id *match)
 	return ulite_assign(&op->dev, id ? *id : -1, res.start, irq);
 }
 
-static int __devexit ulite_of_remove(struct of_device *op)
+static int __devexit ulite_of_remove(struct platform_device *op)
 {
 	return ulite_release(&op->dev);
 }
diff --git a/drivers/serial/ucc_uart.c b/drivers/serial/ucc_uart.c
index 907b06f5c447..3f4848e2174a 100644
--- a/drivers/serial/ucc_uart.c
+++ b/drivers/serial/ucc_uart.c
@@ -1194,7 +1194,7 @@ static void uart_firmware_cont(const struct firmware *fw, void *context)
 	release_firmware(fw);
 }
 
-static int ucc_uart_probe(struct of_device *ofdev,
+static int ucc_uart_probe(struct platform_device *ofdev,
 	const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -1462,7 +1462,7 @@ static int ucc_uart_probe(struct of_device *ofdev,
 	return 0;
 }
 
-static int ucc_uart_remove(struct of_device *ofdev)
+static int ucc_uart_remove(struct platform_device *ofdev)
 {
 	struct uart_qe_port *qe_port = dev_get_drvdata(&ofdev->dev);
 
diff --git a/drivers/spi/mpc512x_psc_spi.c b/drivers/spi/mpc512x_psc_spi.c
index 10baac3f8ea5..cddbfceb324f 100644
--- a/drivers/spi/mpc512x_psc_spi.c
+++ b/drivers/spi/mpc512x_psc_spi.c
@@ -507,7 +507,7 @@ static int __exit mpc512x_psc_spi_do_remove(struct device *dev)
 	return 0;
 }
 
-static int __init mpc512x_psc_spi_of_probe(struct of_device *op,
+static int __init mpc512x_psc_spi_of_probe(struct platform_device *op,
 					   const struct of_device_id *match)
 {
 	const u32 *regaddr_p;
@@ -539,7 +539,7 @@ static int __init mpc512x_psc_spi_of_probe(struct of_device *op,
 				irq_of_parse_and_map(op->dev.of_node, 0), id);
 }
 
-static int __exit mpc512x_psc_spi_of_remove(struct of_device *op)
+static int __exit mpc512x_psc_spi_of_remove(struct platform_device *op)
 {
 	return mpc512x_psc_spi_do_remove(&op->dev);
 }
diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c
index 66d170147dcc..983fbbfce76e 100644
--- a/drivers/spi/mpc52xx_psc_spi.c
+++ b/drivers/spi/mpc52xx_psc_spi.c
@@ -465,7 +465,7 @@ static int __exit mpc52xx_psc_spi_do_remove(struct device *dev)
 	return 0;
 }
 
-static int __init mpc52xx_psc_spi_of_probe(struct of_device *op,
+static int __init mpc52xx_psc_spi_of_probe(struct platform_device *op,
 	const struct of_device_id *match)
 {
 	const u32 *regaddr_p;
@@ -495,7 +495,7 @@ static int __init mpc52xx_psc_spi_of_probe(struct of_device *op,
 				irq_of_parse_and_map(op->dev.of_node, 0), id);
 }
 
-static int __exit mpc52xx_psc_spi_of_remove(struct of_device *op)
+static int __exit mpc52xx_psc_spi_of_remove(struct platform_device *op)
 {
 	return mpc52xx_psc_spi_do_remove(&op->dev);
 }
diff --git a/drivers/spi/mpc52xx_spi.c b/drivers/spi/mpc52xx_spi.c
index 56136ff00e01..ec9f0b1bf864 100644
--- a/drivers/spi/mpc52xx_spi.c
+++ b/drivers/spi/mpc52xx_spi.c
@@ -390,7 +390,7 @@ static int mpc52xx_spi_transfer(struct spi_device *spi, struct spi_message *m)
 /*
  * OF Platform Bus Binding
  */
-static int __devinit mpc52xx_spi_probe(struct of_device *op,
+static int __devinit mpc52xx_spi_probe(struct platform_device *op,
 				       const struct of_device_id *match)
 {
 	struct spi_master *master;
@@ -530,7 +530,7 @@ static int __devinit mpc52xx_spi_probe(struct of_device *op,
 	return rc;
 }
 
-static int __devexit mpc52xx_spi_remove(struct of_device *op)
+static int __devexit mpc52xx_spi_remove(struct platform_device *op)
 {
 	struct spi_master *master = dev_get_drvdata(&op->dev);
 	struct mpc52xx_spi *ms = spi_master_get_devdata(master);
diff --git a/drivers/spi/spi_mpc8xxx.c b/drivers/spi/spi_mpc8xxx.c
index aad9ae1b9c69..d31b57f7baaf 100644
--- a/drivers/spi/spi_mpc8xxx.c
+++ b/drivers/spi/spi_mpc8xxx.c
@@ -1236,7 +1236,7 @@ static int of_mpc8xxx_spi_free_chipselects(struct device *dev)
 	return 0;
 }
 
-static int __devinit of_mpc8xxx_spi_probe(struct of_device *ofdev,
+static int __devinit of_mpc8xxx_spi_probe(struct platform_device *ofdev,
 					  const struct of_device_id *ofid)
 {
 	struct device *dev = &ofdev->dev;
@@ -1308,7 +1308,7 @@ err_clk:
 	return ret;
 }
 
-static int __devexit of_mpc8xxx_spi_remove(struct of_device *ofdev)
+static int __devexit of_mpc8xxx_spi_remove(struct platform_device *ofdev)
 {
 	int ret;
 
diff --git a/drivers/spi/spi_ppc4xx.c b/drivers/spi/spi_ppc4xx.c
index 0f5fa7e2a550..80e172d3e72a 100644
--- a/drivers/spi/spi_ppc4xx.c
+++ b/drivers/spi/spi_ppc4xx.c
@@ -388,9 +388,9 @@ static void free_gpios(struct ppc4xx_spi *hw)
 }
 
 /*
- * of_device layer stuff...
+ * platform_device layer stuff...
  */
-static int __init spi_ppc4xx_of_probe(struct of_device *op,
+static int __init spi_ppc4xx_of_probe(struct platform_device *op,
 				      const struct of_device_id *match)
 {
 	struct ppc4xx_spi *hw;
@@ -565,7 +565,7 @@ free_master:
 	return ret;
 }
 
-static int __exit spi_ppc4xx_of_remove(struct of_device *op)
+static int __exit spi_ppc4xx_of_remove(struct platform_device *op)
 {
 	struct spi_master *master = dev_get_drvdata(&op->dev);
 	struct ppc4xx_spi *hw = spi_master_get_devdata(master);
diff --git a/drivers/spi/xilinx_spi_of.c b/drivers/spi/xilinx_spi_of.c
index f53d3f6b9f61..b66c2dbf20a5 100644
--- a/drivers/spi/xilinx_spi_of.c
+++ b/drivers/spi/xilinx_spi_of.c
@@ -38,7 +38,7 @@
 #include "xilinx_spi.h"
 
 
-static int __devinit xilinx_spi_of_probe(struct of_device *ofdev,
+static int __devinit xilinx_spi_of_probe(struct platform_device *ofdev,
 	const struct of_device_id *match)
 {
 	struct spi_master *master;
@@ -84,7 +84,7 @@ static int __devinit xilinx_spi_of_probe(struct of_device *ofdev,
 	return 0;
 }
 
-static int __devexit xilinx_spi_remove(struct of_device *ofdev)
+static int __devexit xilinx_spi_remove(struct platform_device *ofdev)
 {
 	xilinx_spi_deinit(dev_get_drvdata(&ofdev->dev));
 	dev_set_drvdata(&ofdev->dev, 0);
@@ -93,7 +93,7 @@ static int __devexit xilinx_spi_remove(struct of_device *ofdev)
 	return 0;
 }
 
-static int __exit xilinx_spi_of_remove(struct of_device *op)
+static int __exit xilinx_spi_of_remove(struct platform_device *op)
 {
 	return xilinx_spi_remove(op);
 }
diff --git a/drivers/usb/gadget/fsl_qe_udc.c b/drivers/usb/gadget/fsl_qe_udc.c
index 9648b75f0283..a5ea2c1d8c93 100644
--- a/drivers/usb/gadget/fsl_qe_udc.c
+++ b/drivers/usb/gadget/fsl_qe_udc.c
@@ -2398,7 +2398,7 @@ int usb_gadget_unregister_driver(struct usb_gadget_driver *driver)
 EXPORT_SYMBOL(usb_gadget_unregister_driver);
 
 /* udc structure's alloc and setup, include ep-param alloc */
-static struct qe_udc __devinit *qe_udc_config(struct of_device *ofdev)
+static struct qe_udc __devinit *qe_udc_config(struct platform_device *ofdev)
 {
 	struct qe_udc *udc;
 	struct device_node *np = ofdev->dev.of_node;
@@ -2523,7 +2523,7 @@ static void qe_udc_release(struct device *dev)
 }
 
 /* Driver probe functions */
-static int __devinit qe_udc_probe(struct of_device *ofdev,
+static int __devinit qe_udc_probe(struct platform_device *ofdev,
 			const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -2679,18 +2679,18 @@ err1:
 }
 
 #ifdef CONFIG_PM
-static int qe_udc_suspend(struct of_device *dev, pm_message_t state)
+static int qe_udc_suspend(struct platform_device *dev, pm_message_t state)
 {
 	return -ENOTSUPP;
 }
 
-static int qe_udc_resume(struct of_device *dev)
+static int qe_udc_resume(struct platform_device *dev)
 {
 	return -ENOTSUPP;
 }
 #endif
 
-static int __devexit qe_udc_remove(struct of_device *ofdev)
+static int __devexit qe_udc_remove(struct platform_device *ofdev)
 {
 	struct qe_ep *ep;
 	unsigned int size;
diff --git a/drivers/usb/host/ehci-ppc-of.c b/drivers/usb/host/ehci-ppc-of.c
index 5aec92866ab3..335ee699fd85 100644
--- a/drivers/usb/host/ehci-ppc-of.c
+++ b/drivers/usb/host/ehci-ppc-of.c
@@ -106,7 +106,7 @@ ppc44x_enable_bmt(struct device_node *dn)
 
 
 static int __devinit
-ehci_hcd_ppc_of_probe(struct of_device *op, const struct of_device_id *match)
+ehci_hcd_ppc_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dn = op->dev.of_node;
 	struct usb_hcd *hcd;
@@ -210,7 +210,7 @@ err_rmr:
 }
 
 
-static int ehci_hcd_ppc_of_remove(struct of_device *op)
+static int ehci_hcd_ppc_of_remove(struct platform_device *op)
 {
 	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
 	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
@@ -253,7 +253,7 @@ static int ehci_hcd_ppc_of_remove(struct of_device *op)
 }
 
 
-static int ehci_hcd_ppc_of_shutdown(struct of_device *op)
+static int ehci_hcd_ppc_of_shutdown(struct platform_device *op)
 {
 	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/usb/host/ehci-xilinx-of.c b/drivers/usb/host/ehci-xilinx-of.c
index 4899f451add9..6c8076ad821d 100644
--- a/drivers/usb/host/ehci-xilinx-of.c
+++ b/drivers/usb/host/ehci-xilinx-of.c
@@ -140,7 +140,7 @@ static const struct hc_driver ehci_xilinx_of_hc_driver = {
 
 /**
  * ehci_hcd_xilinx_of_probe - Probe method for the USB host controller
- * @op:		pointer to the of_device to which the host controller bound
+ * @op:		pointer to the platform_device bound to the host controller
  * @match:	pointer to of_device_id structure, not used
  *
  * This function requests resources and sets up appropriate properties for the
@@ -149,7 +149,7 @@ static const struct hc_driver ehci_xilinx_of_hc_driver = {
  * entry, and sets an appropriate value for hcd->has_tt.
  */
 static int __devinit
-ehci_hcd_xilinx_of_probe(struct of_device *op, const struct of_device_id *match)
+ehci_hcd_xilinx_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dn = op->dev.of_node;
 	struct usb_hcd *hcd;
@@ -242,12 +242,12 @@ err_rmr:
 
 /**
  * ehci_hcd_xilinx_of_remove - shutdown hcd and release resources
- * @op:		pointer to of_device structure that is to be removed
+ * @op:		pointer to platform_device structure that is to be removed
  *
  * Remove the hcd structure, and release resources that has been requested
  * during probe.
  */
-static int ehci_hcd_xilinx_of_remove(struct of_device *op)
+static int ehci_hcd_xilinx_of_remove(struct platform_device *op)
 {
 	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
 	dev_set_drvdata(&op->dev, NULL);
@@ -266,11 +266,11 @@ static int ehci_hcd_xilinx_of_remove(struct of_device *op)
 
 /**
  * ehci_hcd_xilinx_of_shutdown - shutdown the hcd
- * @op:		pointer to of_device structure that is to be removed
+ * @op:		pointer to platform_device structure that is to be removed
  *
  * Properly shutdown the hcd, call driver's shutdown routine.
  */
-static int ehci_hcd_xilinx_of_shutdown(struct of_device *op)
+static int ehci_hcd_xilinx_of_shutdown(struct platform_device *op)
 {
 	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/usb/host/fhci-hcd.c b/drivers/usb/host/fhci-hcd.c
index c7c8392a88b9..20092a27a1e8 100644
--- a/drivers/usb/host/fhci-hcd.c
+++ b/drivers/usb/host/fhci-hcd.c
@@ -561,7 +561,7 @@ static const struct hc_driver fhci_driver = {
 	.hub_control = fhci_hub_control,
 };
 
-static int __devinit of_fhci_probe(struct of_device *ofdev,
+static int __devinit of_fhci_probe(struct platform_device *ofdev,
 				   const struct of_device_id *ofid)
 {
 	struct device *dev = &ofdev->dev;
@@ -801,7 +801,7 @@ static int __devexit fhci_remove(struct device *dev)
 	return 0;
 }
 
-static int __devexit of_fhci_remove(struct of_device *ofdev)
+static int __devexit of_fhci_remove(struct platform_device *ofdev)
 {
 	return fhci_remove(&ofdev->dev);
 }
diff --git a/drivers/usb/host/isp1760-if.c b/drivers/usb/host/isp1760-if.c
index ec85d0c3cc3e..3b28dbfca058 100644
--- a/drivers/usb/host/isp1760-if.c
+++ b/drivers/usb/host/isp1760-if.c
@@ -27,7 +27,7 @@
 #endif
 
 #ifdef CONFIG_PPC_OF
-static int of_isp1760_probe(struct of_device *dev,
+static int of_isp1760_probe(struct platform_device *dev,
 		const struct of_device_id *match)
 {
 	struct usb_hcd *hcd;
@@ -95,7 +95,7 @@ release_reg:
 	return ret;
 }
 
-static int of_isp1760_remove(struct of_device *dev)
+static int of_isp1760_remove(struct platform_device *dev)
 {
 	struct usb_hcd *hcd = dev_get_drvdata(&dev->dev);
 
diff --git a/drivers/usb/host/ohci-ppc-of.c b/drivers/usb/host/ohci-ppc-of.c
index df165917412a..b2c2dbf08766 100644
--- a/drivers/usb/host/ohci-ppc-of.c
+++ b/drivers/usb/host/ohci-ppc-of.c
@@ -81,7 +81,7 @@ static const struct hc_driver ohci_ppc_of_hc_driver = {
 
 
 static int __devinit
-ohci_hcd_ppc_of_probe(struct of_device *op, const struct of_device_id *match)
+ohci_hcd_ppc_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dn = op->dev.of_node;
 	struct usb_hcd *hcd;
@@ -183,7 +183,7 @@ err_rmr:
 	return rv;
 }
 
-static int ohci_hcd_ppc_of_remove(struct of_device *op)
+static int ohci_hcd_ppc_of_remove(struct platform_device *op)
 {
 	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
 	dev_set_drvdata(&op->dev, NULL);
@@ -201,7 +201,7 @@ static int ohci_hcd_ppc_of_remove(struct of_device *op)
 	return 0;
 }
 
-static int ohci_hcd_ppc_of_shutdown(struct of_device *op)
+static int ohci_hcd_ppc_of_shutdown(struct platform_device *op)
 {
 	struct usb_hcd *hcd = dev_get_drvdata(&op->dev);
 
diff --git a/drivers/video/bw2.c b/drivers/video/bw2.c
index c7796637bafd..4dc13467281d 100644
--- a/drivers/video/bw2.c
+++ b/drivers/video/bw2.c
@@ -273,7 +273,7 @@ static int __devinit bw2_do_default_mode(struct bw2_par *par,
 	return 0;
 }
 
-static int __devinit bw2_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit bw2_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
 	struct fb_info *info;
@@ -350,7 +350,7 @@ out_err:
 	return err;
 }
 
-static int __devexit bw2_remove(struct of_device *op)
+static int __devexit bw2_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct bw2_par *par = info->par;
diff --git a/drivers/video/cg14.c b/drivers/video/cg14.c
index d09fde8beb69..24249535ac86 100644
--- a/drivers/video/cg14.c
+++ b/drivers/video/cg14.c
@@ -446,7 +446,7 @@ static struct sbus_mmap_map __cg14_mmap_map[CG14_MMAP_ENTRIES] __devinitdata = {
 	{ .size = 0 }
 };
 
-static void cg14_unmap_regs(struct of_device *op, struct fb_info *info,
+static void cg14_unmap_regs(struct platform_device *op, struct fb_info *info,
 			    struct cg14_par *par)
 {
 	if (par->regs)
@@ -463,7 +463,7 @@ static void cg14_unmap_regs(struct of_device *op, struct fb_info *info,
 			   info->screen_base, info->fix.smem_len);
 }
 
-static int __devinit cg14_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit cg14_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
 	struct fb_info *info;
@@ -570,7 +570,7 @@ out_err:
 	return err;
 }
 
-static int __devexit cg14_remove(struct of_device *op)
+static int __devexit cg14_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct cg14_par *par = info->par;
diff --git a/drivers/video/cg3.c b/drivers/video/cg3.c
index 64aa29809fb9..09c0c3c42482 100644
--- a/drivers/video/cg3.c
+++ b/drivers/video/cg3.c
@@ -346,7 +346,7 @@ static int __devinit cg3_do_default_mode(struct cg3_par *par)
 	return 0;
 }
 
-static int __devinit cg3_probe(struct of_device *op,
+static int __devinit cg3_probe(struct platform_device *op,
 			       const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
@@ -433,7 +433,7 @@ out_err:
 	return err;
 }
 
-static int __devexit cg3_remove(struct of_device *op)
+static int __devexit cg3_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct cg3_par *par = info->par;
diff --git a/drivers/video/cg6.c b/drivers/video/cg6.c
index 2389a719dcc7..2b5a97058b08 100644
--- a/drivers/video/cg6.c
+++ b/drivers/video/cg6.c
@@ -718,7 +718,7 @@ static void __devinit cg6_chip_init(struct fb_info *info)
 	sbus_writel(info->var.yres - 1, &fbc->clipmaxy);
 }
 
-static void cg6_unmap_regs(struct of_device *op, struct fb_info *info,
+static void cg6_unmap_regs(struct platform_device *op, struct fb_info *info,
 			   struct cg6_par *par)
 {
 	if (par->fbc)
@@ -737,7 +737,7 @@ static void cg6_unmap_regs(struct of_device *op, struct fb_info *info,
 			   info->fix.smem_len);
 }
 
-static int __devinit cg6_probe(struct of_device *op,
+static int __devinit cg6_probe(struct platform_device *op,
 				const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
@@ -827,7 +827,7 @@ out_err:
 	return err;
 }
 
-static int __devexit cg6_remove(struct of_device *op)
+static int __devexit cg6_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct cg6_par *par = info->par;
diff --git a/drivers/video/ffb.c b/drivers/video/ffb.c
index f6ecfab296d3..6739b2af3bc0 100644
--- a/drivers/video/ffb.c
+++ b/drivers/video/ffb.c
@@ -893,7 +893,7 @@ static void ffb_init_fix(struct fb_info *info)
 	info->fix.accel = FB_ACCEL_SUN_CREATOR;
 }
 
-static int __devinit ffb_probe(struct of_device *op,
+static int __devinit ffb_probe(struct platform_device *op,
 			       const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
@@ -1023,7 +1023,7 @@ out_err:
 	return err;
 }
 
-static int __devexit ffb_remove(struct of_device *op)
+static int __devexit ffb_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct ffb_par *par = info->par;
diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c
index e38ad2224540..8bbbf08fa3ce 100644
--- a/drivers/video/fsl-diu-fb.c
+++ b/drivers/video/fsl-diu-fb.c
@@ -1393,7 +1393,7 @@ static void free_irq_local(int irq)
  * Power management hooks. Note that we won't be called from IRQ context,
  * unlike the blank functions above, so we may sleep.
  */
-static int fsl_diu_suspend(struct of_device *ofdev, pm_message_t state)
+static int fsl_diu_suspend(struct platform_device *ofdev, pm_message_t state)
 {
 	struct fsl_diu_data *machine_data;
 
@@ -1403,7 +1403,7 @@ static int fsl_diu_suspend(struct of_device *ofdev, pm_message_t state)
 	return 0;
 }
 
-static int fsl_diu_resume(struct of_device *ofdev)
+static int fsl_diu_resume(struct platform_device *ofdev)
 {
 	struct fsl_diu_data *machine_data;
 
@@ -1487,7 +1487,7 @@ static ssize_t show_monitor(struct device *device,
 	return diu_ops.show_monitor_port(machine_data->monitor_port, buf);
 }
 
-static int __devinit fsl_diu_probe(struct of_device *ofdev,
+static int __devinit fsl_diu_probe(struct platform_device *ofdev,
 	const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -1667,7 +1667,7 @@ error2:
 }
 
 
-static int fsl_diu_remove(struct of_device *ofdev)
+static int fsl_diu_remove(struct platform_device *ofdev)
 {
 	struct fsl_diu_data *machine_data;
 	int i;
diff --git a/drivers/video/leo.c b/drivers/video/leo.c
index ad677637ffbb..b599e5e36ced 100644
--- a/drivers/video/leo.c
+++ b/drivers/video/leo.c
@@ -529,7 +529,7 @@ static void leo_fixup_var_rgb(struct fb_var_screeninfo *var)
 	var->transp.length = 0;
 }
 
-static void leo_unmap_regs(struct of_device *op, struct fb_info *info,
+static void leo_unmap_regs(struct platform_device *op, struct fb_info *info,
 			   struct leo_par *par)
 {
 	if (par->lc_ss0_usr)
@@ -547,7 +547,7 @@ static void leo_unmap_regs(struct of_device *op, struct fb_info *info,
 		of_iounmap(&op->resource[0], info->screen_base, 0x800000);
 }
 
-static int __devinit leo_probe(struct of_device *op,
+static int __devinit leo_probe(struct platform_device *op,
 			       const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
@@ -637,7 +637,7 @@ out_err:
 	return err;
 }
 
-static int __devexit leo_remove(struct of_device *op)
+static int __devexit leo_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct leo_par *par = info->par;
diff --git a/drivers/video/mb862xx/mb862xxfb.c b/drivers/video/mb862xx/mb862xxfb.c
index 4e2b8cc3d460..b1c4374cf940 100644
--- a/drivers/video/mb862xx/mb862xxfb.c
+++ b/drivers/video/mb862xx/mb862xxfb.c
@@ -550,7 +550,7 @@ static int mb862xx_gdc_init(struct mb862xxfb_par *par)
 	return 0;
 }
 
-static int __devinit of_platform_mb862xx_probe(struct of_device *ofdev,
+static int __devinit of_platform_mb862xx_probe(struct platform_device *ofdev,
 					       const struct of_device_id *id)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -669,7 +669,7 @@ fbrel:
 	return ret;
 }
 
-static int __devexit of_platform_mb862xx_remove(struct of_device *ofdev)
+static int __devexit of_platform_mb862xx_remove(struct platform_device *ofdev)
 {
 	struct fb_info *fbi = dev_get_drvdata(&ofdev->dev);
 	struct mb862xxfb_par *par = fbi->par;
diff --git a/drivers/video/p9100.c b/drivers/video/p9100.c
index 688b055abab2..b6c3fc2db632 100644
--- a/drivers/video/p9100.c
+++ b/drivers/video/p9100.c
@@ -249,7 +249,7 @@ static void p9100_init_fix(struct fb_info *info, int linebytes, struct device_no
 	info->fix.accel = FB_ACCEL_SUN_CGTHREE;
 }
 
-static int __devinit p9100_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit p9100_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
 	struct fb_info *info;
@@ -326,7 +326,7 @@ out_err:
 	return err;
 }
 
-static int __devexit p9100_remove(struct of_device *op)
+static int __devexit p9100_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct p9100_par *par = info->par;
diff --git a/drivers/video/platinumfb.c b/drivers/video/platinumfb.c
index 72a1f4c04732..a50e1977b316 100644
--- a/drivers/video/platinumfb.c
+++ b/drivers/video/platinumfb.c
@@ -533,7 +533,7 @@ static int __init platinumfb_setup(char *options)
 #define invalidate_cache(addr)
 #endif
 
-static int __devinit platinumfb_probe(struct of_device* odev,
+static int __devinit platinumfb_probe(struct platform_device* odev,
 				      const struct of_device_id *match)
 {
 	struct device_node	*dp = odev->dev.of_node;
@@ -646,7 +646,7 @@ static int __devinit platinumfb_probe(struct of_device* odev,
 	return rc;
 }
 
-static int __devexit platinumfb_remove(struct of_device* odev)
+static int __devexit platinumfb_remove(struct platform_device* odev)
 {
 	struct fb_info		*info = dev_get_drvdata(&odev->dev);
 	struct fb_info_platinum	*pinfo = info->par;
diff --git a/drivers/video/sunxvr1000.c b/drivers/video/sunxvr1000.c
index 7288934c0d49..5dbe06af2226 100644
--- a/drivers/video/sunxvr1000.c
+++ b/drivers/video/sunxvr1000.c
@@ -111,7 +111,7 @@ static int __devinit gfb_set_fbinfo(struct gfb_info *gp)
         return 0;
 }
 
-static int __devinit gfb_probe(struct of_device *op,
+static int __devinit gfb_probe(struct platform_device *op,
 			       const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
@@ -172,7 +172,7 @@ err_out:
 	return err;
 }
 
-static int __devexit gfb_remove(struct of_device *op)
+static int __devexit gfb_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct gfb_info *gp = info->par;
diff --git a/drivers/video/tcx.c b/drivers/video/tcx.c
index f375e0db6776..77ad27955cf0 100644
--- a/drivers/video/tcx.c
+++ b/drivers/video/tcx.c
@@ -342,7 +342,7 @@ tcx_init_fix(struct fb_info *info, int linebytes)
 	info->fix.accel = FB_ACCEL_SUN_TCX;
 }
 
-static void tcx_unmap_regs(struct of_device *op, struct fb_info *info,
+static void tcx_unmap_regs(struct platform_device *op, struct fb_info *info,
 			   struct tcx_par *par)
 {
 	if (par->tec)
@@ -362,7 +362,7 @@ static void tcx_unmap_regs(struct of_device *op, struct fb_info *info,
 			   info->screen_base, info->fix.smem_len);
 }
 
-static int __devinit tcx_probe(struct of_device *op,
+static int __devinit tcx_probe(struct platform_device *op,
 			       const struct of_device_id *match)
 {
 	struct device_node *dp = op->dev.of_node;
@@ -486,7 +486,7 @@ out_err:
 	return err;
 }
 
-static int __devexit tcx_remove(struct of_device *op)
+static int __devexit tcx_remove(struct platform_device *op)
 {
 	struct fb_info *info = dev_get_drvdata(&op->dev);
 	struct tcx_par *par = info->par;
diff --git a/drivers/video/xilinxfb.c b/drivers/video/xilinxfb.c
index 29b5daacc217..0c9ce88e95e8 100644
--- a/drivers/video/xilinxfb.c
+++ b/drivers/video/xilinxfb.c
@@ -397,7 +397,7 @@ static int xilinxfb_release(struct device *dev)
  */
 
 static int __devinit
-xilinxfb_of_probe(struct of_device *op, const struct of_device_id *match)
+xilinxfb_of_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	const u32 *prop;
 	u32 *p;
@@ -477,7 +477,7 @@ xilinxfb_of_probe(struct of_device *op, const struct of_device_id *match)
 	return -ENODEV;
 }
 
-static int __devexit xilinxfb_of_remove(struct of_device *op)
+static int __devexit xilinxfb_of_remove(struct platform_device *op)
 {
 	return xilinxfb_release(&op->dev);
 }
diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c
index 30a2512fd52e..566343b3c131 100644
--- a/drivers/watchdog/cpwd.c
+++ b/drivers/watchdog/cpwd.c
@@ -526,7 +526,7 @@ static const struct file_operations cpwd_fops = {
 	.release =		cpwd_release,
 };
 
-static int __devinit cpwd_probe(struct of_device *op,
+static int __devinit cpwd_probe(struct platform_device *op,
 				const struct of_device_id *match)
 {
 	struct device_node *options;
@@ -639,7 +639,7 @@ out_free:
 	goto out;
 }
 
-static int __devexit cpwd_remove(struct of_device *op)
+static int __devexit cpwd_remove(struct platform_device *op)
 {
 	struct cpwd *p = dev_get_drvdata(&op->dev);
 	int i;
diff --git a/drivers/watchdog/gef_wdt.c b/drivers/watchdog/gef_wdt.c
index 1df284f9c2a1..9c21d19043a6 100644
--- a/drivers/watchdog/gef_wdt.c
+++ b/drivers/watchdog/gef_wdt.c
@@ -260,7 +260,7 @@ static struct miscdevice gef_wdt_miscdev = {
 };
 
 
-static int __devinit gef_wdt_probe(struct of_device *dev,
+static int __devinit gef_wdt_probe(struct platform_device *dev,
 	const struct of_device_id *match)
 {
 	int timeout = 10;
diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
index 4cda64dd309c..8fa213cdb499 100644
--- a/drivers/watchdog/mpc8xxx_wdt.c
+++ b/drivers/watchdog/mpc8xxx_wdt.c
@@ -185,7 +185,7 @@ static struct miscdevice mpc8xxx_wdt_miscdev = {
 	.fops	= &mpc8xxx_wdt_fops,
 };
 
-static int __devinit mpc8xxx_wdt_probe(struct of_device *ofdev,
+static int __devinit mpc8xxx_wdt_probe(struct platform_device *ofdev,
 				       const struct of_device_id *match)
 {
 	int ret;
@@ -238,7 +238,7 @@ err_unmap:
 	return ret;
 }
 
-static int __devexit mpc8xxx_wdt_remove(struct of_device *ofdev)
+static int __devexit mpc8xxx_wdt_remove(struct platform_device *ofdev)
 {
 	mpc8xxx_wdt_pr_warn("watchdog removed");
 	del_timer_sync(&wdt_timer);
diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c
index 4082b4ace1fc..3faee1ae64bd 100644
--- a/drivers/watchdog/riowd.c
+++ b/drivers/watchdog/riowd.c
@@ -172,7 +172,7 @@ static struct miscdevice riowd_miscdev = {
 	.fops	= &riowd_fops
 };
 
-static int __devinit riowd_probe(struct of_device *op,
+static int __devinit riowd_probe(struct platform_device *op,
 				 const struct of_device_id *match)
 {
 	struct riowd *p;
@@ -219,7 +219,7 @@ out:
 	return err;
 }
 
-static int __devexit riowd_remove(struct of_device *op)
+static int __devexit riowd_remove(struct platform_device *op)
 {
 	struct riowd *p = dev_get_drvdata(&op->dev);
 
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 35aa44ad9f2c..835f85ecd2de 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -1,20 +1,6 @@
 #ifndef _LINUX_OF_DEVICE_H
 #define _LINUX_OF_DEVICE_H
 
-/*
- * The of_device *was* a kind of "base class" that was a superset of
- * struct device for use by devices attached to an OF node and probed
- * using OF properties.  However, the important bit of OF-style
- * probing, namely the device node pointer, has been moved into the
- * common struct device when CONFIG_OF is set to make OF-style probing
- * available to all bus types.  So now, just make of_device and
- * platform_device equivalent so that current of_platform bus users
- * can be transparently migrated over to using the platform bus.
- *
- * This line will go away once all references to of_device are removed
- * from the kernel.
- */
-#define of_device platform_device
 #include <linux/platform_device.h>
 #include <linux/of_platform.h> /* temporary until merge */
 
@@ -23,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/mod_devicetable.h>
 
-#define	to_of_device(d) container_of(d, struct of_device, dev)
-
 extern const struct of_device_id *of_match_device(
 	const struct of_device_id *matches, const struct device *dev);
 extern void of_device_make_bus_id(struct device *dev);
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 4e6d989c06df..a68716ad38ce 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -19,9 +19,17 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 
-/*
- * An of_platform_driver driver is attached to a basic of_device on
- * the "platform bus" (platform_bus_type).
+/**
+ * of_platform_driver - Legacy of-aware driver for platform devices.
+ *
+ * An of_platform_driver driver is attached to a basic platform_device on
+ * ether the "platform bus" (platform_bus_type), or the ibm ebus
+ * (ibmebus_bus_type).
+ *
+ * of_platform_driver is being phased out when used with the platform_bus_type,
+ * and regular platform_drivers should be used instead.  When the transition
+ * is complete, only ibmebus will be using this structure, and the
+ * platform_driver member of this structure will be removed.
  */
 struct of_platform_driver
 {
diff --git a/sound/aoa/soundbus/core.c b/sound/aoa/soundbus/core.c
index 99ca7120e269..7487eb76e034 100644
--- a/sound/aoa/soundbus/core.c
+++ b/sound/aoa/soundbus/core.c
@@ -59,7 +59,7 @@ static int soundbus_probe(struct device *dev)
 static int soundbus_uevent(struct device *dev, struct kobj_uevent_env *env)
 {
 	struct soundbus_dev * soundbus_dev;
-	struct of_device * of;
+	struct platform_device * of;
 	const char *compat;
 	int retval = 0;
 	int cplen, seen = 0;
diff --git a/sound/aoa/soundbus/soundbus.h b/sound/aoa/soundbus/soundbus.h
index a0f223c13f66..adecbf36f4f6 100644
--- a/sound/aoa/soundbus/soundbus.h
+++ b/sound/aoa/soundbus/soundbus.h
@@ -141,7 +141,7 @@ struct soundbus_dev {
 	struct list_head onbuslist;
 
 	/* the of device it represents */
-	struct of_device ofdev;
+	struct platform_device ofdev;
 
 	/* what modules go by */
 	char modalias[32];
diff --git a/sound/aoa/soundbus/sysfs.c b/sound/aoa/soundbus/sysfs.c
index 6496e754f00a..e0980b5c2cd8 100644
--- a/sound/aoa/soundbus/sysfs.c
+++ b/sound/aoa/soundbus/sysfs.c
@@ -16,7 +16,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
 	struct soundbus_dev *sdev = to_soundbus_device(dev);
-	struct of_device *of = &sdev->ofdev;
+	struct platform_device *of = &sdev->ofdev;
 	int length;
 
 	if (*sdev->modalias) {
diff --git a/sound/soc/fsl/mpc5200_dma.c b/sound/soc/fsl/mpc5200_dma.c
index 1d4e7164e80a..3dcd1469f283 100644
--- a/sound/soc/fsl/mpc5200_dma.c
+++ b/sound/soc/fsl/mpc5200_dma.c
@@ -369,7 +369,7 @@ struct snd_soc_platform mpc5200_audio_dma_platform = {
 };
 EXPORT_SYMBOL_GPL(mpc5200_audio_dma_platform);
 
-int mpc5200_audio_dma_create(struct of_device *op)
+int mpc5200_audio_dma_create(struct platform_device *op)
 {
 	phys_addr_t fifo;
 	struct psc_dma *psc_dma;
@@ -488,7 +488,7 @@ out_unmap:
 }
 EXPORT_SYMBOL_GPL(mpc5200_audio_dma_create);
 
-int mpc5200_audio_dma_destroy(struct of_device *op)
+int mpc5200_audio_dma_destroy(struct platform_device *op)
 {
 	struct psc_dma *psc_dma = dev_get_drvdata(&op->dev);
 
diff --git a/sound/soc/fsl/mpc5200_dma.h b/sound/soc/fsl/mpc5200_dma.h
index e1ec6d91ea38..ca99586f2ad9 100644
--- a/sound/soc/fsl/mpc5200_dma.h
+++ b/sound/soc/fsl/mpc5200_dma.h
@@ -81,8 +81,8 @@ to_psc_dma_stream(struct snd_pcm_substream *substream, struct psc_dma *psc_dma)
 	return &psc_dma->playback;
 }
 
-int mpc5200_audio_dma_create(struct of_device *op);
-int mpc5200_audio_dma_destroy(struct of_device *op);
+int mpc5200_audio_dma_create(struct platform_device *op);
+int mpc5200_audio_dma_destroy(struct platform_device *op);
 
 extern struct snd_soc_platform mpc5200_audio_dma_platform;
 
diff --git a/sound/soc/fsl/mpc5200_psc_ac97.c b/sound/soc/fsl/mpc5200_psc_ac97.c
index e2ee220bfb7e..a0310170a170 100644
--- a/sound/soc/fsl/mpc5200_psc_ac97.c
+++ b/sound/soc/fsl/mpc5200_psc_ac97.c
@@ -263,7 +263,7 @@ EXPORT_SYMBOL_GPL(psc_ac97_dai);
  * - Probe/remove operations
  * - OF device match table
  */
-static int __devinit psc_ac97_of_probe(struct of_device *op,
+static int __devinit psc_ac97_of_probe(struct platform_device *op,
 				      const struct of_device_id *match)
 {
 	int rc, i;
@@ -303,7 +303,7 @@ static int __devinit psc_ac97_of_probe(struct of_device *op,
 	return 0;
 }
 
-static int __devexit psc_ac97_of_remove(struct of_device *op)
+static int __devexit psc_ac97_of_remove(struct platform_device *op)
 {
 	return mpc5200_audio_dma_destroy(op);
 }
diff --git a/sound/soc/fsl/mpc5200_psc_i2s.c b/sound/soc/fsl/mpc5200_psc_i2s.c
index 4f455bd6851f..9aee748eca7d 100644
--- a/sound/soc/fsl/mpc5200_psc_i2s.c
+++ b/sound/soc/fsl/mpc5200_psc_i2s.c
@@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(psc_i2s_dai);
  * - Probe/remove operations
  * - OF device match table
  */
-static int __devinit psc_i2s_of_probe(struct of_device *op,
+static int __devinit psc_i2s_of_probe(struct platform_device *op,
 				      const struct of_device_id *match)
 {
 	int rc;
@@ -206,7 +206,7 @@ static int __devinit psc_i2s_of_probe(struct of_device *op,
 
 }
 
-static int __devexit psc_i2s_of_remove(struct of_device *op)
+static int __devexit psc_i2s_of_remove(struct platform_device *op)
 {
 	return mpc5200_audio_dma_destroy(op);
 }
diff --git a/sound/soc/fsl/mpc8610_hpcd.c b/sound/soc/fsl/mpc8610_hpcd.c
index 3a501062c244..3b13b8d65262 100644
--- a/sound/soc/fsl/mpc8610_hpcd.c
+++ b/sound/soc/fsl/mpc8610_hpcd.c
@@ -200,7 +200,7 @@ static struct snd_soc_ops mpc8610_hpcd_ops = {
  * SSI devices.  We also probably aren't compatible with the generic Elo DMA
  * device driver.
  */
-static int mpc8610_hpcd_probe(struct of_device *ofdev,
+static int mpc8610_hpcd_probe(struct platform_device *ofdev,
 	const struct of_device_id *match)
 {
 	struct device_node *np = ofdev->dev.of_node;
@@ -534,7 +534,7 @@ error:
  *
  * This function is called when the OF device is removed.
  */
-static int mpc8610_hpcd_remove(struct of_device *ofdev)
+static int mpc8610_hpcd_remove(struct platform_device *ofdev)
 {
 	struct platform_device *sound_device = dev_get_drvdata(&ofdev->dev);
 	struct mpc8610_hpcd_data *machine_data =
diff --git a/sound/sparc/amd7930.c b/sound/sparc/amd7930.c
index 9eb1a4e0363b..f8bcfc30f800 100644
--- a/sound/sparc/amd7930.c
+++ b/sound/sparc/amd7930.c
@@ -336,7 +336,7 @@ struct snd_amd7930 {
 	int			pgain;
 	int			mgain;
 
-	struct of_device	*op;
+	struct platform_device	*op;
 	unsigned int		irq;
 	struct snd_amd7930	*next;
 };
@@ -906,7 +906,7 @@ static int __devinit snd_amd7930_mixer(struct snd_amd7930 *amd)
 
 static int snd_amd7930_free(struct snd_amd7930 *amd)
 {
-	struct of_device *op = amd->op;
+	struct platform_device *op = amd->op;
 
 	amd7930_idle(amd);
 
@@ -934,7 +934,7 @@ static struct snd_device_ops snd_amd7930_dev_ops = {
 };
 
 static int __devinit snd_amd7930_create(struct snd_card *card,
-					struct of_device *op,
+					struct platform_device *op,
 					int irq, int dev,
 					struct snd_amd7930 **ramd)
 {
@@ -1002,7 +1002,7 @@ static int __devinit snd_amd7930_create(struct snd_card *card,
 	return 0;
 }
 
-static int __devinit amd7930_sbus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit amd7930_sbus_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct resource *rp = &op->resource[0];
 	static int dev_num;
diff --git a/sound/sparc/cs4231.c b/sound/sparc/cs4231.c
index 68570ee2c9bb..c276086c3b57 100644
--- a/sound/sparc/cs4231.c
+++ b/sound/sparc/cs4231.c
@@ -111,7 +111,7 @@ struct snd_cs4231 {
 	struct mutex		mce_mutex;	/* mutex for mce register */
 	struct mutex		open_mutex;	/* mutex for ALSA open/close */
 
-	struct of_device	*op;
+	struct platform_device	*op;
 	unsigned int		irq[2];
 	unsigned int		regs_size;
 	struct snd_cs4231	*next;
@@ -1771,7 +1771,7 @@ static unsigned int sbus_dma_addr(struct cs4231_dma_control *dma_cont)
 
 static int snd_cs4231_sbus_free(struct snd_cs4231 *chip)
 {
-	struct of_device *op = chip->op;
+	struct platform_device *op = chip->op;
 
 	if (chip->irq[0])
 		free_irq(chip->irq[0], chip);
@@ -1794,7 +1794,7 @@ static struct snd_device_ops snd_cs4231_sbus_dev_ops = {
 };
 
 static int __devinit snd_cs4231_sbus_create(struct snd_card *card,
-					    struct of_device *op,
+					    struct platform_device *op,
 					    int dev)
 {
 	struct snd_cs4231 *chip = card->private_data;
@@ -1856,7 +1856,7 @@ static int __devinit snd_cs4231_sbus_create(struct snd_card *card,
 	return 0;
 }
 
-static int __devinit cs4231_sbus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit cs4231_sbus_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct resource *rp = &op->resource[0];
 	struct snd_card *card;
@@ -1931,7 +1931,7 @@ static unsigned int _ebus_dma_addr(struct cs4231_dma_control *dma_cont)
 
 static int snd_cs4231_ebus_free(struct snd_cs4231 *chip)
 {
-	struct of_device *op = chip->op;
+	struct platform_device *op = chip->op;
 
 	if (chip->c_dma.ebus_info.regs) {
 		ebus_dma_unregister(&chip->c_dma.ebus_info);
@@ -1960,7 +1960,7 @@ static struct snd_device_ops snd_cs4231_ebus_dev_ops = {
 };
 
 static int __devinit snd_cs4231_ebus_create(struct snd_card *card,
-					    struct of_device *op,
+					    struct platform_device *op,
 					    int dev)
 {
 	struct snd_cs4231 *chip = card->private_data;
@@ -2048,7 +2048,7 @@ static int __devinit snd_cs4231_ebus_create(struct snd_card *card,
 	return 0;
 }
 
-static int __devinit cs4231_ebus_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit cs4231_ebus_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct snd_card *card;
 	int err;
@@ -2072,7 +2072,7 @@ static int __devinit cs4231_ebus_probe(struct of_device *op, const struct of_dev
 }
 #endif
 
-static int __devinit cs4231_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit cs4231_probe(struct platform_device *op, const struct of_device_id *match)
 {
 #ifdef EBUS_SUPPORT
 	if (!strcmp(op->dev.of_node->parent->name, "ebus"))
@@ -2086,7 +2086,7 @@ static int __devinit cs4231_probe(struct of_device *op, const struct of_device_i
 	return -ENODEV;
 }
 
-static int __devexit cs4231_remove(struct of_device *op)
+static int __devexit cs4231_remove(struct platform_device *op)
 {
 	struct snd_cs4231 *chip = dev_get_drvdata(&op->dev);
 
diff --git a/sound/sparc/dbri.c b/sound/sparc/dbri.c
index c421901c48d0..39cd5d69d051 100644
--- a/sound/sparc/dbri.c
+++ b/sound/sparc/dbri.c
@@ -299,7 +299,7 @@ struct dbri_streaminfo {
 /* This structure holds the information for both chips (DBRI & CS4215) */
 struct snd_dbri {
 	int regs_size, irq;	/* Needed for unload */
-	struct of_device *op;	/* OF device info */
+	struct platform_device *op;	/* OF device info */
 	spinlock_t lock;
 
 	struct dbri_dma *dma;	/* Pointer to our DMA block */
@@ -2523,7 +2523,7 @@ static void __devinit snd_dbri_proc(struct snd_card *card)
 static void snd_dbri_free(struct snd_dbri *dbri);
 
 static int __devinit snd_dbri_create(struct snd_card *card,
-				     struct of_device *op,
+				     struct platform_device *op,
 				     int irq, int dev)
 {
 	struct snd_dbri *dbri = card->private_data;
@@ -2592,7 +2592,7 @@ static void snd_dbri_free(struct snd_dbri *dbri)
 				  (void *)dbri->dma, dbri->dma_dvma);
 }
 
-static int __devinit dbri_probe(struct of_device *op, const struct of_device_id *match)
+static int __devinit dbri_probe(struct platform_device *op, const struct of_device_id *match)
 {
 	struct snd_dbri *dbri;
 	struct resource *rp;
@@ -2662,7 +2662,7 @@ _err:
 	return err;
 }
 
-static int __devexit dbri_remove(struct of_device *op)
+static int __devexit dbri_remove(struct platform_device *op)
 {
 	struct snd_card *card = dev_get_drvdata(&op->dev);
 
-- 
cgit v1.2.3-59-g8ed1b


From 31d1d48e199e99077fb30f6fb9a793be7bec756f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Aug 2010 16:34:43 +0100
Subject: Fix init ordering of /dev/console vs callers of modprobe

Make /dev/console get initialised before any initialisation routine that
invokes modprobe because if modprobe fails, it's going to want to open
/dev/console, presumably to write an error message to.

The problem with that is that if the /dev/console driver is not yet
initialised, the chardev handler will call request_module() to invoke
modprobe, which will fail, because we never compile /dev/console as a
module.

This will lead to a modprobe loop, showing the following in the kernel
log:

	request_module: runaway loop modprobe char-major-5-1
	request_module: runaway loop modprobe char-major-5-1
	request_module: runaway loop modprobe char-major-5-1
	request_module: runaway loop modprobe char-major-5-1
	request_module: runaway loop modprobe char-major-5-1

This can happen, for example, when the built in md5 module can't find
the built in cryptomgr module (because the latter fails to initialise).
The md5 module comes before the call to tty_init(), presumably because
'crypto' comes before 'drivers' alphabetically.

Fix this by calling tty_init() from chrdev_init().

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/mem.c    | 2 +-
 drivers/char/tty_io.c | 4 ++--
 fs/char_dev.c         | 1 +
 include/linux/tty.h   | 3 +++
 4 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index f54dab8acdcd..a398ecdbd758 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -916,7 +916,7 @@ static int __init chr_dev_init(void)
 			      NULL, devlist[minor].name);
 	}
 
-	return 0;
+	return tty_init();
 }
 
 fs_initcall(chr_dev_init);
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index d71f0fc34b46..507441ac6edb 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -3128,7 +3128,7 @@ static struct cdev tty_cdev, console_cdev;
  * Ok, now we can initialize the rest of the tty devices and can count
  * on memory allocations, interrupts etc..
  */
-static int __init tty_init(void)
+int __init tty_init(void)
 {
 	cdev_init(&tty_cdev, &tty_fops);
 	if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) ||
@@ -3149,4 +3149,4 @@ static int __init tty_init(void)
 #endif
 	return 0;
 }
-module_init(tty_init);
+
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d6db933df2b2..f80a4f25123c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/tty.h>
 
 #include "internal.h"
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 931078b73226..7802a243ee13 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -552,6 +552,9 @@ static inline void tty_audit_push_task(struct task_struct *tsk,
 }
 #endif
 
+/* tty_io.c */
+extern int __init tty_init(void);
+
 /* tty_ioctl.c */
 extern int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file,
 		       unsigned int cmd, unsigned long arg);
-- 
cgit v1.2.3-59-g8ed1b


From d5eff1a3412f6d75bf28f423c5015ece8055407a Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Tue, 3 Aug 2010 13:04:00 -0400
Subject: NFS: Fix /proc/mount for legacy binary interface

Add a flag so we know if we mounted the NFS server using the legacy
binary interface.  If we used the legacy interface, then we should not
show the mountd options.

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c            | 4 ++++
 include/linux/nfs_mount.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f9df16de4a56..f1ae39f6cb02 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -546,6 +546,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 {
 	struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
 
+	if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
+		return;
+
 	switch (sap->sa_family) {
 	case AF_INET: {
 		struct sockaddr_in *sin = (struct sockaddr_in *)sap;
@@ -1780,6 +1783,7 @@ static int nfs_validate_mount_data(void *options,
 		 * can deal with.
 		 */
 		args->flags		= data->flags & NFS_MOUNT_FLAGMASK;
+		args->flags		|= NFS_MOUNT_LEGACY_INTERFACE;
 		args->rsize		= data->rsize;
 		args->wsize		= data->wsize;
 		args->timeo		= data->timeo;
diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index 4499016e6d0d..5d59ae861aa6 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -69,5 +69,6 @@ struct nfs_mount_data {
 #define NFS_MOUNT_LOOKUP_CACHE_NONEG	0x10000
 #define NFS_MOUNT_LOOKUP_CACHE_NONE	0x20000
 #define NFS_MOUNT_NORESVPORT		0x40000
+#define NFS_MOUNT_LEGACY_INTERFACE	0x80000
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 9261ec1a8d7b17e2540bef7cad3470870d13b61e Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Fri, 6 Aug 2010 15:36:47 -0500
Subject: console: Fix compilation regression

A regression of building without CONFIG_HW_CONSOLE was introduced with
commit b45cfba4e9005d64d419718e7ff7f7cab44c1994 (vt,console,kdb:
implement atomic console enter/leave functions).

ERROR: "con_debug_enter" [drivers/serial/kgdboc.ko] undefined!
ERROR: "vc_cons" [drivers/serial/kgdboc.ko] undefined!
ERROR: "fg_console" [drivers/serial/kgdboc.ko] undefined!
ERROR: "con_debug_leave" [drivers/serial/kgdboc.ko] undefined!

When there is no HW console the con_debug_enter and con_debug_leave
functions should have no code.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
CC: Jesse Barnes <jbarnes@virtuousgeek.org>
Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
---
 include/linux/console.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index f76fc297322d..95cf6f08a59d 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -79,8 +79,13 @@ int register_con_driver(const struct consw *csw, int first, int last);
 int unregister_con_driver(const struct consw *csw);
 int take_over_console(const struct consw *sw, int first, int last, int deflt);
 void give_up_console(const struct consw *sw);
+#ifdef CONFIG_HW_CONSOLE
 int con_debug_enter(struct vc_data *vc);
 int con_debug_leave(void);
+#else
+#define con_debug_enter(vc) (0)
+#define con_debug_leave() (0)
+#endif
 
 /* scroll */
 #define SM_UP       (1)
-- 
cgit v1.2.3-59-g8ed1b


From e2e1a148bc45855816ae6b4692ce29d0020fa22e Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 9 Jun 2010 10:42:09 +0200
Subject: block: add sysfs knob for turning off disk entropy contributions

There are two reasons for doing this:

- On SSD disks, the completion times aren't as random as they
  are for rotational drives. So it's questionable whether they
  should contribute to the random pool in the first place.

- Calling add_disk_randomness() has a lot of overhead.

This adds /sys/block/<dev>/queue/add_random that will allow you to
switch off on a per-device basis. The default setting is on, so there
should be no functional changes from this patch.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       |  3 ++-
 block/blk-sysfs.c      | 28 ++++++++++++++++++++++++++++
 include/linux/blkdev.h |  5 ++++-
 3 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index f0640d7f800f..b4131d29148c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2111,7 +2111,8 @@ static bool blk_update_bidi_request(struct request *rq, int error,
 	    blk_update_request(rq->next_rq, error, bidi_bytes))
 		return true;
 
-	add_disk_randomness(rq->rq_disk);
+	if (blk_queue_add_random(rq->q))
+		add_disk_randomness(rq->rq_disk);
 
 	return false;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 306759bbdf1b..58b53c354c2c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -250,6 +250,27 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 	return ret;
 }
 
+static ssize_t queue_random_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(blk_queue_add_random(q), page);
+}
+
+static ssize_t queue_random_store(struct request_queue *q, const char *page,
+				  size_t count)
+{
+	unsigned long val;
+	ssize_t ret = queue_var_store(&val, page, count);
+
+	spin_lock_irq(q->queue_lock);
+	if (val)
+		queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q);
+	else
+		queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
+	spin_unlock_irq(q->queue_lock);
+
+	return ret;
+}
+
 static ssize_t queue_iostats_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(blk_queue_io_stat(q), page);
@@ -374,6 +395,12 @@ static struct queue_sysfs_entry queue_iostats_entry = {
 	.store = queue_iostats_store,
 };
 
+static struct queue_sysfs_entry queue_random_entry = {
+	.attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_random_show,
+	.store = queue_random_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -394,6 +421,7 @@ static struct attribute *default_attrs[] = {
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
 	&queue_iostats_entry.attr,
+	&queue_random_entry.attr,
 	NULL,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 09a840264d6f..b8224ea4a5de 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -467,11 +467,13 @@ struct request_queue
 #define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
 #define QUEUE_FLAG_DISCARD     16	/* supports DISCARD */
 #define QUEUE_FLAG_NOXMERGES   17	/* No extended merges */
+#define QUEUE_FLAG_ADD_RANDOM  18	/* Contributes to random pool */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
-				 (1 << QUEUE_FLAG_SAME_COMP))
+				 (1 << QUEUE_FLAG_SAME_COMP)	|	\
+				 (1 << QUEUE_FLAG_ADD_RANDOM))
 
 static inline int queue_is_locked(struct request_queue *q)
 {
@@ -596,6 +598,7 @@ enum {
 	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
 #define blk_queue_nonrot(q)	test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
+#define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
 #define blk_queue_flushing(q)	((q)->ordseq)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
-- 
cgit v1.2.3-59-g8ed1b


From 41f2df62894bfcd3bf868af916b32b90aa7168dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 17 Jun 2010 08:54:16 +0200
Subject: block: BARRIER request should imply SYNC

A barrier request should by defintion have priority in get_request
and let the queue be unplugged immediately as it's blocking all forward
progress due to the queue draining.

Most filesystems already get this implicitly by the way how submit_bh
treats the buffer_ordered flag, and gfs2 sets it explicitly.  But btrfs
and XFS are still forgetting to set the flag, as is blkdev_issue_flush
and some places in DM/MD.

For XFS on metadata heavy workloads this gives a consistent speedup
in the 2-3% range.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/gfs2/log.c      | 2 +-
 include/linux/fs.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6a857e24f947..efc3539ac5a1 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -595,7 +595,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		goto skip_barrier;
 	get_bh(bh);
-	submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
+	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
 	wait_on_buffer(bh);
 	if (buffer_eopnotsupp(bh)) {
 		clear_buffer_eopnotsupp(bh);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68ca1b0491af..598878831497 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -136,7 +136,7 @@ struct inodes_stat_t {
  * SWRITE_SYNC
  * SWRITE_SYNC_PLUG	Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
  *			See SWRITE.
- * WRITE_BARRIER	Like WRITE, but tells the block layer that all
+ * WRITE_BARRIER	Like WRITE_SYNC, but tells the block layer that all
  *			previously submitted writes must be safely on storage
  *			before this one is started. Also guarantees that when
  *			this write is complete, it itself is also safely on
@@ -159,7 +159,7 @@ struct inodes_stat_t {
 #define SWRITE_SYNC_PLUG	\
 			(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
 #define SWRITE_SYNC	(SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
-#define WRITE_BARRIER	(WRITE | (1 << BIO_RW_BARRIER))
+#define WRITE_BARRIER	(WRITE_SYNC | (1 << BIO_RW_BARRIER))
 
 /*
  * These aren't really reads or writes, they pass down information about
-- 
cgit v1.2.3-59-g8ed1b


From bfe172310e58225f0d07f9354b683abacbd6a0d8 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 31 May 2010 15:59:03 +0900
Subject: block: kill ISA_DMA_THRESHOLD usage

block uses ISA_DMA_THRESHOLD for BLK_BOUNCE_ISA. Only SCSI uses
ISA_DMA_THRESHOLD for ancient drivers with non-zero
unchecked_isa_dma. Nowadays drivers (and subsystems) use dma_mask
properly instead of ISA_DMA_THRESHOLD.

Documentation/scsi/scsi_mid_low_api.txt says:

unchecked_isa_dma - 1=>only use bottom 16 MB of ram (ISA DMA addressing
                   restriction), 0=>can use full 32 bit (or better) DMA
                   address space

So block simply uses DMA_BIT_MASK(24) for BLK_BOUNCE_ISA for SCSI.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: James Bottomley <James.Bottomley@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b8224ea4a5de..d7ae241a9e55 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -712,7 +712,7 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn;
 #define BLK_BOUNCE_HIGH		-1ULL
 #endif
 #define BLK_BOUNCE_ANY		(-1ULL)
-#define BLK_BOUNCE_ISA		(ISA_DMA_THRESHOLD)
+#define BLK_BOUNCE_ISA		(DMA_BIT_MASK(24))
 
 /*
  * default timeout for SG_IO if none specified
-- 
cgit v1.2.3-59-g8ed1b


From 33659ebbae262228eef4e0fe990f393d1f0ed941 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 7 Aug 2010 18:17:56 +0200
Subject: block: remove wrappers for request type/flags

Remove all the trivial wrappers for the cmd_type and cmd_flags fields in
struct requests.  This allows much easier grepping for different request
types instead of unwinding through macros.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c                 |  7 ++--
 block/blk-core.c                    | 13 ++++----
 block/blk-exec.c                    |  2 +-
 block/blk-merge.c                   |  4 +--
 block/blk.h                         |  6 ++--
 block/cfq-iosched.c                 | 19 ++++++-----
 block/elevator.c                    | 16 +++++----
 drivers/ata/libata-scsi.c           |  2 +-
 drivers/block/cciss.c               | 49 ++++++++++++++++-----------
 drivers/block/hd.c                  |  2 +-
 drivers/block/mg_disk.c             |  4 +--
 drivers/block/nbd.c                 |  2 +-
 drivers/block/osdblk.c              |  3 +-
 drivers/block/paride/pd.c           |  2 +-
 drivers/block/ps3disk.c             |  2 +-
 drivers/block/ub.c                  |  8 ++---
 drivers/block/viodasd.c             |  2 +-
 drivers/block/virtio_blk.c          | 15 +++++----
 drivers/block/xd.c                  |  2 +-
 drivers/block/xen-blkfront.c        |  4 +--
 drivers/block/xsysace.c             |  2 +-
 drivers/cdrom/gdrom.c               |  2 +-
 drivers/cdrom/viocd.c               |  2 +-
 drivers/ide/ide-atapi.c             | 17 ++++++----
 drivers/ide/ide-cd.c                | 66 ++++++++++++++++++++-----------------
 drivers/ide/ide-disk.c              |  2 +-
 drivers/ide/ide-eh.c                |  5 +--
 drivers/ide/ide-floppy.c            | 25 +++++++++-----
 drivers/ide/ide-io.c                |  8 ++---
 drivers/ide/ide-pm.c                |  8 ++---
 drivers/ide/ide-tape.c              |  3 +-
 drivers/md/dm.c                     | 10 +++---
 drivers/memstick/core/mspro_block.c |  3 +-
 drivers/message/i2o/i2o_block.c     |  2 +-
 drivers/mmc/card/queue.c            |  2 +-
 drivers/mtd/mtd_blkdevs.c           |  4 +--
 drivers/scsi/scsi_error.c           | 10 +++---
 drivers/scsi/scsi_lib.c             |  5 +--
 drivers/scsi/sd.c                   | 12 +++----
 drivers/scsi/sun3_NCR5380.c         |  2 +-
 drivers/scsi/sun3_scsi.c            |  2 +-
 drivers/scsi/sun3_scsi_vme.c        |  2 +-
 drivers/staging/hv/blkvsc_drv.c     |  8 +++--
 include/linux/blkdev.h              | 41 ++++++++---------------
 include/linux/blktrace_api.h        |  2 +-
 include/trace/events/block.h        | 15 ++++++---
 kernel/trace/blktrace.c             | 10 +++---
 47 files changed, 236 insertions(+), 198 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 0d710c9d403b..74e404393172 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -79,7 +79,7 @@ unsigned blk_ordered_req_seq(struct request *rq)
 	 *
 	 * http://thread.gmane.org/gmane.linux.kernel/537473
 	 */
-	if (!blk_fs_request(rq))
+	if (rq->cmd_type != REQ_TYPE_FS)
 		return QUEUE_ORDSEQ_DRAIN;
 
 	if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
@@ -236,7 +236,8 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 {
 	struct request *rq = *rqp;
-	const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
+	const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
+				(rq->cmd_flags & REQ_HARDBARRIER);
 
 	if (!q->ordseq) {
 		if (!is_barrier)
@@ -261,7 +262,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 	 */
 
 	/* Special requests are not subject to ordering rules. */
-	if (!blk_fs_request(rq) &&
+	if (rq->cmd_type != REQ_TYPE_FS &&
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
 		return true;
 
diff --git a/block/blk-core.c b/block/blk-core.c
index b4131d29148c..dca43a31e725 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -184,7 +184,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
 	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
 
-	if (blk_pc_request(rq)) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		printk(KERN_INFO "  cdb: ");
 		for (bit = 0; bit < BLK_MAX_CDB; bit++)
 			printk("%02x ", rq->cmd[bit]);
@@ -1796,7 +1796,7 @@ struct request *blk_peek_request(struct request_queue *q)
 			 * sees this request (possibly after
 			 * requeueing).  Notify IO scheduler.
 			 */
-			if (blk_sorted_rq(rq))
+			if (rq->cmd_flags & REQ_SORTED)
 				elv_activate_rq(q, rq);
 
 			/*
@@ -1984,10 +1984,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	 * TODO: tj: This is too subtle.  It would be better to let
 	 * low level drivers do what they see fit.
 	 */
-	if (blk_fs_request(req))
+	if (req->cmd_type == REQ_TYPE_FS)
 		req->errors = 0;
 
-	if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
+	if (error && req->cmd_type == REQ_TYPE_FS &&
+	    !(req->cmd_flags & REQ_QUIET)) {
 		printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
 				req->rq_disk ? req->rq_disk->disk_name : "?",
 				(unsigned long long)blk_rq_pos(req));
@@ -2074,7 +2075,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	req->buffer = bio_data(req->bio);
 
 	/* update sector only for requests with clear definition of sector */
-	if (blk_fs_request(req) || blk_discard_rq(req))
+	if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD))
 		req->__sector += total_bytes >> 9;
 
 	/* mixed attributes always follow the first bio */
@@ -2127,7 +2128,7 @@ static void blk_finish_request(struct request *req, int error)
 
 	BUG_ON(blk_queued_rq(req));
 
-	if (unlikely(laptop_mode) && blk_fs_request(req))
+	if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS)
 		laptop_io_completion(&req->q->backing_dev_info);
 
 	blk_delete_timer(req);
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 49557e91f0da..e1672f14840e 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -57,7 +57,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	__elv_add_request(q, rq, where, 1);
 	__generic_unplug_device(q);
 	/* the queue is stopped so it won't be plugged+unplugged */
-	if (blk_pm_resume_request(rq))
+	if (rq->cmd_type == REQ_TYPE_PM_RESUME)
 		q->request_fn(q);
 	spin_unlock_irq(q->queue_lock);
 }
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5e7dc9973458..87e4fb7d0e98 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -226,7 +226,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 {
 	unsigned short max_sectors;
 
-	if (unlikely(blk_pc_request(req)))
+	if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
 		max_sectors = queue_max_hw_sectors(q);
 	else
 		max_sectors = queue_max_sectors(q);
@@ -250,7 +250,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 {
 	unsigned short max_sectors;
 
-	if (unlikely(blk_pc_request(req)))
+	if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
 		max_sectors = queue_max_hw_sectors(q);
 	else
 		max_sectors = queue_max_sectors(q);
diff --git a/block/blk.h b/block/blk.h
index 5ee3d7e72feb..6e7dc87141e4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -161,8 +161,10 @@ static inline int blk_cpu_to_group(int cpu)
  */
 static inline int blk_do_io_stat(struct request *rq)
 {
-	return rq->rq_disk && blk_rq_io_stat(rq) &&
-	       (blk_fs_request(rq) || blk_discard_rq(rq));
+	return rq->rq_disk &&
+	       (rq->cmd_flags & REQ_IO_STAT) &&
+	       (rq->cmd_type == REQ_TYPE_FS ||
+	        (rq->cmd_flags & REQ_DISCARD));
 }
 
 #endif
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7982b830db58..d4edeb8fceb8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -646,9 +646,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
 		return rq1;
 	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
 		return rq2;
-	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
+	if ((rq1->cmd_flags & REQ_RW_META) && !(rq2->cmd_flags & REQ_RW_META))
 		return rq1;
-	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
+	else if ((rq2->cmd_flags & REQ_RW_META) &&
+		 !(rq1->cmd_flags & REQ_RW_META))
 		return rq2;
 
 	s1 = blk_rq_pos(rq1);
@@ -1484,7 +1485,7 @@ static void cfq_remove_request(struct request *rq)
 	cfqq->cfqd->rq_queued--;
 	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
 					rq_data_dir(rq), rq_is_sync(rq));
-	if (rq_is_meta(rq)) {
+	if (rq->cmd_flags & REQ_RW_META) {
 		WARN_ON(!cfqq->meta_pending);
 		cfqq->meta_pending--;
 	}
@@ -3176,7 +3177,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	 * So both queues are sync. Let the new request get disk time if
 	 * it's a metadata request and the current queue is doing regular IO.
 	 */
-	if (rq_is_meta(rq) && !cfqq->meta_pending)
+	if ((rq->cmd_flags & REQ_RW_META) && !cfqq->meta_pending)
 		return true;
 
 	/*
@@ -3230,7 +3231,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_io_context *cic = RQ_CIC(rq);
 
 	cfqd->rq_queued++;
-	if (rq_is_meta(rq))
+	if (rq->cmd_flags & REQ_RW_META)
 		cfqq->meta_pending++;
 
 	cfq_update_io_thinktime(cfqd, cic);
@@ -3365,7 +3366,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	unsigned long now;
 
 	now = jiffies;
-	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
+	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
+		     !!(rq->cmd_flags & REQ_NOIDLE));
 
 	cfq_update_hw_tag(cfqd);
 
@@ -3419,11 +3421,12 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 			cfq_slice_expired(cfqd, 1);
 		else if (sync && cfqq_empty &&
 			 !cfq_close_cooperator(cfqd, cfqq)) {
-			cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
+			cfqd->noidle_tree_requires_idle |=
+				!(rq->cmd_flags & REQ_NOIDLE);
 			/*
 			 * Idling is enabled for SYNC_WORKLOAD.
 			 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
-			 * only if we processed at least one !rq_noidle request
+			 * only if we processed at least one !REQ_NOIDLE request
 			 */
 			if (cfqd->serving_type == SYNC_WORKLOAD
 			    || cfqd->noidle_tree_requires_idle
diff --git a/block/elevator.c b/block/elevator.c
index 923a9139106c..aa99b59c03d6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -428,7 +428,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 	list_for_each_prev(entry, &q->queue_head) {
 		struct request *pos = list_entry_rq(entry);
 
-		if (blk_discard_rq(rq) != blk_discard_rq(pos))
+		if ((rq->cmd_flags & REQ_DISCARD) !=
+		    (pos->cmd_flags & REQ_DISCARD))
 			break;
 		if (rq_data_dir(rq) != rq_data_dir(pos))
 			break;
@@ -558,7 +559,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 	 */
 	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]--;
-		if (blk_sorted_rq(rq))
+		if (rq->cmd_flags & REQ_SORTED)
 			elv_deactivate_rq(q, rq);
 	}
 
@@ -644,7 +645,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		break;
 
 	case ELEVATOR_INSERT_SORT:
-		BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
+		BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
+		       !(rq->cmd_flags & REQ_DISCARD));
 		rq->cmd_flags |= REQ_SORTED;
 		q->nr_sorted++;
 		if (rq_mergeable(rq)) {
@@ -716,7 +718,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		/*
 		 * toggle ordered color
 		 */
-		if (blk_barrier_rq(rq))
+		if (rq->cmd_flags & REQ_HARDBARRIER)
 			q->ordcolor ^= 1;
 
 		/*
@@ -729,7 +731,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		 * this request is scheduling boundary, update
 		 * end_sector
 		 */
-		if (blk_fs_request(rq) || blk_discard_rq(rq)) {
+		if (rq->cmd_type == REQ_TYPE_FS ||
+		    (rq->cmd_flags & REQ_DISCARD)) {
 			q->end_sector = rq_end_sector(rq);
 			q->boundary_rq = rq;
 		}
@@ -843,7 +846,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	 */
 	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]--;
-		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
+		if ((rq->cmd_flags & REQ_SORTED) &&
+		    e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
 
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a54273d2c3c6..a5c08b082edb 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1111,7 +1111,7 @@ static void ata_scsi_sdev_config(struct scsi_device *sdev)
  */
 static int atapi_drain_needed(struct request *rq)
 {
-	if (likely(!blk_pc_request(rq)))
+	if (likely(rq->cmd_type != REQ_TYPE_BLOCK_PC))
 		return 0;
 
 	if (!blk_rq_bytes(rq) || (rq->cmd_flags & REQ_RW))
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 10a0268a1f92..11b377762b8e 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1783,7 +1783,7 @@ static void cciss_softirq_done(struct request *rq)
 #endif				/* CCISS_DEBUG */
 
 	/* set the residual count for pc requests */
-	if (blk_pc_request(rq))
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
 		rq->resid_len = cmd->err_info->ResidualCnt;
 
 	blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
@@ -2983,7 +2983,7 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 	driver_byte = DRIVER_OK;
 	msg_byte = cmd->err_info->CommandStatus; /* correct?  seems too device specific */
 
-	if (blk_pc_request(cmd->rq))
+	if (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC)
 		host_byte = DID_PASSTHROUGH;
 	else
 		host_byte = DID_OK;
@@ -2992,7 +2992,7 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 		host_byte, driver_byte);
 
 	if (cmd->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) {
-		if (!blk_pc_request(cmd->rq))
+		if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC)
 			printk(KERN_WARNING "cciss: cmd %p "
 			       "has SCSI Status 0x%x\n",
 			       cmd, cmd->err_info->ScsiStatus);
@@ -3002,15 +3002,17 @@ static inline int evaluate_target_status(ctlr_info_t *h,
 	/* check the sense key */
 	sense_key = 0xf & cmd->err_info->SenseInfo[2];
 	/* no status or recovered error */
-	if (((sense_key == 0x0) || (sense_key == 0x1)) && !blk_pc_request(cmd->rq))
+	if (((sense_key == 0x0) || (sense_key == 0x1)) &&
+	    (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC))
 		error_value = 0;
 
 	if (check_for_unit_attention(h, cmd)) {
-		*retry_cmd = !blk_pc_request(cmd->rq);
+		*retry_cmd = !(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC);
 		return 0;
 	}
 
-	if (!blk_pc_request(cmd->rq)) { /* Not SG_IO or similar? */
+	/* Not SG_IO or similar? */
+	if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC) {
 		if (error_value != 0)
 			printk(KERN_WARNING "cciss: cmd %p has CHECK CONDITION"
 			       " sense key = 0x%x\n", cmd, sense_key);
@@ -3052,7 +3054,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		rq->errors = evaluate_target_status(h, cmd, &retry_cmd);
 		break;
 	case CMD_DATA_UNDERRUN:
-		if (blk_fs_request(cmd->rq)) {
+		if (cmd->rq->cmd_type == REQ_TYPE_FS) {
 			printk(KERN_WARNING "cciss: cmd %p has"
 			       " completed with data underrun "
 			       "reported\n", cmd);
@@ -3060,7 +3062,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		}
 		break;
 	case CMD_DATA_OVERRUN:
-		if (blk_fs_request(cmd->rq))
+		if (cmd->rq->cmd_type == REQ_TYPE_FS)
 			printk(KERN_WARNING "cciss: cmd %p has"
 			       " completed with data overrun "
 			       "reported\n", cmd);
@@ -3070,42 +3072,48 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       "reported invalid\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_PROTOCOL_ERR:
 		printk(KERN_WARNING "cciss: cmd %p has "
 		       "protocol error \n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_HARDWARE_ERR:
 		printk(KERN_WARNING "cciss: cmd %p had "
 		       " hardware error\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_CONNECTION_LOST:
 		printk(KERN_WARNING "cciss: cmd %p had "
 		       "connection lost\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_ABORTED:
 		printk(KERN_WARNING "cciss: cmd %p was "
 		       "aborted\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ABORT);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ABORT);
 		break;
 	case CMD_ABORT_FAILED:
 		printk(KERN_WARNING "cciss: cmd %p reports "
 		       "abort failed\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_UNSOLICITED_ABORT:
 		printk(KERN_WARNING "cciss%d: unsolicited "
@@ -3121,13 +3129,15 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 			       "many times\n", h->ctlr, cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ABORT);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ABORT);
 		break;
 	case CMD_TIMEOUT:
 		printk(KERN_WARNING "cciss: cmd %p timedout\n", cmd);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	default:
 		printk(KERN_WARNING "cciss: cmd %p returned "
@@ -3135,7 +3145,8 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		       cmd->err_info->CommandStatus);
 		rq->errors = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
-			blk_pc_request(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR);
+			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+				DID_PASSTHROUGH : DID_ERROR);
 	}
 
 after_error_processing:
@@ -3294,7 +3305,7 @@ static void do_cciss_request(struct request_queue *q)
 		c->Header.SGList = h->max_cmd_sgentries;
 	set_performant_mode(h, c);
 
-	if (likely(blk_fs_request(creq))) {
+	if (likely(creq->cmd_type == REQ_TYPE_FS)) {
 		if(h->cciss_read == CCISS_READ_10) {
 			c->Request.CDB[1] = 0;
 			c->Request.CDB[2] = (start_blk >> 24) & 0xff; /* MSB */
@@ -3324,7 +3335,7 @@ static void do_cciss_request(struct request_queue *q)
 			c->Request.CDB[13]= blk_rq_sectors(creq) & 0xff;
 			c->Request.CDB[14] = c->Request.CDB[15] = 0;
 		}
-	} else if (blk_pc_request(creq)) {
+	} else if (creq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		c->Request.CDBLen = creq->cmd_len;
 		memcpy(c->Request.CDB, creq->cmd, BLK_MAX_CDB);
 	} else {
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 81c78b3ce2df..30ec6b37424e 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -627,7 +627,7 @@ repeat:
 		req_data_dir(req) == READ ? "read" : "writ",
 		cyl, head, sec, nsect, req->buffer);
 #endif
-	if (blk_fs_request(req)) {
+	if (req->cmd_type == REQ_TYPE_FS) {
 		switch (rq_data_dir(req)) {
 		case READ:
 			hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 28db925dbdad..b82c5ce5e9df 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -670,7 +670,7 @@ static void mg_request_poll(struct request_queue *q)
 				break;
 		}
 
-		if (unlikely(!blk_fs_request(host->req))) {
+		if (unlikely(host->req->cmd_type != REQ_TYPE_FS)) {
 			mg_end_request_cur(host, -EIO);
 			continue;
 		}
@@ -756,7 +756,7 @@ static void mg_request(struct request_queue *q)
 			continue;
 		}
 
-		if (unlikely(!blk_fs_request(req))) {
+		if (unlikely(req->cmd_type != REQ_TYPE_FS)) {
 			mg_end_request_cur(host, -EIO);
 			continue;
 		}
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 218d091f3c52..2e74e7d475ca 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -448,7 +448,7 @@ static void nbd_clear_que(struct nbd_device *lo)
 
 static void nbd_handle_req(struct nbd_device *lo, struct request *req)
 {
-	if (!blk_fs_request(req))
+	if (req->cmd_type != REQ_TYPE_FS)
 		goto error_out;
 
 	nbd_cmd(req) = NBD_CMD_READ;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 6cd8b705b11b..819002ba3433 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -310,7 +310,8 @@ static void osdblk_rq_fn(struct request_queue *q)
 			break;
 
 		/* filter out block requests we don't understand */
-		if (!blk_fs_request(rq) && !blk_barrier_rq(rq)) {
+		if (rq->cmd_type != REQ_TYPE_FS &&
+		    !(rq->cmd_flags & REQ_HARDBARRIER)) {
 			blk_end_request_all(rq, 0);
 			continue;
 		}
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index c1e5cd029b23..4e8b9bff3abe 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -439,7 +439,7 @@ static char *pd_buf;		/* buffer for request in progress */
 
 static enum action do_pd_io_start(void)
 {
-	if (blk_special_request(pd_req)) {
+	if (pd_req->cmd_type == REQ_TYPE_SPECIAL) {
 		phase = pd_special;
 		return pd_special();
 	}
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 3b419e3fffa1..5f208c0bf156 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -196,7 +196,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
 	while ((req = blk_fetch_request(q))) {
-		if (blk_fs_request(req)) {
+		if (req->cmd_type == REQ_TYPE_FS) {
 			if (ps3disk_submit_request_sg(dev, req))
 				break;
 		} else if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 0536b5b29adc..034b34440ffa 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -648,7 +648,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 		return 0;
 	}
 
-	if (lun->changed && !blk_pc_request(rq)) {
+	if (lun->changed && rq->cmd_type != REQ_TYPE_BLOCK_PC)
 		blk_start_request(rq);
 		ub_end_rq(rq, SAM_STAT_CHECK_CONDITION);
 		return 0;
@@ -684,7 +684,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 	}
 	urq->nsg = n_elem;
 
-	if (blk_pc_request(rq)) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		ub_cmd_build_packet(sc, lun, cmd, urq);
 	} else {
 		ub_cmd_build_block(sc, lun, cmd, urq);
@@ -781,7 +781,7 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 	rq = urq->rq;
 
 	if (cmd->error == 0) {
-		if (blk_pc_request(rq)) {
+		if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 			if (cmd->act_len >= rq->resid_len)
 				rq->resid_len = 0;
 			else
@@ -795,7 +795,7 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 			}
 		}
 	} else {
-		if (blk_pc_request(rq)) {
+		if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 			/* UB_SENSE_SIZE is smaller than SCSI_SENSE_BUFFERSIZE */
 			memcpy(rq->sense, sc->top_sense, UB_SENSE_SIZE);
 			rq->sense_len = UB_SENSE_SIZE;
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 788d93882ab9..5663d3c284c8 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -361,7 +361,7 @@ static void do_viodasd_request(struct request_queue *q)
 		if (req == NULL)
 			return;
 		/* check that request contains a valid command */
-		if (!blk_fs_request(req)) {
+		if (req->cmd_type != REQ_TYPE_FS) {
 			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
 			continue;
 		}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 258bc2ae2885..774144334ece 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -65,13 +65,16 @@ static void blk_done(struct virtqueue *vq)
 			break;
 		}
 
-		if (blk_pc_request(vbr->req)) {
+		switch (vbr->req->cmd_type) {
+		case REQ_TYPE_BLOCK_PC:
 			vbr->req->resid_len = vbr->in_hdr.residual;
 			vbr->req->sense_len = vbr->in_hdr.sense_len;
 			vbr->req->errors = vbr->in_hdr.errors;
-		}
-		if (blk_special_request(vbr->req))
+			break;
+		case REQ_TYPE_SPECIAL:
 			vbr->req->errors = (error != 0);
+			break;
+		}
 
 		__blk_end_request_all(vbr->req, error);
 		list_del(&vbr->list);
@@ -123,7 +126,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		BUG();
 	}
 
-	if (blk_barrier_rq(vbr->req))
+	if (vbr->req->cmd_flags & REQ_HARDBARRIER)
 		vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
 
 	sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
@@ -134,12 +137,12 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	 * block, and before the normal inhdr we put the sense data and the
 	 * inhdr with additional status information before the normal inhdr.
 	 */
-	if (blk_pc_request(vbr->req))
+	if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
 		sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
 
 	num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
 
-	if (blk_pc_request(vbr->req)) {
+	if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
 		sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96);
 		sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
 			   sizeof(vbr->in_hdr));
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 18a80ff57ce8..4dc298376098 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -322,7 +322,7 @@ static void do_xd_request (struct request_queue * q)
 		int res = -EIO;
 		int retry;
 
-		if (!blk_fs_request(req))
+		if (req->cmd_type != REQ_TYPE_FS) {
 			goto done;
 		if (block + count > get_capacity(req->rq_disk))
 			goto done;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 82ed403147c0..495533e66542 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -238,7 +238,7 @@ static int blkif_queue_request(struct request *req)
 
 	ring_req->operation = rq_data_dir(req) ?
 		BLKIF_OP_WRITE : BLKIF_OP_READ;
-	if (blk_barrier_rq(req))
+	if (req->cmd_flags & REQ_HARDBARRIER)
 		ring_req->operation = BLKIF_OP_WRITE_BARRIER;
 
 	ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
@@ -309,7 +309,7 @@ static void do_blkif_request(struct request_queue *rq)
 
 		blk_start_request(req);
 
-		if (!blk_fs_request(req)) {
+		if (req->cmd_type != REQ_TYPE_FS) {
 			__blk_end_request_all(req, -EIO);
 			continue;
 		}
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index a7b83c0a7eb5..ac278ac908d5 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -465,7 +465,7 @@ struct request *ace_get_next_request(struct request_queue * q)
 	struct request *req;
 
 	while ((req = blk_peek_request(q)) != NULL) {
-		if (blk_fs_request(req))
+		if (req->cmd_type == REQ_TYPE_FS)
 			break;
 		blk_start_request(req);
 		__blk_end_request_all(req, -EIO);
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 03c71f7698cb..7c05ddc63ae8 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -643,7 +643,7 @@ static void gdrom_request(struct request_queue *rq)
 	struct request *req;
 
 	while ((req = blk_fetch_request(rq)) != NULL) {
-		if (!blk_fs_request(req)) {
+		if (req->cmd_type != REQ_TYPE_FS) {
 			printk(KERN_DEBUG "GDROM: Non-fs request ignored\n");
 			__blk_end_request_all(req, -EIO);
 			continue;
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 451cd7071b1d..14e420168764 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -298,7 +298,7 @@ static void do_viocd_request(struct request_queue *q)
 	struct request *req;
 
 	while ((rwreq == 0) && ((req = blk_fetch_request(q)) != NULL)) {
-		if (!blk_fs_request(req))
+		if (req->cmd_type != REQ_TYPE_FS)
 			__blk_end_request_all(req, -EIO);
 		else if (send_request(req) < 0) {
 			printk(VIOCD_KERN_WARNING
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index f9daffd7d0e3..3117a894d20e 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -190,7 +190,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 
 	BUG_ON(sense_len > sizeof(*sense));
 
-	if (blk_sense_request(rq) || drive->sense_rq_armed)
+	if (rq->cmd_type == REQ_TYPE_SENSE || drive->sense_rq_armed)
 		return;
 
 	memset(sense, 0, sizeof(*sense));
@@ -307,13 +307,16 @@ EXPORT_SYMBOL_GPL(ide_cd_expiry);
 
 int ide_cd_get_xferlen(struct request *rq)
 {
-	if (blk_fs_request(rq))
+	switch (rq->cmd_type)
+	case REQ_TYPE_FS:
 		return 32768;
-	else if (blk_sense_request(rq) || blk_pc_request(rq) ||
-			 rq->cmd_type == REQ_TYPE_ATA_PC)
+	case REQ_TYPE_SENSE:
+	case REQ_TYPE_BLOCK_PC:
+	case REQ_TYPE_ATA_PC:
 		return blk_rq_bytes(rq);
-	else
+	default:
 		return 0;
+	}
 }
 EXPORT_SYMBOL_GPL(ide_cd_get_xferlen);
 
@@ -474,12 +477,12 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if (uptodate == 0)
 			drive->failed_pc = NULL;
 
-		if (blk_special_request(rq)) {
+		if (rq->cmd_type == REQ_TYPE_SPECIAL)
 			rq->errors = 0;
 			error = 0;
 		} else {
 
-			if (blk_fs_request(rq) == 0 && uptodate <= 0) {
+			if (req->cmd_type != REQ_TYPE_FS && uptodate <= 0) {
 				if (rq->errors == 0)
 					rq->errors = -EIO;
 			}
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 64207df8da82..26a3688de467 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -176,7 +176,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 			if (!sense->valid)
 				break;
 			if (failed_command == NULL ||
-					!blk_fs_request(failed_command))
+			    failed_command->cmd_type != REQ_TYPE_FS)
 				break;
 			sector = (sense->information[0] << 24) |
 				 (sense->information[1] << 16) |
@@ -292,7 +292,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 				  "stat 0x%x",
 				  rq->cmd[0], rq->cmd_type, err, stat);
 
-	if (blk_sense_request(rq)) {
+	if (rq->cmd_type == REQ_TYPE_SENSE) {
 		/*
 		 * We got an error trying to get sense info from the drive
 		 * (probably while trying to recover from a former error).
@@ -303,7 +303,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 	}
 
 	/* if we have an error, pass CHECK_CONDITION as the SCSI status byte */
-	if (blk_pc_request(rq) && !rq->errors)
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !rq->errors)
 		rq->errors = SAM_STAT_CHECK_CONDITION;
 
 	if (blk_noretry_request(rq))
@@ -311,13 +311,14 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	switch (sense_key) {
 	case NOT_READY:
-		if (blk_fs_request(rq) && rq_data_dir(rq) == WRITE) {
+		if (rq->cmd_type == REQ_TYPE_FS && rq_data_dir(rq) == WRITE) {
 			if (ide_cd_breathe(drive, rq))
 				return 1;
 		} else {
 			cdrom_saw_media_change(drive);
 
-			if (blk_fs_request(rq) && !blk_rq_quiet(rq))
+			if (rq->cmd_type == REQ_TYPE_FS &&
+			    !(rq->cmd_flags & REQ_QUIET)) {
 				printk(KERN_ERR PFX "%s: tray open\n",
 					drive->name);
 		}
@@ -326,7 +327,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 	case UNIT_ATTENTION:
 		cdrom_saw_media_change(drive);
 
-		if (blk_fs_request(rq) == 0)
+		if (rq->cmd_type != REQ_TYPE_FS)
 			return 0;
 
 		/*
@@ -352,7 +353,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 		 * No point in retrying after an illegal request or data
 		 * protect error.
 		 */
-		if (!blk_rq_quiet(rq))
+		if (!(rq->cmd_flags & REQ_QUIET))
 			ide_dump_status(drive, "command error", stat);
 		do_end_request = 1;
 		break;
@@ -361,20 +362,20 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 		 * No point in re-trying a zillion times on a bad sector.
 		 * If we got here the error is not correctable.
 		 */
-		if (!blk_rq_quiet(rq))
+		if (!(rq->cmd_flags & REQ_QUIET))
 			ide_dump_status(drive, "media error "
 					"(bad sector)", stat);
 		do_end_request = 1;
 		break;
 	case BLANK_CHECK:
 		/* disk appears blank? */
-		if (!blk_rq_quiet(rq))
+		if (!(rq->cmd_flags & REQ_QUIET))
 			ide_dump_status(drive, "media error (blank)",
 					stat);
 		do_end_request = 1;
 		break;
 	default:
-		if (blk_fs_request(rq) == 0)
+		if (req->cmd_type != REQ_TYPE_FS)
 			break;
 		if (err & ~ATA_ABORTED) {
 			/* go to the default handler for other errors */
@@ -385,7 +386,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 			do_end_request = 1;
 	}
 
-	if (blk_fs_request(rq) == 0) {
+	if (rq->cmd_type != REQ_TYPE_FS) {
 		rq->cmd_flags |= REQ_FAILED;
 		do_end_request = 1;
 	}
@@ -525,7 +526,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, thislen, uptodate = 0;
 	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0;
-	int sense = blk_sense_request(rq);
+	int sense = (rq->cmd_type == REQ_TYPE_SENSE);
 	unsigned int timeout;
 	u16 len;
 	u8 ireason, stat;
@@ -568,7 +569,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	ide_read_bcount_and_ireason(drive, &len, &ireason);
 
-	thislen = blk_fs_request(rq) ? len : cmd->nleft;
+	thislen = (rq->cmd_type == REQ_TYPE_FS) ? len : cmd->nleft;
 	if (thislen > len)
 		thislen = len;
 
@@ -577,7 +578,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	/* If DRQ is clear, the command has completed. */
 	if ((stat & ATA_DRQ) == 0) {
-		if (blk_fs_request(rq)) {
+		if (rq->cmd_type == REQ_TYPE_FS) {
 			/*
 			 * If we're not done reading/writing, complain.
 			 * Otherwise, complete the command normally.
@@ -591,7 +592,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 					rq->cmd_flags |= REQ_FAILED;
 				uptodate = 0;
 			}
-		} else if (!blk_pc_request(rq)) {
+		} else if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
 			ide_cd_request_sense_fixup(drive, cmd);
 
 			uptodate = cmd->nleft ? 0 : 1;
@@ -640,7 +641,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 	/* pad, if necessary */
 	if (len > 0) {
-		if (blk_fs_request(rq) == 0 || write == 0)
+		if (rq->cmd_type != REQ_TYPE_FS || write == 0)
 			ide_pad_transfer(drive, write, len);
 		else {
 			printk(KERN_ERR PFX "%s: confused, missing data\n",
@@ -649,11 +650,11 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		}
 	}
 
-	if (blk_pc_request(rq)) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		timeout = rq->timeout;
 	} else {
 		timeout = ATAPI_WAIT_PC;
-		if (!blk_fs_request(rq))
+		if (rq->cmd_type != REQ_TYPE_FS)
 			expiry = ide_cd_expiry;
 	}
 
@@ -662,7 +663,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	return ide_started;
 
 out_end:
-	if (blk_pc_request(rq) && rc == 0) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && rc == 0) {
 		rq->resid_len = 0;
 		blk_end_request_all(rq, 0);
 		hwif->rq = NULL;
@@ -670,7 +671,7 @@ out_end:
 		if (sense && uptodate)
 			ide_cd_complete_failed_rq(drive, rq);
 
-		if (blk_fs_request(rq)) {
+		if (rq->cmd_type == REQ_TYPE_FS) {
 			if (cmd->nleft == 0)
 				uptodate = 1;
 		} else {
@@ -682,7 +683,7 @@ out_end:
 			ide_cd_error_cmd(drive, cmd);
 
 		/* make sure it's fully ended */
-		if (blk_fs_request(rq) == 0) {
+		if (rq->cmd_type != REQ_TYPE_FS) {
 			rq->resid_len -= cmd->nbytes - cmd->nleft;
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
 				rq->resid_len += cmd->last_xfer_len;
@@ -742,7 +743,7 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 	ide_debug_log(IDE_DBG_PC, "rq->cmd[0]: 0x%x, rq->cmd_type: 0x%x",
 				  rq->cmd[0], rq->cmd_type);
 
-	if (blk_pc_request(rq))
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
 		rq->cmd_flags |= REQ_QUIET;
 	else
 		rq->cmd_flags &= ~REQ_FAILED;
@@ -783,21 +784,26 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 	if (drive->debug_mask & IDE_DBG_RQ)
 		blk_dump_rq_flags(rq, "ide_cd_do_request");
 
-	if (blk_fs_request(rq)) {
+	switch (rq->cmd_type) {
+	case REQ_TYPE_FS:
 		if (cdrom_start_rw(drive, rq) == ide_stopped)
 			goto out_end;
-	} else if (blk_sense_request(rq) || blk_pc_request(rq) ||
-		   rq->cmd_type == REQ_TYPE_ATA_PC) {
+		break;
+	case REQ_TYPE_SENSE:
+	case REQ_TYPE_BLOCK_PC:
+	case REQ_TYPE_ATA_PC:
 		if (!rq->timeout)
 			rq->timeout = ATAPI_WAIT_PC;
 
 		cdrom_do_block_pc(drive, rq);
-	} else if (blk_special_request(rq)) {
+		break;
+	case REQ_TYPE_SPECIAL:
 		/* right now this can only be a reset... */
 		uptodate = 1;
 		goto out_end;
-	} else
+	default:
 		BUG();
+	}
 
 	/* prepare sense request for this command */
 	ide_prep_sense(drive, rq);
@@ -809,7 +815,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	cmd.rq = rq;
 
-	if (blk_fs_request(rq) || blk_rq_bytes(rq)) {
+	if (rq->cmd_type == REQ_TYPE_FS || blk_rq_bytes(rq)) {
 		ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
 		ide_map_sg(drive, &cmd);
 	}
@@ -1365,9 +1371,9 @@ static int ide_cdrom_prep_pc(struct request *rq)
 
 static int ide_cdrom_prep_fn(struct request_queue *q, struct request *rq)
 {
-	if (blk_fs_request(rq))
+	if (rq->cmd_type == REQ_TYPE_FS)
 		return ide_cdrom_prep_fs(q, rq);
-	else if (blk_pc_request(rq))
+	else if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
 		return ide_cdrom_prep_pc(rq);
 
 	return 0;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 33d65039cce9..df3d91ba1c96 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -184,7 +184,7 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 	ide_hwif_t *hwif = drive->hwif;
 
 	BUG_ON(drive->dev_flags & IDE_DFLAG_BLOCKED);
-	BUG_ON(!blk_fs_request(rq));
+	BUG_ON(rq->cmd_type != REQ_TYPE_FS);
 
 	ledtrig_ide_activity();
 
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index e9abf2c3c335..c0aa93fb7a60 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -122,7 +122,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
 		return ide_stopped;
 
 	/* retry only "normal" I/O: */
-	if (!blk_fs_request(rq)) {
+	if (rq->cmd_type != REQ_TYPE_FS) {
 		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
 			struct ide_cmd *cmd = rq->special;
 
@@ -146,7 +146,8 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
 {
 	struct request *rq = drive->hwif->rq;
 
-	if (rq && blk_special_request(rq) && rq->cmd[0] == REQ_DRIVE_RESET) {
+	if (rq && rq->cmd_type == REQ_TYPE_SPECIAL &&
+	    rq->cmd[0] == REQ_DRIVE_RESET) {
 		if (err <= 0 && rq->errors == 0)
 			rq->errors = -EIO;
 		ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq));
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 4713bdca20b6..c7d0737bb18a 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -73,7 +73,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
 		drive->failed_pc = NULL;
 
 	if (pc->c[0] == GPCMD_READ_10 || pc->c[0] == GPCMD_WRITE_10 ||
-	    (rq && blk_pc_request(rq)))
+	    (rq && rq->cmd_type == REQ_TYPE_BLOCK_PC))
 		uptodate = 1; /* FIXME */
 	else if (pc->c[0] == GPCMD_REQUEST_SENSE) {
 
@@ -98,7 +98,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
 			       "Aborting request!\n");
 	}
 
-	if (blk_special_request(rq))
+	if (rq->cmd_type == REQ_TYPE_SPECIAL)
 		rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
 
 	return uptodate;
@@ -247,14 +247,16 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		} else
 			printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
 
-		if (blk_special_request(rq)) {
+		if (rq->cmd_type == REQ_TYPE_SPECIAL) {
 			rq->errors = 0;
 			ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 			return ide_stopped;
 		} else
 			goto out_end;
 	}
-	if (blk_fs_request(rq)) {
+
+	switch (rq->cmd_type) {
+	case REQ_TYPE_FS:
 		if (((long)blk_rq_pos(rq) % floppy->bs_factor) ||
 		    (blk_rq_sectors(rq) % floppy->bs_factor)) {
 			printk(KERN_ERR PFX "%s: unsupported r/w rq size\n",
@@ -263,13 +265,18 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		}
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
-	} else if (blk_special_request(rq) || blk_sense_request(rq)) {
+		break;
+	case REQ_TYPE_SPECIAL:
+	case REQ_TYPE_SENSE:
 		pc = (struct ide_atapi_pc *)rq->special;
-	} else if (blk_pc_request(rq)) {
+		break;
+	case REQ_TYPE_BLOCK_PC:
 		pc = &floppy->queued_pc;
 		idefloppy_blockpc_cmd(floppy, pc, rq);
-	} else
+		break;
+	default:
 		BUG();
+	}
 
 	ide_prep_sense(drive, rq);
 
@@ -280,7 +287,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 
 	cmd.rq = rq;
 
-	if (blk_fs_request(rq) || blk_rq_bytes(rq)) {
+	if (rq->cmd_type == REQ_TYPE_FS || blk_rq_bytes(rq)) {
 		ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
 		ide_map_sg(drive, &cmd);
 	}
@@ -290,7 +297,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 	return ide_floppy_issue_pc(drive, &cmd, pc);
 out_end:
 	drive->failed_pc = NULL;
-	if (blk_fs_request(rq) == 0 && rq->errors == 0)
+	if (rq->cmd_type != REQ_TYPE_FS && rq->errors == 0)
 		rq->errors = -EIO;
 	ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
 	return ide_stopped;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 172ac9218154..9304a7e54d9e 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(ide_complete_rq);
 
 void ide_kill_rq(ide_drive_t *drive, struct request *rq)
 {
-	u8 drv_req = blk_special_request(rq) && rq->rq_disk;
+	u8 drv_req = (rq->cmd_type == REQ_TYPE_SPECIAL) && rq->rq_disk;
 	u8 media = drive->media;
 
 	drive->failed_pc = NULL;
@@ -145,7 +145,7 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq)
 	} else {
 		if (media == ide_tape)
 			rq->errors = IDE_DRV_ERROR_GENERAL;
-		else if (blk_fs_request(rq) == 0 && rq->errors == 0)
+		else if (rq->cmd_type != REQ_TYPE_FS && rq->errors == 0)
 			rq->errors = -EIO;
 	}
 
@@ -307,7 +307,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 {
 	ide_startstop_t startstop;
 
-	BUG_ON(!blk_rq_started(rq));
+	BUG_ON(!(rq->cmd_flags & REQ_STARTED));
 
 #ifdef DEBUG
 	printk("%s: start_request: current=0x%08lx\n",
@@ -353,7 +353,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 			    pm->pm_step == IDE_PM_COMPLETED)
 				ide_complete_pm_rq(drive, rq);
 			return startstop;
-		} else if (!rq->rq_disk && blk_special_request(rq))
+		} else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_SPECIAL) {
 			/*
 			 * TODO: Once all ULDs have been modified to
 			 * check for specific op codes rather than
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 1c08311b0a0e..92406097efeb 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -191,10 +191,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 #ifdef DEBUG_PM
 	printk("%s: completing PM request, %s\n", drive->name,
-	       blk_pm_suspend_request(rq) ? "suspend" : "resume");
+	       (rq->cmd_type == REQ_TYPE_PM_SUSPEND) ? "suspend" : "resume");
 #endif
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (blk_pm_suspend_request(rq))
+	if (rq->cmd_type == REQ_TYPE_PM_SUSPEND)
 		blk_stop_queue(q);
 	else
 		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
@@ -210,11 +210,11 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
 	struct request_pm_state *pm = rq->special;
 
-	if (blk_pm_suspend_request(rq) &&
+	if (rq->cmd_type == REQ_TYPE_PM_SUSPEND &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
 		/* Mark drive blocked when starting the suspend sequence. */
 		drive->dev_flags |= IDE_DFLAG_BLOCKED;
-	else if (blk_pm_resume_request(rq) &&
+	else if (rq->cmd_type == REQ_TYPE_PM_RESUME &&
 		 pm->pm_step == IDE_PM_START_RESUME) {
 		/*
 		 * The first thing we do on wakeup is to wait for BSY bit to
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b07232880ec9..635fd72d4728 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -577,7 +577,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		      rq->cmd[0], (unsigned long long)blk_rq_pos(rq),
 		      blk_rq_sectors(rq));
 
-	BUG_ON(!(blk_special_request(rq) || blk_sense_request(rq)));
+	BUG_ON(!(rq->cmd_type == REQ_TYPE_SPECIAL ||
+		 rq->cmd_type == REQ_TYPE_SENSE));
 
 	/* Retry a failed packet command */
 	if (drive->failed_pc && drive->pc->c[0] == REQUEST_SENSE) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d21e1284604f..1e0e6dd51501 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -792,12 +792,12 @@ static void dm_end_request(struct request *clone, int error)
 {
 	int rw = rq_data_dir(clone);
 	int run_queue = 1;
-	bool is_barrier = blk_barrier_rq(clone);
+	bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	if (blk_pc_request(rq) && !is_barrier) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
 		rq->errors = clone->errors;
 		rq->resid_len = clone->resid_len;
 
@@ -844,7 +844,7 @@ void dm_requeue_unmapped_request(struct request *clone)
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	if (unlikely(blk_barrier_rq(clone))) {
+	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
 		/*
 		 * Barrier clones share an original request.
 		 * Leave it to dm_end_request(), which handles this special
@@ -943,7 +943,7 @@ static void dm_complete_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(blk_barrier_rq(clone))) {
+	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
 		/*
 		 * Barrier clones share an original request.  So can't use
 		 * softirq_done with the original.
@@ -972,7 +972,7 @@ void dm_kill_unmapped_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(blk_barrier_rq(clone))) {
+	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
 		/*
 		 * Barrier clones share an original request.
 		 * Leave it to dm_end_request(), which handles this special
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 8327e248520a..56645408d225 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -805,7 +805,8 @@ static void mspro_block_start(struct memstick_dev *card)
 
 static int mspro_block_prepare_req(struct request_queue *q, struct request *req)
 {
-	if (!blk_fs_request(req) && !blk_pc_request(req)) {
+	if (req->cmd_type != REQ_TYPE_FS &&
+	    req->cmd_type != REQ_TYPE_BLOCK_PC) {
 		blk_dump_rq_flags(req, "MSPro unsupported request");
 		return BLKPREP_KILL;
 	}
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index fc593fbab696..108f0c2b2bfd 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -883,7 +883,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 		if (!req)
 			break;
 
-		if (blk_fs_request(req)) {
+		if (req->cmd_type == REQ_TYPE_FS) {
 			struct i2o_block_delayed_request *dreq;
 			struct i2o_block_request *ireq = req->special;
 			unsigned int queue_depth;
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index d6ded247d941..ec92bcbdeddb 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -32,7 +32,7 @@ static int mmc_prep_request(struct request_queue *q, struct request *req)
 	/*
 	 * We only like normal block requests.
 	 */
-	if (!blk_fs_request(req)) {
+	if (req->cmd_type != REQ_TYPE_FS) {
 		blk_dump_rq_flags(req, "MMC bad request");
 		return BLKPREP_KILL;
 	}
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 03e19c1965cc..475af42745cb 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -73,14 +73,14 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
 	buf = req->buffer;
 
-	if (!blk_fs_request(req))
+	if (req->cmd_type != REQ_TYPE_FS)
 		return -EIO;
 
 	if (blk_rq_pos(req) + blk_rq_cur_sectors(req) >
 	    get_capacity(req->rq_disk))
 		return -EIO;
 
-	if (blk_discard_rq(req))
+	if (req->cmd_flags & REQ_DISCARD)
 		return tr->discard(dev, block, nsect);
 
 	switch(rq_data_dir(req)) {
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index a5d630f5f519..1b88af89d0c7 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -307,7 +307,7 @@ static int scsi_check_sense(struct scsi_cmnd *scmd)
 		    (sshdr.asc == 0x04) && (sshdr.ascq == 0x02))
 			return FAILED;
 
-		if (blk_barrier_rq(scmd->request))
+		if (scmd->request->cmd_flags & REQ_HARDBARRIER)
 			/*
 			 * barrier requests should always retry on UA
 			 * otherwise block will get a spurious error
@@ -1318,16 +1318,16 @@ int scsi_noretry_cmd(struct scsi_cmnd *scmd)
 	case DID_OK:
 		break;
 	case DID_BUS_BUSY:
-		return blk_failfast_transport(scmd->request);
+		return (scmd->request->cmd_flags & REQ_FAILFAST_TRANSPORT);
 	case DID_PARITY:
-		return blk_failfast_dev(scmd->request);
+		return (scmd->request->cmd_flags & REQ_FAILFAST_DEV);
 	case DID_ERROR:
 		if (msg_byte(scmd->result) == COMMAND_COMPLETE &&
 		    status_byte(scmd->result) == RESERVATION_CONFLICT)
 			return 0;
 		/* fall through */
 	case DID_SOFT_ERROR:
-		return blk_failfast_driver(scmd->request);
+		return (scmd->request->cmd_flags & REQ_FAILFAST_DRIVER);
 	}
 
 	switch (status_byte(scmd->result)) {
@@ -1336,7 +1336,7 @@ int scsi_noretry_cmd(struct scsi_cmnd *scmd)
 		 * assume caller has checked sense and determinted
 		 * the check condition was retryable.
 		 */
-		return blk_failfast_dev(scmd->request);
+		return (scmd->request->cmd_flags & REQ_FAILFAST_DEV);
 	}
 
 	return 0;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1646fe7cbd4b..5f1160841b0e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -722,7 +722,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			sense_deferred = scsi_sense_is_deferred(&sshdr);
 	}
 
-	if (blk_pc_request(req)) { /* SG_IO ioctl from block level */
+	if (req->cmd_type == REQ_TYPE_BLOCK_PC) { /* SG_IO ioctl from block level */
 		req->errors = result;
 		if (result) {
 			if (sense_valid && req->sense) {
@@ -757,7 +757,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		}
 	}
 
-	BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */
+	/* no bidi support for !REQ_TYPE_BLOCK_PC yet */
+	BUG_ON(blk_bidi_rq(req));
 
 	/*
 	 * Next deal with any sectors which we were able to correctly
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 8802e48bc063..a3fdf4dc59da 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -485,7 +485,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	 * Discard request come in as REQ_TYPE_FS but we turn them into
 	 * block PC requests to make life easier.
 	 */
-	if (blk_discard_rq(rq))
+	if (rq->cmd_flags & REQ_DISCARD)
 		ret = sd_prepare_discard(rq);
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -636,7 +636,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 		SCpnt->cmnd[0] = VARIABLE_LENGTH_CMD;
 		SCpnt->cmnd[7] = 0x18;
 		SCpnt->cmnd[9] = (rq_data_dir(rq) == READ) ? READ_32 : WRITE_32;
-		SCpnt->cmnd[10] = protect | (blk_fua_rq(rq) ? 0x8 : 0);
+		SCpnt->cmnd[10] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0);
 
 		/* LBA */
 		SCpnt->cmnd[12] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;
@@ -661,7 +661,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 		SCpnt->cmnd[31] = (unsigned char) this_count & 0xff;
 	} else if (block > 0xffffffff) {
 		SCpnt->cmnd[0] += READ_16 - READ_6;
-		SCpnt->cmnd[1] = protect | (blk_fua_rq(rq) ? 0x8 : 0);
+		SCpnt->cmnd[1] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0);
 		SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;
 		SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0;
 		SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0;
@@ -682,7 +682,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 			this_count = 0xffff;
 
 		SCpnt->cmnd[0] += READ_10 - READ_6;
-		SCpnt->cmnd[1] = protect | (blk_fua_rq(rq) ? 0x8 : 0);
+		SCpnt->cmnd[1] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0);
 		SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
 		SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff;
 		SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff;
@@ -691,7 +691,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 		SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff;
 		SCpnt->cmnd[8] = (unsigned char) this_count & 0xff;
 	} else {
-		if (unlikely(blk_fua_rq(rq))) {
+		if (unlikely(rq->cmd_flags & REQ_FUA)) {
 			/*
 			 * This happens only if this drive failed
 			 * 10byte rw command with ILLEGAL_REQUEST
@@ -1112,7 +1112,7 @@ static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd)
 	u64 bad_lba;
 	int info_valid;
 
-	if (!blk_fs_request(scmd->request))
+	if (scmd->request->cmd_type != REQ_TYPE_FS)
 		return 0;
 
 	info_valid = scsi_get_sense_info_fld(scmd->sense_buffer,
diff --git a/drivers/scsi/sun3_NCR5380.c b/drivers/scsi/sun3_NCR5380.c
index b5838d547c68..713620ed70d9 100644
--- a/drivers/scsi/sun3_NCR5380.c
+++ b/drivers/scsi/sun3_NCR5380.c
@@ -2022,7 +2022,7 @@ static void NCR5380_information_transfer (struct Scsi_Host *instance)
 		if((count > SUN3_DMA_MINSIZE) && (sun3_dma_setup_done
 						  != cmd))
 		{
-			if(blk_fs_request(cmd->request)) {
+			if (cmd->request->cmd_type == REQ_TYPE_FS) {
 				sun3scsi_dma_setup(d, count,
 						   rq_data_dir(cmd->request));
 				sun3_dma_setup_done = cmd;
diff --git a/drivers/scsi/sun3_scsi.c b/drivers/scsi/sun3_scsi.c
index e606cf0a2eb7..613f5880d135 100644
--- a/drivers/scsi/sun3_scsi.c
+++ b/drivers/scsi/sun3_scsi.c
@@ -524,7 +524,7 @@ static inline unsigned long sun3scsi_dma_xfer_len(unsigned long wanted,
 						  struct scsi_cmnd *cmd,
 						  int write_flag)
 {
-	if(blk_fs_request(cmd->request))
+	if (cmd->request->cmd_type == REQ_TYPE_FS)
  		return wanted;
 	else
 		return 0;
diff --git a/drivers/scsi/sun3_scsi_vme.c b/drivers/scsi/sun3_scsi_vme.c
index aaa4fd0dd1b9..7c526b8e30ac 100644
--- a/drivers/scsi/sun3_scsi_vme.c
+++ b/drivers/scsi/sun3_scsi_vme.c
@@ -458,7 +458,7 @@ static inline unsigned long sun3scsi_dma_xfer_len(unsigned long wanted,
 						  struct scsi_cmnd *cmd,
 						  int write_flag)
 {
-	if(blk_fs_request(cmd->request))
+	if (cmd->request->cmd_type == REQ_TYPE_FS)
  		return wanted;
 	else
 		return 0;
diff --git a/drivers/staging/hv/blkvsc_drv.c b/drivers/staging/hv/blkvsc_drv.c
index 61bd0be5fb18..a9aff90e58e0 100644
--- a/drivers/staging/hv/blkvsc_drv.c
+++ b/drivers/staging/hv/blkvsc_drv.c
@@ -823,7 +823,8 @@ static void blkvsc_init_rw(struct blkvsc_request *blkvsc_req)
 			blkvsc_req->cmnd[0] = READ_16;
 		}
 
-		blkvsc_req->cmnd[1] |= blk_fua_rq(blkvsc_req->req) ? 0x8 : 0;
+		blkvsc_req->cmnd[1] |=
+			(blkvsc_req->req->cmd_flags & REQ_FUA) ? 0x8 : 0;
 
 		*(unsigned long long *)&blkvsc_req->cmnd[2] =
 				cpu_to_be64(blkvsc_req->sector_start);
@@ -839,7 +840,8 @@ static void blkvsc_init_rw(struct blkvsc_request *blkvsc_req)
 			blkvsc_req->cmnd[0] = READ_10;
 		}
 
-		blkvsc_req->cmnd[1] |= blk_fua_rq(blkvsc_req->req) ? 0x8 : 0;
+		blkvsc_req->cmnd[1] |=
+			(blkvsc_req->req->cmd_flags & REQ_FUA) ? 0x8 : 0;
 
 		*(unsigned int *)&blkvsc_req->cmnd[2] =
 				cpu_to_be32(blkvsc_req->sector_start);
@@ -1286,7 +1288,7 @@ static void blkvsc_request(struct request_queue *queue)
 		DPRINT_DBG(BLKVSC_DRV, "- req %p\n", req);
 
 		blkdev = req->rq_disk->private_data;
-		if (blkdev->shutting_down || !blk_fs_request(req) ||
+		if (blkdev->shutting_down || req->cmd_type != REQ_TYPE_FS ||
 		    blkdev->media_not_present) {
 			__blk_end_request_cur(req, 0);
 			continue;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d7ae241a9e55..3ecd28ef9ba4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -604,33 +604,20 @@ enum {
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
 
-#define blk_fs_request(rq)	((rq)->cmd_type == REQ_TYPE_FS)
-#define blk_pc_request(rq)	((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
-#define blk_special_request(rq)	((rq)->cmd_type == REQ_TYPE_SPECIAL)
-#define blk_sense_request(rq)	((rq)->cmd_type == REQ_TYPE_SENSE)
-
-#define blk_failfast_dev(rq)	((rq)->cmd_flags & REQ_FAILFAST_DEV)
-#define blk_failfast_transport(rq) ((rq)->cmd_flags & REQ_FAILFAST_TRANSPORT)
-#define blk_failfast_driver(rq)	((rq)->cmd_flags & REQ_FAILFAST_DRIVER)
-#define blk_noretry_request(rq)	(blk_failfast_dev(rq) ||	\
-				 blk_failfast_transport(rq) ||	\
-				 blk_failfast_driver(rq))
-#define blk_rq_started(rq)	((rq)->cmd_flags & REQ_STARTED)
-#define blk_rq_io_stat(rq)	((rq)->cmd_flags & REQ_IO_STAT)
-#define blk_rq_quiet(rq)	((rq)->cmd_flags & REQ_QUIET)
-
-#define blk_account_rq(rq)	(blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq))) 
-
-#define blk_pm_suspend_request(rq)	((rq)->cmd_type == REQ_TYPE_PM_SUSPEND)
-#define blk_pm_resume_request(rq)	((rq)->cmd_type == REQ_TYPE_PM_RESUME)
+#define blk_noretry_request(rq) \
+	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
+			     REQ_FAILFAST_DRIVER))
+
+#define blk_account_rq(rq) \
+	(((rq)->cmd_flags & REQ_STARTED) && \
+	 ((rq)->cmd_type == REQ_TYPE_FS || \
+	  ((rq)->cmd_flags & REQ_DISCARD)))
+
 #define blk_pm_request(rq)	\
-	(blk_pm_suspend_request(rq) || blk_pm_resume_request(rq))
+	((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \
+	 (rq)->cmd_type == REQ_TYPE_PM_RESUME)
 
 #define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
-#define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
-#define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
-#define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
-#define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
@@ -652,9 +639,6 @@ static inline bool rq_is_sync(struct request *rq)
 	return rw_is_sync(rq->cmd_flags);
 }
 
-#define rq_is_meta(rq)		((rq)->cmd_flags & REQ_RW_META)
-#define rq_noidle(rq)		((rq)->cmd_flags & REQ_NOIDLE)
-
 static inline int blk_queue_full(struct request_queue *q, int sync)
 {
 	if (sync)
@@ -687,7 +671,8 @@ static inline void blk_clear_queue_full(struct request_queue *q, int sync)
 	(REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
 #define rq_mergeable(rq)	\
 	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
-	 (blk_discard_rq(rq) || blk_fs_request((rq))))
+	 (((rq)->cmd_flags & REQ_DISCARD) || \
+	  (rq)->cmd_type == REQ_TYPE_FS))
 
 /*
  * q->prep_rq_fn return values
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 416bf62d6d46..23faa67e8022 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -224,7 +224,7 @@ static inline int blk_trace_init_sysfs(struct device *dev)
 
 static inline int blk_cmd_buf_len(struct request *rq)
 {
-	return blk_pc_request(rq) ? rq->cmd_len * 3 : 1;
+	return (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? rq->cmd_len * 3 : 1;
 }
 
 extern void blk_dump_cmd(char *buf, struct request *rq);
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index d870a918559c..d8ce278515c3 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -25,8 +25,10 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
 
 	TP_fast_assign(
 		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
-		__entry->sector    = blk_pc_request(rq) ? 0 : blk_rq_pos(rq);
-		__entry->nr_sector = blk_pc_request(rq) ? 0 : blk_rq_sectors(rq);
+		__entry->sector    = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+					0 : blk_rq_pos(rq);
+		__entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+					0 : blk_rq_sectors(rq);
 		__entry->errors    = rq->errors;
 
 		blk_fill_rwbs_rq(__entry->rwbs, rq);
@@ -109,9 +111,12 @@ DECLARE_EVENT_CLASS(block_rq,
 
 	TP_fast_assign(
 		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
-		__entry->sector    = blk_pc_request(rq) ? 0 : blk_rq_pos(rq);
-		__entry->nr_sector = blk_pc_request(rq) ? 0 : blk_rq_sectors(rq);
-		__entry->bytes     = blk_pc_request(rq) ? blk_rq_bytes(rq) : 0;
+		__entry->sector    = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+					0 : blk_rq_pos(rq);
+		__entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+					0 : blk_rq_sectors(rq);
+		__entry->bytes     = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+					blk_rq_bytes(rq) : 0;
 
 		blk_fill_rwbs_rq(__entry->rwbs, rq);
 		blk_dump_cmd(__get_str(cmd), rq);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 638711c17504..4f149944cb89 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -661,10 +661,10 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (likely(!bt))
 		return;
 
-	if (blk_discard_rq(rq))
+	if (rq->cmd_flags & REQ_DISCARD)
 		rw |= (1 << BIO_RW_DISCARD);
 
-	if (blk_pc_request(rq)) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
 		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
 				what, rq->errors, rq->cmd_len, rq->cmd);
@@ -925,7 +925,7 @@ void blk_add_driver_data(struct request_queue *q,
 	if (likely(!bt))
 		return;
 
-	if (blk_pc_request(rq))
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
 		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
 				BLK_TA_DRV_DATA, rq->errors, len, data);
 	else
@@ -1730,7 +1730,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
 	int len = rq->cmd_len;
 	unsigned char *cmd = rq->cmd;
 
-	if (!blk_pc_request(rq)) {
+	if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
 		buf[0] = '\0';
 		return;
 	}
@@ -1779,7 +1779,7 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
 	int rw = rq->cmd_flags & 0x03;
 	int bytes;
 
-	if (blk_discard_rq(rq))
+	if (rq->cmd_flags & REQ_DISCARD)
 		rw |= (1 << BIO_RW_DISCARD);
 
 	bytes = blk_rq_bytes(rq);
-- 
cgit v1.2.3-59-g8ed1b


From 7b6d91daee5cac6402186ff224c3af39d79f4a0e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 7 Aug 2010 18:20:39 +0200
Subject: block: unify flags for struct bio and struct request

Remove the current bio flags and reuse the request flags for the bio, too.
This allows to more easily trace the type of I/O from the filesystem
down to the block driver.  There were two flags in the bio that were
missing in the requests:  BIO_RW_UNPLUG and BIO_RW_AHEAD.  Also I've
renamed two request flags that had a superflous RW in them.

Note that the flags are in bio.h despite having the REQ_ name - as
blkdev.h includes bio.h that is the only way to go for now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c                |   2 +-
 block/blk-core.c                   |  37 +++--------
 block/blk-map.c                    |   2 +-
 block/blk-merge.c                  |   2 +-
 block/cfq-iosched.c                |  14 ++---
 block/elevator.c                   |   3 +-
 drivers/ata/libata-scsi.c          |   2 +-
 drivers/block/aoe/aoeblk.c         |   2 +-
 drivers/block/brd.c                |   2 +-
 drivers/block/drbd/drbd_actlog.c   |   8 +--
 drivers/block/drbd/drbd_main.c     |   6 +-
 drivers/block/drbd/drbd_receiver.c |  22 +++----
 drivers/block/drbd/drbd_req.c      |   2 +-
 drivers/block/loop.c               |   2 +-
 drivers/block/pktcdvd.c            |   2 +-
 drivers/block/umem.c               |   2 +-
 drivers/ide/ide-cd_ioctl.c         |   2 +-
 drivers/ide/ide-floppy.c           |   2 +-
 drivers/md/dm-io.c                 |  12 ++--
 drivers/md/dm-kcopyd.c             |   2 +-
 drivers/md/dm-raid1.c              |   2 +-
 drivers/md/dm-stripe.c             |   2 +-
 drivers/md/dm.c                    |  14 ++---
 drivers/md/linear.c                |   2 +-
 drivers/md/md.c                    |  10 +--
 drivers/md/md.h                    |   4 +-
 drivers/md/multipath.c             |   8 +--
 drivers/md/raid0.c                 |   2 +-
 drivers/md/raid1.c                 |  22 +++----
 drivers/md/raid10.c                |  12 ++--
 drivers/md/raid5.c                 |   2 +-
 drivers/scsi/osd/osd_initiator.c   |   8 +--
 fs/bio.c                           |   5 +-
 fs/btrfs/disk-io.c                 |   8 +--
 fs/btrfs/inode.c                   |   6 +-
 fs/btrfs/volumes.c                 |  18 +++---
 fs/exofs/ios.c                     |   2 +-
 fs/gfs2/log.c                      |   4 +-
 fs/gfs2/meta_io.c                  |   8 +--
 fs/gfs2/ops_fstype.c               |   2 +-
 fs/nilfs2/segbuf.c                 |   2 +-
 include/linux/bio.h                | 125 +++++++++++++++++++++++--------------
 include/linux/blkdev.h             |  66 +-------------------
 include/linux/fs.h                 |  38 +++++------
 kernel/power/block_io.c            |   2 +-
 kernel/trace/blktrace.c            |  27 ++++----
 mm/page_io.c                       |   2 +-
 47 files changed, 242 insertions(+), 289 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 74e404393172..7c6f4a714687 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -203,7 +203,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 		/* initialize proxy request and queue it */
 		blk_rq_init(q, rq);
 		if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
-			rq->cmd_flags |= REQ_RW;
+			rq->cmd_flags |= REQ_WRITE;
 		if (q->ordered & QUEUE_ORDERED_DO_FUA)
 			rq->cmd_flags |= REQ_FUA;
 		init_request_from_bio(rq, q->orig_bar_rq->bio);
diff --git a/block/blk-core.c b/block/blk-core.c
index dca43a31e725..66c3cfe94d0a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1140,25 +1140,9 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	req->cpu = bio->bi_comp_cpu;
 	req->cmd_type = REQ_TYPE_FS;
 
-	/*
-	 * Inherit FAILFAST from bio (for read-ahead, and explicit
-	 * FAILFAST).  FAILFAST flags are identical for req and bio.
-	 */
-	if (bio_rw_flagged(bio, BIO_RW_AHEAD))
+	req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
+	if (bio->bi_rw & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
-	else
-		req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
-
-	if (bio_rw_flagged(bio, BIO_RW_DISCARD))
-		req->cmd_flags |= REQ_DISCARD;
-	if (bio_rw_flagged(bio, BIO_RW_BARRIER))
-		req->cmd_flags |= REQ_HARDBARRIER;
-	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
-		req->cmd_flags |= REQ_RW_SYNC;
-	if (bio_rw_flagged(bio, BIO_RW_META))
-		req->cmd_flags |= REQ_RW_META;
-	if (bio_rw_flagged(bio, BIO_RW_NOIDLE))
-		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
 	req->__sector = bio->bi_sector;
@@ -1181,12 +1165,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	int el_ret;
 	unsigned int bytes = bio->bi_size;
 	const unsigned short prio = bio_prio(bio);
-	const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
-	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
+	const bool sync = (bio->bi_rw & REQ_SYNC);
+	const bool unplug = (bio->bi_rw & REQ_UNPLUG);
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
-	if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
+	if ((bio->bi_rw & REQ_HARDBARRIER) &&
 	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
@@ -1200,7 +1184,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
+	if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1275,7 +1259,7 @@ get_rq:
 	 */
 	rw_flags = bio_data_dir(bio);
 	if (sync)
-		rw_flags |= REQ_RW_SYNC;
+		rw_flags |= REQ_SYNC;
 
 	/*
 	 * Grab a free request. This is might sleep but can not fail.
@@ -1464,7 +1448,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
-		if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+		if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
 			     nr_sectors > queue_max_hw_sectors(q))) {
 			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
 			       bdevname(bio->bi_bdev, b),
@@ -1497,8 +1481,7 @@ static inline void __generic_make_request(struct bio *bio)
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
 
-		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
-		    !blk_queue_discard(q)) {
+		if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(q)) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
@@ -2365,7 +2348,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
 	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
-	rq->cmd_flags |= bio->bi_rw & REQ_RW;
+	rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
 
 	if (bio_has_data(bio)) {
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
diff --git a/block/blk-map.c b/block/blk-map.c
index 9083cf0180cc..c65d7593f7f1 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -307,7 +307,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 		return PTR_ERR(bio);
 
 	if (rq_data_dir(rq) == WRITE)
-		bio->bi_rw |= (1 << BIO_RW);
+		bio->bi_rw |= (1 << REQ_WRITE);
 
 	if (do_copy)
 		rq->cmd_flags |= REQ_COPY_USER;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 87e4fb7d0e98..4852475521ea 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -180,7 +180,7 @@ new_segment:
 	}
 
 	if (q->dma_drain_size && q->dma_drain_needed(rq)) {
-		if (rq->cmd_flags & REQ_RW)
+		if (rq->cmd_flags & REQ_WRITE)
 			memset(q->dma_drain_buffer, 0, q->dma_drain_size);
 
 		sg->page_link &= ~0x02;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d4edeb8fceb8..eb4086f7dfef 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -458,7 +458,7 @@ static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
  */
 static inline bool cfq_bio_sync(struct bio *bio)
 {
-	return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
 }
 
 /*
@@ -646,10 +646,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
 		return rq1;
 	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
 		return rq2;
-	if ((rq1->cmd_flags & REQ_RW_META) && !(rq2->cmd_flags & REQ_RW_META))
+	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
 		return rq1;
-	else if ((rq2->cmd_flags & REQ_RW_META) &&
-		 !(rq1->cmd_flags & REQ_RW_META))
+	else if ((rq2->cmd_flags & REQ_META) &&
+		 !(rq1->cmd_flags & REQ_META))
 		return rq2;
 
 	s1 = blk_rq_pos(rq1);
@@ -1485,7 +1485,7 @@ static void cfq_remove_request(struct request *rq)
 	cfqq->cfqd->rq_queued--;
 	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
 					rq_data_dir(rq), rq_is_sync(rq));
-	if (rq->cmd_flags & REQ_RW_META) {
+	if (rq->cmd_flags & REQ_META) {
 		WARN_ON(!cfqq->meta_pending);
 		cfqq->meta_pending--;
 	}
@@ -3177,7 +3177,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	 * So both queues are sync. Let the new request get disk time if
 	 * it's a metadata request and the current queue is doing regular IO.
 	 */
-	if ((rq->cmd_flags & REQ_RW_META) && !cfqq->meta_pending)
+	if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
 		return true;
 
 	/*
@@ -3231,7 +3231,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_io_context *cic = RQ_CIC(rq);
 
 	cfqd->rq_queued++;
-	if (rq->cmd_flags & REQ_RW_META)
+	if (rq->cmd_flags & REQ_META)
 		cfqq->meta_pending++;
 
 	cfq_update_io_thinktime(cfqd, cic);
diff --git a/block/elevator.c b/block/elevator.c
index aa99b59c03d6..816a7c8d6394 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -79,8 +79,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	/*
 	 * Don't merge file system requests and discard requests
 	 */
-	if (bio_rw_flagged(bio, BIO_RW_DISCARD) !=
-	    bio_rw_flagged(rq->bio, BIO_RW_DISCARD))
+	if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
 		return 0;
 
 	/*
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a5c08b082edb..0a8cd3484791 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1114,7 +1114,7 @@ static int atapi_drain_needed(struct request *rq)
 	if (likely(rq->cmd_type != REQ_TYPE_BLOCK_PC))
 		return 0;
 
-	if (!blk_rq_bytes(rq) || (rq->cmd_flags & REQ_RW))
+	if (!blk_rq_bytes(rq) || (rq->cmd_flags & REQ_WRITE))
 		return 0;
 
 	return atapi_cmd_type(rq->cmd[0]) == ATAPI_MISC;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 035cefe4045a..65deffde60ac 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -173,7 +173,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
 		BUG();
 		bio_endio(bio, -ENXIO);
 		return 0;
-	} else if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
+	} else if (bio->bi_rw & REQ_HARDBARRIER) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	} else if (bio->bi_io_vec == NULL) {
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index f1bf79d9bc0a..1b218c6b6820 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -340,7 +340,7 @@ static int brd_make_request(struct request_queue *q, struct bio *bio)
 						get_capacity(bdev->bd_disk))
 		goto out;
 
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
+	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
 		err = 0;
 		discard_from_brd(brd, sector, bio->bi_size);
 		goto out;
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index df018990c422..9400845d602e 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -79,8 +79,8 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 	md_io.error = 0;
 
 	if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
-		rw |= (1 << BIO_RW_BARRIER);
-	rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
+		rw |= REQ_HARDBARRIER;
+	rw |= REQ_UNPLUG | REQ_SYNC;
 
  retry:
 	bio = bio_alloc(GFP_NOIO, 1);
@@ -103,11 +103,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 	/* check for unsupported barrier op.
 	 * would rather check on EOPNOTSUPP, but that is not reliable.
 	 * don't try again for ANY return value != 0 */
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
+	if (unlikely((bio->bi_rw & REQ_HARDBARRIER) && !ok)) {
 		/* Try again with no barrier */
 		dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
 		set_bit(MD_NO_BARRIER, &mdev->flags);
-		rw &= ~(1 << BIO_RW_BARRIER);
+		rw &= ~REQ_HARDBARRIER;
 		bio_put(bio);
 		goto retry;
 	}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 7258c95e895e..e2ab13d99d69 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2425,15 +2425,15 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 	/* NOTE: no need to check if barriers supported here as we would
 	 *       not pass the test in make_request_common in that case
 	 */
-	if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
+	if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
 		dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
 		/* dp_flags |= DP_HARDBARRIER; */
 	}
-	if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
+	if (req->master_bio->bi_rw & REQ_SYNC)
 		dp_flags |= DP_RW_SYNC;
 	/* for now handle SYNCIO and UNPLUG
 	 * as if they still were one and the same flag */
-	if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
+	if (req->master_bio->bi_rw & REQ_UNPLUG)
 		dp_flags |= DP_RW_SYNC;
 	if (mdev->state.conn >= C_SYNC_SOURCE &&
 	    mdev->state.conn <= C_PAUSED_SYNC_T)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index dff48701b84d..cba1deb7b271 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1180,7 +1180,7 @@ next_bio:
 	bio->bi_sector = sector;
 	bio->bi_bdev = mdev->ldev->backing_bdev;
 	/* we special case some flags in the multi-bio case, see below
-	 * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */
+	 * (REQ_UNPLUG, REQ_HARDBARRIER) */
 	bio->bi_rw = rw;
 	bio->bi_private = e;
 	bio->bi_end_io = drbd_endio_sec;
@@ -1209,16 +1209,16 @@ next_bio:
 		bios = bios->bi_next;
 		bio->bi_next = NULL;
 
-		/* strip off BIO_RW_UNPLUG unless it is the last bio */
+		/* strip off REQ_UNPLUG unless it is the last bio */
 		if (bios)
-			bio->bi_rw &= ~(1<<BIO_RW_UNPLUG);
+			bio->bi_rw &= ~REQ_UNPLUG;
 
 		drbd_generic_make_request(mdev, fault_type, bio);
 
-		/* strip off BIO_RW_BARRIER,
+		/* strip off REQ_HARDBARRIER,
 		 * unless it is the first or last bio */
 		if (bios && bios->bi_next)
-			bios->bi_rw &= ~(1<<BIO_RW_BARRIER);
+			bios->bi_rw &= ~REQ_HARDBARRIER;
 	} while (bios);
 	maybe_kick_lo(mdev);
 	return 0;
@@ -1233,7 +1233,7 @@ fail:
 }
 
 /**
- * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
+ * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
  * @mdev:	DRBD device.
  * @w:		work object.
  * @cancel:	The connection will be closed anyways (unused in this callback)
@@ -1245,7 +1245,7 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
 	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
 	   so that we can finish that epoch in drbd_may_finish_epoch().
 	   That is necessary if we already have a long chain of Epochs, before
-	   we realize that BIO_RW_BARRIER is actually not supported */
+	   we realize that REQ_HARDBARRIER is actually not supported */
 
 	/* As long as the -ENOTSUPP on the barrier is reported immediately
 	   that will never trigger. If it is reported late, we will just
@@ -1824,14 +1824,14 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
 		epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
 		if (epoch == e->epoch) {
 			set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
-			rw |= (1<<BIO_RW_BARRIER);
+			rw |= REQ_HARDBARRIER;
 			e->flags |= EE_IS_BARRIER;
 		} else {
 			if (atomic_read(&epoch->epoch_size) > 1 ||
 			    !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
 				set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
 				set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
-				rw |= (1<<BIO_RW_BARRIER);
+				rw |= REQ_HARDBARRIER;
 				e->flags |= EE_IS_BARRIER;
 			}
 		}
@@ -1841,10 +1841,10 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
 	dp_flags = be32_to_cpu(p->dp_flags);
 	if (dp_flags & DP_HARDBARRIER) {
 		dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
-		/* rw |= (1<<BIO_RW_BARRIER); */
+		/* rw |= REQ_HARDBARRIER; */
 	}
 	if (dp_flags & DP_RW_SYNC)
-		rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
+		rw |= REQ_SYNC | REQ_UNPLUG;
 	if (dp_flags & DP_MAY_SET_IN_SYNC)
 		e->flags |= EE_MAY_SET_IN_SYNC;
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 654f1ef5cbb0..f761d98a4e90 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -997,7 +997,7 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 	 * because of those XXX, this is not yet enabled,
 	 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
 	 */
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
+	if (unlikely(bio->bi_rw & REQ_HARDBARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags)) {
 		/* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6120922f459f..fedfdb7d3cdf 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -476,7 +476,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
 
 	if (bio_rw(bio) == WRITE) {
-		bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
+		bool barrier = (bio->bi_rw & REQ_HARDBARRIER);
 		struct file *file = lo->lo_backing_file;
 
 		if (barrier) {
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 8a549db2aa78..9f3e4454274b 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1221,7 +1221,7 @@ static int pkt_start_recovery(struct packet_data *pkt)
 	pkt->bio->bi_flags = 1 << BIO_UPTODATE;
 	pkt->bio->bi_idx = 0;
 
-	BUG_ON(pkt->bio->bi_rw != (1 << BIO_RW));
+	BUG_ON(pkt->bio->bi_rw != REQ_WRITE);
 	BUG_ON(pkt->bio->bi_vcnt != pkt->frames);
 	BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE);
 	BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write);
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 2f9470ff8f7c..8be57151f5d6 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -478,7 +478,7 @@ static void process_page(unsigned long data)
 				le32_to_cpu(desc->local_addr)>>9,
 				le32_to_cpu(desc->transfer_size));
 			dump_dmastat(card, control);
-		} else if (test_bit(BIO_RW, &bio->bi_rw) &&
+		} else if ((bio->bi_rw & REQ_WRITE) &&
 			   le32_to_cpu(desc->local_addr) >> 9 ==
 				card->init_size) {
 			card->init_size += le32_to_cpu(desc->transfer_size) >> 9;
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 02712bf045c1..766b3deeb23c 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -454,7 +454,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
 	   touch it at all. */
 
 	if (cgc->data_direction == CGC_DATA_WRITE)
-		flags |= REQ_RW;
+		flags |= REQ_WRITE;
 
 	if (cgc->sense)
 		memset(cgc->sense, 0, sizeof(struct request_sense));
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index c7d0737bb18a..5406b6ea3ad1 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -207,7 +207,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive,
 	memcpy(rq->cmd, pc->c, 12);
 
 	pc->rq = rq;
-	if (rq->cmd_flags & REQ_RW)
+	if (rq->cmd_flags & REQ_WRITE)
 		pc->flags |= PC_FLAG_WRITING;
 
 	pc->flags |= PC_FLAG_DMA_OK;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 10f457ca6af2..0590c75b0ab6 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -356,7 +356,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	BUG_ON(num_regions > DM_IO_MAX_REGIONS);
 
 	if (sync)
-		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+		rw |= REQ_SYNC | REQ_UNPLUG;
 
 	/*
 	 * For multiple regions we need to be careful to rewind
@@ -364,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	 */
 	for (i = 0; i < num_regions; i++) {
 		*dp = old_pages;
-		if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
+		if (where[i].count || (rw & REQ_HARDBARRIER))
 			do_region(rw, i, where + i, dp, io);
 	}
 
@@ -412,8 +412,8 @@ retry:
 	}
 	set_current_state(TASK_RUNNING);
 
-	if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
-		rw &= ~(1 << BIO_RW_BARRIER);
+	if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
+		rw &= ~REQ_HARDBARRIER;
 		goto retry;
 	}
 
@@ -479,8 +479,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
  * New collapsed (a)synchronous interface.
  *
  * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
- * the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in
- * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
+ * the queue with blk_unplug() some time later or set REQ_SYNC in
+io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
  * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
  */
 int dm_io(struct dm_io_request *io_req, unsigned num_regions,
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index addf83475040..d8587bac5682 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -345,7 +345,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG),
+		.bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = job->offset,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ddda531723dc..74136262d654 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1211,7 +1211,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	if (error == -EOPNOTSUPP)
 		goto out;
 
-	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
+	if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD))
 		goto out;
 
 	if (unlikely(error)) {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index e610725db766..d6e28d732b4d 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -284,7 +284,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 	if (!error)
 		return 0; /* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
+	if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD))
 		return error;
 
 	if (error == -EOPNOTSUPP)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1e0e6dd51501..d6f77baeafd6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -614,7 +614,7 @@ static void dec_pending(struct dm_io *io, int error)
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md)) {
-				if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
+				if (!(io->bio->bi_rw & REQ_HARDBARRIER))
 					bio_list_add_head(&md->deferred,
 							  io->bio);
 			} else
@@ -626,7 +626,7 @@ static void dec_pending(struct dm_io *io, int error)
 		io_error = io->error;
 		bio = io->bio;
 
-		if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
+		if (bio->bi_rw & REQ_HARDBARRIER) {
 			/*
 			 * There can be just one barrier request so we use
 			 * a per-device variable for error reporting.
@@ -1106,7 +1106,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 
 	clone->bi_sector = sector;
 	clone->bi_bdev = bio->bi_bdev;
-	clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
+	clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
 	clone->bi_vcnt = 1;
 	clone->bi_size = to_bytes(len);
 	clone->bi_io_vec->bv_offset = offset;
@@ -1133,7 +1133,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 
 	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
-	clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
+	clone->bi_rw &= ~REQ_HARDBARRIER;
 	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
@@ -1301,7 +1301,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_live_table(md);
 	if (unlikely(!ci.map)) {
-		if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
+		if (!(bio->bi_rw & REQ_HARDBARRIER))
 			bio_io_error(bio);
 		else
 			if (!md->barrier_error)
@@ -1414,7 +1414,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	 * we have to queue this io for later.
 	 */
 	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+	    unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
 		up_read(&md->io_lock);
 
 		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -2296,7 +2296,7 @@ static void dm_wq_work(struct work_struct *work)
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else {
-			if (bio_rw_flagged(c, BIO_RW_BARRIER))
+			if (c->bi_rw & REQ_HARDBARRIER)
 				process_barrier(md, c);
 			else
 				__split_and_process_bio(md, c);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 7e0e057db9a7..ba19060bcf3f 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -294,7 +294,7 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
 
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
 		md_barrier_request(mddev, bio);
 		return 0;
 	}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index cb20d0b0555a..1893af678779 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -353,7 +353,7 @@ static void md_submit_barrier(struct work_struct *ws)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
-		bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
+		bio->bi_rw &= ~REQ_HARDBARRIER;
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
 		mddev->barrier = POST_REQUEST_BARRIER;
@@ -675,11 +675,11 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * if zero is reached.
 	 * If an error occurred, call md_error
 	 *
-	 * As we might need to resubmit the request if BIO_RW_BARRIER
+	 * As we might need to resubmit the request if REQ_HARDBARRIER
 	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
+	int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
@@ -691,7 +691,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	atomic_inc(&mddev->pending_writes);
 	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
 		struct bio *rbio;
-		rw |= (1<<BIO_RW_BARRIER);
+		rw |= REQ_HARDBARRIER;
 		rbio = bio_clone(bio, GFP_NOIO);
 		rbio->bi_private = bio;
 		rbio->bi_end_io = super_written_barrier;
@@ -736,7 +736,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	struct completion event;
 	int ret;
 
-	rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+	rw |= REQ_SYNC | REQ_UNPLUG;
 
 	bio->bi_bdev = bdev;
 	bio->bi_sector = sector;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 10597bfec000..fc56e0f21c80 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -67,7 +67,7 @@ struct mdk_rdev_s
 #define	Faulty		1		/* device is known to have a fault */
 #define	In_sync		2		/* device is in_sync with rest of array */
 #define	WriteMostly	4		/* Avoid reading if at all possible */
-#define	BarriersNotsupp	5		/* BIO_RW_BARRIER is not supported */
+#define	BarriersNotsupp	5		/* REQ_HARDBARRIER is not supported */
 #define	AllReserved	6		/* If whole device is reserved for
 					 * one array */
 #define	AutoDetected	7		/* added by auto-detect */
@@ -254,7 +254,7 @@ struct mddev_s
 							 * fails.  Only supported
 							 */
 	struct bio			*biolist; 	/* bios that need to be retried
-							 * because BIO_RW_BARRIER is not supported
+							 * because REQ_HARDBARRIER is not supported
 							 */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 410fb60699ac..0307d217e7a4 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -91,7 +91,7 @@ static void multipath_end_request(struct bio *bio, int error)
 
 	if (uptodate)
 		multipath_end_bh_io(mp_bh, 0);
-	else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) {
+	else if (!(bio->bi_rw & REQ_RAHEAD)) {
 		/*
 		 * oops, IO error:
 		 */
@@ -142,7 +142,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
 		md_barrier_request(mddev, bio);
 		return 0;
 	}
@@ -163,7 +163,7 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	mp_bh->bio = *bio;
 	mp_bh->bio.bi_sector += multipath->rdev->data_offset;
 	mp_bh->bio.bi_bdev = multipath->rdev->bdev;
-	mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
+	mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT;
 	mp_bh->bio.bi_end_io = multipath_end_request;
 	mp_bh->bio.bi_private = mp_bh;
 	generic_make_request(&mp_bh->bio);
@@ -398,7 +398,7 @@ static void multipathd (mddev_t *mddev)
 			*bio = *(mp_bh->master_bio);
 			bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
 			bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
-			bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
+			bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
 			bio->bi_end_io = multipath_end_request;
 			bio->bi_private = mp_bh;
 			generic_make_request(bio);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 563abed5a2cb..6f7af46d623c 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -483,7 +483,7 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
 
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
 		md_barrier_request(mddev, bio);
 		return 0;
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a948da8012de..73cc74ffc26b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -787,7 +787,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	struct bio_list bl;
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
-	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	const bool do_sync = (bio->bi_rw & REQ_SYNC);
 	bool do_barriers;
 	mdk_rdev_t *blocked_rdev;
 
@@ -822,7 +822,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		finish_wait(&conf->wait_barrier, &w);
 	}
 	if (unlikely(!mddev->barriers_work &&
-		     bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+		     (bio->bi_rw & REQ_HARDBARRIER))) {
 		if (rw == WRITE)
 			md_write_end(mddev);
 		bio_endio(bio, -EOPNOTSUPP);
@@ -877,7 +877,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid1_end_read_request;
-		read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
+		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r1_bio;
 
 		generic_make_request(read_bio);
@@ -959,7 +959,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
+	do_barriers = bio->bi_rw & REQ_HARDBARRIER;
 	if (do_barriers)
 		set_bit(R1BIO_Barrier, &r1_bio->state);
 
@@ -975,8 +975,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
-			(do_sync << BIO_RW_SYNCIO);
+		mbio->bi_rw = WRITE | do_barriers | do_sync;
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
@@ -1633,7 +1632,7 @@ static void raid1d(mddev_t *mddev)
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
 		} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
-			/* some requests in the r1bio were BIO_RW_BARRIER
+			/* some requests in the r1bio were REQ_HARDBARRIER
 			 * requests which failed with -EOPNOTSUPP.  Hohumm..
 			 * Better resubmit without the barrier.
 			 * We know which devices to resubmit for, because
@@ -1641,7 +1640,7 @@ static void raid1d(mddev_t *mddev)
 			 * We already have a nr_pending reference on these rdevs.
 			 */
 			int i;
-			const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
+			const bool do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
 			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
 			clear_bit(R1BIO_Barrier, &r1_bio->state);
 			for (i=0; i < conf->raid_disks; i++)
@@ -1662,8 +1661,7 @@ static void raid1d(mddev_t *mddev)
 						conf->mirrors[i].rdev->data_offset;
 					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
 					bio->bi_end_io = raid1_end_write_request;
-					bio->bi_rw = WRITE |
-						(do_sync << BIO_RW_SYNCIO);
+					bio->bi_rw = WRITE | do_sync;
 					bio->bi_private = r1_bio;
 					r1_bio->bios[i] = bio;
 					generic_make_request(bio);
@@ -1698,7 +1696,7 @@ static void raid1d(mddev_t *mddev)
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
-				const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
+				const bool do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
 				r1_bio->bios[r1_bio->read_disk] =
 					mddev->ro ? IO_BLOCKED : NULL;
 				r1_bio->read_disk = disk;
@@ -1715,7 +1713,7 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_end_io = raid1_end_read_request;
-				bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
+				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r1_bio;
 				unplug = 1;
 				generic_make_request(bio);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 42e64e4e5e25..62ecb6650fd0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -799,12 +799,12 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
-	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	const bool do_sync = (bio->bi_rw & REQ_SYNC);
 	struct bio_list bl;
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 
-	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
 		md_barrier_request(mddev, bio);
 		return 0;
 	}
@@ -879,7 +879,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
-		read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
+		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r10_bio;
 
 		generic_make_request(read_bio);
@@ -947,7 +947,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			conf->mirrors[d].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO);
+		mbio->bi_rw = WRITE | do_sync;
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
@@ -1716,7 +1716,7 @@ static void raid10d(mddev_t *mddev)
 				raid_end_bio_io(r10_bio);
 				bio_put(bio);
 			} else {
-				const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO);
+				const bool do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
 				bio_put(bio);
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
@@ -1730,7 +1730,7 @@ static void raid10d(mddev_t *mddev)
 				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
 					+ rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
-				bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
+				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = raid10_end_read_request;
 				unplug = 1;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 96c690279fc6..20ac2f14376a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3958,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	const int rw = bio_data_dir(bi);
 	int remaining;
 
-	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
+	if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) {
 		/* Drain all pending writes.  We only really need
 		 * to ensure they have been submitted, but this is
 		 * easier.
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index ee4b6914667f..fda4de3440c4 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -716,7 +716,7 @@ static int _osd_req_list_objects(struct osd_request *or,
 		return PTR_ERR(bio);
 	}
 
-	bio->bi_rw &= ~(1 << BIO_RW);
+	bio->bi_rw &= ~REQ_WRITE;
 	or->in.bio = bio;
 	or->in.total_bytes = bio->bi_size;
 	return 0;
@@ -814,7 +814,7 @@ void osd_req_write(struct osd_request *or,
 {
 	_osd_req_encode_common(or, OSD_ACT_WRITE, obj, offset, len);
 	WARN_ON(or->out.bio || or->out.total_bytes);
-	WARN_ON(0 ==  bio_rw_flagged(bio, BIO_RW));
+	WARN_ON(0 == (bio->bi_rw & REQ_WRITE));
 	or->out.bio = bio;
 	or->out.total_bytes = len;
 }
@@ -829,7 +829,7 @@ int osd_req_write_kern(struct osd_request *or,
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
-	bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
+	bio->bi_rw |= REQ_WRITE; /* FIXME: bio_set_dir() */
 	osd_req_write(or, obj, offset, bio, len);
 	return 0;
 }
@@ -865,7 +865,7 @@ void osd_req_read(struct osd_request *or,
 {
 	_osd_req_encode_common(or, OSD_ACT_READ, obj, offset, len);
 	WARN_ON(or->in.bio || or->in.total_bytes);
-	WARN_ON(1 == bio_rw_flagged(bio, BIO_RW));
+	WARN_ON(1 == (bio->bi_rw & REQ_WRITE));
 	or->in.bio = bio;
 	or->in.total_bytes = len;
 }
diff --git a/fs/bio.c b/fs/bio.c
index e7bf6ca64dcf..8abb2dfb2e7c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -843,7 +843,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	if (!bio)
 		goto out_bmd;
 
-	bio->bi_rw |= (!write_to_vm << BIO_RW);
+	if (!write_to_vm)
+		bio->bi_rw |= REQ_WRITE;
 
 	ret = 0;
 
@@ -1024,7 +1025,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	 * set data direction, and check if mapped pages need bouncing
 	 */
 	if (!write_to_vm)
-		bio->bi_rw |= (1 << BIO_RW);
+		bio->bi_rw |= REQ_WRITE;
 
 	bio->bi_bdev = bdev;
 	bio->bi_flags |= (1 << BIO_USER_MAPPED);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 34f7c375567e..64f10082f048 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -480,7 +480,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
 
-	if (bio->bi_rw & (1 << BIO_RW)) {
+	if (bio->bi_rw & REQ_WRITE) {
 		if (end_io_wq->metadata)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
@@ -604,7 +604,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	atomic_inc(&fs_info->nr_async_submits);
 
-	if (rw & (1 << BIO_RW_SYNCIO))
+	if (rw & REQ_SYNC)
 		btrfs_set_work_high_prio(&async->work);
 
 	btrfs_queue_worker(&fs_info->workers, &async->work);
@@ -668,7 +668,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 					  bio, 1);
 	BUG_ON(ret);
 
-	if (!(rw & (1 << BIO_RW))) {
+	if (!(rw & REQ_WRITE)) {
 		/*
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
@@ -1427,7 +1427,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	 * ram and up to date before trying to verify things.  For
 	 * blocksize <= pagesize, it is basically a noop
 	 */
-	if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
+	if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
 	    !bio_ready_for_csum(bio)) {
 		btrfs_queue_worker(&fs_info->endio_meta_workers,
 				   &end_io_wq->work);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bff92ad4744..e975d7180a88 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1429,7 +1429,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	if (!(rw & (1 << BIO_RW))) {
+	if (!(rw & REQ_WRITE)) {
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
@@ -1841,7 +1841,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	bio->bi_size = 0;
 
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	if (failed_bio->bi_rw & (1 << BIO_RW))
+	if (failed_bio->bi_rw & REQ_WRITE)
 		rw = WRITE;
 	else
 		rw = READ;
@@ -5642,7 +5642,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	struct bio_vec *bvec = bio->bi_io_vec;
 	u64 start;
 	int skip_sum;
-	int write = rw & (1 << BIO_RW);
+	int write = rw & REQ_WRITE;
 	int ret = 0;
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d6e3af8be95b..dd318ff280b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -258,7 +258,7 @@ loop_lock:
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 
-		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
+		if (cur->bi_rw & REQ_SYNC)
 			num_sync_run++;
 
 		submit_bio(cur->bi_rw, cur);
@@ -2651,7 +2651,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
-	if (multi_ret && !(rw & (1 << BIO_RW)))
+	if (multi_ret && !(rw & REQ_WRITE))
 		stripes_allocated = 1;
 again:
 	if (multi_ret) {
@@ -2687,7 +2687,7 @@ again:
 		mirror_num = 0;
 
 	/* if our multi bio struct is too small, back off and try again */
-	if (rw & (1 << BIO_RW)) {
+	if (rw & REQ_WRITE) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			stripes_required = map->num_stripes;
@@ -2697,7 +2697,7 @@ again:
 			max_errors = 1;
 		}
 	}
-	if (multi_ret && (rw & (1 << BIO_RW)) &&
+	if (multi_ret && (rw & REQ_WRITE) &&
 	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
@@ -2733,7 +2733,7 @@ again:
 	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (unplug_page || (rw & (1 << BIO_RW)))
+		if (unplug_page || (rw & REQ_WRITE))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
@@ -2744,7 +2744,7 @@ again:
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw & (1 << BIO_RW))
+		if (rw & REQ_WRITE)
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
@@ -2755,7 +2755,7 @@ again:
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (unplug_page || (rw & (1 << BIO_RW)))
+		if (unplug_page || (rw & REQ_WRITE))
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
@@ -2945,7 +2945,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	struct btrfs_pending_bios *pending_bios;
 
 	/* don't bother with additional async steps for reads, right now */
-	if (!(rw & (1 << BIO_RW))) {
+	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
 		submit_bio(rw, bio);
 		bio_put(bio);
@@ -2964,7 +2964,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
-	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
+	if (bio->bi_rw & REQ_SYNC)
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 4337cad7777b..e2732203fa93 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -599,7 +599,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 			} else {
 				bio = master_dev->bio;
 				/* FIXME: bio_set_dir() */
-				bio->bi_rw |= (1 << BIO_RW);
+				bio->bi_rw |= REQ_WRITE;
 			}
 
 			osd_req_write(or, &ios->obj, per_dev->offset, bio,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index efc3539ac5a1..cde1248a6225 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -595,7 +595,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		goto skip_barrier;
 	get_bh(bh);
-	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+	submit_bh(WRITE_BARRIER | REQ_META, bh);
 	wait_on_buffer(bh);
 	if (buffer_eopnotsupp(bh)) {
 		clear_buffer_eopnotsupp(bh);
@@ -605,7 +605,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 		lock_buffer(bh);
 skip_barrier:
 		get_bh(bh);
-		submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
+		submit_bh(WRITE_SYNC | REQ_META, bh);
 		wait_on_buffer(bh);
 	}
 	if (!buffer_uptodate(bh))
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 18176d0b75d7..f3b071f921aa 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -36,8 +36,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 {
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
-	int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC_PLUG : WRITE));
+	int write_op = REQ_META |
+		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!page_has_buffers(page));
@@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	}
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-	submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
+	submit_bh(READ_SYNC | REQ_META, bh);
 	if (!(flags & DIO_WAIT))
 		return 0;
 
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (buffer_uptodate(first_bh))
 		goto out;
 	if (!buffer_locked(first_bh))
-		ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh);
+		ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
 
 	dblock++;
 	extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3593b3a7290e..fd4f8946abf5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -275,7 +275,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 
 	bio->bi_end_io = end_bio_io_page;
 	bio->bi_private = page;
-	submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+	submit_bio(READ_SYNC | REQ_META, bio);
 	wait_on_page_locked(page);
 	bio_put(bio);
 	if (!PageUptodate(page)) {
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2e6a2723b8fa..4588fb9e93df 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -508,7 +508,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 		 * Last BIO is always sent through the following
 		 * submission.
 		 */
-		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+		rw |= REQ_SYNC | REQ_UNPLUG;
 		res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 	}
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7fc5606e6ea5..4d379c8250ae 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -138,55 +138,83 @@ struct bio {
 #define BIO_POOL_IDX(bio)	((bio)->bi_flags >> BIO_POOL_OFFSET)	
 
 /*
- * bio bi_rw flags
- *
- * bit 0 -- data direction
- *	If not set, bio is a read from device. If set, it's a write to device.
- * bit 1 -- fail fast device errors
- * bit 2 -- fail fast transport errors
- * bit 3 -- fail fast driver errors
- * bit 4 -- rw-ahead when set
- * bit 5 -- barrier
- *	Insert a serialization point in the IO queue, forcing previously
- *	submitted IO to be completed before this one is issued.
- * bit 6 -- synchronous I/O hint.
- * bit 7 -- Unplug the device immediately after submitting this bio.
- * bit 8 -- metadata request
- *	Used for tracing to differentiate metadata and data IO. May also
- *	get some preferential treatment in the IO scheduler
- * bit 9 -- discard sectors
- *	Informs the lower level device that this range of sectors is no longer
- *	used by the file system and may thus be freed by the device. Used
- *	for flash based storage.
- *	Don't want driver retries for any fast fail whatever the reason.
- * bit 10 -- Tell the IO scheduler not to wait for more requests after this
-	one has been submitted, even if it is a SYNC request.
+ * Request flags.  For use in the cmd_flags field of struct request, and in
+ * bi_rw of struct bio.  Note that some flags are only valid in either one.
  */
-enum bio_rw_flags {
-	BIO_RW,
-	BIO_RW_FAILFAST_DEV,
-	BIO_RW_FAILFAST_TRANSPORT,
-	BIO_RW_FAILFAST_DRIVER,
-	/* above flags must match REQ_* */
-	BIO_RW_AHEAD,
-	BIO_RW_BARRIER,
-	BIO_RW_SYNCIO,
-	BIO_RW_UNPLUG,
-	BIO_RW_META,
-	BIO_RW_DISCARD,
-	BIO_RW_NOIDLE,
+enum rq_flag_bits {
+	/* common flags */
+	__REQ_WRITE,		/* not set, read. set, write */
+	__REQ_FAILFAST_DEV,	/* no driver retries of device errors */
+	__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
+	__REQ_FAILFAST_DRIVER,	/* no driver retries of driver errors */
+
+	__REQ_HARDBARRIER,	/* may not be passed by drive either */
+	__REQ_SYNC,		/* request is sync (sync write or read) */
+	__REQ_META,		/* metadata io request */
+	__REQ_DISCARD,		/* request to discard sectors */
+	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
+
+	/* bio only flags */
+	__REQ_UNPLUG,		/* unplug the immediately after submission */
+	__REQ_RAHEAD,		/* read ahead, can fail anytime */
+
+	/* request only flags */
+	__REQ_SORTED,		/* elevator knows about this request */
+	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
+	__REQ_FUA,		/* forced unit access */
+	__REQ_NOMERGE,		/* don't touch this for merging */
+	__REQ_STARTED,		/* drive already may have started this one */
+	__REQ_DONTPREP,		/* don't call prep for this one */
+	__REQ_QUEUED,		/* uses queueing */
+	__REQ_ELVPRIV,		/* elevator private data attached */
+	__REQ_FAILED,		/* set if the request failed */
+	__REQ_QUIET,		/* don't worry about errors */
+	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
+	__REQ_ORDERED_COLOR,	/* is before or after barrier */
+	__REQ_ALLOCED,		/* request came from our alloc pool */
+	__REQ_COPY_USER,	/* contains copies of user pages */
+	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
+	__REQ_IO_STAT,		/* account I/O stat */
+	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
+	__REQ_NR_BITS,		/* stops here */
 };
 
-/*
- * First four bits must match between bio->bi_rw and rq->cmd_flags, make
- * that explicit here.
- */
-#define BIO_RW_RQ_MASK		0xf
-
-static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
-{
-	return (bio->bi_rw & (1 << flag)) != 0;
-}
+#define REQ_WRITE		(1 << __REQ_WRITE)
+#define REQ_FAILFAST_DEV	(1 << __REQ_FAILFAST_DEV)
+#define REQ_FAILFAST_TRANSPORT	(1 << __REQ_FAILFAST_TRANSPORT)
+#define REQ_FAILFAST_DRIVER	(1 << __REQ_FAILFAST_DRIVER)
+#define REQ_HARDBARRIER		(1 << __REQ_HARDBARRIER)
+#define REQ_SYNC		(1 << __REQ_SYNC)
+#define REQ_META		(1 << __REQ_META)
+#define REQ_DISCARD		(1 << __REQ_DISCARD)
+#define REQ_NOIDLE		(1 << __REQ_NOIDLE)
+
+#define REQ_FAILFAST_MASK \
+	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
+#define REQ_COMMON_MASK \
+	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
+	 REQ_META| REQ_DISCARD | REQ_NOIDLE)
+
+#define REQ_UNPLUG		(1 << __REQ_UNPLUG)
+#define REQ_RAHEAD		(1 << __REQ_RAHEAD)
+
+#define REQ_SORTED		(1 << __REQ_SORTED)
+#define REQ_SOFTBARRIER		(1 << __REQ_SOFTBARRIER)
+#define REQ_FUA			(1 << __REQ_FUA)
+#define REQ_NOMERGE		(1 << __REQ_NOMERGE)
+#define REQ_STARTED		(1 << __REQ_STARTED)
+#define REQ_DONTPREP		(1 << __REQ_DONTPREP)
+#define REQ_QUEUED		(1 << __REQ_QUEUED)
+#define REQ_ELVPRIV		(1 << __REQ_ELVPRIV)
+#define REQ_FAILED		(1 << __REQ_FAILED)
+#define REQ_QUIET		(1 << __REQ_QUIET)
+#define REQ_PREEMPT		(1 << __REQ_PREEMPT)
+#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
+#define REQ_ALLOCED		(1 << __REQ_ALLOCED)
+#define REQ_COPY_USER		(1 << __REQ_COPY_USER)
+#define REQ_INTEGRITY		(1 << __REQ_INTEGRITY)
+#define REQ_IO_STAT		(1 << __REQ_IO_STAT)
+#define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -211,7 +239,10 @@ static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
 #define bio_offset(bio)		bio_iovec((bio))->bv_offset
 #define bio_segments(bio)	((bio)->bi_vcnt - (bio)->bi_idx)
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
-#define bio_empty_barrier(bio)	(bio_rw_flagged(bio, BIO_RW_BARRIER) && !bio_has_data(bio) && !bio_rw_flagged(bio, BIO_RW_DISCARD))
+#define bio_empty_barrier(bio) \
+	((bio->bi_rw & REQ_HARDBARRIER) && \
+	 !bio_has_data(bio) && \
+	 !(bio->bi_rw & REQ_DISCARD))
 
 static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3ecd28ef9ba4..3fc0f5908619 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -84,70 +84,6 @@ enum {
 	REQ_LB_OP_FLUSH = 0x41,		/* flush request */
 };
 
-/*
- * request type modified bits. first four bits match BIO_RW* bits, important
- */
-enum rq_flag_bits {
-	__REQ_RW,		/* not set, read. set, write */
-	__REQ_FAILFAST_DEV,	/* no driver retries of device errors */
-	__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
-	__REQ_FAILFAST_DRIVER,	/* no driver retries of driver errors */
-	/* above flags must match BIO_RW_* */
-	__REQ_DISCARD,		/* request to discard sectors */
-	__REQ_SORTED,		/* elevator knows about this request */
-	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
-	__REQ_HARDBARRIER,	/* may not be passed by drive either */
-	__REQ_FUA,		/* forced unit access */
-	__REQ_NOMERGE,		/* don't touch this for merging */
-	__REQ_STARTED,		/* drive already may have started this one */
-	__REQ_DONTPREP,		/* don't call prep for this one */
-	__REQ_QUEUED,		/* uses queueing */
-	__REQ_ELVPRIV,		/* elevator private data attached */
-	__REQ_FAILED,		/* set if the request failed */
-	__REQ_QUIET,		/* don't worry about errors */
-	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
-	__REQ_ORDERED_COLOR,	/* is before or after barrier */
-	__REQ_RW_SYNC,		/* request is sync (sync write or read) */
-	__REQ_ALLOCED,		/* request came from our alloc pool */
-	__REQ_RW_META,		/* metadata io request */
-	__REQ_COPY_USER,	/* contains copies of user pages */
-	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
-	__REQ_NOIDLE,		/* Don't anticipate more IO after this one */
-	__REQ_IO_STAT,		/* account I/O stat */
-	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
-	__REQ_NR_BITS,		/* stops here */
-};
-
-#define REQ_RW		(1 << __REQ_RW)
-#define REQ_FAILFAST_DEV	(1 << __REQ_FAILFAST_DEV)
-#define REQ_FAILFAST_TRANSPORT	(1 << __REQ_FAILFAST_TRANSPORT)
-#define REQ_FAILFAST_DRIVER	(1 << __REQ_FAILFAST_DRIVER)
-#define REQ_DISCARD	(1 << __REQ_DISCARD)
-#define REQ_SORTED	(1 << __REQ_SORTED)
-#define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
-#define REQ_HARDBARRIER	(1 << __REQ_HARDBARRIER)
-#define REQ_FUA		(1 << __REQ_FUA)
-#define REQ_NOMERGE	(1 << __REQ_NOMERGE)
-#define REQ_STARTED	(1 << __REQ_STARTED)
-#define REQ_DONTPREP	(1 << __REQ_DONTPREP)
-#define REQ_QUEUED	(1 << __REQ_QUEUED)
-#define REQ_ELVPRIV	(1 << __REQ_ELVPRIV)
-#define REQ_FAILED	(1 << __REQ_FAILED)
-#define REQ_QUIET	(1 << __REQ_QUIET)
-#define REQ_PREEMPT	(1 << __REQ_PREEMPT)
-#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
-#define REQ_RW_SYNC	(1 << __REQ_RW_SYNC)
-#define REQ_ALLOCED	(1 << __REQ_ALLOCED)
-#define REQ_RW_META	(1 << __REQ_RW_META)
-#define REQ_COPY_USER	(1 << __REQ_COPY_USER)
-#define REQ_INTEGRITY	(1 << __REQ_INTEGRITY)
-#define REQ_NOIDLE	(1 << __REQ_NOIDLE)
-#define REQ_IO_STAT	(1 << __REQ_IO_STAT)
-#define REQ_MIXED_MERGE	(1 << __REQ_MIXED_MERGE)
-
-#define REQ_FAILFAST_MASK	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | \
-				 REQ_FAILFAST_DRIVER)
-
 #define BLK_MAX_CDB	16
 
 /*
@@ -631,7 +567,7 @@ enum {
  */
 static inline bool rw_is_sync(unsigned int rw_flags)
 {
-	return !(rw_flags & REQ_RW) || (rw_flags & REQ_RW_SYNC);
+	return !(rw_flags & REQ_WRITE) || (rw_flags & REQ_SYNC);
 }
 
 static inline bool rq_is_sync(struct request *rq)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 598878831497..c5c92943c767 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -144,29 +144,31 @@ struct inodes_stat_t {
  *			of this IO.
  *
  */
-#define RW_MASK		1
-#define RWA_MASK	2
-#define READ 0
-#define WRITE 1
-#define READA 2		/* read-ahead  - don't block if no resources */
-#define SWRITE 3	/* for ll_rw_block() - wait for buffer lock */
-#define READ_SYNC	(READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
-#define READ_META	(READ | (1 << BIO_RW_META))
-#define WRITE_SYNC_PLUG	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
-#define WRITE_SYNC	(WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
-#define WRITE_ODIRECT_PLUG	(WRITE | (1 << BIO_RW_SYNCIO))
-#define WRITE_META	(WRITE | (1 << BIO_RW_META))
-#define SWRITE_SYNC_PLUG	\
-			(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
-#define SWRITE_SYNC	(SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
-#define WRITE_BARRIER	(WRITE_SYNC | (1 << BIO_RW_BARRIER))
+#define RW_MASK			1
+#define RWA_MASK		2
+
+#define READ			0
+#define WRITE			1
+#define READA			2 /* readahead  - don't block if no resources */
+#define SWRITE			3 /* for ll_rw_block() - wait for buffer lock */
+
+#define READ_SYNC		(READ | REQ_SYNC | REQ_UNPLUG)
+#define READ_META		(READ | REQ_META)
+#define WRITE_SYNC_PLUG		(WRITE | REQ_SYNC | REQ_NOIDLE)
+#define WRITE_SYNC		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
+#define WRITE_ODIRECT_PLUG	(WRITE | REQ_SYNC)
+#define WRITE_META		(WRITE | REQ_META)
+#define WRITE_BARRIER		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+				 REQ_HARDBARRIER)
+#define SWRITE_SYNC_PLUG	(SWRITE | REQ_SYNC | REQ_NOIDLE)
+#define SWRITE_SYNC		(SWRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
 
 /*
  * These aren't really reads or writes, they pass down information about
  * parts of device that are now unused by the file system.
  */
-#define DISCARD_NOBARRIER (WRITE | (1 << BIO_RW_DISCARD))
-#define DISCARD_BARRIER (DISCARD_NOBARRIER | (1 << BIO_RW_BARRIER))
+#define DISCARD_NOBARRIER	(WRITE | REQ_DISCARD)
+#define DISCARD_BARRIER		(WRITE | REQ_DISCARD | REQ_HARDBARRIER)
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 97024fd40cd5..83bbc7c02df9 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
 static int submit(int rw, struct block_device *bdev, sector_t sector,
 		struct page *page, struct bio **bio_chain)
 {
-	const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+	const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
 	struct bio *bio;
 
 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 4f149944cb89..3b4a695051b6 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
 				 BLK_TC_ACT(BLK_TC_WRITE) };
 
+#define BLK_TC_HARDBARRIER	BLK_TC_BARRIER
+#define BLK_TC_RAHEAD		BLK_TC_AHEAD
+
 /* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
-	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
+#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
 
 /*
  * The worker for the various blk_add_trace*() types. Fills out a
@@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		return;
 
 	what |= ddir_act[rw & WRITE];
-	what |= MASK_TC_BIT(rw, BARRIER);
-	what |= MASK_TC_BIT(rw, SYNCIO);
-	what |= MASK_TC_BIT(rw, AHEAD);
+	what |= MASK_TC_BIT(rw, HARDBARRIER);
+	what |= MASK_TC_BIT(rw, SYNC);
+	what |= MASK_TC_BIT(rw, RAHEAD);
 	what |= MASK_TC_BIT(rw, META);
 	what |= MASK_TC_BIT(rw, DISCARD);
 
@@ -662,7 +665,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 		return;
 
 	if (rq->cmd_flags & REQ_DISCARD)
-		rw |= (1 << BIO_RW_DISCARD);
+		rw |= REQ_DISCARD;
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
@@ -1755,20 +1758,20 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 
 	if (rw & WRITE)
 		rwbs[i++] = 'W';
-	else if (rw & 1 << BIO_RW_DISCARD)
+	else if (rw & REQ_DISCARD)
 		rwbs[i++] = 'D';
 	else if (bytes)
 		rwbs[i++] = 'R';
 	else
 		rwbs[i++] = 'N';
 
-	if (rw & 1 << BIO_RW_AHEAD)
+	if (rw & REQ_RAHEAD)
 		rwbs[i++] = 'A';
-	if (rw & 1 << BIO_RW_BARRIER)
+	if (rw & REQ_HARDBARRIER)
 		rwbs[i++] = 'B';
-	if (rw & 1 << BIO_RW_SYNCIO)
+	if (rw & REQ_SYNC)
 		rwbs[i++] = 'S';
-	if (rw & 1 << BIO_RW_META)
+	if (rw & REQ_META)
 		rwbs[i++] = 'M';
 
 	rwbs[i] = '\0';
@@ -1780,7 +1783,7 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
 	int bytes;
 
 	if (rq->cmd_flags & REQ_DISCARD)
-		rw |= (1 << BIO_RW_DISCARD);
+		rw |= REQ_DISCARD;
 
 	bytes = blk_rq_bytes(rq);
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 31a3b962230a..2dee975bf469 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		goto out;
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+		rw |= REQ_SYNC | REQ_UNPLUG;
 	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);
-- 
cgit v1.2.3-59-g8ed1b


From c1955ce32fdb0877b7a1b22feb2669358f65be76 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 19 Jun 2010 23:08:06 +0200
Subject: writeback: remove wb_list

The wb_list member of struct backing_device_info always has exactly one
element.  Just use the direct bdi->wb pointer instead and simplify some
code.

Also remove bdi_task_init which is now trivial to prepare for the next
patch.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c           |  4 +--
 include/linux/backing-dev.h |  5 +--
 mm/backing-dev.c            | 83 ++++++++++++++++-----------------------------
 3 files changed, 32 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d5be1693ac93..d67989b8ba44 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -73,9 +73,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 	 * If the default thread isn't there, make sure we add it. When
 	 * it gets created and wakes up, we'll run this work.
 	 */
-	if (unlikely(list_empty_careful(&bdi->wb_list)))
+	if (unlikely(!bdi->wb.task)) {
 		wake_up_process(default_backing_dev_info.wb.task);
-	else {
+	} else {
 		struct bdi_writeback *wb = &bdi->wb;
 
 		if (wb->task)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e9aec0d099df..50f146146169 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -45,8 +45,6 @@ enum bdi_stat_item {
 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
 struct bdi_writeback {
-	struct list_head list;			/* hangs off the bdi */
-
 	struct backing_dev_info *bdi;		/* our parent bdi */
 	unsigned int nr;
 
@@ -80,8 +78,7 @@ struct backing_dev_info {
 	unsigned int max_ratio, max_prop_frac;
 
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
-	spinlock_t wb_lock;	  /* protects update side of wb_list */
-	struct list_head wb_list; /* the flusher threads hanging off this bdi */
+	spinlock_t wb_lock;	  /* protects work_list */
 
 	struct list_head work_list;
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 123bcef13e51..6c2a09c8922c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -65,28 +65,21 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
-	struct bdi_writeback *wb;
+	struct bdi_writeback *wb = &bdi->wb;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
 	unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
 	struct inode *inode;
 
-	/*
-	 * inode lock is enough here, the bdi->wb_list is protected by
-	 * RCU on the reader side
-	 */
 	nr_wb = nr_dirty = nr_io = nr_more_io = 0;
 	spin_lock(&inode_lock);
-	list_for_each_entry(wb, &bdi->wb_list, list) {
-		nr_wb++;
-		list_for_each_entry(inode, &wb->b_dirty, i_list)
-			nr_dirty++;
-		list_for_each_entry(inode, &wb->b_io, i_list)
-			nr_io++;
-		list_for_each_entry(inode, &wb->b_more_io, i_list)
-			nr_more_io++;
-	}
+	list_for_each_entry(inode, &wb->b_dirty, i_list)
+		nr_dirty++;
+	list_for_each_entry(inode, &wb->b_io, i_list)
+		nr_io++;
+	list_for_each_entry(inode, &wb->b_more_io, i_list)
+		nr_more_io++;
 	spin_unlock(&inode_lock);
 
 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
@@ -98,19 +91,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "BdiDirtyThresh:   %8lu kB\n"
 		   "DirtyThresh:      %8lu kB\n"
 		   "BackgroundThresh: %8lu kB\n"
-		   "WritebackThreads: %8lu\n"
 		   "b_dirty:          %8lu\n"
 		   "b_io:             %8lu\n"
 		   "b_more_io:        %8lu\n"
 		   "bdi_list:         %8u\n"
-		   "state:            %8lx\n"
-		   "wb_list:          %8u\n",
+		   "state:            %8lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
 		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
 		   K(bdi_thresh), K(dirty_thresh),
-		   K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
-		   !list_empty(&bdi->bdi_list), bdi->state,
-		   !list_empty(&bdi->wb_list));
+		   K(background_thresh), nr_dirty, nr_io, nr_more_io,
+		   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
 	return 0;
@@ -270,24 +260,6 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_more_io);
 }
 
-static void bdi_task_init(struct backing_dev_info *bdi,
-			  struct bdi_writeback *wb)
-{
-	struct task_struct *tsk = current;
-
-	spin_lock(&bdi->wb_lock);
-	list_add_tail_rcu(&wb->list, &bdi->wb_list);
-	spin_unlock(&bdi->wb_lock);
-
-	tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
-	set_freezable();
-
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(tsk, 0);
-}
-
 static int bdi_start_fn(void *ptr)
 {
 	struct bdi_writeback *wb = ptr;
@@ -301,7 +273,13 @@ static int bdi_start_fn(void *ptr)
 	list_add_rcu(&bdi->bdi_list, &bdi_list);
 	spin_unlock_bh(&bdi_lock);
 
-	bdi_task_init(bdi, wb);
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
 
 	/*
 	 * Clear pending bit and wakeup anybody waiting to tear us down
@@ -312,12 +290,7 @@ static int bdi_start_fn(void *ptr)
 
 	ret = bdi_writeback_task(wb);
 
-	/*
-	 * Remove us from the list
-	 */
-	spin_lock(&bdi->wb_lock);
-	list_del_rcu(&wb->list);
-	spin_unlock(&bdi->wb_lock);
+	wb->task = NULL;
 
 	/*
 	 * Flush any work that raced with us exiting. No new work
@@ -326,7 +299,6 @@ static int bdi_start_fn(void *ptr)
 	if (!list_empty(&bdi->work_list))
 		wb_do_writeback(wb, 1);
 
-	wb->task = NULL;
 	return ret;
 }
 
@@ -391,7 +363,13 @@ static int bdi_forker_task(void *ptr)
 {
 	struct bdi_writeback *me = ptr;
 
-	bdi_task_init(me->bdi, me);
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
 
 	for (;;) {
 		struct backing_dev_info *bdi, *tmp;
@@ -598,8 +576,6 @@ EXPORT_SYMBOL(bdi_register_dev);
  */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
-	struct bdi_writeback *wb;
-
 	if (!bdi_cap_writeback_dirty(bdi))
 		return;
 
@@ -615,14 +591,14 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	bdi_remove_from_list(bdi);
 
 	/*
-	 * Finally, kill the kernel threads. We don't need to be RCU
+	 * Finally, kill the kernel thread. We don't need to be RCU
 	 * safe anymore, since the bdi is gone from visibility. Force
 	 * unfreeze of the thread before calling kthread_stop(), otherwise
 	 * it would never exet if it is currently stuck in the refrigerator.
 	 */
-	list_for_each_entry(wb, &bdi->wb_list, list) {
-		thaw_process(wb->task);
-		kthread_stop(wb->task);
+	if (bdi->wb.task) {
+		thaw_process(bdi->wb.task);
+		kthread_stop(bdi->wb.task);
 	}
 }
 
@@ -667,7 +643,6 @@ int bdi_init(struct backing_dev_info *bdi)
 	spin_lock_init(&bdi->wb_lock);
 	INIT_RCU_HEAD(&bdi->rcu_head);
 	INIT_LIST_HEAD(&bdi->bdi_list);
-	INIT_LIST_HEAD(&bdi->wb_list);
 	INIT_LIST_HEAD(&bdi->work_list);
 
 	bdi_wb_init(&bdi->wb, bdi);
-- 
cgit v1.2.3-59-g8ed1b


From 082439004b31adc146e96e5f1c574dd2b57dcd93 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 19 Jun 2010 23:08:22 +0200
Subject: writeback: merge bdi_writeback_task and bdi_start_fn

Move all code for the writeback thread into fs/fs-writeback.c instead of
splitting it over two functions in two files.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c           | 35 ++++++++++++++++++++++++++++++++++-
 include/linux/backing-dev.h |  2 +-
 mm/backing-dev.c            | 44 +-------------------------------------------
 3 files changed, 36 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d67989b8ba44..c8471b3ddccf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -775,12 +775,36 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * wakes up periodically and does kupdated style flushing.
  */
-int bdi_writeback_task(struct bdi_writeback *wb)
+int bdi_writeback_thread(void *data)
 {
+	struct bdi_writeback *wb = data;
+	struct backing_dev_info *bdi = wb->bdi;
 	unsigned long last_active = jiffies;
 	unsigned long wait_jiffies = -1UL;
 	long pages_written;
 
+	/*
+	 * Add us to the active bdi_list
+	 */
+	spin_lock_bh(&bdi_lock);
+	list_add_rcu(&bdi->bdi_list, &bdi_list);
+	spin_unlock_bh(&bdi_lock);
+
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
+
+	/*
+	 * Clear pending bit and wakeup anybody waiting to tear us down
+	 */
+	clear_bit(BDI_pending, &bdi->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&bdi->state, BDI_pending);
+
 	while (!kthread_should_stop()) {
 		pages_written = wb_do_writeback(wb, 0);
 
@@ -813,9 +837,18 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 		try_to_freeze();
 	}
 
+	wb->task = NULL;
+
+	/*
+	 * Flush any work that raced with us exiting. No new work
+	 * will be added, since this bdi isn't discoverable anymore.
+	 */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
 	return 0;
 }
 
+
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 50f146146169..e536f3a74e60 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -102,7 +102,7 @@ void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
-int bdi_writeback_task(struct bdi_writeback *wb);
+int bdi_writeback_thread(void *data);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6c2a09c8922c..bceac647e4d1 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -260,48 +260,6 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_more_io);
 }
 
-static int bdi_start_fn(void *ptr)
-{
-	struct bdi_writeback *wb = ptr;
-	struct backing_dev_info *bdi = wb->bdi;
-	int ret;
-
-	/*
-	 * Add us to the active bdi_list
-	 */
-	spin_lock_bh(&bdi_lock);
-	list_add_rcu(&bdi->bdi_list, &bdi_list);
-	spin_unlock_bh(&bdi_lock);
-
-	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
-	set_freezable();
-
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(current, 0);
-
-	/*
-	 * Clear pending bit and wakeup anybody waiting to tear us down
-	 */
-	clear_bit(BDI_pending, &bdi->state);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&bdi->state, BDI_pending);
-
-	ret = bdi_writeback_task(wb);
-
-	wb->task = NULL;
-
-	/*
-	 * Flush any work that raced with us exiting. No new work
-	 * will be added, since this bdi isn't discoverable anymore.
-	 */
-	if (!list_empty(&bdi->work_list))
-		wb_do_writeback(wb, 1);
-
-	return ret;
-}
-
 int bdi_has_dirty_io(struct backing_dev_info *bdi)
 {
 	return wb_has_dirty_io(&bdi->wb);
@@ -425,7 +383,7 @@ static int bdi_forker_task(void *ptr)
 		spin_unlock_bh(&bdi_lock);
 
 		wb = &bdi->wb;
-		wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
+		wb->task = kthread_run(bdi_writeback_thread, wb, "flush-%s",
 					dev_name(bdi->dev));
 		/*
 		 * If task creation fails, then readd the bdi to
-- 
cgit v1.2.3-59-g8ed1b


From 66ac0280197981f88774e74b60c8e5f9f07c1dba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 18 Jun 2010 16:59:42 +0200
Subject: block: don't allocate a payload for discard request

Allocating a fixed payload for discard requests always was a horrible hack,
and it's not coming to byte us when adding support for discard in DM/MD.

So change the code to leave the allocation of a payload to the lowlevel
driver.  Unfortunately that means we'll need another hack, which allows
us to update the various block layer length fields indicating that we
have a payload.  Instead of hiding this in sd.c, which we already partially
do for UNMAP support add a documented helper in the core block layer for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       | 32 +++++++++++++++++++++++++++++++
 block/blk-lib.c        | 33 ++++++--------------------------
 drivers/scsi/sd.c      | 52 +++++++++++++++++++++++++++++++++-----------------
 include/linux/blkdev.h |  2 ++
 4 files changed, 74 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 66c3cfe94d0a..3531d8e1da04 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1135,6 +1135,38 @@ void blk_put_request(struct request *req)
 }
 EXPORT_SYMBOL(blk_put_request);
 
+/**
+ * blk_add_request_payload - add a payload to a request
+ * @rq: request to update
+ * @page: page backing the payload
+ * @len: length of the payload.
+ *
+ * This allows to later add a payload to an already submitted request by
+ * a block driver.  The driver needs to take care of freeing the payload
+ * itself.
+ *
+ * Note that this is a quite horrible hack and nothing but handling of
+ * discard requests should ever use it.
+ */
+void blk_add_request_payload(struct request *rq, struct page *page,
+		unsigned int len)
+{
+	struct bio *bio = rq->bio;
+
+	bio->bi_io_vec->bv_page = page;
+	bio->bi_io_vec->bv_offset = 0;
+	bio->bi_io_vec->bv_len = len;
+
+	bio->bi_size = len;
+	bio->bi_vcnt = 1;
+	bio->bi_phys_segments = 1;
+
+	rq->__data_len = rq->resid_len = len;
+	rq->nr_phys_segments = 1;
+	rq->buffer = bio_data(bio);
+}
+EXPORT_SYMBOL_GPL(blk_add_request_payload);
+
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->cpu = bio->bi_comp_cpu;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index d0216b9f22d4..e16185b0d8e1 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -19,7 +19,6 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
 
 	if (bio->bi_private)
 		complete(bio->bi_private);
-	__free_page(bio_page(bio));
 
 	bio_put(bio);
 }
@@ -43,7 +42,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	int type = flags & BLKDEV_IFL_BARRIER ?
 		DISCARD_BARRIER : DISCARD_NOBARRIER;
 	struct bio *bio;
-	struct page *page;
 	int ret = 0;
 
 	if (!q)
@@ -53,35 +51,21 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		return -EOPNOTSUPP;
 
 	while (nr_sects && !ret) {
-		unsigned int sector_size = q->limits.logical_block_size;
 		unsigned int max_discard_sectors =
 			min(q->limits.max_discard_sectors, UINT_MAX >> 9);
 
 		bio = bio_alloc(gfp_mask, 1);
-		if (!bio)
-			goto out;
+		if (!bio) {
+			ret = -ENOMEM;
+			break;
+		}
+
 		bio->bi_sector = sector;
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
 		if (flags & BLKDEV_IFL_WAIT)
 			bio->bi_private = &wait;
 
-		/*
-		 * Add a zeroed one-sector payload as that's what
-		 * our current implementations need.  If we'll ever need
-		 * more the interface will need revisiting.
-		 */
-		page = alloc_page(gfp_mask | __GFP_ZERO);
-		if (!page)
-			goto out_free_bio;
-		if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
-			goto out_free_page;
-
-		/*
-		 * And override the bio size - the way discard works we
-		 * touch many more blocks on disk than the actual payload
-		 * length.
-		 */
 		if (nr_sects > max_discard_sectors) {
 			bio->bi_size = max_discard_sectors << 9;
 			nr_sects -= max_discard_sectors;
@@ -103,13 +87,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			ret = -EIO;
 		bio_put(bio);
 	}
+
 	return ret;
-out_free_page:
-	__free_page(page);
-out_free_bio:
-	bio_put(bio);
-out:
-	return -ENOMEM;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a3fdf4dc59da..86da819c70eb 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -411,22 +411,25 @@ static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif)
 }
 
 /**
- * sd_prepare_discard - unmap blocks on thinly provisioned device
+ * scsi_setup_discard_cmnd - unmap blocks on thinly provisioned device
+ * @sdp: scsi device to operate one
  * @rq: Request to prepare
  *
  * Will issue either UNMAP or WRITE SAME(16) depending on preference
  * indicated by target device.
  **/
-static int sd_prepare_discard(struct request *rq)
+static int scsi_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)
 {
 	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
 	struct bio *bio = rq->bio;
 	sector_t sector = bio->bi_sector;
-	unsigned int num = bio_sectors(bio);
+	unsigned int nr_sectors = bio_sectors(bio);
+	unsigned int len;
+	struct page *page;
 
 	if (sdkp->device->sector_size == 4096) {
 		sector >>= 3;
-		num >>= 3;
+		nr_sectors >>= 3;
 	}
 
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
@@ -434,31 +437,35 @@ static int sd_prepare_discard(struct request *rq)
 
 	memset(rq->cmd, 0, rq->cmd_len);
 
+	page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+	if (!page)
+		return BLKPREP_DEFER;
+
 	if (sdkp->unmap) {
-		char *buf = kmap_atomic(bio_page(bio), KM_USER0);
+		char *buf = page_address(page);
 
+		rq->cmd_len = 10;
 		rq->cmd[0] = UNMAP;
 		rq->cmd[8] = 24;
-		rq->cmd_len = 10;
-
-		/* Ensure that data length matches payload */
-		rq->__data_len = bio->bi_size = bio->bi_io_vec->bv_len = 24;
 
 		put_unaligned_be16(6 + 16, &buf[0]);
 		put_unaligned_be16(16, &buf[2]);
 		put_unaligned_be64(sector, &buf[8]);
-		put_unaligned_be32(num, &buf[16]);
+		put_unaligned_be32(nr_sectors, &buf[16]);
 
-		kunmap_atomic(buf, KM_USER0);
+		len = 24;
 	} else {
+		rq->cmd_len = 16;
 		rq->cmd[0] = WRITE_SAME_16;
 		rq->cmd[1] = 0x8; /* UNMAP */
 		put_unaligned_be64(sector, &rq->cmd[2]);
-		put_unaligned_be32(num, &rq->cmd[10]);
-		rq->cmd_len = 16;
+		put_unaligned_be32(nr_sectors, &rq->cmd[10]);
+
+		len = sdkp->device->sector_size;
 	}
 
-	return BLKPREP_OK;
+	blk_add_request_payload(rq, page, len);
+	return scsi_setup_blk_pc_cmnd(sdp, rq);
 }
 
 /**
@@ -485,10 +492,10 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	 * Discard request come in as REQ_TYPE_FS but we turn them into
 	 * block PC requests to make life easier.
 	 */
-	if (rq->cmd_flags & REQ_DISCARD)
-		ret = sd_prepare_discard(rq);
-
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+	if (rq->cmd_flags & REQ_DISCARD) {
+		ret = scsi_setup_discard_cmnd(sdp, rq);
+		goto out;
+	} else if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		ret = scsi_setup_blk_pc_cmnd(sdp, rq);
 		goto out;
 	} else if (rq->cmd_type != REQ_TYPE_FS) {
@@ -1163,6 +1170,15 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 	int sense_valid = 0;
 	int sense_deferred = 0;
 
+	/*
+	 * If this is a discard request that originated from the kernel
+	 * we need to free our payload here.  Note that we need to check
+	 * the request flag as the normal payload rules apply for
+	 * pass-through UNMAP / WRITE SAME requests.
+	 */
+	if (SCpnt->request->cmd_flags & REQ_DISCARD)
+		__free_page(bio_page(SCpnt->request->bio));
+
 	if (result) {
 		sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr);
 		if (sense_valid)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3fc0f5908619..204fbe22354d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -705,6 +705,8 @@ extern struct request *blk_make_request(struct request_queue *, struct bio *,
 					gfp_t);
 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
+extern void blk_add_request_payload(struct request *rq, struct page *page,
+		unsigned int len);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-- 
cgit v1.2.3-59-g8ed1b


From 1676effca4cd2a6b32e6e8e0ecaa91522dfda6fa Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 21 Jun 2010 11:02:48 +0200
Subject: gcc-4.6: fs: fix unused but set warnings

No real bugs I believe, just some dead code, and some
shut up code.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/splice.c           | 2 --
 include/linux/audit.h | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/splice.c b/fs/splice.c
index efdbfece9932..ec11c52d646d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -597,7 +597,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	struct page *pages[PIPE_DEF_BUFFERS];
 	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
-	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
 	int error;
@@ -621,7 +620,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 			goto shrink_ret;
 	}
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index f391d45c8aea..e24afabc548f 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -544,7 +544,7 @@ extern int audit_signals;
 #define audit_putname(n) do { ; } while (0)
 #define __audit_inode(n,d) do { ; } while (0)
 #define __audit_inode_child(i,p) do { ; } while (0)
-#define audit_inode(n,d) do { ; } while (0)
+#define audit_inode(n,d) do { (void)(d); } while (0)
 #define audit_inode_child(i,p) do { ; } while (0)
 #define audit_core_dumps(i) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) (0)
-- 
cgit v1.2.3-59-g8ed1b


From 28018c242a4ec7017bbbf81d2d3952f820a27118 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@suse.de>
Date: Thu, 1 Jul 2010 19:49:17 +0900
Subject: block: implement an unprep function corresponding directly to prep

Reviewed-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c        | 25 +++++++++++++++++++++++++
 block/blk-settings.c    | 17 +++++++++++++++++
 drivers/scsi/scsi_lib.c |  2 +-
 include/linux/blkdev.h  |  4 ++++
 4 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 3c3789492c10..5ab3ac22930c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -608,6 +608,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 
 	q->request_fn		= rfn;
 	q->prep_rq_fn		= NULL;
+	q->unprep_rq_fn		= NULL;
 	q->unplug_fn		= generic_unplug_device;
 	q->queue_flags		= QUEUE_FLAG_DEFAULT;
 	q->queue_lock		= lock;
@@ -2133,6 +2134,26 @@ static bool blk_update_bidi_request(struct request *rq, int error,
 	return false;
 }
 
+/**
+ * blk_unprep_request - unprepare a request
+ * @req:	the request
+ *
+ * This function makes a request ready for complete resubmission (or
+ * completion).  It happens only after all error handling is complete,
+ * so represents the appropriate moment to deallocate any resources
+ * that were allocated to the request in the prep_rq_fn.  The queue
+ * lock is held when calling this.
+ */
+void blk_unprep_request(struct request *req)
+{
+	struct request_queue *q = req->q;
+
+	req->cmd_flags &= ~REQ_DONTPREP;
+	if (q->unprep_rq_fn)
+		q->unprep_rq_fn(q, req);
+}
+EXPORT_SYMBOL_GPL(blk_unprep_request);
+
 /*
  * queue lock must be held
  */
@@ -2148,6 +2169,10 @@ static void blk_finish_request(struct request *req, int error)
 
 	blk_delete_timer(req);
 
+	if (req->cmd_flags & REQ_DONTPREP)
+		blk_unprep_request(req);
+
+
 	blk_account_io_done(req);
 
 	if (req->end_io)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index f5ed5a1187ba..a234f4bf1d6f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -36,6 +36,23 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 }
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
+/**
+ * blk_queue_unprep_rq - set an unprepare_request function for queue
+ * @q:		queue
+ * @ufn:	unprepare_request function
+ *
+ * It's possible for a queue to register an unprepare_request callback
+ * which is invoked before the request is finally completed. The goal
+ * of the function is to deallocate any data that was allocated in the
+ * prepare_request callback.
+ *
+ */
+void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
+{
+	q->unprep_rq_fn = ufn;
+}
+EXPORT_SYMBOL(blk_queue_unprep_rq);
+
 /**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:		queue
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5f1160841b0e..ee836193f531 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -85,7 +85,7 @@ static void scsi_unprep_request(struct request *req)
 {
 	struct scsi_cmnd *cmd = req->special;
 
-	req->cmd_flags &= ~REQ_DONTPREP;
+	blk_unprep_request(req);
 	req->special = NULL;
 
 	scsi_put_command(cmd);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 204fbe22354d..6bba04c7ec48 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -200,6 +200,7 @@ struct request_pm_state
 typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
+typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unplug_fn) (struct request_queue *);
 
 struct bio_vec;
@@ -282,6 +283,7 @@ struct request_queue
 	request_fn_proc		*request_fn;
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
+	unprep_rq_fn		*unprep_rq_fn;
 	unplug_fn		*unplug_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	prepare_flush_fn	*prepare_flush_fn;
@@ -841,6 +843,7 @@ extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
 extern void blk_abort_queue(struct request_queue *);
+extern void blk_unprep_request(struct request *);
 
 /*
  * Access functions for manipulating queue properties
@@ -885,6 +888,7 @@ extern int blk_queue_dma_drain(struct request_queue *q,
 extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
+extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
-- 
cgit v1.2.3-59-g8ed1b


From 8749534fe6826596b71bc409c872b047a8e2755b Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Sat, 3 Jul 2010 17:45:32 +0900
Subject: block: introduce REQ_FLUSH flag

SCSI-ml needs a way to mark a request as flush request in
q->prepare_flush_fn because it needs to identify them later (e.g. in
q->request_fn or prep_rq_fn).

queue_flush sets REQ_HARDBARRIER in rq->cmd_flags however the block
layer also sends normal REQ_TYPE_FS requests with REQ_HARDBARRIER. So
SCSI-ml can't use REQ_HARDBARRIER to identify flush requests.

We could change the block layer to clear REQ_HARDBARRIER bit before
sending non flush requests to the lower layers. However, intorudcing
the new flag looks cleaner (surely easier).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: James Bottomley <James.Bottomley@suse.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Alasdair G Kergon <agk@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c | 2 +-
 include/linux/bio.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 7c6f4a714687..a3482425c507 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -143,7 +143,7 @@ static void queue_flush(struct request_queue *q, unsigned which)
 	}
 
 	blk_rq_init(q, rq);
-	rq->cmd_flags = REQ_HARDBARRIER;
+	rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
 	rq->rq_disk = q->bar_rq.rq_disk;
 	rq->end_io = end_io;
 	q->prepare_flush_fn(q, rq);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4d379c8250ae..f655b54c9ef3 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -174,6 +174,7 @@ enum rq_flag_bits {
 	__REQ_ALLOCED,		/* request came from our alloc pool */
 	__REQ_COPY_USER,	/* contains copies of user pages */
 	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
+	__REQ_FLUSH,		/* request for cache flush */
 	__REQ_IO_STAT,		/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_NR_BITS,		/* stops here */
@@ -213,6 +214,7 @@ enum rq_flag_bits {
 #define REQ_ALLOCED		(1 << __REQ_ALLOCED)
 #define REQ_COPY_USER		(1 << __REQ_COPY_USER)
 #define REQ_INTEGRITY		(1 << __REQ_INTEGRITY)
+#define REQ_FLUSH		(1 << __REQ_FLUSH)
 #define REQ_IO_STAT		(1 << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
 
-- 
cgit v1.2.3-59-g8ed1b


From 00fff26539bfe3fad21c164fc4002d9ede056fb0 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Sat, 3 Jul 2010 17:45:40 +0900
Subject: block: remove q->prepare_flush_fn completely

This removes q->prepare_flush_fn completely (changes the
blk_queue_ordered API).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c          | 7 +------
 drivers/block/brd.c          | 2 +-
 drivers/block/loop.c         | 2 +-
 drivers/block/osdblk.c       | 2 +-
 drivers/block/ps3disk.c      | 2 +-
 drivers/block/virtio_blk.c   | 4 ++--
 drivers/block/xen-blkfront.c | 3 +--
 drivers/ide/ide-disk.c       | 2 +-
 drivers/md/dm.c              | 2 +-
 drivers/mmc/card/queue.c     | 2 +-
 drivers/s390/block/dasd.c    | 2 +-
 drivers/scsi/sd.c            | 2 +-
 include/linux/blkdev.h       | 4 +---
 13 files changed, 14 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 7ce0a32a21fd..eefbde835308 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -13,7 +13,6 @@
  * blk_queue_ordered - does this queue support ordered writes
  * @q:        the request queue
  * @ordered:  one of QUEUE_ORDERED_*
- * @prepare_flush_fn: rq setup helper for cache flush ordered writes
  *
  * Description:
  *   For journalled file systems, doing ordered writes on a commit
@@ -22,8 +21,7 @@
  *   feature should call this function and indicate so.
  *
  **/
-int blk_queue_ordered(struct request_queue *q, unsigned ordered,
-		      prepare_flush_fn *prepare_flush_fn)
+int blk_queue_ordered(struct request_queue *q, unsigned ordered)
 {
 	if (ordered != QUEUE_ORDERED_NONE &&
 	    ordered != QUEUE_ORDERED_DRAIN &&
@@ -38,7 +36,6 @@ int blk_queue_ordered(struct request_queue *q, unsigned ordered,
 
 	q->ordered = ordered;
 	q->next_ordered = ordered;
-	q->prepare_flush_fn = prepare_flush_fn;
 
 	return 0;
 }
@@ -140,8 +137,6 @@ static void queue_flush(struct request_queue *q, unsigned which)
 	rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
 	rq->rq_disk = q->bar_rq.rq_disk;
 	rq->end_io = end_io;
-	if (q->prepare_flush_fn)
-		q->prepare_flush_fn(q, rq);
 
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 1b218c6b6820..1d2c18620f9a 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -479,7 +479,7 @@ static struct brd_device *brd_alloc(int i)
 	if (!brd->brd_queue)
 		goto out_free_dev;
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL);
+	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fedfdb7d3cdf..d285a5481965 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -831,7 +831,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->lo_queue->unplug_fn = loop_unplug;
 
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-		blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN, NULL);
+		blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
 
 	set_capacity(lo->lo_disk, size);
 	bd_set_size(bdev, size << 9);
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 9639565a9a6a..2284b4f05c62 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -439,7 +439,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
 	blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
 
 	blk_queue_prep_rq(q, blk_queue_start_tag);
-	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, NULL);
+	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
 
 	disk->queue = q;
 
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index ab528a480f98..e9da874d0419 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
 	blk_queue_logical_block_size(queue, dev->blk_size);
 
-	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH, NULL);
+	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
 
 	blk_queue_max_segments(queue, -1);
 	blk_queue_max_segment_size(queue, dev->bounce_size);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index b277f9e6abac..0a3222fd4442 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -366,9 +366,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 
 	/* If barriers are supported, tell block layer that queue is ordered */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
-		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, NULL);
+		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
 	else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER))
-		blk_queue_ordered(q, QUEUE_ORDERED_TAG, NULL);
+		blk_queue_ordered(q, QUEUE_ORDERED_TAG);
 
 	/* If disk is read-only in the host, the guest should obey */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 495533e66542..76af65b654e3 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -373,8 +373,7 @@ static int xlvbd_barrier(struct blkfront_info *info)
 	int err;
 
 	err = blk_queue_ordered(info->rq,
-				info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
-				NULL);
+				info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE);
 
 	if (err)
 		return err;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index c22e6226e9e9..7433e07de30e 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -549,7 +549,7 @@ static void update_ordered(ide_drive_t *drive)
 	} else
 		ordered = QUEUE_ORDERED_DRAIN;
 
-	blk_queue_ordered(drive->queue, ordered, NULL);
+	blk_queue_ordered(drive->queue, ordered);
 }
 
 ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 00c810551814..d505a96845c1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1901,7 +1901,7 @@ static struct mapped_device *alloc_dev(int minor)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, NULL);
+	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index ec92bcbdeddb..c77eb49eda0e 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -128,7 +128,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock
 	mq->req = NULL;
 
 	blk_queue_prep_rq(mq->queue, mmc_prep_request);
-	blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN, NULL);
+	blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
 
 #ifdef CONFIG_MMC_BLOCK_BOUNCE
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 33975e922d65..17b033d0e050 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2196,7 +2196,7 @@ static void dasd_setup_queue(struct dasd_block *block)
 	 */
 	blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
 	blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
-	blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN, NULL);
+	blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
 }
 
 /*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index e8c295e01466..d9a4314a1948 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2135,7 +2135,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	else
 		ordered = QUEUE_ORDERED_DRAIN;
 
-	blk_queue_ordered(sdkp->disk->queue, ordered, NULL);
+	blk_queue_ordered(sdkp->disk->queue, ordered);
 
 	set_capacity(disk, sdkp->capacity);
 	kfree(buffer);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6bba04c7ec48..3a2c5d9a9288 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -212,7 +212,6 @@ struct bvec_merge_data {
 };
 typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
 			     struct bio_vec *);
-typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 typedef int (lld_busy_fn) (struct request_queue *q);
@@ -286,7 +285,6 @@ struct request_queue
 	unprep_rq_fn		*unprep_rq_fn;
 	unplug_fn		*unplug_fn;
 	merge_bvec_fn		*merge_bvec_fn;
-	prepare_flush_fn	*prepare_flush_fn;
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
@@ -896,7 +894,7 @@ extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
+extern int blk_queue_ordered(struct request_queue *, unsigned);
 extern bool blk_do_ordered(struct request_queue *, struct request **);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
-- 
cgit v1.2.3-59-g8ed1b


From a89f5c899db3c6be4bb426e4efb72ecee29a93b5 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 6 Jul 2010 09:03:18 +0200
Subject: block: remove unused REQ_TYPE_LINUX_BLOCK

Nobody uses REQ_TYPE_LINUX_BLOCK (and its REQ_LB_OP_*).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3a2c5d9a9288..baf5258f5985 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -60,7 +60,6 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
 	REQ_TYPE_SPECIAL,		/* driver defined type */
-	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
 	/*
 	 * for ATA/ATAPI devices. this really doesn't belong here, ide should
 	 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver
@@ -70,20 +69,6 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_ATA_PC,
 };
 
-/*
- * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being
- * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a
- * SCSI cdb.
- *
- * 0x00 -> 0x3f are driver private, to be used for whatever purpose they need,
- * typically to differentiate REQ_TYPE_SPECIAL requests.
- *
- */
-enum {
-	REQ_LB_OP_EJECT	= 0x40,		/* eject request */
-	REQ_LB_OP_FLUSH = 0x41,		/* flush request */
-};
-
 #define BLK_MAX_CDB	16
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 8a6cfeb6deca3a8fefd639d898b0d163c0b5d368 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 8 Jul 2010 10:18:46 +0200
Subject: block: push down BKL into .locked_ioctl

As a preparation for the removal of the big kernel
lock in the block layer, this removes the BKL
from the common ioctl handling code, moving it
into every single driver still using it.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/ioctl.c                   | 11 +----------
 drivers/block/amiflop.c         | 17 +++++++++++++++--
 drivers/block/ataflop.c         | 18 +++++++++++++++---
 drivers/block/brd.c             |  5 ++++-
 drivers/block/cciss.c           |  8 +++++---
 drivers/block/cpqarray.c        | 18 ++++++++++++++++--
 drivers/block/floppy.c          | 17 +++++++++++++++--
 drivers/block/nbd.c             |  5 ++++-
 drivers/block/paride/pcd.c      | 11 +++++++++--
 drivers/block/paride/pd.c       |  5 ++++-
 drivers/block/paride/pf.c       |  6 +++++-
 drivers/block/pktcdvd.c         | 13 +++++++++----
 drivers/block/swim.c            |  5 ++++-
 drivers/block/swim3.c           | 17 +++++++++++++++--
 drivers/block/ub.c              | 10 ++++++++--
 drivers/block/virtio_blk.c      | 17 +++++++++++++++--
 drivers/block/xd.c              | 17 +++++++++++++++--
 drivers/block/xen-blkfront.c    |  2 +-
 drivers/cdrom/gdrom.c           | 11 +++++++++--
 drivers/cdrom/viocd.c           | 11 +++++++++--
 drivers/ide/ide-cd.c            | 18 ++++++++++++++++--
 drivers/ide/ide-disk_ioctl.c    |  9 +++++++--
 drivers/ide/ide-floppy_ioctl.c  | 12 +++++++++---
 drivers/ide/ide-gd.c            |  2 +-
 drivers/ide/ide-tape.c          | 10 ++++++++--
 drivers/message/i2o/i2o_block.c | 22 +++++-----------------
 drivers/mtd/mtd_blkdevs.c       |  5 ++++-
 drivers/scsi/sd.c               | 17 ++++++++++++-----
 drivers/scsi/sr.c               | 18 +++++++++++++-----
 include/linux/blkdev.h          |  1 -
 30 files changed, 253 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index e8eb679f2f9b..1cfa8d449d90 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -163,18 +163,10 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned cmd, unsigned long arg)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	int ret;
 
 	if (disk->fops->ioctl)
 		return disk->fops->ioctl(bdev, mode, cmd, arg);
 
-	if (disk->fops->locked_ioctl) {
-		lock_kernel();
-		ret = disk->fops->locked_ioctl(bdev, mode, cmd, arg);
-		unlock_kernel();
-		return ret;
-	}
-
 	return -ENOTTY;
 }
 /*
@@ -185,8 +177,7 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
 EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
 
 /*
- * always keep this in sync with compat_blkdev_ioctl() and
- * compat_blkdev_locked_ioctl()
+ * always keep this in sync with compat_blkdev_ioctl()
  */
 int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 			unsigned long arg)
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 832798aa14f6..0fa26359304c 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -60,6 +60,7 @@
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/smp_lock.h>
 #include <linux/amifdreg.h>
 #include <linux/amifd.h>
 #include <linux/buffer_head.h>
@@ -1423,7 +1424,7 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 		    unsigned int cmd, unsigned long param)
 {
 	struct amiga_floppy_struct *p = bdev->bd_disk->private_data;
@@ -1500,6 +1501,18 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
 	return 0;
 }
 
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	lock_kernel();
+	ret = fd_locked_ioctl(bdev, mode, cmd, param);
+	unlock_kernel();
+
+	return ret;
+}
+
 static void fd_probe(int dev)
 {
 	unsigned long code;
@@ -1638,7 +1651,7 @@ static const struct block_device_operations floppy_fops = {
 	.owner		= THIS_MODULE,
 	.open		= floppy_open,
 	.release	= floppy_release,
-	.locked_ioctl	= fd_ioctl,
+	.ioctl		= fd_ioctl,
 	.getgeo		= fd_getgeo,
 	.media_changed	= amiga_floppy_change,
 };
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index e35cf59cbfde..1bb8bfcfdbd9 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -67,6 +67,7 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/smp_lock.h>
 
 #include <asm/atafd.h>
 #include <asm/atafdreg.h>
@@ -359,7 +360,7 @@ static void finish_fdc( void );
 static void finish_fdc_done( int dummy );
 static void setup_req_params( int drive );
 static void redo_fd_request( void);
-static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                      cmd, unsigned long param);
 static void fd_probe( int drive );
 static int fd_test_drive_present( int drive );
@@ -1480,7 +1481,7 @@ void do_fd_request(struct request_queue * q)
 	atari_enable_irq( IRQ_MFP_FDC );
 }
 
-static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 		    unsigned int cmd, unsigned long param)
 {
 	struct gendisk *disk = bdev->bd_disk;
@@ -1665,6 +1666,17 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 }
 
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	lock_kernel();
+	ret = fd_locked_ioctl(bdev, mode, cmd, arg);
+	unlock_kernel();
+
+	return ret;
+}
 
 /* Initialize the 'unit' variable for drive 'drive' */
 
@@ -1855,7 +1867,7 @@ static const struct block_device_operations floppy_fops = {
 	.owner		= THIS_MODULE,
 	.open		= floppy_open,
 	.release	= floppy_release,
-	.locked_ioctl	= fd_ioctl,
+	.ioctl		= fd_ioctl,
 	.media_changed	= check_floppy_change,
 	.revalidate_disk= floppy_revalidate,
 };
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 1d2c18620f9a..1c7f63792ff8 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -15,6 +15,7 @@
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/highmem.h>
+#include <linux/smp_lock.h>
 #include <linux/radix-tree.h>
 #include <linux/buffer_head.h> /* invalidate_bh_lrus() */
 #include <linux/slab.h>
@@ -401,6 +402,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
 	 * ram device BLKFLSBUF has special semantics, we want to actually
 	 * release and destroy the ramdisk data.
 	 */
+	lock_kernel();
 	mutex_lock(&bdev->bd_mutex);
 	error = -EBUSY;
 	if (bdev->bd_openers <= 1) {
@@ -417,13 +419,14 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
 		error = 0;
 	}
 	mutex_unlock(&bdev->bd_mutex);
+	unlock_kernel();
 
 	return error;
 }
 
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
-	.locked_ioctl =		brd_ioctl,
+	.ioctl =		brd_ioctl,
 #ifdef CONFIG_BLK_DEV_XIP
 	.direct_access =	brd_direct_access,
 #endif
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 11b377762b8e..a6c0494dd054 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -179,6 +179,8 @@ static irqreturn_t do_cciss_intx(int irq, void *dev_id);
 static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id);
 static int cciss_open(struct block_device *bdev, fmode_t mode);
 static int cciss_release(struct gendisk *disk, fmode_t mode);
+static int do_ioctl(struct block_device *bdev, fmode_t mode,
+		    unsigned int cmd, unsigned long arg);
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg);
 static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo);
@@ -237,7 +239,7 @@ static const struct block_device_operations cciss_fops = {
 	.owner = THIS_MODULE,
 	.open = cciss_open,
 	.release = cciss_release,
-	.locked_ioctl = cciss_ioctl,
+	.ioctl = do_ioctl,
 	.getgeo = cciss_getgeo,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = cciss_compat_ioctl,
@@ -1057,8 +1059,6 @@ static int cciss_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-#ifdef CONFIG_COMPAT
-
 static int do_ioctl(struct block_device *bdev, fmode_t mode,
 		    unsigned cmd, unsigned long arg)
 {
@@ -1069,6 +1069,8 @@ static int do_ioctl(struct block_device *bdev, fmode_t mode,
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT
+
 static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode,
 				  unsigned cmd, unsigned long arg);
 static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index abb4ec6690fc..c459aeea3c0c 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -35,6 +35,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/hdreg.h>
+#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/genhd.h>
@@ -197,7 +198,7 @@ static const struct block_device_operations ida_fops  = {
 	.owner		= THIS_MODULE,
 	.open		= ida_open,
 	.release	= ida_release,
-	.locked_ioctl	= ida_ioctl,
+	.ioctl		= ida_ioctl,
 	.getgeo		= ida_getgeo,
 	.revalidate_disk= ida_revalidate,
 };
@@ -1128,7 +1129,7 @@ static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo)
  *  ida_ioctl does some miscellaneous stuff like reporting drive geometry,
  *  setting readahead and submitting commands from userspace to the controller.
  */
-static int ida_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+static int ida_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
 {
 	drv_info_t *drv = get_drv(bdev->bd_disk);
 	ctlr_info_t *host = get_host(bdev->bd_disk);
@@ -1192,6 +1193,19 @@ out_passthru:
 	}
 		
 }
+
+static int ida_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	lock_kernel();
+	ret = ida_locked_ioctl(bdev, mode, cmd, param);
+	unlock_kernel();
+
+	return ret;
+}
+
 /*
  * ida_ctlr_ioctl is for passing commands to the controller from userspace.
  * The command block (io) has already been copied to kernel space for us,
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 82c30f9f81ca..40419b066aa9 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -178,6 +178,7 @@ static int print_unex = 1;
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/bio.h>
+#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/jiffies.h>
 #include <linux/fcntl.h>
@@ -3371,7 +3372,7 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 		    unsigned long param)
 {
 	int drive = (long)bdev->bd_disk->private_data;
@@ -3547,6 +3548,18 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 	return 0;
 }
 
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	lock_kernel();
+	ret = fd_locked_ioctl(bdev, mode, cmd, param);
+	unlock_kernel();
+
+	return ret;
+}
+
 static void __init config_types(void)
 {
 	bool has_drive = false;
@@ -3848,7 +3861,7 @@ static const struct block_device_operations floppy_fops = {
 	.owner			= THIS_MODULE,
 	.open			= floppy_open,
 	.release		= floppy_release,
-	.locked_ioctl		= fd_ioctl,
+	.ioctl			= fd_ioctl,
 	.getgeo			= fd_getgeo,
 	.media_changed		= check_floppy_change,
 	.revalidate_disk	= floppy_revalidate,
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 2e74e7d475ca..6751789fb379 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -24,6 +24,7 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/ioctl.h>
+#include <linux/smp_lock.h>
 #include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -716,9 +717,11 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 	dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
 			lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
 
+	lock_kernel();
 	mutex_lock(&lo->tx_lock);
 	error = __nbd_ioctl(bdev, lo, cmd, arg);
 	mutex_unlock(&lo->tx_lock);
+	unlock_kernel();
 
 	return error;
 }
@@ -726,7 +729,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 static const struct block_device_operations nbd_fops =
 {
 	.owner =	THIS_MODULE,
-	.locked_ioctl =	nbd_ioctl,
+	.ioctl =	nbd_ioctl,
 };
 
 /*
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 71acf4e53356..daba7a62a663 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -138,6 +138,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
 #include <linux/cdrom.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
 static DEFINE_SPINLOCK(pcd_lock);
@@ -238,7 +239,13 @@ static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
 				unsigned cmd, unsigned long arg)
 {
 	struct pcd_unit *cd = bdev->bd_disk->private_data;
-	return cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
+	int ret;
+
+	lock_kernel();
+	ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
+	unlock_kernel();
+
+	return ret;
 }
 
 static int pcd_block_media_changed(struct gendisk *disk)
@@ -251,7 +258,7 @@ static const struct block_device_operations pcd_bdops = {
 	.owner		= THIS_MODULE,
 	.open		= pcd_block_open,
 	.release	= pcd_block_release,
-	.locked_ioctl	= pcd_block_ioctl,
+	.ioctl		= pcd_block_ioctl,
 	.media_changed	= pcd_block_media_changed,
 };
 
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 4e8b9bff3abe..c4d6ed9846ca 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -153,6 +153,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/kernel.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include <linux/workqueue.h>
 
@@ -768,8 +769,10 @@ static int pd_ioctl(struct block_device *bdev, fmode_t mode,
 
 	switch (cmd) {
 	case CDROMEJECT:
+		lock_kernel();
 		if (disk->access == 1)
 			pd_special_command(disk, pd_eject);
+		unlock_kernel();
 		return 0;
 	default:
 		return -EINVAL;
@@ -812,7 +815,7 @@ static const struct block_device_operations pd_fops = {
 	.owner		= THIS_MODULE,
 	.open		= pd_open,
 	.release	= pd_release,
-	.locked_ioctl	= pd_ioctl,
+	.ioctl		= pd_ioctl,
 	.getgeo		= pd_getgeo,
 	.media_changed	= pd_check_media,
 	.revalidate_disk= pd_revalidate
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index c059aab3006b..38b4d566b816 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -152,6 +152,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY};
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
 static DEFINE_SPINLOCK(pf_spin_lock);
@@ -266,7 +267,7 @@ static const struct block_device_operations pf_fops = {
 	.owner		= THIS_MODULE,
 	.open		= pf_open,
 	.release	= pf_release,
-	.locked_ioctl	= pf_ioctl,
+	.ioctl		= pf_ioctl,
 	.getgeo		= pf_getgeo,
 	.media_changed	= pf_check_media,
 };
@@ -342,7 +343,10 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
 
 	if (pf->access != 1)
 		return -EBUSY;
+	lock_kernel();
 	pf_eject(pf);
+	unlock_kernel();
+
 	return 0;
 }
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 9f3e4454274b..40f1e31f42c4 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -57,6 +57,7 @@
 #include <linux/seq_file.h>
 #include <linux/miscdevice.h>
 #include <linux/freezer.h>
+#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <scsi/scsi_cmnd.h>
@@ -2762,10 +2763,12 @@ out_mem:
 static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
 {
 	struct pktcdvd_device *pd = bdev->bd_disk->private_data;
+	int ret;
 
 	VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
 		MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
 
+	lock_kernel();
 	switch (cmd) {
 	case CDROMEJECT:
 		/*
@@ -2783,14 +2786,16 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 	case CDROM_LAST_WRITTEN:
 	case CDROM_SEND_PACKET:
 	case SCSI_IOCTL_SEND_COMMAND:
-		return __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg);
+		ret = __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg);
+		break;
 
 	default:
 		VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
-		return -ENOTTY;
+		ret = -ENOTTY;
 	}
+	unlock_kernel();
 
-	return 0;
+	return ret;
 }
 
 static int pkt_media_changed(struct gendisk *disk)
@@ -2812,7 +2817,7 @@ static const struct block_device_operations pktcdvd_ops = {
 	.owner =		THIS_MODULE,
 	.open =			pkt_open,
 	.release =		pkt_close,
-	.locked_ioctl =		pkt_ioctl,
+	.ioctl =		pkt_ioctl,
 	.media_changed =	pkt_media_changed,
 };
 
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index e463657569ff..f04f74e3758f 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -20,6 +20,7 @@
 #include <linux/fd.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/smp_lock.h>
 #include <linux/hdreg.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
@@ -690,7 +691,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 	case FDEJECT:
 		if (fs->ref_count != 1)
 			return -EBUSY;
+		lock_kernel();
 		err = floppy_eject(fs);
+		unlock_kernel();
 		return err;
 
 	case FDGETPRM:
@@ -753,7 +756,7 @@ static const struct block_device_operations floppy_fops = {
 	.owner		 = THIS_MODULE,
 	.open		 = floppy_open,
 	.release	 = floppy_release,
-	.locked_ioctl	 = floppy_ioctl,
+	.ioctl		 = floppy_ioctl,
 	.getgeo		 = floppy_getgeo,
 	.media_changed	 = floppy_check_change,
 	.revalidate_disk = floppy_revalidate,
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index ed6fb91123ab..f3657b2a5386 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -25,6 +25,7 @@
 #include <linux/ioctl.h>
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
+#include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <asm/io.h>
@@ -839,7 +840,7 @@ static int fd_eject(struct floppy_state *fs)
 static struct floppy_struct floppy_type =
 	{ 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,NULL };	/*  7 1.44MB 3.5"   */
 
-static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+static int floppy_locked_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long param)
 {
 	struct floppy_state *fs = bdev->bd_disk->private_data;
@@ -867,6 +868,18 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 	return -ENOTTY;
 }
 
+static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+				 unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	lock_kernel();
+	ret = floppy_locked_ioctl(bdev, mode, cmd, param);
+	unlock_kernel();
+
+	return ret;
+}
+
 static int floppy_open(struct block_device *bdev, fmode_t mode)
 {
 	struct floppy_state *fs = bdev->bd_disk->private_data;
@@ -997,7 +1010,7 @@ static int floppy_revalidate(struct gendisk *disk)
 static const struct block_device_operations floppy_fops = {
 	.open		= floppy_open,
 	.release	= floppy_release,
-	.locked_ioctl	= floppy_ioctl,
+	.ioctl		= floppy_ioctl,
 	.media_changed	= floppy_check_change,
 	.revalidate_disk= floppy_revalidate,
 };
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index aaf27fb4efd6..102ed52d0e0f 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -28,6 +28,7 @@
 #include <linux/timer.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <linux/smp_lock.h>
 #include <scsi/scsi.h>
 
 #define DRV_NAME "ub"
@@ -1729,8 +1730,13 @@ static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
 {
 	struct gendisk *disk = bdev->bd_disk;
 	void __user *usermem = (void __user *) arg;
+	int ret;
+
+	lock_kernel();
+	ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
+	unlock_kernel();
 
-	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
+	return ret;
 }
 
 /*
@@ -1794,7 +1800,7 @@ static const struct block_device_operations ub_bd_fops = {
 	.owner		= THIS_MODULE,
 	.open		= ub_bd_open,
 	.release	= ub_bd_release,
-	.locked_ioctl	= ub_bd_ioctl,
+	.ioctl		= ub_bd_ioctl,
 	.media_changed	= ub_bd_media_changed,
 	.revalidate_disk = ub_bd_revalidate,
 };
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 0a3222fd4442..7b0f7b624adf 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,6 +2,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/smp_lock.h>
 #include <linux/hdreg.h>
 #include <linux/virtio.h>
 #include <linux/virtio_blk.h>
@@ -217,7 +218,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	return blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
 }
 
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
 			 unsigned cmd, unsigned long data)
 {
 	struct gendisk *disk = bdev->bd_disk;
@@ -243,6 +244,18 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 			      (void __user *)data);
 }
 
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	lock_kernel();
+	ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
+	unlock_kernel();
+
+	return ret;
+}
+
 /* We provide getgeo only to please some old bootloader/partitioning tools */
 static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 {
@@ -269,7 +282,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 }
 
 static const struct block_device_operations virtblk_fops = {
-	.locked_ioctl = virtblk_ioctl,
+	.ioctl  = virtblk_ioctl,
 	.owner  = THIS_MODULE,
 	.getgeo = virtblk_getgeo,
 };
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index b16a3a926cf2..d5a3cd750561 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -46,6 +46,7 @@
 #include <linux/init.h>
 #include <linux/wait.h>
 #include <linux/blkdev.h>
+#include <linux/smp_lock.h>
 #include <linux/blkpg.h>
 #include <linux/delay.h>
 #include <linux/io.h>
@@ -133,7 +134,7 @@ static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static const struct block_device_operations xd_fops = {
 	.owner	= THIS_MODULE,
-	.locked_ioctl	= xd_ioctl,
+	.ioctl	= xd_ioctl,
 	.getgeo = xd_getgeo,
 };
 static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int);
@@ -347,7 +348,7 @@ static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 
 /* xd_ioctl: handle device ioctl's */
-static int xd_ioctl(struct block_device *bdev, fmode_t mode, u_int cmd, u_long arg)
+static int xd_locked_ioctl(struct block_device *bdev, fmode_t mode, u_int cmd, u_long arg)
 {
 	switch (cmd) {
 		case HDIO_SET_DMA:
@@ -375,6 +376,18 @@ static int xd_ioctl(struct block_device *bdev, fmode_t mode, u_int cmd, u_long a
 	}
 }
 
+static int xd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	lock_kernel();
+	ret = xd_locked_ioctl(bdev, mode, cmd, param);
+	unlock_kernel();
+
+	return ret;
+}
+
 /* xd_readwrite: handle a read/write request */
 static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_int count)
 {
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 76af65b654e3..9119cd3d56a4 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1045,7 +1045,7 @@ static const struct block_device_operations xlvbd_block_fops =
 	.open = blkif_open,
 	.release = blkif_release,
 	.getgeo = blkif_getgeo,
-	.locked_ioctl = blkif_ioctl,
+	.ioctl = blkif_ioctl,
 };
 
 
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 5219b57deb36..1772fd914fb9 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -34,6 +34,7 @@
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
 #include <linux/device.h>
+#include <linux/smp_lock.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <linux/platform_device.h>
@@ -509,7 +510,13 @@ static int gdrom_bdops_mediachanged(struct gendisk *disk)
 static int gdrom_bdops_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned cmd, unsigned long arg)
 {
-	return cdrom_ioctl(gd.cd_info, bdev, mode, cmd, arg);
+	int ret;
+
+	lock_kernel();
+	ret = cdrom_ioctl(gd.cd_info, bdev, mode, cmd, arg);
+	unlock_kernel();
+
+	return ret;
 }
 
 static const struct block_device_operations gdrom_bdops = {
@@ -517,7 +524,7 @@ static const struct block_device_operations gdrom_bdops = {
 	.open			= gdrom_bdops_open,
 	.release		= gdrom_bdops_release,
 	.media_changed		= gdrom_bdops_mediachanged,
-	.locked_ioctl		= gdrom_bdops_ioctl,
+	.ioctl			= gdrom_bdops_ioctl,
 };
 
 static irqreturn_t gdrom_command_interrupt(int irq, void *dev_id)
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 1fa6628d150b..16dada0627ee 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -42,6 +42,7 @@
 #include <linux/module.h>
 #include <linux/completion.h>
 #include <linux/proc_fs.h>
+#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/scatterlist.h>
 
@@ -167,7 +168,13 @@ static int viocd_blk_ioctl(struct block_device *bdev, fmode_t mode,
 		unsigned cmd, unsigned long arg)
 {
 	struct disk_info *di = bdev->bd_disk->private_data;
-	return cdrom_ioctl(&di->viocd_info, bdev, mode, cmd, arg);
+	int ret;
+
+	lock_kernel();
+	ret = cdrom_ioctl(&di->viocd_info, bdev, mode, cmd, arg);
+	unlock_kernel();
+
+	return ret;
 }
 
 static int viocd_blk_media_changed(struct gendisk *disk)
@@ -180,7 +187,7 @@ static const struct block_device_operations viocd_fops = {
 	.owner =		THIS_MODULE,
 	.open =			viocd_blk_open,
 	.release =		viocd_blk_release,
-	.locked_ioctl =		viocd_blk_ioctl,
+	.ioctl =		viocd_blk_ioctl,
 	.media_changed =	viocd_blk_media_changed,
 };
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index ef7e3a9bee51..bf9f61a5c2f8 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -31,6 +31,7 @@
 #include <linux/delay.h>
 #include <linux/timer.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/errno.h>
@@ -1654,7 +1655,7 @@ static int idecd_get_spindown(struct cdrom_device_info *cdi, unsigned long arg)
 	return 0;
 }
 
-static int idecd_ioctl(struct block_device *bdev, fmode_t mode,
+static int idecd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long arg)
 {
 	struct cdrom_info *info = ide_drv_g(bdev->bd_disk, cdrom_info);
@@ -1676,6 +1677,19 @@ static int idecd_ioctl(struct block_device *bdev, fmode_t mode,
 	return err;
 }
 
+static int idecd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	lock_kernel();
+	ret = idecd_locked_ioctl(bdev, mode, cmd, arg);
+	unlock_kernel();
+
+	return ret;
+}
+
+
 static int idecd_media_changed(struct gendisk *disk)
 {
 	struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
@@ -1696,7 +1710,7 @@ static const struct block_device_operations idecd_ops = {
 	.owner			= THIS_MODULE,
 	.open			= idecd_open,
 	.release		= idecd_release,
-	.locked_ioctl		= idecd_ioctl,
+	.ioctl			= idecd_ioctl,
 	.media_changed		= idecd_media_changed,
 	.revalidate_disk	= idecd_revalidate_disk
 };
diff --git a/drivers/ide/ide-disk_ioctl.c b/drivers/ide/ide-disk_ioctl.c
index 7b783dd7c0be..ec94c66918f6 100644
--- a/drivers/ide/ide-disk_ioctl.c
+++ b/drivers/ide/ide-disk_ioctl.c
@@ -1,6 +1,7 @@
 #include <linux/kernel.h>
 #include <linux/ide.h>
 #include <linux/hdreg.h>
+#include <linux/smp_lock.h>
 
 #include "ide-disk.h"
 
@@ -18,9 +19,13 @@ int ide_disk_ioctl(ide_drive_t *drive, struct block_device *bdev, fmode_t mode,
 {
 	int err;
 
+	lock_kernel();
 	err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_disk_ioctl_settings);
 	if (err != -EOPNOTSUPP)
-		return err;
+		goto out;
 
-	return generic_ide_ioctl(drive, bdev, cmd, arg);
+	err = generic_ide_ioctl(drive, bdev, cmd, arg);
+out:
+	unlock_kernel();
+	return err;
 }
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
index 9c2288234dea..fd3d05ab3417 100644
--- a/drivers/ide/ide-floppy_ioctl.c
+++ b/drivers/ide/ide-floppy_ioctl.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/ide.h>
 #include <linux/cdrom.h>
+#include <linux/smp_lock.h>
 
 #include <asm/unaligned.h>
 
@@ -275,12 +276,15 @@ int ide_floppy_ioctl(ide_drive_t *drive, struct block_device *bdev,
 	void __user *argp = (void __user *)arg;
 	int err;
 
-	if (cmd == CDROMEJECT || cmd == CDROM_LOCKDOOR)
-		return ide_floppy_lockdoor(drive, &pc, arg, cmd);
+	lock_kernel();
+	if (cmd == CDROMEJECT || cmd == CDROM_LOCKDOOR) {
+		err = ide_floppy_lockdoor(drive, &pc, arg, cmd);
+		goto out;
+	}
 
 	err = ide_floppy_format_ioctl(drive, &pc, mode, cmd, argp);
 	if (err != -ENOTTY)
-		return err;
+		goto out;
 
 	/*
 	 * skip SCSI_IOCTL_SEND_COMMAND (deprecated)
@@ -293,5 +297,7 @@ int ide_floppy_ioctl(ide_drive_t *drive, struct block_device *bdev,
 	if (err == -ENOTTY)
 		err = generic_ide_ioctl(drive, bdev, cmd, arg);
 
+out:
+	unlock_kernel();
 	return err;
 }
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index c102d23d9b38..883f0c979c9f 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -323,7 +323,7 @@ static const struct block_device_operations ide_gd_ops = {
 	.owner			= THIS_MODULE,
 	.open			= ide_gd_open,
 	.release		= ide_gd_release,
-	.locked_ioctl		= ide_gd_ioctl,
+	.ioctl			= ide_gd_ioctl,
 	.getgeo			= ide_gd_getgeo,
 	.media_changed		= ide_gd_media_changed,
 	.unlock_native_capacity	= ide_gd_unlock_native_capacity,
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 635fd72d4728..39b0a5c45f07 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -32,6 +32,7 @@
 #include <linux/errno.h>
 #include <linux/genhd.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/ide.h>
@@ -1927,9 +1928,14 @@ static int idetape_ioctl(struct block_device *bdev, fmode_t mode,
 {
 	struct ide_tape_obj *tape = ide_drv_g(bdev->bd_disk, ide_tape_obj);
 	ide_drive_t *drive = tape->drive;
-	int err = generic_ide_ioctl(drive, bdev, cmd, arg);
+	int err;
+
+	lock_kernel();
+	err = generic_ide_ioctl(drive, bdev, cmd, arg);
 	if (err == -EINVAL)
 		err = idetape_blkdev_ioctl(drive, cmd, arg);
+	unlock_kernel();
+
 	return err;
 }
 
@@ -1937,7 +1943,7 @@ static const struct block_device_operations idetape_block_ops = {
 	.owner		= THIS_MODULE,
 	.open		= idetape_open,
 	.release	= idetape_release,
-	.locked_ioctl	= idetape_ioctl,
+	.ioctl		= idetape_ioctl,
 };
 
 static int ide_tape_probe(ide_drive_t *drive)
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index b8233ff863e3..d1bdf8abe5db 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -53,7 +53,6 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/i2o.h>
-#include <linux/smp_lock.h>
 
 #include <linux/mempool.h>
 
@@ -653,40 +652,30 @@ static int i2o_block_ioctl(struct block_device *bdev, fmode_t mode,
 {
 	struct gendisk *disk = bdev->bd_disk;
 	struct i2o_block_device *dev = disk->private_data;
-	int ret = -ENOTTY;
 
 	/* Anyone capable of this syscall can do *real bad* things */
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
 	switch (cmd) {
 	case BLKI2OGRSTRAT:
-		ret = put_user(dev->rcache, (int __user *)arg);
-		break;
+		return put_user(dev->rcache, (int __user *)arg);
 	case BLKI2OGWSTRAT:
-		ret = put_user(dev->wcache, (int __user *)arg);
-		break;
+		return put_user(dev->wcache, (int __user *)arg);
 	case BLKI2OSRSTRAT:
-		ret = -EINVAL;
 		if (arg < 0 || arg > CACHE_SMARTFETCH)
-			break;
+			return -EINVAL;
 		dev->rcache = arg;
-		ret = 0;
 		break;
 	case BLKI2OSWSTRAT:
-		ret = -EINVAL;
 		if (arg != 0
 		    && (arg < CACHE_WRITETHROUGH || arg > CACHE_SMARTBACK))
-			break;
+			return -EINVAL;
 		dev->wcache = arg;
-		ret = 0;
 		break;
 	}
-	unlock_kernel();
-
-	return ret;
+	return -ENOTTY;
 };
 
 /**
@@ -942,7 +931,6 @@ static const struct block_device_operations i2o_block_fops = {
 	.open = i2o_block_open,
 	.release = i2o_block_release,
 	.ioctl = i2o_block_ioctl,
-	.compat_ioctl = i2o_block_ioctl,
 	.getgeo = i2o_block_getgeo,
 	.media_changed = i2o_block_media_changed
 };
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 475af42745cb..8c83b11a77d5 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -15,6 +15,7 @@
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/spinlock.h>
+#include <linux/smp_lock.h>
 #include <linux/hdreg.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
@@ -237,6 +238,7 @@ static int blktrans_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!dev)
 		return ret;
 
+	lock_kernel();
 	mutex_lock(&dev->lock);
 
 	if (!dev->mtd)
@@ -250,6 +252,7 @@ static int blktrans_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 unlock:
 	mutex_unlock(&dev->lock);
+	unlock_kernel();
 	blktrans_dev_put(dev);
 	return ret;
 }
@@ -258,7 +261,7 @@ static const struct block_device_operations mtd_blktrans_ops = {
 	.owner		= THIS_MODULE,
 	.open		= blktrans_open,
 	.release	= blktrans_release,
-	.locked_ioctl	= blktrans_ioctl,
+	.ioctl		= blktrans_ioctl,
 	.getgeo		= blktrans_getgeo,
 };
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 1d0c4b7c3b69..633ac32b25c1 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -46,6 +46,7 @@
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/delay.h>
+#include <linux/smp_lock.h>
 #include <linux/mutex.h>
 #include <linux/string_helpers.h>
 #include <linux/async.h>
@@ -924,6 +925,7 @@ static int sd_ioctl(struct block_device *bdev, fmode_t mode,
 	SCSI_LOG_IOCTL(1, printk("sd_ioctl: disk=%s, cmd=0x%x\n",
 						disk->disk_name, cmd));
 
+	lock_kernel();
 	/*
 	 * If we are in the middle of error recovery, don't let anyone
 	 * else try and use this device.  Also, if error recovery fails, it
@@ -933,7 +935,7 @@ static int sd_ioctl(struct block_device *bdev, fmode_t mode,
 	error = scsi_nonblockable_ioctl(sdp, cmd, p,
 					(mode & FMODE_NDELAY) != 0);
 	if (!scsi_block_when_processing_errors(sdp) || !error)
-		return error;
+		goto out;
 
 	/*
 	 * Send SCSI addressing ioctls directly to mid level, send other
@@ -943,13 +945,18 @@ static int sd_ioctl(struct block_device *bdev, fmode_t mode,
 	switch (cmd) {
 		case SCSI_IOCTL_GET_IDLUN:
 		case SCSI_IOCTL_GET_BUS_NUMBER:
-			return scsi_ioctl(sdp, cmd, p);
+			error = scsi_ioctl(sdp, cmd, p);
+			break;
 		default:
 			error = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, p);
 			if (error != -ENOTTY)
-				return error;
+				break;
+			error = scsi_ioctl(sdp, cmd, p);
+			break;
 	}
-	return scsi_ioctl(sdp, cmd, p);
+out:
+	unlock_kernel();
+	return error;
 }
 
 static void set_media_not_present(struct scsi_disk *sdkp)
@@ -1123,7 +1130,7 @@ static const struct block_device_operations sd_fops = {
 	.owner			= THIS_MODULE,
 	.open			= sd_open,
 	.release		= sd_release,
-	.locked_ioctl		= sd_ioctl,
+	.ioctl			= sd_ioctl,
 	.getgeo			= sd_getgeo,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl		= sd_compat_ioctl,
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 0a90abc7f140..d42fa6468f41 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -44,6 +44,7 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
+#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 
@@ -493,6 +494,8 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	void __user *argp = (void __user *)arg;
 	int ret;
 
+	lock_kernel();
+
 	/*
 	 * Send SCSI addressing ioctls directly to mid level, send other
 	 * ioctls to cdrom/block level.
@@ -500,12 +503,13 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	switch (cmd) {
 	case SCSI_IOCTL_GET_IDLUN:
 	case SCSI_IOCTL_GET_BUS_NUMBER:
-		return scsi_ioctl(sdev, cmd, argp);
+		ret = scsi_ioctl(sdev, cmd, argp);
+		goto out;
 	}
 
 	ret = cdrom_ioctl(&cd->cdi, bdev, mode, cmd, arg);
 	if (ret != -ENOSYS)
-		return ret;
+		goto out;
 
 	/*
 	 * ENODEV means that we didn't recognise the ioctl, or that we
@@ -516,8 +520,12 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	ret = scsi_nonblockable_ioctl(sdev, cmd, argp,
 					(mode & FMODE_NDELAY) != 0);
 	if (ret != -ENODEV)
-		return ret;
-	return scsi_ioctl(sdev, cmd, argp);
+		goto out;
+	ret = scsi_ioctl(sdev, cmd, argp);
+
+out:
+	unlock_kernel();
+	return ret;
 }
 
 static int sr_block_media_changed(struct gendisk *disk)
@@ -531,7 +539,7 @@ static const struct block_device_operations sr_bdops =
 	.owner		= THIS_MODULE,
 	.open		= sr_block_open,
 	.release	= sr_block_release,
-	.locked_ioctl	= sr_block_ioctl,
+	.ioctl		= sr_block_ioctl,
 	.media_changed	= sr_block_media_changed,
 	/* 
 	 * No compat_ioctl for now because sr_block_ioctl never
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index baf5258f5985..a8b05fc80c6d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1246,7 +1246,6 @@ static inline int blk_integrity_rq(struct request *rq)
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	int (*release) (struct gendisk *, fmode_t);
-	int (*locked_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*direct_access) (struct block_device *, sector_t,
-- 
cgit v1.2.3-59-g8ed1b


From 62c2a7d969f30163f733c81158254b3095b23e72 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 7 Jul 2010 16:51:26 +0200
Subject: block: push BKL into blktrace ioctls

The blktrace driver currently needs the BKL, but
we should not need to take that in the block layer,
so just push it down into the driver itself.

It is quite likely that the BKL is not actually
required in blktrace code and could be removed
in a follow-on patch.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/compat_ioctl.c         | 56 --------------------------------------------
 block/ioctl.c                |  2 --
 include/linux/blktrace_api.h | 12 ++++++++++
 kernel/trace/blktrace.c      | 43 ++++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f26051f44681..d53085637731 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -535,56 +535,6 @@ out:
 	return err;
 }
 
-struct compat_blk_user_trace_setup {
-	char name[32];
-	u16 act_mask;
-	u32 buf_size;
-	u32 buf_nr;
-	compat_u64 start_lba;
-	compat_u64 end_lba;
-	u32 pid;
-};
-#define BLKTRACESETUP32 _IOWR(0x12, 115, struct compat_blk_user_trace_setup)
-
-static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg)
-{
-	struct blk_user_trace_setup buts;
-	struct compat_blk_user_trace_setup cbuts;
-	struct request_queue *q;
-	char b[BDEVNAME_SIZE];
-	int ret;
-
-	q = bdev_get_queue(bdev);
-	if (!q)
-		return -ENXIO;
-
-	if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
-		return -EFAULT;
-
-	bdevname(bdev, b);
-
-	buts = (struct blk_user_trace_setup) {
-		.act_mask = cbuts.act_mask,
-		.buf_size = cbuts.buf_size,
-		.buf_nr = cbuts.buf_nr,
-		.start_lba = cbuts.start_lba,
-		.end_lba = cbuts.end_lba,
-		.pid = cbuts.pid,
-	};
-	memcpy(&buts.name, &cbuts.name, 32);
-
-	mutex_lock(&bdev->bd_mutex);
-	ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts);
-	mutex_unlock(&bdev->bd_mutex);
-	if (ret)
-		return ret;
-
-	if (copy_to_user(arg, &buts.name, 32))
-		return -EFAULT;
-
-	return 0;
-}
-
 static int compat_blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned cmd, unsigned long arg)
 {
@@ -802,16 +752,10 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return compat_put_u64(arg, bdev->bd_inode->i_size);
 
 	case BLKTRACESETUP32:
-		lock_kernel();
-		ret = compat_blk_trace_setup(bdev, compat_ptr(arg));
-		unlock_kernel();
-		return ret;
 	case BLKTRACESTART: /* compatible */
 	case BLKTRACESTOP:  /* compatible */
 	case BLKTRACETEARDOWN: /* compatible */
-		lock_kernel();
 		ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg));
-		unlock_kernel();
 		return ret;
 	default:
 		if (disk->fops->compat_ioctl)
diff --git a/block/ioctl.c b/block/ioctl.c
index 1cfa8d449d90..9d91e830b320 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -320,9 +320,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKTRACESTOP:
 	case BLKTRACESETUP:
 	case BLKTRACETEARDOWN:
-		lock_kernel();
 		ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg);
-		unlock_kernel();
 		break;
 	default:
 		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 23faa67e8022..07c698621ad0 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -5,6 +5,7 @@
 #ifdef __KERNEL__
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#include <linux/compat.h>
 #endif
 
 /*
@@ -203,6 +204,17 @@ extern int blk_trace_init_sysfs(struct device *dev);
 
 extern struct attribute_group blk_trace_attr_group;
 
+struct compat_blk_user_trace_setup {
+	char name[32];
+	u16 act_mask;
+	u32 buf_size;
+	u32 buf_nr;
+	compat_u64 start_lba;
+	compat_u64 end_lba;
+	u32 pid;
+};
+#define BLKTRACESETUP32 _IOWR(0x12, 115, struct compat_blk_user_trace_setup)
+
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 # define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 # define blk_trace_shutdown(q)				do { } while (0)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3b4a695051b6..82499a5bdcb7 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -552,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 }
 EXPORT_SYMBOL_GPL(blk_trace_setup);
 
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+static int compat_blk_trace_setup(struct request_queue *q, char *name,
+				  dev_t dev, struct block_device *bdev,
+				  char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	struct compat_blk_user_trace_setup cbuts;
+	int ret;
+
+	if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
+		return -EFAULT;
+
+	buts = (struct blk_user_trace_setup) {
+		.act_mask = cbuts.act_mask,
+		.buf_size = cbuts.buf_size,
+		.buf_nr = cbuts.buf_nr,
+		.start_lba = cbuts.start_lba,
+		.end_lba = cbuts.end_lba,
+		.pid = cbuts.pid,
+	};
+	memcpy(&buts.name, &cbuts.name, 32);
+
+	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(arg, &buts.name, 32)) {
+		blk_trace_remove(q);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+#endif
+
 int blk_trace_startstop(struct request_queue *q, int start)
 {
 	int ret;
@@ -604,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	if (!q)
 		return -ENXIO;
 
+	lock_kernel();
 	mutex_lock(&bdev->bd_mutex);
 
 	switch (cmd) {
@@ -611,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 		bdevname(bdev, b);
 		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+	case BLKTRACESETUP32:
+		bdevname(bdev, b);
+		ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+		break;
+#endif
 	case BLKTRACESTART:
 		start = 1;
 	case BLKTRACESTOP:
@@ -625,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	}
 
 	mutex_unlock(&bdev->bd_mutex);
+	unlock_kernel();
 	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 2669b19fa4debcdd6a660ace1a124c0900f113e6 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 9 Jul 2010 14:24:38 +1000
Subject: block: fix for block tracing build error

block/compat_ioctl.c: In function 'compat_blkdev_ioctl':
block/compat_ioctl.c:754: error: 'BLKTRACESETUP32' undeclared (first use in this function)

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blktrace_api.h | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 07c698621ad0..3395cf7130f5 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -204,17 +204,6 @@ extern int blk_trace_init_sysfs(struct device *dev);
 
 extern struct attribute_group blk_trace_attr_group;
 
-struct compat_blk_user_trace_setup {
-	char name[32];
-	u16 act_mask;
-	u32 buf_size;
-	u32 buf_nr;
-	compat_u64 start_lba;
-	compat_u64 end_lba;
-	u32 pid;
-};
-#define BLKTRACESETUP32 _IOWR(0x12, 115, struct compat_blk_user_trace_setup)
-
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 # define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 # define blk_trace_shutdown(q)				do { } while (0)
@@ -232,6 +221,21 @@ static inline int blk_trace_init_sysfs(struct device *dev)
 
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
+#ifdef CONFIG_COMPAT
+
+struct compat_blk_user_trace_setup {
+	char name[32];
+	u16 act_mask;
+	u32 buf_size;
+	u32 buf_nr;
+	compat_u64 start_lba;
+	compat_u64 end_lba;
+	u32 pid;
+};
+#define BLKTRACESETUP32 _IOWR(0x12, 115, struct compat_blk_user_trace_setup)
+
+#endif
+
 #if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK)
 
 static inline int blk_cmd_buf_len(struct request *rq)
-- 
cgit v1.2.3-59-g8ed1b


From edca4a380584a65a16839bdee33ec82244f0f88e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 3 Aug 2010 12:54:51 +0200
Subject: block: disallow FS recursion from sb_issue_discard allocation

Filesystems can call sb_issue_discard on a memory reclaim path
(e.g. ext4 calls sb_issue_discard during journal commit).

Use GFP_NOFS in sb_issue_discard to avoid recursing back into the FS.

Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a8b05fc80c6d..89c855c5655c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -933,7 +933,7 @@ static inline int sb_issue_discard(struct super_block *sb,
 {
 	block <<= (sb->s_blocksize_bits - 9);
 	nr_blocks <<= (sb->s_blocksize_bits - 9);
-	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
+	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
 				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From aca27ba9618276dd2f777bcd5a1419589ccf1ca8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 3 Aug 2010 13:14:33 +0200
Subject: bio, fs: update RWA_MASK, READA and SWRITE to match the corresponding
 BIO_RW_* bits

Commit a82afdf (block: use the same failfast bits for bio and request)
moved BIO_RW_* bits around such that they match up with REQ_* bits.
Unfortunately, fs.h hard coded RW_MASK, RWA_MASK, READ, WRITE, READA
and SWRITE as 0, 1, 2 and 3, and expected them to match with BIO_RW_*
bits.  READ/WRITE didn't change but BIO_RW_AHEAD was moved to bit 4
instead of bit 1, breaking RWA_MASK, READA and SWRITE.

This patch updates RWA_MASK, READA and SWRITE such that they match the
BIO_RW_* bits again.  A follow up patch will update the definitions to
directly use BIO_RW_* bits so that this kind of breakage won't happen
again.

Neil also spotted missing RWA_MASK conversion.

Stable: The offending commit a82afdf was released with v2.6.32, so
this patch should be applied to all kernels since then but it must
_NOT_ be applied to kernels earlier than that.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-bisected-by: Vladislav Bolkhovitin <vst@vlnb.net>
Root-caused-by: Neil Brown <neilb@suse.de>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c5c92943c767..55dad7bca25b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -145,12 +145,12 @@ struct inodes_stat_t {
  *
  */
 #define RW_MASK			1
-#define RWA_MASK		2
+#define RWA_MASK		16
 
 #define READ			0
 #define WRITE			1
-#define READA			2 /* readahead  - don't block if no resources */
-#define SWRITE			3 /* for ll_rw_block() - wait for buffer lock */
+#define READA			16 /* readahead - don't block if no resources */
+#define SWRITE			17 /* for ll_rw_block(), wait for buffer lock */
 
 #define READ_SYNC		(READ | REQ_SYNC | REQ_UNPLUG)
 #define READ_META		(READ | REQ_META)
-- 
cgit v1.2.3-59-g8ed1b


From 7cc015811ef8992dfcce314d0ed9642bc18143d1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 3 Aug 2010 13:14:58 +0200
Subject: bio, fs: separate out bio_types.h and define READ/WRITE constants in
 terms of BIO_RW_* flags

linux/fs.h hard coded READ/WRITE constants which should match BIO_RW_*
flags.  This is fragile and caused breakage during BIO_RW_* flag
rearrangement.  The hardcoding is to avoid include dependency hell.

Create linux/bio_types.h which contatins definitions for bio data
structures and flags and include it from bio.h and fs.h, and make fs.h
define all READ/WRITE related constants in terms of BIO_RW_* flags.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/bio.h       | 183 +------------------------------------------
 include/linux/blk_types.h | 193 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h        |  15 ++--
 3 files changed, 204 insertions(+), 187 deletions(-)
 create mode 100644 include/linux/blk_types.h

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index f655b54c9ef3..5274103434ad 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -9,7 +9,7 @@
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
-
+ *
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
@@ -28,6 +28,9 @@
 
 #include <asm/io.h>
 
+/* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
+#include <linux/blk_types.h>
+
 #define BIO_DEBUG
 
 #ifdef BIO_DEBUG
@@ -40,184 +43,6 @@
 #define BIO_MAX_SIZE		(BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
 #define BIO_MAX_SECTORS		(BIO_MAX_SIZE >> 9)
 
-/*
- * was unsigned short, but we might as well be ready for > 64kB I/O pages
- */
-struct bio_vec {
-	struct page	*bv_page;
-	unsigned int	bv_len;
-	unsigned int	bv_offset;
-};
-
-struct bio_set;
-struct bio;
-struct bio_integrity_payload;
-typedef void (bio_end_io_t) (struct bio *, int);
-typedef void (bio_destructor_t) (struct bio *);
-
-/*
- * main unit of I/O for the block layer and lower layers (ie drivers and
- * stacking drivers)
- */
-struct bio {
-	sector_t		bi_sector;	/* device address in 512 byte
-						   sectors */
-	struct bio		*bi_next;	/* request queue link */
-	struct block_device	*bi_bdev;
-	unsigned long		bi_flags;	/* status, command, etc */
-	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
-						 * top bits priority
-						 */
-
-	unsigned short		bi_vcnt;	/* how many bio_vec's */
-	unsigned short		bi_idx;		/* current index into bvl_vec */
-
-	/* Number of segments in this BIO after
-	 * physical address coalescing is performed.
-	 */
-	unsigned int		bi_phys_segments;
-
-	unsigned int		bi_size;	/* residual I/O count */
-
-	/*
-	 * To keep track of the max segment size, we account for the
-	 * sizes of the first and last mergeable segments in this bio.
-	 */
-	unsigned int		bi_seg_front_size;
-	unsigned int		bi_seg_back_size;
-
-	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
-
-	unsigned int		bi_comp_cpu;	/* completion CPU */
-
-	atomic_t		bi_cnt;		/* pin count */
-
-	struct bio_vec		*bi_io_vec;	/* the actual vec list */
-
-	bio_end_io_t		*bi_end_io;
-
-	void			*bi_private;
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-	struct bio_integrity_payload *bi_integrity;  /* data integrity */
-#endif
-
-	bio_destructor_t	*bi_destructor;	/* destructor */
-
-	/*
-	 * We can inline a number of vecs at the end of the bio, to avoid
-	 * double allocations for a small number of bio_vecs. This member
-	 * MUST obviously be kept at the very end of the bio.
-	 */
-	struct bio_vec		bi_inline_vecs[0];
-};
-
-/*
- * bio flags
- */
-#define BIO_UPTODATE	0	/* ok after I/O completion */
-#define BIO_RW_BLOCK	1	/* RW_AHEAD set, and read/write would block */
-#define BIO_EOF		2	/* out-out-bounds error */
-#define BIO_SEG_VALID	3	/* bi_phys_segments valid */
-#define BIO_CLONED	4	/* doesn't own data */
-#define BIO_BOUNCED	5	/* bio is a bounce bio */
-#define BIO_USER_MAPPED 6	/* contains user pages */
-#define BIO_EOPNOTSUPP	7	/* not supported */
-#define BIO_CPU_AFFINE	8	/* complete bio on same CPU as submitted */
-#define BIO_NULL_MAPPED 9	/* contains invalid user pages */
-#define BIO_FS_INTEGRITY 10	/* fs owns integrity data, not block layer */
-#define BIO_QUIET	11	/* Make BIO Quiet */
-#define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
-
-/*
- * top 4 bits of bio flags indicate the pool this bio came from
- */
-#define BIO_POOL_BITS		(4)
-#define BIO_POOL_NONE		((1UL << BIO_POOL_BITS) - 1)
-#define BIO_POOL_OFFSET		(BITS_PER_LONG - BIO_POOL_BITS)
-#define BIO_POOL_MASK		(1UL << BIO_POOL_OFFSET)
-#define BIO_POOL_IDX(bio)	((bio)->bi_flags >> BIO_POOL_OFFSET)	
-
-/*
- * Request flags.  For use in the cmd_flags field of struct request, and in
- * bi_rw of struct bio.  Note that some flags are only valid in either one.
- */
-enum rq_flag_bits {
-	/* common flags */
-	__REQ_WRITE,		/* not set, read. set, write */
-	__REQ_FAILFAST_DEV,	/* no driver retries of device errors */
-	__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
-	__REQ_FAILFAST_DRIVER,	/* no driver retries of driver errors */
-
-	__REQ_HARDBARRIER,	/* may not be passed by drive either */
-	__REQ_SYNC,		/* request is sync (sync write or read) */
-	__REQ_META,		/* metadata io request */
-	__REQ_DISCARD,		/* request to discard sectors */
-	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
-
-	/* bio only flags */
-	__REQ_UNPLUG,		/* unplug the immediately after submission */
-	__REQ_RAHEAD,		/* read ahead, can fail anytime */
-
-	/* request only flags */
-	__REQ_SORTED,		/* elevator knows about this request */
-	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
-	__REQ_FUA,		/* forced unit access */
-	__REQ_NOMERGE,		/* don't touch this for merging */
-	__REQ_STARTED,		/* drive already may have started this one */
-	__REQ_DONTPREP,		/* don't call prep for this one */
-	__REQ_QUEUED,		/* uses queueing */
-	__REQ_ELVPRIV,		/* elevator private data attached */
-	__REQ_FAILED,		/* set if the request failed */
-	__REQ_QUIET,		/* don't worry about errors */
-	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
-	__REQ_ORDERED_COLOR,	/* is before or after barrier */
-	__REQ_ALLOCED,		/* request came from our alloc pool */
-	__REQ_COPY_USER,	/* contains copies of user pages */
-	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
-	__REQ_FLUSH,		/* request for cache flush */
-	__REQ_IO_STAT,		/* account I/O stat */
-	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
-	__REQ_NR_BITS,		/* stops here */
-};
-
-#define REQ_WRITE		(1 << __REQ_WRITE)
-#define REQ_FAILFAST_DEV	(1 << __REQ_FAILFAST_DEV)
-#define REQ_FAILFAST_TRANSPORT	(1 << __REQ_FAILFAST_TRANSPORT)
-#define REQ_FAILFAST_DRIVER	(1 << __REQ_FAILFAST_DRIVER)
-#define REQ_HARDBARRIER		(1 << __REQ_HARDBARRIER)
-#define REQ_SYNC		(1 << __REQ_SYNC)
-#define REQ_META		(1 << __REQ_META)
-#define REQ_DISCARD		(1 << __REQ_DISCARD)
-#define REQ_NOIDLE		(1 << __REQ_NOIDLE)
-
-#define REQ_FAILFAST_MASK \
-	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
-#define REQ_COMMON_MASK \
-	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
-	 REQ_META| REQ_DISCARD | REQ_NOIDLE)
-
-#define REQ_UNPLUG		(1 << __REQ_UNPLUG)
-#define REQ_RAHEAD		(1 << __REQ_RAHEAD)
-
-#define REQ_SORTED		(1 << __REQ_SORTED)
-#define REQ_SOFTBARRIER		(1 << __REQ_SOFTBARRIER)
-#define REQ_FUA			(1 << __REQ_FUA)
-#define REQ_NOMERGE		(1 << __REQ_NOMERGE)
-#define REQ_STARTED		(1 << __REQ_STARTED)
-#define REQ_DONTPREP		(1 << __REQ_DONTPREP)
-#define REQ_QUEUED		(1 << __REQ_QUEUED)
-#define REQ_ELVPRIV		(1 << __REQ_ELVPRIV)
-#define REQ_FAILED		(1 << __REQ_FAILED)
-#define REQ_QUIET		(1 << __REQ_QUIET)
-#define REQ_PREEMPT		(1 << __REQ_PREEMPT)
-#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
-#define REQ_ALLOCED		(1 << __REQ_ALLOCED)
-#define REQ_COPY_USER		(1 << __REQ_COPY_USER)
-#define REQ_INTEGRITY		(1 << __REQ_INTEGRITY)
-#define REQ_FLUSH		(1 << __REQ_FLUSH)
-#define REQ_IO_STAT		(1 << __REQ_IO_STAT)
-#define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
-
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
  */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
new file mode 100644
index 000000000000..118523734af0
--- /dev/null
+++ b/include/linux/blk_types.h
@@ -0,0 +1,193 @@
+/*
+ * Block data types and constants.  Directly include this file only to
+ * break include dependency loop.
+ */
+#ifndef __LINUX_BLK_TYPES_H
+#define __LINUX_BLK_TYPES_H
+
+#ifdef CONFIG_BLOCK
+
+#include <linux/types.h>
+
+struct bio_set;
+struct bio;
+struct bio_integrity_payload;
+struct page;
+struct block_device;
+typedef void (bio_end_io_t) (struct bio *, int);
+typedef void (bio_destructor_t) (struct bio *);
+
+/*
+ * was unsigned short, but we might as well be ready for > 64kB I/O pages
+ */
+struct bio_vec {
+	struct page	*bv_page;
+	unsigned int	bv_len;
+	unsigned int	bv_offset;
+};
+
+/*
+ * main unit of I/O for the block layer and lower layers (ie drivers and
+ * stacking drivers)
+ */
+struct bio {
+	sector_t		bi_sector;	/* device address in 512 byte
+						   sectors */
+	struct bio		*bi_next;	/* request queue link */
+	struct block_device	*bi_bdev;
+	unsigned long		bi_flags;	/* status, command, etc */
+	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
+						 * top bits priority
+						 */
+
+	unsigned short		bi_vcnt;	/* how many bio_vec's */
+	unsigned short		bi_idx;		/* current index into bvl_vec */
+
+	/* Number of segments in this BIO after
+	 * physical address coalescing is performed.
+	 */
+	unsigned int		bi_phys_segments;
+
+	unsigned int		bi_size;	/* residual I/O count */
+
+	/*
+	 * To keep track of the max segment size, we account for the
+	 * sizes of the first and last mergeable segments in this bio.
+	 */
+	unsigned int		bi_seg_front_size;
+	unsigned int		bi_seg_back_size;
+
+	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
+
+	unsigned int		bi_comp_cpu;	/* completion CPU */
+
+	atomic_t		bi_cnt;		/* pin count */
+
+	struct bio_vec		*bi_io_vec;	/* the actual vec list */
+
+	bio_end_io_t		*bi_end_io;
+
+	void			*bi_private;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+	struct bio_integrity_payload *bi_integrity;  /* data integrity */
+#endif
+
+	bio_destructor_t	*bi_destructor;	/* destructor */
+
+	/*
+	 * We can inline a number of vecs at the end of the bio, to avoid
+	 * double allocations for a small number of bio_vecs. This member
+	 * MUST obviously be kept at the very end of the bio.
+	 */
+	struct bio_vec		bi_inline_vecs[0];
+};
+
+/*
+ * bio flags
+ */
+#define BIO_UPTODATE	0	/* ok after I/O completion */
+#define BIO_RW_BLOCK	1	/* RW_AHEAD set, and read/write would block */
+#define BIO_EOF		2	/* out-out-bounds error */
+#define BIO_SEG_VALID	3	/* bi_phys_segments valid */
+#define BIO_CLONED	4	/* doesn't own data */
+#define BIO_BOUNCED	5	/* bio is a bounce bio */
+#define BIO_USER_MAPPED 6	/* contains user pages */
+#define BIO_EOPNOTSUPP	7	/* not supported */
+#define BIO_CPU_AFFINE	8	/* complete bio on same CPU as submitted */
+#define BIO_NULL_MAPPED 9	/* contains invalid user pages */
+#define BIO_FS_INTEGRITY 10	/* fs owns integrity data, not block layer */
+#define BIO_QUIET	11	/* Make BIO Quiet */
+#define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
+
+/*
+ * top 4 bits of bio flags indicate the pool this bio came from
+ */
+#define BIO_POOL_BITS		(4)
+#define BIO_POOL_NONE		((1UL << BIO_POOL_BITS) - 1)
+#define BIO_POOL_OFFSET		(BITS_PER_LONG - BIO_POOL_BITS)
+#define BIO_POOL_MASK		(1UL << BIO_POOL_OFFSET)
+#define BIO_POOL_IDX(bio)	((bio)->bi_flags >> BIO_POOL_OFFSET)
+
+/*
+ * Request flags.  For use in the cmd_flags field of struct request, and in
+ * bi_rw of struct bio.  Note that some flags are only valid in either one.
+ */
+enum rq_flag_bits {
+	/* common flags */
+	__REQ_WRITE,		/* not set, read. set, write */
+	__REQ_FAILFAST_DEV,	/* no driver retries of device errors */
+	__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
+	__REQ_FAILFAST_DRIVER,	/* no driver retries of driver errors */
+
+	__REQ_HARDBARRIER,	/* may not be passed by drive either */
+	__REQ_SYNC,		/* request is sync (sync write or read) */
+	__REQ_META,		/* metadata io request */
+	__REQ_DISCARD,		/* request to discard sectors */
+	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
+
+	/* bio only flags */
+	__REQ_UNPLUG,		/* unplug the immediately after submission */
+	__REQ_RAHEAD,		/* read ahead, can fail anytime */
+
+	/* request only flags */
+	__REQ_SORTED,		/* elevator knows about this request */
+	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
+	__REQ_FUA,		/* forced unit access */
+	__REQ_NOMERGE,		/* don't touch this for merging */
+	__REQ_STARTED,		/* drive already may have started this one */
+	__REQ_DONTPREP,		/* don't call prep for this one */
+	__REQ_QUEUED,		/* uses queueing */
+	__REQ_ELVPRIV,		/* elevator private data attached */
+	__REQ_FAILED,		/* set if the request failed */
+	__REQ_QUIET,		/* don't worry about errors */
+	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
+	__REQ_ORDERED_COLOR,	/* is before or after barrier */
+	__REQ_ALLOCED,		/* request came from our alloc pool */
+	__REQ_COPY_USER,	/* contains copies of user pages */
+	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
+	__REQ_FLUSH,		/* request for cache flush */
+	__REQ_IO_STAT,		/* account I/O stat */
+	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
+	__REQ_NR_BITS,		/* stops here */
+};
+
+#define REQ_WRITE		(1 << __REQ_WRITE)
+#define REQ_FAILFAST_DEV	(1 << __REQ_FAILFAST_DEV)
+#define REQ_FAILFAST_TRANSPORT	(1 << __REQ_FAILFAST_TRANSPORT)
+#define REQ_FAILFAST_DRIVER	(1 << __REQ_FAILFAST_DRIVER)
+#define REQ_HARDBARRIER		(1 << __REQ_HARDBARRIER)
+#define REQ_SYNC		(1 << __REQ_SYNC)
+#define REQ_META		(1 << __REQ_META)
+#define REQ_DISCARD		(1 << __REQ_DISCARD)
+#define REQ_NOIDLE		(1 << __REQ_NOIDLE)
+
+#define REQ_FAILFAST_MASK \
+	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
+#define REQ_COMMON_MASK \
+	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
+	 REQ_META| REQ_DISCARD | REQ_NOIDLE)
+
+#define REQ_UNPLUG		(1 << __REQ_UNPLUG)
+#define REQ_RAHEAD		(1 << __REQ_RAHEAD)
+
+#define REQ_SORTED		(1 << __REQ_SORTED)
+#define REQ_SOFTBARRIER		(1 << __REQ_SOFTBARRIER)
+#define REQ_FUA			(1 << __REQ_FUA)
+#define REQ_NOMERGE		(1 << __REQ_NOMERGE)
+#define REQ_STARTED		(1 << __REQ_STARTED)
+#define REQ_DONTPREP		(1 << __REQ_DONTPREP)
+#define REQ_QUEUED		(1 << __REQ_QUEUED)
+#define REQ_ELVPRIV		(1 << __REQ_ELVPRIV)
+#define REQ_FAILED		(1 << __REQ_FAILED)
+#define REQ_QUIET		(1 << __REQ_QUIET)
+#define REQ_PREEMPT		(1 << __REQ_PREEMPT)
+#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
+#define REQ_ALLOCED		(1 << __REQ_ALLOCED)
+#define REQ_COPY_USER		(1 << __REQ_COPY_USER)
+#define REQ_INTEGRITY		(1 << __REQ_INTEGRITY)
+#define REQ_FLUSH		(1 << __REQ_FLUSH)
+#define REQ_IO_STAT		(1 << __REQ_IO_STAT)
+#define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
+
+#endif /* CONFIG_BLOCK */
+#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 55dad7bca25b..c53911277210 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -8,6 +8,7 @@
 
 #include <linux/limits.h>
 #include <linux/ioctl.h>
+#include <linux/blk_types.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -117,7 +118,7 @@ struct inodes_stat_t {
  *			immediately wait on this read without caring about
  *			unplugging.
  * READA		Used for read-ahead operations. Lower priority, and the
- *			 block layer could (in theory) choose to ignore this
+ *			block layer could (in theory) choose to ignore this
  *			request if it runs into resource problems.
  * WRITE		A normal async write. Device will be plugged.
  * SWRITE		Like WRITE, but a special case for ll_rw_block() that
@@ -144,13 +145,13 @@ struct inodes_stat_t {
  *			of this IO.
  *
  */
-#define RW_MASK			1
-#define RWA_MASK		16
+#define RW_MASK			REQ_WRITE
+#define RWA_MASK		REQ_RAHEAD
 
 #define READ			0
-#define WRITE			1
-#define READA			16 /* readahead - don't block if no resources */
-#define SWRITE			17 /* for ll_rw_block(), wait for buffer lock */
+#define WRITE			RW_MASK
+#define READA			RWA_MASK
+#define SWRITE			(WRITE | READA)
 
 #define READ_SYNC		(READ | REQ_SYNC | REQ_UNPLUG)
 #define READ_META		(READ | REQ_META)
@@ -2200,7 +2201,6 @@ static inline void insert_inode_hash(struct inode *inode) {
 extern void file_move(struct file *f, struct list_head *list);
 extern void file_kill(struct file *f);
 #ifdef CONFIG_BLOCK
-struct bio;
 extern void submit_bio(int, struct bio *);
 extern int bdev_read_only(struct block_device *);
 #endif
@@ -2267,7 +2267,6 @@ static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
 #endif
 
 #ifdef CONFIG_BLOCK
-struct bio;
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
 			    loff_t file_offset);
 void dio_end_io(struct bio *bio, int error);
-- 
cgit v1.2.3-59-g8ed1b


From 4aeefdc69f7b6f3f287e6fd8d4b213953b9e92d8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 3 Aug 2010 13:22:51 +0200
Subject: coda: fixup clash with block layer REQ_* defines

CODA should not be using defines in the global name space of
that nature, prefix them with CODA_.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/coda/psdev.c            | 12 ++++++------
 fs/coda/upcall.c           | 12 ++++++------
 include/linux/coda_psdev.h |  8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 66b9cf79c5ba..de89645777c7 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -177,7 +177,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		nbytes = req->uc_outSize; /* don't have more space! */
 	}
         if (copy_from_user(req->uc_data, buf, nbytes)) {
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 		retval = -EFAULT;
 		goto out;
@@ -254,8 +254,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	        retval = -EFAULT;
         
 	/* If request was not a signal, enqueue and don't free */
-	if (!(req->uc_flags & REQ_ASYNC)) {
-		req->uc_flags |= REQ_READ;
+	if (!(req->uc_flags & CODA_REQ_ASYNC)) {
+		req->uc_flags |= CODA_REQ_READ;
 		list_add_tail(&(req->uc_chain), &vcp->vc_processing);
 		goto out;
 	}
@@ -315,19 +315,19 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
 		list_del(&req->uc_chain);
 
 		/* Async requests need to be freed here */
-		if (req->uc_flags & REQ_ASYNC) {
+		if (req->uc_flags & CODA_REQ_ASYNC) {
 			CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
 			kfree(req);
 			continue;
 		}
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
 	list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) {
 		list_del(&req->uc_chain);
 
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index f09c5ed76f6c..b8893ab6f9e6 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -604,7 +604,7 @@ static void coda_unblock_signals(sigset_t *old)
 			       (((r)->uc_opcode != CODA_CLOSE && \
 				 (r)->uc_opcode != CODA_STORE && \
 				 (r)->uc_opcode != CODA_RELEASE) || \
-				(r)->uc_flags & REQ_READ))
+				(r)->uc_flags & CODA_REQ_READ))
 
 static inline void coda_waitfor_upcall(struct upc_req *req)
 {
@@ -624,7 +624,7 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
 			set_current_state(TASK_UNINTERRUPTIBLE);
 
 		/* got a reply */
-		if (req->uc_flags & (REQ_WRITE | REQ_ABORT))
+		if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT))
 			break;
 
 		if (blocked && time_after(jiffies, timeout) &&
@@ -708,7 +708,7 @@ static int coda_upcall(struct venus_comm *vcp,
 	coda_waitfor_upcall(req);
 
 	/* Op went through, interrupt or not... */
-	if (req->uc_flags & REQ_WRITE) {
+	if (req->uc_flags & CODA_REQ_WRITE) {
 		out = (union outputArgs *)req->uc_data;
 		/* here we map positive Venus errors to kernel errors */
 		error = -out->oh.result;
@@ -717,13 +717,13 @@ static int coda_upcall(struct venus_comm *vcp,
 	}
 
 	error = -EINTR;
-	if ((req->uc_flags & REQ_ABORT) || !signal_pending(current)) {
+	if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) {
 		printk(KERN_WARNING "coda: Unexpected interruption.\n");
 		goto exit;
 	}
 
 	/* Interrupted before venus read it. */
-	if (!(req->uc_flags & REQ_READ))
+	if (!(req->uc_flags & CODA_REQ_READ))
 		goto exit;
 
 	/* Venus saw the upcall, make sure we can send interrupt signal */
@@ -747,7 +747,7 @@ static int coda_upcall(struct venus_comm *vcp,
 	sig_inputArgs->ih.opcode = CODA_SIGNAL;
 	sig_inputArgs->ih.unique = req->uc_unique;
 
-	sig_req->uc_flags = REQ_ASYNC;
+	sig_req->uc_flags = CODA_REQ_ASYNC;
 	sig_req->uc_opcode = sig_inputArgs->ih.opcode;
 	sig_req->uc_unique = sig_inputArgs->ih.unique;
 	sig_req->uc_inSize = sizeof(struct coda_in_hdr);
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 8859e2ede9fe..284b520934a0 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -86,9 +86,9 @@ struct upc_req {
 	wait_queue_head_t   uc_sleep;   /* process' wait queue */
 };
 
-#define REQ_ASYNC  0x1
-#define REQ_READ   0x2
-#define REQ_WRITE  0x4
-#define REQ_ABORT  0x8
+#define CODA_REQ_ASYNC  0x1
+#define CODA_REQ_READ   0x2
+#define CODA_REQ_WRITE  0x4
+#define CODA_REQ_ABORT  0x8
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 6f904ff0e39ea88f81eb77e8dfb4e1238492f0a8 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 25 Jul 2010 14:29:11 +0300
Subject: writeback: harmonize writeback threads naming

The write-back code mixes words "thread" and "task" for the same things. This
is not a big deal, but still an inconsistency.

hch: a convention I tend to use and I've seen in various places
is to always use _task for the storage of the task_struct pointer,
and thread everywhere else.  This especially helps with having
foo_thread for the actual thread and foo_task for a global
variable keeping the task_struct pointer

This patch renames:
* 'bdi_add_default_flusher_task()' -> 'bdi_add_default_flusher_thread()'
* 'bdi_forker_task()'              -> 'bdi_forker_thread()'

because bdi threads are 'bdi_writeback_thread()', so these names are more
consistent.

This patch also amends commentaries and makes them refer the forker and bdi
threads as "thread", not "task".

Also, while on it, make 'bdi_add_default_flusher_thread()' declaration use
'static void' instead of 'void static' and make checkpatch.pl happy.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c           |  2 +-
 include/linux/backing-dev.h |  2 +-
 mm/backing-dev.c            | 26 +++++++++++++-------------
 3 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 261570deb22c..002be0ff2ab3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -840,7 +840,7 @@ int bdi_writeback_thread(void *data)
 
 			/*
 			 * Longest period of inactivity that we tolerate. If we
-			 * see dirty data again later, the task will get
+			 * see dirty data again later, the thread will get
 			 * recreated automatically.
 			 */
 			max_idle = max(5UL * 60 * HZ, wait_jiffies);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e536f3a74e60..f0936f5f85dd 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -50,7 +50,7 @@ struct bdi_writeback {
 
 	unsigned long last_old_flush;		/* last old data flush */
 
-	struct task_struct	*task;		/* writeback task */
+	struct task_struct	*task;		/* writeback thread */
 	struct list_head	b_dirty;	/* dirty inodes */
 	struct list_head	b_io;		/* parked for writeback */
 	struct list_head	b_more_io;	/* parked for more writeback */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ac78a3336181..4e9ed2a8521f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -50,7 +50,7 @@ static struct timer_list sync_supers_timer;
 static int bdi_sync_supers(void *);
 static void sync_supers_timer_fn(unsigned long);
 
-static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
+static void bdi_add_default_flusher_thread(struct backing_dev_info *bdi);
 
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
@@ -279,10 +279,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi)
 }
 
 /*
- * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
  * or we risk deadlocking on ->s_umount. The longer term solution would be
  * to implement sync_supers_bdi() or similar and simply do it from the
- * bdi writeback tasks individually.
+ * bdi writeback thread individually.
  */
 static int bdi_sync_supers(void *unused)
 {
@@ -318,7 +318,7 @@ static void sync_supers_timer_fn(unsigned long unused)
 	bdi_arm_supers_timer();
 }
 
-static int bdi_forker_task(void *ptr)
+static int bdi_forker_thread(void *ptr)
 {
 	struct bdi_writeback *me = ptr;
 
@@ -354,7 +354,7 @@ static int bdi_forker_task(void *ptr)
 			    !bdi_has_dirty_io(bdi))
 				continue;
 
-			bdi_add_default_flusher_task(bdi);
+			bdi_add_default_flusher_thread(bdi);
 		}
 
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -376,7 +376,7 @@ static int bdi_forker_task(void *ptr)
 
 		/*
 		 * This is our real job - check for pending entries in
-		 * bdi_pending_list, and create the tasks that got added
+		 * bdi_pending_list, and create the threads that got added
 		 */
 		bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
 				 bdi_list);
@@ -387,7 +387,7 @@ static int bdi_forker_task(void *ptr)
 		wb->task = kthread_run(bdi_writeback_thread, wb, "flush-%s",
 					dev_name(bdi->dev));
 		/*
-		 * If task creation fails, then readd the bdi to
+		 * If thread creation fails, then readd the bdi to
 		 * the pending list and force writeout of the bdi
 		 * from this forker thread. That will free some memory
 		 * and we can try again.
@@ -430,10 +430,10 @@ static void bdi_add_to_pending(struct rcu_head *head)
 }
 
 /*
- * Add the default flusher task that gets created for any bdi
+ * Add the default flusher thread that gets created for any bdi
  * that has dirty data pending writeout
  */
-void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+static void bdi_add_default_flusher_thread(struct backing_dev_info *bdi)
 {
 	if (!bdi_cap_writeback_dirty(bdi))
 		return;
@@ -445,10 +445,10 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
 	}
 
 	/*
-	 * Check with the helper whether to proceed adding a task. Will only
+	 * Check with the helper whether to proceed adding a thread. Will only
 	 * abort if we two or more simultanous calls to
-	 * bdi_add_default_flusher_task() occured, further additions will block
-	 * waiting for previous additions to finish.
+	 * bdi_add_default_flusher_thread() occured, further additions will
+	 * block waiting for previous additions to finish.
 	 */
 	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
 		list_del_rcu(&bdi->bdi_list);
@@ -506,7 +506,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 	if (bdi_cap_flush_forker(bdi)) {
 		struct bdi_writeback *wb = &bdi->wb;
 
-		wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+		wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
 						dev_name(dev));
 		if (IS_ERR(wb->task)) {
 			wb->task = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 080dcec41709be72613133f695be75b98dd43e88 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 25 Jul 2010 14:29:16 +0300
Subject: writeback: simplify bdi code a little

This patch simplifies bdi code a little by removing the 'pending_list' which is
redundant. Indeed, currently the forker thread ('bdi_forker_thread()') is
working like this:

1. In a loop, fetch all bdi's which have works but have no writeback thread and
   move them to the 'pending_list'.
2. If the list is empty, sleep for 5 sec.
3. Otherwise, take one bdi from the list, fork the writeback thread for this
   bdi, and repeat the loop.

IOW, it first moves everything to the 'pending_list', then process only one
element, and so on. This patch simplifies the algorithm, which is now as
follows.

1. Find the first bdi which has a work and remove it from the global list of
   bdi's (bdi_list).
2. If there was not such bdi, sleep 5 sec.
3. Fork the writeback thread for this bdi and repeat the loop.

IOW, now we find the first bdi to process, process it, and so on. This is
simpler and involves less lists.

The bonus now is that we can get rid of a couple of functions, as well as
remove complications which involve 'rcu_call()' and 'bdi->rcu_head'.

This patch also makes sure we use 'list_add_tail_rcu()', instead of plain
'list_add_tail()', but this piece of code is going to be removed in the next
patch anyway.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/backing-dev.h |  1 -
 mm/backing-dev.c            | 82 ++++++++++-----------------------------------
 2 files changed, 18 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f0936f5f85dd..95ecb2bebca8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -58,7 +58,6 @@ struct bdi_writeback {
 
 struct backing_dev_info {
 	struct list_head bdi_list;
-	struct rcu_head rcu_head;
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 72e6eb96efe2..dbc66815a0fe 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -50,8 +50,6 @@ static struct timer_list sync_supers_timer;
 static int bdi_sync_supers(void *);
 static void sync_supers_timer_fn(unsigned long);
 
-static void bdi_add_default_flusher_thread(struct backing_dev_info *bdi);
-
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -331,6 +329,7 @@ static int bdi_forker_thread(void *ptr)
 	set_user_nice(current, 0);
 
 	for (;;) {
+		bool fork = false;
 		struct task_struct *task;
 		struct backing_dev_info *bdi, *tmp;
 
@@ -349,23 +348,30 @@ static int bdi_forker_thread(void *ptr)
 		 * a thread registered. If so, set that up.
 		 */
 		list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+			if (!bdi_cap_writeback_dirty(bdi))
+				continue;
 			if (bdi->wb.task)
 				continue;
 			if (list_empty(&bdi->work_list) &&
 			    !bdi_has_dirty_io(bdi))
 				continue;
 
-			bdi_add_default_flusher_thread(bdi);
+			WARN(!test_bit(BDI_registered, &bdi->state),
+			     "bdi %p/%s is not registered!\n", bdi, bdi->name);
+
+			list_del_rcu(&bdi->bdi_list);
+			fork = true;
+			break;
 		}
+		spin_unlock_bh(&bdi_lock);
 
 		/* Keep working if default bdi still has things to do */
 		if (!list_empty(&me->bdi->work_list))
 			__set_current_state(TASK_RUNNING);
 
-		if (list_empty(&bdi_pending_list)) {
+		if (!fork) {
 			unsigned long wait;
 
-			spin_unlock_bh(&bdi_lock);
 			wait = msecs_to_jiffies(dirty_writeback_interval * 10);
 			if (wait)
 				schedule_timeout(wait);
@@ -378,13 +384,13 @@ static int bdi_forker_thread(void *ptr)
 		__set_current_state(TASK_RUNNING);
 
 		/*
-		 * This is our real job - check for pending entries in
-		 * bdi_pending_list, and create the threads that got added
+		 * Set the pending bit - if someone will try to unregister this
+		 * bdi - it'll wait on this bit.
 		 */
-		bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
-				 bdi_list);
-		list_del_init(&bdi->bdi_list);
-		spin_unlock_bh(&bdi_lock);
+		set_bit(BDI_pending, &bdi->state);
+
+		/* Make sure no one uses the picked bdi */
+		synchronize_rcu();
 
 		task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
 				   dev_name(bdi->dev));
@@ -397,7 +403,7 @@ static int bdi_forker_thread(void *ptr)
 			 * flush other bdi's to free memory.
 			 */
 			spin_lock_bh(&bdi_lock);
-			list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+			list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
 			spin_unlock_bh(&bdi_lock);
 
 			bdi_flush_io(bdi);
@@ -408,57 +414,6 @@ static int bdi_forker_thread(void *ptr)
 	return 0;
 }
 
-static void bdi_add_to_pending(struct rcu_head *head)
-{
-	struct backing_dev_info *bdi;
-
-	bdi = container_of(head, struct backing_dev_info, rcu_head);
-	INIT_LIST_HEAD(&bdi->bdi_list);
-
-	spin_lock(&bdi_lock);
-	list_add_tail(&bdi->bdi_list, &bdi_pending_list);
-	spin_unlock(&bdi_lock);
-
-	/*
-	 * We are now on the pending list, wake up bdi_forker_task()
-	 * to finish the job and add us back to the active bdi_list
-	 */
-	wake_up_process(default_backing_dev_info.wb.task);
-}
-
-/*
- * Add the default flusher thread that gets created for any bdi
- * that has dirty data pending writeout
- */
-static void bdi_add_default_flusher_thread(struct backing_dev_info *bdi)
-{
-	if (!bdi_cap_writeback_dirty(bdi))
-		return;
-
-	if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
-		printk(KERN_ERR "bdi %p/%s is not registered!\n",
-							bdi, bdi->name);
-		return;
-	}
-
-	/*
-	 * Check with the helper whether to proceed adding a thread. Will only
-	 * abort if we two or more simultanous calls to
-	 * bdi_add_default_flusher_thread() occured, further additions will
-	 * block waiting for previous additions to finish.
-	 */
-	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
-		list_del_rcu(&bdi->bdi_list);
-
-		/*
-		 * We must wait for the current RCU period to end before
-		 * moving to the pending list. So schedule that operation
-		 * from an RCU callback.
-		 */
-		call_rcu(&bdi->rcu_head, bdi_add_to_pending);
-	}
-}
-
 /*
  * Remove bdi from bdi_list, and ensure that it is no longer visible
  */
@@ -599,7 +554,6 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
 	spin_lock_init(&bdi->wb_lock);
-	INIT_RCU_HEAD(&bdi->rcu_head);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
 
-- 
cgit v1.2.3-59-g8ed1b


From ecd584030da67ede1bf17955746a6ce834d9fc6b Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 25 Jul 2010 14:29:18 +0300
Subject: writeback: move last_active to bdi

Currently bdi threads use local variable 'last_active' which stores last time
when the bdi thread did some useful work. Move this local variable to 'struct
bdi_writeback'. This is just a preparation for the further patches which will
make the forker thread decide when bdi threads should be killed.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c           |  6 +++---
 include/linux/backing-dev.h | 13 +++++++------
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 57fbfd0ebc52..9f5cab75c157 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -800,12 +800,12 @@ int bdi_writeback_thread(void *data)
 {
 	struct bdi_writeback *wb = data;
 	struct backing_dev_info *bdi = wb->bdi;
-	unsigned long last_active = jiffies;
 	unsigned long wait_jiffies = -1UL;
 	long pages_written;
 
 	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
 	set_freezable();
+	wb->last_active = jiffies;
 
 	/*
 	 * Our parent may run at a different priority, just set us to normal
@@ -827,7 +827,7 @@ int bdi_writeback_thread(void *data)
 		trace_writeback_pages_written(pages_written);
 
 		if (pages_written)
-			last_active = jiffies;
+			wb->last_active = jiffies;
 		else if (wait_jiffies != -1UL) {
 			unsigned long max_idle;
 
@@ -837,7 +837,7 @@ int bdi_writeback_thread(void *data)
 			 * recreated automatically.
 			 */
 			max_idle = max(5UL * 60 * HZ, wait_jiffies);
-			if (time_after(jiffies, max_idle + last_active))
+			if (time_after(jiffies, max_idle + wb->last_active))
 				break;
 		}
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 95ecb2bebca8..71b6223e0a77 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -45,15 +45,16 @@ enum bdi_stat_item {
 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
 struct bdi_writeback {
-	struct backing_dev_info *bdi;		/* our parent bdi */
+	struct backing_dev_info *bdi;	/* our parent bdi */
 	unsigned int nr;
 
-	unsigned long last_old_flush;		/* last old data flush */
+	unsigned long last_old_flush;	/* last old data flush */
+	unsigned long last_active;	/* last time bdi thread was active */
 
-	struct task_struct	*task;		/* writeback thread */
-	struct list_head	b_dirty;	/* dirty inodes */
-	struct list_head	b_io;		/* parked for writeback */
-	struct list_head	b_more_io;	/* parked for more writeback */
+	struct task_struct *task;	/* writeback thread */
+	struct list_head b_dirty;	/* dirty inodes */
+	struct list_head b_io;		/* parked for writeback */
+	struct list_head b_more_io;	/* parked for more writeback */
 };
 
 struct backing_dev_info {
-- 
cgit v1.2.3-59-g8ed1b


From 6467716a37673e8d47b4984eb19839bdad0a8353 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 25 Jul 2010 14:29:22 +0300
Subject: writeback: optimize periodic bdi thread wakeups

Whe the first inode for a bdi is marked dirty, we wake up the bdi thread which
should take care of the periodic background write-out. However, the write-out
will actually start only 'dirty_writeback_interval' centisecs later, so we can
delay the wake-up.

This change was requested by Nick Piggin who pointed out that if we delay the
wake-up, we weed out 2 unnecessary contex switches, which matters because
'__mark_inode_dirty()' is a hot-path function.

This patch introduces a new function - 'bdi_wakeup_thread_delayed()', which
sets up a timer to wake-up the bdi thread and returns. So the wake-up is
delayed.

We also delete the timer in bdi threads just before writing-back. And
synchronously delete it when unregistering bdi. At the unregister point the bdi
does not have any users, so no one can arm it again.

Since now we take 'bdi->wb_lock' in the timer, which can execute in softirq
context, we have to use 'spin_lock_bh()' for 'bdi->wb_lock'. This patch makes
this change as well.

This patch also moves the 'bdi_wb_init()' function down in the file to avoid
forward-declaration of 'bdi_wakeup_thread_delayed()'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c           | 36 +++++++---------------
 include/linux/backing-dev.h |  2 ++
 mm/backing-dev.c            | 73 +++++++++++++++++++++++++++++++++++----------
 3 files changed, 70 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 55f6e46e06f1..bfa2df2c7ce2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -76,7 +76,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 {
 	trace_writeback_queue(bdi, work);
 
-	spin_lock(&bdi->wb_lock);
+	spin_lock_bh(&bdi->wb_lock);
 	list_add_tail(&work->list, &bdi->work_list);
 	if (bdi->wb.task) {
 		wake_up_process(bdi->wb.task);
@@ -88,7 +88,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 		trace_writeback_nothread(bdi, work);
 		wake_up_process(default_backing_dev_info.wb.task);
 	}
-	spin_unlock(&bdi->wb_lock);
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
@@ -704,13 +704,13 @@ get_next_work_item(struct backing_dev_info *bdi)
 {
 	struct wb_writeback_work *work = NULL;
 
-	spin_lock(&bdi->wb_lock);
+	spin_lock_bh(&bdi->wb_lock);
 	if (!list_empty(&bdi->work_list)) {
 		work = list_entry(bdi->work_list.next,
 				  struct wb_writeback_work, list);
 		list_del_init(&work->list);
 	}
-	spin_unlock(&bdi->wb_lock);
+	spin_unlock_bh(&bdi->wb_lock);
 	return work;
 }
 
@@ -810,6 +810,12 @@ int bdi_writeback_thread(void *data)
 	trace_writeback_thread_start(bdi);
 
 	while (!kthread_should_stop()) {
+		/*
+		 * Remove own delayed wake-up timer, since we are already awake
+		 * and we'll take care of the preriodic write-back.
+		 */
+		del_timer(&wb->wakeup_timer);
+
 		pages_written = wb_do_writeback(wb, 0);
 
 		trace_writeback_pages_written(pages_written);
@@ -868,26 +874,6 @@ void wakeup_flusher_threads(long nr_pages)
 	rcu_read_unlock();
 }
 
-/*
- * This function is used when the first inode for this bdi is marked dirty. It
- * wakes-up the corresponding bdi thread which should then take care of the
- * periodic background write-out of dirty inodes.
- */
-static void wakeup_bdi_thread(struct backing_dev_info *bdi)
-{
-	spin_lock(&bdi->wb_lock);
-	if (bdi->wb.task)
-		wake_up_process(bdi->wb.task);
-	else
-		/*
-		 * When bdi tasks are inactive for long time, they are killed.
-		 * In this case we have to wake-up the forker thread which
-		 * should create and run the bdi thread.
-		 */
-		wake_up_process(default_backing_dev_info.wb.task);
-	spin_unlock(&bdi->wb_lock);
-}
-
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1019,7 +1005,7 @@ out:
 	spin_unlock(&inode_lock);
 
 	if (wakeup_bdi)
-		wakeup_bdi_thread(bdi);
+		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 71b6223e0a77..7628219e5386 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -52,6 +52,7 @@ struct bdi_writeback {
 	unsigned long last_active;	/* last time bdi thread was active */
 
 	struct task_struct *task;	/* writeback thread */
+	struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
@@ -105,6 +106,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi);
 int bdi_writeback_thread(void *data);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a9a08d88a745..cfff7225138c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -248,17 +248,6 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
-{
-	memset(wb, 0, sizeof(*wb));
-
-	wb->bdi = bdi;
-	wb->last_old_flush = jiffies;
-	INIT_LIST_HEAD(&wb->b_dirty);
-	INIT_LIST_HEAD(&wb->b_io);
-	INIT_LIST_HEAD(&wb->b_more_io);
-}
-
 int bdi_has_dirty_io(struct backing_dev_info *bdi)
 {
 	return wb_has_dirty_io(&bdi->wb);
@@ -316,6 +305,43 @@ static void sync_supers_timer_fn(unsigned long unused)
 	bdi_arm_supers_timer();
 }
 
+static void wakeup_timer_fn(unsigned long data)
+{
+	struct backing_dev_info *bdi = (struct backing_dev_info *)data;
+
+	spin_lock_bh(&bdi->wb_lock);
+	if (bdi->wb.task) {
+		wake_up_process(bdi->wb.task);
+	} else {
+		/*
+		 * When bdi tasks are inactive for long time, they are killed.
+		 * In this case we have to wake-up the forker thread which
+		 * should create and run the bdi thread.
+		 */
+		wake_up_process(default_backing_dev_info.wb.task);
+	}
+	spin_unlock_bh(&bdi->wb_lock);
+}
+
+/*
+ * This function is used when the first inode for this bdi is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ */
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+{
+	unsigned long timeout;
+
+	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
+}
+
 /*
  * Calculate the longest interval (jiffies) bdi threads are allowed to be
  * inactive.
@@ -353,8 +379,10 @@ static int bdi_forker_thread(void *ptr)
 		 * Temporary measure, we want to make sure we don't see
 		 * dirty data on the default backing_dev_info
 		 */
-		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
+			del_timer(&me->wakeup_timer);
 			wb_do_writeback(me, 0);
+		}
 
 		spin_lock_bh(&bdi_lock);
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -386,7 +414,7 @@ static int bdi_forker_thread(void *ptr)
 				break;
 			}
 
-			spin_lock(&bdi->wb_lock);
+			spin_lock_bh(&bdi->wb_lock);
 			/*
 			 * If there is no work to do and the bdi thread was
 			 * inactive long enough - kill it. The wb_lock is taken
@@ -403,7 +431,7 @@ static int bdi_forker_thread(void *ptr)
 				action = KILL_THREAD;
 				break;
 			}
-			spin_unlock(&bdi->wb_lock);
+			spin_unlock_bh(&bdi->wb_lock);
 		}
 		spin_unlock_bh(&bdi_lock);
 
@@ -427,9 +455,9 @@ static int bdi_forker_thread(void *ptr)
 				 * The spinlock makes sure we do not lose
 				 * wake-ups when racing with 'bdi_queue_work()'.
 				 */
-				spin_lock(&bdi->wb_lock);
+				spin_lock_bh(&bdi->wb_lock);
 				bdi->wb.task = task;
-				spin_unlock(&bdi->wb_lock);
+				spin_unlock_bh(&bdi->wb_lock);
 			}
 			break;
 
@@ -586,6 +614,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
 	if (bdi->dev) {
 		trace_writeback_bdi_unregister(bdi);
 		bdi_prune_sb(bdi);
+		del_timer_sync(&bdi->wb.wakeup_timer);
 
 		if (!bdi_cap_flush_forker(bdi))
 			bdi_wb_shutdown(bdi);
@@ -596,6 +625,18 @@ void bdi_unregister(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_unregister);
 
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+	memset(wb, 0, sizeof(*wb));
+
+	wb->bdi = bdi;
+	wb->last_old_flush = jiffies;
+	INIT_LIST_HEAD(&wb->b_dirty);
+	INIT_LIST_HEAD(&wb->b_io);
+	INIT_LIST_HEAD(&wb->b_more_io);
+	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+}
+
 int bdi_init(struct backing_dev_info *bdi)
 {
 	int i, err;
-- 
cgit v1.2.3-59-g8ed1b


From e7f52dfb4f378ea1bbfd4476f4e8ba42f5fb332c Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 3 Aug 2010 20:20:20 +0200
Subject: drbd: revert "delay probes", feature is being re-implemented
 differently

It was a now abandoned attempt to throttle resync bandwidth
based on the delay it causes on the bulk data socket.
It has no userbase yet, and has been disabled by
9173465ccb51c09cc3102a10af93e9f469a0af6f already.
This removes the now unused code.

The basic feature, namely using up "idle" bandwith
of network and disk IO subsystem, with minimal impact
to application IO, is being reimplemented differently.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/drbd/drbd_int.h      |  16 +-----
 drivers/block/drbd/drbd_main.c     |  75 ------------------------
 drivers/block/drbd/drbd_nl.c       |   4 --
 drivers/block/drbd/drbd_proc.c     |  19 +------
 drivers/block/drbd/drbd_receiver.c | 113 ++++++-------------------------------
 drivers/block/drbd/drbd_worker.c   |  15 +----
 include/linux/drbd.h               |   2 +-
 include/linux/drbd_nl.h            |   9 +--
 8 files changed, 28 insertions(+), 225 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 485ed8c7d623..352441b0f92f 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -550,12 +550,6 @@ struct p_delay_probe {
 	u32	offset;	 /* usecs the probe got sent after the reference time point */
 } __packed;
 
-struct delay_probe {
-	struct list_head list;
-	unsigned int seq_num;
-	struct timeval time;
-};
-
 /* DCBP: Drbd Compressed Bitmap Packet ... */
 static inline enum drbd_bitmap_code
 DCBP_get_code(struct p_compressed_bm *p)
@@ -942,11 +936,9 @@ struct drbd_conf {
 	unsigned int ko_count;
 	struct drbd_work  resync_work,
 			  unplug_work,
-			  md_sync_work,
-			  delay_probe_work;
+			  md_sync_work;
 	struct timer_list resync_timer;
 	struct timer_list md_sync_timer;
-	struct timer_list delay_probe_timer;
 
 	/* Used after attach while negotiating new disk state. */
 	union drbd_state new_state_tmp;
@@ -1062,12 +1054,6 @@ struct drbd_conf {
 	u64 ed_uuid; /* UUID of the exposed data */
 	struct mutex state_mutex;
 	char congestion_reason;  /* Why we where congested... */
-	struct list_head delay_probes; /* protected by peer_seq_lock */
-	int data_delay;   /* Delay of packets on the data-sock behind meta-sock */
-	unsigned int delay_seq; /* To generate sequence numbers of delay probes */
-	struct timeval dps_time; /* delay-probes-start-time */
-	unsigned int dp_volume_last;  /* send_cnt of last delay probe */
-	int c_sync_rate; /* current resync rate after delay_probe magic */
 };
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 668f06378214..fa650dd85b90 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2184,43 +2184,6 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
 	return ok;
 }
 
-static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
-{
-	struct p_delay_probe dp;
-	int offset, ok = 0;
-	struct timeval now;
-
-	mutex_lock(&ds->mutex);
-	if (likely(ds->socket)) {
-		do_gettimeofday(&now);
-		offset = now.tv_usec - mdev->dps_time.tv_usec +
-			 (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
-		dp.seq_num  = cpu_to_be32(mdev->delay_seq);
-		dp.offset   = cpu_to_be32(offset);
-
-		ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
-				    (struct p_header *)&dp, sizeof(dp), 0);
-	}
-	mutex_unlock(&ds->mutex);
-
-	return ok;
-}
-
-static int drbd_send_delay_probes(struct drbd_conf *mdev)
-{
-	int ok;
-
-	mdev->delay_seq++;
-	do_gettimeofday(&mdev->dps_time);
-	ok = drbd_send_delay_probe(mdev, &mdev->meta);
-	ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
-
-	mdev->dp_volume_last = mdev->send_cnt;
-	mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10);
-
-	return ok;
-}
-
 /* called on sndtimeo
  * returns FALSE if we should retry,
  * TRUE if we think connection is dead
@@ -2369,27 +2332,6 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 	return 1;
 }
 
-static void consider_delay_probes(struct drbd_conf *mdev)
-{
-	return;
-}
-
-static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-	if (!cancel && mdev->state.conn == C_SYNC_SOURCE)
-		drbd_send_delay_probes(mdev);
-
-	return 1;
-}
-
-static void delay_probe_timer_fn(unsigned long data)
-{
-	struct drbd_conf *mdev = (struct drbd_conf *) data;
-
-	if (list_empty(&mdev->delay_probe_work.list))
-		drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work);
-}
-
 /* Used to send write requests
  * R_PRIMARY -> Peer	(P_DATA)
  */
@@ -2453,9 +2395,6 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 
 	drbd_put_data_sock(mdev);
 
-	if (ok)
-		consider_delay_probes(mdev);
-
 	return ok;
 }
 
@@ -2502,9 +2441,6 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
 
 	drbd_put_data_sock(mdev);
 
-	if (ok)
-		consider_delay_probes(mdev);
-
 	return ok;
 }
 
@@ -2666,10 +2602,6 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
 		/* .rate = */		DRBD_RATE_DEF,
 		/* .after = */		DRBD_AFTER_DEF,
 		/* .al_extents = */	DRBD_AL_EXTENTS_DEF,
-		/* .dp_volume = */	DRBD_DP_VOLUME_DEF,
-		/* .dp_interval = */	DRBD_DP_INTERVAL_DEF,
-		/* .throttle_th = */	DRBD_RS_THROTTLE_TH_DEF,
-		/* .hold_off_th = */	DRBD_RS_HOLD_OFF_TH_DEF,
 		/* .verify_alg = */	{}, 0,
 		/* .cpu_mask = */	{}, 0,
 		/* .csums_alg = */	{}, 0,
@@ -2736,24 +2668,17 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	INIT_LIST_HEAD(&mdev->unplug_work.list);
 	INIT_LIST_HEAD(&mdev->md_sync_work.list);
 	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
-	INIT_LIST_HEAD(&mdev->delay_probes);
-	INIT_LIST_HEAD(&mdev->delay_probe_work.list);
 
 	mdev->resync_work.cb  = w_resync_inactive;
 	mdev->unplug_work.cb  = w_send_write_hint;
 	mdev->md_sync_work.cb = w_md_sync;
 	mdev->bm_io_work.w.cb = w_bitmap_io;
-	mdev->delay_probe_work.cb = w_delay_probes;
 	init_timer(&mdev->resync_timer);
 	init_timer(&mdev->md_sync_timer);
-	init_timer(&mdev->delay_probe_timer);
 	mdev->resync_timer.function = resync_timer_fn;
 	mdev->resync_timer.data = (unsigned long) mdev;
 	mdev->md_sync_timer.function = md_sync_timer_fn;
 	mdev->md_sync_timer.data = (unsigned long) mdev;
-	mdev->delay_probe_timer.function = delay_probe_timer_fn;
-	mdev->delay_probe_timer.data = (unsigned long) mdev;
-
 
 	init_waitqueue_head(&mdev->misc_wait);
 	init_waitqueue_head(&mdev->state_wait);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 2151f18b21de..73131c5ae339 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1557,10 +1557,6 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		sc.rate       = DRBD_RATE_DEF;
 		sc.after      = DRBD_AFTER_DEF;
 		sc.al_extents = DRBD_AL_EXTENTS_DEF;
-		sc.dp_volume  = DRBD_DP_VOLUME_DEF;
-		sc.dp_interval = DRBD_DP_INTERVAL_DEF;
-		sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF;
-		sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF;
 	} else
 		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
 
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index d0f1767ea4c3..be3374b68460 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -73,21 +73,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
 	/* if more than 1 GB display in MB */
 	if (mdev->rs_total > 0x100000L)
-		seq_printf(seq, "(%lu/%lu)M",
+		seq_printf(seq, "(%lu/%lu)M\n\t",
 			    (unsigned long) Bit2KB(rs_left >> 10),
 			    (unsigned long) Bit2KB(mdev->rs_total >> 10));
 	else
-		seq_printf(seq, "(%lu/%lu)K",
+		seq_printf(seq, "(%lu/%lu)K\n\t",
 			    (unsigned long) Bit2KB(rs_left),
 			    (unsigned long) Bit2KB(mdev->rs_total));
 
-	if (mdev->state.conn == C_SYNC_TARGET)
-		seq_printf(seq, " queue_delay: %d.%d ms\n\t",
-			   mdev->data_delay / 1000,
-			   (mdev->data_delay % 1000) / 100);
-	else if (mdev->state.conn == C_SYNC_SOURCE)
-		seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq);
-
 	/* see drivers/md/md.c
 	 * We do not want to overflow, so the order of operands and
 	 * the * 100 / 100 trick are important. We do a +1 to be
@@ -135,14 +128,6 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	else
 		seq_printf(seq, " (%ld)", dbdt);
 
-	if (mdev->state.conn == C_SYNC_TARGET) {
-		if (mdev->c_sync_rate > 1000)
-			seq_printf(seq, " want: %d,%03d",
-				   mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
-		else
-			seq_printf(seq, " want: %d", mdev->c_sync_rate);
-	}
-
 	seq_printf(seq, " K/sec\n");
 }
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index cba1deb7b271..20abef531c99 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3555,14 +3555,15 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
 	return ok;
 }
 
-static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
+static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
 {
 	/* TODO zero copy sink :) */
 	static char sink[128];
 	int size, want, r;
 
-	dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
-	     h->command, h->length);
+	if (!silent)
+		dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+		     h->command, h->length);
 
 	size = h->length;
 	while (size > 0) {
@@ -3574,101 +3575,25 @@ static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
 	return size == 0;
 }
 
-static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
-{
-	if (mdev->state.disk >= D_INCONSISTENT)
-		drbd_kick_lo(mdev);
-
-	/* Make sure we've acked all the TCP data associated
-	 * with the data requests being unplugged */
-	drbd_tcp_quickack(mdev->data.socket);
-
-	return TRUE;
-}
-
-static void timeval_sub_us(struct timeval* tv, unsigned int us)
+static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
 {
-	tv->tv_sec -= us / 1000000;
-	us = us % 1000000;
-	if (tv->tv_usec > us) {
-		tv->tv_usec += 1000000;
-		tv->tv_sec--;
-	}
-	tv->tv_usec -= us;
+	return receive_skip_(mdev, h, 0);
 }
 
-static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p)
+static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
 {
-	struct delay_probe *dp;
-	struct list_head *le;
-	struct timeval now;
-	int seq_num;
-	int offset;
-	int data_delay;
-
-	seq_num = be32_to_cpu(p->seq_num);
-	offset  = be32_to_cpu(p->offset);
-
-	spin_lock(&mdev->peer_seq_lock);
-	if (!list_empty(&mdev->delay_probes)) {
-		if (from == USE_DATA_SOCKET)
-			le = mdev->delay_probes.next;
-		else
-			le = mdev->delay_probes.prev;
-
-		dp = list_entry(le, struct delay_probe, list);
-
-		if (dp->seq_num == seq_num) {
-			list_del(le);
-			spin_unlock(&mdev->peer_seq_lock);
-			do_gettimeofday(&now);
-			timeval_sub_us(&now, offset);
-			data_delay =
-				now.tv_usec - dp->time.tv_usec +
-				(now.tv_sec - dp->time.tv_sec) * 1000000;
-
-			if (data_delay > 0)
-				mdev->data_delay = data_delay;
-
-			kfree(dp);
-			return;
-		}
-
-		if (dp->seq_num > seq_num) {
-			spin_unlock(&mdev->peer_seq_lock);
-			dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n");
-			return; /* Do not alloca a struct delay_probe.... */
-		}
-	}
-	spin_unlock(&mdev->peer_seq_lock);
-
-	dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO);
-	if (!dp) {
-		dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n");
-		return;
-	}
-
-	dp->seq_num = seq_num;
-	do_gettimeofday(&dp->time);
-	timeval_sub_us(&dp->time, offset);
-
-	spin_lock(&mdev->peer_seq_lock);
-	if (from == USE_DATA_SOCKET)
-		list_add(&dp->list, &mdev->delay_probes);
-	else
-		list_add_tail(&dp->list, &mdev->delay_probes);
-	spin_unlock(&mdev->peer_seq_lock);
+	return receive_skip_(mdev, h, 1);
 }
 
-static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h)
+static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
 {
-	struct p_delay_probe *p = (struct p_delay_probe *)h;
+	if (mdev->state.disk >= D_INCONSISTENT)
+		drbd_kick_lo(mdev);
 
-	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-	if (drbd_recv(mdev, h->payload, h->length) != h->length)
-		return FALSE;
+	/* Make sure we've acked all the TCP data associated
+	 * with the data requests being unplugged */
+	drbd_tcp_quickack(mdev->data.socket);
 
-	got_delay_probe(mdev, USE_DATA_SOCKET, p);
 	return TRUE;
 }
 
@@ -3695,7 +3620,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = {
 	[P_OV_REQUEST]      = receive_DataRequest,
 	[P_OV_REPLY]        = receive_DataRequest,
 	[P_CSUM_RS_REQUEST]    = receive_DataRequest,
-	[P_DELAY_PROBE]     = receive_delay_probe,
+	[P_DELAY_PROBE]     = receive_skip_silent,
 	/* anything missing from this table is in
 	 * the asender_tbl, see get_asender_cmd */
 	[P_MAX_CMD]	    = NULL,
@@ -4472,11 +4397,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h)
+static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
 {
-	struct p_delay_probe *p = (struct p_delay_probe *)h;
-
-	got_delay_probe(mdev, USE_META_SOCKET, p);
+	/* IGNORE */
 	return TRUE;
 }
 
@@ -4504,7 +4427,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
 	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
 	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
 	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
-	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe), got_delay_probe_m },
+	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
 	[P_MAX_CMD]	    = { 0, NULL },
 	};
 	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index b623ceee2a4a..ca4a16cea2d8 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -424,18 +424,6 @@ void resync_timer_fn(unsigned long data)
 		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
 }
 
-static int calc_resync_rate(struct drbd_conf *mdev)
-{
-	int d = mdev->data_delay / 1000; /* us -> ms */
-	int td = mdev->sync_conf.throttle_th * 100;  /* 0.1s -> ms */
-	int hd = mdev->sync_conf.hold_off_th * 100;  /* 0.1s -> ms */
-	int cr = mdev->sync_conf.rate;
-
-	return d <= td ? cr :
-		d >= hd ? 0 :
-		cr + (cr * (td - d) / (hd - td));
-}
-
 int w_make_resync_request(struct drbd_conf *mdev,
 		struct drbd_work *w, int cancel)
 {
@@ -473,8 +461,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
 	max_segment_size = mdev->agreed_pro_version < 94 ?
 		queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
 
-	mdev->c_sync_rate = calc_resync_rate(mdev);
-	number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
+	number = SLEEP_TIME * mdev->sync_conf.rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
 	pe = atomic_read(&mdev->rs_pending_cnt);
 
 	mutex_lock(&mdev->data.mutex);
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index b8d2516668aa..479ee3a1d901 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,7 +53,7 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.8"
+#define REL_VERSION "8.3.8.1"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 94
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index ce77a746fc9d..5f042810a56c 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -78,10 +78,11 @@ NL_PACKET(syncer_conf, 8,
 	NL_INTEGER(	30,	T_MAY_IGNORE,	rate)
 	NL_INTEGER(	31,	T_MAY_IGNORE,	after)
 	NL_INTEGER(	32,	T_MAY_IGNORE,	al_extents)
-	NL_INTEGER(     71,	T_MAY_IGNORE,	dp_volume)
-	NL_INTEGER(     72,	T_MAY_IGNORE,	dp_interval)
-	NL_INTEGER(     73,	T_MAY_IGNORE,	throttle_th)
-	NL_INTEGER(     74,	T_MAY_IGNORE,	hold_off_th)
+/*	NL_INTEGER(     71,	T_MAY_IGNORE,	dp_volume)
+ *	NL_INTEGER(     72,	T_MAY_IGNORE,	dp_interval)
+ *	NL_INTEGER(     73,	T_MAY_IGNORE,	throttle_th)
+ *	NL_INTEGER(     74,	T_MAY_IGNORE,	hold_off_th)
+ * feature will be reimplemented differently with 8.3.9 */
 	NL_STRING(      52,     T_MAY_IGNORE,   verify_alg,     SHARED_SECRET_MAX)
 	NL_STRING(      51,     T_MAY_IGNORE,   cpu_mask,       32)
 	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)
-- 
cgit v1.2.3-59-g8ed1b


From 387ac089361fbe5ef287e6950c5c40f6b18e5c55 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 5 Aug 2010 08:34:13 +0200
Subject: block: fix missing export of blk_types.h

Stephen reports:

  After merging the block tree, today's linux-next build (x86_64
  allmodconfig) failed like this:

  usr/include/linux/fs.h:11: included file 'linux/blk_types.h' is not exported

  Caused by commit 9d3dbbcd9a84518ff5e32ffe671d06a48cf84fd9 ("bio, fs:
  separate out bio_types.h and define READ/WRITE constants in terms of
  BIO_RW_* flags").

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/Kbuild | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 2fc8e14cc24a..671715b869fc 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -39,6 +39,7 @@ header-y += ax25.h
 header-y += b1lli.h
 header-y += baycom.h
 header-y += bfs_fs.h
+header-y += blk_types.h
 header-y += blkpg.h
 header-y += bpqether.h
 header-y += bsg.h
-- 
cgit v1.2.3-59-g8ed1b


From 78ef7fab0eb0a5b159842bac89aed74bb0aa7bfe Mon Sep 17 00:00:00 2001
From: Barry Song <21cnbao@gmail.com>
Date: Fri, 15 Jan 2010 15:50:14 +0800
Subject: mtd-physmap: add support users can assign the probe type in board
 files

There are three reasons to add this support:
1. users probably know the interface type of their flashs, then probe
can be faster if they give the right type in platform data since wrong
types will not be detected.
2. sometimes, detecting can cause destory to system. For example, for
kernel XIP, detecting can cause NOR enter a mode instructions can not
be fetched right, which will make kernel crash.
3. For a new probe which is not listed in the rom_probe_types, if users
assign it in board files, physmap can still probe it.

Signed-off-by: Barry Song <21cnbao@gmail.com>
Signed-off-by: Mike Frysinger <vapier.adi@gmail.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/maps/physmap.c  | 8 ++++++--
 include/linux/mtd/physmap.h | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/maps/physmap.c b/drivers/mtd/maps/physmap.c
index 829aa4bee54f..4c18b98a3110 100644
--- a/drivers/mtd/maps/physmap.c
+++ b/drivers/mtd/maps/physmap.c
@@ -136,8 +136,12 @@ static int physmap_flash_probe(struct platform_device *dev)
 		simple_map_init(&info->map[i]);
 
 		probe_type = rom_probe_types;
-		for (; info->mtd[i] == NULL && *probe_type != NULL; probe_type++)
-			info->mtd[i] = do_map_probe(*probe_type, &info->map[i]);
+		if (physmap_data->probe_type == NULL) {
+			for (; info->mtd[i] == NULL && *probe_type != NULL; probe_type++)
+				info->mtd[i] = do_map_probe(*probe_type, &info->map[i]);
+		} else
+			info->mtd[i] = do_map_probe(physmap_data->probe_type, &info->map[i]);
+
 		if (info->mtd[i] == NULL) {
 			dev_err(&dev->dev, "map_probe failed\n");
 			err = -ENXIO;
diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h
index 76f7cabf07d3..bcfd9f777454 100644
--- a/include/linux/mtd/physmap.h
+++ b/include/linux/mtd/physmap.h
@@ -25,6 +25,7 @@ struct physmap_flash_data {
 	void			(*set_vpp)(struct map_info *, int);
 	unsigned int		nr_parts;
 	unsigned int		pfow_base;
+	char                    *probe_type;
 	struct mtd_partition	*parts;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 6088c0587706b2cf21ce50c11576718bff5fae0c Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 8 Aug 2010 14:15:22 +0100
Subject: jffs2: Update copyright notices

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/background.c  | 1 +
 fs/jffs2/build.c       | 1 +
 fs/jffs2/compr.c       | 5 +++--
 fs/jffs2/compr.h       | 1 +
 fs/jffs2/compr_lzo.c   | 1 +
 fs/jffs2/compr_rtime.c | 1 +
 fs/jffs2/compr_rubin.c | 1 +
 fs/jffs2/compr_zlib.c  | 1 +
 fs/jffs2/debug.c       | 1 +
 fs/jffs2/debug.h       | 1 +
 fs/jffs2/dir.c         | 1 +
 fs/jffs2/erase.c       | 1 +
 fs/jffs2/file.c        | 1 +
 fs/jffs2/fs.c          | 1 +
 fs/jffs2/gc.c          | 1 +
 fs/jffs2/ioctl.c       | 1 +
 fs/jffs2/jffs2_fs_i.h  | 1 +
 fs/jffs2/jffs2_fs_sb.h | 1 +
 include/linux/jffs2.h  | 3 ++-
 19 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 55f1dde2fa8b..404111b016c9 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index c5e1450d79f9..a906f538d11c 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f0294410868d..617a1e5694c1 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -2,11 +2,12 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
- * Created by Arjan van de Ven <arjanv@redhat.com>
- *
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
  *		    University of Szeged, Hungary
  *
+ * Created by Arjan van de Ven <arjan@infradead.org>
+ *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
  */
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 7d1d72faa774..e471a9106fd9 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -3,6 +3,7 @@
  *
  * Copyright © 2004   Ferenc Havasi <havasi@inf.u-szeged.hu>,
  *		      University of Szeged, Hungary
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index cd02acafde8a..ed25ae7c98eb 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2007 Nokia Corporation. All rights reserved.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by Richard Purdie <rpurdie@openedhand.com>
  *
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 546d1538d076..9696ad9ef5f7 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by Arjan van de Ven <arjanv@redhat.com>
  *
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 170d289ac785..a12b4f763373 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by Arjan van de Ven <arjanv@redhat.com>
  *
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index b46661a42758..97fc45de6f81 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index ec3538413926..e0b76c87a91a 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index a113ecc3bafe..c4f8eef5ca68 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 166062a68230..1aaf98df49d0 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 6286ad9b00f7..abac961f617b 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..52749127aa6c 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 26037e2d6154..29d0970f9da3 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index f5e96bd656e8..846a79452497 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index 9d41f43e47bb..859a598af020 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index c6923da98263..2e4a86763c07 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 85ef6dbb1be7..6784bc89add1 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -2,6 +2,7 @@
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
  * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/include/linux/jffs2.h b/include/linux/jffs2.h
index 0874ab59ffef..b9898894ec8d 100644
--- a/include/linux/jffs2.h
+++ b/include/linux/jffs2.h
@@ -1,7 +1,8 @@
 /*
  * JFFS2 -- Journalling Flash File System, Version 2.
  *
- * Copyright (C) 2001-2003 Red Hat, Inc.
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * Created by David Woodhouse <dwmw2@infradead.org>
  *
-- 
cgit v1.2.3-59-g8ed1b


From 9f2cc6f759ca0b072107c171a3b5cd79c7ea5de3 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hsweeten@visionengravers.com>
Date: Fri, 21 May 2010 18:38:52 -0500
Subject: watchdog: wdt_pci.c: move ids to pci_ids.h

Move the VENDOR/DEVICE ids to pci_ids.h.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 drivers/watchdog/wdt_pci.c | 15 +--------------
 include/linux/pci_ids.h    |  3 +++
 2 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/watchdog/wdt_pci.c b/drivers/watchdog/wdt_pci.c
index 7b22e3cdbc81..6130c88fa5ac 100644
--- a/drivers/watchdog/wdt_pci.c
+++ b/drivers/watchdog/wdt_pci.c
@@ -60,19 +60,6 @@
 
 #define PFX "wdt_pci: "
 
-/*
- * Until Access I/O gets their application for a PCI vendor ID approved,
- * I don't think that it's appropriate to move these constants into the
- * regular pci_ids.h file. -- JPN 2000/01/18
- */
-
-#ifndef PCI_VENDOR_ID_ACCESSIO
-#define PCI_VENDOR_ID_ACCESSIO 0x494f
-#endif
-#ifndef PCI_DEVICE_ID_WDG_CSM
-#define PCI_DEVICE_ID_WDG_CSM 0x22c0
-#endif
-
 /* We can only use 1 card due to the /dev/watchdog restriction */
 static int dev_count;
 
@@ -743,7 +730,7 @@ static void __devexit wdtpci_remove_one(struct pci_dev *pdev)
 static struct pci_device_id wdtpci_pci_tbl[] = {
 	{
 		.vendor	   = PCI_VENDOR_ID_ACCESSIO,
-		.device	   = PCI_DEVICE_ID_WDG_CSM,
+		.device	   = PCI_DEVICE_ID_ACCESSIO_WDG_CSM,
 		.subvendor = PCI_ANY_ID,
 		.subdevice = PCI_ANY_ID,
 	},
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c81eec4d3c35..f6a3b2d36cad 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2372,6 +2372,9 @@
 #define PCI_VENDOR_ID_AKS		0x416c
 #define PCI_DEVICE_ID_AKS_ALADDINCARD	0x0100
 
+#define PCI_VENDOR_ID_ACCESSIO		0x494f
+#define PCI_DEVICE_ID_ACCESSIO_WDG_CSM	0x22c0
+
 #define PCI_VENDOR_ID_S3		0x5333
 #define PCI_DEVICE_ID_S3_TRIO		0x8811
 #define PCI_DEVICE_ID_S3_868		0x8880
-- 
cgit v1.2.3-59-g8ed1b


From a1452a3771c4eb85bd779790b040efdc36f4274e Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 8 Aug 2010 20:58:20 +0100
Subject: mtd: Update copyright notices

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 drivers/mtd/afs.c              |  2 +-
 drivers/mtd/cmdlinepart.c      | 17 ++++++++++++++++-
 drivers/mtd/ftl.c              |  2 +-
 drivers/mtd/inftlcore.c        |  6 +++---
 drivers/mtd/inftlmount.c       |  4 ++--
 drivers/mtd/mtd_blkdevs.c      | 18 ++++++++++++++++--
 drivers/mtd/mtdblock.c         | 19 +++++++++++++++++--
 drivers/mtd/mtdblock_ro.c      | 19 +++++++++++++++++--
 drivers/mtd/mtdchar.c          | 16 +++++++++++++++-
 drivers/mtd/mtdconcat.c        | 18 ++++++++++++++++--
 drivers/mtd/mtdcore.c          | 20 +++++++++++++++++---
 drivers/mtd/mtdoops.c          |  2 +-
 drivers/mtd/mtdpart.c          | 20 ++++++++++++++++----
 drivers/mtd/mtdsuper.c         |  2 ++
 drivers/mtd/nftlcore.c         | 25 ++++++++++++++++++-------
 drivers/mtd/nftlmount.c        |  3 ++-
 drivers/mtd/ofpart.c           |  4 ++--
 drivers/mtd/redboot.c          | 18 ++++++++++++++++++
 drivers/mtd/rfd_ftl.c          |  2 +-
 drivers/mtd/ssfdc.c            |  2 +-
 include/linux/mtd/bbm.h        | 18 ++++++++++++++++--
 include/linux/mtd/blktrans.h   | 16 ++++++++++++++--
 include/linux/mtd/cfi.h        | 20 +++++++++++++++++---
 include/linux/mtd/cfi_endian.h | 19 +++++++++++++++++++
 include/linux/mtd/concat.h     | 17 +++++++++++++++--
 include/linux/mtd/doc2000.h    | 23 ++++++++++++++++++-----
 include/linux/mtd/flashchip.h  | 19 +++++++++++++++----
 include/linux/mtd/gen_probe.h  | 19 +++++++++++++++++--
 include/linux/mtd/map.h        | 18 ++++++++++++++++++
 include/linux/mtd/mtd.h        | 17 +++++++++++++++--
 include/linux/mtd/nand.h       |  6 +++---
 include/linux/mtd/nand_ecc.h   |  4 +++-
 include/linux/mtd/nftl.h       | 17 ++++++++++++++++-
 include/mtd/mtd-abi.h          | 17 ++++++++++++++++-
 include/mtd/mtd-user.h         | 17 ++++++++++++++++-
 include/mtd/nftl-user.h        | 16 +++++++++++++++-
 include/mtd/ubi-user.h         |  2 +-
 37 files changed, 416 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/afs.c b/drivers/mtd/afs.c
index cec7ab98b2a9..302372c08b56 100644
--- a/drivers/mtd/afs.c
+++ b/drivers/mtd/afs.c
@@ -2,7 +2,7 @@
 
     drivers/mtd/afs.c: ARM Flash Layout/Partitioning
 
-    Copyright (C) 2000 ARM Limited
+    Copyright © 2000 ARM Limited
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
diff --git a/drivers/mtd/cmdlinepart.c b/drivers/mtd/cmdlinepart.c
index 1479da6d3aa6..e790f38893b0 100644
--- a/drivers/mtd/cmdlinepart.c
+++ b/drivers/mtd/cmdlinepart.c
@@ -1,7 +1,22 @@
 /*
  * Read flash partition table from command line
  *
- * Copyright 2002 SYSGO Real-Time Solutions GmbH
+ * Copyright © 2002      SYSGO Real-Time Solutions GmbH
+ * Copyright © 2002-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
  * The format for the command line is as follows:
  *
diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c
index 62da9eb7032b..4d6a64c387ec 100644
--- a/drivers/mtd/ftl.c
+++ b/drivers/mtd/ftl.c
@@ -26,7 +26,7 @@
 
     The initial developer of the original code is David A. Hinds
     <dahinds@users.sourceforge.net>.  Portions created by David A. Hinds
-    are Copyright (C) 1999 David A. Hinds.  All Rights Reserved.
+    are Copyright © 1999 David A. Hinds.  All Rights Reserved.
 
     Alternatively, the contents of this file may be used under the
     terms of the GNU General Public License version 2 (the "GPL"), in
diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c
index 015a7fe1b6ee..d7592e67d048 100644
--- a/drivers/mtd/inftlcore.c
+++ b/drivers/mtd/inftlcore.c
@@ -1,11 +1,11 @@
 /*
  * inftlcore.c -- Linux driver for Inverse Flash Translation Layer (INFTL)
  *
- * (C) Copyright 2002, Greg Ungerer (gerg@snapgear.com)
+ * Copyright © 2002, Greg Ungerer (gerg@snapgear.com)
  *
  * Based heavily on the nftlcore.c code which is:
- * (c) 1999 Machine Vision Holdings, Inc.
- * Author: David Woodhouse <dwmw2@infradead.org>
+ * Copyright © 1999 Machine Vision Holdings, Inc.
+ * Copyright © 1999 David Woodhouse <dwmw2@infradead.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c
index 8f988d7d3c5c..7772d2618d9f 100644
--- a/drivers/mtd/inftlmount.c
+++ b/drivers/mtd/inftlmount.c
@@ -2,11 +2,11 @@
  * inftlmount.c -- INFTL mount code with extensive checks.
  *
  * Author: Greg Ungerer (gerg@snapgear.com)
- * (C) Copyright 2002-2003, Greg Ungerer (gerg@snapgear.com)
+ * Copyright © 2002-2003, Greg Ungerer (gerg@snapgear.com)
  *
  * Based heavily on the nftlmount.c code which is:
  * Author: Fabrice Bellard (fabrice.bellard@netgem.com)
- * Copyright (C) 2000 Netgem S.A.
+ * Copyright © 2000 Netgem S.A.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 89e07e5af577..1d2144d77470 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -1,7 +1,21 @@
 /*
- * (C) 2003 David Woodhouse <dwmw2@infradead.org>
+ * Interface to Linux block layer for MTD 'translation layers'.
  *
- * Interface to Linux 2.5 block layer for MTD 'translation layers'.
+ * Copyright © 2003-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
  */
 
diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c
index e6edbec609fd..1e74ad961040 100644
--- a/drivers/mtd/mtdblock.c
+++ b/drivers/mtd/mtdblock.c
@@ -1,8 +1,23 @@
 /*
  * Direct MTD block device access
  *
- * (C) 2000-2003 Nicolas Pitre <nico@fluxnic.net>
- * (C) 1999-2003 David Woodhouse <dwmw2@infradead.org>
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
+ * Copyright © 2000-2003 Nicolas Pitre <nico@fluxnic.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #include <linux/fs.h>
diff --git a/drivers/mtd/mtdblock_ro.c b/drivers/mtd/mtdblock_ro.c
index d0d3f79f9d03..795a8c0a05b8 100644
--- a/drivers/mtd/mtdblock_ro.c
+++ b/drivers/mtd/mtdblock_ro.c
@@ -1,7 +1,22 @@
 /*
- * (C) 2003 David Woodhouse <dwmw2@infradead.org>
- *
  * Simple read-only (writable only for RAM) mtdblock driver
+ *
+ * Copyright © 2001-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #include <linux/init.h>
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index c27e65e3e291..f6d76f1adca0 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -1,5 +1,19 @@
 /*
- * Character-device access to raw MTD devices.
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
  */
 
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 4567bc373780..bf8de0943103 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -1,11 +1,25 @@
 /*
  * MTD device concatenation layer
  *
- * (C) 2002 Robert Kaiser <rkaiser@sysgo.de>
+ * Copyright © 2002 Robert Kaiser <rkaiser@sysgo.de>
+ * Copyright © 2002-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * NAND support by Christian Gan <cgan@iders.ca>
  *
- * This code is GPL
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index a1b8b70d2d0a..93a11f3def44 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -2,9 +2,23 @@
  * Core registration and callback routines for MTD
  * drivers and users.
  *
- * bdi bits are:
- * Copyright © 2006 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
+ * Copyright © 2006      Red Hat UK Limited 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #include <linux/module.h>
diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c
index 328313c3dccb..1ee72f3f0512 100644
--- a/drivers/mtd/mtdoops.c
+++ b/drivers/mtd/mtdoops.c
@@ -1,7 +1,7 @@
 /*
  * MTD Oops/Panic logger
  *
- * Copyright (C) 2007 Nokia Corporation. All rights reserved.
+ * Copyright © 2007 Nokia Corporation. All rights reserved.
  *
  * Author: Richard Purdie <rpurdie@openedhand.com>
  *
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 4c539ded0b70..fc5507410278 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -1,12 +1,24 @@
 /*
  * Simple MTD partitioning layer
  *
- * (C) 2000 Nicolas Pitre <nico@fluxnic.net>
+ * Copyright © 2000 Nicolas Pitre <nico@fluxnic.net>
+ * Copyright © 2002 Thomas Gleixner <gleixner@linutronix.de>
+ * Copyright © 2000-2010 David Woodhouse <dwmw2@infradead.org>
  *
- * This code is GPL
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
- * 	02-21-2002	Thomas Gleixner <gleixner@autronix.de>
- *			added support for read_oob, write_oob
  */
 
 #include <linux/module.h>
diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index bd9a443ccf69..38e2ab07e7a3 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -1,6 +1,8 @@
 /* MTD-based superblock management
  *
  * Copyright © 2001-2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright © 2001-2010 David Woodhouse <dwmw2@infradead.org>
+ *
  * Written by:  David Howells <dhowells@redhat.com>
  *              David Woodhouse <dwmw2@infradead.org>
  *
diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c
index a4578bf903aa..b155666acfbe 100644
--- a/drivers/mtd/nftlcore.c
+++ b/drivers/mtd/nftlcore.c
@@ -1,11 +1,22 @@
-/* Linux driver for NAND Flash Translation Layer      */
-/* (c) 1999 Machine Vision Holdings, Inc.             */
-/* Author: David Woodhouse <dwmw2@infradead.org>      */
-
 /*
-  The contents of this file are distributed under the GNU General
-  Public License version 2. The author places no additional
-  restrictions of any kind on it.
+ * Linux driver for NAND Flash Translation Layer
+ *
+ * Copyright © 1999 Machine Vision Holdings, Inc.
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #define PRERELEASE
diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c
index 8b22b1836e9f..e3cd1ffad2f6 100644
--- a/drivers/mtd/nftlmount.c
+++ b/drivers/mtd/nftlmount.c
@@ -2,7 +2,8 @@
  * NFTL mount code with extensive checks
  *
  * Author: Fabrice Bellard (fabrice.bellard@netgem.com)
- * Copyright (C) 2000 Netgem S.A.
+ * Copyright © 2000 Netgem S.A.
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/drivers/mtd/ofpart.c b/drivers/mtd/ofpart.c
index 4f0d635674f3..8bf7dc6d1ce6 100644
--- a/drivers/mtd/ofpart.c
+++ b/drivers/mtd/ofpart.c
@@ -1,11 +1,11 @@
 /*
  * Flash partitions described by the OF (or flattened) device tree
  *
- * Copyright (C) 2006 MontaVista Software Inc.
+ * Copyright © 2006 MontaVista Software Inc.
  * Author: Vitaly Wool <vwool@ru.mvista.com>
  *
  * Revised to handle newer style flash binding by:
- *   Copyright (C) 2007 David Gibson, IBM Corporation.
+ *   Copyright © 2007 David Gibson, IBM Corporation.
  *
  * This program is free software; you can redistribute  it and/or modify it
  * under  the terms of  the GNU General  Public License as published by the
diff --git a/drivers/mtd/redboot.c b/drivers/mtd/redboot.c
index 2d600a1bf2aa..7a87d07cd79f 100644
--- a/drivers/mtd/redboot.c
+++ b/drivers/mtd/redboot.c
@@ -1,6 +1,24 @@
 /*
  * Parse RedBoot-style Flash Image System (FIS) tables and
  * produce a Linux partition array to match.
+ *
+ * Copyright © 2001      Red Hat UK Limited
+ * Copyright © 2001-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c
index 63b83c0d9a13..cc4d1805b864 100644
--- a/drivers/mtd/rfd_ftl.c
+++ b/drivers/mtd/rfd_ftl.c
@@ -1,7 +1,7 @@
 /*
  * rfd_ftl.c -- resident flash disk (flash translation layer)
  *
- * Copyright (C) 2005  Sean Young <sean@mess.org>
+ * Copyright © 2005  Sean Young <sean@mess.org>
  *
  * This type of flash translation layer (FTL) is used by the Embedded BIOS
  * by General Software. It is known as the Resident Flash Disk (RFD), see:
diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c
index 81c4ecdc11f5..5cd189793332 100644
--- a/drivers/mtd/ssfdc.c
+++ b/drivers/mtd/ssfdc.c
@@ -1,6 +1,6 @@
 /*
  * Linux driver for SSFDC Flash Translation Layer (Read only)
- * (c) 2005 Eptar srl
+ * © 2005 Eptar srl
  * Author: Claudio Lanconelli <lanconelli.claudio@eptar.com>
  *
  * Based on NTFL and MTDBLOCK_RO drivers
diff --git a/include/linux/mtd/bbm.h b/include/linux/mtd/bbm.h
index a04b962492a8..7fa20beb2ab9 100644
--- a/include/linux/mtd/bbm.h
+++ b/include/linux/mtd/bbm.h
@@ -4,12 +4,26 @@
  *  NAND family Bad Block Management (BBM) header file
  *    - Bad Block Table (BBT) implementation
  *
- *  Copyright (c) 2005 Samsung Electronics
+ *  Copyright © 2005 Samsung Electronics
  *  Kyungmin Park <kyungmin.park@samsung.com>
  *
- *  Copyright (c) 2000-2005
+ *  Copyright © 2000-2005
  *  Thomas Gleixner <tglx@linuxtronix.de>
  *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 #ifndef __LINUX_MTD_BBM_H
 #define __LINUX_MTD_BBM_H
diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h
index b481ccd7ff3c..26529ebd59cc 100644
--- a/include/linux/mtd/blktrans.h
+++ b/include/linux/mtd/blktrans.h
@@ -1,7 +1,19 @@
 /*
- * (C) 2003 David Woodhouse <dwmw2@infradead.org>
+ * Copyright © 2003-2010 David Woodhouse <dwmw2@infradead.org>
  *
- * Interface to Linux block layer for MTD 'translation layers'.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
  */
 
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index 574d9ee066f1..d2118b0eac9a 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -1,6 +1,20 @@
-
-/* Common Flash Interface structures
- * See http://support.intel.com/design/flash/technote/index.htm
+/*
+ * Copyright © 2000-2010 David Woodhouse <dwmw2@infradead.org> et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #ifndef __MTD_CFI_H__
diff --git a/include/linux/mtd/cfi_endian.h b/include/linux/mtd/cfi_endian.h
index d802f7736be3..51cc3f5917a8 100644
--- a/include/linux/mtd/cfi_endian.h
+++ b/include/linux/mtd/cfi_endian.h
@@ -1,3 +1,22 @@
+/*
+ * Copyright © 2001-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
 #include <asm/byteorder.h>
 
 #ifndef CONFIG_MTD_CFI_ADV_OPTIONS
diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h
index e80c674daeb3..ccdbe93a909c 100644
--- a/include/linux/mtd/concat.h
+++ b/include/linux/mtd/concat.h
@@ -1,9 +1,22 @@
 /*
  * MTD device concatenation layer definitions
  *
- * (C) 2002 Robert Kaiser <rkaiser@sysgo.de>
+ * Copyright © 2002      Robert Kaiser <rkaiser@sysgo.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
- * This code is GPL
  */
 
 #ifndef MTD_CONCAT_H
diff --git a/include/linux/mtd/doc2000.h b/include/linux/mtd/doc2000.h
index 0a6d516ab71d..0f6fea73a1f6 100644
--- a/include/linux/mtd/doc2000.h
+++ b/include/linux/mtd/doc2000.h
@@ -1,12 +1,25 @@
 /*
  * Linux driver for Disk-On-Chip devices
  *
- * Copyright (C) 1999 Machine Vision Holdings, Inc.
- * Copyright (C) 2001-2003 David Woodhouse <dwmw2@infradead.org>
- * Copyright (C) 2002-2003 Greg Ungerer <gerg@snapgear.com>
- * Copyright (C) 2002-2003 SnapGear Inc
+ * Copyright © 1999 Machine Vision Holdings, Inc.
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
+ * Copyright © 2002-2003 Greg Ungerer <gerg@snapgear.com>
+ * Copyright © 2002-2003 SnapGear Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
- * Released under GPL
  */
 
 #ifndef __MTD_DOC2000_H__
diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h
index 23cc10f8e343..b63fa457febd 100644
--- a/include/linux/mtd/flashchip.h
+++ b/include/linux/mtd/flashchip.h
@@ -1,10 +1,21 @@
-
 /*
- * struct flchip definition
+ * Copyright © 2000      Red Hat UK Limited
+ * Copyright © 2000-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  *
- * Contains information about the location and state of a given flash device
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
- * (C) 2000 Red Hat. GPLd.
  */
 
 #ifndef __MTD_FLASHCHIP_H__
diff --git a/include/linux/mtd/gen_probe.h b/include/linux/mtd/gen_probe.h
index df362ddf2949..2c456054fded 100644
--- a/include/linux/mtd/gen_probe.h
+++ b/include/linux/mtd/gen_probe.h
@@ -1,6 +1,21 @@
 /*
- * (C) 2001, 2001 Red Hat, Inc.
- * GPL'd
+ * Copyright © 2001      Red Hat UK Limited
+ * Copyright © 2001-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #ifndef __LINUX_MTD_GEN_PROBE_H__
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index de89eca864ce..eea3a4fb7405 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright © 2000-2010 David Woodhouse <dwmw2@infradead.org> et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
 
 /* Overhauled routines for dealing with different mmap regions of flash */
 
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 43b7d72c6116..eae914e97f33 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -1,7 +1,20 @@
 /*
- * Copyright (C) 1999-2003 David Woodhouse <dwmw2@infradead.org> et al.
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org> et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
- * Released under GPL
  */
 
 #ifndef __MTD_MTD_H__
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 50f3aa00a452..102e12c58cb3 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -1,9 +1,9 @@
 /*
  *  linux/include/linux/mtd/nand.h
  *
- *  Copyright (c) 2000 David Woodhouse <dwmw2@infradead.org>
- *                     Steven J. Hill <sjhill@realitydiluted.com>
- *		       Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright © 2000-2010 David Woodhouse <dwmw2@infradead.org>
+ *                        Steven J. Hill <sjhill@realitydiluted.com>
+ *		          Thomas Gleixner <tglx@linutronix.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/mtd/nand_ecc.h b/include/linux/mtd/nand_ecc.h
index 41bc013571d0..4d8406c81652 100644
--- a/include/linux/mtd/nand_ecc.h
+++ b/include/linux/mtd/nand_ecc.h
@@ -1,7 +1,9 @@
 /*
  *  drivers/mtd/nand_ecc.h
  *
- *  Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com)
+ *  Copyright (C) 2000-2010 Steven J. Hill <sjhill@realitydiluted.com>
+ *			    David Woodhouse <dwmw2@infradead.org>
+ *			    Thomas Gleixner <tglx@linutronix.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
diff --git a/include/linux/mtd/nftl.h b/include/linux/mtd/nftl.h
index dcaf611ed748..b059629e22bc 100644
--- a/include/linux/mtd/nftl.h
+++ b/include/linux/mtd/nftl.h
@@ -1,5 +1,20 @@
 /*
- * (C) 1999-2003 David Woodhouse <dwmw2@infradead.org>
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #ifndef __MTD_NFTL_H__
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index e12872e3c694..4debb4514634 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -1,5 +1,20 @@
 /*
- * Portions of MTD ABI definition which are shared by kernel and user space
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org> et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #ifndef __MTD_ABI_H__
diff --git a/include/mtd/mtd-user.h b/include/mtd/mtd-user.h
index 170ceca3b2d0..aa3c2f86a913 100644
--- a/include/mtd/mtd-user.h
+++ b/include/mtd/mtd-user.h
@@ -1,5 +1,20 @@
 /*
- * MTD ABI header for use by user space only.
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
  */
 
 #ifndef __MTD_USER_H__
diff --git a/include/mtd/nftl-user.h b/include/mtd/nftl-user.h
index 98e9e57f22de..bdeabd86ad99 100644
--- a/include/mtd/nftl-user.h
+++ b/include/mtd/nftl-user.h
@@ -1,5 +1,19 @@
 /*
- * Parts of NFTL headers shared with userspace
+ * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
  */
 
diff --git a/include/mtd/ubi-user.h b/include/mtd/ubi-user.h
index 466a8320f1e6..c0d47ad4b103 100644
--- a/include/mtd/ubi-user.h
+++ b/include/mtd/ubi-user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) International Business Machines Corp., 2006
+ * Copyright © International Business Machines Corp., 2006
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
-- 
cgit v1.2.3-59-g8ed1b


From 6ae0185fe201eae0548dace2a84acb5050fc8606 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 8 Aug 2010 21:19:42 +0100
Subject: mtd: Remove obsolete <mtd/compatmac.h> include

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/mtd/chips/cfi_cmdset_0001.c |  1 -
 drivers/mtd/chips/cfi_cmdset_0002.c |  1 -
 drivers/mtd/chips/cfi_cmdset_0020.c |  1 -
 drivers/mtd/chips/cfi_util.c        |  1 -
 drivers/mtd/chips/chipreg.c         |  1 -
 drivers/mtd/chips/map_absent.c      |  1 -
 drivers/mtd/chips/map_ram.c         |  1 -
 drivers/mtd/chips/map_rom.c         |  1 -
 drivers/mtd/devices/docecc.c        |  1 -
 drivers/mtd/devices/docprobe.c      |  1 -
 drivers/mtd/devices/mtdram.c        |  1 -
 drivers/mtd/devices/pmc551.c        |  1 -
 drivers/mtd/inftlmount.c            |  1 -
 drivers/mtd/mtdchar.c               |  1 -
 drivers/mtd/mtdcore.c               |  1 -
 drivers/mtd/mtdpart.c               |  1 -
 drivers/mtd/nand/diskonchip.c       |  1 -
 drivers/mtd/nand/nand_base.c        |  1 -
 drivers/mtd/nand/nand_bbt.c         |  1 -
 drivers/mtd/nand/rtc_from4.c        |  1 -
 drivers/mtd/onenand/onenand_bbt.c   |  1 -
 fs/jffs2/nodelist.h                 |  1 -
 include/linux/mtd/compatmac.h       | 10 ----------
 include/linux/mtd/map.h             |  1 -
 include/linux/mtd/mtd.h             |  1 -
 25 files changed, 34 deletions(-)
 delete mode 100644 include/linux/mtd/compatmac.h

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 97d5546f9ea4..9e2b7e9e0ad9 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -34,7 +34,6 @@
 #include <linux/mtd/xip.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/mtd.h>
-#include <linux/mtd/compatmac.h>
 #include <linux/mtd/cfi.h>
 
 /* #define CMDSET0001_DISABLE_ERASE_SUSPEND_ON_WRITE */
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index bd20d1ff1b0d..3e6c47bdce53 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -33,7 +33,6 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/reboot.h>
-#include <linux/mtd/compatmac.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/cfi.h>
diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c
index e54e8c169d76..314af1f5a370 100644
--- a/drivers/mtd/chips/cfi_cmdset_0020.c
+++ b/drivers/mtd/chips/cfi_cmdset_0020.c
@@ -33,7 +33,6 @@
 #include <linux/mtd/map.h>
 #include <linux/mtd/cfi.h>
 #include <linux/mtd/mtd.h>
-#include <linux/mtd/compatmac.h>
 
 
 static int cfi_staa_read(struct mtd_info *, loff_t, size_t, size_t *, u_char *);
diff --git a/drivers/mtd/chips/cfi_util.c b/drivers/mtd/chips/cfi_util.c
index d7c2c672757e..e503b2ca894d 100644
--- a/drivers/mtd/chips/cfi_util.c
+++ b/drivers/mtd/chips/cfi_util.c
@@ -22,7 +22,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/cfi.h>
-#include <linux/mtd/compatmac.h>
 
 int __xipram cfi_qry_present(struct map_info *map, __u32 base,
 			     struct cfi_private *cfi)
diff --git a/drivers/mtd/chips/chipreg.c b/drivers/mtd/chips/chipreg.c
index c85760968227..da1f96f385c7 100644
--- a/drivers/mtd/chips/chipreg.c
+++ b/drivers/mtd/chips/chipreg.c
@@ -10,7 +10,6 @@
 #include <linux/slab.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/mtd.h>
-#include <linux/mtd/compatmac.h>
 
 static DEFINE_SPINLOCK(chip_drvs_lock);
 static LIST_HEAD(chip_drvs_list);
diff --git a/drivers/mtd/chips/map_absent.c b/drivers/mtd/chips/map_absent.c
index 494d30d0631a..f2b872946871 100644
--- a/drivers/mtd/chips/map_absent.c
+++ b/drivers/mtd/chips/map_absent.c
@@ -25,7 +25,6 @@
 #include <linux/init.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
-#include <linux/mtd/compatmac.h>
 
 static int map_absent_read (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
 static int map_absent_write (struct mtd_info *, loff_t, size_t, size_t *, const u_char *);
diff --git a/drivers/mtd/chips/map_ram.c b/drivers/mtd/chips/map_ram.c
index 6bdc50c727e7..67640ccb2d41 100644
--- a/drivers/mtd/chips/map_ram.c
+++ b/drivers/mtd/chips/map_ram.c
@@ -13,7 +13,6 @@
 #include <linux/init.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
-#include <linux/mtd/compatmac.h>
 
 
 static int mapram_read (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
diff --git a/drivers/mtd/chips/map_rom.c b/drivers/mtd/chips/map_rom.c
index 076090a67b90..593f73d480d2 100644
--- a/drivers/mtd/chips/map_rom.c
+++ b/drivers/mtd/chips/map_rom.c
@@ -13,7 +13,6 @@
 #include <linux/init.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
-#include <linux/mtd/compatmac.h>
 
 static int maprom_read (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
 static int maprom_write (struct mtd_info *, loff_t, size_t, size_t *, const u_char *);
diff --git a/drivers/mtd/devices/docecc.c b/drivers/mtd/devices/docecc.c
index a19cda52da5c..a99838bb2dc0 100644
--- a/drivers/mtd/devices/docecc.c
+++ b/drivers/mtd/devices/docecc.c
@@ -31,7 +31,6 @@
 #include <linux/init.h>
 #include <linux/types.h>
 
-#include <linux/mtd/compatmac.h> /* for min() in older kernels */
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/doc2000.h>
 
diff --git a/drivers/mtd/devices/docprobe.c b/drivers/mtd/devices/docprobe.c
index 6e62922942b1..d374603493a7 100644
--- a/drivers/mtd/devices/docprobe.c
+++ b/drivers/mtd/devices/docprobe.c
@@ -49,7 +49,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/doc2000.h>
-#include <linux/mtd/compatmac.h>
 
 /* Where to look for the devices? */
 #ifndef CONFIG_MTD_DOCPROBE_ADDRESS
diff --git a/drivers/mtd/devices/mtdram.c b/drivers/mtd/devices/mtdram.c
index fce5ff7589aa..26a6e809013d 100644
--- a/drivers/mtd/devices/mtdram.c
+++ b/drivers/mtd/devices/mtdram.c
@@ -14,7 +14,6 @@
 #include <linux/ioport.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
-#include <linux/mtd/compatmac.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/mtdram.h>
 
diff --git a/drivers/mtd/devices/pmc551.c b/drivers/mtd/devices/pmc551.c
index fc8ea0a57ac2..ef0aba0ce58f 100644
--- a/drivers/mtd/devices/pmc551.c
+++ b/drivers/mtd/devices/pmc551.c
@@ -98,7 +98,6 @@
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/pmc551.h>
-#include <linux/mtd/compatmac.h>
 
 static struct mtd_info *pmc551list;
 
diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c
index 7772d2618d9f..104052e774b0 100644
--- a/drivers/mtd/inftlmount.c
+++ b/drivers/mtd/inftlmount.c
@@ -34,7 +34,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nftl.h>
 #include <linux/mtd/inftl.h>
-#include <linux/mtd/compatmac.h>
 
 /*
  * find_boot_record: Find the INFTL Media Header and its Spare copy which
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index f6d76f1adca0..638827a25b77 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -33,7 +33,6 @@
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
-#include <linux/mtd/compatmac.h>
 
 #include <asm/uaccess.h>
 
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 93a11f3def44..527cebf58da4 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -31,7 +31,6 @@
 #include <linux/err.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <linux/mtd/compatmac.h>
 #include <linux/proc_fs.h>
 #include <linux/idr.h>
 #include <linux/backing-dev.h>
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index fc5507410278..dc6558568876 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -29,7 +29,6 @@
 #include <linux/kmod.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
-#include <linux/mtd/compatmac.h>
 
 /* Our partition linked list */
 static LIST_HEAD(mtd_partitions);
diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c
index 51315f55f905..b7f8de7b2780 100644
--- a/drivers/mtd/nand/diskonchip.c
+++ b/drivers/mtd/nand/diskonchip.c
@@ -29,7 +29,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/doc2000.h>
-#include <linux/mtd/compatmac.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/inftl.h>
 
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index ee6a6f866b50..16a1714df008 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -42,7 +42,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/nand_ecc.h>
-#include <linux/mtd/compatmac.h>
 #include <linux/interrupt.h>
 #include <linux/bitops.h>
 #include <linux/leds.h>
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index 469de17107e5..5fedf4a74f16 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -55,7 +55,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/nand_ecc.h>
-#include <linux/mtd/compatmac.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
 #include <linux/vmalloc.h>
diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c
index a033c4cd8e16..67440b5beef8 100644
--- a/drivers/mtd/nand/rtc_from4.c
+++ b/drivers/mtd/nand/rtc_from4.c
@@ -24,7 +24,6 @@
 #include <linux/rslib.h>
 #include <linux/bitrev.h>
 #include <linux/module.h>
-#include <linux/mtd/compatmac.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
diff --git a/drivers/mtd/onenand/onenand_bbt.c b/drivers/mtd/onenand/onenand_bbt.c
index a91fcac1af01..01ab5b3c453b 100644
--- a/drivers/mtd/onenand/onenand_bbt.c
+++ b/drivers/mtd/onenand/onenand_bbt.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/onenand.h>
-#include <linux/mtd/compatmac.h>
 
 /**
  * check_short_pattern - [GENERIC] check if a pattern is in the buffer
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index a881a42f19e3..523a91691052 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -24,7 +24,6 @@
 #ifdef __ECOS
 #include "os-ecos.h"
 #else
-#include <linux/mtd/compatmac.h> /* For compatibility with older kernels */
 #include "os-linux.h"
 #endif
 
diff --git a/include/linux/mtd/compatmac.h b/include/linux/mtd/compatmac.h
deleted file mode 100644
index 7d1300d9bd51..000000000000
--- a/include/linux/mtd/compatmac.h
+++ /dev/null
@@ -1,10 +0,0 @@
-
-#ifndef __LINUX_MTD_COMPATMAC_H__
-#define __LINUX_MTD_COMPATMAC_H__
-
-/* Nothing to see here. We write 2.5-compatible code and this
-   file makes it all OK in older kernels, but it's empty in _current_
-   kernels. Include guard just to make GCC ignore it in future inclusions
-   anyway... */
-
-#endif /* __LINUX_MTD_COMPATMAC_H__ */
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index eea3a4fb7405..a9e6ba46865e 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -27,7 +27,6 @@
 #include <linux/string.h>
 #include <linux/bug.h>
 
-#include <linux/mtd/compatmac.h>
 
 #include <asm/unaligned.h>
 #include <asm/system.h>
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index eae914e97f33..8485e42a9b09 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -26,7 +26,6 @@
 #include <linux/notifier.h>
 #include <linux/device.h>
 
-#include <linux/mtd/compatmac.h>
 #include <mtd/mtd-abi.h>
 
 #include <asm/div64.h>
-- 
cgit v1.2.3-59-g8ed1b


From c9243f5bdd6637b2bb7dc254b54d9edf957ef17e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 4 Jul 2010 00:15:07 +0200
Subject: autofs/autofs4: Move compat_ioctl handling into fs

Handling of autofs ioctl numbers does not need to be generic
and can easily be done directly in autofs itself.

This also pushes the BKL into autofs and autofs4 ioctl
methods.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ian Kent <raven@themaw.net>
Cc: Autofs <autofs@linux.kernel.org>
Cc: John Kacur <jkacur@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 fs/autofs/root.c        | 67 ++++++++++++++++++++++++++++++++++++++++++++++---
 fs/autofs4/root.c       | 49 ++++++++++++++++++++++++++++++++++++
 fs/compat_ioctl.c       | 36 --------------------------
 include/linux/auto_fs.h |  1 +
 4 files changed, 114 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 9a0520b50663..11b1ea786d00 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
+#include <linux/compat.h>
 #include <linux/smp_lock.h>
 #include "autofs_i.h"
 
@@ -25,13 +26,17 @@ static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
 static int autofs_root_unlink(struct inode *,struct dentry *);
 static int autofs_root_rmdir(struct inode *,struct dentry *);
 static int autofs_root_mkdir(struct inode *,struct dentry *,int);
-static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
+static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 
 const struct file_operations autofs_root_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= autofs_root_readdir,
-	.ioctl		= autofs_root_ioctl,
+	.unlocked_ioctl	= autofs_root_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= autofs_root_compat_ioctl,
+#endif
 };
 
 const struct inode_operations autofs_root_inode_operations = {
@@ -492,6 +497,25 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 
 /* Get/set timeout ioctl() operation */
+#ifdef CONFIG_COMPAT
+static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
+					 unsigned int __user *p)
+{
+	unsigned long ntimeout;
+
+	if (get_user(ntimeout, p) ||
+	    put_user(sbi->exp_timeout / HZ, p))
+		return -EFAULT;
+
+	if (ntimeout > UINT_MAX/HZ)
+		sbi->exp_timeout = 0;
+	else
+		sbi->exp_timeout = ntimeout * HZ;
+
+	return 0;
+}
+#endif
+
 static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
 					 unsigned long __user *p)
 {
@@ -546,7 +570,7 @@ static inline int autofs_expire_run(struct super_block *sb,
  * ioctl()'s on the root directory is the chief method for the daemon to
  * generate kernel reactions
  */
-static int autofs_root_ioctl(struct inode *inode, struct file *filp,
+static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
 			     unsigned int cmd, unsigned long arg)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
@@ -571,6 +595,10 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
 		return 0;
 	case AUTOFS_IOC_PROTOVER: /* Get protocol version */
 		return autofs_get_protover(argp);
+#ifdef CONFIG_COMPAT
+	case AUTOFS_IOC_SETTIMEOUT32:
+		return autofs_compat_get_set_timeout(sbi, argp);
+#endif
 	case AUTOFS_IOC_SETTIMEOUT:
 		return autofs_get_set_timeout(sbi, argp);
 	case AUTOFS_IOC_EXPIRE:
@@ -579,4 +607,37 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
 	default:
 		return -ENOSYS;
 	}
+
+}
+
+static long autofs_root_ioctl(struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	lock_kernel();
+	ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
+				   filp, cmd, arg);
+	unlock_kernel();
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long autofs_root_compat_ioctl(struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret;
+
+	lock_kernel();
+	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
+		ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
+	else
+		ret = autofs_do_root_ioctl(inode, filp, cmd,
+			(unsigned long)compat_ptr(arg));
+	unlock_kernel();
+
+	return ret;
 }
+#endif
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index db4117ed7803..48e056e70fd6 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,7 +18,9 @@
 #include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
+#include <linux/compat.h>
 #include <linux/smp_lock.h>
+
 #include "autofs_i.h"
 
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
@@ -26,6 +28,7 @@ static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
 static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -40,6 +43,9 @@ const struct file_operations autofs4_root_operations = {
 	.readdir	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 	.unlocked_ioctl	= autofs4_root_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= autofs4_root_compat_ioctl,
+#endif
 };
 
 const struct file_operations autofs4_dir_operations = {
@@ -840,6 +846,26 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 
 /* Get/set timeout ioctl() operation */
+#ifdef CONFIG_COMPAT
+static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
+					 compat_ulong_t __user *p)
+{
+	int rv;
+	unsigned long ntimeout;
+
+	if ((rv = get_user(ntimeout, p)) ||
+	     (rv = put_user(sbi->exp_timeout/HZ, p)))
+		return rv;
+
+	if (ntimeout > UINT_MAX/HZ)
+		sbi->exp_timeout = 0;
+	else
+		sbi->exp_timeout = ntimeout * HZ;
+
+	return 0;
+}
+#endif
+
 static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
 					 unsigned long __user *p)
 {
@@ -933,6 +959,10 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 		return autofs4_get_protosubver(sbi, p);
 	case AUTOFS_IOC_SETTIMEOUT:
 		return autofs4_get_set_timeout(sbi, p);
+#ifdef CONFIG_COMPAT
+	case AUTOFS_IOC_SETTIMEOUT32:
+		return autofs4_compat_get_set_timeout(sbi, p);
+#endif
 
 	case AUTOFS_IOC_ASKUMOUNT:
 		return autofs4_ask_umount(filp->f_path.mnt, p);
@@ -961,3 +991,22 @@ static long autofs4_root_ioctl(struct file *filp,
 
 	return ret;
 }
+
+#ifdef CONFIG_COMPAT
+static long autofs4_root_compat_ioctl(struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int ret;
+
+	lock_kernel();
+	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
+		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+	else
+		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
+			(unsigned long)compat_ptr(arg));
+	unlock_kernel();
+
+	return ret;
+}
+#endif
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 641640dc7ae5..618f38136304 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -131,23 +131,6 @@ static int w_long(unsigned int fd, unsigned int cmd,
 	return err;
 }
 
-static int rw_long(unsigned int fd, unsigned int cmd,
-		compat_ulong_t __user *argp)
-{
-	mm_segment_t old_fs = get_fs();
-	int err;
-	unsigned long val;
-
-	if(get_user(val, argp))
-		return -EFAULT;
-	set_fs (KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&val);
-	set_fs (old_fs);
-	if (!err && put_user(val, argp))
-		return -EFAULT;
-	return err;
-}
-
 struct compat_video_event {
 	int32_t		type;
 	compat_time_t	timestamp;
@@ -594,12 +577,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
 	return err;
 }
 
-static int ioc_settimeout(unsigned int fd, unsigned int cmd,
-		compat_ulong_t __user *argp)
-{
-	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp);
-}
-
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO	_IOW('U', 200, int)
 #define HCIUARTGETPROTO	_IOR('U', 201, int)
@@ -1281,13 +1258,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
 COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
 COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
 COMPATIBLE_IOCTL(OSS_GETVERSION)
-/* AUTOFS */
-COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
-COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
-COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
-COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
-COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
-COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
 /* Raw devices */
 COMPATIBLE_IOCTL(RAW_SETBIND)
 COMPATIBLE_IOCTL(RAW_GETBIND)
@@ -1552,9 +1522,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case RAW_GETBIND:
 		return raw_ioctl(fd, cmd, argp);
 #endif
-#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
-	case AUTOFS_IOC_SETTIMEOUT32:
-		return ioc_settimeout(fd, cmd, argp);
 	/* One SMB ioctl needs translations. */
 #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
 	case SMB_IOC_GETMOUNTUID_32:
@@ -1609,9 +1576,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case KDSKBMETA:
 	case KDSKBLED:
 	case KDSETLED:
-	/* AUTOFS */
-	case AUTOFS_IOC_READY:
-	case AUTOFS_IOC_FAIL:
 	/* NBD */
 	case NBD_SET_SOCK:
 	case NBD_SET_BLKSIZE:
diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h
index 7b09c8348fd3..da64e15004b6 100644
--- a/include/linux/auto_fs.h
+++ b/include/linux/auto_fs.h
@@ -79,6 +79,7 @@ struct autofs_packet_expire {
 #define AUTOFS_IOC_FAIL       _IO(0x93,0x61)
 #define AUTOFS_IOC_CATATONIC  _IO(0x93,0x62)
 #define AUTOFS_IOC_PROTOVER   _IOR(0x93,0x63,int)
+#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,compat_ulong_t)
 #define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93,0x64,unsigned long)
 #define AUTOFS_IOC_EXPIRE     _IOR(0x93,0x65,struct autofs_packet_expire)
 
-- 
cgit v1.2.3-59-g8ed1b


From 5fd8f7388c9a8601c2dbe0da458df602fe427e83 Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki <s.nawrocki@samsung.com>
Date: Tue, 3 Aug 2010 09:50:29 -0300
Subject: V4L/DVB: v4l: Add driver for Samsung S5P SoC video postprocessor

This driver exports a video device node per each camera interface/
video postprocessor (FIMC) device contained in Samsung S5P SoC series.
The driver is based on v4l2-mem2mem framework.

Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Pawel Osciak <p.osciak@samsung.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 Documentation/DocBook/v4l/pixfmt-packed-rgb.xml |   78 ++
 drivers/media/video/Kconfig                     |    9 +
 drivers/media/video/Makefile                    |    1 +
 drivers/media/video/s5p-fimc/Makefile           |    3 +
 drivers/media/video/s5p-fimc/fimc-core.c        | 1570 +++++++++++++++++++++++
 drivers/media/video/s5p-fimc/fimc-core.h        |  465 +++++++
 drivers/media/video/s5p-fimc/fimc-reg.c         |  527 ++++++++
 drivers/media/video/s5p-fimc/regs-fimc.h        |  293 +++++
 include/linux/videodev2.h                       |    1 +
 9 files changed, 2947 insertions(+)
 create mode 100644 drivers/media/video/s5p-fimc/Makefile
 create mode 100644 drivers/media/video/s5p-fimc/fimc-core.c
 create mode 100644 drivers/media/video/s5p-fimc/fimc-core.h
 create mode 100644 drivers/media/video/s5p-fimc/fimc-reg.c
 create mode 100644 drivers/media/video/s5p-fimc/regs-fimc.h

(limited to 'include/linux')

diff --git a/Documentation/DocBook/v4l/pixfmt-packed-rgb.xml b/Documentation/DocBook/v4l/pixfmt-packed-rgb.xml
index d2dd697a81d8..26e879231088 100644
--- a/Documentation/DocBook/v4l/pixfmt-packed-rgb.xml
+++ b/Documentation/DocBook/v4l/pixfmt-packed-rgb.xml
@@ -240,6 +240,45 @@ colorspace <constant>V4L2_COLORSPACE_SRGB</constant>.</para>
 	    <entry>r<subscript>1</subscript></entry>
 	    <entry>r<subscript>0</subscript></entry>
 	  </row>
+	  <row id="V4L2-PIX-FMT-BGR666">
+	    <entry><constant>V4L2_PIX_FMT_BGR666</constant></entry>
+	    <entry>'BGRH'</entry>
+	    <entry></entry>
+	    <entry>b<subscript>5</subscript></entry>
+	    <entry>b<subscript>4</subscript></entry>
+	    <entry>b<subscript>3</subscript></entry>
+	    <entry>b<subscript>2</subscript></entry>
+	    <entry>b<subscript>1</subscript></entry>
+	    <entry>b<subscript>0</subscript></entry>
+	    <entry>g<subscript>5</subscript></entry>
+	    <entry>g<subscript>4</subscript></entry>
+	    <entry></entry>
+	    <entry>g<subscript>3</subscript></entry>
+	    <entry>g<subscript>2</subscript></entry>
+	    <entry>g<subscript>1</subscript></entry>
+	    <entry>g<subscript>0</subscript></entry>
+	    <entry>r<subscript>5</subscript></entry>
+	    <entry>r<subscript>4</subscript></entry>
+	    <entry>r<subscript>3</subscript></entry>
+	    <entry>r<subscript>2</subscript></entry>
+	    <entry></entry>
+	    <entry>r<subscript>1</subscript></entry>
+	    <entry>r<subscript>0</subscript></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	  </row>
 	  <row id="V4L2-PIX-FMT-BGR24">
 	    <entry><constant>V4L2_PIX_FMT_BGR24</constant></entry>
 	    <entry>'BGR3'</entry>
@@ -700,6 +739,45 @@ defined in error. Drivers may interpret them as in <xref
 	    <entry>b<subscript>1</subscript></entry>
 	    <entry>b<subscript>0</subscript></entry>
 	  </row>
+	  <row id="V4L2-PIX-FMT-BGR666">
+	    <entry><constant>V4L2_PIX_FMT_BGR666</constant></entry>
+	    <entry>'BGRH'</entry>
+	    <entry></entry>
+	    <entry>b<subscript>5</subscript></entry>
+	    <entry>b<subscript>4</subscript></entry>
+	    <entry>b<subscript>3</subscript></entry>
+	    <entry>b<subscript>2</subscript></entry>
+	    <entry>b<subscript>1</subscript></entry>
+	    <entry>b<subscript>0</subscript></entry>
+	    <entry>g<subscript>5</subscript></entry>
+	    <entry>g<subscript>4</subscript></entry>
+	    <entry></entry>
+	    <entry>g<subscript>3</subscript></entry>
+	    <entry>g<subscript>2</subscript></entry>
+	    <entry>g<subscript>1</subscript></entry>
+	    <entry>g<subscript>0</subscript></entry>
+	    <entry>r<subscript>5</subscript></entry>
+	    <entry>r<subscript>4</subscript></entry>
+	    <entry>r<subscript>3</subscript></entry>
+	    <entry>r<subscript>2</subscript></entry>
+	    <entry></entry>
+	    <entry>r<subscript>1</subscript></entry>
+	    <entry>r<subscript>0</subscript></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	    <entry></entry>
+	  </row>
 	  <row><!-- id="V4L2-PIX-FMT-BGR24" -->
 	    <entry><constant>V4L2_PIX_FMT_BGR24</constant></entry>
 	    <entry>'BGR3'</entry>
diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig
index 2e15903b976d..c70b67deb9b2 100644
--- a/drivers/media/video/Kconfig
+++ b/drivers/media/video/Kconfig
@@ -1016,4 +1016,13 @@ config VIDEO_MEM2MEM_TESTDEV
 	  This is a virtual test device for the memory-to-memory driver
 	  framework.
 
+config  VIDEO_SAMSUNG_S5P_FIMC
+	tristate "Samsung S5P FIMC (video postprocessor) driver"
+	depends on VIDEO_DEV && VIDEO_V4L2 && PLAT_S5P
+	select VIDEOBUF_DMA_CONTIG
+	select V4L2_MEM2MEM_DEV
+	help
+	  This is a v4l2 driver for the S5P camera interface
+	  (video postprocessor)
+
 endif # V4L_MEM2MEM_DRIVERS
diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile
index 1051ecc602e7..c76fef390797 100644
--- a/drivers/media/video/Makefile
+++ b/drivers/media/video/Makefile
@@ -163,6 +163,7 @@ obj-$(CONFIG_VIDEO_MX3)			+= mx3_camera.o
 obj-$(CONFIG_VIDEO_PXA27x)		+= pxa_camera.o
 obj-$(CONFIG_VIDEO_SH_MOBILE_CSI2)	+= sh_mobile_csi2.o
 obj-$(CONFIG_VIDEO_SH_MOBILE_CEU)	+= sh_mobile_ceu_camera.o
+obj-$(CONFIG_VIDEO_SAMSUNG_S5P_FIMC) 	+= s5p-fimc/
 
 obj-$(CONFIG_ARCH_DAVINCI)		+= davinci/
 
diff --git a/drivers/media/video/s5p-fimc/Makefile b/drivers/media/video/s5p-fimc/Makefile
new file mode 100644
index 000000000000..0d9d54132ecc
--- /dev/null
+++ b/drivers/media/video/s5p-fimc/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_VIDEO_SAMSUNG_S5P_FIMC) := s5p-fimc.o
+s5p-fimc-y := fimc-core.o fimc-reg.o
diff --git a/drivers/media/video/s5p-fimc/fimc-core.c b/drivers/media/video/s5p-fimc/fimc-core.c
new file mode 100644
index 000000000000..6558a2ea9ffd
--- /dev/null
+++ b/drivers/media/video/s5p-fimc/fimc-core.c
@@ -0,0 +1,1570 @@
+/*
+ * S5P camera interface (video postprocessor) driver
+ *
+ * Copyright (c) 2010 Samsung Electronics
+ *
+ * Sylwester Nawrocki, <s.nawrocki@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation, either version 2 of the License,
+ * or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/bug.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/clk.h>
+#include <media/v4l2-ioctl.h>
+#include <media/videobuf-dma-contig.h>
+
+#include "fimc-core.h"
+
+static char *fimc_clock_name[NUM_FIMC_CLOCKS] = { "sclk_fimc", "fimc" };
+
+static struct fimc_fmt fimc_formats[] = {
+	{
+		.name	= "RGB565",
+		.fourcc	= V4L2_PIX_FMT_RGB565X,
+		.depth	= 16,
+		.color	= S5P_FIMC_RGB565,
+		.buff_cnt = 1,
+		.planes_cnt = 1
+	}, {
+		.name	= "BGR666",
+		.fourcc	= V4L2_PIX_FMT_BGR666,
+		.depth	= 32,
+		.color	= S5P_FIMC_RGB666,
+		.buff_cnt = 1,
+		.planes_cnt = 1
+	}, {
+		.name = "XRGB-8-8-8-8, 24 bpp",
+		.fourcc	= V4L2_PIX_FMT_RGB24,
+		.depth = 32,
+		.color	= S5P_FIMC_RGB888,
+		.buff_cnt = 1,
+		.planes_cnt = 1
+	}, {
+		.name	= "YUV 4:2:2 packed, YCbYCr",
+		.fourcc	= V4L2_PIX_FMT_YUYV,
+		.depth	= 16,
+		.color	= S5P_FIMC_YCBYCR422,
+		.buff_cnt = 1,
+		.planes_cnt = 1
+		}, {
+		.name	= "YUV 4:2:2 packed, CbYCrY",
+		.fourcc	= V4L2_PIX_FMT_UYVY,
+		.depth	= 16,
+		.color	= S5P_FIMC_CBYCRY422,
+		.buff_cnt = 1,
+		.planes_cnt = 1
+	}, {
+		.name	= "YUV 4:2:2 packed, CrYCbY",
+		.fourcc	= V4L2_PIX_FMT_VYUY,
+		.depth	= 16,
+		.color	= S5P_FIMC_CRYCBY422,
+		.buff_cnt = 1,
+		.planes_cnt = 1
+	}, {
+		.name	= "YUV 4:2:2 packed, YCrYCb",
+		.fourcc	= V4L2_PIX_FMT_YVYU,
+		.depth	= 16,
+		.color	= S5P_FIMC_YCRYCB422,
+		.buff_cnt = 1,
+		.planes_cnt = 1
+	}, {
+		.name	= "YUV 4:2:2 planar, Y/Cb/Cr",
+		.fourcc	= V4L2_PIX_FMT_YUV422P,
+		.depth	= 12,
+		.color	= S5P_FIMC_YCBCR422,
+		.buff_cnt = 1,
+		.planes_cnt = 3
+	}, {
+		.name	= "YUV 4:2:2 planar, Y/CbCr",
+		.fourcc	= V4L2_PIX_FMT_NV16,
+		.depth	= 16,
+		.color	= S5P_FIMC_YCBCR422,
+		.buff_cnt = 1,
+		.planes_cnt = 2
+	}, {
+		.name	= "YUV 4:2:2 planar, Y/CrCb",
+		.fourcc	= V4L2_PIX_FMT_NV61,
+		.depth	= 16,
+		.color	= S5P_FIMC_RGB565,
+		.buff_cnt = 1,
+		.planes_cnt = 2
+	}, {
+		.name	= "YUV 4:2:0 planar, YCbCr",
+		.fourcc	= V4L2_PIX_FMT_YUV420,
+		.depth	= 12,
+		.color	= S5P_FIMC_YCBCR420,
+		.buff_cnt = 1,
+		.planes_cnt = 3
+	}, {
+		.name	= "YUV 4:2:0 planar, Y/CbCr",
+		.fourcc	= V4L2_PIX_FMT_NV12,
+		.depth	= 12,
+		.color	= S5P_FIMC_YCBCR420,
+		.buff_cnt = 1,
+		.planes_cnt = 2
+	}
+ };
+
+static struct v4l2_queryctrl fimc_ctrls[] = {
+	{
+		.id		= V4L2_CID_HFLIP,
+		.type		= V4L2_CTRL_TYPE_BOOLEAN,
+		.name		= "Horizontal flip",
+		.minimum	= 0,
+		.maximum	= 1,
+		.default_value	= 0,
+	},
+	{
+		.id		= V4L2_CID_VFLIP,
+		.type		= V4L2_CTRL_TYPE_BOOLEAN,
+		.name		= "Vertical flip",
+		.minimum	= 0,
+		.maximum	= 1,
+		.default_value	= 0,
+	},
+	{
+		.id		= V4L2_CID_ROTATE,
+		.type		= V4L2_CTRL_TYPE_INTEGER,
+		.name		= "Rotation (CCW)",
+		.minimum	= 0,
+		.maximum	= 270,
+		.step		= 90,
+		.default_value	= 0,
+	},
+};
+
+
+static struct v4l2_queryctrl *get_ctrl(int id)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(fimc_ctrls); ++i)
+		if (id == fimc_ctrls[i].id)
+			return &fimc_ctrls[i];
+	return NULL;
+}
+
+static int fimc_check_scaler_ratio(struct v4l2_rect *r, struct fimc_frame *f)
+{
+	if (r->width > f->width) {
+		if (f->width > (r->width * SCALER_MAX_HRATIO))
+			return -EINVAL;
+	} else {
+		if ((f->width * SCALER_MAX_HRATIO) < r->width)
+			return -EINVAL;
+	}
+
+	if (r->height > f->height) {
+		if (f->height > (r->height * SCALER_MAX_VRATIO))
+			return -EINVAL;
+	} else {
+		if ((f->height * SCALER_MAX_VRATIO) < r->height)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int fimc_get_scaler_factor(u32 src, u32 tar, u32 *ratio, u32 *shift)
+{
+	if (src >= tar * 64) {
+		return -EINVAL;
+	} else if (src >= tar * 32) {
+		*ratio = 32;
+		*shift = 5;
+	} else if (src >= tar * 16) {
+		*ratio = 16;
+		*shift = 4;
+	} else if (src >= tar * 8) {
+		*ratio = 8;
+		*shift = 3;
+	} else if (src >= tar * 4) {
+		*ratio = 4;
+		*shift = 2;
+	} else if (src >= tar * 2) {
+		*ratio = 2;
+		*shift = 1;
+	} else {
+		*ratio = 1;
+		*shift = 0;
+	}
+
+	return 0;
+}
+
+static int fimc_set_scaler_info(struct fimc_ctx *ctx)
+{
+	struct fimc_scaler *sc = &ctx->scaler;
+	struct fimc_frame *s_frame = &ctx->s_frame;
+	struct fimc_frame *d_frame = &ctx->d_frame;
+	int tx, ty, sx, sy;
+	int ret;
+
+	tx = d_frame->width;
+	ty = d_frame->height;
+	if (tx <= 0 || ty <= 0) {
+		v4l2_err(&ctx->fimc_dev->m2m.v4l2_dev,
+			"invalid target size: %d x %d", tx, ty);
+		return -EINVAL;
+	}
+
+	sx = s_frame->width;
+	sy = s_frame->height;
+	if (sx <= 0 || sy <= 0) {
+		err("invalid source size: %d x %d", sx, sy);
+		return -EINVAL;
+	}
+
+	sc->real_width = sx;
+	sc->real_height = sy;
+	dbg("sx= %d, sy= %d, tx= %d, ty= %d", sx, sy, tx, ty);
+
+	ret = fimc_get_scaler_factor(sx, tx, &sc->pre_hratio, &sc->hfactor);
+	if (ret)
+		return ret;
+
+	ret = fimc_get_scaler_factor(sy, ty,  &sc->pre_vratio, &sc->vfactor);
+	if (ret)
+		return ret;
+
+	sc->pre_dst_width = sx / sc->pre_hratio;
+	sc->pre_dst_height = sy / sc->pre_vratio;
+
+	sc->main_hratio = (sx << 8) / (tx << sc->hfactor);
+	sc->main_vratio = (sy << 8) / (ty << sc->vfactor);
+
+	sc->scaleup_h = (tx >= sx) ? 1 : 0;
+	sc->scaleup_v = (ty >= sy) ? 1 : 0;
+
+	/* check to see if input and output size/format differ */
+	if (s_frame->fmt->color == d_frame->fmt->color
+		&& s_frame->width == d_frame->width
+		&& s_frame->height == d_frame->height)
+		sc->copy_mode = 1;
+	else
+		sc->copy_mode = 0;
+
+	return 0;
+}
+
+
+static irqreturn_t fimc_isr(int irq, void *priv)
+{
+	struct fimc_vid_buffer *src_buf, *dst_buf;
+	struct fimc_dev *fimc = (struct fimc_dev *)priv;
+	struct fimc_ctx *ctx;
+
+	BUG_ON(!fimc);
+	fimc_hw_clear_irq(fimc);
+
+	spin_lock(&fimc->slock);
+
+	if (test_and_clear_bit(ST_M2M_PEND, &fimc->state)) {
+		ctx = v4l2_m2m_get_curr_priv(fimc->m2m.m2m_dev);
+		if (!ctx || !ctx->m2m_ctx)
+			goto isr_unlock;
+		src_buf = v4l2_m2m_src_buf_remove(ctx->m2m_ctx);
+		dst_buf = v4l2_m2m_dst_buf_remove(ctx->m2m_ctx);
+		if (src_buf && dst_buf) {
+			spin_lock(&fimc->irqlock);
+			src_buf->vb.state = dst_buf->vb.state =  VIDEOBUF_DONE;
+			wake_up(&src_buf->vb.done);
+			wake_up(&dst_buf->vb.done);
+			spin_unlock(&fimc->irqlock);
+			v4l2_m2m_job_finish(fimc->m2m.m2m_dev, ctx->m2m_ctx);
+		}
+	}
+
+isr_unlock:
+	spin_unlock(&fimc->slock);
+	return IRQ_HANDLED;
+}
+
+/* The color format (planes_cnt, buff_cnt) must be already configured. */
+static int fimc_prepare_addr(struct fimc_ctx *ctx,
+		struct fimc_vid_buffer *buf, enum v4l2_buf_type type)
+{
+	struct fimc_frame *frame;
+	struct fimc_addr *paddr;
+	u32 pix_size;
+	int ret = 0;
+
+	ctx_m2m_get_frame(frame, ctx, type);
+	paddr = &frame->paddr;
+
+	if (!buf)
+		return -EINVAL;
+
+	pix_size = frame->width * frame->height;
+
+	dbg("buff_cnt= %d, planes_cnt= %d, frame->size= %d, pix_size= %d",
+		frame->fmt->buff_cnt, frame->fmt->planes_cnt,
+		frame->size, pix_size);
+
+	if (frame->fmt->buff_cnt == 1) {
+		paddr->y = videobuf_to_dma_contig(&buf->vb);
+		switch (frame->fmt->planes_cnt) {
+		case 1:
+			paddr->cb = 0;
+			paddr->cr = 0;
+			break;
+		case 2:
+			/* decompose Y into Y/Cb */
+			paddr->cb = (u32)(paddr->y + pix_size);
+			paddr->cr = 0;
+			break;
+		case 3:
+			paddr->cb = (u32)(paddr->y + pix_size);
+			/* decompose Y into Y/Cb/Cr */
+			if (S5P_FIMC_YCBCR420 == frame->fmt->color)
+				paddr->cr = (u32)(paddr->cb
+						+ (pix_size >> 2));
+			else /* 422 */
+				paddr->cr = (u32)(paddr->cb
+						+ (pix_size >> 1));
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	dbg("PHYS_ADDR: type= %d, y= 0x%X  cb= 0x%X cr= 0x%X ret= %d",
+	type, paddr->y, paddr->cb, paddr->cr, ret);
+
+	return ret;
+}
+
+/* Set order for 1 and 2 plane YCBCR 4:2:2 formats. */
+static void fimc_set_yuv_order(struct fimc_ctx *ctx)
+{
+	/* The one only mode supported in SoC. */
+	ctx->in_order_2p = S5P_FIMC_LSB_CRCB;
+	ctx->out_order_2p = S5P_FIMC_LSB_CRCB;
+
+	/* Set order for 1 plane input formats. */
+	switch (ctx->s_frame.fmt->color) {
+	case S5P_FIMC_YCRYCB422:
+		ctx->in_order_1p = S5P_FIMC_IN_YCRYCB;
+		break;
+	case S5P_FIMC_CBYCRY422:
+		ctx->in_order_1p = S5P_FIMC_IN_CBYCRY;
+		break;
+	case S5P_FIMC_CRYCBY422:
+		ctx->in_order_1p = S5P_FIMC_IN_CRYCBY;
+		break;
+	case S5P_FIMC_YCBYCR422:
+	default:
+		ctx->in_order_1p = S5P_FIMC_IN_YCBYCR;
+		break;
+	}
+	dbg("ctx->in_order_1p= %d", ctx->in_order_1p);
+
+	switch (ctx->d_frame.fmt->color) {
+	case S5P_FIMC_YCRYCB422:
+		ctx->out_order_1p = S5P_FIMC_OUT_YCRYCB;
+		break;
+	case S5P_FIMC_CBYCRY422:
+		ctx->out_order_1p = S5P_FIMC_OUT_CBYCRY;
+		break;
+	case S5P_FIMC_CRYCBY422:
+		ctx->out_order_1p = S5P_FIMC_OUT_CRYCBY;
+		break;
+	case S5P_FIMC_YCBYCR422:
+	default:
+		ctx->out_order_1p = S5P_FIMC_OUT_YCBYCR;
+		break;
+	}
+	dbg("ctx->out_order_1p= %d", ctx->out_order_1p);
+}
+
+/**
+ * fimc_prepare_config - check dimensions, operation and color mode
+ *			 and pre-calculate offset and the scaling coefficients.
+ *
+ * @ctx: hardware context information
+ * @flags: flags indicating which parameters to check/update
+ *
+ * Return: 0 if dimensions are valid or non zero otherwise.
+ */
+static int fimc_prepare_config(struct fimc_ctx *ctx, u32 flags)
+{
+	struct fimc_frame *s_frame, *d_frame;
+	struct fimc_vid_buffer *buf = NULL;
+	struct samsung_fimc_variant *variant = ctx->fimc_dev->variant;
+	int ret = 0;
+
+	s_frame = &ctx->s_frame;
+	d_frame = &ctx->d_frame;
+
+	if (flags & FIMC_PARAMS) {
+		if ((ctx->out_path == FIMC_DMA) &&
+			(ctx->rotation == 90 || ctx->rotation == 270)) {
+			swap(d_frame->f_width, d_frame->f_height);
+			swap(d_frame->width, d_frame->height);
+		}
+
+		/* Prepare the output offset ratios for scaler. */
+		d_frame->dma_offset.y_h = d_frame->offs_h;
+		if (!variant->pix_hoff)
+			d_frame->dma_offset.y_h *= (d_frame->fmt->depth >> 3);
+
+		d_frame->dma_offset.y_v = d_frame->offs_v;
+
+		d_frame->dma_offset.cb_h = d_frame->offs_h;
+		d_frame->dma_offset.cb_v = d_frame->offs_v;
+
+		d_frame->dma_offset.cr_h = d_frame->offs_h;
+		d_frame->dma_offset.cr_v = d_frame->offs_v;
+
+		if (!variant->pix_hoff && d_frame->fmt->planes_cnt == 3) {
+			d_frame->dma_offset.cb_h >>= 1;
+			d_frame->dma_offset.cb_v >>= 1;
+			d_frame->dma_offset.cr_h >>= 1;
+			d_frame->dma_offset.cr_v >>= 1;
+		}
+
+		dbg("out offset: color= %d, y_h= %d, y_v= %d",
+			d_frame->fmt->color,
+			d_frame->dma_offset.y_h, d_frame->dma_offset.y_v);
+
+		/* Prepare the input offset ratios for scaler. */
+		s_frame->dma_offset.y_h = s_frame->offs_h;
+		if (!variant->pix_hoff)
+			s_frame->dma_offset.y_h *= (s_frame->fmt->depth >> 3);
+		s_frame->dma_offset.y_v = s_frame->offs_v;
+
+		s_frame->dma_offset.cb_h = s_frame->offs_h;
+		s_frame->dma_offset.cb_v = s_frame->offs_v;
+
+		s_frame->dma_offset.cr_h = s_frame->offs_h;
+		s_frame->dma_offset.cr_v = s_frame->offs_v;
+
+		if (!variant->pix_hoff && s_frame->fmt->planes_cnt == 3) {
+			s_frame->dma_offset.cb_h >>= 1;
+			s_frame->dma_offset.cb_v >>= 1;
+			s_frame->dma_offset.cr_h >>= 1;
+			s_frame->dma_offset.cr_v >>= 1;
+		}
+
+		dbg("in offset: color= %d, y_h= %d, y_v= %d",
+			s_frame->fmt->color, s_frame->dma_offset.y_h,
+			s_frame->dma_offset.y_v);
+
+		fimc_set_yuv_order(ctx);
+
+		/* Check against the scaler ratio. */
+		if (s_frame->height > (SCALER_MAX_VRATIO * d_frame->height) ||
+		    s_frame->width > (SCALER_MAX_HRATIO * d_frame->width)) {
+			err("out of scaler range");
+			return -EINVAL;
+		}
+	}
+
+	/* Input DMA mode is not allowed when the scaler is disabled. */
+	ctx->scaler.enabled = 1;
+
+	if (flags & FIMC_SRC_ADDR) {
+		buf = v4l2_m2m_next_src_buf(ctx->m2m_ctx);
+		ret = fimc_prepare_addr(ctx, buf,
+			V4L2_BUF_TYPE_VIDEO_OUTPUT);
+		if (ret)
+			return ret;
+	}
+
+	if (flags & FIMC_DST_ADDR) {
+		buf = v4l2_m2m_next_dst_buf(ctx->m2m_ctx);
+		ret = fimc_prepare_addr(ctx, buf,
+			V4L2_BUF_TYPE_VIDEO_CAPTURE);
+	}
+
+	return ret;
+}
+
+static void fimc_dma_run(void *priv)
+{
+	struct fimc_ctx *ctx = priv;
+	struct fimc_dev *fimc;
+	unsigned long flags;
+	u32 ret;
+
+	if (WARN(!ctx, "null hardware context"))
+		return;
+
+	fimc = ctx->fimc_dev;
+
+	spin_lock_irqsave(&ctx->slock, flags);
+	set_bit(ST_M2M_PEND, &fimc->state);
+
+	ctx->state |= (FIMC_SRC_ADDR | FIMC_DST_ADDR);
+	ret = fimc_prepare_config(ctx, ctx->state);
+	if (ret) {
+		err("general configuration error");
+		goto dma_unlock;
+	}
+
+	if (fimc->m2m.ctx != ctx)
+		ctx->state |= FIMC_PARAMS;
+
+	fimc_hw_set_input_addr(fimc, &ctx->s_frame.paddr);
+
+	if (ctx->state & FIMC_PARAMS) {
+		fimc_hw_set_input_path(ctx);
+		fimc_hw_set_in_dma(ctx);
+		if (fimc_set_scaler_info(ctx)) {
+			err("scaler configuration error");
+			goto dma_unlock;
+		}
+		fimc_hw_set_prescaler(ctx);
+		fimc_hw_set_scaler(ctx);
+		fimc_hw_set_target_format(ctx);
+		fimc_hw_set_rotation(ctx);
+		fimc_hw_set_effect(ctx);
+	}
+
+	fimc_hw_set_output_path(ctx);
+	if (ctx->state & (FIMC_DST_ADDR | FIMC_PARAMS))
+		fimc_hw_set_output_addr(fimc, &ctx->d_frame.paddr);
+
+	if (ctx->state & FIMC_PARAMS)
+		fimc_hw_set_out_dma(ctx);
+
+	if (ctx->scaler.enabled)
+		fimc_hw_start_scaler(fimc);
+	fimc_hw_en_capture(ctx);
+
+	ctx->state = 0;
+	fimc_hw_start_in_dma(fimc);
+
+	fimc->m2m.ctx = ctx;
+
+dma_unlock:
+	spin_unlock_irqrestore(&ctx->slock, flags);
+}
+
+/* Nothing done in job_abort. */
+static void fimc_job_abort(void *priv) {}
+
+static void fimc_buf_release(struct videobuf_queue *vq,
+				    struct videobuf_buffer *vb)
+{
+	videobuf_dma_contig_free(vq, vb);
+	vb->state = VIDEOBUF_NEEDS_INIT;
+}
+
+static int fimc_buf_setup(struct videobuf_queue *vq, unsigned int *count,
+				unsigned int *size)
+{
+	struct fimc_ctx *ctx = vq->priv_data;
+	struct fimc_frame *frame;
+
+	ctx_m2m_get_frame(frame, ctx, vq->type);
+
+	*size = (frame->width * frame->height * frame->fmt->depth) >> 3;
+	if (0 == *count)
+		*count = 1;
+	return 0;
+}
+
+static int fimc_buf_prepare(struct videobuf_queue *vq,
+		struct videobuf_buffer *vb, enum v4l2_field field)
+{
+	struct fimc_ctx *ctx = vq->priv_data;
+	struct v4l2_device *v4l2_dev = &ctx->fimc_dev->m2m.v4l2_dev;
+	struct fimc_frame *frame;
+	int ret;
+
+	ctx_m2m_get_frame(frame, ctx, vq->type);
+
+	if (vb->baddr) {
+		if (vb->bsize < frame->size) {
+			v4l2_err(v4l2_dev,
+				"User-provided buffer too small (%d < %d)\n",
+				 vb->bsize, frame->size);
+			WARN_ON(1);
+			return -EINVAL;
+		}
+	} else if (vb->state != VIDEOBUF_NEEDS_INIT
+		   && vb->bsize < frame->size) {
+		return -EINVAL;
+	}
+
+	vb->width       = frame->width;
+	vb->height      = frame->height;
+	vb->bytesperline = (frame->width * frame->fmt->depth) >> 3;
+	vb->size        = frame->size;
+	vb->field       = field;
+
+	if (VIDEOBUF_NEEDS_INIT == vb->state) {
+		ret = videobuf_iolock(vq, vb, NULL);
+		if (ret) {
+			v4l2_err(v4l2_dev, "Iolock failed\n");
+			fimc_buf_release(vq, vb);
+			return ret;
+		}
+	}
+	vb->state = VIDEOBUF_PREPARED;
+
+	return 0;
+}
+
+static void fimc_buf_queue(struct videobuf_queue *vq,
+				  struct videobuf_buffer *vb)
+{
+	struct fimc_ctx *ctx = vq->priv_data;
+	v4l2_m2m_buf_queue(ctx->m2m_ctx, vq, vb);
+}
+
+struct videobuf_queue_ops fimc_qops = {
+	.buf_setup	= fimc_buf_setup,
+	.buf_prepare	= fimc_buf_prepare,
+	.buf_queue	= fimc_buf_queue,
+	.buf_release	= fimc_buf_release,
+};
+
+static int fimc_m2m_querycap(struct file *file, void *priv,
+			   struct v4l2_capability *cap)
+{
+	struct fimc_ctx *ctx = file->private_data;
+	struct fimc_dev *fimc = ctx->fimc_dev;
+
+	strncpy(cap->driver, fimc->pdev->name, sizeof(cap->driver) - 1);
+	strncpy(cap->card, fimc->pdev->name, sizeof(cap->card) - 1);
+	cap->bus_info[0] = 0;
+	cap->version = KERNEL_VERSION(1, 0, 0);
+	cap->capabilities = V4L2_CAP_STREAMING |
+		V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT;
+
+	return 0;
+}
+
+static int fimc_m2m_enum_fmt(struct file *file, void *priv,
+				struct v4l2_fmtdesc *f)
+{
+	struct fimc_fmt *fmt;
+
+	if (f->index >= ARRAY_SIZE(fimc_formats))
+		return -EINVAL;
+
+	fmt = &fimc_formats[f->index];
+	strncpy(f->description, fmt->name, sizeof(f->description) - 1);
+	f->pixelformat = fmt->fourcc;
+	return 0;
+}
+
+static int fimc_m2m_g_fmt(struct file *file, void *priv, struct v4l2_format *f)
+{
+	struct fimc_ctx *ctx = priv;
+	struct fimc_frame *frame;
+
+	ctx_m2m_get_frame(frame, ctx, f->type);
+
+	f->fmt.pix.width	= frame->width;
+	f->fmt.pix.height	= frame->height;
+	f->fmt.pix.field	= V4L2_FIELD_NONE;
+	f->fmt.pix.pixelformat	= frame->fmt->fourcc;
+
+	return 0;
+}
+
+static struct fimc_fmt *find_format(struct v4l2_format *f)
+{
+	struct fimc_fmt *fmt;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(fimc_formats); ++i) {
+		fmt = &fimc_formats[i];
+		if (fmt->fourcc == f->fmt.pix.pixelformat)
+			break;
+	}
+	if (i == ARRAY_SIZE(fimc_formats))
+		return NULL;
+
+	return fmt;
+}
+
+static int fimc_m2m_try_fmt(struct file *file, void *priv,
+			     struct v4l2_format *f)
+{
+	struct fimc_fmt *fmt;
+	u32 max_width, max_height, mod_x, mod_y;
+	struct fimc_ctx *ctx = priv;
+	struct fimc_dev *fimc = ctx->fimc_dev;
+	struct v4l2_pix_format *pix = &f->fmt.pix;
+	struct samsung_fimc_variant *variant = fimc->variant;
+
+	fmt = find_format(f);
+	if (!fmt) {
+		v4l2_err(&fimc->m2m.v4l2_dev,
+			 "Fourcc format (0x%X) invalid.\n",  pix->pixelformat);
+		return -EINVAL;
+	}
+
+	if (pix->field == V4L2_FIELD_ANY)
+		pix->field = V4L2_FIELD_NONE;
+	else if (V4L2_FIELD_NONE != pix->field)
+		return -EINVAL;
+
+	if (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT) {
+		max_width = variant->scaler_dis_w;
+		max_height = variant->scaler_dis_w;
+		mod_x = variant->min_inp_pixsize;
+		mod_y = variant->min_inp_pixsize;
+	} else if (f->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) {
+		max_width = variant->out_rot_dis_w;
+		max_height = variant->out_rot_dis_w;
+		mod_x = variant->min_out_pixsize;
+		mod_y = variant->min_out_pixsize;
+	} else {
+		err("Wrong stream type (%d)", f->type);
+		return -EINVAL;
+	}
+
+	dbg("max_w= %d, max_h= %d", max_width, max_height);
+
+	if (pix->height > max_height)
+		pix->height = max_height;
+	if (pix->width > max_width)
+		pix->width = max_width;
+
+	if (tiled_fmt(fmt)) {
+		mod_x = 64; /* 64x32 tile */
+		mod_y = 32;
+	}
+
+	dbg("mod_x= 0x%X, mod_y= 0x%X", mod_x, mod_y);
+
+	pix->width = (pix->width == 0) ? mod_x : ALIGN(pix->width, mod_x);
+	pix->height = (pix->height == 0) ? mod_y : ALIGN(pix->height, mod_y);
+
+	if (pix->bytesperline == 0 ||
+	    pix->bytesperline * 8 / fmt->depth > pix->width)
+		pix->bytesperline = (pix->width * fmt->depth) >> 3;
+
+	if (pix->sizeimage == 0)
+		pix->sizeimage = pix->height * pix->bytesperline;
+
+	dbg("pix->bytesperline= %d, fmt->depth= %d",
+	    pix->bytesperline, fmt->depth);
+
+	return 0;
+}
+
+
+static int fimc_m2m_s_fmt(struct file *file, void *priv, struct v4l2_format *f)
+{
+	struct fimc_ctx *ctx = priv;
+	struct v4l2_device *v4l2_dev = &ctx->fimc_dev->m2m.v4l2_dev;
+	struct videobuf_queue *src_vq, *dst_vq;
+	struct fimc_frame *frame;
+	struct v4l2_pix_format *pix;
+	unsigned long flags;
+	int ret = 0;
+
+	BUG_ON(!ctx);
+
+	ret = fimc_m2m_try_fmt(file, priv, f);
+	if (ret)
+		return ret;
+
+	mutex_lock(&ctx->fimc_dev->lock);
+
+	src_vq = v4l2_m2m_get_src_vq(ctx->m2m_ctx);
+	dst_vq = v4l2_m2m_get_dst_vq(ctx->m2m_ctx);
+
+	mutex_lock(&src_vq->vb_lock);
+	mutex_lock(&dst_vq->vb_lock);
+
+	if (f->type == V4L2_BUF_TYPE_VIDEO_OUTPUT) {
+		if (videobuf_queue_is_busy(src_vq)) {
+			v4l2_err(v4l2_dev, "%s queue busy\n", __func__);
+			ret = -EBUSY;
+			goto s_fmt_out;
+		}
+		frame = &ctx->s_frame;
+		spin_lock_irqsave(&ctx->slock, flags);
+		ctx->state |= FIMC_SRC_FMT;
+		spin_unlock_irqrestore(&ctx->slock, flags);
+
+	} else if (f->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) {
+		if (videobuf_queue_is_busy(dst_vq)) {
+			v4l2_err(v4l2_dev, "%s queue busy\n", __func__);
+			ret = -EBUSY;
+			goto s_fmt_out;
+		}
+		frame = &ctx->d_frame;
+		spin_lock_irqsave(&ctx->slock, flags);
+		ctx->state |= FIMC_DST_FMT;
+		spin_unlock_irqrestore(&ctx->slock, flags);
+	} else {
+		v4l2_err(&ctx->fimc_dev->m2m.v4l2_dev,
+			 "Wrong buffer/video queue type (%d)\n", f->type);
+		return -EINVAL;
+	}
+
+	pix = &f->fmt.pix;
+	frame->fmt = find_format(f);
+	if (!frame->fmt) {
+		ret = -EINVAL;
+		goto s_fmt_out;
+	}
+
+	frame->f_width = pix->bytesperline * 8 / frame->fmt->depth;
+	frame->f_height = pix->sizeimage/pix->bytesperline;
+	frame->width = pix->width;
+	frame->height = pix->height;
+	frame->o_width = pix->width;
+	frame->o_height = pix->height;
+	frame->offs_h = 0;
+	frame->offs_v = 0;
+	frame->size = (pix->width * pix->height * frame->fmt->depth) >> 3;
+	src_vq->field = dst_vq->field = pix->field;
+	spin_lock_irqsave(&ctx->slock, flags);
+	ctx->state |= FIMC_PARAMS;
+	spin_unlock_irqrestore(&ctx->slock, flags);
+
+	dbg("f_width= %d, f_height= %d", frame->f_width, frame->f_height);
+
+s_fmt_out:
+	mutex_unlock(&dst_vq->vb_lock);
+	mutex_unlock(&src_vq->vb_lock);
+	mutex_unlock(&ctx->fimc_dev->lock);
+	return ret;
+}
+
+static int fimc_m2m_reqbufs(struct file *file, void *priv,
+			  struct v4l2_requestbuffers *reqbufs)
+{
+	struct fimc_ctx *ctx = priv;
+	return v4l2_m2m_reqbufs(file, ctx->m2m_ctx, reqbufs);
+}
+
+static int fimc_m2m_querybuf(struct file *file, void *priv,
+			   struct v4l2_buffer *buf)
+{
+	struct fimc_ctx *ctx = priv;
+	return v4l2_m2m_querybuf(file, ctx->m2m_ctx, buf);
+}
+
+static int fimc_m2m_qbuf(struct file *file, void *priv,
+			  struct v4l2_buffer *buf)
+{
+	struct fimc_ctx *ctx = priv;
+
+	return v4l2_m2m_qbuf(file, ctx->m2m_ctx, buf);
+}
+
+static int fimc_m2m_dqbuf(struct file *file, void *priv,
+			   struct v4l2_buffer *buf)
+{
+	struct fimc_ctx *ctx = priv;
+	return v4l2_m2m_dqbuf(file, ctx->m2m_ctx, buf);
+}
+
+static int fimc_m2m_streamon(struct file *file, void *priv,
+			   enum v4l2_buf_type type)
+{
+	struct fimc_ctx *ctx = priv;
+	return v4l2_m2m_streamon(file, ctx->m2m_ctx, type);
+}
+
+static int fimc_m2m_streamoff(struct file *file, void *priv,
+			    enum v4l2_buf_type type)
+{
+	struct fimc_ctx *ctx = priv;
+	return v4l2_m2m_streamoff(file, ctx->m2m_ctx, type);
+}
+
+int fimc_m2m_queryctrl(struct file *file, void *priv,
+			    struct v4l2_queryctrl *qc)
+{
+	struct v4l2_queryctrl *c;
+	c = get_ctrl(qc->id);
+	if (!c)
+		return -EINVAL;
+	*qc = *c;
+	return 0;
+}
+
+int fimc_m2m_g_ctrl(struct file *file, void *priv,
+			 struct v4l2_control *ctrl)
+{
+	struct fimc_ctx *ctx = priv;
+
+	switch (ctrl->id) {
+	case V4L2_CID_HFLIP:
+		ctrl->value = (FLIP_X_AXIS & ctx->flip) ? 1 : 0;
+		break;
+	case V4L2_CID_VFLIP:
+		ctrl->value = (FLIP_Y_AXIS & ctx->flip) ? 1 : 0;
+		break;
+	case V4L2_CID_ROTATE:
+		ctrl->value = ctx->rotation;
+		break;
+	default:
+		v4l2_err(&ctx->fimc_dev->m2m.v4l2_dev, "Invalid control\n");
+		return -EINVAL;
+	}
+	dbg("ctrl->value= %d", ctrl->value);
+	return 0;
+}
+
+static int check_ctrl_val(struct fimc_ctx *ctx,
+			  struct v4l2_control *ctrl)
+{
+	struct v4l2_queryctrl *c;
+	c = get_ctrl(ctrl->id);
+	if (!c)
+		return -EINVAL;
+
+	if (ctrl->value < c->minimum || ctrl->value > c->maximum
+		|| (c->step != 0 && ctrl->value % c->step != 0)) {
+		v4l2_err(&ctx->fimc_dev->m2m.v4l2_dev,
+		"Invalid control value\n");
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
+int fimc_m2m_s_ctrl(struct file *file, void *priv,
+			 struct v4l2_control *ctrl)
+{
+	struct fimc_ctx *ctx = priv;
+	struct samsung_fimc_variant *variant = ctx->fimc_dev->variant;
+	unsigned long flags;
+	int ret = 0;
+
+	ret = check_ctrl_val(ctx, ctrl);
+	if (ret)
+		return ret;
+
+	switch (ctrl->id) {
+	case V4L2_CID_HFLIP:
+		if (ctx->rotation != 0)
+			return 0;
+		if (ctrl->value)
+			ctx->flip |= FLIP_X_AXIS;
+		else
+			ctx->flip &= ~FLIP_X_AXIS;
+		break;
+
+	case V4L2_CID_VFLIP:
+		if (ctx->rotation != 0)
+			return 0;
+		if (ctrl->value)
+			ctx->flip |= FLIP_Y_AXIS;
+		else
+			ctx->flip &= ~FLIP_Y_AXIS;
+		break;
+
+	case V4L2_CID_ROTATE:
+		if (ctrl->value == 90 || ctrl->value == 270) {
+			if (ctx->out_path == FIMC_LCDFIFO &&
+			    !variant->has_inp_rot) {
+				return -EINVAL;
+			} else if (ctx->in_path == FIMC_DMA &&
+				   !variant->has_out_rot) {
+				return -EINVAL;
+			}
+		}
+		ctx->rotation = ctrl->value;
+		if (ctrl->value == 180)
+			ctx->flip = FLIP_XY_AXIS;
+		break;
+
+	default:
+		v4l2_err(&ctx->fimc_dev->m2m.v4l2_dev, "Invalid control\n");
+		return -EINVAL;
+	}
+	spin_lock_irqsave(&ctx->slock, flags);
+	ctx->state |= FIMC_PARAMS;
+	spin_unlock_irqrestore(&ctx->slock, flags);
+	return 0;
+}
+
+
+static int fimc_m2m_cropcap(struct file *file, void *fh,
+			     struct v4l2_cropcap *cr)
+{
+	struct fimc_frame *frame;
+	struct fimc_ctx *ctx = fh;
+
+	ctx_m2m_get_frame(frame, ctx, cr->type);
+
+	cr->bounds.left = 0;
+	cr->bounds.top = 0;
+	cr->bounds.width = frame->f_width;
+	cr->bounds.height = frame->f_height;
+	cr->defrect.left = frame->offs_h;
+	cr->defrect.top = frame->offs_v;
+	cr->defrect.width = frame->o_width;
+	cr->defrect.height = frame->o_height;
+	return 0;
+}
+
+static int fimc_m2m_g_crop(struct file *file, void *fh, struct v4l2_crop *cr)
+{
+	struct fimc_frame *frame;
+	struct fimc_ctx *ctx = file->private_data;
+
+	ctx_m2m_get_frame(frame, ctx, cr->type);
+
+	cr->c.left = frame->offs_h;
+	cr->c.top = frame->offs_v;
+	cr->c.width = frame->width;
+	cr->c.height = frame->height;
+
+	return 0;
+}
+
+static int fimc_m2m_s_crop(struct file *file, void *fh, struct v4l2_crop *cr)
+{
+	struct fimc_ctx *ctx = file->private_data;
+	struct fimc_dev *fimc = ctx->fimc_dev;
+	unsigned long flags;
+	struct fimc_frame *f;
+	u32 min_size;
+	int ret = 0;
+
+	if (cr->c.top < 0 || cr->c.left < 0) {
+		v4l2_err(&fimc->m2m.v4l2_dev,
+			"doesn't support negative values for top & left\n");
+		return -EINVAL;
+	}
+
+	if (cr->c.width  <= 0 || cr->c.height <= 0) {
+		v4l2_err(&fimc->m2m.v4l2_dev,
+			"crop width and height must be greater than 0\n");
+		return -EINVAL;
+	}
+
+	ctx_m2m_get_frame(f, ctx, cr->type);
+
+	/* Adjust to required pixel boundary. */
+	min_size = (cr->type == V4L2_BUF_TYPE_VIDEO_OUTPUT) ?
+		fimc->variant->min_inp_pixsize : fimc->variant->min_out_pixsize;
+
+	cr->c.width = round_down(cr->c.width, min_size);
+	cr->c.height = round_down(cr->c.height, min_size);
+	cr->c.left = round_down(cr->c.left + 1, min_size);
+	cr->c.top = round_down(cr->c.top + 1, min_size);
+
+	if ((cr->c.left + cr->c.width > f->o_width)
+		|| (cr->c.top + cr->c.height > f->o_height)) {
+		v4l2_err(&fimc->m2m.v4l2_dev, "Error in S_CROP params\n");
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&ctx->slock, flags);
+	if ((ctx->state & FIMC_SRC_FMT) && (ctx->state & FIMC_DST_FMT)) {
+		/* Check for the pixel scaling ratio when cropping input img. */
+		if (cr->type == V4L2_BUF_TYPE_VIDEO_OUTPUT)
+			ret = fimc_check_scaler_ratio(&cr->c, &ctx->d_frame);
+		else if (cr->type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
+			ret = fimc_check_scaler_ratio(&cr->c, &ctx->s_frame);
+
+		if (ret) {
+			spin_unlock_irqrestore(&ctx->slock, flags);
+			v4l2_err(&fimc->m2m.v4l2_dev,  "Out of scaler range");
+			return -EINVAL;
+		}
+	}
+	ctx->state |= FIMC_PARAMS;
+	spin_unlock_irqrestore(&ctx->slock, flags);
+
+	f->offs_h = cr->c.left;
+	f->offs_v = cr->c.top;
+	f->width = cr->c.width;
+	f->height = cr->c.height;
+	return 0;
+}
+
+static const struct v4l2_ioctl_ops fimc_m2m_ioctl_ops = {
+	.vidioc_querycap		= fimc_m2m_querycap,
+
+	.vidioc_enum_fmt_vid_cap	= fimc_m2m_enum_fmt,
+	.vidioc_enum_fmt_vid_out	= fimc_m2m_enum_fmt,
+
+	.vidioc_g_fmt_vid_cap		= fimc_m2m_g_fmt,
+	.vidioc_g_fmt_vid_out		= fimc_m2m_g_fmt,
+
+	.vidioc_try_fmt_vid_cap		= fimc_m2m_try_fmt,
+	.vidioc_try_fmt_vid_out		= fimc_m2m_try_fmt,
+
+	.vidioc_s_fmt_vid_cap		= fimc_m2m_s_fmt,
+	.vidioc_s_fmt_vid_out		= fimc_m2m_s_fmt,
+
+	.vidioc_reqbufs			= fimc_m2m_reqbufs,
+	.vidioc_querybuf		= fimc_m2m_querybuf,
+
+	.vidioc_qbuf			= fimc_m2m_qbuf,
+	.vidioc_dqbuf			= fimc_m2m_dqbuf,
+
+	.vidioc_streamon		= fimc_m2m_streamon,
+	.vidioc_streamoff		= fimc_m2m_streamoff,
+
+	.vidioc_queryctrl		= fimc_m2m_queryctrl,
+	.vidioc_g_ctrl			= fimc_m2m_g_ctrl,
+	.vidioc_s_ctrl			= fimc_m2m_s_ctrl,
+
+	.vidioc_g_crop			= fimc_m2m_g_crop,
+	.vidioc_s_crop			= fimc_m2m_s_crop,
+	.vidioc_cropcap			= fimc_m2m_cropcap
+
+};
+
+static void queue_init(void *priv, struct videobuf_queue *vq,
+		       enum v4l2_buf_type type)
+{
+	struct fimc_ctx *ctx = priv;
+	struct fimc_dev *fimc = ctx->fimc_dev;
+
+	videobuf_queue_dma_contig_init(vq, &fimc_qops,
+		fimc->m2m.v4l2_dev.dev,
+		&fimc->irqlock, type, V4L2_FIELD_NONE,
+		sizeof(struct fimc_vid_buffer), priv);
+}
+
+static int fimc_m2m_open(struct file *file)
+{
+	struct fimc_dev *fimc = video_drvdata(file);
+	struct fimc_ctx *ctx = NULL;
+	int err = 0;
+
+	mutex_lock(&fimc->lock);
+	fimc->m2m.refcnt++;
+	set_bit(ST_OUTDMA_RUN, &fimc->state);
+	mutex_unlock(&fimc->lock);
+
+
+	ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	file->private_data = ctx;
+	ctx->fimc_dev = fimc;
+	/* default format */
+	ctx->s_frame.fmt = &fimc_formats[0];
+	ctx->d_frame.fmt = &fimc_formats[0];
+	/* per user process device context initialization */
+	ctx->state = 0;
+	ctx->flags = 0;
+	ctx->effect.type = S5P_FIMC_EFFECT_ORIGINAL;
+	ctx->in_path = FIMC_DMA;
+	ctx->out_path = FIMC_DMA;
+	spin_lock_init(&ctx->slock);
+
+	ctx->m2m_ctx = v4l2_m2m_ctx_init(ctx, fimc->m2m.m2m_dev, queue_init);
+	if (IS_ERR(ctx->m2m_ctx)) {
+		err = PTR_ERR(ctx->m2m_ctx);
+		kfree(ctx);
+	}
+	return err;
+}
+
+static int fimc_m2m_release(struct file *file)
+{
+	struct fimc_ctx *ctx = file->private_data;
+	struct fimc_dev *fimc = ctx->fimc_dev;
+
+	v4l2_m2m_ctx_release(ctx->m2m_ctx);
+	kfree(ctx);
+	mutex_lock(&fimc->lock);
+	if (--fimc->m2m.refcnt <= 0)
+		clear_bit(ST_OUTDMA_RUN, &fimc->state);
+	mutex_unlock(&fimc->lock);
+	return 0;
+}
+
+static unsigned int fimc_m2m_poll(struct file *file,
+				     struct poll_table_struct *wait)
+{
+	struct fimc_ctx *ctx = file->private_data;
+	return v4l2_m2m_poll(file, ctx->m2m_ctx, wait);
+}
+
+
+static int fimc_m2m_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct fimc_ctx *ctx = file->private_data;
+	return v4l2_m2m_mmap(file, ctx->m2m_ctx, vma);
+}
+
+static const struct v4l2_file_operations fimc_m2m_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fimc_m2m_open,
+	.release	= fimc_m2m_release,
+	.poll		= fimc_m2m_poll,
+	.ioctl		= video_ioctl2,
+	.mmap		= fimc_m2m_mmap,
+};
+
+static struct v4l2_m2m_ops m2m_ops = {
+	.device_run	= fimc_dma_run,
+	.job_abort	= fimc_job_abort,
+};
+
+
+static int fimc_register_m2m_device(struct fimc_dev *fimc)
+{
+	struct video_device *vfd;
+	struct platform_device *pdev;
+	struct v4l2_device *v4l2_dev;
+	int ret = 0;
+
+	if (!fimc)
+		return -ENODEV;
+
+	pdev = fimc->pdev;
+	v4l2_dev = &fimc->m2m.v4l2_dev;
+
+	/* set name if it is empty */
+	if (!v4l2_dev->name[0])
+		snprintf(v4l2_dev->name, sizeof(v4l2_dev->name),
+			 "%s.m2m", dev_name(&pdev->dev));
+
+	ret = v4l2_device_register(&pdev->dev, v4l2_dev);
+	if (ret)
+		return ret;;
+
+	vfd = video_device_alloc();
+	if (!vfd) {
+		v4l2_err(v4l2_dev, "Failed to allocate video device\n");
+		goto err_m2m_r1;
+	}
+
+	vfd->fops	= &fimc_m2m_fops;
+	vfd->ioctl_ops	= &fimc_m2m_ioctl_ops;
+	vfd->minor	= -1;
+	vfd->release	= video_device_release;
+
+	snprintf(vfd->name, sizeof(vfd->name), "%s:m2m", dev_name(&pdev->dev));
+
+	video_set_drvdata(vfd, fimc);
+	platform_set_drvdata(pdev, fimc);
+
+	fimc->m2m.vfd = vfd;
+	fimc->m2m.m2m_dev = v4l2_m2m_init(&m2m_ops);
+	if (IS_ERR(fimc->m2m.m2m_dev)) {
+		v4l2_err(v4l2_dev, "failed to initialize v4l2-m2m device\n");
+		ret = PTR_ERR(fimc->m2m.m2m_dev);
+		goto err_m2m_r2;
+	}
+
+	ret = video_register_device(vfd, VFL_TYPE_GRABBER, -1);
+	if (ret) {
+		v4l2_err(v4l2_dev,
+			 "%s(): failed to register video device\n", __func__);
+		goto err_m2m_r3;
+	}
+	v4l2_info(v4l2_dev,
+		  "FIMC m2m driver registered as /dev/video%d\n", vfd->num);
+
+	return 0;
+
+err_m2m_r3:
+	v4l2_m2m_release(fimc->m2m.m2m_dev);
+err_m2m_r2:
+	video_device_release(fimc->m2m.vfd);
+err_m2m_r1:
+	v4l2_device_unregister(v4l2_dev);
+
+	return ret;
+}
+
+static void fimc_unregister_m2m_device(struct fimc_dev *fimc)
+{
+	if (fimc) {
+		v4l2_m2m_release(fimc->m2m.m2m_dev);
+		video_unregister_device(fimc->m2m.vfd);
+		video_device_release(fimc->m2m.vfd);
+		v4l2_device_unregister(&fimc->m2m.v4l2_dev);
+	}
+}
+
+static void fimc_clk_release(struct fimc_dev *fimc)
+{
+	int i;
+	for (i = 0; i < NUM_FIMC_CLOCKS; i++) {
+		if (fimc->clock[i]) {
+			clk_disable(fimc->clock[i]);
+			clk_put(fimc->clock[i]);
+		}
+	}
+}
+
+static int fimc_clk_get(struct fimc_dev *fimc)
+{
+	int i;
+	for (i = 0; i < NUM_FIMC_CLOCKS; i++) {
+		fimc->clock[i] = clk_get(&fimc->pdev->dev, fimc_clock_name[i]);
+		if (IS_ERR(fimc->clock[i])) {
+			dev_err(&fimc->pdev->dev,
+				"failed to get fimc clock: %s\n",
+				fimc_clock_name[i]);
+			return -ENXIO;
+		}
+		clk_enable(fimc->clock[i]);
+	}
+	return 0;
+}
+
+static int fimc_probe(struct platform_device *pdev)
+{
+	struct fimc_dev *fimc;
+	struct resource *res;
+	struct samsung_fimc_driverdata *drv_data;
+	int ret = 0;
+
+	dev_dbg(&pdev->dev, "%s():\n", __func__);
+
+	drv_data = (struct samsung_fimc_driverdata *)
+		platform_get_device_id(pdev)->driver_data;
+
+	if (pdev->id >= drv_data->devs_cnt) {
+		dev_err(&pdev->dev, "Invalid platform device id: %d\n",
+			pdev->id);
+		return -EINVAL;
+	}
+
+	fimc = kzalloc(sizeof(struct fimc_dev), GFP_KERNEL);
+	if (!fimc)
+		return -ENOMEM;
+
+	fimc->id = pdev->id;
+	fimc->variant = drv_data->variant[fimc->id];
+	fimc->pdev = pdev;
+	fimc->state = ST_IDLE;
+
+	spin_lock_init(&fimc->irqlock);
+	spin_lock_init(&fimc->slock);
+
+	mutex_init(&fimc->lock);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "failed to find the registers\n");
+		ret = -ENOENT;
+		goto err_info;
+	}
+
+	fimc->regs_res = request_mem_region(res->start, resource_size(res),
+			dev_name(&pdev->dev));
+	if (!fimc->regs_res) {
+		dev_err(&pdev->dev, "failed to obtain register region\n");
+		ret = -ENOENT;
+		goto err_info;
+	}
+
+	fimc->regs = ioremap(res->start, resource_size(res));
+	if (!fimc->regs) {
+		dev_err(&pdev->dev, "failed to map registers\n");
+		ret = -ENXIO;
+		goto err_req_region;
+	}
+
+	ret = fimc_clk_get(fimc);
+	if (ret)
+		goto err_regs_unmap;
+
+	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "failed to get IRQ resource\n");
+		ret = -ENXIO;
+		goto err_clk;
+	}
+	fimc->irq = res->start;
+
+	fimc_hw_reset(fimc);
+
+	ret = request_irq(fimc->irq, fimc_isr, 0, pdev->name, fimc);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to install irq (%d)\n", ret);
+		goto err_clk;
+	}
+
+	fimc->work_queue = create_workqueue(dev_name(&fimc->pdev->dev));
+	if (!fimc->work_queue)
+		goto err_irq;
+
+	ret = fimc_register_m2m_device(fimc);
+	if (ret)
+		goto err_wq;
+
+	fimc_hw_en_lastirq(fimc, true);
+
+	dev_dbg(&pdev->dev, "%s(): fimc-%d registered successfully\n",
+		__func__, fimc->id);
+
+	return 0;
+
+err_wq:
+	destroy_workqueue(fimc->work_queue);
+err_irq:
+	free_irq(fimc->irq, fimc);
+err_clk:
+	fimc_clk_release(fimc);
+err_regs_unmap:
+	iounmap(fimc->regs);
+err_req_region:
+	release_resource(fimc->regs_res);
+	kfree(fimc->regs_res);
+err_info:
+	kfree(fimc);
+	dev_err(&pdev->dev, "failed to install\n");
+	return ret;
+}
+
+static int __devexit fimc_remove(struct platform_device *pdev)
+{
+	struct fimc_dev *fimc =
+		(struct fimc_dev *)platform_get_drvdata(pdev);
+
+	v4l2_info(&fimc->m2m.v4l2_dev, "Removing %s\n", pdev->name);
+
+	free_irq(fimc->irq, fimc);
+
+	fimc_hw_reset(fimc);
+
+	fimc_unregister_m2m_device(fimc);
+	fimc_clk_release(fimc);
+	iounmap(fimc->regs);
+	release_resource(fimc->regs_res);
+	kfree(fimc->regs_res);
+	kfree(fimc);
+	return 0;
+}
+
+static struct samsung_fimc_variant fimc01_variant_s5p = {
+	.has_inp_rot	= 1,
+	.has_out_rot	= 1,
+	.min_inp_pixsize = 16,
+	.min_out_pixsize = 16,
+
+	.scaler_en_w	= 3264,
+	.scaler_dis_w	= 8192,
+	.in_rot_en_h	= 1920,
+	.in_rot_dis_w	= 8192,
+	.out_rot_en_w	= 1920,
+	.out_rot_dis_w	= 4224,
+};
+
+static struct samsung_fimc_variant fimc2_variant_s5p = {
+	.min_inp_pixsize = 16,
+	.min_out_pixsize = 16,
+
+	.scaler_en_w	= 4224,
+	.scaler_dis_w	= 8192,
+	.in_rot_en_h	= 1920,
+	.in_rot_dis_w	= 8192,
+	.out_rot_en_w	= 1920,
+	.out_rot_dis_w	= 4224,
+};
+
+static struct samsung_fimc_variant fimc01_variant_s5pv210 = {
+	.has_inp_rot	= 1,
+	.has_out_rot	= 1,
+	.min_inp_pixsize = 16,
+	.min_out_pixsize = 32,
+
+	.scaler_en_w	= 4224,
+	.scaler_dis_w	= 8192,
+	.in_rot_en_h	= 1920,
+	.in_rot_dis_w	= 8192,
+	.out_rot_en_w	= 1920,
+	.out_rot_dis_w	= 4224,
+};
+
+static struct samsung_fimc_variant fimc2_variant_s5pv210 = {
+	.min_inp_pixsize = 16,
+	.min_out_pixsize = 32,
+
+	.scaler_en_w	= 1920,
+	.scaler_dis_w	= 8192,
+	.in_rot_en_h	= 1280,
+	.in_rot_dis_w	= 8192,
+	.out_rot_en_w	= 1280,
+	.out_rot_dis_w	= 1920,
+};
+
+static struct samsung_fimc_driverdata fimc_drvdata_s5p = {
+	.variant = {
+		[0] = &fimc01_variant_s5p,
+		[1] = &fimc01_variant_s5p,
+		[2] = &fimc2_variant_s5p,
+	},
+	.devs_cnt = 3
+};
+
+static struct samsung_fimc_driverdata fimc_drvdata_s5pv210 = {
+	.variant = {
+		[0] = &fimc01_variant_s5pv210,
+		[1] = &fimc01_variant_s5pv210,
+		[2] = &fimc2_variant_s5pv210,
+	},
+	.devs_cnt = 3
+};
+
+static struct platform_device_id fimc_driver_ids[] = {
+	{
+		.name		= "s5p-fimc",
+		.driver_data	= (unsigned long)&fimc_drvdata_s5p,
+	}, {
+		.name		= "s5pv210-fimc",
+		.driver_data	= (unsigned long)&fimc_drvdata_s5pv210,
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(platform, fimc_driver_ids);
+
+static struct platform_driver fimc_driver = {
+	.probe		= fimc_probe,
+	.remove	= __devexit_p(fimc_remove),
+	.id_table	= fimc_driver_ids,
+	.driver = {
+		.name	= MODULE_NAME,
+		.owner	= THIS_MODULE,
+	}
+};
+
+static char banner[] __initdata = KERN_INFO
+	"S5PC Camera Interface V4L2 Driver, (c) 2010 Samsung Electronics\n";
+
+static int __init fimc_init(void)
+{
+	u32 ret;
+	printk(banner);
+
+	ret = platform_driver_register(&fimc_driver);
+	if (ret) {
+		printk(KERN_ERR "FIMC platform driver register failed\n");
+		return -1;
+	}
+	return 0;
+}
+
+static void __exit fimc_exit(void)
+{
+	platform_driver_unregister(&fimc_driver);
+}
+
+module_init(fimc_init);
+module_exit(fimc_exit);
+
+MODULE_AUTHOR("Sylwester Nawrocki, s.nawrocki@samsung.com");
+MODULE_DESCRIPTION("S3C/S5P FIMC (video postprocessor) driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/media/video/s5p-fimc/fimc-core.h b/drivers/media/video/s5p-fimc/fimc-core.h
new file mode 100644
index 000000000000..f121b939626a
--- /dev/null
+++ b/drivers/media/video/s5p-fimc/fimc-core.h
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2010 Samsung Electronics
+ *
+ * Sylwester Nawrocki, <s.nawrocki@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef FIMC_CORE_H_
+#define FIMC_CORE_H_
+
+#include <linux/types.h>
+#include <media/videobuf-core.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-mem2mem.h>
+#include <linux/videodev2.h>
+#include "regs-fimc.h"
+
+#define err(fmt, args...) \
+	printk(KERN_ERR "%s:%d: " fmt "\n", __func__, __LINE__, ##args)
+
+#ifdef DEBUG
+#define dbg(fmt, args...) \
+	printk(KERN_DEBUG "%s:%d: " fmt "\n", __func__, __LINE__, ##args)
+#else
+#define dbg(fmt, args...)
+#endif
+
+#define ctx_m2m_get_frame(frame, ctx, type) do { \
+	if (V4L2_BUF_TYPE_VIDEO_OUTPUT == (type)) { \
+		frame = &(ctx)->s_frame; \
+	} else if (V4L2_BUF_TYPE_VIDEO_CAPTURE == (type)) { \
+		frame = &(ctx)->d_frame; \
+	} else { \
+		v4l2_err(&(ctx)->fimc_dev->m2m.v4l2_dev,\
+			"Wrong buffer/video queue type (%d)\n", type); \
+		return -EINVAL; \
+	} \
+} while (0)
+
+#define NUM_FIMC_CLOCKS		2
+#define MODULE_NAME		"s5p-fimc"
+#define FIMC_MAX_DEVS		3
+#define FIMC_MAX_OUT_BUFS	4
+#define SCALER_MAX_HRATIO	64
+#define SCALER_MAX_VRATIO	64
+
+enum {
+	ST_IDLE,
+	ST_OUTDMA_RUN,
+	ST_M2M_PEND,
+};
+
+#define fimc_m2m_active(dev) test_bit(ST_OUTDMA_RUN, &(dev)->state)
+#define fimc_m2m_pending(dev) test_bit(ST_M2M_PEND, &(dev)->state)
+
+enum fimc_datapath {
+	FIMC_ITU_CAM_A,
+	FIMC_ITU_CAM_B,
+	FIMC_MIPI_CAM,
+	FIMC_DMA,
+	FIMC_LCDFIFO,
+	FIMC_WRITEBACK
+};
+
+enum fimc_color_fmt {
+	S5P_FIMC_RGB565,
+	S5P_FIMC_RGB666,
+	S5P_FIMC_RGB888,
+	S5P_FIMC_YCBCR420,
+	S5P_FIMC_YCBCR422,
+	S5P_FIMC_YCBYCR422,
+	S5P_FIMC_YCRYCB422,
+	S5P_FIMC_CBYCRY422,
+	S5P_FIMC_CRYCBY422,
+	S5P_FIMC_RGB30_LOCAL,
+	S5P_FIMC_YCBCR444_LOCAL,
+	S5P_FIMC_MAX_COLOR = S5P_FIMC_YCBCR444_LOCAL,
+	S5P_FIMC_COLOR_MASK = 0x0F,
+};
+
+/* Y/Cb/Cr components order at DMA output for 1 plane YCbCr 4:2:2 formats. */
+#define	S5P_FIMC_OUT_CRYCBY	S5P_CIOCTRL_ORDER422_CRYCBY
+#define	S5P_FIMC_OUT_CBYCRY	S5P_CIOCTRL_ORDER422_YCRYCB
+#define	S5P_FIMC_OUT_YCRYCB	S5P_CIOCTRL_ORDER422_CBYCRY
+#define	S5P_FIMC_OUT_YCBYCR	S5P_CIOCTRL_ORDER422_YCBYCR
+
+/* Input Y/Cb/Cr components order for 1 plane YCbCr 4:2:2 color formats. */
+#define	S5P_FIMC_IN_CRYCBY	S5P_MSCTRL_ORDER422_CRYCBY
+#define	S5P_FIMC_IN_CBYCRY	S5P_MSCTRL_ORDER422_YCRYCB
+#define	S5P_FIMC_IN_YCRYCB	S5P_MSCTRL_ORDER422_CBYCRY
+#define	S5P_FIMC_IN_YCBYCR	S5P_MSCTRL_ORDER422_YCBYCR
+
+/* Cb/Cr chrominance components order for 2 plane Y/CbCr 4:2:2 formats. */
+#define	S5P_FIMC_LSB_CRCB	S5P_CIOCTRL_ORDER422_2P_LSB_CRCB
+
+/* The embedded image effect selection */
+#define	S5P_FIMC_EFFECT_ORIGINAL	S5P_CIIMGEFF_FIN_BYPASS
+#define	S5P_FIMC_EFFECT_ARBITRARY	S5P_CIIMGEFF_FIN_ARBITRARY
+#define	S5P_FIMC_EFFECT_NEGATIVE	S5P_CIIMGEFF_FIN_NEGATIVE
+#define	S5P_FIMC_EFFECT_ARTFREEZE	S5P_CIIMGEFF_FIN_ARTFREEZE
+#define	S5P_FIMC_EFFECT_EMBOSSING	S5P_CIIMGEFF_FIN_EMBOSSING
+#define	S5P_FIMC_EFFECT_SIKHOUETTE	S5P_CIIMGEFF_FIN_SILHOUETTE
+
+/* The hardware context state. */
+#define	FIMC_PARAMS			(1 << 0)
+#define	FIMC_SRC_ADDR			(1 << 1)
+#define	FIMC_DST_ADDR			(1 << 2)
+#define	FIMC_SRC_FMT			(1 << 3)
+#define	FIMC_DST_FMT			(1 << 4)
+
+/* Image conversion flags */
+#define	FIMC_IN_DMA_ACCESS_TILED	(1 << 0)
+#define	FIMC_IN_DMA_ACCESS_LINEAR	(0 << 0)
+#define	FIMC_OUT_DMA_ACCESS_TILED	(1 << 1)
+#define	FIMC_OUT_DMA_ACCESS_LINEAR	(0 << 1)
+#define	FIMC_SCAN_MODE_PROGRESSIVE	(0 << 2)
+#define	FIMC_SCAN_MODE_INTERLACED	(1 << 2)
+/* YCbCr data dynamic range for RGB-YUV color conversion. Y/Cb/Cr: (0 ~ 255) */
+#define	FIMC_COLOR_RANGE_WIDE		(0 << 3)
+/* Y (16 ~ 235), Cb/Cr (16 ~ 240) */
+#define	FIMC_COLOR_RANGE_NARROW		(1 << 3)
+
+#define	FLIP_NONE			0
+#define	FLIP_X_AXIS			1
+#define	FLIP_Y_AXIS			2
+#define	FLIP_XY_AXIS			(FLIP_X_AXIS | FLIP_Y_AXIS)
+
+/**
+ * struct fimc_fmt - the driver's internal color format data
+ * @name: format description
+ * @fourcc: the fourcc code for this format
+ * @color: the corresponding fimc_color_fmt
+ * @depth: number of bits per pixel
+ * @buff_cnt: number of physically non-contiguous data planes
+ * @planes_cnt: number of physically contiguous data planes
+ */
+struct fimc_fmt {
+	char	*name;
+	u32	fourcc;
+	u32	color;
+	u32	depth;
+	u16	buff_cnt;
+	u16	planes_cnt;
+};
+
+/**
+ * struct fimc_dma_offset - pixel offset information for DMA
+ * @y_h:	y value horizontal offset
+ * @y_v:	y value vertical offset
+ * @cb_h:	cb value horizontal offset
+ * @cb_v:	cb value vertical offset
+ * @cr_h:	cr value horizontal offset
+ * @cr_v:	cr value vertical offset
+ */
+struct fimc_dma_offset {
+	int	y_h;
+	int	y_v;
+	int	cb_h;
+	int	cb_v;
+	int	cr_h;
+	int	cr_v;
+};
+
+/**
+ * struct fimc_effect - the configuration data for the "Arbitrary" image effect
+ * @type:	effect type
+ * @pat_cb:	cr value when type is "arbitrary"
+ * @pat_cr:	cr value when type is "arbitrary"
+ */
+struct fimc_effect {
+	u32	type;
+	u8	pat_cb;
+	u8	pat_cr;
+};
+
+/**
+ * struct fimc_scaler - the configuration data for FIMC inetrnal scaler
+ *
+ * @enabled:		the flag set when the scaler is used
+ * @hfactor:		horizontal shift factor
+ * @vfactor:		vertical shift factor
+ * @pre_hratio:		horizontal ratio of the prescaler
+ * @pre_vratio:		vertical ratio of the prescaler
+ * @pre_dst_width:	the prescaler's destination width
+ * @pre_dst_height:	the prescaler's destination height
+ * @scaleup_h:		flag indicating scaling up horizontally
+ * @scaleup_v:		flag indicating scaling up vertically
+ * @main_hratio:	the main scaler's horizontal ratio
+ * @main_vratio:	the main scaler's vertical ratio
+ * @real_width:		source width - offset
+ * @real_height:	source height - offset
+ * @copy_mode:		flag set if one-to-one mode is used, i.e. no scaling
+ *			and color format conversion
+ */
+struct fimc_scaler {
+	u32	enabled;
+	u32	hfactor;
+	u32	vfactor;
+	u32	pre_hratio;
+	u32	pre_vratio;
+	u32	pre_dst_width;
+	u32	pre_dst_height;
+	u32	scaleup_h;
+	u32	scaleup_v;
+	u32	main_hratio;
+	u32	main_vratio;
+	u32	real_width;
+	u32	real_height;
+	u32	copy_mode;
+};
+
+/**
+ * struct fimc_addr - the FIMC physical address set for DMA
+ *
+ * @y:	 luminance plane physical address
+ * @cb:	 Cb plane physical address
+ * @cr:	 Cr plane physical address
+ */
+struct fimc_addr {
+	u32	y;
+	u32	cb;
+	u32	cr;
+};
+
+/**
+ * struct fimc_vid_buffer - the driver's video buffer
+ * @vb:	v4l videobuf buffer
+ */
+struct fimc_vid_buffer {
+	struct videobuf_buffer	vb;
+};
+
+/**
+ * struct fimc_frame - input/output frame format properties
+ *
+ * @f_width:	image full width (virtual screen size)
+ * @f_height:	image full height (virtual screen size)
+ * @o_width:	original image width as set by S_FMT
+ * @o_height:	original image height as set by S_FMT
+ * @offs_h:	image horizontal pixel offset
+ * @offs_v:	image vertical pixel offset
+ * @width:	image pixel width
+ * @height:	image pixel weight
+ * @paddr:	image frame buffer physical addresses
+ * @buf_cnt:	number of buffers depending on a color format
+ * @size:	image size in bytes
+ * @color:	color format
+ * @dma_offset:	DMA offset in bytes
+ */
+struct fimc_frame {
+	u32	f_width;
+	u32	f_height;
+	u32	o_width;
+	u32	o_height;
+	u32	offs_h;
+	u32	offs_v;
+	u32	width;
+	u32	height;
+	u32	size;
+	struct fimc_addr	paddr;
+	struct fimc_dma_offset	dma_offset;
+	struct fimc_fmt		*fmt;
+};
+
+/**
+ * struct fimc_m2m_device - v4l2 memory-to-memory device data
+ * @vfd: the video device node for v4l2 m2m mode
+ * @v4l2_dev: v4l2 device for m2m mode
+ * @m2m_dev: v4l2 memory-to-memory device data
+ * @ctx: hardware context data
+ * @refcnt: the reference counter
+ */
+struct fimc_m2m_device {
+	struct video_device	*vfd;
+	struct v4l2_device	v4l2_dev;
+	struct v4l2_m2m_dev	*m2m_dev;
+	struct fimc_ctx		*ctx;
+	int			refcnt;
+};
+
+/**
+ * struct samsung_fimc_variant - camera interface variant information
+ *
+ * @pix_hoff: indicate whether horizontal offset is in pixels or in bytes
+ * @has_inp_rot: set if has input rotator
+ * @has_out_rot: set if has output rotator
+ * @min_inp_pixsize: minimum input pixel size
+ * @min_out_pixsize: minimum output pixel size
+ * @scaler_en_w: maximum input pixel width when the scaler is enabled
+ * @scaler_dis_w: maximum input pixel width when the scaler is disabled
+ * @in_rot_en_h: maximum input width when the input rotator is used
+ * @in_rot_dis_w: maximum input width when the input rotator is used
+ * @out_rot_en_w: maximum output width for the output rotator enabled
+ * @out_rot_dis_w: maximum output width for the output rotator enabled
+ */
+struct samsung_fimc_variant {
+	unsigned int	pix_hoff:1;
+	unsigned int	has_inp_rot:1;
+	unsigned int	has_out_rot:1;
+
+	u16		min_inp_pixsize;
+	u16		min_out_pixsize;
+	u16		scaler_en_w;
+	u16		scaler_dis_w;
+	u16		in_rot_en_h;
+	u16		in_rot_dis_w;
+	u16		out_rot_en_w;
+	u16		out_rot_dis_w;
+};
+
+/**
+ * struct samsung_fimc_driverdata - per-device type driver data for init time.
+ *
+ * @variant: the variant information for this driver.
+ * @dev_cnt: number of fimc sub-devices available in SoC
+ */
+struct samsung_fimc_driverdata {
+	struct samsung_fimc_variant *variant[FIMC_MAX_DEVS];
+	int	devs_cnt;
+};
+
+struct fimc_ctx;
+
+/**
+ * struct fimc_subdev - abstraction for a FIMC entity
+ *
+ * @slock:	the spinlock protecting this data structure
+ * @lock:	the mutex protecting this data structure
+ * @pdev:	pointer to the FIMC platform device
+ * @id:		FIMC device index (0..2)
+ * @clock[]:	the clocks required for FIMC operation
+ * @regs:	the mapped hardware registers
+ * @regs_res:	the resource claimed for IO registers
+ * @irq:	interrupt number of the FIMC subdevice
+ * @irqlock:	spinlock protecting videbuffer queue
+ * @m2m:	memory-to-memory V4L2 device information
+ * @state:	the FIMC device state flags
+ */
+struct fimc_dev {
+	spinlock_t			slock;
+	struct mutex			lock;
+	struct platform_device		*pdev;
+	struct samsung_fimc_variant	*variant;
+	int				id;
+	struct clk			*clock[NUM_FIMC_CLOCKS];
+	void __iomem			*regs;
+	struct resource			*regs_res;
+	int				irq;
+	spinlock_t			irqlock;
+	struct workqueue_struct		*work_queue;
+	struct fimc_m2m_device		m2m;
+	unsigned long			state;
+};
+
+/**
+ * fimc_ctx - the device context data
+ *
+ * @lock:		mutex protecting this data structure
+ * @s_frame:		source frame properties
+ * @d_frame:		destination frame properties
+ * @out_order_1p:	output 1-plane YCBCR order
+ * @out_order_2p:	output 2-plane YCBCR order
+ * @in_order_1p		input 1-plane YCBCR order
+ * @in_order_2p:	input 2-plane YCBCR order
+ * @in_path:		input mode (DMA or camera)
+ * @out_path:		output mode (DMA or FIFO)
+ * @scaler:		image scaler properties
+ * @effect:		image effect
+ * @rotation:		image clockwise rotation in degrees
+ * @flip:		image flip mode
+ * @flags:		an additional flags for image conversion
+ * @state:		flags to keep track of user configuration
+ * @fimc_dev:		the FIMC device this context applies to
+ * @m2m_ctx:		memory-to-memory device context
+ */
+struct fimc_ctx {
+	spinlock_t		slock;
+	struct fimc_frame	s_frame;
+	struct fimc_frame	d_frame;
+	u32			out_order_1p;
+	u32			out_order_2p;
+	u32			in_order_1p;
+	u32			in_order_2p;
+	enum fimc_datapath	in_path;
+	enum fimc_datapath	out_path;
+	struct fimc_scaler	scaler;
+	struct fimc_effect	effect;
+	int			rotation;
+	u32			flip;
+	u32			flags;
+	u32			state;
+	struct fimc_dev		*fimc_dev;
+	struct v4l2_m2m_ctx	*m2m_ctx;
+};
+
+
+static inline int tiled_fmt(struct fimc_fmt *fmt)
+{
+	return 0;
+}
+
+static inline void fimc_hw_clear_irq(struct fimc_dev *dev)
+{
+	u32 cfg = readl(dev->regs + S5P_CIGCTRL);
+	cfg |= S5P_CIGCTRL_IRQ_CLR;
+	writel(cfg, dev->regs + S5P_CIGCTRL);
+}
+
+static inline void fimc_hw_start_scaler(struct fimc_dev *dev)
+{
+	u32 cfg = readl(dev->regs + S5P_CISCCTRL);
+	cfg |= S5P_CISCCTRL_SCALERSTART;
+	writel(cfg, dev->regs + S5P_CISCCTRL);
+}
+
+static inline void fimc_hw_stop_scaler(struct fimc_dev *dev)
+{
+	u32 cfg = readl(dev->regs + S5P_CISCCTRL);
+	cfg &= ~S5P_CISCCTRL_SCALERSTART;
+	writel(cfg, dev->regs + S5P_CISCCTRL);
+}
+
+static inline void fimc_hw_dis_capture(struct fimc_dev *dev)
+{
+	u32 cfg = readl(dev->regs + S5P_CIIMGCPT);
+	cfg &= ~(S5P_CIIMGCPT_IMGCPTEN | S5P_CIIMGCPT_IMGCPTEN_SC);
+	writel(cfg, dev->regs + S5P_CIIMGCPT);
+}
+
+static inline void fimc_hw_start_in_dma(struct fimc_dev *dev)
+{
+	u32 cfg = readl(dev->regs + S5P_MSCTRL);
+	cfg |= S5P_MSCTRL_ENVID;
+	writel(cfg, dev->regs + S5P_MSCTRL);
+}
+
+static inline void fimc_hw_stop_in_dma(struct fimc_dev *dev)
+{
+	u32 cfg = readl(dev->regs + S5P_MSCTRL);
+	cfg &= ~S5P_MSCTRL_ENVID;
+	writel(cfg, dev->regs + S5P_MSCTRL);
+}
+
+/* -----------------------------------------------------*/
+/* fimc-reg.c						*/
+void fimc_hw_reset(struct fimc_dev *dev);
+void fimc_hw_set_rotation(struct fimc_ctx *ctx);
+void fimc_hw_set_target_format(struct fimc_ctx *ctx);
+void fimc_hw_set_out_dma(struct fimc_ctx *ctx);
+void fimc_hw_en_lastirq(struct fimc_dev *dev, int enable);
+void fimc_hw_en_irq(struct fimc_dev *dev, int enable);
+void fimc_hw_set_prescaler(struct fimc_ctx *ctx);
+void fimc_hw_set_scaler(struct fimc_ctx *ctx);
+void fimc_hw_en_capture(struct fimc_ctx *ctx);
+void fimc_hw_set_effect(struct fimc_ctx *ctx);
+void fimc_hw_set_in_dma(struct fimc_ctx *ctx);
+void fimc_hw_set_input_path(struct fimc_ctx *ctx);
+void fimc_hw_set_output_path(struct fimc_ctx *ctx);
+void fimc_hw_set_input_addr(struct fimc_dev *dev, struct fimc_addr *paddr);
+void fimc_hw_set_output_addr(struct fimc_dev *dev, struct fimc_addr *paddr);
+
+#endif /* FIMC_CORE_H_ */
diff --git a/drivers/media/video/s5p-fimc/fimc-reg.c b/drivers/media/video/s5p-fimc/fimc-reg.c
new file mode 100644
index 000000000000..5570f1ce0c9c
--- /dev/null
+++ b/drivers/media/video/s5p-fimc/fimc-reg.c
@@ -0,0 +1,527 @@
+/*
+ * Register interface file for Samsung Camera Interface (FIMC) driver
+ *
+ * Copyright (c) 2010 Samsung Electronics
+ *
+ * Sylwester Nawrocki, s.nawrocki@samsung.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <mach/map.h>
+
+#include "fimc-core.h"
+
+
+void fimc_hw_reset(struct fimc_dev *dev)
+{
+	u32 cfg;
+
+	cfg = readl(dev->regs + S5P_CISRCFMT);
+	cfg |= S5P_CISRCFMT_ITU601_8BIT;
+	writel(cfg, dev->regs + S5P_CISRCFMT);
+
+	/* Software reset. */
+	cfg = readl(dev->regs + S5P_CIGCTRL);
+	cfg |= (S5P_CIGCTRL_SWRST | S5P_CIGCTRL_IRQ_LEVEL);
+	writel(cfg, dev->regs + S5P_CIGCTRL);
+	msleep(1);
+
+	cfg = readl(dev->regs + S5P_CIGCTRL);
+	cfg &= ~S5P_CIGCTRL_SWRST;
+	writel(cfg, dev->regs + S5P_CIGCTRL);
+
+}
+
+void fimc_hw_set_rotation(struct fimc_ctx *ctx)
+{
+	u32 cfg, flip;
+	struct fimc_dev *dev = ctx->fimc_dev;
+
+	cfg = readl(dev->regs + S5P_CITRGFMT);
+	cfg &= ~(S5P_CITRGFMT_INROT90 | S5P_CITRGFMT_OUTROT90);
+
+	flip = readl(dev->regs + S5P_MSCTRL);
+	flip &= ~S5P_MSCTRL_FLIP_MASK;
+
+	/*
+	 * The input and output rotator cannot work simultaneously.
+	 * Use the output rotator in output DMA mode or the input rotator
+	 * in direct fifo output mode.
+	 */
+	if (ctx->rotation == 90 || ctx->rotation == 270) {
+		if (ctx->out_path == FIMC_LCDFIFO) {
+			cfg |= S5P_CITRGFMT_INROT90;
+			if (ctx->rotation == 270)
+				flip |= S5P_MSCTRL_FLIP_180;
+		} else {
+			cfg |= S5P_CITRGFMT_OUTROT90;
+			if (ctx->rotation == 270)
+				cfg |= S5P_CITRGFMT_FLIP_180;
+		}
+	} else if (ctx->rotation == 180) {
+		if (ctx->out_path == FIMC_LCDFIFO)
+			flip |= S5P_MSCTRL_FLIP_180;
+		else
+			cfg |= S5P_CITRGFMT_FLIP_180;
+	}
+	if (ctx->rotation == 180 || ctx->rotation == 270)
+		writel(flip, dev->regs + S5P_MSCTRL);
+	writel(cfg, dev->regs + S5P_CITRGFMT);
+}
+
+static u32 fimc_hw_get_in_flip(u32 ctx_flip)
+{
+	u32 flip = S5P_MSCTRL_FLIP_NORMAL;
+
+	switch (ctx_flip) {
+	case FLIP_X_AXIS:
+		flip = S5P_MSCTRL_FLIP_X_MIRROR;
+		break;
+	case FLIP_Y_AXIS:
+		flip = S5P_MSCTRL_FLIP_Y_MIRROR;
+		break;
+	case FLIP_XY_AXIS:
+		flip = S5P_MSCTRL_FLIP_180;
+		break;
+	}
+
+	return flip;
+}
+
+static u32 fimc_hw_get_target_flip(u32 ctx_flip)
+{
+	u32 flip = S5P_CITRGFMT_FLIP_NORMAL;
+
+	switch (ctx_flip) {
+	case FLIP_X_AXIS:
+		flip = S5P_CITRGFMT_FLIP_X_MIRROR;
+		break;
+	case FLIP_Y_AXIS:
+		flip = S5P_CITRGFMT_FLIP_Y_MIRROR;
+		break;
+	case FLIP_XY_AXIS:
+		flip = S5P_CITRGFMT_FLIP_180;
+		break;
+	case FLIP_NONE:
+		break;
+
+	}
+	return flip;
+}
+
+void fimc_hw_set_target_format(struct fimc_ctx *ctx)
+{
+	u32 cfg;
+	struct fimc_dev *dev = ctx->fimc_dev;
+	struct fimc_frame *frame = &ctx->d_frame;
+
+	dbg("w= %d, h= %d color: %d", frame->width,
+		frame->height, frame->fmt->color);
+
+	cfg = readl(dev->regs + S5P_CITRGFMT);
+	cfg &= ~(S5P_CITRGFMT_FMT_MASK | S5P_CITRGFMT_HSIZE_MASK |
+		  S5P_CITRGFMT_VSIZE_MASK);
+
+	switch (frame->fmt->color) {
+	case S5P_FIMC_RGB565:
+	case S5P_FIMC_RGB666:
+	case S5P_FIMC_RGB888:
+		cfg |= S5P_CITRGFMT_RGB;
+		break;
+	case S5P_FIMC_YCBCR420:
+		cfg |= S5P_CITRGFMT_YCBCR420;
+		break;
+	case S5P_FIMC_YCBYCR422:
+	case S5P_FIMC_YCRYCB422:
+	case S5P_FIMC_CBYCRY422:
+	case S5P_FIMC_CRYCBY422:
+		if (frame->fmt->planes_cnt == 1)
+			cfg |= S5P_CITRGFMT_YCBCR422_1P;
+		else
+			cfg |= S5P_CITRGFMT_YCBCR422;
+		break;
+	default:
+		break;
+	}
+
+	cfg |= S5P_CITRGFMT_HSIZE(frame->width);
+	cfg |= S5P_CITRGFMT_VSIZE(frame->height);
+
+	if (ctx->rotation == 0) {
+		cfg &= ~S5P_CITRGFMT_FLIP_MASK;
+		cfg |= fimc_hw_get_target_flip(ctx->flip);
+	}
+	writel(cfg, dev->regs + S5P_CITRGFMT);
+
+	cfg = readl(dev->regs + S5P_CITAREA) & ~S5P_CITAREA_MASK;
+	cfg |= (frame->width * frame->height);
+	writel(cfg, dev->regs + S5P_CITAREA);
+}
+
+static void fimc_hw_set_out_dma_size(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev = ctx->fimc_dev;
+	struct fimc_frame *frame = &ctx->d_frame;
+	u32 cfg = 0;
+
+	if (ctx->rotation == 90 || ctx->rotation == 270) {
+		cfg |= S5P_ORIG_SIZE_HOR(frame->f_height);
+		cfg |= S5P_ORIG_SIZE_VER(frame->f_width);
+	} else {
+		cfg |= S5P_ORIG_SIZE_HOR(frame->f_width);
+		cfg |= S5P_ORIG_SIZE_VER(frame->f_height);
+	}
+	writel(cfg, dev->regs + S5P_ORGOSIZE);
+}
+
+void fimc_hw_set_out_dma(struct fimc_ctx *ctx)
+{
+	u32 cfg;
+	struct fimc_dev *dev = ctx->fimc_dev;
+	struct fimc_frame *frame = &ctx->d_frame;
+	struct fimc_dma_offset *offset = &frame->dma_offset;
+
+	/* Set the input dma offsets. */
+	cfg = 0;
+	cfg |= S5P_CIO_OFFS_HOR(offset->y_h);
+	cfg |= S5P_CIO_OFFS_VER(offset->y_v);
+	writel(cfg, dev->regs + S5P_CIOYOFF);
+
+	cfg = 0;
+	cfg |= S5P_CIO_OFFS_HOR(offset->cb_h);
+	cfg |= S5P_CIO_OFFS_VER(offset->cb_v);
+	writel(cfg, dev->regs + S5P_CIOCBOFF);
+
+	cfg = 0;
+	cfg |= S5P_CIO_OFFS_HOR(offset->cr_h);
+	cfg |= S5P_CIO_OFFS_VER(offset->cr_v);
+	writel(cfg, dev->regs + S5P_CIOCROFF);
+
+	fimc_hw_set_out_dma_size(ctx);
+
+	/* Configure chroma components order. */
+	cfg = readl(dev->regs + S5P_CIOCTRL);
+
+	cfg &= ~(S5P_CIOCTRL_ORDER2P_MASK | S5P_CIOCTRL_ORDER422_MASK |
+		 S5P_CIOCTRL_YCBCR_PLANE_MASK);
+
+	if (frame->fmt->planes_cnt == 1)
+		cfg |= ctx->out_order_1p;
+	else if (frame->fmt->planes_cnt == 2)
+		cfg |= ctx->out_order_2p | S5P_CIOCTRL_YCBCR_2PLANE;
+	else if (frame->fmt->planes_cnt == 3)
+		cfg |= S5P_CIOCTRL_YCBCR_3PLANE;
+
+	writel(cfg, dev->regs + S5P_CIOCTRL);
+}
+
+static void fimc_hw_en_autoload(struct fimc_dev *dev, int enable)
+{
+	u32 cfg = readl(dev->regs + S5P_ORGISIZE);
+	if (enable)
+		cfg |= S5P_CIREAL_ISIZE_AUTOLOAD_EN;
+	else
+		cfg &= ~S5P_CIREAL_ISIZE_AUTOLOAD_EN;
+	writel(cfg, dev->regs + S5P_ORGISIZE);
+}
+
+void fimc_hw_en_lastirq(struct fimc_dev *dev, int enable)
+{
+	unsigned long flags;
+	u32 cfg;
+
+	spin_lock_irqsave(&dev->slock, flags);
+
+	cfg = readl(dev->regs + S5P_CIOCTRL);
+	if (enable)
+		cfg |= S5P_CIOCTRL_LASTIRQ_ENABLE;
+	else
+		cfg &= ~S5P_CIOCTRL_LASTIRQ_ENABLE;
+	writel(cfg, dev->regs + S5P_CIOCTRL);
+
+	spin_unlock_irqrestore(&dev->slock, flags);
+}
+
+void fimc_hw_set_prescaler(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev =  ctx->fimc_dev;
+	struct fimc_scaler *sc = &ctx->scaler;
+	u32 cfg = 0, shfactor;
+
+	shfactor = 10 - (sc->hfactor + sc->vfactor);
+
+	cfg |= S5P_CISCPRERATIO_SHFACTOR(shfactor);
+	cfg |= S5P_CISCPRERATIO_HOR(sc->pre_hratio);
+	cfg |= S5P_CISCPRERATIO_VER(sc->pre_vratio);
+	writel(cfg, dev->regs + S5P_CISCPRERATIO);
+
+	cfg = 0;
+	cfg |= S5P_CISCPREDST_WIDTH(sc->pre_dst_width);
+	cfg |= S5P_CISCPREDST_HEIGHT(sc->pre_dst_height);
+	writel(cfg, dev->regs + S5P_CISCPREDST);
+}
+
+void fimc_hw_set_scaler(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev = ctx->fimc_dev;
+	struct fimc_scaler *sc = &ctx->scaler;
+	struct fimc_frame *src_frame = &ctx->s_frame;
+	struct fimc_frame *dst_frame = &ctx->d_frame;
+	u32 cfg = 0;
+
+	if (!(ctx->flags & FIMC_COLOR_RANGE_NARROW))
+		cfg |= (S5P_CISCCTRL_CSCR2Y_WIDE | S5P_CISCCTRL_CSCY2R_WIDE);
+
+	if (!sc->enabled)
+		cfg |= S5P_CISCCTRL_SCALERBYPASS;
+
+	if (sc->scaleup_h)
+		cfg |= S5P_CISCCTRL_SCALEUP_H;
+
+	if (sc->scaleup_v)
+		cfg |= S5P_CISCCTRL_SCALEUP_V;
+
+	if (sc->copy_mode)
+		cfg |= S5P_CISCCTRL_ONE2ONE;
+
+
+	if (ctx->in_path == FIMC_DMA) {
+		if (src_frame->fmt->color == S5P_FIMC_RGB565)
+			cfg |= S5P_CISCCTRL_INRGB_FMT_RGB565;
+		else if (src_frame->fmt->color == S5P_FIMC_RGB666)
+			cfg |= S5P_CISCCTRL_INRGB_FMT_RGB666;
+		else if (src_frame->fmt->color == S5P_FIMC_RGB888)
+			cfg |= S5P_CISCCTRL_INRGB_FMT_RGB888;
+	}
+
+	if (ctx->out_path == FIMC_DMA) {
+		if (dst_frame->fmt->color == S5P_FIMC_RGB565)
+			cfg |= S5P_CISCCTRL_OUTRGB_FMT_RGB565;
+		else if (dst_frame->fmt->color == S5P_FIMC_RGB666)
+			cfg |= S5P_CISCCTRL_OUTRGB_FMT_RGB666;
+		else if (dst_frame->fmt->color == S5P_FIMC_RGB888)
+			cfg |= S5P_CISCCTRL_OUTRGB_FMT_RGB888;
+	} else {
+		cfg |= S5P_CISCCTRL_OUTRGB_FMT_RGB888;
+
+		if (ctx->flags & FIMC_SCAN_MODE_INTERLACED)
+			cfg |= S5P_CISCCTRL_INTERLACE;
+	}
+
+	dbg("main_hratio= 0x%X  main_vratio= 0x%X",
+		sc->main_hratio, sc->main_vratio);
+
+	cfg |= S5P_CISCCTRL_SC_HORRATIO(sc->main_hratio);
+	cfg |= S5P_CISCCTRL_SC_VERRATIO(sc->main_vratio);
+
+	writel(cfg, dev->regs + S5P_CISCCTRL);
+}
+
+void fimc_hw_en_capture(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev = ctx->fimc_dev;
+	u32 cfg;
+
+	cfg = readl(dev->regs + S5P_CIIMGCPT);
+	/* One shot mode for output DMA or freerun for FIFO. */
+	if (ctx->out_path == FIMC_DMA)
+		cfg |= S5P_CIIMGCPT_CPT_FREN_ENABLE;
+	else
+		cfg &= ~S5P_CIIMGCPT_CPT_FREN_ENABLE;
+
+	if (ctx->scaler.enabled)
+		cfg |= S5P_CIIMGCPT_IMGCPTEN_SC;
+
+	writel(cfg | S5P_CIIMGCPT_IMGCPTEN, dev->regs + S5P_CIIMGCPT);
+}
+
+void fimc_hw_set_effect(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev = ctx->fimc_dev;
+	struct fimc_effect *effect = &ctx->effect;
+	u32 cfg = (S5P_CIIMGEFF_IE_ENABLE | S5P_CIIMGEFF_IE_SC_AFTER);
+
+	cfg |= effect->type;
+
+	if (effect->type == S5P_FIMC_EFFECT_ARBITRARY) {
+		cfg |= S5P_CIIMGEFF_PAT_CB(effect->pat_cb);
+		cfg |= S5P_CIIMGEFF_PAT_CR(effect->pat_cr);
+	}
+
+	writel(cfg, dev->regs + S5P_CIIMGEFF);
+}
+
+static void fimc_hw_set_in_dma_size(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev = ctx->fimc_dev;
+	struct fimc_frame *frame = &ctx->s_frame;
+	u32 cfg_o = 0;
+	u32 cfg_r = 0;
+
+	if (FIMC_LCDFIFO == ctx->out_path)
+		cfg_r |=  S5P_CIREAL_ISIZE_AUTOLOAD_EN;
+
+	cfg_o |= S5P_ORIG_SIZE_HOR(frame->f_width);
+	cfg_o |= S5P_ORIG_SIZE_VER(frame->f_height);
+	cfg_r |= S5P_CIREAL_ISIZE_WIDTH(frame->width);
+	cfg_r |= S5P_CIREAL_ISIZE_HEIGHT(frame->height);
+
+	writel(cfg_o, dev->regs + S5P_ORGISIZE);
+	writel(cfg_r, dev->regs + S5P_CIREAL_ISIZE);
+}
+
+void fimc_hw_set_in_dma(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev = ctx->fimc_dev;
+	struct fimc_frame *frame = &ctx->s_frame;
+	struct fimc_dma_offset *offset = &frame->dma_offset;
+	u32 cfg = 0;
+
+	/* Set the pixel offsets. */
+	cfg |= S5P_CIO_OFFS_HOR(offset->y_h);
+	cfg |= S5P_CIO_OFFS_VER(offset->y_v);
+	writel(cfg, dev->regs + S5P_CIIYOFF);
+
+	cfg = 0;
+	cfg |= S5P_CIO_OFFS_HOR(offset->cb_h);
+	cfg |= S5P_CIO_OFFS_VER(offset->cb_v);
+	writel(cfg, dev->regs + S5P_CIICBOFF);
+
+	cfg = 0;
+	cfg |= S5P_CIO_OFFS_HOR(offset->cr_h);
+	cfg |= S5P_CIO_OFFS_VER(offset->cr_v);
+	writel(cfg, dev->regs + S5P_CIICROFF);
+
+	/* Input original and real size. */
+	fimc_hw_set_in_dma_size(ctx);
+
+	/* Autoload is used currently only in FIFO mode. */
+	fimc_hw_en_autoload(dev, ctx->out_path == FIMC_LCDFIFO);
+
+	/* Set the input DMA to process single frame only. */
+	cfg = readl(dev->regs + S5P_MSCTRL);
+	cfg &= ~(S5P_MSCTRL_FLIP_MASK
+		| S5P_MSCTRL_INFORMAT_MASK
+		| S5P_MSCTRL_IN_BURST_COUNT_MASK
+		| S5P_MSCTRL_INPUT_MASK
+		| S5P_MSCTRL_C_INT_IN_MASK
+		| S5P_MSCTRL_2P_IN_ORDER_MASK);
+
+	cfg |= (S5P_MSCTRL_FRAME_COUNT(1) | S5P_MSCTRL_INPUT_MEMORY);
+
+	switch (frame->fmt->color) {
+	case S5P_FIMC_RGB565:
+	case S5P_FIMC_RGB666:
+	case S5P_FIMC_RGB888:
+		cfg |= S5P_MSCTRL_INFORMAT_RGB;
+		break;
+	case S5P_FIMC_YCBCR420:
+		cfg |= S5P_MSCTRL_INFORMAT_YCBCR420;
+
+		if (frame->fmt->planes_cnt == 2)
+			cfg |= ctx->in_order_2p | S5P_MSCTRL_C_INT_IN_2PLANE;
+		else
+			cfg |= S5P_MSCTRL_C_INT_IN_3PLANE;
+
+		break;
+	case S5P_FIMC_YCBYCR422:
+	case S5P_FIMC_YCRYCB422:
+	case S5P_FIMC_CBYCRY422:
+	case S5P_FIMC_CRYCBY422:
+		if (frame->fmt->planes_cnt == 1) {
+			cfg |= ctx->in_order_1p
+				| S5P_MSCTRL_INFORMAT_YCBCR422_1P;
+		} else {
+			cfg |= S5P_MSCTRL_INFORMAT_YCBCR422;
+
+			if (frame->fmt->planes_cnt == 2)
+				cfg |= ctx->in_order_2p
+					| S5P_MSCTRL_C_INT_IN_2PLANE;
+			else
+				cfg |= S5P_MSCTRL_C_INT_IN_3PLANE;
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Input DMA flip mode (and rotation).
+	 * Do not allow simultaneous rotation and flipping.
+	 */
+	if (!ctx->rotation && ctx->out_path == FIMC_LCDFIFO)
+		cfg |= fimc_hw_get_in_flip(ctx->flip);
+
+	writel(cfg, dev->regs + S5P_MSCTRL);
+
+	/* Input/output DMA linear/tiled mode. */
+	cfg = readl(dev->regs + S5P_CIDMAPARAM);
+	cfg &= ~S5P_CIDMAPARAM_TILE_MASK;
+
+	if (tiled_fmt(ctx->s_frame.fmt))
+		cfg |= S5P_CIDMAPARAM_R_64X32;
+
+	if (tiled_fmt(ctx->d_frame.fmt))
+		cfg |= S5P_CIDMAPARAM_W_64X32;
+
+	writel(cfg, dev->regs + S5P_CIDMAPARAM);
+}
+
+
+void fimc_hw_set_input_path(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev = ctx->fimc_dev;
+
+	u32 cfg = readl(dev->regs + S5P_MSCTRL);
+	cfg &= ~S5P_MSCTRL_INPUT_MASK;
+
+	if (ctx->in_path == FIMC_DMA)
+		cfg |= S5P_MSCTRL_INPUT_MEMORY;
+	else
+		cfg |= S5P_MSCTRL_INPUT_EXTCAM;
+
+	writel(cfg, dev->regs + S5P_MSCTRL);
+}
+
+void fimc_hw_set_output_path(struct fimc_ctx *ctx)
+{
+	struct fimc_dev *dev = ctx->fimc_dev;
+
+	u32 cfg = readl(dev->regs + S5P_CISCCTRL);
+	cfg &= ~S5P_CISCCTRL_LCDPATHEN_FIFO;
+	if (ctx->out_path == FIMC_LCDFIFO)
+		cfg |= S5P_CISCCTRL_LCDPATHEN_FIFO;
+	writel(cfg, dev->regs + S5P_CISCCTRL);
+}
+
+void fimc_hw_set_input_addr(struct fimc_dev *dev, struct fimc_addr *paddr)
+{
+	u32 cfg = 0;
+
+	cfg = readl(dev->regs + S5P_CIREAL_ISIZE);
+	cfg |= S5P_CIREAL_ISIZE_ADDR_CH_DIS;
+	writel(cfg, dev->regs + S5P_CIREAL_ISIZE);
+
+	writel(paddr->y, dev->regs + S5P_CIIYSA0);
+	writel(paddr->cb, dev->regs + S5P_CIICBSA0);
+	writel(paddr->cr, dev->regs + S5P_CIICRSA0);
+
+	cfg &= ~S5P_CIREAL_ISIZE_ADDR_CH_DIS;
+	writel(cfg, dev->regs + S5P_CIREAL_ISIZE);
+}
+
+void fimc_hw_set_output_addr(struct fimc_dev *dev, struct fimc_addr *paddr)
+{
+	int i;
+	/* Set all the output register sets to point to single video buffer. */
+	for (i = 0; i < FIMC_MAX_OUT_BUFS; i++) {
+		writel(paddr->y, dev->regs + S5P_CIOYSA(i));
+		writel(paddr->cb, dev->regs + S5P_CIOCBSA(i));
+		writel(paddr->cr, dev->regs + S5P_CIOCRSA(i));
+	}
+}
diff --git a/drivers/media/video/s5p-fimc/regs-fimc.h b/drivers/media/video/s5p-fimc/regs-fimc.h
new file mode 100644
index 000000000000..a3cfe824db00
--- /dev/null
+++ b/drivers/media/video/s5p-fimc/regs-fimc.h
@@ -0,0 +1,293 @@
+/*
+ * Register definition file for Samsung Camera Interface (FIMC) driver
+ *
+ * Copyright (c) 2010 Samsung Electronics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef REGS_FIMC_H_
+#define REGS_FIMC_H_
+
+#define S5P_CIOYSA(__x)			(0x18 + (__x) * 4)
+#define S5P_CIOCBSA(__x)		(0x28 + (__x) * 4)
+#define S5P_CIOCRSA(__x)		(0x38 + (__x) * 4)
+
+/* Input source format */
+#define S5P_CISRCFMT			0x00
+#define S5P_CISRCFMT_ITU601_8BIT	(1 << 31)
+#define S5P_CISRCFMT_ITU601_16BIT	(1 << 29)
+#define S5P_CISRCFMT_ORDER422_YCBYCR	(0 << 14)
+#define S5P_CISRCFMT_ORDER422_YCRYCB	(1 << 14)
+#define S5P_CISRCFMT_ORDER422_CBYCRY	(2 << 14)
+#define S5P_CISRCFMT_ORDER422_CRYCBY	(3 << 14)
+#define S5P_CISRCFMT_HSIZE(x)		((x) << 16)
+#define S5P_CISRCFMT_VSIZE(x)		((x) << 0)
+
+/* Window offset */
+#define S5P_CIWDOFST			0x04
+#define S5P_CIWDOFST_WINOFSEN		(1 << 31)
+#define S5P_CIWDOFST_CLROVFIY		(1 << 30)
+#define S5P_CIWDOFST_CLROVRLB		(1 << 29)
+#define S5P_CIWDOFST_WINHOROFST_MASK	(0x7ff << 16)
+#define S5P_CIWDOFST_CLROVFICB		(1 << 15)
+#define S5P_CIWDOFST_CLROVFICR		(1 << 14)
+#define S5P_CIWDOFST_WINHOROFST(x)	((x) << 16)
+#define S5P_CIWDOFST_WINVEROFST(x)	((x) << 0)
+#define S5P_CIWDOFST_WINVEROFST_MASK	(0xfff << 0)
+
+/* Global control */
+#define S5P_CIGCTRL			0x08
+#define S5P_CIGCTRL_SWRST		(1 << 31)
+#define S5P_CIGCTRL_CAMRST_A		(1 << 30)
+#define S5P_CIGCTRL_SELCAM_ITU_A	(1 << 29)
+#define S5P_CIGCTRL_SELCAM_ITU_MASK	(1 << 29)
+#define S5P_CIGCTRL_TESTPAT_NORMAL	(0 << 27)
+#define S5P_CIGCTRL_TESTPAT_COLOR_BAR	(1 << 27)
+#define S5P_CIGCTRL_TESTPAT_HOR_INC	(2 << 27)
+#define S5P_CIGCTRL_TESTPAT_VER_INC	(3 << 27)
+#define S5P_CIGCTRL_TESTPAT_MASK	(3 << 27)
+#define S5P_CIGCTRL_TESTPAT_SHIFT	(27)
+#define S5P_CIGCTRL_INVPOLPCLK		(1 << 26)
+#define S5P_CIGCTRL_INVPOLVSYNC		(1 << 25)
+#define S5P_CIGCTRL_INVPOLHREF		(1 << 24)
+#define S5P_CIGCTRL_IRQ_OVFEN		(1 << 22)
+#define S5P_CIGCTRL_HREF_MASK		(1 << 21)
+#define S5P_CIGCTRL_IRQ_LEVEL		(1 << 20)
+#define S5P_CIGCTRL_IRQ_CLR		(1 << 19)
+#define S5P_CIGCTRL_IRQ_ENABLE		(1 << 16)
+#define S5P_CIGCTRL_SHDW_DISABLE	(1 << 12)
+#define S5P_CIGCTRL_SELCAM_MIPI_A	(1 << 7)
+#define S5P_CIGCTRL_CAMIF_SELWB		(1 << 6)
+#define S5P_CIGCTRL_INVPOLHSYNC		(1 << 4)
+#define S5P_CIGCTRL_SELCAM_MIPI		(1 << 3)
+#define S5P_CIGCTRL_INTERLACE		(1 << 0)
+
+/* Window offset 2 */
+#define S5P_CIWDOFST2			0x14
+#define S5P_CIWDOFST2_HOROFF_MASK	(0xfff << 16)
+#define S5P_CIWDOFST2_VEROFF_MASK	(0xfff << 0)
+#define S5P_CIWDOFST2_HOROFF(x)		((x) << 16)
+#define S5P_CIWDOFST2_VEROFF(x)		((x) << 0)
+
+/* Output DMA Y plane start address */
+#define S5P_CIOYSA1			0x18
+#define S5P_CIOYSA2			0x1c
+#define S5P_CIOYSA3			0x20
+#define S5P_CIOYSA4			0x24
+
+/* Output DMA Cb plane start address */
+#define S5P_CIOCBSA1			0x28
+#define S5P_CIOCBSA2			0x2c
+#define S5P_CIOCBSA3			0x30
+#define S5P_CIOCBSA4			0x34
+
+/* Output DMA Cr plane start address */
+#define S5P_CIOCRSA1			0x38
+#define S5P_CIOCRSA2			0x3c
+#define S5P_CIOCRSA3			0x40
+#define S5P_CIOCRSA4			0x44
+
+/* Target image format */
+#define S5P_CITRGFMT			0x48
+#define S5P_CITRGFMT_INROT90		(1 << 31)
+#define S5P_CITRGFMT_YCBCR420		(0 << 29)
+#define S5P_CITRGFMT_YCBCR422		(1 << 29)
+#define S5P_CITRGFMT_YCBCR422_1P	(2 << 29)
+#define S5P_CITRGFMT_RGB		(3 << 29)
+#define S5P_CITRGFMT_FMT_MASK		(3 << 29)
+#define S5P_CITRGFMT_HSIZE_MASK		(0xfff << 16)
+#define S5P_CITRGFMT_FLIP_SHIFT		(14)
+#define S5P_CITRGFMT_FLIP_NORMAL	(0 << 14)
+#define S5P_CITRGFMT_FLIP_X_MIRROR	(1 << 14)
+#define S5P_CITRGFMT_FLIP_Y_MIRROR	(2 << 14)
+#define S5P_CITRGFMT_FLIP_180		(3 << 14)
+#define S5P_CITRGFMT_FLIP_MASK		(3 << 14)
+#define S5P_CITRGFMT_OUTROT90		(1 << 13)
+#define S5P_CITRGFMT_VSIZE_MASK		(0xfff << 0)
+#define S5P_CITRGFMT_HSIZE(x)		((x) << 16)
+#define S5P_CITRGFMT_VSIZE(x)		((x) << 0)
+
+/* Output DMA control */
+#define S5P_CIOCTRL			0x4c
+#define S5P_CIOCTRL_ORDER422_MASK	(3 << 0)
+#define S5P_CIOCTRL_ORDER422_CRYCBY	(0 << 0)
+#define S5P_CIOCTRL_ORDER422_YCRYCB	(1 << 0)
+#define S5P_CIOCTRL_ORDER422_CBYCRY	(2 << 0)
+#define S5P_CIOCTRL_ORDER422_YCBYCR	(3 << 0)
+#define S5P_CIOCTRL_LASTIRQ_ENABLE	(1 << 2)
+#define S5P_CIOCTRL_YCBCR_3PLANE	(0 << 3)
+#define S5P_CIOCTRL_YCBCR_2PLANE	(1 << 3)
+#define S5P_CIOCTRL_YCBCR_PLANE_MASK	(1 << 3)
+#define S5P_CIOCTRL_ORDER2P_SHIFT	(24)
+#define S5P_CIOCTRL_ORDER2P_MASK	(3 << 24)
+#define S5P_CIOCTRL_ORDER422_2P_LSB_CRCB (0 << 24)
+
+/* Pre-scaler control 1 */
+#define S5P_CISCPRERATIO		0x50
+#define S5P_CISCPRERATIO_SHFACTOR(x)	((x) << 28)
+#define S5P_CISCPRERATIO_HOR(x)		((x) << 16)
+#define S5P_CISCPRERATIO_VER(x)		((x) << 0)
+
+#define S5P_CISCPREDST			0x54
+#define S5P_CISCPREDST_WIDTH(x)		((x) << 16)
+#define S5P_CISCPREDST_HEIGHT(x)	((x) << 0)
+
+/* Main scaler control */
+#define S5P_CISCCTRL			0x58
+#define S5P_CISCCTRL_SCALERBYPASS	(1 << 31)
+#define S5P_CISCCTRL_SCALEUP_H		(1 << 30)
+#define S5P_CISCCTRL_SCALEUP_V		(1 << 29)
+#define S5P_CISCCTRL_CSCR2Y_WIDE	(1 << 28)
+#define S5P_CISCCTRL_CSCY2R_WIDE	(1 << 27)
+#define S5P_CISCCTRL_LCDPATHEN_FIFO	(1 << 26)
+#define S5P_CISCCTRL_INTERLACE		(1 << 25)
+#define S5P_CISCCTRL_SCALERSTART	(1 << 15)
+#define S5P_CISCCTRL_INRGB_FMT_RGB565	(0 << 13)
+#define S5P_CISCCTRL_INRGB_FMT_RGB666	(1 << 13)
+#define S5P_CISCCTRL_INRGB_FMT_RGB888	(2 << 13)
+#define S5P_CISCCTRL_INRGB_FMT_MASK	(3 << 13)
+#define S5P_CISCCTRL_OUTRGB_FMT_RGB565	(0 << 11)
+#define S5P_CISCCTRL_OUTRGB_FMT_RGB666	(1 << 11)
+#define S5P_CISCCTRL_OUTRGB_FMT_RGB888	(2 << 11)
+#define S5P_CISCCTRL_OUTRGB_FMT_MASK	(3 << 11)
+#define S5P_CISCCTRL_RGB_EXT		(1 << 10)
+#define S5P_CISCCTRL_ONE2ONE		(1 << 9)
+#define S5P_CISCCTRL_SC_HORRATIO(x)	((x) << 16)
+#define S5P_CISCCTRL_SC_VERRATIO(x)	((x) << 0)
+
+/* Target area */
+#define S5P_CITAREA			0x5c
+#define S5P_CITAREA_MASK		0x0fffffff
+
+/* General status */
+#define S5P_CISTATUS			0x64
+#define S5P_CISTATUS_OVFIY		(1 << 31)
+#define S5P_CISTATUS_OVFICB		(1 << 30)
+#define S5P_CISTATUS_OVFICR		(1 << 29)
+#define S5P_CISTATUS_VSYNC		(1 << 28)
+#define S5P_CISTATUS_WINOFF_EN		(1 << 25)
+#define S5P_CISTATUS_IMGCPT_EN		(1 << 22)
+#define S5P_CISTATUS_IMGCPT_SCEN	(1 << 21)
+#define S5P_CISTATUS_VSYNC_A		(1 << 20)
+#define S5P_CISTATUS_VSYNC_B		(1 << 19)
+#define S5P_CISTATUS_OVRLB		(1 << 18)
+#define S5P_CISTATUS_FRAME_END		(1 << 17)
+#define S5P_CISTATUS_LASTCAPT_END	(1 << 16)
+#define S5P_CISTATUS_VVALID_A		(1 << 15)
+#define S5P_CISTATUS_VVALID_B		(1 << 14)
+
+/* Image capture control */
+#define S5P_CIIMGCPT			0xc0
+#define S5P_CIIMGCPT_IMGCPTEN		(1 << 31)
+#define S5P_CIIMGCPT_IMGCPTEN_SC	(1 << 30)
+#define S5P_CIIMGCPT_CPT_FREN_ENABLE	(1 << 25)
+#define S5P_CIIMGCPT_CPT_FRMOD_CNT	(1 << 18)
+
+/* Frame capture sequence */
+#define S5P_CICPTSEQ			0xc4
+
+/* Image effect */
+#define S5P_CIIMGEFF			0xd0
+#define S5P_CIIMGEFF_IE_DISABLE		(0 << 30)
+#define S5P_CIIMGEFF_IE_ENABLE		(1 << 30)
+#define S5P_CIIMGEFF_IE_SC_BEFORE	(0 << 29)
+#define S5P_CIIMGEFF_IE_SC_AFTER	(1 << 29)
+#define S5P_CIIMGEFF_FIN_BYPASS		(0 << 26)
+#define S5P_CIIMGEFF_FIN_ARBITRARY	(1 << 26)
+#define S5P_CIIMGEFF_FIN_NEGATIVE	(2 << 26)
+#define S5P_CIIMGEFF_FIN_ARTFREEZE	(3 << 26)
+#define S5P_CIIMGEFF_FIN_EMBOSSING	(4 << 26)
+#define S5P_CIIMGEFF_FIN_SILHOUETTE	(5 << 26)
+#define S5P_CIIMGEFF_FIN_MASK		(7 << 26)
+#define S5P_CIIMGEFF_PAT_CBCR_MASK	((0xff < 13) | (0xff < 0))
+#define S5P_CIIMGEFF_PAT_CB(x)		((x) << 13)
+#define S5P_CIIMGEFF_PAT_CR(x)		((x) << 0)
+
+/* Input DMA Y/Cb/Cr plane start address 0 */
+#define S5P_CIIYSA0			0xd4
+#define S5P_CIICBSA0			0xd8
+#define S5P_CIICRSA0			0xdc
+
+/* Real input DMA image size */
+#define S5P_CIREAL_ISIZE		0xf8
+#define S5P_CIREAL_ISIZE_AUTOLOAD_EN	(1 << 31)
+#define S5P_CIREAL_ISIZE_ADDR_CH_DIS	(1 << 30)
+#define S5P_CIREAL_ISIZE_HEIGHT(x)	((x) << 16)
+#define S5P_CIREAL_ISIZE_WIDTH(x)	((x) << 0)
+
+
+/* Input DMA control */
+#define S5P_MSCTRL			0xfc
+#define S5P_MSCTRL_IN_BURST_COUNT_MASK	(3 << 24)
+#define S5P_MSCTRL_2P_IN_ORDER_MASK	(3 << 16)
+#define S5P_MSCTRL_2P_IN_ORDER_SHIFT	16
+#define S5P_MSCTRL_C_INT_IN_3PLANE	(0 << 15)
+#define S5P_MSCTRL_C_INT_IN_2PLANE	(1 << 15)
+#define S5P_MSCTRL_C_INT_IN_MASK	(1 << 15)
+#define S5P_MSCTRL_FLIP_SHIFT		13
+#define S5P_MSCTRL_FLIP_MASK		(3 << 13)
+#define S5P_MSCTRL_FLIP_NORMAL		(0 << 13)
+#define S5P_MSCTRL_FLIP_X_MIRROR	(1 << 13)
+#define S5P_MSCTRL_FLIP_Y_MIRROR	(2 << 13)
+#define S5P_MSCTRL_FLIP_180		(3 << 13)
+#define S5P_MSCTRL_ORDER422_SHIFT	4
+#define S5P_MSCTRL_ORDER422_CRYCBY	(0 << 4)
+#define S5P_MSCTRL_ORDER422_YCRYCB	(1 << 4)
+#define S5P_MSCTRL_ORDER422_CBYCRY	(2 << 4)
+#define S5P_MSCTRL_ORDER422_YCBYCR	(3 << 4)
+#define S5P_MSCTRL_ORDER422_MASK	(3 << 4)
+#define S5P_MSCTRL_INPUT_EXTCAM		(0 << 3)
+#define S5P_MSCTRL_INPUT_MEMORY		(1 << 3)
+#define S5P_MSCTRL_INPUT_MASK		(1 << 3)
+#define S5P_MSCTRL_INFORMAT_YCBCR420	(0 << 1)
+#define S5P_MSCTRL_INFORMAT_YCBCR422	(1 << 1)
+#define S5P_MSCTRL_INFORMAT_YCBCR422_1P	(2 << 1)
+#define S5P_MSCTRL_INFORMAT_RGB		(3 << 1)
+#define S5P_MSCTRL_INFORMAT_MASK	(3 << 1)
+#define S5P_MSCTRL_ENVID		(1 << 0)
+#define S5P_MSCTRL_FRAME_COUNT(x)	((x) << 24)
+
+/* Input DMA Y/Cb/Cr plane start address 1 */
+#define S5P_CIIYSA1			0x144
+#define S5P_CIICBSA1			0x148
+#define S5P_CIICRSA1			0x14c
+
+/* Output DMA Y/Cb/Cr offset */
+#define S5P_CIOYOFF			0x168
+#define S5P_CIOCBOFF			0x16c
+#define S5P_CIOCROFF			0x170
+
+/* Input DMA Y/Cb/Cr offset */
+#define S5P_CIIYOFF			0x174
+#define S5P_CIICBOFF			0x178
+#define S5P_CIICROFF			0x17c
+
+#define S5P_CIO_OFFS_VER(x)		((x) << 16)
+#define S5P_CIO_OFFS_HOR(x)		((x) << 0)
+
+/* Input DMA original image size */
+#define S5P_ORGISIZE			0x180
+
+/* Output DMA original image size */
+#define S5P_ORGOSIZE			0x184
+
+#define S5P_ORIG_SIZE_VER(x)		((x) << 16)
+#define S5P_ORIG_SIZE_HOR(x)		((x) << 0)
+
+/* Real output DMA image size (extension register) */
+#define S5P_CIEXTEN			0x188
+
+#define S5P_CIDMAPARAM			0x18c
+#define S5P_CIDMAPARAM_R_LINEAR		(0 << 29)
+#define S5P_CIDMAPARAM_R_64X32		(3 << 29)
+#define S5P_CIDMAPARAM_W_LINEAR		(0 << 13)
+#define S5P_CIDMAPARAM_W_64X32		(3 << 13)
+#define S5P_CIDMAPARAM_TILE_MASK	((3 << 29) | (3 << 13))
+
+/* MIPI CSI image format */
+#define S5P_CSIIMGFMT			0x194
+
+#endif /* REGS_FIMC_H_ */
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 047f7e6edb86..61490c6dcdbd 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -277,6 +277,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_RGB565  v4l2_fourcc('R', 'G', 'B', 'P') /* 16  RGB-5-6-5     */
 #define V4L2_PIX_FMT_RGB555X v4l2_fourcc('R', 'G', 'B', 'Q') /* 16  RGB-5-5-5 BE  */
 #define V4L2_PIX_FMT_RGB565X v4l2_fourcc('R', 'G', 'B', 'R') /* 16  RGB-5-6-5 BE  */
+#define V4L2_PIX_FMT_BGR666  v4l2_fourcc('B', 'G', 'R', 'H') /* 18  BGR-6-6-6	  */
 #define V4L2_PIX_FMT_BGR24   v4l2_fourcc('B', 'G', 'R', '3') /* 24  BGR-8-8-8     */
 #define V4L2_PIX_FMT_RGB24   v4l2_fourcc('R', 'G', 'B', '3') /* 24  RGB-8-8-8     */
 #define V4L2_PIX_FMT_BGR32   v4l2_fourcc('B', 'G', 'R', '4') /* 32  BGR-8-8-8-8   */
-- 
cgit v1.2.3-59-g8ed1b


From 0e4f6a791b1e8cfde75a74e2f885642ecb3fe9d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 4 Jul 2010 12:18:57 +0400
Subject: Fix reiserfs_file_release()

a) count file openers correctly; i_count use was completely wrong
b) use new mutex for exclusion between final close/open/truncate,
to protect tailpacking logics.  i_mutex use was wrong and resulted
in deadlocks.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/file.c            | 50 ++++++++++++++++++++++++-------------------
 fs/reiserfs/inode.c           |  2 --
 fs/reiserfs/super.c           |  2 ++
 include/linux/reiserfs_fs_i.h |  4 ++--
 4 files changed, 32 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index b82cdd8a45dd..6846371498b6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -38,20 +38,24 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 
 	BUG_ON(!S_ISREG(inode->i_mode));
 
+        if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
+		return 0;
+
+	mutex_lock(&(REISERFS_I(inode)->tailpack));
+
+        if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
+		return 0;
+	}
+
 	/* fast out for when nothing needs to be done */
-	if ((atomic_read(&inode->i_count) > 1 ||
-	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
+	if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
 	     !tail_has_to_be_packed(inode)) &&
 	    REISERFS_I(inode)->i_prealloc_count <= 0) {
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
 		return 0;
 	}
 
-	mutex_lock(&inode->i_mutex);
-
-	mutex_lock(&(REISERFS_I(inode)->i_mmap));
-	if (REISERFS_I(inode)->i_flags & i_ever_mapped)
-		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
 	reiserfs_write_lock(inode->i_sb);
 	/* freeing preallocation only involves relogging blocks that
 	 * are already in the current transaction.  preallocation gets
@@ -94,9 +98,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 	if (!err)
 		err = jbegin_failure;
 
-	if (!err && atomic_read(&inode->i_count) <= 1 &&
+	if (!err &&
 	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
 	    tail_has_to_be_packed(inode)) {
+
 		/* if regular file is released by last holder and it has been
 		   appended (we append by unformatted node only) or its direct
 		   item(s) had to be converted, then it may have to be
@@ -104,27 +109,28 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 		err = reiserfs_truncate_file(inode, 0);
 	}
       out:
-	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
-	mutex_unlock(&inode->i_mutex);
 	reiserfs_write_unlock(inode->i_sb);
+	mutex_unlock(&(REISERFS_I(inode)->tailpack));
 	return err;
 }
 
-static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int reiserfs_file_open(struct inode *inode, struct file *file)
 {
-	struct inode *inode;
-
-	inode = file->f_path.dentry->d_inode;
-	mutex_lock(&(REISERFS_I(inode)->i_mmap));
-	REISERFS_I(inode)->i_flags |= i_ever_mapped;
-	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
-
-	return generic_file_mmap(file, vma);
+	int err = dquot_file_open(inode, file);
+        if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
+		/* somebody might be tailpacking on final close; wait for it */
+		mutex_lock(&(REISERFS_I(inode)->tailpack));
+		atomic_inc(&REISERFS_I(inode)->openers);
+		mutex_unlock(&(REISERFS_I(inode)->tailpack));
+	}
+	return err;
 }
 
 static void reiserfs_vfs_truncate_file(struct inode *inode)
 {
+	mutex_lock(&(REISERFS_I(inode)->tailpack));
 	reiserfs_truncate_file(inode, 1);
+	mutex_unlock(&(REISERFS_I(inode)->tailpack));
 }
 
 /* Sync a reiserfs file. */
@@ -288,8 +294,8 @@ const struct file_operations reiserfs_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = reiserfs_compat_ioctl,
 #endif
-	.mmap = reiserfs_file_mmap,
-	.open = dquot_file_open,
+	.mmap = generic_file_mmap,
+	.open = reiserfs_file_open,
 	.release = reiserfs_file_release,
 	.fsync = reiserfs_sync_file,
 	.aio_read = generic_file_aio_read,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0f22fdaf54ac..6edac85c2b93 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1138,7 +1138,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
 	REISERFS_I(inode)->i_prealloc_count = 0;
 	REISERFS_I(inode)->i_trans_id = 0;
 	REISERFS_I(inode)->i_jl = NULL;
-	mutex_init(&(REISERFS_I(inode)->i_mmap));
 	reiserfs_init_xattr_rwsem(inode);
 
 	if (stat_data_v1(ih)) {
@@ -1841,7 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	REISERFS_I(inode)->i_attrs =
 	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
 	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
-	mutex_init(&(REISERFS_I(inode)->i_mmap));
 	reiserfs_init_xattr_rwsem(inode);
 
 	/* key to search for correct place for new stat data */
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 9822fa15118b..1e1ee9056eb6 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -525,6 +525,8 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 	    kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+	atomic_set(&ei->openers, 0);
+	mutex_init(&ei->tailpack);
 	return &ei->vfs_inode;
 }
 
diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h
index 89f4d3abbf5a..97959bdfe214 100644
--- a/include/linux/reiserfs_fs_i.h
+++ b/include/linux/reiserfs_fs_i.h
@@ -25,7 +25,6 @@ typedef enum {
 	i_link_saved_truncate_mask = 0x0020,
 	i_has_xattr_dir = 0x0040,
 	i_data_log = 0x0080,
-	i_ever_mapped = 0x0100
 } reiserfs_inode_flags;
 
 struct reiserfs_inode_info {
@@ -53,7 +52,8 @@ struct reiserfs_inode_info {
 	 ** flushed */
 	unsigned int i_trans_id;
 	struct reiserfs_journal_list *i_jl;
-	struct mutex i_mmap;
+	atomic_t openers;
+	struct mutex tailpack;
 #ifdef CONFIG_REISERFS_FS_XATTR
 	struct rw_semaphore i_xattr_sem;
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From eafdc7d190a944c755a9fe68573c193e6e0217e7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:29:53 +0200
Subject: sort out blockdev_direct_IO variants

Move the call to vmtruncate to get rid of accessive blocks to the callers
in prepearation of the new truncate calling sequence.  This was only done
for DIO_LOCKING filesystems, so the __blockdev_direct_IO_newtrunc variant
was not needed anyway.  Get rid of blockdev_direct_IO_no_locking and
its _newtrunc variant while at it as just opencoding the two additional
paramters is shorted than the name suffix.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c              |  5 ++-
 fs/direct-io.c              | 74 ++++++++++++---------------------------------
 fs/ext2/inode.c             |  2 +-
 fs/ext3/inode.c             | 11 +++++++
 fs/ext4/inode.c             | 15 +++++++--
 fs/fat/inode.c              |  4 +--
 fs/gfs2/aops.c              |  6 ++--
 fs/hfs/inode.c              | 17 ++++++++++-
 fs/hfsplus/inode.c          | 17 ++++++++++-
 fs/jfs/inode.c              | 17 ++++++++++-
 fs/nilfs2/inode.c           | 13 ++++++++
 fs/ocfs2/aops.c             |  9 +++---
 fs/reiserfs/inode.c         | 17 ++++++++++-
 fs/xfs/linux-2.6/xfs_aops.c | 16 +++++-----
 include/linux/fs.h          | 42 ++++---------------------
 15 files changed, 146 insertions(+), 119 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99d6af811747..65a0c26508e5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,9 +172,8 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
-				I_BDEV(inode), iov, offset, nr_segs,
-				blkdev_get_blocks, NULL);
+	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
+				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a10cb91cadea..51f270b479b6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1136,8 +1136,27 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	return ret;
 }
 
+/*
+ * This is a library function for use by filesystem drivers.
+ *
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
+ *
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
+ */
 ssize_t
-__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	dio_submit_t submit_io,	int flags)
@@ -1233,57 +1252,4 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
 out:
 	return retval;
 }
-EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
-
-/*
- * This is a library function for use by filesystem drivers.
- *
- * The locking rules are governed by the flags parameter:
- *  - if the flags value contains DIO_LOCKING we use a fancy locking
- *    scheme for dumb filesystems.
- *    For writes this function is called under i_mutex and returns with
- *    i_mutex held, for reads, i_mutex is not held on entry, but it is
- *    taken and dropped again before returning.
- *    For reads and writes i_alloc_sem is taken in shared mode and released
- *    on I/O completion (which may happen asynchronously after returning to
- *    the caller).
- *
- *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- *    internal locking but rather rely on the filesystem to synchronize
- *    direct I/O reads/writes versus each other and truncate.
- *    For reads and writes both i_mutex and i_alloc_sem are not held on
- *    entry and are never taken.
- */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
-	struct block_device *bdev, const struct iovec *iov, loff_t offset,
-	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	dio_submit_t submit_io,	int flags)
-{
-	ssize_t retval;
-
-	retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
-			offset, nr_segs, get_block, end_io, submit_io, flags);
-	/*
-	 * In case of error extending write may have instantiated a few
-	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
-	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
-	 * their own manner. This is a further example of where the old
-	 * truncate sequence is inadequate.
-	 *
-	 * NOTE: filesystems with their own locking have to handle this
-	 * on their own.
-	 */
-	if (flags & DIO_LOCKING) {
-		if (unlikely((rw & WRITE) && retval < 0)) {
-			loff_t isize = i_size_read(inode);
-			loff_t end = offset + iov_length(iov, nr_segs);
-
-			if (end > isize)
-				vmtruncate(inode, isize);
-		}
-	}
-
-	return retval;
-}
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 3675088cb88c..f36e967e4fde 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -838,7 +838,7 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = mapping->host;
 	ssize_t ret;
 
-	ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
 				iov, offset, nr_segs, ext2_get_block, NULL);
 	if (ret < 0 && (rw & WRITE))
 		ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 735f0190ec2a..a66f3fe33672 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1785,6 +1785,17 @@ retry:
 	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext3_get_block, NULL);
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0afc8c1d8cf3..d6a7701018a6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3545,15 +3545,24 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode))
-		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+		ret = __blockdev_direct_IO(rw, iocb, inode,
 				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
-				 ext4_get_block, NULL);
-	else
+				 ext4_get_block, NULL, NULL, 0);
+	else {
 		ret = blockdev_direct_IO(rw, iocb, inode,
 				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
+
+		if (unlikely((rw & WRITE) && ret < 0)) {
+			loff_t isize = i_size_read(inode);
+			loff_t end = offset + iov_length(iov, nr_segs);
+
+			if (end > isize)
+				vmtruncate(inode, isize);
+		}
+	}
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7bf45aee56d7..ffe7c6fdc1ec 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -212,8 +212,8 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 	 * FAT need to use the DIO_LOCKING for avoiding the race
 	 * condition of fat_get_block() and ->truncate().
 	 */
-	ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
-				iov, offset, nr_segs, fat_get_block, NULL);
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				 iov, offset, nr_segs, fat_get_block, NULL);
 	if (ret < 0 && (rw & WRITE))
 		fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9f8b52500d63..703000d6e4d2 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1047,9 +1047,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 	if (rv != 1)
 		goto out; /* dio not valid, fall back to buffered i/o */
 
-	rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
-					   iov, offset, nr_segs,
-					   gfs2_get_block_direct, NULL);
+	rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				  offset, nr_segs, gfs2_get_block_direct,
+				  NULL, NULL, 0);
 out:
 	gfs2_glock_dq_m(1, &gh);
 	gfs2_holder_uninit(&gh);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 14f5cb1b9fdc..07b2464b5716 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -112,9 +112,24 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, hfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 static int hfs_writepages(struct address_space *mapping,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 9bbb82924a22..486021773911 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -105,9 +105,24 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, hfsplus_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 static int hfsplus_writepages(struct address_space *mapping,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index ed9ba6fe04f5..79e6cda28181 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -317,9 +317,24 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				offset, nr_segs, jfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 const struct address_space_operations jfs_aops = {
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 39e038ac8fcb..1dd9e6a7d787 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -237,6 +237,19 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	/* Needs synchronization with the cleaner */
 	size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, nilfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && size < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
 	return size;
 }
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 96337a4fbbdf..0de69c9a08be 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -643,11 +643,10 @@ static ssize_t ocfs2_direct_IO(int rw,
 	if (i_size_read(inode) <= offset)
 		return 0;
 
-	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
-					    inode->i_sb->s_bdev, iov, offset,
-					    nr_segs,
-					    ocfs2_direct_IO_get_blocks,
-					    ocfs2_dio_end_io);
+	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				   iov, offset, nr_segs,
+				   ocfs2_direct_IO_get_blocks,
+				   ocfs2_dio_end_io, NULL, 0);
 
 	mlog_exit(ret);
 	return ret;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6edac85c2b93..4c1fb548ab64 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3057,10 +3057,25 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs,
 				  reiserfs_get_blocks_direct_io, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d24e78f32f3e..7968d41e27ad 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1478,17 +1478,17 @@ xfs_vm_direct_IO(
 	if (rw & WRITE) {
 		iocb->private = xfs_alloc_ioend(inode, IO_NEW);
 
-		ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
-						    offset, nr_segs,
-						    xfs_get_blocks_direct,
-						    xfs_end_io_direct_write);
+		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
+					    offset, nr_segs,
+					    xfs_get_blocks_direct,
+					    xfs_end_io_direct_write, NULL, 0);
 		if (ret != -EIOCBQUEUED && iocb->private)
 			xfs_destroy_ioend(iocb->private);
 	} else {
-		ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
-						    offset, nr_segs,
-						    xfs_get_blocks_direct,
-						    NULL);
+		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
+					    offset, nr_segs,
+					    xfs_get_blocks_direct,
+					    NULL, NULL, 0);
 	}
 
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f91affb7d530..b347b2d5666f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2269,16 +2269,6 @@ static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
 struct bio;
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
 			    loff_t file_offset);
-void dio_end_io(struct bio *bio, int error);
-
-ssize_t __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
-	struct block_device *bdev, const struct iovec *iov, loff_t offset,
-	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	dio_submit_t submit_io, int lock_type);
-ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
-	struct block_device *bdev, const struct iovec *iov, loff_t offset,
-	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	dio_submit_t submit_io,	int lock_type);
 
 enum {
 	/* need locking between buffered and direct access */
@@ -2288,24 +2278,13 @@ enum {
 	DIO_SKIP_HOLES	= 0x02,
 };
 
-static inline ssize_t blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb,
-	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs, get_block_t get_block,
-	dio_iodone_t end_io)
-{
-	return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset,
-				    nr_segs, get_block, end_io, NULL,
-				    DIO_LOCKING | DIO_SKIP_HOLES);
-}
+void dio_end_io(struct bio *bio, int error);
+
+ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+	dio_submit_t submit_io,	int flags);
 
-static inline ssize_t blockdev_direct_IO_no_locking_newtrunc(int rw, struct kiocb *iocb,
-	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs, get_block_t get_block,
-	dio_iodone_t end_io)
-{
-	return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset,
-				nr_segs, get_block, end_io, NULL, 0);
-}
 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
 	loff_t offset, unsigned long nr_segs, get_block_t get_block,
@@ -2315,15 +2294,6 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 				    nr_segs, get_block, end_io, NULL,
 				    DIO_LOCKING | DIO_SKIP_HOLES);
 }
-
-static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
-	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs, get_block_t get_block,
-	dio_iodone_t end_io)
-{
-	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				    nr_segs, get_block, end_io, NULL, 0);
-}
 #endif
 
 extern const struct file_operations generic_ro_fops;
-- 
cgit v1.2.3-59-g8ed1b


From ea0f04e59543bafb3d2cbe37a0d375acb0bb2c34 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:29:54 +0200
Subject: get rid of nobh_write_begin_newtrunc

Move the call to vmtruncate to get rid of accessive blocks to the only
remaining caller and rename the non-truncating version to nobh_write_begin.

Get rid of the superflous file argument to it while we're at it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c                 | 37 ++++---------------------------------
 fs/ext2/inode.c             |  9 ++-------
 fs/jfs/inode.c              | 11 ++++++++++-
 include/linux/buffer_head.h |  6 +-----
 4 files changed, 17 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index d54812b198e9..559daf76bca4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2510,11 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
 }
 
 /*
- * Filesystems implementing the new truncate sequence should use the
- * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
  * The filesystem needs to handle block truncation upon failure.
  */
-int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
+int nobh_write_begin(struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block)
@@ -2547,7 +2547,7 @@ int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 		unlock_page(page);
 		page_cache_release(page);
 		*pagep = NULL;
-		return block_write_begin_newtrunc(file, mapping, pos, len,
+		return block_write_begin_newtrunc(NULL, mapping, pos, len,
 					flags, pagep, fsdata, get_block);
 	}
 
@@ -2654,35 +2654,6 @@ out_release:
 
 	return ret;
 }
-EXPORT_SYMBOL(nobh_write_begin_newtrunc);
-
-/*
- * On entry, the page is fully not uptodate.
- * On exit the page is fully uptodate in the areas outside (from,to)
- */
-int nobh_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block)
-{
-	int ret;
-
-	ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
-					pagep, fsdata, get_block);
-
-	/*
-	 * prepare_write() may have instantiated a few blocks
-	 * outside i_size.  Trim these off again. Don't need
-	 * i_size_read because we hold i_mutex.
-	 */
-	if (unlikely(ret)) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
-
-	return ret;
-}
 EXPORT_SYMBOL(nobh_write_begin);
 
 int nobh_write_end(struct file *file, struct address_space *mapping,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index f36e967e4fde..348805cd4109 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -806,13 +806,8 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
 {
 	int ret;
 
-	/*
-	 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
-	 * directory handling code to pass around offsets rather than struct
-	 * pages in order to make this work easily.
-	 */
-	ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
-						fsdata, ext2_get_block);
+	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
+			       ext2_get_block);
 	if (ret < 0)
 		ext2_write_failed(mapping, pos + len);
 	return ret;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 79e6cda28181..c38dc1806281 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -303,8 +303,17 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
 {
-	return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	int ret;
+
+	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
 				jfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1b9ba193b789..cfda5f0b2a4b 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -231,11 +231,7 @@ void block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
 int file_fsync(struct file *, int);
-int nobh_write_begin_newtrunc(struct file *, struct address_space *,
-				loff_t, unsigned, unsigned,
-				struct page **, void **, get_block_t*);
-int nobh_write_begin(struct file *, struct address_space *,
-				loff_t, unsigned, unsigned,
+int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
 				struct page **, void **, get_block_t*);
 int nobh_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
-- 
cgit v1.2.3-59-g8ed1b


From 282dc178849882289d30e58b54be6b2799b351aa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:29:55 +0200
Subject: get rid of cont_write_begin_newtrunc

Move the call to vmtruncate to get rid of accessive blocks to the callers
in preparation of the new truncate sequence and rename the non-truncating
version to cont_write_begin.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/adfs/inode.c             | 11 ++++++++++-
 fs/affs/file.c              | 11 ++++++++++-
 fs/buffer.c                 | 21 +--------------------
 fs/fat/inode.c              |  2 +-
 fs/hfs/inode.c              | 11 ++++++++++-
 fs/hfsplus/inode.c          | 11 ++++++++++-
 fs/hpfs/file.c              | 11 ++++++++++-
 fs/qnx4/inode.c             | 11 ++++++++++-
 include/linux/buffer_head.h |  3 ---
 9 files changed, 62 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 6f850b06ab62..b3dec193036b 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -50,10 +50,19 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				adfs_get_block,
 				&ADFS_I(mapping->host)->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 322710c3eedf..c4a9875bd1a6 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,10 +406,19 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				affs_get_block,
 				&AFFS_I(mapping->host)->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/buffer.c b/fs/buffer.c
index 559daf76bca4..14529ec759b9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2351,7 +2351,7 @@ out:
  * For moronic filesystems that do not allow holes in file.
  * We may have to extend the file.
  */
-int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
+int cont_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block, loff_t *bytes)
@@ -2377,25 +2377,6 @@ int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 out:
 	return err;
 }
-EXPORT_SYMBOL(cont_write_begin_newtrunc);
-
-int cont_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block, loff_t *bytes)
-{
-	int ret;
-
-	ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
-					pagep, fsdata, get_block, bytes);
-	if (unlikely(ret)) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
-
-	return ret;
-}
 EXPORT_SYMBOL(cont_write_begin);
 
 int block_prepare_write(struct page *page, unsigned from, unsigned to,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ffe7c6fdc1ec..ec6a699a4023 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -159,7 +159,7 @@ static int fat_write_begin(struct file *file, struct address_space *mapping,
 	int err;
 
 	*pagep = NULL;
-	err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+	err = cont_write_begin(file, mapping, pos, len, flags,
 				pagep, fsdata, fat_get_block,
 				&MSDOS_I(mapping->host)->mmu_private);
 	if (err < 0)
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 07b2464b5716..8df18e63eb6b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -39,10 +39,19 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hfs_get_block,
 				&HFS_I(mapping->host)->phys_size);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 486021773911..88bf1b562641 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -31,10 +31,19 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hfsplus_get_block,
 				&HFSPLUS_I(mapping->host).phys_size);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index a9ae9bfa752f..c0340887c7ea 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -97,10 +97,19 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hpfs_get_block,
 				&hpfs_i(mapping->host)->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 277575ddc05c..16829722be93 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -320,10 +320,19 @@ static int qnx4_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host);
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				qnx4_get_block,
 				&qnx4_inode->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 {
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index cfda5f0b2a4b..7638647f0424 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -217,9 +217,6 @@ int generic_write_end(struct file *, struct address_space *,
 				struct page *, void *);
 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
-int cont_write_begin_newtrunc(struct file *, struct address_space *, loff_t,
-			unsigned, unsigned, struct page **, void **,
-			get_block_t *, loff_t *);
 int cont_write_begin(struct file *, struct address_space *, loff_t,
 			unsigned, unsigned, struct page **, void **,
 			get_block_t *, loff_t *);
-- 
cgit v1.2.3-59-g8ed1b


From 6e1db88d536adcbbfe562b2d4b7d6425784fff12 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:29:57 +0200
Subject: introduce __block_write_begin

Split up the block_write_begin implementation - __block_write_begin is a new
trivial wrapper for block_prepare_write that always takes an already
allocated page and can be either called from block_write_begin or filesystem
code that already has a page allocated.  Remove the handling of already
allocated pages from block_write_begin after switching all callers that
do it to __block_write_begin.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c                 | 69 +++++++++++++++++----------------------------
 fs/ext2/dir.c               |  3 +-
 fs/ext3/inode.c             |  3 +-
 fs/ext4/inode.c             | 11 +++-----
 fs/minix/inode.c            |  3 +-
 fs/nilfs2/dir.c             |  3 +-
 fs/reiserfs/inode.c         |  3 +-
 fs/sysv/itree.c             |  3 +-
 fs/ufs/inode.c              |  3 +-
 include/linux/buffer_head.h |  2 ++
 10 files changed, 39 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 14529ec759b9..c319c49da511 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1833,9 +1833,10 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-static int __block_prepare_write(struct inode *inode, struct page *page,
-		unsigned from, unsigned to, get_block_t *get_block)
+int block_prepare_write(struct page *page, unsigned from, unsigned to,
+		get_block_t *get_block)
 {
+	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end;
 	sector_t block;
 	int err = 0;
@@ -1908,10 +1909,13 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (!buffer_uptodate(*wait_bh))
 			err = -EIO;
 	}
-	if (unlikely(err))
+	if (unlikely(err)) {
 		page_zero_new_buffers(page, from, to);
+		ClearPageUptodate(page);
+	}
 	return err;
 }
+EXPORT_SYMBOL(block_prepare_write);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to)
@@ -1948,6 +1952,15 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 	return 0;
 }
 
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
+{
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+
+	return block_prepare_write(page, start, start + len, get_block);
+}
+EXPORT_SYMBOL(__block_write_begin);
+
 /*
  * Filesystems implementing the new truncate sequence should use the
  * _newtrunc postfix variant which won't incorrectly call vmtruncate.
@@ -1958,41 +1971,22 @@ int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block)
 {
-	struct inode *inode = mapping->host;
-	int status = 0;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct page *page;
-	pgoff_t index;
-	unsigned start, end;
-	int ownpage = 0;
+	int status;
 
-	index = pos >> PAGE_CACHE_SHIFT;
-	start = pos & (PAGE_CACHE_SIZE - 1);
-	end = start + len;
-
-	page = *pagep;
-	if (page == NULL) {
-		ownpage = 1;
-		page = grab_cache_page_write_begin(mapping, index, flags);
-		if (!page) {
-			status = -ENOMEM;
-			goto out;
-		}
-		*pagep = page;
-	} else
-		BUG_ON(!PageLocked(page));
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
 
-	status = __block_prepare_write(inode, page, start, end, get_block);
+	status = __block_write_begin(page, pos, len, get_block);
 	if (unlikely(status)) {
-		ClearPageUptodate(page);
-
-		if (ownpage) {
-			unlock_page(page);
-			page_cache_release(page);
-			*pagep = NULL;
-		}
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
 	}
 
-out:
+	*pagep = page;
 	return status;
 }
 EXPORT_SYMBOL(block_write_begin_newtrunc);
@@ -2379,17 +2373,6 @@ out:
 }
 EXPORT_SYMBOL(cont_write_begin);
 
-int block_prepare_write(struct page *page, unsigned from, unsigned to,
-			get_block_t *get_block)
-{
-	struct inode *inode = page->mapping->host;
-	int err = __block_prepare_write(inode, page, from, to, get_block);
-	if (err)
-		ClearPageUptodate(page);
-	return err;
-}
-EXPORT_SYMBOL(block_prepare_write);
-
 int block_commit_write(struct page *page, unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 6b946bae11cf..764109886ec0 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -450,8 +450,7 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 
 static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
 {
-	return block_write_begin_newtrunc(NULL, page->mapping, pos, len, 0,
-					  &page, NULL, ext2_get_block);
+	return __block_write_begin(page, pos, len, ext2_get_block);
 }
 
 /* Releases the page */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a66f3fe33672..5c6f07eefa4a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1196,8 +1196,7 @@ retry:
 		ret = PTR_ERR(handle);
 		goto out;
 	}
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext3_get_block);
+	ret = __block_write_begin(page, pos, len, ext3_get_block);
 	if (ret)
 		goto write_begin_failed;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6a7701018a6..3da3c9646e5e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1578,11 +1578,9 @@ retry:
 	*pagep = page;
 
 	if (ext4_should_dioread_nolock(inode))
-		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
-				fsdata, ext4_get_block_write);
+		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
 	else
-		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
-				fsdata, ext4_get_block);
+		ret = __block_write_begin(page, pos, len, ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
@@ -1593,7 +1591,7 @@ retry:
 		unlock_page(page);
 		page_cache_release(page);
 		/*
-		 * block_write_begin may have instantiated a few blocks
+		 * __block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
 		 *
@@ -3185,8 +3183,7 @@ retry:
 	}
 	*pagep = page;
 
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				ext4_da_get_block_prep);
+	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
 	if (ret < 0) {
 		unlock_page(page);
 		ext4_journal_stop(handle);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f4abe45229bb..6b29e73f0ca6 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -359,8 +359,7 @@ static int minix_readpage(struct file *file, struct page *page)
 
 int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
 {
-	return block_write_begin_newtrunc(NULL, page->mapping, pos, len, 0,
-					  &page, NULL, minix_get_block);
+	return __block_write_begin(page, pos, len, minix_get_block);
 }
 
 static int minix_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index fc2bcfa599a3..d14e3b94d81f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -83,8 +83,7 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
 static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
 {
 	loff_t pos = page_offset(page) + from;
-	return block_write_begin_newtrunc(NULL, page->mapping, pos, to - from,
-					  0, &page, NULL, nilfs_get_block);
+	return __block_write_begin(page, pos, to - from, nilfs_get_block);
 }
 
 static void nilfs_commit_chunk(struct page *page,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 4c1fb548ab64..045729f5674a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2585,8 +2585,7 @@ static int reiserfs_write_begin(struct file *file,
 		old_ref = th->t_refcount;
 		th->t_refcount++;
 	}
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				reiserfs_get_block);
+	ret = __block_write_begin(page, pos, len, reiserfs_get_block);
 	if (ret && reiserfs_transaction_running(inode->i_sb)) {
 		struct reiserfs_transaction_handle *th = current->journal_info;
 		/* this gets a little ugly.  If reiserfs_get_block returned an
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 4068f485cfd6..82a005c3d7eb 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -461,8 +461,7 @@ static int sysv_readpage(struct file *file, struct page *page)
 
 int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
 {
-	return block_write_begin_newtrunc(NULL, page->mapping, pos, len, 0,
-					  &page, NULL, get_block);
+	return __block_write_begin(page, pos, len, get_block);
 }
 
 static int sysv_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index a9555b1ffd28..45ce32391f8f 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -560,8 +560,7 @@ static int ufs_readpage(struct file *file, struct page *page)
 
 int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
 {
-	return block_write_begin_newtrunc(NULL, page->mapping, pos, len, 0,
-					  &page, NULL, ufs_getfrag_block);
+	return __block_write_begin(page, pos, len, ufs_getfrag_block);
 }
 
 static int ufs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7638647f0424..accc9f81bb63 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -209,6 +209,8 @@ int block_write_begin_newtrunc(struct file *, struct address_space *,
 int block_write_begin(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page **, void **, get_block_t*);
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block);
 int block_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page *, void *);
-- 
cgit v1.2.3-59-g8ed1b


From 155130a4f7848b1aac439cab6bda1a175507c71c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:29:58 +0200
Subject: get rid of block_write_begin_newtrunc

Move the call to vmtruncate to get rid of accessive blocks to the callers
in preparation of the new truncate sequence and rename the non-truncating
version to block_write_begin.

While we're at it also remove several unused arguments to block_write_begin.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/bfs/file.c               | 14 ++++++++---
 fs/block_dev.c              |  5 ++--
 fs/buffer.c                 | 61 +++++++--------------------------------------
 fs/ext2/inode.c             |  5 ++--
 fs/minix/inode.c            | 12 +++++++--
 fs/nilfs2/inode.c           | 12 ++++++---
 fs/nilfs2/recovery.c        | 11 +++++---
 fs/omfs/file.c              | 14 ++++++++---
 fs/sysv/itree.c             | 13 +++++++---
 fs/udf/inode.c              | 13 +++++++---
 fs/ufs/inode.c              | 12 +++++++--
 fs/xfs/linux-2.6/xfs_aops.c | 14 ++++++++---
 include/linux/buffer_head.h |  8 ++----
 13 files changed, 103 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 88b9a3ff44e4..8fc2e9c9739d 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -168,9 +168,17 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags,
-					pagep, fsdata, bfs_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				bfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 65a0c26508e5..63c9d6076205 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -308,9 +308,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin_newtrunc(file, mapping, pos, len, flags,
-				pagep, fsdata, blkdev_get_block);
+	return block_write_begin(mapping, pos, len, flags, pagep,
+				 blkdev_get_block);
 }
 
 static int blkdev_write_end(struct file *file, struct address_space *mapping,
diff --git a/fs/buffer.c b/fs/buffer.c
index c319c49da511..50efa339e051 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1962,14 +1962,13 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 EXPORT_SYMBOL(__block_write_begin);
 
 /*
- * Filesystems implementing the new truncate sequence should use the
- * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
  * The filesystem needs to handle block truncation upon failure.
  */
-int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block)
+int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+		unsigned flags, struct page **pagep, get_block_t *get_block)
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct page *page;
@@ -1989,44 +1988,6 @@ int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 	*pagep = page;
 	return status;
 }
-EXPORT_SYMBOL(block_write_begin_newtrunc);
-
-/*
- * block_write_begin takes care of the basic task of block allocation and
- * bringing partial write blocks uptodate first.
- *
- * If *pagep is not NULL, then block_write_begin uses the locked page
- * at *pagep rather than allocating its own. In this case, the page will
- * not be unlocked or deallocated on failure.
- */
-int block_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned flags,
-			struct page **pagep, void **fsdata,
-			get_block_t *get_block)
-{
-	int ret;
-
-	ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
-					pagep, fsdata, get_block);
-
-	/*
-	 * prepare_write() may have instantiated a few blocks
-	 * outside i_size.  Trim these off again. Don't need
-	 * i_size_read because we hold i_mutex.
-	 *
-	 * Filesystems which pass down their own page also cannot
-	 * call into vmtruncate here because it would lead to lock
-	 * inversion problems (*pagep is locked). This is a further
-	 * example of where the old truncate sequence is inadequate.
-	 */
-	if (unlikely(ret) && *pagep == NULL) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
-
-	return ret;
-}
 EXPORT_SYMBOL(block_write_begin);
 
 int block_write_end(struct file *file, struct address_space *mapping,
@@ -2357,7 +2318,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
 
 	err = cont_expand_zero(file, mapping, pos, bytes);
 	if (err)
-		goto out;
+		return err;
 
 	zerofrom = *bytes & ~PAGE_CACHE_MASK;
 	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
@@ -2365,11 +2326,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
 		(*bytes)++;
 	}
 
-	*pagep = NULL;
-	err = block_write_begin_newtrunc(file, mapping, pos, len,
-				flags, pagep, fsdata, get_block);
-out:
-	return err;
+	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
 }
 EXPORT_SYMBOL(cont_write_begin);
 
@@ -2511,8 +2468,8 @@ int nobh_write_begin(struct address_space *mapping,
 		unlock_page(page);
 		page_cache_release(page);
 		*pagep = NULL;
-		return block_write_begin_newtrunc(NULL, mapping, pos, len,
-					flags, pagep, fsdata, get_block);
+		return block_write_begin(mapping, pos, len, flags, pagep,
+					 get_block);
 	}
 
 	if (PageMappedToDisk(page))
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 2f4dfbcd7696..74dfe5f73330 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -772,9 +772,8 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
 {
 	int ret;
 
-	*pagep = NULL;
-	ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
-					 pagep, fsdata, ext2_get_block);
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				ext2_get_block);
 	if (ret < 0)
 		ext2_write_failed(mapping, pos + len);
 	return ret;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 6b29e73f0ca6..125062f55ef2 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -366,9 +366,17 @@ static int minix_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
 				minix_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t minix_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 1dd9e6a7d787..5c694ece172e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -197,11 +197,15 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(err))
 		return err;
 
-	*pagep = NULL;
-	err = block_write_begin(file, mapping, pos, len, flags, pagep,
-				fsdata, nilfs_get_block);
-	if (unlikely(err))
+	err = block_write_begin(mapping, pos, len, flags, pagep,
+				nilfs_get_block);
+	if (unlikely(err)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+
 		nilfs_transaction_abort(inode->i_sb);
+	}
 	return err;
 }
 
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index bae2a516b4ee..2f11f0868d87 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -505,11 +505,14 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 		}
 
 		pos = rb->blkoff << inode->i_blkbits;
-		page = NULL;
-		err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
-					0, &page, NULL, nilfs_get_block);
-		if (unlikely(err))
+		err = block_write_begin(inode->i_mapping, pos, blocksize,
+					0, &page, nilfs_get_block);
+		if (unlikely(err)) {
+			loff_t isize = inode->i_size;
+			if (pos + blocksize > isize)
+				vmtruncate(inode, isize);
 			goto failed_inode;
+		}
 
 		err = nilfs_recovery_copy_block(sbi, rb, page);
 		if (unlikely(err))
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 6e7a3291bbe8..810cff346468 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -312,9 +312,17 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags,
-				pagep, fsdata, omfs_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				omfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 82a005c3d7eb..9ca66276315e 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -468,9 +468,16 @@ static int sysv_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 124852bcf6fe..ecddcc2ed746 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -127,9 +127,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				udf_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t udf_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 45ce32391f8f..45cafa937a4b 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -567,9 +567,17 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
 				ufs_getfrag_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7968d41e27ad..bf7aad0d78b8 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1504,9 +1504,17 @@ xfs_vm_write_begin(
 	struct page		**pagep,
 	void			**fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags | AOP_FLAG_NOFS,
-				 pagep, fsdata, xfs_get_blocks);
+	int			ret;
+
+	ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
+				pagep, xfs_get_blocks);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 STATIC sector_t
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index accc9f81bb63..3f69054f86d9 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -203,12 +203,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 int block_read_full_page(struct page*, get_block_t*);
 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 				unsigned long from);
-int block_write_begin_newtrunc(struct file *, struct address_space *,
-				loff_t, unsigned, unsigned,
-				struct page **, void **, get_block_t*);
-int block_write_begin(struct file *, struct address_space *,
-				loff_t, unsigned, unsigned,
-				struct page **, void **, get_block_t*);
+int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+		unsigned flags, struct page **pagep, get_block_t *get_block);
 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		get_block_t *get_block);
 int block_write_end(struct file *, struct address_space *,
-- 
cgit v1.2.3-59-g8ed1b


From 6a1a90ad1b0edb556a7550a6ef8a8756f0304dd5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:30:00 +0200
Subject: rename generic_setattr

Despite its name it's now a generic implementation of ->setattr, but
rather a helper to copy attributes from a struct iattr to the inode.
Rename it to setattr_copy to reflect this fact.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/attr.c             | 14 +++++++-------
 fs/ext2/inode.c       |  2 +-
 fs/fat/file.c         |  2 +-
 fs/libfs.c            |  3 +--
 fs/ramfs/file-nommu.c |  2 +-
 fs/sysfs/inode.c      |  2 +-
 include/linux/fs.h    |  2 +-
 mm/shmem.c            |  2 +-
 8 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index b4fa3b0aa596..1f6a895e24e9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -105,13 +105,13 @@ out_big:
 EXPORT_SYMBOL(inode_newsize_ok);
 
 /**
- * generic_setattr - copy simple metadata updates into the generic inode
+ * setattr_copy - copy simple metadata updates into the generic inode
  * @inode:	the inode to be updated
  * @attr:	the new attributes
  *
- * generic_setattr must be called with i_mutex held.
+ * setattr_copy must be called with i_mutex held.
  *
- * generic_setattr updates the inode's metadata with that specified
+ * setattr_copy updates the inode's metadata with that specified
  * in attr. Noticably missing is inode size update, which is more complex
  * as it requires pagecache updates. See simple_setsize.
  *
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
  * that for "simple" filesystems, the struct inode is the inode storage.
  * The caller is free to mark the inode dirty afterwards if needed.
  */
-void generic_setattr(struct inode *inode, const struct iattr *attr)
+void setattr_copy(struct inode *inode, const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -144,11 +144,11 @@ void generic_setattr(struct inode *inode, const struct iattr *attr)
 		inode->i_mode = mode;
 	}
 }
-EXPORT_SYMBOL(generic_setattr);
+EXPORT_SYMBOL(setattr_copy);
 
 /*
  * note this function is deprecated, the new truncate sequence should be
- * used instead -- see eg. simple_setsize, generic_setattr.
+ * used instead -- see eg. simple_setsize, setattr_copy.
  */
 int inode_setattr(struct inode *inode, const struct iattr *attr)
 {
@@ -163,7 +163,7 @@ int inode_setattr(struct inode *inode, const struct iattr *attr)
 			return error;
 	}
 
-	generic_setattr(inode, attr);
+	setattr_copy(inode, attr);
 
 	mark_inode_dirty(inode);
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 74dfe5f73330..7dee7b3f3688 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1544,7 +1544,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (error)
 			return error;
 	}
-	generic_setattr(inode, iattr);
+	setattr_copy(inode, iattr);
 	if (iattr->ia_valid & ATTR_MODE)
 		error = ext2_acl_chmod(inode);
 	mark_inode_dirty(inode);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 990dfae022e5..20813d2c7d61 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -446,7 +446,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 			goto out;
 	}
 
-	generic_setattr(inode, attr);
+	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
 out:
 	return error;
diff --git a/fs/libfs.c b/fs/libfs.c
index dcaf972cbf1b..861a88797716 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -395,8 +395,7 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
 			return error;
 	}
 
-	generic_setattr(inode, iattr);
-
+	setattr_copy(inode, iattr);
 	return error;
 }
 EXPORT_SYMBOL(simple_setattr);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index d532c20fc179..8d44f0347b27 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -183,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 		}
 	}
 
-	generic_setattr(inode, ia);
+	setattr_copy(inode, ia);
  out:
 	ia->ia_valid = old_ia_valid;
 	return ret;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0835a3b70e03..7e187fbd3d47 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -122,7 +122,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		goto out;
 
 	/* this ignores size changes */
-	generic_setattr(inode, iattr);
+	setattr_copy(inode, iattr);
 
 out:
 	mutex_unlock(&sysfs_mutex);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b347b2d5666f..8ebb5f01a418 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2393,7 +2393,7 @@ extern int buffer_migrate_page(struct address_space *,
 extern int inode_change_ok(const struct inode *, struct iattr *);
 extern int inode_newsize_ok(const struct inode *, loff_t offset);
 extern int __must_check inode_setattr(struct inode *, const struct iattr *);
-extern void generic_setattr(struct inode *inode, const struct iattr *attr);
+extern void setattr_copy(struct inode *inode, const struct iattr *attr);
 
 extern void file_update_time(struct file *file);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db5..3b58ad65d26c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -811,7 +811,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	error = inode_change_ok(inode, attr);
 	if (!error)
-		generic_setattr(inode, attr);
+		setattr_copy(inode, attr);
 #ifdef CONFIG_TMPFS_POSIX_ACL
 	if (!error && (attr->ia_valid & ATTR_MODE))
 		error = generic_acl_chmod(inode);
-- 
cgit v1.2.3-59-g8ed1b


From 1025774ce411f2bd4b059ad7b53f0003569b74fa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:30:02 +0200
Subject: remove inode_setattr

Replace inode_setattr with opencoded variants of it in all callers.  This
moves the remaining call to vmtruncate into the filesystem methods where it
can be replaced with the proper truncate sequence.

In a few cases it was obvious that we would never end up calling vmtruncate
so it was left out in the opencoded variant:

 spufs: explicitly checks for ATTR_SIZE earlier
 btrfs,hugetlbfs,logfs,dlmfs: explicitly clears ATTR_SIZE earlier
 ufs: contains an opencoded simple_seattr + truncate that sets the filesize just above

In addition to that ncpfs called inode_setattr with handcrafted iattrs,
which allowed to trim down the opencoded variant.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/inode.c |  4 +-
 drivers/staging/pohmelfs/inode.c          | 14 +++--
 fs/9p/vfs_inode.c                         | 15 ++++-
 fs/affs/inode.c                           | 13 ++++-
 fs/attr.c                                 | 25 --------
 fs/btrfs/inode.c                          | 12 ++--
 fs/cifs/inode.c                           | 45 ++++++++++----
 fs/exofs/inode.c                          | 14 ++++-
 fs/ext3/inode.c                           | 12 +++-
 fs/ext4/inode.c                           | 16 +++--
 fs/gfs2/inode.c                           | 25 +++++---
 fs/gfs2/ops_inode.c                       | 12 +++-
 fs/gfs2/xattr.c                           | 24 ++++++--
 fs/hfs/inode.c                            | 12 +++-
 fs/hfsplus/inode.c                        | 12 +++-
 fs/hostfs/hostfs_kern.c                   | 18 +++++-
 fs/hpfs/inode.c                           | 12 +++-
 fs/hugetlbfs/inode.c                      | 17 +++---
 fs/jfs/file.c                             | 14 ++++-
 fs/logfs/file.c                           | 18 +++---
 fs/minix/file.c                           | 12 +++-
 fs/ncpfs/inode.c                          | 24 ++++----
 fs/nilfs2/inode.c                         | 25 ++++++--
 fs/ntfs/inode.c                           |  3 -
 fs/ocfs2/dlmfs/dlmfs.c                    |  8 ++-
 fs/ocfs2/file.c                           | 16 +++--
 fs/omfs/file.c                            | 12 +++-
 fs/proc/base.c                            | 16 ++++-
 fs/proc/generic.c                         | 18 ++++--
 fs/proc/proc_sysctl.c                     | 15 ++++-
 fs/reiserfs/inode.c                       | 97 +++++++++++++++++--------------
 fs/sysv/file.c                            | 12 +++-
 fs/udf/file.c                             | 12 +++-
 fs/ufs/truncate.c                         |  5 +-
 include/linux/fs.h                        |  1 -
 35 files changed, 416 insertions(+), 194 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index e5e5f823d687..32625f366fb5 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -110,7 +110,9 @@ spufs_setattr(struct dentry *dentry, struct iattr *attr)
 	if ((attr->ia_valid & ATTR_SIZE) &&
 	    (attr->ia_size != inode->i_size))
 		return -EINVAL;
-	return inode_setattr(inode, attr);
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 643b413d9f0f..e818f53ccfd7 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -968,12 +968,18 @@ int pohmelfs_setattr_raw(struct inode *inode, struct iattr *attr)
 		goto err_out_exit;
 	}
 
-	err = inode_setattr(inode, attr);
-	if (err) {
-		dprintk("%s: ino: %llu, failed to set the attributes.\n", __func__, POHMELFS_I(inode)->ino);
-		goto err_out_exit;
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		err = vmtruncate(inode, attr->ia_size);
+		if (err) {
+			dprintk("%s: ino: %llu, failed to set the attributes.\n", __func__, POHMELFS_I(inode)->ino);
+			goto err_out_exit;
+		}
 	}
 
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
 	dprintk("%s: ino: %llu, mode: %o -> %o, uid: %u -> %u, gid: %u -> %u, size: %llu -> %llu.\n",
 			__func__, POHMELFS_I(inode)->ino, inode->i_mode, attr->ia_mode,
 			inode->i_uid, attr->ia_uid, inode->i_gid, attr->ia_gid, inode->i_size, attr->ia_size);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4331b3b5ee1c..4b3ad6ac9a41 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -896,10 +896,19 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	}
 
 	retval = p9_client_wstat(fid, &wstat);
-	if (retval >= 0)
-		retval = inode_setattr(dentry->d_inode, iattr);
+	if (retval < 0)
+		return retval;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(dentry->d_inode)) {
+		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
+		if (retval)
+			return retval;
+	}
 
-	return retval;
+	setattr_copy(dentry->d_inode, iattr);
+	mark_inode_dirty(dentry->d_inode);
+	return 0;
 }
 
 /**
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index f4b2a4ee4f91..6883d5fb84cf 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -235,8 +235,17 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
 		goto out;
 	}
 
-	error = inode_setattr(inode, attr);
-	if (!error && (attr->ia_valid & ATTR_MODE))
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	if (attr->ia_valid & ATTR_MODE)
 		mode_to_prot(inode);
 out:
 	return error;
diff --git a/fs/attr.c b/fs/attr.c
index aeac826f4774..ed44d8ae8bf1 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -146,31 +146,6 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 }
 EXPORT_SYMBOL(setattr_copy);
 
-/*
- * note this function is deprecated, the new truncate sequence should be
- * used instead -- see eg. simple_setsize, setattr_copy.
- */
-int inode_setattr(struct inode *inode, const struct iattr *attr)
-{
-	unsigned int ia_valid = attr->ia_valid;
-
-	if (ia_valid & ATTR_SIZE &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error;
-
-		error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
-	setattr_copy(inode, attr);
-
-	mark_inode_dirty(inode);
-
-	return 0;
-}
-EXPORT_SYMBOL(inode_setattr);
-
 int notify_change(struct dentry * dentry, struct iattr * attr)
 {
 	struct inode *inode = dentry->d_inode;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bff92ad4744..7f9e0536db1a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3656,13 +3656,15 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		if (err)
 			return err;
 	}
-	attr->ia_valid &= ~ATTR_SIZE;
 
-	if (attr->ia_valid)
-		err = inode_setattr(inode, attr);
+	if (attr->ia_valid) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+
+		if (attr->ia_valid & ATTR_MODE)
+			err = btrfs_acl_chmod(inode);
+	}
 
-	if (!err && ((attr->ia_valid & ATTR_MODE)))
-		err = btrfs_acl_chmod(inode);
 	return err;
 }
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a15b3a9bbff4..9c6a40f5cc57 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1889,18 +1889,27 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 
-	if (!rc) {
-		rc = inode_setattr(inode, attrs);
+	if (rc)
+		goto out;
 
-		/* force revalidate when any of these times are set since some
-		   of the fs types (eg ext3, fat) do not have fine enough
-		   time granularity to match protocol, and we do not have a
-		   a way (yet) to query the server fs's time granularity (and
-		   whether it rounds times down).
-		*/
-		if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)))
-			cifsInode->time = 0;
+	if ((attrs->ia_valid & ATTR_SIZE) &&
+	    attrs->ia_size != i_size_read(inode)) {
+		rc = vmtruncate(inode, attrs->ia_size);
+		if (rc)
+			goto out;
 	}
+
+	setattr_copy(inode, attrs);
+	mark_inode_dirty(inode);
+
+	/* force revalidate when any of these times are set since some
+	   of the fs types (eg ext3, fat) do not have fine enough
+	   time granularity to match protocol, and we do not have a
+	   a way (yet) to query the server fs's time granularity (and
+	   whether it rounds times down).
+	*/
+	if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))
+		cifsInode->time = 0;
 out:
 	kfree(args);
 	kfree(full_path);
@@ -2040,8 +2049,20 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 
 	/* do not need local check to inode_check_ok since the server does
 	   that */
-	if (!rc)
-		rc = inode_setattr(inode, attrs);
+	if (rc)
+		goto cifs_setattr_exit;
+
+	if ((attrs->ia_valid & ATTR_SIZE) &&
+	    attrs->ia_size != i_size_read(inode)) {
+		rc = vmtruncate(inode, attrs->ia_size);
+		if (rc)
+			goto cifs_setattr_exit;
+	}
+
+	setattr_copy(inode, attrs);
+	mark_inode_dirty(inode);
+	return 0;
+
 cifs_setattr_exit:
 	kfree(full_path);
 	FreeXid(xid);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 4bb6ef822e46..4bfc1f4fd013 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -887,8 +887,18 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		return error;
 
-	error = inode_setattr(inode, iattr);
-	return error;
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5c6f07eefa4a..b04d11936683 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3208,9 +3208,17 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 		ext3_journal_stop(handle);
 	}
 
-	rc = inode_setattr(inode, attr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		rc = vmtruncate(inode, attr->ia_size);
+		if (rc)
+			goto err_out;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 
-	if (!rc && (ia_valid & ATTR_MODE))
+	if (ia_valid & ATTR_MODE)
 		rc = ext3_acl_chmod(inode);
 
 err_out:
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3da3c9646e5e..1fb390359bc5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5539,11 +5539,19 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			ext4_truncate(inode);
 	}
 
-	rc = inode_setattr(inode, attr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode))
+		rc = vmtruncate(inode, attr->ia_size);
 
-	/* If inode_setattr's call to ext4_truncate failed to get a
-	 * transaction handle at all, we need to clean up the in-core
-	 * orphan list manually. */
+	if (!rc) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+	}
+
+	/*
+	 * If the call to ext4_truncate failed to get a transaction handle at
+	 * all, we need to clean up the in-core orphan list manually.
+	 */
 	if (inode->i_nlink)
 		ext4_orphan_del(NULL, inode);
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f03afd9c44bc..6c023a3b5d25 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -991,18 +991,29 @@ fail:
 
 static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
+	struct inode *inode = &ip->i_inode;
 	struct buffer_head *dibh;
 	int error;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		error = inode_setattr(&ip->i_inode, attr);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
 	}
-	return error;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	gfs2_assert_warn(GFS2_SB(inode), !error);
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+	return 0;
 }
 
 /**
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 98cdd05f3316..d7d410a4ca42 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1136,8 +1136,16 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (error)
 		goto out_end_trans;
 
-	error = inode_setattr(inode, attr);
-	gfs2_assert_warn(sdp, !error);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_assert_warn(sdp, !error);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 82f93da00d1b..776af6eb4bcb 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,6 +1296,7 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
+	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_location el;
 	struct buffer_head *dibh;
@@ -1321,14 +1322,25 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 		return error;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		error = inode_setattr(&ip->i_inode, attr);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
+	if (error)
+		goto out_trans_end;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_assert_warn(GFS2_SB(inode), !error);
 	}
 
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+
+out_trans_end:
 	gfs2_trans_end(sdp);
 	return error;
 }
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 8df18e63eb6b..87de671baa83 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -612,10 +612,16 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 			attr->ia_mode = inode->i_mode & ~S_IWUGO;
 		attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
 	}
-	error = inode_setattr(inode, attr);
-	if (error)
-		return error;
 
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 	return 0;
 }
 
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d6ebe53fbdbf..654c5a8ddf1c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -298,7 +298,17 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
-	return inode_setattr(inode, attr);
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct inode_operations hfsplus_file_inode_operations = {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 87ac1891a185..7943ff11d489 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -849,13 +849,14 @@ int hostfs_permission(struct inode *ino, int desired)
 
 int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
+	struct inode *inode = dentry->d_inode;
 	struct hostfs_iattr attrs;
 	char *name;
 	int err;
 
-	int fd = HOSTFS_I(dentry->d_inode)->fd;
+	int fd = HOSTFS_I(inode)->fd;
 
-	err = inode_change_ok(dentry->d_inode, attr);
+	err = inode_change_ok(inode, attr);
 	if (err)
 		return err;
 
@@ -905,7 +906,18 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err)
 		return err;
 
-	return inode_setattr(dentry->d_inode, attr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		if (err)
+			return err;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct inode_operations hostfs_iops = {
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1042a9bc97f3..3f3b397fd4e6 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,15 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		goto out_unlock;
 
-	error = inode_setattr(inode, attr);
-	if (error)
-		goto out_unlock;
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 
 	hpfs_write_inode(inode);
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a4e9a7ec3691..d5f019d48b09 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -448,19 +448,20 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	error = inode_change_ok(inode, attr);
 	if (error)
-		goto out;
+		return error;
 
 	if (ia_valid & ATTR_SIZE) {
 		error = -EINVAL;
-		if (!(attr->ia_size & ~huge_page_mask(h)))
-			error = hugetlb_vmtruncate(inode, attr->ia_size);
+		if (attr->ia_size & ~huge_page_mask(h))
+			return -EINVAL;
+		error = hugetlb_vmtruncate(inode, attr->ia_size);
 		if (error)
-			goto out;
-		attr->ia_valid &= ~ATTR_SIZE;
+			return error;
 	}
-	error = inode_setattr(inode, attr);
-out:
-	return error;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 127263cc8657..c5ce6c1d1ff4 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -17,6 +17,7 @@
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/quotaops.h>
 #include "jfs_incore.h"
@@ -107,11 +108,18 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 			return rc;
 	}
 
-	rc = inode_setattr(inode, iattr);
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		rc = vmtruncate(inode, iattr->ia_size);
+		if (rc)
+			return rc;
+	}
 
-	if (!rc && (iattr->ia_valid & ATTR_MODE))
-		rc = jfs_acl_chmod(inode);
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
 
+	if (iattr->ia_valid & ATTR_MODE)
+		rc = jfs_acl_chmod(inode);
 	return rc;
 }
 
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index abe1cafbd4c2..23b4d03bbd26 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -232,15 +232,19 @@ static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = dentry->d_inode;
 	int err = 0;
 
-	if (attr->ia_valid & ATTR_SIZE)
+	if (attr->ia_valid & ATTR_SIZE) {
 		err = logfs_truncate(inode, attr->ia_size);
-	attr->ia_valid &= ~ATTR_SIZE;
+		if (err)
+			return err;
+	}
 
-	if (!err)
-		err = inode_change_ok(inode, attr);
-	if (!err)
-		err = inode_setattr(inode, attr);
-	return err;
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations logfs_reg_iops = {
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 7a45dd1fe2e5..4493ce695ab8 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -31,7 +31,17 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
-	return inode_setattr(inode, attr);
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations minix_file_inode_operations = {
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index fa3385154023..b4e8aaae14be 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -924,9 +924,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 				tmpattr.ia_valid = ATTR_MODE;
 				tmpattr.ia_mode = attr->ia_mode;
 
-				result = inode_setattr(inode, &tmpattr);
-				if (result)
-					goto out;
+				setattr_copy(inode, &tmpattr);
+				mark_inode_dirty(inode);
 			}
 		}
 #endif
@@ -954,15 +953,12 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		result = ncp_make_closed(inode);
 		if (result)
 			goto out;
-		{
-			struct iattr tmpattr;
-			
-			tmpattr.ia_valid = ATTR_SIZE;
-			tmpattr.ia_size = attr->ia_size;
-			
-			result = inode_setattr(inode, &tmpattr);
+
+		if (attr->ia_size != i_size_read(inode)) {
+			result = vmtruncate(inode, attr->ia_size);
 			if (result)
 				goto out;
+			mark_inode_dirty(inode);
 		}
 	}
 	if ((attr->ia_valid & ATTR_CTIME) != 0) {
@@ -1002,8 +998,12 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 			NCP_FINFO(inode)->nwattr = info.attributes;
 #endif
 	}
-	if (!result)
-		result = inode_setattr(inode, attr);
+	if (result)
+		goto out;
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
 out:
 	unlock_kernel();
 	return result;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 5c694ece172e..051d279abb37 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -656,14 +656,27 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	err = nilfs_transaction_begin(sb, &ti, 0);
 	if (unlikely(err))
 		return err;
-	err = inode_setattr(inode, iattr);
-	if (!err && (iattr->ia_valid & ATTR_MODE))
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		err = vmtruncate(inode, iattr->ia_size);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	if (iattr->ia_valid & ATTR_MODE) {
 		err = nilfs_acl_chmod(inode);
-	if (likely(!err))
-		err = nilfs_transaction_commit(sb);
-	else
-		nilfs_transaction_abort(sb);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	return nilfs_transaction_commit(sb);
 
+out_err:
+	nilfs_transaction_abort(sb);
 	return err;
 }
 
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4b57fb1eac2a..fdef8f729c3a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2879,9 +2879,6 @@ void ntfs_truncate_vfs(struct inode *vi) {
  *
  * Called with ->i_mutex held.  For the ATTR_SIZE (i.e. ->truncate) case, also
  * called with ->i_alloc_sem held for writing.
- *
- * Basically this is a copy of generic notify_change() and inode_setattr()
- * functionality, except we intercept and abort changes in i_size.
  */
 int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b83d6107a1f5..85e4ccaedd1f 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -214,10 +214,12 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
 
 	attr->ia_valid &= ~ATTR_SIZE;
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
+	if (error)
+		return error;
 
-	return error;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2b10b36d1577..584cf8ac167a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1238,13 +1238,21 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	 * Otherwise, we could get into problems with truncate as
 	 * ip_alloc_sem is used there to protect against i_size
 	 * changes.
+	 *
+	 * XXX: this means the conditional below can probably be removed.
 	 */
-	status = inode_setattr(inode, attr);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_commit;
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		status = vmtruncate(inode, attr->ia_size);
+		if (status) {
+			mlog_errno(status);
+			goto bail_commit;
+		}
 	}
 
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
 	status = ocfs2_mark_inode_dirty(handle, inode, bh);
 	if (status < 0)
 		mlog_errno(status);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 78c9f0c1a2f3..5542c284dc1c 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -349,7 +349,17 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
-	return inode_setattr(inode, attr);
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations omfs_file_inops = {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef80ea4f..a49d9dd06d1d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -561,9 +561,19 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
-	return error;
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct inode_operations proc_def_inode_operations = {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2791907744ed..dd29f0337661 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -12,6 +12,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
@@ -258,17 +259,22 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 
 	error = inode_change_ok(inode, iattr);
 	if (error)
-		goto out;
+		return error;
 
-	error = inode_setattr(inode, iattr);
-	if (error)
-		goto out;
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
 	
 	de->uid = inode->i_uid;
 	de->gid = inode->i_gid;
 	de->mode = inode->i_mode;
-out:
-	return error;
+	return 0;
 }
 
 static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 6ff9981f0a18..5be436ea088e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -329,10 +329,19 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
 
-	return error;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 045729f5674a..2b8dc5c22867 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3134,55 +3134,62 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	error = inode_change_ok(inode, attr);
-	if (!error) {
-		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-			error = reiserfs_chown_xattrs(inode, attr);
+	if (error)
+		goto out;
 
-			if (!error) {
-				struct reiserfs_transaction_handle th;
-				int jbegin_count =
-				    2 *
-				    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
-				     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
-				    2;
-
-				/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
-				error =
-				    journal_begin(&th, inode->i_sb,
-						  jbegin_count);
-				if (error)
-					goto out;
-				error = dquot_transfer(inode, attr);
-				if (error) {
-					journal_end(&th, inode->i_sb,
-						    jbegin_count);
-					goto out;
-				}
-				/* Update corresponding info in inode so that everything is in
-				 * one transaction */
-				if (attr->ia_valid & ATTR_UID)
-					inode->i_uid = attr->ia_uid;
-				if (attr->ia_valid & ATTR_GID)
-					inode->i_gid = attr->ia_gid;
-				mark_inode_dirty(inode);
-				error =
-				    journal_end(&th, inode->i_sb, jbegin_count);
-			}
-		}
-		if (!error) {
-			/*
-			 * Relax the lock here, as it might truncate the
-			 * inode pages and wait for inode pages locks.
-			 * To release such page lock, the owner needs the
-			 * reiserfs lock
-			 */
-			reiserfs_write_unlock_once(inode->i_sb, depth);
-			error = inode_setattr(inode, attr);
-			depth = reiserfs_write_lock_once(inode->i_sb);
+	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+		struct reiserfs_transaction_handle th;
+		int jbegin_count =
+		    2 *
+		    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
+		     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
+		    2;
+
+		error = reiserfs_chown_xattrs(inode, attr);
+
+		if (error)
+			return error;
+
+		/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
+		error = journal_begin(&th, inode->i_sb, jbegin_count);
+		if (error)
+			goto out;
+		error = dquot_transfer(inode, attr);
+		if (error) {
+			journal_end(&th, inode->i_sb, jbegin_count);
+			goto out;
 		}
+
+		/* Update corresponding info in inode so that everything is in
+		 * one transaction */
+		if (attr->ia_valid & ATTR_UID)
+			inode->i_uid = attr->ia_uid;
+		if (attr->ia_valid & ATTR_GID)
+			inode->i_gid = attr->ia_gid;
+		mark_inode_dirty(inode);
+		error = journal_end(&th, inode->i_sb, jbegin_count);
+		if (error)
+			goto out;
 	}
 
+	/*
+	 * Relax the lock here, as it might truncate the
+	 * inode pages and wait for inode pages locks.
+	 * To release such page lock, the owner needs the
+	 * reiserfs lock
+	 */
+	reiserfs_write_unlock_once(inode->i_sb, depth);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode))
+		error = vmtruncate(inode, attr->ia_size);
+
+	if (!error) {
+		setattr_copy(inode, attr);
+		mark_inode_dirty(inode);
+	}
+	depth = reiserfs_write_lock_once(inode->i_sb);
+
 	if (!error && reiserfs_posixacl(inode->i_sb)) {
 		if (attr->ia_valid & ATTR_MODE)
 			error = reiserfs_acl_chmod(inode);
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 94f6319292a1..0a65939508e9 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -38,7 +38,17 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
-	return inode_setattr(inode, attr);
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations sysv_file_inode_operations = {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7376032c89ce..04bb5bf07630 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -236,7 +236,17 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
-	return inode_setattr(inode, attr);
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 589e01a465ba..085e11623b7b 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -525,7 +525,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 		if (error)
 			return error;
 	}
-	return inode_setattr(inode, attr);
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations ufs_file_inode_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8ebb5f01a418..6ecb83c00a6d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2392,7 +2392,6 @@ extern int buffer_migrate_page(struct address_space *,
 
 extern int inode_change_ok(const struct inode *, struct iattr *);
 extern int inode_newsize_ok(const struct inode *, loff_t offset);
-extern int __must_check inode_setattr(struct inode *, const struct iattr *);
 extern void setattr_copy(struct inode *inode, const struct iattr *attr);
 
 extern void file_update_time(struct file *file);
-- 
cgit v1.2.3-59-g8ed1b


From 2c27c65ed0696f0b5df2dad2cf6462d72164d547 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 4 Jun 2010 11:30:04 +0200
Subject: check ATTR_SIZE contraints in inode_change_ok

Make sure we check the truncate constraints early on in ->setattr by adding
those checks to inode_change_ok.  Also clean up and document inode_change_ok
to make this obvious.

As a fallout we don't have to call inode_newsize_ok from simple_setsize and
simplify it down to a truncate_setsize which doesn't return an error.  This
simplifies a lot of setattr implementations and means we use truncate_setsize
almost everywhere.  Get rid of fat_setsize now that it's trivial and mark
ext2_setsize static to make the calling convention obvious.

Keep the inode_newsize_ok in vmtruncate for now as all callers need an
audit for its removal anyway.

Note: setattr code in ecryptfs doesn't call inode_change_ok at all and
needs a deeper audit, but that is left for later.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/adfs/inode.c       |  5 +----
 fs/attr.c             | 44 ++++++++++++++++++++++++++++++--------------
 fs/ecryptfs/inode.c   | 18 ++++++++++++++----
 fs/ext2/inode.c       | 12 ++----------
 fs/fat/fat.h          |  1 -
 fs/fat/file.c         | 17 ++---------------
 fs/fuse/dir.c         |  6 +-----
 fs/gfs2/aops.c        |  4 ++--
 fs/gfs2/ops_inode.c   |  6 ++----
 fs/jffs2/fs.c         |  4 ++--
 fs/libfs.c            | 51 ++-------------------------------------------------
 fs/ocfs2/file.c       |  6 +++---
 fs/ramfs/file-nommu.c |  5 ++---
 fs/smbfs/inode.c      |  4 +---
 fs/ubifs/file.c       | 23 ++++++++---------------
 fs/ubifs/ubifs.h      |  2 +-
 fs/ufs/truncate.c     | 11 +++--------
 include/linux/fs.h    |  1 -
 include/linux/mm.h    |  1 +
 mm/shmem.c            |  5 ++---
 mm/truncate.c         | 38 +++++++++++++++++++++++++++++---------
 21 files changed, 108 insertions(+), 156 deletions(-)

(limited to 'include/linux')

diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index b3dec193036b..65794b8fe79e 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -333,10 +333,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	/* XXX: this is missing some actual on-disk truncation.. */
 	if (ia_valid & ATTR_SIZE)
-		error = simple_setsize(inode, attr->ia_size);
-
-	if (error)
-		goto out;
+		truncate_setsize(inode, attr->ia_size);
 
 	if (ia_valid & ATTR_MTIME) {
 		inode->i_mtime = attr->ia_mtime;
diff --git a/fs/attr.c b/fs/attr.c
index ed44d8ae8bf1..7ca41811afa1 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -14,35 +14,53 @@
 #include <linux/fcntl.h>
 #include <linux/security.h>
 
-/* Taken over from the old code... */
-
-/* POSIX UID/GID verification for setting inode attributes. */
+/**
+ * inode_change_ok - check if attribute changes to an inode are allowed
+ * @inode:	inode to check
+ * @attr:	attributes to change
+ *
+ * Check if we are allowed to change the attributes contained in @attr
+ * in the given inode.  This includes the normal unix access permission
+ * checks, as well as checks for rlimits and others.
+ *
+ * Should be called as the first thing in ->setattr implementations,
+ * possibly after taking additional locks.
+ */
 int inode_change_ok(const struct inode *inode, struct iattr *attr)
 {
-	int retval = -EPERM;
 	unsigned int ia_valid = attr->ia_valid;
 
+	/*
+	 * First check size constraints.  These can't be overriden using
+	 * ATTR_FORCE.
+	 */
+	if (ia_valid & ATTR_SIZE) {
+		int error = inode_newsize_ok(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
 	/* If force is set do it anyway. */
 	if (ia_valid & ATTR_FORCE)
-		goto fine;
+		return 0;
 
 	/* Make sure a caller can chown. */
 	if ((ia_valid & ATTR_UID) &&
 	    (current_fsuid() != inode->i_uid ||
 	     attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
-		goto error;
+		return -EPERM;
 
 	/* Make sure caller can chgrp. */
 	if ((ia_valid & ATTR_GID) &&
 	    (current_fsuid() != inode->i_uid ||
 	    (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
 	    !capable(CAP_CHOWN))
-		goto error;
+		return -EPERM;
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
 		if (!is_owner_or_cap(inode))
-			goto error;
+			return -EPERM;
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
 				inode->i_gid) && !capable(CAP_FSETID))
@@ -52,12 +70,10 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
 		if (!is_owner_or_cap(inode))
-			goto error;
+			return -EPERM;
 	}
-fine:
-	retval = 0;
-error:
-	return retval;
+
+	return 0;
 }
 EXPORT_SYMBOL(inode_change_ok);
 
@@ -113,7 +129,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
  *
  * setattr_copy updates the inode's metadata with that specified
  * in attr. Noticably missing is inode size update, which is more complex
- * as it requires pagecache updates. See simple_setsize.
+ * as it requires pagecache updates.
  *
  * The inode is not marked as dirty after this operation. The rationale is
  * that for "simple" filesystems, the struct inode is the inode storage.
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 31ef5252f0fe..82900b063b1e 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -804,10 +804,20 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		size_t num_zeros = (PAGE_CACHE_SIZE
 				    - (ia->ia_size & ~PAGE_CACHE_MASK));
 
+
+		/*
+		 * XXX(truncate) this should really happen at the begginning
+		 * of ->setattr.  But the code is too messy to that as part
+		 * of a larger patch.  ecryptfs is also totally missing out
+		 * on the inode_change_ok check at the beginning of
+		 * ->setattr while would include this.
+		 */
+		rc = inode_newsize_ok(inode, ia->ia_size);
+		if (rc)
+			goto out;
+
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-			rc = simple_setsize(inode, ia->ia_size);
-			if (rc)
-				goto out;
+			truncate_setsize(inode, ia->ia_size);
 			lower_ia->ia_size = ia->ia_size;
 			lower_ia->ia_valid |= ATTR_SIZE;
 			goto out;
@@ -830,7 +840,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 				goto out;
 			}
 		}
-		simple_setsize(inode, ia->ia_size);
+		truncate_setsize(inode, ia->ia_size);
 		rc = ecryptfs_write_inode_size_to_metadata(inode);
 		if (rc) {
 			printk(KERN_ERR	"Problem with "
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7dee7b3f3688..069620b30d4d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1156,15 +1156,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
 	__ext2_truncate_blocks(inode, offset);
 }
 
-int ext2_setsize(struct inode *inode, loff_t newsize)
+static int ext2_setsize(struct inode *inode, loff_t newsize)
 {
-	loff_t oldsize;
 	int error;
 
-	error = inode_newsize_ok(inode, newsize);
-	if (error)
-		return error;
-
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 	    S_ISLNK(inode->i_mode)))
 		return -EINVAL;
@@ -1184,10 +1179,7 @@ int ext2_setsize(struct inode *inode, loff_t newsize)
 	if (error)
 		return error;
 
-	oldsize = inode->i_size;
-	i_size_write(inode, newsize);
-	truncate_pagecache(inode, oldsize, newsize);
-
+	truncate_setsize(inode, newsize);
 	__ext2_truncate_blocks(inode, newsize);
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 27ac25725954..d75a77f85c28 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -306,7 +306,6 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern int fat_setsize(struct inode *inode, loff_t offset);
 extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b2eedcee7516..7257752b6d5d 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -364,18 +364,6 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 	return 0;
 }
 
-int fat_setsize(struct inode *inode, loff_t offset)
-{
-	int error;
-
-	error = simple_setsize(inode, offset);
-	if (error)
-		return error;
-	fat_truncate_blocks(inode, offset);
-
-	return error;
-}
-
 #define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
 /* valid file mode bits */
 #define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -441,9 +429,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		error = fat_setsize(inode, attr->ia_size);
-		if (error)
-			goto out;
+		truncate_setsize(inode, attr->ia_size);
+		fat_truncate_blocks(inode, attr->ia_size);
 	}
 
 	setattr_copy(inode, attr);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 43a9b3730a98..3978a42d4f04 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1280,12 +1280,8 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc)
 		return 0;
 
-	if (attr->ia_valid & ATTR_SIZE) {
-		err = inode_newsize_ok(inode, attr->ia_size);
-		if (err)
-			return err;
+	if (attr->ia_valid & ATTR_SIZE)
 		is_truncate = true;
-	}
 
 	req = fuse_get_req(fc);
 	if (IS_ERR(req))
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 703000d6e4d2..54fe087bf54c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -702,12 +702,12 @@ out:
 	page_cache_release(page);
 
 	/*
-	 * XXX(hch): the call below should probably be replaced with
+	 * XXX(truncate): the call below should probably be replaced with
 	 * a call to the gfs2-specific truncate blocks helper to actually
 	 * release disk blocks..
 	 */
 	if (pos + len > ip->i_inode.i_size)
-		simple_setsize(&ip->i_inode, ip->i_inode.i_size);
+		truncate_setsize(&ip->i_inode, ip->i_inode.i_size);
 out_endtrans:
 	gfs2_trans_end(sdp);
 out_trans_fail:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d7d410a4ca42..1009be2c9737 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1072,7 +1072,7 @@ int gfs2_permission(struct inode *inode, int mask)
 }
 
 /*
- * XXX: should be changed to have proper ordering by opencoding simple_setsize
+ * XXX(truncate): the truncate_setsize calls should be moved to the end.
  */
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
@@ -1084,10 +1084,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
 		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 		if (error)
 			return error;
-		error = simple_setsize(inode, attr->ia_size);
+		truncate_setsize(inode, attr->ia_size);
 		gfs2_trans_end(sdp);
-		if (error) 
-			return error;
 	}
 
 	error = gfs2_truncatei(ip, attr->ia_size);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 459d39d1ea0b..1b2426604fe3 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	mutex_unlock(&f->sem);
 	jffs2_complete_reservation(c);
 
-	/* We have to do the simple_setsize() without f->sem held, since
+	/* We have to do the truncate_setsize() without f->sem held, since
 	   some pages may be locked and waiting for it in readpage().
 	   We are protected from a simultaneous write() extending i_size
 	   back past iattr->ia_size, because do_truncate() holds the
 	   generic inode semaphore. */
 	if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
-		simple_setsize(inode, iattr->ia_size);
+		truncate_setsize(inode, iattr->ia_size);
 		inode->i_blocks = (inode->i_size + 511) >> 9;
 	}	
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 40562224b718..0a9da95317f7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -326,49 +326,6 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return 0;
 }
 
-/**
- * simple_setsize - handle core mm and vfs requirements for file size change
- * @inode: inode
- * @newsize: new file size
- *
- * Returns 0 on success, -error on failure.
- *
- * simple_setsize must be called with inode_mutex held.
- *
- * simple_setsize will check that the requested new size is OK (see
- * inode_newsize_ok), and then will perform the necessary i_size update
- * and pagecache truncation (if necessary). It will be typically be called
- * from the filesystem's setattr function when ATTR_SIZE is passed in.
- *
- * The inode itself must have correct permissions and attributes to allow
- * i_size to be changed, this function then just checks that the new size
- * requested is valid.
- *
- * In the case of simple in-memory filesystems with inodes stored solely
- * in the inode cache, and file data in the pagecache, nothing more needs
- * to be done to satisfy a truncate request. Filesystems with on-disk
- * blocks for example will need to free them in the case of truncate, in
- * that case it may be easier not to use simple_setsize (but each of its
- * components will likely be required at some point to update pagecache
- * and inode etc).
- */
-int simple_setsize(struct inode *inode, loff_t newsize)
-{
-	loff_t oldsize;
-	int error;
-
-	error = inode_newsize_ok(inode, newsize);
-	if (error)
-		return error;
-
-	oldsize = inode->i_size;
-	i_size_write(inode, newsize);
-	truncate_pagecache(inode, oldsize, newsize);
-
-	return error;
-}
-EXPORT_SYMBOL(simple_setsize);
-
 /**
  * simple_setattr - setattr for simple filesystem
  * @dentry: dentry
@@ -394,12 +351,8 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		return error;
 
-	if (iattr->ia_valid & ATTR_SIZE) {
-		error = simple_setsize(inode, iattr->ia_size);
-		if (error)
-			return error;
-	}
-
+	if (iattr->ia_valid & ATTR_SIZE)
+		truncate_setsize(inode, iattr->ia_size);
 	setattr_copy(inode, iattr);
 	mark_inode_dirty(inode);
 	return 0;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 584cf8ac167a..81296b4e3646 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1233,7 +1233,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	/*
-	 * This will intentionally not wind up calling simple_setsize(),
+	 * This will intentionally not wind up calling truncate_setsize(),
 	 * since all the work for a size change has been done above.
 	 * Otherwise, we could get into problems with truncate as
 	 * ip_alloc_sem is used there to protect against i_size
@@ -2308,12 +2308,12 @@ relock:
 			 * blocks outside i_size. Trim these off again.
 			 * Don't need i_size_read because we hold i_mutex.
 			 *
-			 * XXX(hch): this looks buggy because ocfs2 did not
+			 * XXX(truncate): this looks buggy because ocfs2 did not
 			 * actually implement ->truncate.  Take a look at
 			 * the new truncate sequence and update this accordingly
 			 */
 			if (*ppos + count > inode->i_size)
-				simple_setsize(inode, inode->i_size);
+				truncate_setsize(inode, inode->i_size);
 			ret = written;
 			goto out_dio;
 		}
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 8d44f0347b27..9eead2c796b7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -146,9 +146,8 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
 			return ret;
 	}
 
-	ret = simple_setsize(inode, newsize);
-
-	return ret;
+	truncate_setsize(inode, newsize);
+	return 0;
 }
 
 /*****************************************************************************/
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 9551cb6f7fe4..e338f0a5a70d 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -714,9 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
 		error = server->ops->truncate(inode, attr->ia_size);
 		if (error)
 			goto out;
-		error = simple_setsize(inode, attr->ia_size);
-		if (error)
-			goto out;
+		truncate_setsize(inode, attr->ia_size);
 		refresh = 1;
 	}
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 12f445cee9f7..03ae894c45de 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,14 +967,15 @@ static int do_writepage(struct page *page, int len)
  * the page locked, and it locks @ui_mutex. However, write-back does take inode
  * @i_mutex, which means other VFS operations may be run on this inode at the
  * same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'simple_setsize()', which first changes @inode->i_size, then
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
  * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
+ * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
  * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
  * means that @inode->i_size is changed while @ui_mutex is unlocked.
  *
- * XXX: with the new truncate the above is not true anymore, the simple_setsize
- * calls can be replaced with the individual components.
+ * XXX(truncate): with the new truncate sequence this is not true anymore,
+ * and the calls to truncate_setsize can be move around freely.  They should
+ * be moved to the very end of the truncate sequence.
  *
  * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
  * inode size. How do we do this if @inode->i_size may became smaller while we
@@ -1128,9 +1129,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 		budgeted = 0;
 	}
 
-	err = simple_setsize(inode, new_size);
-	if (err)
-		goto out_budg;
+	truncate_setsize(inode, new_size);
 
 	if (offset) {
 		pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
@@ -1217,16 +1216,14 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 
 	if (attr->ia_valid & ATTR_SIZE) {
 		dbg_gen("size %lld -> %lld", inode->i_size, new_size);
-		err = simple_setsize(inode, new_size);
-		if (err)
-			goto out;
+		truncate_setsize(inode, new_size);
 	}
 
 	mutex_lock(&ui->ui_mutex);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* Truncation changes inode [mc]time */
 		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
-		/* 'simple_setsize()' changed @i_size, update @ui_size */
+		/* 'truncate_setsize()' changed @i_size, update @ui_size */
 		ui->ui_size = inode->i_size;
 	}
 
@@ -1248,10 +1245,6 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	if (IS_SYNC(inode))
 		err = inode->i_sb->s_op->write_inode(inode, NULL);
 	return err;
-
-out:
-	ubifs_release_budget(c, &req);
-	return err;
 }
 
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 04310878f449..0c9876b396dd 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
  * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
  * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
  * make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
  * with 'ubifs_writepage()' (see file.c). All the other inode fields are
  * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
  * could consider to rework locking and base it on "shadow" fields.
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 085e11623b7b..34d5cb135320 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -500,11 +500,6 @@ out:
 	return err;
 }
 
-/*
- * TODO:
- *	- truncate case should use proper ordering instead of using
- *	  simple_setsize
- */
 int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -518,9 +513,9 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
 		loff_t old_i_size = inode->i_size;
 
-		error = simple_setsize(inode, attr->ia_size);
-		if (error)
-			return error;
+		/* XXX(truncate): truncate_setsize should be called last */
+		truncate_setsize(inode, attr->ia_size);
+
 		error = ufs_truncate(inode, old_i_size);
 		if (error)
 			return error;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6ecb83c00a6d..5547b1b027db 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2355,7 +2355,6 @@ extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
 extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
-extern int simple_setsize(struct inode *, loff_t);
 extern int noop_fsync(struct file *, int);
 extern int simple_empty(struct dentry *);
 extern int simple_readpage(struct file *file, struct page *page);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2b48041b910..980164ea10ee 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -815,6 +815,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 }
 
 extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
+extern void truncate_setsize(struct inode *inode, loff_t newsize);
 extern int vmtruncate(struct inode *inode, loff_t offset);
 extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 0a43505eeaec..33222ba256fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -805,11 +805,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 			}
 		}
 
-		error = simple_setsize(inode, newsize);
+		/* XXX(truncate): truncate_setsize should be called last */
+		truncate_setsize(inode, newsize);
 		if (page)
 			page_cache_release(page);
-		if (error)
-			return error;
 		shmem_truncate_range(inode, newsize, (loff_t)-1);
 	}
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 937571b8b233..ba887bff48c5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -540,29 +540,49 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
 }
 EXPORT_SYMBOL(truncate_pagecache);
 
+/**
+ * truncate_setsize - update inode and pagecache for a new file size
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * truncate_setsize updastes i_size update and performs pagecache
+ * truncation (if necessary) for a file size updates. It will be
+ * typically be called from the filesystem's setattr function when
+ * ATTR_SIZE is passed in.
+ *
+ * Must be called with inode_mutex held and after all filesystem
+ * specific block truncation has been performed.
+ */
+void truncate_setsize(struct inode *inode, loff_t newsize)
+{
+	loff_t oldsize;
+
+	oldsize = inode->i_size;
+	i_size_write(inode, newsize);
+
+	truncate_pagecache(inode, oldsize, newsize);
+}
+EXPORT_SYMBOL(truncate_setsize);
+
 /**
  * vmtruncate - unmap mappings "freed" by truncate() syscall
  * @inode: inode of the file used
  * @offset: file offset to start truncating
  *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page.  Ugly, but necessary.
- *
- * This function is deprecated and simple_setsize or truncate_pagecache
- * should be used instead.
+ * This function is deprecated and truncate_setsize or truncate_pagecache
+ * should be used instead, together with filesystem specific block truncation.
  */
 int vmtruncate(struct inode *inode, loff_t offset)
 {
 	int error;
 
-	error = simple_setsize(inode, offset);
+	error = inode_newsize_ok(inode, offset);
 	if (error)
 		return error;
 
+	truncate_setsize(inode, offset);
 	if (inode->i_op->truncate)
 		inode->i_op->truncate(inode);
-
-	return error;
+	return 0;
 }
 EXPORT_SYMBOL(vmtruncate);
-- 
cgit v1.2.3-59-g8ed1b


From b5fc510c48f631882ccec3c0f02a25d5b67de09f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 4 Jul 2010 12:24:09 +0400
Subject: get rid of file_fsync()

Copy and simplify in the only two users remaining.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfs/inode.c              | 26 +++++++++++++++++++++++++-
 fs/hfsplus/hfsplus_fs.h     |  1 +
 fs/hfsplus/inode.c          | 27 ++++++++++++++++++++++++++-
 fs/hfsplus/super.c          |  2 +-
 fs/sync.c                   | 25 -------------------------
 include/linux/buffer_head.h |  1 -
 6 files changed, 53 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 87de671baa83..93ceec8fbb8f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -625,6 +625,30 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 	return 0;
 }
 
+static int hfs_file_fsync(struct file *filp, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block * sb;
+	int ret, err;
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	if (sb->s_dirt) {
+		lock_super(sb);
+		sb->s_dirt = 0;
+		if (!(sb->s_flags & MS_RDONLY))
+			hfs_mdb_commit(sb);
+		unlock_super(sb);
+	}
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
 
 static const struct file_operations hfs_file_operations = {
 	.llseek		= generic_file_llseek,
@@ -634,7 +658,7 @@ static const struct file_operations hfs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.splice_read	= generic_file_splice_read,
-	.fsync		= file_fsync,
+	.fsync		= hfs_file_fsync,
 	.open		= hfs_file_open,
 	.release	= hfs_file_release,
 };
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 6505c30ad965..dc856be3c2b0 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -351,6 +351,7 @@ int hfsplus_show_options(struct seq_file *, struct vfsmount *);
 
 /* super.c */
 struct inode *hfsplus_iget(struct super_block *, unsigned long);
+int hfsplus_sync_fs(struct super_block *sb, int wait);
 
 /* tables.c */
 extern u16 hfsplus_case_fold_table[];
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 654c5a8ddf1c..c5a979d62c65 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -311,6 +311,31 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
+static int hfsplus_file_fsync(struct file *filp, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block * sb;
+	int ret, err;
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	if (sb->s_dirt) {
+		if (!(sb->s_flags & MS_RDONLY))
+			hfsplus_sync_fs(sb, 1);
+		else
+			sb->s_dirt = 0;
+	}
+
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
 static const struct inode_operations hfsplus_file_inode_operations = {
 	.lookup		= hfsplus_file_lookup,
 	.truncate	= hfsplus_file_truncate,
@@ -328,7 +353,7 @@ static const struct file_operations hfsplus_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.splice_read	= generic_file_splice_read,
-	.fsync		= file_fsync,
+	.fsync		= hfsplus_file_fsync,
 	.open		= hfsplus_file_open,
 	.release	= hfsplus_file_release,
 	.unlocked_ioctl = hfsplus_ioctl,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 74b473a8ef92..a32c241e4e45 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -154,7 +154,7 @@ static void hfsplus_clear_inode(struct inode *inode)
 	}
 }
 
-static int hfsplus_sync_fs(struct super_block *sb, int wait)
+int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
diff --git a/fs/sync.c b/fs/sync.c
index 15aa6f03b2da..ba76b9623e7e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -128,31 +128,6 @@ void emergency_sync(void)
 	}
 }
 
-/*
- * Generic function to fsync a file.
- */
-int file_fsync(struct file *filp, int datasync)
-{
-	struct inode *inode = filp->f_mapping->host;
-	struct super_block * sb;
-	int ret, err;
-
-	/* sync the inode to buffers */
-	ret = write_inode_now(inode, 0);
-
-	/* sync the superblock to buffers */
-	sb = inode->i_sb;
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-
-	/* .. finally sync the buffers to disk */
-	err = sync_blockdev(sb->s_bdev);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-EXPORT_SYMBOL(file_fsync);
-
 /**
  * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:		file to sync
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3f69054f86d9..620f1d1088cb 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -225,7 +225,6 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 void block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
-int file_fsync(struct file *, int);
 int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
 				struct page **, void **, get_block_t*);
 int nobh_write_end(struct file *, struct address_space *,
-- 
cgit v1.2.3-59-g8ed1b


From a4ffdde6e56fdf8c34ddadc2674d6eb978083369 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Jun 2010 17:38:30 -0400
Subject: simplify checks for I_CLEAR/I_FREEING

add I_CLEAR instead of replacing I_FREEING with it.  I_CLEAR is
equivalent to I_FREEING for almost all code looking at either;
it's there to keep track of having called clear_inode() exactly
once per inode lifetime, at some point after having set I_FREEING.
I_CLEAR and I_FREEING never get set at the same time with the
current code, so we can switch to setting i_flags to I_FREEING | I_CLEAR
instead of I_CLEAR without loss of information.  As the result of
such change, checks become simpler and the amount of code that needs
to know about I_CLEAR shrinks a lot.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c            |  2 +-
 fs/drop_caches.c            |  2 +-
 fs/fs-writeback.c           |  8 ++++----
 fs/gfs2/inode.c             |  2 +-
 fs/inode.c                  | 16 ++++++++--------
 fs/nilfs2/gcdat.c           |  2 +-
 fs/notify/inode_mark.c      |  6 +++---
 fs/notify/inotify/inotify.c |  7 +++----
 fs/quota/dquot.c            |  2 +-
 fs/xfs/linux-2.6/xfs_iops.c |  4 ++--
 include/linux/fs.h          |  4 ++--
 11 files changed, 27 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7f9e0536db1a..95eac0116963 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3860,7 +3860,7 @@ again:
 			p = &parent->rb_right;
 		else {
 			WARN_ON(!(entry->vfs_inode.i_state &
-				  (I_WILL_FREE | I_FREEING | I_CLEAR)));
+				  (I_WILL_FREE | I_FREEING)));
 			rb_erase(parent, &root->inode_tree);
 			RB_CLEAR_NODE(parent);
 			spin_unlock(&root->inode_lock);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 83c4f600786a..2195c213ab2f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		if (inode->i_mapping->nrpages == 0)
 			continue;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d5be1693ac93..7608880b5c58 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -352,7 +352,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
-	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
+	if (!(inode->i_state & I_FREEING)) {
 		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
 			/*
 			 * More pages get dirtied by a fast dirtier.
@@ -499,7 +499,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		if (inode_dirtied_after(inode, wbc->wb_start))
 			return 1;
 
-		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
+		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
@@ -935,7 +935,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			if (hlist_unhashed(&inode->i_hash))
 				goto out;
 		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
+		if (inode->i_state & I_FREEING)
 			goto out;
 
 		/*
@@ -1001,7 +1001,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6c023a3b5d25..08140f185a37 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -84,7 +84,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
 	struct gfs2_skip_data *data = opaque;
 
 	if (ip->i_no_addr == data->no_addr) {
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
+		if (inode->i_state & (I_FREEING|I_WILL_FREE)){
 			data->skipped = 1;
 			return 0;
 		}
diff --git a/fs/inode.c b/fs/inode.c
index 722860b323a9..71fe079ca32a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -317,7 +317,7 @@ void clear_inode(struct inode *inode)
 		bd_forget(inode);
 	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
 		cd_forget(inode);
-	inode->i_state = I_CLEAR;
+	inode->i_state = I_FREEING | I_CLEAR;
 }
 EXPORT_SYMBOL(clear_inode);
 
@@ -553,7 +553,7 @@ repeat:
 			continue;
 		if (!test(inode, data))
 			continue;
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
+		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
@@ -578,7 +578,7 @@ repeat:
 			continue;
 		if (inode->i_sb != sb)
 			continue;
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
+		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
@@ -840,7 +840,7 @@ EXPORT_SYMBOL(iunique);
 struct inode *igrab(struct inode *inode)
 {
 	spin_lock(&inode_lock);
-	if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)))
+	if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
 		__iget(inode);
 	else
 		/*
@@ -1089,7 +1089,7 @@ int insert_inode_locked(struct inode *inode)
 				continue;
 			if (old->i_sb != sb)
 				continue;
-			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+			if (old->i_state & (I_FREEING|I_WILL_FREE))
 				continue;
 			break;
 		}
@@ -1128,7 +1128,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 				continue;
 			if (!test(old, data))
 				continue;
-			if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+			if (old->i_state & (I_FREEING|I_WILL_FREE))
 				continue;
 			break;
 		}
@@ -1218,7 +1218,7 @@ void generic_delete_inode(struct inode *inode)
 	hlist_del_init(&inode->i_hash);
 	spin_unlock(&inode_lock);
 	wake_up_inode(inode);
-	BUG_ON(inode->i_state != I_CLEAR);
+	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
 	destroy_inode(inode);
 }
 EXPORT_SYMBOL(generic_delete_inode);
@@ -1322,7 +1322,7 @@ static inline void iput_final(struct inode *inode)
 void iput(struct inode *inode)
 {
 	if (inode) {
-		BUG_ON(inode->i_state == I_CLEAR);
+		BUG_ON(inode->i_state & I_CLEAR);
 
 		if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
 			iput_final(inode);
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index dd5f7e0a95f6..84a45d1d5464 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -78,7 +78,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
 	struct inode *gcdat = nilfs->ns_gc_dat;
 	struct nilfs_inode_info *gii = NILFS_I(gcdat);
 
-	gcdat->i_state = I_CLEAR;
+	gcdat->i_state = I_FREEING | I_CLEAR;
 	gii->i_flags = 0;
 
 	nilfs_palloc_clear_cache(gcdat);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0399bcbe09c8..152b83ec005d 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -369,11 +369,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		struct inode *need_iput_tmp;
 
 		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
+		 * We cannot __iget() an inode in state I_FREEING,
 		 * I_WILL_FREE, or I_NEW which is fine because by that point
 		 * the inode cannot have any associated watches.
 		 */
-		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 
 		/*
@@ -397,7 +397,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		/* In case the dropping of a reference would nuke next_i. */
 		if ((&next_i->i_sb_list != list) &&
 		    atomic_read(&next_i->i_count) &&
-		    !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
+		    !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
 			__iget(next_i);
 			need_iput = next_i;
 		}
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 27b75ebc7460..cf6b0429a257 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -377,11 +377,11 @@ void inotify_unmount_inodes(struct list_head *list)
 		struct list_head *watches;
 
 		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
+		 * We cannot __iget() an inode in state I_FREEING,
 		 * I_WILL_FREE, or I_NEW which is fine because by that point
 		 * the inode cannot have any associated watches.
 		 */
-		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 
 		/*
@@ -403,8 +403,7 @@ void inotify_unmount_inodes(struct list_head *list)
 		/* In case the dropping of a reference would nuke next_i. */
 		if ((&next_i->i_sb_list != list) &&
 				atomic_read(&next_i->i_count) &&
-				!(next_i->i_state & (I_CLEAR | I_FREEING |
-					I_WILL_FREE))) {
+				!(next_i->i_state & (I_FREEING|I_WILL_FREE))) {
 			__iget(next_i);
 			need_iput = next_i;
 		}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 437d2ca2de97..5cec3e2348f1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -885,7 +885,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 62dd349facee..68be25dcd301 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -80,7 +80,7 @@ xfs_mark_inode_dirty_sync(
 {
 	struct inode	*inode = VFS_I(ip);
 
-	if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
+	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty_sync(inode);
 }
 
@@ -90,7 +90,7 @@ xfs_mark_inode_dirty(
 {
 	struct inode	*inode = VFS_I(ip);
 
-	if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
+	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty(inode);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5547b1b027db..218693d8d446 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1616,8 +1616,8 @@ struct super_operations {
  * I_FREEING		Set when inode is about to be freed but still has dirty
  *			pages or buffers attached or the inode itself is still
  *			dirty.
- * I_CLEAR		Set by clear_inode().  In this state the inode is clean
- *			and can be destroyed.
+ * I_CLEAR		Added by clear_inode().  In this state the inode is clean
+ *			and can be destroyed.  Inode keeps I_FREEING.
  *
  *			Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
  *			prohibited for many purposes.  iget() must wait for
-- 
cgit v1.2.3-59-g8ed1b


From be7ce4161f9e6bf2497f90337d1214aa6ee06e15 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Jun 2010 19:40:39 -0400
Subject: New method - evict_inode()

Hybrid of ->clear_inode() and ->delete_inode(); if present, does
all fs work to be done when in-core inode is about to be gone,
for whatever reason.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 4 +++-
 include/linux/fs.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 60cb25969762..474a72f571a4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -325,7 +325,9 @@ static void evict(struct inode *inode, int delete)
 {
 	const struct super_operations *op = inode->i_sb->s_op;
 
-	if (delete && op->delete_inode) {
+	if (op->evict_inode) {
+		op->evict_inode(inode);
+	} else if (delete && op->delete_inode) {
 		op->delete_inode(inode);
 	} else {
 		if (inode->i_data.nrpages)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 218693d8d446..ce50be4b0b41 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1563,6 +1563,7 @@ struct super_operations {
    	void (*dirty_inode) (struct inode *);
 	int (*write_inode) (struct inode *, struct writeback_control *wbc);
 	void (*drop_inode) (struct inode *);
+	void (*evict_inode) (struct inode *);
 	void (*delete_inode) (struct inode *);
 	void (*put_super) (struct super_block *);
 	void (*write_super) (struct super_block *);
-- 
cgit v1.2.3-59-g8ed1b


From c6287315cb958e740466555ca5e9d007f25b39bd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Jun 2010 19:56:17 -0400
Subject: generic_detach_inode() can be static now

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 3 +--
 include/linux/fs.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 474a72f571a4..256e620c6416 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1234,7 +1234,7 @@ EXPORT_SYMBOL(generic_delete_inode);
  *
  *	Returns 1 if inode should be completely destroyed.
  */
-int generic_detach_inode(struct inode *inode)
+static int generic_detach_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
@@ -1264,7 +1264,6 @@ int generic_detach_inode(struct inode *inode)
 	spin_unlock(&inode_lock);
 	return 1;
 }
-EXPORT_SYMBOL_GPL(generic_detach_inode);
 
 static void generic_forget_inode(struct inode *inode)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ce50be4b0b41..e0ecb8e75ebf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2167,7 +2167,6 @@ extern ino_t iunique(struct super_block *, ino_t);
 extern int inode_needs_sync(struct inode *inode);
 extern void generic_delete_inode(struct inode *inode);
 extern void generic_drop_inode(struct inode *inode);
-extern int generic_detach_inode(struct inode *inode);
 
 extern struct inode *ilookup5_nowait(struct super_block *sb,
 		unsigned long hashval, int (*test)(struct inode *, void *),
-- 
cgit v1.2.3-59-g8ed1b


From b0683aa638b3326c6fc22e5290dfa75e08bd83f5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Jun 2010 20:55:25 -0400
Subject: new helper: end_writeback()

Essentially, the minimal variant of ->evict_inode().  It's
a trimmed-down clear_inode(), sans any fs callbacks.  Once
it returns we know that no async writeback will be happening;
every ->evict_inode() instance should do that once and do that
before doing anything ->write_inode() could interfere with
(e.g. freeing the on-disk inode).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hugetlbfs/inode.c |  2 +-
 fs/inode.c           | 12 ++++++++++++
 include/linux/fs.h   |  1 +
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index bf1a2f400e70..6e5bd42f3860 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -374,7 +374,7 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
 	truncate_hugepages(inode, 0);
-	clear_inode(inode);
+	end_writeback(inode);
 }
 
 static inline void
diff --git a/fs/inode.c b/fs/inode.c
index 9aff7deaf816..93e7a5ecbc26 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -294,6 +294,18 @@ void __iget(struct inode *inode)
 	inodes_stat.nr_unused--;
 }
 
+void end_writeback(struct inode *inode)
+{
+	might_sleep();
+	BUG_ON(inode->i_data.nrpages);
+	BUG_ON(!list_empty(&inode->i_data.private_list));
+	BUG_ON(!(inode->i_state & I_FREEING));
+	BUG_ON(inode->i_state & I_CLEAR);
+	inode_sync_wait(inode);
+	inode->i_state = I_FREEING | I_CLEAR;
+}
+EXPORT_SYMBOL(end_writeback);
+
 /**
  * clear_inode - clear an inode
  * @inode: inode to clear
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e0ecb8e75ebf..3c23c1dcb1bd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2184,6 +2184,7 @@ extern void unlock_new_inode(struct inode *);
 extern void __iget(struct inode * inode);
 extern void iget_failed(struct inode *);
 extern void clear_inode(struct inode *);
+extern void end_writeback(struct inode *);
 extern void destroy_inode(struct inode *);
 extern void __destroy_inode(struct inode *);
 extern struct inode *new_inode(struct super_block *);
-- 
cgit v1.2.3-59-g8ed1b


From ac14a95b5239d37b6082c3791b88d7ab4e8e444c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 Jun 2010 07:08:19 -0400
Subject: convert ext3 to ->evict_inode()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/ialloc.c        | 12 ------------
 fs/ext3/inode.c         | 37 +++++++++++++++++++++++++++----------
 fs/ext3/super.c         | 14 +-------------
 include/linux/ext3_fs.h |  2 +-
 4 files changed, 29 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 498021eb88fb..4ab72db3559e 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -119,20 +119,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	ino = inode->i_ino;
 	ext3_debug ("freeing inode %lu\n", ino);
 
-	/*
-	 * Note: we must free any quota before locking the superblock,
-	 * as writing the quota to disk may need the lock as well.
-	 */
-	dquot_initialize(inode);
-	ext3_xattr_delete_inode(handle, inode);
-	dquot_free_inode(inode);
-	dquot_drop(inode);
-
 	is_directory = S_ISDIR(inode->i_mode);
 
-	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode (inode);
-
 	es = EXT3_SB(sb)->s_es;
 	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
 		ext3_error (sb, "ext3_free_inode",
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b04d11936683..cc55cecf9fbc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -190,18 +190,28 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
 }
 
 /*
- * Called at the last iput() if i_nlink is zero.
+ * Called at inode eviction from icache
  */
-void ext3_delete_inode (struct inode * inode)
+void ext3_evict_inode (struct inode *inode)
 {
+	struct ext3_block_alloc_info *rsv;
 	handle_t *handle;
+	int want_delete = 0;
 
-	if (!is_bad_inode(inode))
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
 		dquot_initialize(inode);
+		want_delete = 1;
+	}
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (is_bad_inode(inode))
+	ext3_discard_reservation(inode);
+	rsv = EXT3_I(inode)->i_block_alloc_info;
+	EXT3_I(inode)->i_block_alloc_info = NULL;
+	if (unlikely(rsv))
+		kfree(rsv);
+
+	if (!want_delete)
 		goto no_delete;
 
 	handle = start_transaction(inode);
@@ -238,15 +248,22 @@ void ext3_delete_inode (struct inode * inode)
 	 * having errors), but we can't free the inode if the mark_dirty
 	 * fails.
 	 */
-	if (ext3_mark_inode_dirty(handle, inode))
-		/* If that failed, just do the required in-core inode clear. */
-		clear_inode(inode);
-	else
+	if (ext3_mark_inode_dirty(handle, inode)) {
+		/* If that failed, just dquot_drop() and be done with that */
+		dquot_drop(inode);
+		end_writeback(inode);
+	} else {
+		ext3_xattr_delete_inode(handle, inode);
+		dquot_free_inode(inode);
+		dquot_drop(inode);
+		end_writeback(inode);
 		ext3_free_inode(handle, inode);
+	}
 	ext3_journal_stop(handle);
 	return;
 no_delete:
-	clear_inode(inode);	/* We must guarantee clearing of inode... */
+	end_writeback(inode);
+	dquot_drop(inode);
 }
 
 typedef struct {
@@ -2564,7 +2581,7 @@ out_stop:
 	 * If this was a simple ftruncate(), and the file will remain alive
 	 * then we need to clear up the orphan record which we created above.
 	 * However, if this was a real unlink then we were called by
-	 * ext3_delete_inode(), and we allow that function to clean up the
+	 * ext3_evict_inode(), and we allow that function to clean up the
 	 * orphan info for us.
 	 */
 	if (inode->i_nlink)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6c953bb255e7..a951fd5c081c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -527,17 +527,6 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ext3_inode_cachep);
 }
 
-static void ext3_clear_inode(struct inode *inode)
-{
-	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
-
-	dquot_drop(inode);
-	ext3_discard_reservation(inode);
-	EXT3_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
-}
-
 static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
 {
 #if defined(CONFIG_QUOTA)
@@ -783,14 +772,13 @@ static const struct super_operations ext3_sops = {
 	.destroy_inode	= ext3_destroy_inode,
 	.write_inode	= ext3_write_inode,
 	.dirty_inode	= ext3_dirty_inode,
-	.delete_inode	= ext3_delete_inode,
+	.evict_inode	= ext3_evict_inode,
 	.put_super	= ext3_put_super,
 	.sync_fs	= ext3_sync_fs,
 	.freeze_fs	= ext3_freeze,
 	.unfreeze_fs	= ext3_unfreeze,
 	.statfs		= ext3_statfs,
 	.remount_fs	= ext3_remount,
-	.clear_inode	= ext3_clear_inode,
 	.show_options	= ext3_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext3_quota_read,
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 7fc62d4550b2..e7cb21766992 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -896,7 +896,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 extern struct inode *ext3_iget(struct super_block *, unsigned long);
 extern int  ext3_write_inode (struct inode *, struct writeback_control *);
 extern int  ext3_setattr (struct dentry *, struct iattr *);
-extern void ext3_delete_inode (struct inode *);
+extern void ext3_evict_inode (struct inode *);
 extern int  ext3_sync_inode (handle_t *, struct inode *);
 extern void ext3_discard_reservation (struct inode *);
 extern void ext3_dirty_inode(struct inode *);
-- 
cgit v1.2.3-59-g8ed1b


From c103135c14e03fc9a9e5f0adc01df9ad272cf2a1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 Jun 2010 22:31:14 -0400
Subject: new helper: __dentry_path()

builds path relative to fs root, called under dcache_lock,
doesn't append any nonsense to unlinked ones.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 27 ++++++++++++++++++++++-----
 include/linux/dcache.h |  1 +
 2 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 86d4db15473e..caf08574982f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2049,16 +2049,12 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
 /*
  * Write full pathname from the root of the filesystem into the buffer.
  */
-char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
 	char *end = buf + buflen;
 	char *retval;
 
-	spin_lock(&dcache_lock);
 	prepend(&end, &buflen, "\0", 1);
-	if (d_unlinked(dentry) &&
-		(prepend(&end, &buflen, "//deleted", 9) != 0))
-			goto Elong;
 	if (buflen < 1)
 		goto Elong;
 	/* Get '/' right */
@@ -2076,7 +2072,28 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 		retval = end;
 		dentry = parent;
 	}
+	return retval;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+EXPORT_SYMBOL(__dentry_path);
+
+char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+	char *p = NULL;
+	char *retval;
+
+	spin_lock(&dcache_lock);
+	if (d_unlinked(dentry)) {
+		p = buf + buflen;
+		if (prepend(&p, &buflen, "//deleted", 10) != 0)
+			goto Elong;
+		buflen++;
+	}
+	retval = __dentry_path(dentry, buf, buflen);
 	spin_unlock(&dcache_lock);
+	if (!IS_ERR(retval) && p)
+		*p = '/';	/* restore '/' overriden with '\0' */
 	return retval;
 Elong:
 	spin_unlock(&dcache_lock);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index eebb617c17d8..d23be0386e2d 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -315,6 +315,7 @@ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 
 extern char *__d_path(const struct path *path, struct path *root, char *, int);
 extern char *d_path(const struct path *, char *, int);
+extern char *__dentry_path(struct dentry *, char *, int);
 extern char *dentry_path(struct dentry *, char *, int);
 
 /* Allocation counts.. */
-- 
cgit v1.2.3-59-g8ed1b


From 845a2cc0507055278e0fa722ed0f8c791b7401dd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Jun 2010 11:37:37 -0400
Subject: convert reiserfs to ->evict_inode()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/inode.c         | 13 ++++++++++---
 fs/reiserfs/super.c         |  8 +-------
 include/linux/reiserfs_fs.h |  2 +-
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 46ba1cfc2df3..a94e08b339fc 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -25,7 +25,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 int reiserfs_prepare_write(struct file *f, struct page *page,
 			   unsigned from, unsigned to);
 
-void reiserfs_delete_inode(struct inode *inode)
+void reiserfs_evict_inode(struct inode *inode)
 {
 	/* We need blocks for transaction + (user+group) quota update (possibly delete) */
 	int jbegin_count =
@@ -35,10 +35,12 @@ void reiserfs_delete_inode(struct inode *inode)
 	int depth;
 	int err;
 
-	if (!is_bad_inode(inode))
+	if (!inode->i_nlink && !is_bad_inode(inode))
 		dquot_initialize(inode);
 
 	truncate_inode_pages(&inode->i_data, 0);
+	if (inode->i_nlink)
+		goto no_delete;
 
 	depth = reiserfs_write_lock_once(inode->i_sb);
 
@@ -77,9 +79,14 @@ void reiserfs_delete_inode(struct inode *inode)
 		;
 	}
       out:
-	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
+	end_writeback(inode);	/* note this must go after the journal_end to prevent deadlock */
+	dquot_drop(inode);
 	inode->i_blocks = 0;
 	reiserfs_write_unlock_once(inode->i_sb, depth);
+
+no_delete:
+	end_writeback(inode);
+	dquot_drop(inode);
 }
 
 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1e1ee9056eb6..e15ff612002d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -591,11 +591,6 @@ out:
 	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 }
 
-static void reiserfs_clear_inode(struct inode *inode)
-{
-	dquot_drop(inode);
-}
-
 #ifdef CONFIG_QUOTA
 static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
 				    size_t, loff_t);
@@ -608,8 +603,7 @@ static const struct super_operations reiserfs_sops = {
 	.destroy_inode = reiserfs_destroy_inode,
 	.write_inode = reiserfs_write_inode,
 	.dirty_inode = reiserfs_dirty_inode,
-	.clear_inode = reiserfs_clear_inode,
-	.delete_inode = reiserfs_delete_inode,
+	.evict_inode = reiserfs_evict_inode,
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
 	.sync_fs = reiserfs_sync_fs,
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 3b603f474186..2a464ae147ce 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -2033,7 +2033,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
 				struct reiserfs_iget_args *args);
 int reiserfs_find_actor(struct inode *inode, void *p);
 int reiserfs_init_locked_inode(struct inode *inode, void *p);
-void reiserfs_delete_inode(struct inode *inode);
+void reiserfs_evict_inode(struct inode *inode);
 int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 int reiserfs_get_block(struct inode *inode, sector_t block,
 		       struct buffer_head *bh_result, int create);
-- 
cgit v1.2.3-59-g8ed1b


From 07958f9f5b9e8422c15368a1733a52ea99009896 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Jun 2010 13:20:09 -0400
Subject: ->delete_inode() is gone

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 2 --
 include/linux/fs.h | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 93e7a5ecbc26..7a1bea9cb8ee 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -335,8 +335,6 @@ static void evict(struct inode *inode, int delete)
 
 	if (op->evict_inode) {
 		op->evict_inode(inode);
-	} else if (delete && op->delete_inode) {
-		op->delete_inode(inode);
 	} else {
 		if (inode->i_data.nrpages)
 			truncate_inode_pages(&inode->i_data, 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3c23c1dcb1bd..2b1254771e46 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1564,7 +1564,6 @@ struct super_operations {
 	int (*write_inode) (struct inode *, struct writeback_control *wbc);
 	void (*drop_inode) (struct inode *);
 	void (*evict_inode) (struct inode *);
-	void (*delete_inode) (struct inode *);
 	void (*put_super) (struct super_block *);
 	void (*write_super) (struct super_block *);
 	int (*sync_fs)(struct super_block *sb, int wait);
-- 
cgit v1.2.3-59-g8ed1b


From 30140837f256558c943636245ab90897a9455a70 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Jun 2010 13:23:20 -0400
Subject: fs/inode.c:clear_inode() is gone

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 28 ++++------------------------
 include/linux/fs.h |  1 -
 2 files changed, 4 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 8320bef7177e..82ca3562a688 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -306,29 +306,6 @@ void end_writeback(struct inode *inode)
 }
 EXPORT_SYMBOL(end_writeback);
 
-/**
- * clear_inode - clear an inode
- * @inode: inode to clear
- *
- * This is called by the filesystem to tell us
- * that the inode is no longer useful. We just
- * terminate it with extreme prejudice.
- */
-void clear_inode(struct inode *inode)
-{
-	might_sleep();
-	invalidate_inode_buffers(inode);
-
-	BUG_ON(inode->i_data.nrpages);
-	BUG_ON(!(inode->i_state & I_FREEING));
-	BUG_ON(inode->i_state & I_CLEAR);
-	inode_sync_wait(inode);
-	if (inode->i_sb->s_op->clear_inode)
-		inode->i_sb->s_op->clear_inode(inode);
-	inode->i_state = I_FREEING | I_CLEAR;
-}
-EXPORT_SYMBOL(clear_inode);
-
 static void evict(struct inode *inode)
 {
 	const struct super_operations *op = inode->i_sb->s_op;
@@ -338,7 +315,10 @@ static void evict(struct inode *inode)
 	} else {
 		if (inode->i_data.nrpages)
 			truncate_inode_pages(&inode->i_data, 0);
-		clear_inode(inode);
+		invalidate_inode_buffers(inode);
+		end_writeback(inode);
+		if (op->clear_inode)
+			op->clear_inode(inode);
 	}
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
 		bd_forget(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2b1254771e46..4eaa6b2e35db 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2182,7 +2182,6 @@ extern void unlock_new_inode(struct inode *);
 
 extern void __iget(struct inode * inode);
 extern void iget_failed(struct inode *);
-extern void clear_inode(struct inode *);
 extern void end_writeback(struct inode *);
 extern void destroy_inode(struct inode *);
 extern void __destroy_inode(struct inode *);
-- 
cgit v1.2.3-59-g8ed1b


From 45321ac54316eaeeebde0b5f728a1791e500974c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Jun 2010 13:43:19 -0400
Subject: Make ->drop_inode() just return whether inode needs to be dropped

... and let iput_final() do the actual eviction or retention

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/staging/pohmelfs/inode.c |   4 +-
 fs/btrfs/ctree.h                 |   2 +-
 fs/btrfs/inode.c                 |  11 ++--
 fs/cifs/cifsfs.c                 |   9 ++--
 fs/gfs2/super.c                  |   4 +-
 fs/inode.c                       | 113 ++++++++++++---------------------------
 fs/logfs/inode.c                 |   4 +-
 fs/ocfs2/inode.c                 |   8 +--
 fs/ocfs2/inode.h                 |   2 +-
 include/linux/fs.h               |   6 +--
 10 files changed, 60 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index e818f53ccfd7..100e3a3c1b10 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -1223,7 +1223,7 @@ void pohmelfs_fill_inode(struct inode *inode, struct netfs_inode_info *info)
 	}
 }
 
-static void pohmelfs_drop_inode(struct inode *inode)
+static int pohmelfs_drop_inode(struct inode *inode)
 {
 	struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb);
 	struct pohmelfs_inode *pi = POHMELFS_I(inode);
@@ -1232,7 +1232,7 @@ static void pohmelfs_drop_inode(struct inode *inode)
 	list_del_init(&pi->inode_entry);
 	spin_unlock(&psb->ino_lock);
 
-	generic_drop_inode(inode);
+	return generic_drop_inode(inode);
 }
 
 static struct pohmelfs_inode *pohmelfs_get_inode_from_list(struct pohmelfs_sb *psb,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 394d5422ab6a..eaf286abad17 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2395,7 +2395,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
-void btrfs_drop_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ce02199ec4e5..2c54f04a0bf5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3943,7 +3943,7 @@ again:
 			if (atomic_read(&inode->i_count) > 1)
 				d_prune_aliases(inode);
 			/*
-			 * btrfs_drop_inode will remove it from
+			 * btrfs_drop_inode will have it removed from
 			 * the inode cache when its usage count
 			 * hits zero.
 			 */
@@ -6337,13 +6337,14 @@ free:
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-void btrfs_drop_inode(struct inode *inode)
+int btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
-		generic_delete_inode(inode);
+
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return 1;
 	else
-		generic_drop_inode(inode);
+		return generic_drop_inode(inode);
 }
 
 static void init_once(void *foo)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8a2cf129e535..20914f5627dd 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -480,14 +480,13 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-void cifs_drop_inode(struct inode *inode)
+static int cifs_drop_inode(struct inode *inode)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-		return generic_drop_inode(inode);
-
-	return generic_delete_inode(inode);
+	/* no serverino => unconditional eviction */
+	return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
+		generic_drop_inode(inode);
 }
 
 static const struct super_operations cifs_super_ops = {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 555f5a417c67..fa865ab37f12 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1191,7 +1191,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
  * node for later deallocation.
  */
 
-static void gfs2_drop_inode(struct inode *inode)
+static int gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
@@ -1200,7 +1200,7 @@ static void gfs2_drop_inode(struct inode *inode)
 		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
 			clear_nlink(inode);
 	}
-	generic_drop_inode(inode);
+	return generic_drop_inode(inode);
 }
 
 static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
diff --git a/fs/inode.c b/fs/inode.c
index 82ca3562a688..0e077619cbf6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1183,58 +1183,51 @@ void remove_inode_hash(struct inode *inode)
 }
 EXPORT_SYMBOL(remove_inode_hash);
 
+int generic_delete_inode(struct inode *inode)
+{
+	return 1;
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
 /*
- * Tell the filesystem that this inode is no longer of any interest and should
- * be completely destroyed.
- *
- * We leave the inode in the inode hash table until *after* the filesystem's
- * ->delete_inode completes.  This ensures that an iget (such as nfsd might
- * instigate) will always find up-to-date information either in the hash or on
- * disk.
- *
- * I_FREEING is set so that no-one will take a new reference to the inode while
- * it is being deleted.
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
  */
-void generic_delete_inode(struct inode *inode)
+int generic_drop_inode(struct inode *inode)
 {
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	WARN_ON(inode->i_state & I_NEW);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-
-	evict(inode);
-
-	spin_lock(&inode_lock);
-	hlist_del_init(&inode->i_hash);
-	spin_unlock(&inode_lock);
-	wake_up_inode(inode);
-	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
-	destroy_inode(inode);
+	return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
 }
-EXPORT_SYMBOL(generic_delete_inode);
+EXPORT_SYMBOL_GPL(generic_drop_inode);
 
-/**
- *	generic_detach_inode - remove inode from inode lists
- *	@inode: inode to remove
- *
- *	Remove inode from inode lists, write it if it's dirty. This is just an
- *	internal VFS helper exported for hugetlbfs. Do not use!
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
  *
- *	Returns 1 if inode should be completely destroyed.
+ * Call the FS "drop_inode()" function, defaulting to
+ * the legacy UNIX filesystem behaviour.  If it tells
+ * us to evict inode, do so.  Otherwise, retain inode
+ * in cache if fs is alive, sync and evict if fs is
+ * shutting down.
  */
-static int generic_detach_inode(struct inode *inode)
+static void iput_final(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
+	const struct super_operations *op = inode->i_sb->s_op;
+	int drop;
+
+	if (op && op->drop_inode)
+		drop = op->drop_inode(inode);
+	else
+		drop = generic_drop_inode(inode);
 
-	if (!hlist_unhashed(&inode->i_hash)) {
+	if (!drop) {
 		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
 			list_move(&inode->i_list, &inode_unused);
 		inodes_stat.nr_unused++;
 		if (sb->s_flags & MS_ACTIVE) {
 			spin_unlock(&inode_lock);
-			return 0;
+			return;
 		}
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
@@ -1252,53 +1245,15 @@ static int generic_detach_inode(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
-	return 1;
-}
-
-static void generic_forget_inode(struct inode *inode)
-{
-	if (!generic_detach_inode(inode))
-		return;
 	evict(inode);
+	spin_lock(&inode_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode_lock);
 	wake_up_inode(inode);
+	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
 	destroy_inode(inode);
 }
 
-/*
- * Normal UNIX filesystem behaviour: delete the
- * inode when the usage count drops to zero, and
- * i_nlink is zero.
- */
-void generic_drop_inode(struct inode *inode)
-{
-	if (!inode->i_nlink)
-		generic_delete_inode(inode);
-	else
-		generic_forget_inode(inode);
-}
-EXPORT_SYMBOL_GPL(generic_drop_inode);
-
-/*
- * Called when we're dropping the last reference
- * to an inode.
- *
- * Call the FS "drop()" function, defaulting to
- * the legacy UNIX filesystem behaviour..
- *
- * NOTE! NOTE! NOTE! We're called with the inode lock
- * held, and the drop function is supposed to release
- * the lock!
- */
-static inline void iput_final(struct inode *inode)
-{
-	const struct super_operations *op = inode->i_sb->s_op;
-	void (*drop)(struct inode *) = generic_drop_inode;
-
-	if (op && op->drop_inode)
-		drop = op->drop_inode;
-	drop(inode);
-}
-
 /**
  *	iput	- put an inode
  *	@inode: inode to put
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 78be674d95c8..d8c71ece098f 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -287,7 +287,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 /* called with inode_lock held */
-static void logfs_drop_inode(struct inode *inode)
+static int logfs_drop_inode(struct inode *inode)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
 	struct logfs_inode *li = logfs_inode(inode);
@@ -295,7 +295,7 @@ static void logfs_drop_inode(struct inode *inode)
 	spin_lock(&logfs_inode_lock);
 	list_move(&li->li_freeing_list, &super->s_freeing_list);
 	spin_unlock(&logfs_inode_lock);
-	generic_drop_inode(inode);
+	return generic_drop_inode(inode);
 }
 
 static void logfs_set_ino_generation(struct super_block *sb,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index eb7fd07c90f2..0492464916b1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1194,9 +1194,10 @@ void ocfs2_evict_inode(struct inode *inode)
 /* Called under inode_lock, with no more references on the
  * struct inode, so it's safe here to check the flags field
  * and to manipulate i_nlink without any other locks. */
-void ocfs2_drop_inode(struct inode *inode)
+int ocfs2_drop_inode(struct inode *inode)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int res;
 
 	mlog_entry_void();
 
@@ -1204,11 +1205,12 @@ void ocfs2_drop_inode(struct inode *inode)
 	     (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
 
 	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
-		generic_delete_inode(inode);
+		res = 1;
 	else
-		generic_drop_inode(inode);
+		res = generic_drop_inode(inode);
 
 	mlog_exit_void();
+	return res;
 }
 
 /*
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 975eedd7b243..6de5a869db30 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -124,7 +124,7 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
 }
 
 void ocfs2_evict_inode(struct inode *inode);
-void ocfs2_drop_inode(struct inode *inode);
+int ocfs2_drop_inode(struct inode *inode);
 
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE		0x1
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4eaa6b2e35db..8553adbda57b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1562,7 +1562,7 @@ struct super_operations {
 
    	void (*dirty_inode) (struct inode *);
 	int (*write_inode) (struct inode *, struct writeback_control *wbc);
-	void (*drop_inode) (struct inode *);
+	int (*drop_inode) (struct inode *);
 	void (*evict_inode) (struct inode *);
 	void (*put_super) (struct super_block *);
 	void (*write_super) (struct super_block *);
@@ -2164,8 +2164,8 @@ extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
 extern int inode_needs_sync(struct inode *inode);
-extern void generic_delete_inode(struct inode *inode);
-extern void generic_drop_inode(struct inode *inode);
+extern int generic_delete_inode(struct inode *inode);
+extern int generic_drop_inode(struct inode *inode);
 
 extern struct inode *ilookup5_nowait(struct super_block *sb,
 		unsigned long hashval, int (*test)(struct inode *, void *),
-- 
cgit v1.2.3-59-g8ed1b


From b57922d97fd6f79b6dbe6db0c4fd30d219fa08c1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Jun 2010 14:34:48 -0400
Subject: convert remaining ->clear_inode() to ->evict_inode()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/v9fs_vfs.h             |  2 +-
 fs/9p/vfs_inode.c            |  4 +++-
 fs/9p/vfs_super.c            |  4 ++--
 fs/afs/inode.c               |  5 ++++-
 fs/afs/internal.h            |  2 +-
 fs/afs/super.c               |  2 +-
 fs/binfmt_misc.c             |  5 +++--
 fs/block_dev.c               |  7 +++++--
 fs/cifs/cifsfs.c             |  6 ++++--
 fs/coda/inode.c              |  8 +++++---
 fs/ecryptfs/super.c          |  8 +++++---
 fs/freevxfs/vxfs_extern.h    |  2 +-
 fs/freevxfs/vxfs_inode.c     |  8 +++++---
 fs/freevxfs/vxfs_super.c     |  2 +-
 fs/fuse/inode.c              |  6 ++++--
 fs/hfs/hfs_fs.h              |  2 +-
 fs/hfs/inode.c               |  4 +++-
 fs/hfs/super.c               |  2 +-
 fs/hfsplus/super.c           |  8 +++++---
 fs/inode.c                   |  2 --
 fs/jffs2/fs.c                |  6 ++++--
 fs/jffs2/os-linux.h          |  2 +-
 fs/jffs2/super.c             |  2 +-
 fs/jffs2/xattr.c             |  2 +-
 fs/nfs/inode.c               | 13 +++++++++++--
 fs/nfs/internal.h            |  4 ++--
 fs/nfs/super.c               |  4 ++--
 fs/ntfs/inode.c              |  7 +++++--
 fs/ntfs/inode.h              |  2 +-
 fs/ntfs/super.c              |  2 +-
 fs/ocfs2/dlmfs/dlmfs.c       |  7 +++----
 fs/xfs/linux-2.6/xfs_super.c |  8 +++++---
 fs/xfs/linux-2.6/xfs_trace.h |  2 +-
 include/linux/fs.h           |  3 +--
 34 files changed, 94 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 32ef4009d030..3d056fe01b50 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -52,7 +52,7 @@ void v9fs_destroy_inode(struct inode *inode);
 #endif
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
-void v9fs_clear_inode(struct inode *inode);
+void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4b3ad6ac9a41..b81ce206508d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -387,8 +387,10 @@ error:
  * @inode: inode to release
  *
  */
-void v9fs_clear_inode(struct inode *inode)
+void v9fs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(inode->i_mapping, 0);
+	end_writeback(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
 #ifdef CONFIG_9P_FSCACHE
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index be74d020436e..c6122bf547df 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -257,7 +257,7 @@ static const struct super_operations v9fs_super_ops = {
 	.destroy_inode = v9fs_destroy_inode,
 #endif
 	.statfs = simple_statfs,
-	.clear_inode = v9fs_clear_inode,
+	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
@@ -268,7 +268,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
 	.destroy_inode = v9fs_destroy_inode,
 #endif
 	.statfs = v9fs_statfs,
-	.clear_inode = v9fs_clear_inode,
+	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d00b312e3110..320ffef11574 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -316,7 +316,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 /*
  * clear an AFS inode
  */
-void afs_clear_inode(struct inode *inode)
+void afs_evict_inode(struct inode *inode)
 {
 	struct afs_permits *permits;
 	struct afs_vnode *vnode;
@@ -335,6 +335,9 @@ void afs_clear_inode(struct inode *inode)
 
 	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
 
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+
 	afs_give_up_callback(vnode);
 
 	if (vnode->server) {
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5f679b77ce24..8679089ce9a1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -565,7 +565,7 @@ extern void afs_zap_data(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
 extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int afs_setattr(struct dentry *, struct iattr *);
-extern void afs_clear_inode(struct inode *);
+extern void afs_evict_inode(struct inode *);
 
 /*
  * main.c
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e932e5a3a0c1..9cf80f02da16 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -49,7 +49,7 @@ static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
 	.destroy_inode	= afs_destroy_inode,
-	.clear_inode	= afs_clear_inode,
+	.evict_inode	= afs_evict_inode,
 	.put_super	= afs_put_super,
 	.show_options	= generic_show_options,
 };
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index c4e83537ead7..9e60fd201716 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -502,8 +502,9 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	return inode;
 }
 
-static void bm_clear_inode(struct inode *inode)
+static void bm_evict_inode(struct inode *inode)
 {
+	end_writeback(inode);
 	kfree(inode->i_private);
 }
 
@@ -685,7 +686,7 @@ static const struct file_operations bm_status_operations = {
 
 static const struct super_operations s_ops = {
 	.statfs		= simple_statfs,
-	.clear_inode	= bm_clear_inode,
+	.evict_inode	= bm_evict_inode,
 };
 
 static int bm_fill_super(struct super_block * sb, void * data, int silent)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 63c9d6076205..de7b4d0c7e30 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -426,10 +426,13 @@ static inline void __bd_forget(struct inode *inode)
 	inode->i_mapping = &inode->i_data;
 }
 
-static void bdev_clear_inode(struct inode *inode)
+static void bdev_evict_inode(struct inode *inode)
 {
 	struct block_device *bdev = &BDEV_I(inode)->bdev;
 	struct list_head *p;
+	truncate_inode_pages(&inode->i_data, 0);
+	invalidate_inode_buffers(inode); /* is it needed here? */
+	end_writeback(inode);
 	spin_lock(&bdev_lock);
 	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
 		__bd_forget(list_entry(p, struct inode, i_devices));
@@ -443,7 +446,7 @@ static const struct super_operations bdev_sops = {
 	.alloc_inode = bdev_alloc_inode,
 	.destroy_inode = bdev_destroy_inode,
 	.drop_inode = generic_delete_inode,
-	.clear_inode = bdev_clear_inode,
+	.evict_inode = bdev_evict_inode,
 };
 
 static int bd_get_sb(struct file_system_type *fs_type,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 20914f5627dd..5574a42b7bb6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -330,8 +330,10 @@ cifs_destroy_inode(struct inode *inode)
 }
 
 static void
-cifs_clear_inode(struct inode *inode)
+cifs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	cifs_fscache_release_inode_cookie(inode);
 }
 
@@ -495,7 +497,7 @@ static const struct super_operations cifs_super_ops = {
 	.alloc_inode = cifs_alloc_inode,
 	.destroy_inode = cifs_destroy_inode,
 	.drop_inode	= cifs_drop_inode,
-	.clear_inode	= cifs_clear_inode,
+	.evict_inode	= cifs_evict_inode,
 /*	.delete_inode	= cifs_delete_inode,  */  /* Do not need above
 	function unless later we add lazy close of inodes or unless the
 	kernel forgets to call us with the same number of releases (closes)
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d97f9935a028..6526e6f21ecf 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -35,7 +35,7 @@
 #include "coda_int.h"
 
 /* VFS super_block ops */
-static void coda_clear_inode(struct inode *);
+static void coda_evict_inode(struct inode *);
 static void coda_put_super(struct super_block *);
 static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
 
@@ -93,7 +93,7 @@ static const struct super_operations coda_super_operations =
 {
 	.alloc_inode	= coda_alloc_inode,
 	.destroy_inode	= coda_destroy_inode,
-	.clear_inode	= coda_clear_inode,
+	.evict_inode	= coda_evict_inode,
 	.put_super	= coda_put_super,
 	.statfs		= coda_statfs,
 	.remount_fs	= coda_remount,
@@ -224,8 +224,10 @@ static void coda_put_super(struct super_block *sb)
 	printk("Coda: Bye bye.\n");
 }
 
-static void coda_clear_inode(struct inode *inode)
+static void coda_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	coda_cache_clear_inode(inode);
 }
 
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0435886e4a9f..4b5de6c6e0fa 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -122,7 +122,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
 
 /**
- * ecryptfs_clear_inode
+ * ecryptfs_evict_inode
  * @inode - The ecryptfs inode
  *
  * Called by iput() when the inode reference count reached zero
@@ -131,8 +131,10 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  * on the inode free list. We use this to drop out reference to the
  * lower inode.
  */
-static void ecryptfs_clear_inode(struct inode *inode)
+static void ecryptfs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	iput(ecryptfs_inode_to_lower(inode));
 }
 
@@ -184,6 +186,6 @@ const struct super_operations ecryptfs_sops = {
 	.drop_inode = generic_delete_inode,
 	.statfs = ecryptfs_statfs,
 	.remount_fs = NULL,
-	.clear_inode = ecryptfs_clear_inode,
+	.evict_inode = ecryptfs_evict_inode,
 	.show_options = ecryptfs_show_options
 };
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index 50ab5eecb99b..881aa3d217f0 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -63,7 +63,7 @@ extern void			vxfs_put_fake_inode(struct inode *);
 extern struct vxfs_inode_info *	vxfs_blkiget(struct super_block *, u_long, ino_t);
 extern struct vxfs_inode_info *	vxfs_stiget(struct super_block *, ino_t);
 extern struct inode *		vxfs_iget(struct super_block *, ino_t);
-extern void			vxfs_clear_inode(struct inode *);
+extern void			vxfs_evict_inode(struct inode *);
 
 /* vxfs_lookup.c */
 extern const struct inode_operations	vxfs_dir_inode_ops;
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 03a6ea5e99f7..79d1b4ea13e7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -337,15 +337,17 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
 }
 
 /**
- * vxfs_clear_inode - remove inode from main memory
+ * vxfs_evict_inode - remove inode from main memory
  * @ip:		inode to discard.
  *
  * Description:
- *  vxfs_clear_inode() is called on the final iput and frees the private
+ *  vxfs_evict_inode() is called on the final iput and frees the private
  *  inode area.
  */
 void
-vxfs_clear_inode(struct inode *ip)
+vxfs_evict_inode(struct inode *ip)
 {
+	truncate_inode_pages(&ip->i_data, 0);
+	end_writeback(ip);
 	kmem_cache_free(vxfs_inode_cachep, ip->i_private);
 }
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1e8af939b3e4..1f3ffd93b357 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -61,7 +61,7 @@ static int		vxfs_statfs(struct dentry *, struct kstatfs *);
 static int		vxfs_remount(struct super_block *, int *, char *);
 
 static const struct super_operations vxfs_super_ops = {
-	.clear_inode =		vxfs_clear_inode,
+	.evict_inode =		vxfs_evict_inode,
 	.put_super =		vxfs_put_super,
 	.statfs =		vxfs_statfs,
 	.remount_fs =		vxfs_remount,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ec14d19ce501..da9e6e11374c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -122,8 +122,10 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 	fuse_request_send_noreply(fc, req);
 }
 
-static void fuse_clear_inode(struct inode *inode)
+static void fuse_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (inode->i_sb->s_flags & MS_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
 		struct fuse_inode *fi = get_fuse_inode(inode);
@@ -736,7 +738,7 @@ static const struct export_operations fuse_export_operations = {
 static const struct super_operations fuse_super_operations = {
 	.alloc_inode    = fuse_alloc_inode,
 	.destroy_inode  = fuse_destroy_inode,
-	.clear_inode	= fuse_clear_inode,
+	.evict_inode	= fuse_evict_inode,
 	.drop_inode	= generic_delete_inode,
 	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index fe35e3b626c4..4f55651aaa51 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -193,7 +193,7 @@ extern int hfs_inode_setattr(struct dentry *, struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
 			__be32 log_size, __be32 phys_size, u32 clump_size);
 extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *);
-extern void hfs_clear_inode(struct inode *);
+extern void hfs_evict_inode(struct inode *);
 extern void hfs_delete_inode(struct inode *);
 
 /* attr.c */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 93ceec8fbb8f..397b7adc7ce6 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -531,8 +531,10 @@ out:
 	return NULL;
 }
 
-void hfs_clear_inode(struct inode *inode)
+void hfs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
 		HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
 		iput(HFS_I(inode)->rsrc_inode);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 0a81eb7111f3..34235d4bf08b 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -181,7 +181,7 @@ static const struct super_operations hfs_super_operations = {
 	.alloc_inode	= hfs_alloc_inode,
 	.destroy_inode	= hfs_destroy_inode,
 	.write_inode	= hfs_write_inode,
-	.clear_inode	= hfs_clear_inode,
+	.evict_inode	= hfs_evict_inode,
 	.put_super	= hfs_put_super,
 	.write_super	= hfs_write_super,
 	.sync_fs	= hfs_sync_fs,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index a32c241e4e45..3b55c050c742 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -145,9 +145,11 @@ static int hfsplus_write_inode(struct inode *inode,
 	return ret;
 }
 
-static void hfsplus_clear_inode(struct inode *inode)
+static void hfsplus_evict_inode(struct inode *inode)
 {
-	dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino);
+	dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (HFSPLUS_IS_RSRC(inode)) {
 		HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
 		iput(HFSPLUS_I(inode).rsrc_inode);
@@ -293,7 +295,7 @@ static const struct super_operations hfsplus_sops = {
 	.alloc_inode	= hfsplus_alloc_inode,
 	.destroy_inode	= hfsplus_destroy_inode,
 	.write_inode	= hfsplus_write_inode,
-	.clear_inode	= hfsplus_clear_inode,
+	.evict_inode	= hfsplus_evict_inode,
 	.put_super	= hfsplus_put_super,
 	.write_super	= hfsplus_write_super,
 	.sync_fs	= hfsplus_sync_fs,
diff --git a/fs/inode.c b/fs/inode.c
index 0e077619cbf6..5daeb0b8fb59 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -317,8 +317,6 @@ static void evict(struct inode *inode)
 			truncate_inode_pages(&inode->i_data, 0);
 		invalidate_inode_buffers(inode);
 		end_writeback(inode);
-		if (op->clear_inode)
-			op->clear_inode(inode);
 	}
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
 		bd_forget(inode);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 1b2426604fe3..ac0638f04969 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -225,7 +225,7 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
 
 
-void jffs2_clear_inode (struct inode *inode)
+void jffs2_evict_inode (struct inode *inode)
 {
 	/* We can forget about this inode for now - drop all
 	 *  the nodelists associated with it, etc.
@@ -233,7 +233,9 @@ void jffs2_clear_inode (struct inode *inode)
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 
-	D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
+	D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	jffs2_do_clear_inode(c, f);
 }
 
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 4791aacf3084..00bae7cc2e48 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -171,7 +171,7 @@ extern const struct inode_operations jffs2_symlink_inode_operations;
 int jffs2_setattr (struct dentry *, struct iattr *);
 int jffs2_do_setattr (struct inode *, struct iattr *);
 struct inode *jffs2_iget(struct super_block *, unsigned long);
-void jffs2_clear_inode (struct inode *);
+void jffs2_evict_inode (struct inode *);
 void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 511e2d609d12..662bba099501 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -135,7 +135,7 @@ static const struct super_operations jffs2_super_operations =
 	.write_super =	jffs2_write_super,
 	.statfs =	jffs2_statfs,
 	.remount_fs =	jffs2_remount_fs,
-	.clear_inode =	jffs2_clear_inode,
+	.evict_inode =	jffs2_evict_inode,
 	.dirty_inode =	jffs2_dirty_inode,
 	.sync_fs =	jffs2_sync_fs,
 };
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index d258e261bdc7..9b572ca40a49 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -588,7 +588,7 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
 
 void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
-	/* It's called from jffs2_clear_inode() on inode removing.
+	/* It's called from jffs2_evict_inode() on inode removing.
 	   When an inode with XATTR is removed, those XATTRs must be removed. */
 	struct jffs2_xattr_ref *ref, *_ref;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 099b3518feea..c211b8168e5b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -98,7 +98,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
 	return ino;
 }
 
-void nfs_clear_inode(struct inode *inode)
+static void nfs_clear_inode(struct inode *inode)
 {
 	/*
 	 * The following should never happen...
@@ -110,6 +110,13 @@ void nfs_clear_inode(struct inode *inode)
 	nfs_fscache_release_inode_cookie(inode);
 }
 
+void nfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+	nfs_clear_inode(inode);
+}
+
 /**
  * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
  */
@@ -1338,8 +1345,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  * to open() calls that passed nfs_atomic_lookup, but failed to call
  * nfs_open().
  */
-void nfs4_clear_inode(struct inode *inode)
+void nfs4_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	/* If we are holding a delegation, return it! */
 	nfs_inode_return_delegation_noreclaim(inode);
 	/* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e70f44b9b3f4..f168ebdf7c6d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -213,9 +213,9 @@ extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
 extern int nfs_write_inode(struct inode *, struct writeback_control *);
-extern void nfs_clear_inode(struct inode *);
+extern void nfs_evict_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
-extern void nfs4_clear_inode(struct inode *);
+extern void nfs4_evict_inode(struct inode *);
 #endif
 void nfs_zap_acl_cache(struct inode *inode);
 extern int nfs_wait_bit_killable(void *word);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f9df16de4a56..ef2b7e468a7e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -270,7 +270,7 @@ static const struct super_operations nfs_sops = {
 	.write_inode	= nfs_write_inode,
 	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
-	.clear_inode	= nfs_clear_inode,
+	.evict_inode	= nfs_evict_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
 	.show_stats	= nfs_show_stats,
@@ -340,7 +340,7 @@ static const struct super_operations nfs4_sops = {
 	.write_inode	= nfs_write_inode,
 	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
-	.clear_inode	= nfs4_clear_inode,
+	.evict_inode	= nfs4_evict_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
 	.show_stats	= nfs_show_stats,
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index fdef8f729c3a..93622b175fc7 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2238,7 +2238,7 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
 }
 
 /**
- * ntfs_clear_big_inode - clean up the ntfs specific part of an inode
+ * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
  * @vi:		vfs inode pending annihilation
  *
  * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
@@ -2247,10 +2247,13 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
  *
  * If the MFT record is dirty, we commit it before doing anything else.
  */
-void ntfs_clear_big_inode(struct inode *vi)
+void ntfs_evict_big_inode(struct inode *vi)
 {
 	ntfs_inode *ni = NTFS_I(vi);
 
+	truncate_inode_pages(&vi->i_data, 0);
+	end_writeback(vi);
+
 #ifdef NTFS_RW
 	if (NInoDirty(ni)) {
 		bool was_bad = (is_bad_inode(vi));
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 9a113544605d..2dabf813456c 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -279,7 +279,7 @@ extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
 
 extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
 extern void ntfs_destroy_big_inode(struct inode *inode);
-extern void ntfs_clear_big_inode(struct inode *vi);
+extern void ntfs_evict_big_inode(struct inode *vi);
 
 extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0de1db6cddbf..512806171bfa 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2700,7 +2700,7 @@ static const struct super_operations ntfs_sops = {
 	.put_super	= ntfs_put_super,	/* Syscall: umount. */
 	.statfs		= ntfs_statfs,		/* Syscall: statfs */
 	.remount_fs	= ntfs_remount,		/* Syscall: mount -o remount. */
-	.clear_inode	= ntfs_clear_big_inode,	/* VFS: Called when an inode is
+	.evict_inode	= ntfs_evict_big_inode,	/* VFS: Called when an inode is
 						   removed from memory. */
 	//.umount_begin	= NULL,			/* Forced umount. */
 	.show_options	= ntfs_show_options,	/* Show mount options in
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 85e4ccaedd1f..a43ebb11ad3b 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -357,13 +357,12 @@ static void dlmfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
 }
 
-static void dlmfs_clear_inode(struct inode *inode)
+static void dlmfs_evict_inode(struct inode *inode)
 {
 	int status;
 	struct dlmfs_inode_private *ip;
 
-	if (!inode)
-		return;
+	end_writeback(inode);
 
 	mlog(0, "inode %lu\n", inode->i_ino);
 
@@ -633,7 +632,7 @@ static const struct super_operations dlmfs_ops = {
 	.statfs		= simple_statfs,
 	.alloc_inode	= dlmfs_alloc_inode,
 	.destroy_inode	= dlmfs_destroy_inode,
-	.clear_inode	= dlmfs_clear_inode,
+	.evict_inode	= dlmfs_evict_inode,
 	.drop_inode	= generic_delete_inode,
 };
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 758df94690ed..15c35b62ff14 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1100,13 +1100,15 @@ xfs_fs_write_inode(
 }
 
 STATIC void
-xfs_fs_clear_inode(
+xfs_fs_evict_inode(
 	struct inode		*inode)
 {
 	xfs_inode_t		*ip = XFS_I(inode);
 
-	trace_xfs_clear_inode(ip);
+	trace_xfs_evict_inode(ip);
 
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	XFS_STATS_INC(vn_rele);
 	XFS_STATS_INC(vn_remove);
 	XFS_STATS_DEC(vn_active);
@@ -1622,7 +1624,7 @@ static const struct super_operations xfs_super_operations = {
 	.destroy_inode		= xfs_fs_destroy_inode,
 	.dirty_inode		= xfs_fs_dirty_inode,
 	.write_inode		= xfs_fs_write_inode,
-	.clear_inode		= xfs_fs_clear_inode,
+	.evict_inode		= xfs_fs_evict_inode,
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c657cdca2cd2..be5dffd282a1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -581,7 +581,7 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
 DEFINE_INODE_EVENT(xfs_file_fsync);
 DEFINE_INODE_EVENT(xfs_destroy_inode);
 DEFINE_INODE_EVENT(xfs_write_inode);
-DEFINE_INODE_EVENT(xfs_clear_inode);
+DEFINE_INODE_EVENT(xfs_evict_inode);
 
 DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
 DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8553adbda57b..dec9ac598859 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1571,7 +1571,6 @@ struct super_operations {
 	int (*unfreeze_fs) (struct super_block *);
 	int (*statfs) (struct dentry *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
-	void (*clear_inode) (struct inode *);
 	void (*umount_begin) (struct super_block *);
 
 	int (*show_options)(struct seq_file *, struct vfsmount *);
@@ -1616,7 +1615,7 @@ struct super_operations {
  * I_FREEING		Set when inode is about to be freed but still has dirty
  *			pages or buffers attached or the inode itself is still
  *			dirty.
- * I_CLEAR		Added by clear_inode().  In this state the inode is clean
+ * I_CLEAR		Added by end_writeback().  In this state the inode is clean
  *			and can be destroyed.  Inode keeps I_FREEING.
  *
  *			Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
-- 
cgit v1.2.3-59-g8ed1b


From ebabe9a9001af0af56c0c2780ca1576246e7a74b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 7 Jul 2010 18:53:11 +0200
Subject: pass a struct path to vfs_statfs

We'll need the path to implement the flags field for statvfs support.
We do have it available in all callers except:

 - ecryptfs_statfs.  This one doesn't actually need vfs_statfs but just
   needs to do a caller to the lower filesystem statfs method.
 - sys_ustat.  Add a non-exported statfs_by_dentry helper for it which
   doesn't won't be able to fill out the flags field later on.

In addition rename the helpers for statfs vs fstatfs to do_*statfs instead
of the misleading vfs prefix.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/osf_sys.c |  8 ++++----
 arch/parisc/hpux/sys_hpux.c | 10 ++++-----
 fs/cachefiles/bind.c        |  2 +-
 fs/cachefiles/daemon.c      |  6 +++++-
 fs/compat.c                 | 10 ++++-----
 fs/ecryptfs/super.c         |  6 +++++-
 fs/nfsd/nfs4xdr.c           |  6 +++++-
 fs/nfsd/vfs.c               | 10 +++++++--
 fs/statfs.c                 | 50 +++++++++++++++++++++++----------------------
 include/linux/fs.h          |  3 ++-
 kernel/acct.c               |  2 +-
 11 files changed, 67 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index de9d39717808..88131c6e42e3 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -234,11 +234,11 @@ linux_to_osf_statfs(struct kstatfs *linux_stat, struct osf_statfs __user *osf_st
 }
 
 static int
-do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer,
+do_osf_statfs(struct path *path, struct osf_statfs __user *buffer,
 	      unsigned long bufsiz)
 {
 	struct kstatfs linux_stat;
-	int error = vfs_statfs(dentry, &linux_stat);
+	int error = vfs_statfs(path, &linux_stat);
 	if (!error)
 		error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
 	return error;	
@@ -252,7 +252,7 @@ SYSCALL_DEFINE3(osf_statfs, char __user *, pathname,
 
 	retval = user_path(pathname, &path);
 	if (!retval) {
-		retval = do_osf_statfs(path.dentry, buffer, bufsiz);
+		retval = do_osf_statfs(&path buffer, bufsiz);
 		path_put(&path);
 	}
 	return retval;
@@ -267,7 +267,7 @@ SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd,
 	retval = -EBADF;
 	file = fget(fd);
 	if (file) {
-		retval = do_osf_statfs(file->f_path.dentry, buffer, bufsiz);
+		retval = do_osf_statfs(&file->f_path, buffer, bufsiz);
 		fput(file);
 	}
 	return retval;
diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c
index 92343bd35fa3..ba430a03bc7a 100644
--- a/arch/parisc/hpux/sys_hpux.c
+++ b/arch/parisc/hpux/sys_hpux.c
@@ -145,7 +145,7 @@ static int hpux_ustat(dev_t dev, struct hpux_ustat __user *ubuf)
 	s = user_get_super(dev);
 	if (s == NULL)
 		goto out;
-	err = vfs_statfs(s->s_root, &sbuf);
+	err = statfs_by_dentry(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		goto out;
@@ -186,12 +186,12 @@ struct hpux_statfs {
      int16_t f_pad;
 };
 
-static int vfs_statfs_hpux(struct dentry *dentry, struct hpux_statfs *buf)
+static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 	
-	retval = vfs_statfs(dentry, &st);
+	retval = vfs_statfs(path, &st);
 	if (retval)
 		return retval;
 
@@ -219,7 +219,7 @@ asmlinkage long hpux_statfs(const char __user *pathname,
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct hpux_statfs tmp;
-		error = vfs_statfs_hpux(path.dentry, &tmp);
+		error = do_statfs_hpux(&path, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_put(&path);
@@ -237,7 +237,7 @@ asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_hpux(file->f_path.dentry, &tmp);
+	error = do_statfs_hpux(&file->f_path, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 2906077ac798..a2603e7c0bb5 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -146,7 +146,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
 		goto error_unsupported;
 
 	/* get the cache size and blocksize */
-	ret = vfs_statfs(root, &stats);
+	ret = vfs_statfs(&path, &stats);
 	if (ret < 0)
 		goto error_unsupported;
 
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index c2413561ea75..24eb0d37241a 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -683,6 +683,10 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
 			 unsigned fnr, unsigned bnr)
 {
 	struct kstatfs stats;
+	struct path path = {
+		.mnt	= cache->mnt,
+		.dentry	= cache->mnt->mnt_root,
+	};
 	int ret;
 
 	//_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
@@ -697,7 +701,7 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
 	/* find out how many pages of blockdev are available */
 	memset(&stats, 0, sizeof(stats));
 
-	ret = vfs_statfs(cache->mnt->mnt_root, &stats);
+	ret = vfs_statfs(&path, &stats);
 	if (ret < 0) {
 		if (ret == -EIO)
 			cachefiles_io_error(cache, "statfs failed");
diff --git a/fs/compat.c b/fs/compat.c
index 6490d2134ff3..fc6c2adf2f6b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -266,7 +266,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(path.dentry, &tmp);
+		error = vfs_statfs(&path, &tmp);
 		if (!error)
 			error = put_compat_statfs(buf, &tmp);
 		path_put(&path);
@@ -284,7 +284,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_path.dentry, &tmp);
+	error = vfs_statfs(&file->f_path, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
 	fput(file);
@@ -334,7 +334,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(path.dentry, &tmp);
+		error = vfs_statfs(&path, &tmp);
 		if (!error)
 			error = put_compat_statfs64(buf, &tmp);
 		path_put(&path);
@@ -355,7 +355,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_path.dentry, &tmp);
+	error = vfs_statfs(&file->f_path, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
 	fput(file);
@@ -378,7 +378,7 @@ asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
 	sb = user_get_super(new_decode_dev(dev));
 	if (!sb)
 		return -EINVAL;
-	err = vfs_statfs(sb->s_root, &sbuf);
+	err = statfs_by_dentry(sb->s_root, &sbuf);
 	drop_super(sb);
 	if (err)
 		return err;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 4b5de6c6e0fa..f7fc286a3aa9 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -118,7 +118,11 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
  */
 static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf);
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+	if (!lower_dentry->d_sb->s_op->statfs)
+		return -ENOSYS;
+	return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
 }
 
 /**
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index ac17a7080239..4d6154f66e04 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1756,6 +1756,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	struct nfs4_acl *acl = NULL;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
+	struct path path = {
+		.mnt	= exp->ex_path.mnt,
+		.dentry	= dentry,
+	};
 
 	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
 	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -1776,7 +1780,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			FATTR4_WORD0_MAXNAME)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		err = vfs_statfs(dentry, &statfs);
+		err = vfs_statfs(&path, &statfs);
 		if (err)
 			goto out_nfserr;
 	}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3c111120b619..f6f1a718642f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2019,8 +2019,14 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-	__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-	if (!err && vfs_statfs(fhp->fh_dentry,stat))
+	struct path path = {
+		.mnt	= fhp->fh_export->ex_path.mnt,
+		.dentry	= fhp->fh_dentry,
+	};
+	__be32 err;
+
+	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
+	if (!err && vfs_statfs(&path, stat))
 		err = nfserr_io;
 	return err;
 }
diff --git a/fs/statfs.c b/fs/statfs.c
index 4ef021f3b612..6a305709a4da 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -7,33 +7,35 @@
 #include <linux/security.h>
 #include <linux/uaccess.h>
 
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
 {
-	int retval = -ENODEV;
-
-	if (dentry) {
-		retval = -ENOSYS;
-		if (dentry->d_sb->s_op->statfs) {
-			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(dentry);
-			if (retval)
-				return retval;
-			retval = dentry->d_sb->s_op->statfs(dentry, buf);
-			if (retval == 0 && buf->f_frsize == 0)
-				buf->f_frsize = buf->f_bsize;
-		}
-	}
+	int retval;
+
+	if (!dentry->d_sb->s_op->statfs)
+		return -ENOSYS;
+
+	memset(buf, 0, sizeof(*buf));
+	retval = security_sb_statfs(dentry);
+	if (retval)
+		return retval;
+	retval = dentry->d_sb->s_op->statfs(dentry, buf);
+	if (retval == 0 && buf->f_frsize == 0)
+		buf->f_frsize = buf->f_bsize;
 	return retval;
 }
 
+int vfs_statfs(struct path *path, struct kstatfs *buf)
+{
+	return statfs_by_dentry(path->dentry, buf);
+}
 EXPORT_SYMBOL(vfs_statfs);
 
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+static int do_statfs_native(struct path *path, struct statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(dentry, &st);
+	retval = vfs_statfs(path, &st);
 	if (retval)
 		return retval;
 
@@ -72,12 +74,12 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 	return 0;
 }
 
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+static int do_statfs64(struct path *path, struct statfs64 *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(dentry, &st);
+	retval = vfs_statfs(path, &st);
 	if (retval)
 		return retval;
 
@@ -107,7 +109,7 @@ SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, b
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct statfs tmp;
-		error = vfs_statfs_native(path.dentry, &tmp);
+		error = do_statfs_native(&path, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_put(&path);
@@ -125,7 +127,7 @@ SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct stat
 	error = user_path(pathname, &path);
 	if (!error) {
 		struct statfs64 tmp;
-		error = vfs_statfs64(path.dentry, &tmp);
+		error = do_statfs64(&path, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_put(&path);
@@ -143,7 +145,7 @@ SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
+	error = do_statfs_native(&file->f_path, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -164,7 +166,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs64(file->f_path.dentry, &tmp);
+	error = do_statfs64(&file->f_path, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -183,7 +185,7 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 	if (!s)
 		return -EINVAL;
 
-	err = vfs_statfs(s->s_root, &sbuf);
+	err = statfs_by_dentry(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		return err;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dec9ac598859..9bedf4219f83 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1813,7 +1813,8 @@ extern struct vfsmount *collect_mounts(struct path *);
 extern void drop_collected_mounts(struct vfsmount *);
 extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
 			  struct vfsmount *);
-extern int vfs_statfs(struct dentry *, struct kstatfs *);
+extern int vfs_statfs(struct path *, struct kstatfs *);
+extern int statfs_by_dentry(struct dentry *, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
 
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 	spin_unlock(&acct_lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_path.dentry, &sbuf))
+	if (vfs_statfs(&file->f_path, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
-- 
cgit v1.2.3-59-g8ed1b


From 365b18189789bfa1acd9939e6312b8a4b4577b28 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 7 Jul 2010 18:53:25 +0200
Subject: add f_flags to struct statfs(64)

Add a flags field to help glibc implementing statvfs(3) efficiently.

We copy the flag values from glibc, and add a new ST_VALID flag to
denote that f_flags is implemented.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/mips/include/asm/statfs.h | 12 +++++++----
 arch/s390/include/asm/statfs.h |  9 +++++---
 fs/statfs.c                    | 47 +++++++++++++++++++++++++++++++++++++++++-
 include/asm-generic/statfs.h   |  9 +++++---
 include/linux/statfs.h         | 25 ++++++++++++++++++++--
 5 files changed, 89 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/statfs.h b/arch/mips/include/asm/statfs.h
index c3ddf973c1c0..0f805c7a42a5 100644
--- a/arch/mips/include/asm/statfs.h
+++ b/arch/mips/include/asm/statfs.h
@@ -33,7 +33,8 @@ struct statfs {
 	/* Linux specials */
 	__kernel_fsid_t	f_fsid;
 	long		f_namelen;
-	long		f_spare[6];
+	long		f_flags;
+	long		f_spare[5];
 };
 
 #if (_MIPS_SIM == _MIPS_SIM_ABI32) || (_MIPS_SIM == _MIPS_SIM_NABI32)
@@ -53,7 +54,8 @@ struct statfs64 {
 	__u64	f_bavail;
 	__kernel_fsid_t f_fsid;
 	__u32	f_namelen;
-	__u32	f_spare[6];
+	__u32	f_flags;
+	__u32	f_spare[5];
 };
 
 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
@@ -73,7 +75,8 @@ struct statfs64 {			/* Same as struct statfs */
 	/* Linux specials */
 	__kernel_fsid_t	f_fsid;
 	long		f_namelen;
-	long		f_spare[6];
+	long		f_flags;
+	long		f_spare[5];
 };
 
 struct compat_statfs64 {
@@ -88,7 +91,8 @@ struct compat_statfs64 {
 	__u64	f_bavail;
 	__kernel_fsid_t f_fsid;
 	__u32	f_namelen;
-	__u32	f_spare[6];
+	__u32	f_flags;
+	__u32	f_spare[5];
 };
 
 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
diff --git a/arch/s390/include/asm/statfs.h b/arch/s390/include/asm/statfs.h
index 06cc70307ece..3be7fbd406c8 100644
--- a/arch/s390/include/asm/statfs.h
+++ b/arch/s390/include/asm/statfs.h
@@ -33,7 +33,8 @@ struct statfs {
 	__kernel_fsid_t f_fsid;
 	int  f_namelen;
 	int  f_frsize;
-	int  f_spare[5];
+	int  f_flags;
+	int  f_spare[4];
 };
 
 struct statfs64 {
@@ -47,7 +48,8 @@ struct statfs64 {
 	__kernel_fsid_t f_fsid;
 	int  f_namelen;
 	int  f_frsize;
-	int  f_spare[5];
+	int  f_flags;
+	int  f_spare[4];
 };
 
 struct compat_statfs64 {
@@ -61,7 +63,8 @@ struct compat_statfs64 {
 	__kernel_fsid_t f_fsid;
 	__u32 f_namelen;
 	__u32 f_frsize;
-	__u32 f_spare[5];
+	__u32 f_flags;
+	__u32 f_spare[4];
 };
 
 #endif /* __s390x__ */
diff --git a/fs/statfs.c b/fs/statfs.c
index 6a305709a4da..30ea8c8a996b 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -2,11 +2,49 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/uaccess.h>
 
+static int flags_by_mnt(int mnt_flags)
+{
+	int flags = 0;
+
+	if (mnt_flags & MNT_READONLY)
+		flags |= ST_RDONLY;
+	if (mnt_flags & MNT_NOSUID)
+		flags |= ST_NOSUID;
+	if (mnt_flags & MNT_NODEV)
+		flags |= ST_NODEV;
+	if (mnt_flags & MNT_NOEXEC)
+		flags |= ST_NOEXEC;
+	if (mnt_flags & MNT_NOATIME)
+		flags |= ST_NOATIME;
+	if (mnt_flags & MNT_NODIRATIME)
+		flags |= ST_NODIRATIME;
+	if (mnt_flags & MNT_RELATIME)
+		flags |= ST_RELATIME;
+	return flags;
+}
+
+static int flags_by_sb(int s_flags)
+{
+	int flags = 0;
+	if (s_flags & MS_SYNCHRONOUS)
+		flags |= ST_SYNCHRONOUS;
+	if (s_flags & MS_MANDLOCK)
+		flags |= ST_MANDLOCK;
+	return flags;
+}
+
+static int calculate_f_flags(struct vfsmount *mnt)
+{
+	return ST_VALID | flags_by_mnt(mnt->mnt_flags) |
+		flags_by_sb(mnt->mnt_sb->s_flags);
+}
+
 int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval;
@@ -26,7 +64,12 @@ int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
 
 int vfs_statfs(struct path *path, struct kstatfs *buf)
 {
-	return statfs_by_dentry(path->dentry, buf);
+	int error;
+
+	error = statfs_by_dentry(path->dentry, buf);
+	if (!error)
+		buf->f_flags = calculate_f_flags(path->mnt);
+	return error;
 }
 EXPORT_SYMBOL(vfs_statfs);
 
@@ -69,6 +112,7 @@ static int do_statfs_native(struct path *path, struct statfs *buf)
 		buf->f_fsid = st.f_fsid;
 		buf->f_namelen = st.f_namelen;
 		buf->f_frsize = st.f_frsize;
+		buf->f_flags = st.f_flags;
 		memset(buf->f_spare, 0, sizeof(buf->f_spare));
 	}
 	return 0;
@@ -96,6 +140,7 @@ static int do_statfs64(struct path *path, struct statfs64 *buf)
 		buf->f_fsid = st.f_fsid;
 		buf->f_namelen = st.f_namelen;
 		buf->f_frsize = st.f_frsize;
+		buf->f_flags = st.f_flags;
 		memset(buf->f_spare, 0, sizeof(buf->f_spare));
 	}
 	return 0;
diff --git a/include/asm-generic/statfs.h b/include/asm-generic/statfs.h
index 3b4fb3e52f0d..0fd28e028de1 100644
--- a/include/asm-generic/statfs.h
+++ b/include/asm-generic/statfs.h
@@ -33,7 +33,8 @@ struct statfs {
 	__kernel_fsid_t f_fsid;
 	__statfs_word f_namelen;
 	__statfs_word f_frsize;
-	__statfs_word f_spare[5];
+	__statfs_word f_flags;
+	__statfs_word f_spare[4];
 };
 
 /*
@@ -55,7 +56,8 @@ struct statfs64 {
 	__kernel_fsid_t f_fsid;
 	__statfs_word f_namelen;
 	__statfs_word f_frsize;
-	__statfs_word f_spare[5];
+	__statfs_word f_flags;
+	__statfs_word f_spare[4];
 } ARCH_PACK_STATFS64;
 
 /* 
@@ -77,7 +79,8 @@ struct compat_statfs64 {
 	__kernel_fsid_t f_fsid;
 	__u32 f_namelen;
 	__u32 f_frsize;
-	__u32 f_spare[5];
+	__u32 f_flags;
+	__u32 f_spare[4];
 } ARCH_PACK_COMPAT_STATFS64;
 
 #endif
diff --git a/include/linux/statfs.h b/include/linux/statfs.h
index b34cc829f98d..0166d320a75d 100644
--- a/include/linux/statfs.h
+++ b/include/linux/statfs.h
@@ -2,7 +2,6 @@
 #define _LINUX_STATFS_H
 
 #include <linux/types.h>
-
 #include <asm/statfs.h>
 
 struct kstatfs {
@@ -16,7 +15,29 @@ struct kstatfs {
 	__kernel_fsid_t f_fsid;
 	long f_namelen;
 	long f_frsize;
-	long f_spare[5];
+	long f_flags;
+	long f_spare[4];
 };
 
+/*
+ * Definitions for the flag in f_flag.
+ *
+ * Generally these flags are equivalent to the MS_ flags used in the mount
+ * ABI.  The exception is ST_VALID which has the same value as MS_REMOUNT
+ * which doesn't make any sense for statfs.
+ */
+#define ST_RDONLY	0x0001	/* mount read-only */
+#define ST_NOSUID	0x0002	/* ignore suid and sgid bits */
+#define ST_NODEV	0x0004	/* disallow access to device special files */
+#define ST_NOEXEC	0x0008	/* disallow program execution */
+#define ST_SYNCHRONOUS	0x0010	/* writes are synced at once */
+#define ST_VALID	0x0020	/* f_flags support is implemented */
+#define ST_MANDLOCK	0x0040	/* allow mandatory locks on an FS */
+/* 0x0080 used for ST_WRITE in glibc */
+/* 0x0100 used for ST_APPEND in glibc */
+/* 0x0200 used for ST_IMMUTABLE in glibc */
+#define ST_NOATIME	0x0400	/* do not update access times */
+#define ST_NODIRATIME	0x0800	/* do not update directory access times */
+#define ST_RELATIME	0x1000	/* update atime relative to mtime/ctime */
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 2aec7c523291621ebb68ba8e0bd9b52a26bb76ee Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Mon, 19 Jul 2010 18:19:41 +0200
Subject: mbcache: Remove unused features

The mbcache code was written to support a variable number of indexes,
but all the existing users use exactly one index.  Simplify to code to
support only that case.

There are also no users of the cache entry free operation, and none of
the users keep extra data in cache entries.  Remove those features as
well.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/xattr.c         |  12 ++---
 fs/ext3/xattr.c         |  12 ++---
 fs/ext4/xattr.c         |  12 ++---
 fs/mbcache.c            | 141 ++++++++++++++----------------------------------
 include/linux/mbcache.h |  20 ++-----
 5 files changed, 60 insertions(+), 137 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 5ab87e6edffc..8c29ae15129e 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -843,7 +843,7 @@ ext2_xattr_cache_insert(struct buffer_head *bh)
 	ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
 	if (!ce)
 		return -ENOMEM;
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
 	if (error) {
 		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
@@ -917,8 +917,8 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext2_xattr_cache, 0,
-				       inode->i_sb->s_bdev, hash);
+	ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
 	while (ce) {
 		struct buffer_head *bh;
 
@@ -950,7 +950,7 @@ again:
 			unlock_buffer(bh);
 			brelse(bh);
 		}
-		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
 }
@@ -1026,9 +1026,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
 int __init
 init_ext2_xattr(void)
 {
-	ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
-		sizeof(struct mb_cache_entry) +
-		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
 	if (!ext2_xattr_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 71fb8d65e54c..e69dc6dfaa89 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -1139,7 +1139,7 @@ ext3_xattr_cache_insert(struct buffer_head *bh)
 		ea_bdebug(bh, "out of memory");
 		return;
 	}
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
 	if (error) {
 		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
@@ -1211,8 +1211,8 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext3_xattr_cache, 0,
-				       inode->i_sb->s_bdev, hash);
+	ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
 	while (ce) {
 		struct buffer_head *bh;
 
@@ -1237,7 +1237,7 @@ again:
 			return bh;
 		}
 		brelse(bh);
-		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
 }
@@ -1313,9 +1313,7 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header,
 int __init
 init_ext3_xattr(void)
 {
-	ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
-		sizeof(struct mb_cache_entry) +
-		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
 	if (!ext3_xattr_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 04338009793a..1c93198353e7 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1418,7 +1418,7 @@ ext4_xattr_cache_insert(struct buffer_head *bh)
 		ea_bdebug(bh, "out of memory");
 		return;
 	}
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
 	if (error) {
 		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
@@ -1490,8 +1490,8 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
-				       inode->i_sb->s_bdev, hash);
+	ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
 	while (ce) {
 		struct buffer_head *bh;
 
@@ -1515,7 +1515,7 @@ again:
 			return bh;
 		}
 		brelse(bh);
-		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
 }
@@ -1591,9 +1591,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 int __init
 init_ext4_xattr(void)
 {
-	ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
-		sizeof(struct mb_cache_entry) +
-		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
 	if (!ext4_xattr_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e28f21b95344..8a2cbd823079 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -79,15 +79,11 @@ EXPORT_SYMBOL(mb_cache_entry_find_next);
 struct mb_cache {
 	struct list_head		c_cache_list;
 	const char			*c_name;
-	struct mb_cache_op		c_op;
 	atomic_t			c_entry_count;
 	int				c_bucket_bits;
-#ifndef MB_CACHE_INDEXES_COUNT
-	int				c_indexes_count;
-#endif
-	struct kmem_cache			*c_entry_cache;
+	struct kmem_cache		*c_entry_cache;
 	struct list_head		*c_block_hash;
-	struct list_head		*c_indexes_hash[0];
+	struct list_head		*c_index_hash;
 };
 
 
@@ -101,16 +97,6 @@ static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
 
-static inline int
-mb_cache_indexes(struct mb_cache *cache)
-{
-#ifdef MB_CACHE_INDEXES_COUNT
-	return MB_CACHE_INDEXES_COUNT;
-#else
-	return cache->c_indexes_count;
-#endif
-}
-
 /*
  * What the mbcache registers as to get shrunk dynamically.
  */
@@ -132,12 +118,9 @@ __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
 static void
 __mb_cache_entry_unhash(struct mb_cache_entry *ce)
 {
-	int n;
-
 	if (__mb_cache_entry_is_hashed(ce)) {
 		list_del_init(&ce->e_block_list);
-		for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
-			list_del(&ce->e_indexes[n].o_list);
+		list_del(&ce->e_index.o_list);
 	}
 }
 
@@ -148,16 +131,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 	struct mb_cache *cache = ce->e_cache;
 
 	mb_assert(!(ce->e_used || ce->e_queued));
-	if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
-		/* free failed -- put back on the lru list
-		   for freeing later. */
-		spin_lock(&mb_cache_spinlock);
-		list_add(&ce->e_lru_list, &mb_cache_lru_list);
-		spin_unlock(&mb_cache_spinlock);
-	} else {
-		kmem_cache_free(cache->c_entry_cache, ce);
-		atomic_dec(&cache->c_entry_count);
-	}
+	kmem_cache_free(cache->c_entry_cache, ce);
+	atomic_dec(&cache->c_entry_count);
 }
 
 
@@ -243,72 +218,49 @@ out:
  * memory was available.
  *
  * @name: name of the cache (informal)
- * @cache_op: contains the callback called when freeing a cache entry
- * @entry_size: The size of a cache entry, including
- *              struct mb_cache_entry
- * @indexes_count: number of additional indexes in the cache. Must equal
- *                 MB_CACHE_INDEXES_COUNT if the number of indexes is
- *                 hardwired.
  * @bucket_bits: log2(number of hash buckets)
  */
 struct mb_cache *
-mb_cache_create(const char *name, struct mb_cache_op *cache_op,
-		size_t entry_size, int indexes_count, int bucket_bits)
+mb_cache_create(const char *name, int bucket_bits)
 {
-	int m=0, n, bucket_count = 1 << bucket_bits;
+	int n, bucket_count = 1 << bucket_bits;
 	struct mb_cache *cache = NULL;
 
-	if(entry_size < sizeof(struct mb_cache_entry) +
-	   indexes_count * sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]))
-		return NULL;
-
-	cache = kmalloc(sizeof(struct mb_cache) +
-	                indexes_count * sizeof(struct list_head), GFP_KERNEL);
+	cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
 	if (!cache)
-		goto fail;
+		return NULL;
 	cache->c_name = name;
-	cache->c_op.free = NULL;
-	if (cache_op)
-		cache->c_op.free = cache_op->free;
 	atomic_set(&cache->c_entry_count, 0);
 	cache->c_bucket_bits = bucket_bits;
-#ifdef MB_CACHE_INDEXES_COUNT
-	mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
-#else
-	cache->c_indexes_count = indexes_count;
-#endif
 	cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
 	                              GFP_KERNEL);
 	if (!cache->c_block_hash)
 		goto fail;
 	for (n=0; n<bucket_count; n++)
 		INIT_LIST_HEAD(&cache->c_block_hash[n]);
-	for (m=0; m<indexes_count; m++) {
-		cache->c_indexes_hash[m] = kmalloc(bucket_count *
-		                                 sizeof(struct list_head),
-		                                 GFP_KERNEL);
-		if (!cache->c_indexes_hash[m])
-			goto fail;
-		for (n=0; n<bucket_count; n++)
-			INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
-	}
-	cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
+	cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
+				      GFP_KERNEL);
+	if (!cache->c_index_hash)
+		goto fail;
+	for (n=0; n<bucket_count; n++)
+		INIT_LIST_HEAD(&cache->c_index_hash[n]);
+	cache->c_entry_cache = kmem_cache_create(name,
+		sizeof(struct mb_cache_entry), 0,
 		SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
 	if (!cache->c_entry_cache)
-		goto fail;
+		goto fail2;
 
 	spin_lock(&mb_cache_spinlock);
 	list_add(&cache->c_cache_list, &mb_cache_list);
 	spin_unlock(&mb_cache_spinlock);
 	return cache;
 
+fail2:
+	kfree(cache->c_index_hash);
+
 fail:
-	if (cache) {
-		while (--m >= 0)
-			kfree(cache->c_indexes_hash[m]);
-		kfree(cache->c_block_hash);
-		kfree(cache);
-	}
+	kfree(cache->c_block_hash);
+	kfree(cache);
 	return NULL;
 }
 
@@ -357,7 +309,6 @@ mb_cache_destroy(struct mb_cache *cache)
 {
 	LIST_HEAD(free_list);
 	struct list_head *l, *ltmp;
-	int n;
 
 	spin_lock(&mb_cache_spinlock);
 	list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
@@ -384,8 +335,7 @@ mb_cache_destroy(struct mb_cache *cache)
 
 	kmem_cache_destroy(cache->c_entry_cache);
 
-	for (n=0; n < mb_cache_indexes(cache); n++)
-		kfree(cache->c_indexes_hash[n]);
+	kfree(cache->c_index_hash);
 	kfree(cache->c_block_hash);
 	kfree(cache);
 }
@@ -429,17 +379,16 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
  *
  * @bdev: device the cache entry belongs to
  * @block: block number
- * @keys: array of additional keys. There must be indexes_count entries
- *        in the array (as specified when creating the cache).
+ * @key: lookup key
  */
 int
 mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
-		      sector_t block, unsigned int keys[])
+		      sector_t block, unsigned int key)
 {
 	struct mb_cache *cache = ce->e_cache;
 	unsigned int bucket;
 	struct list_head *l;
-	int error = -EBUSY, n;
+	int error = -EBUSY;
 
 	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 
 			   cache->c_bucket_bits);
@@ -454,12 +403,9 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
 	ce->e_bdev = bdev;
 	ce->e_block = block;
 	list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
-	for (n=0; n<mb_cache_indexes(cache); n++) {
-		ce->e_indexes[n].o_key = keys[n];
-		bucket = hash_long(keys[n], cache->c_bucket_bits);
-		list_add(&ce->e_indexes[n].o_list,
-			 &cache->c_indexes_hash[n][bucket]);
-	}
+	ce->e_index.o_key = key;
+	bucket = hash_long(key, cache->c_bucket_bits);
+	list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
 	error = 0;
 out:
 	spin_unlock(&mb_cache_spinlock);
@@ -555,13 +501,12 @@ cleanup:
 
 static struct mb_cache_entry *
 __mb_cache_entry_find(struct list_head *l, struct list_head *head,
-		      int index, struct block_device *bdev, unsigned int key)
+		      struct block_device *bdev, unsigned int key)
 {
 	while (l != head) {
 		struct mb_cache_entry *ce =
-			list_entry(l, struct mb_cache_entry,
-			           e_indexes[index].o_list);
-		if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) {
+			list_entry(l, struct mb_cache_entry, e_index.o_list);
+		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
 			DEFINE_WAIT(wait);
 
 			if (!list_empty(&ce->e_lru_list))
@@ -603,23 +548,20 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
  * returned cache entry is locked for shared access ("multiple readers").
  *
  * @cache: the cache to search
- * @index: the number of the additonal index to search (0<=index<indexes_count)
  * @bdev: the device the cache entry should belong to
  * @key: the key in the index
  */
 struct mb_cache_entry *
-mb_cache_entry_find_first(struct mb_cache *cache, int index,
-			  struct block_device *bdev, unsigned int key)
+mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
+			  unsigned int key)
 {
 	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
 	struct list_head *l;
 	struct mb_cache_entry *ce;
 
-	mb_assert(index < mb_cache_indexes(cache));
 	spin_lock(&mb_cache_spinlock);
-	l = cache->c_indexes_hash[index][bucket].next;
-	ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-	                           index, bdev, key);
+	l = cache->c_index_hash[bucket].next;
+	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
 	spin_unlock(&mb_cache_spinlock);
 	return ce;
 }
@@ -640,12 +582,11 @@ mb_cache_entry_find_first(struct mb_cache *cache, int index,
  * }
  *
  * @prev: The previous match
- * @index: the number of the additonal index to search (0<=index<indexes_count)
  * @bdev: the device the cache entry should belong to
  * @key: the key in the index
  */
 struct mb_cache_entry *
-mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
+mb_cache_entry_find_next(struct mb_cache_entry *prev,
 			 struct block_device *bdev, unsigned int key)
 {
 	struct mb_cache *cache = prev->e_cache;
@@ -653,11 +594,9 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
 	struct list_head *l;
 	struct mb_cache_entry *ce;
 
-	mb_assert(index < mb_cache_indexes(cache));
 	spin_lock(&mb_cache_spinlock);
-	l = prev->e_indexes[index].o_list.next;
-	ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-	                           index, bdev, key);
+	l = prev->e_index.o_list.next;
+	ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
 	__mb_cache_entry_release_unlock(prev);
 	return ce;
 }
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index a09b84e4fdb4..54cbbac1e71d 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -4,9 +4,6 @@
   (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
 
-/* Hardwire the number of additional indexes */
-#define MB_CACHE_INDEXES_COUNT 1
-
 struct mb_cache_entry {
 	struct list_head		e_lru_list;
 	struct mb_cache			*e_cache;
@@ -18,17 +15,12 @@ struct mb_cache_entry {
 	struct {
 		struct list_head	o_list;
 		unsigned int		o_key;
-	} e_indexes[0];
-};
-
-struct mb_cache_op {
-	int (*free)(struct mb_cache_entry *, gfp_t);
+	} e_index;
 };
 
 /* Functions on caches */
 
-struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
-				  int, int);
+struct mb_cache *mb_cache_create(const char *, int);
 void mb_cache_shrink(struct block_device *);
 void mb_cache_destroy(struct mb_cache *);
 
@@ -36,17 +28,15 @@ void mb_cache_destroy(struct mb_cache *);
 
 struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t);
 int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *,
-			  sector_t, unsigned int[]);
+			  sector_t, unsigned int);
 void mb_cache_entry_release(struct mb_cache_entry *);
 void mb_cache_entry_free(struct mb_cache_entry *);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *,
 					  struct block_device *,
 					  sector_t);
-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
 						 struct block_device *, 
 						 unsigned int);
-struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *,
 						struct block_device *, 
 						unsigned int);
-#endif
-- 
cgit v1.2.3-59-g8ed1b


From 7a4dec53897ecd3367efb1e12fe8a4edc47dc0e9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 9 Aug 2010 12:05:43 -0400
Subject: Fix sget() race with failing mount

If sget() finds a matching superblock being set up, it'll
grab an active reference to it and grab s_umount.  That's
fine - we'll wait for completion of foofs_get_sb() that way.
However, if said foofs_get_sb() fails we'll end up holding
the halfway-created superblock.  deactivate_locked_super()
called by foofs_get_sb() will just unlock the sucker since
we are holding another active reference to it.

What we need is a way to tell if superblock has been successfully
set up.  Unfortunately, neither ->s_root nor the check for
MS_ACTIVE quite fit.  Cheap and easy way, suitable for backport:
new flag set by the (only) caller of ->get_sb().  If that flag
isn't present by the time sget() grabbed s_umount on preexisting
superblock it has found, it's seeing a stillborn and should
just bury it with deactivate_locked_super() (and repeat the search).

Longer term we want to set that flag in ->get_sb() instances (and
check for it to distinguish between "sget() found us a live sb"
and "sget() has allocated an sb, we need to set it up" in there,
instead of checking ->s_root as we do now).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@kernel.org
---
 fs/namespace.c     | 2 +-
 fs/super.c         | 6 ++++++
 include/linux/fs.h | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 88058de59c7c..32dcd24bbc9a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1984,7 +1984,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
 
-	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
+	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
 		   MS_STRICTATIME);
 
diff --git a/fs/super.c b/fs/super.c
index 3479ca6f005f..bd9eea4bb2bb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -305,8 +305,13 @@ retry:
 			if (s) {
 				up_write(&s->s_umount);
 				destroy_super(s);
+				s = NULL;
 			}
 			down_write(&old->s_umount);
+			if (unlikely(!(old->s_flags & MS_BORN))) {
+				deactivate_locked_super(old);
+				goto retry;
+			}
 			return old;
 		}
 	}
@@ -918,6 +923,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 		goto out_free_secdata;
 	BUG_ON(!mnt->mnt_sb);
 	WARN_ON(!mnt->mnt_sb->s_bdi);
+	mnt->mnt_sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
 	if (error)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9bedf4219f83..58e4b035e282 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -209,6 +209,7 @@ struct inodes_stat_t {
 #define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
 #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
+#define MS_BORN		(1<<29)
 #define MS_ACTIVE	(1<<30)
 #define MS_NOUSER	(1<<31)
 
-- 
cgit v1.2.3-59-g8ed1b


From 597781f3e51f48ef8e67be772196d9e9673752c4 Mon Sep 17 00:00:00 2001
From: Cesar Eduardo Barros <cesarb@cesarb.net>
Date: Mon, 9 Aug 2010 17:18:32 -0700
Subject: kmap_atomic: make kunmap_atomic() harder to misuse

kunmap_atomic() is currently at level -4 on Rusty's "Hard To Misuse"
list[1] ("Follow common convention and you'll get it wrong"), except in
some architectures when CONFIG_DEBUG_HIGHMEM is set[2][3].

kunmap() takes a pointer to a struct page; kunmap_atomic(), however, takes
takes a pointer to within the page itself.  This seems to once in a while
trip people up (the convention they are following is the one from
kunmap()).

Make it much harder to misuse, by moving it to level 9 on Rusty's list[4]
("The compiler/linker won't let you get it wrong").  This is done by
refusing to build if the type of its first argument is a pointer to a
struct page.

The real kunmap_atomic() is renamed to kunmap_atomic_notypecheck()
(which is what you would call in case for some strange reason calling it
with a pointer to a struct page is not incorrect in your code).

The previous version of this patch was compile tested on x86-64.

[1] http://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html
[2] In these cases, it is at level 5, "Do it right or it will always
    break at runtime."
[3] At least mips and powerpc look very similar, and sparc also seems to
    share a common ancestor with both; there seems to be quite some
    degree of copy-and-paste coding here. The include/asm/highmem.h file
    for these three archs mention x86 CPUs at its top.
[4] http://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html
[5] As an aside, could someone tell me why mn10300 uses unsigned long as
    the first parameter of kunmap_atomic() instead of void *?

Signed-off-by: Cesar Eduardo Barros <cesarb@cesarb.net>
Cc: Russell King <linux@arm.linux.org.uk> (arch/arm)
Cc: Ralf Baechle <ralf@linux-mips.org> (arch/mips)
Cc: David Howells <dhowells@redhat.com> (arch/frv, arch/mn10300)
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com> (arch/mn10300)
Cc: Kyle McMartin <kyle@mcmartin.ca> (arch/parisc)
Cc: Helge Deller <deller@gmx.de> (arch/parisc)
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org> (arch/parisc)
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> (arch/powerpc)
Cc: Paul Mackerras <paulus@samba.org> (arch/powerpc)
Cc: "David S. Miller" <davem@davemloft.net> (arch/sparc)
Cc: Thomas Gleixner <tglx@linutronix.de> (arch/x86)
Cc: Ingo Molnar <mingo@redhat.com> (arch/x86)
Cc: "H. Peter Anvin" <hpa@zytor.com> (arch/x86)
Cc: Arnd Bergmann <arnd@arndb.de> (include/asm-generic)
Cc: Rusty Russell <rusty@rustcorp.com.au> ("Hard To Misuse" list)
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/highmem.h       |  2 +-
 arch/arm/mm/highmem.c                |  4 ++--
 arch/frv/include/asm/highmem.h       |  2 +-
 arch/mips/include/asm/highmem.h      |  4 ++--
 arch/mips/mm/highmem.c               |  4 ++--
 arch/mn10300/include/asm/highmem.h   |  2 +-
 arch/parisc/include/asm/cacheflush.h |  2 +-
 arch/powerpc/include/asm/highmem.h   |  2 +-
 arch/powerpc/mm/highmem.c            |  4 ++--
 arch/sparc/include/asm/highmem.h     |  2 +-
 arch/sparc/mm/highmem.c              |  4 ++--
 arch/x86/include/asm/highmem.h       |  2 +-
 arch/x86/mm/highmem_32.c             |  4 ++--
 include/linux/highmem.h              | 10 +++++++++-
 14 files changed, 28 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h
index feb988a7ec37..5aff58126602 100644
--- a/arch/arm/include/asm/highmem.h
+++ b/arch/arm/include/asm/highmem.h
@@ -36,7 +36,7 @@ extern void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte);
 extern void *kmap(struct page *page);
 extern void kunmap(struct page *page);
 extern void *kmap_atomic(struct page *page, enum km_type type);
-extern void kunmap_atomic(void *kvaddr, enum km_type type);
+extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
 extern void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
 extern struct page *kmap_atomic_to_page(const void *ptr);
 #endif
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index 6ab244062b4a..1fbdb55bfd1b 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -82,7 +82,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
 }
 EXPORT_SYMBOL(kmap_atomic);
 
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
 {
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	unsigned int idx = type + KM_TYPE_NR * smp_processor_id();
@@ -103,7 +103,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 	}
 	pagefault_enable();
 }
-EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_notypecheck);
 
 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 {
diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h
index 68e4677fb9e7..cb4c317eaecc 100644
--- a/arch/frv/include/asm/highmem.h
+++ b/arch/frv/include/asm/highmem.h
@@ -152,7 +152,7 @@ do {									\
 	asm volatile("tlbpr %0,gr0,#4,#1" : : "r"(vaddr) : "memory");	\
 } while(0)
 
-static inline void kunmap_atomic(void *kvaddr, enum km_type type)
+static inline void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
 {
 	switch (type) {
         case 0:		__kunmap_atomic_primary(0, 2);	break;
diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h
index 25adfb02923d..75753ca73bfd 100644
--- a/arch/mips/include/asm/highmem.h
+++ b/arch/mips/include/asm/highmem.h
@@ -48,14 +48,14 @@ extern void kunmap_high(struct page *page);
 extern void *__kmap(struct page *page);
 extern void __kunmap(struct page *page);
 extern void *__kmap_atomic(struct page *page, enum km_type type);
-extern void __kunmap_atomic(void *kvaddr, enum km_type type);
+extern void __kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
 extern void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
 extern struct page *__kmap_atomic_to_page(void *ptr);
 
 #define kmap			__kmap
 #define kunmap			__kunmap
 #define kmap_atomic		__kmap_atomic
-#define kunmap_atomic		__kunmap_atomic
+#define kunmap_atomic_notypecheck		__kunmap_atomic_notypecheck
 #define kmap_atomic_to_page	__kmap_atomic_to_page
 
 #define flush_cache_kmaps()	flush_cache_all()
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index 127d732474bf..6a2b1bf9ef11 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -64,7 +64,7 @@ void *__kmap_atomic(struct page *page, enum km_type type)
 }
 EXPORT_SYMBOL(__kmap_atomic);
 
-void __kunmap_atomic(void *kvaddr, enum km_type type)
+void __kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
 {
 #ifdef CONFIG_DEBUG_HIGHMEM
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -87,7 +87,7 @@ void __kunmap_atomic(void *kvaddr, enum km_type type)
 
 	pagefault_enable();
 }
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(__kunmap_atomic_notypecheck);
 
 /*
  * This is the same as kmap_atomic() but can map memory that doesn't
diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h
index 90f2abb04bfd..b0b187a29b88 100644
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -91,7 +91,7 @@ static inline unsigned long kmap_atomic(struct page *page, enum km_type type)
 	return vaddr;
 }
 
-static inline void kunmap_atomic(unsigned long vaddr, enum km_type type)
+static inline void kunmap_atomic_notypecheck(unsigned long vaddr, enum km_type type)
 {
 #if HIGHMEM_DEBUG
 	enum fixed_addresses idx = type + KM_TYPE_NR * smp_processor_id();
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index 4556d820128a..dba11aedce1b 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -132,7 +132,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx)
 	return page_address(page);
 }
 
-static inline void kunmap_atomic(void *addr, enum km_type idx)
+static inline void kunmap_atomic_notypecheck(void *addr, enum km_type idx)
 {
 	kunmap_parisc(addr);
 	pagefault_enable();
diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h
index a74c4ee6c020..d10d64a4be38 100644
--- a/arch/powerpc/include/asm/highmem.h
+++ b/arch/powerpc/include/asm/highmem.h
@@ -62,7 +62,7 @@ extern void *kmap_high(struct page *page);
 extern void kunmap_high(struct page *page);
 extern void *kmap_atomic_prot(struct page *page, enum km_type type,
 			      pgprot_t prot);
-extern void kunmap_atomic(void *kvaddr, enum km_type type);
+extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
 
 static inline void *kmap(struct page *page)
 {
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index c2186c74c85a..857d4173f9c6 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -52,7 +52,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 }
 EXPORT_SYMBOL(kmap_atomic_prot);
 
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
 {
 #ifdef CONFIG_DEBUG_HIGHMEM
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -74,4 +74,4 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 #endif
 	pagefault_enable();
 }
-EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_notypecheck);
diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h
index 3de42e776274..ec23b0a87b98 100644
--- a/arch/sparc/include/asm/highmem.h
+++ b/arch/sparc/include/asm/highmem.h
@@ -71,7 +71,7 @@ static inline void kunmap(struct page *page)
 }
 
 extern void *kmap_atomic(struct page *page, enum km_type type);
-extern void kunmap_atomic(void *kvaddr, enum km_type type);
+extern void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
 extern struct page *kmap_atomic_to_page(void *vaddr);
 
 #define flush_cache_kmaps()	flush_cache_all()
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 7916feba6e4a..e139e9cbf5f7 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -65,7 +65,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
 }
 EXPORT_SYMBOL(kmap_atomic);
 
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
 {
 #ifdef CONFIG_DEBUG_HIGHMEM
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -100,7 +100,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 
 	pagefault_enable();
 }
-EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_notypecheck);
 
 /* We may be fed a pagetable here by ptep_to_xxx and others. */
 struct page *kmap_atomic_to_page(void *ptr)
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index a726650fc80f..8caac76ac324 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -61,7 +61,7 @@ void *kmap(struct page *page);
 void kunmap(struct page *page);
 void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
 void *kmap_atomic(struct page *page, enum km_type type);
-void kunmap_atomic(void *kvaddr, enum km_type type);
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
 void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 struct page *kmap_atomic_to_page(void *ptr);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 63a6ba66cbe0..5e8fa12ef861 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
 	return kmap_atomic_prot(page, type, kmap_prot);
 }
 
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
 {
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
@@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr)
 EXPORT_SYMBOL(kmap);
 EXPORT_SYMBOL(kunmap);
 EXPORT_SYMBOL(kmap_atomic);
-EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_notypecheck);
 EXPORT_SYMBOL(kmap_atomic_prot);
 EXPORT_SYMBOL(kmap_atomic_to_page);
 
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index caafd0561aa1..67460f010224 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -2,6 +2,7 @@
 #define _LINUX_HIGHMEM_H
 
 #include <linux/fs.h>
+#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 
@@ -72,7 +73,7 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx)
 }
 #define kmap_atomic_prot(page, idx, prot)	kmap_atomic(page, idx)
 
-#define kunmap_atomic(addr, idx)	do { pagefault_enable(); } while (0)
+#define kunmap_atomic_notypecheck(addr, idx)	do { pagefault_enable(); } while (0)
 #define kmap_atomic_pfn(pfn, idx)	kmap_atomic(pfn_to_page(pfn), (idx))
 #define kmap_atomic_to_page(ptr)	virt_to_page(ptr)
 
@@ -81,6 +82,13 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx)
 
 #endif /* CONFIG_HIGHMEM */
 
+/* Prevent people trying to call kunmap_atomic() as if it were kunmap() */
+/* kunmap_atomic() should get the return value of kmap_atomic, not the page. */
+#define kunmap_atomic(addr, idx) do { \
+		BUILD_BUG_ON(__same_type((addr), struct page *)); \
+		kunmap_atomic_notypecheck((addr), (idx)); \
+	} while (0)
+
 /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */
 #ifndef clear_user_highpage
 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
-- 
cgit v1.2.3-59-g8ed1b


From bb4a340e075b7897ece109686bfa177f8518d2db Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 9 Aug 2010 17:18:37 -0700
Subject: mm: rename anon_vma_lock to vma_lock_anon_vma

Rename anon_vma_lock to vma_lock_anon_vma.  This matches the naming style
used in page_lock_anon_vma and will come in really handy further down in
this patch series.

Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Larry Woodman <lwoodman@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  4 ++--
 mm/mmap.c            | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 77216742c178..80cd162a8aa6 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -99,14 +99,14 @@ static inline struct anon_vma *page_anon_vma(struct page *page)
 	return page_rmapping(page);
 }
 
-static inline void anon_vma_lock(struct vm_area_struct *vma)
+static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	if (anon_vma)
 		spin_lock(&anon_vma->lock);
 }
 
-static inline void anon_vma_unlock(struct vm_area_struct *vma)
+static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	if (anon_vma)
diff --git a/mm/mmap.c b/mm/mmap.c
index e38e910cb756..e26f1ea7c904 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -452,12 +452,12 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 		spin_lock(&mapping->i_mmap_lock);
 		vma->vm_truncate_count = mapping->truncate_count;
 	}
-	anon_vma_lock(vma);
+	vma_lock_anon_vma(vma);
 
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	__vma_link_file(vma);
 
-	anon_vma_unlock(vma);
+	vma_unlock_anon_vma(vma);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
 
@@ -1710,7 +1710,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	 */
 	if (unlikely(anon_vma_prepare(vma)))
 		return -ENOMEM;
-	anon_vma_lock(vma);
+	vma_lock_anon_vma(vma);
 
 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1721,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	if (address < PAGE_ALIGN(address+4))
 		address = PAGE_ALIGN(address+4);
 	else {
-		anon_vma_unlock(vma);
+		vma_unlock_anon_vma(vma);
 		return -ENOMEM;
 	}
 	error = 0;
@@ -1739,7 +1739,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 			perf_event_mmap(vma);
 		}
 	}
-	anon_vma_unlock(vma);
+	vma_unlock_anon_vma(vma);
 	return error;
 }
 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1764,7 +1764,7 @@ static int expand_downwards(struct vm_area_struct *vma,
 	if (error)
 		return error;
 
-	anon_vma_lock(vma);
+	vma_lock_anon_vma(vma);
 
 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1786,7 +1786,7 @@ static int expand_downwards(struct vm_area_struct *vma,
 			perf_event_mmap(vma);
 		}
 	}
-	anon_vma_unlock(vma);
+	vma_unlock_anon_vma(vma);
 	return error;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From cba48b98f2348c814316c4b4f411a07a0e4a2bf9 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 9 Aug 2010 17:18:38 -0700
Subject: mm: change direct call of spin_lock(anon_vma->lock) to inline
 function

Subsitute a direct call of spin_lock(anon_vma->lock) with an inline
function doing exactly the same.

This makes it easier to do the substitution to the root anon_vma lock in a
following patch.

We will deal with the handful of special locks (nested, dec_and_lock, etc)
separately.

Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Larry Woodman <lwoodman@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 10 ++++++++++
 mm/ksm.c             | 18 +++++++++---------
 mm/migrate.c         |  2 +-
 mm/mmap.c            |  2 +-
 mm/rmap.c            | 20 ++++++++++----------
 5 files changed, 31 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 80cd162a8aa6..5f981be61416 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -113,6 +113,16 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 		spin_unlock(&anon_vma->lock);
 }
 
+static inline void anon_vma_lock(struct anon_vma *anon_vma)
+{
+	spin_lock(&anon_vma->lock);
+}
+
+static inline void anon_vma_unlock(struct anon_vma *anon_vma)
+{
+	spin_unlock(&anon_vma->lock);
+}
+
 /*
  * anon_vma helper functions.
  */
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7c..eb9f6806ed51 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -327,7 +327,7 @@ static void drop_anon_vma(struct rmap_item *rmap_item)
 
 	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
 		int empty = list_empty(&anon_vma->head);
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 		if (empty)
 			anon_vma_free(anon_vma);
 	}
@@ -1566,7 +1566,7 @@ again:
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
-		spin_lock(&anon_vma->lock);
+		anon_vma_lock(anon_vma);
 		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1589,7 @@ again:
 			if (!search_new_forks || !mapcount)
 				break;
 		}
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 		if (!mapcount)
 			goto out;
 	}
@@ -1619,7 +1619,7 @@ again:
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
-		spin_lock(&anon_vma->lock);
+		anon_vma_lock(anon_vma);
 		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1637,11 @@ again:
 			ret = try_to_unmap_one(page, vma,
 					rmap_item->address, flags);
 			if (ret != SWAP_AGAIN || !page_mapped(page)) {
-				spin_unlock(&anon_vma->lock);
+				anon_vma_unlock(anon_vma);
 				goto out;
 			}
 		}
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 	}
 	if (!search_new_forks++)
 		goto again;
@@ -1671,7 +1671,7 @@ again:
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
-		spin_lock(&anon_vma->lock);
+		anon_vma_lock(anon_vma);
 		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1688,11 @@ again:
 
 			ret = rmap_one(page, vma, rmap_item->address, arg);
 			if (ret != SWAP_AGAIN) {
-				spin_unlock(&anon_vma->lock);
+				anon_vma_unlock(anon_vma);
 				goto out;
 			}
 		}
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 	}
 	if (!search_new_forks++)
 		goto again;
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049e..1855f869917d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -684,7 +684,7 @@ rcu_unlock:
 	/* Drop an anon_vma reference if we took one */
 	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
 		int empty = list_empty(&anon_vma->head);
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 		if (empty)
 			anon_vma_free(anon_vma);
 	}
diff --git a/mm/mmap.c b/mm/mmap.c
index e26f1ea7c904..f5db18decc2e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2593,7 +2593,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 		if (!__test_and_clear_bit(0, (unsigned long *)
 					  &anon_vma->head.next))
 			BUG();
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 	}
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..b65f00d1707f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -134,7 +134,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			allocated = anon_vma;
 		}
 
-		spin_lock(&anon_vma->lock);
+		anon_vma_lock(anon_vma);
 		/* page_table_lock to protect against threads */
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
@@ -147,7 +147,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			avc = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 
 		if (unlikely(allocated))
 			anon_vma_free(allocated);
@@ -170,9 +170,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	avc->anon_vma = anon_vma;
 	list_add(&avc->same_vma, &vma->anon_vma_chain);
 
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 }
 
 /*
@@ -246,12 +246,12 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 	if (!anon_vma)
 		return;
 
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_del(&anon_vma_chain->same_anon_vma);
 
 	/* We must garbage collect the anon_vma if it's empty */
 	empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 
 	if (empty)
 		anon_vma_free(anon_vma);
@@ -302,7 +302,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 		goto out;
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	return anon_vma;
 out:
 	rcu_read_unlock();
@@ -311,7 +311,7 @@ out:
 
 void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 	rcu_read_unlock();
 }
 
@@ -1389,7 +1389,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
 		return ret;
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
@@ -1399,7 +1399,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		if (ret != SWAP_AGAIN)
 			break;
 	}
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 5c341ee1dfc8fe69d66b1c8b19e463c6d7201ae1 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 9 Aug 2010 17:18:39 -0700
Subject: mm: track the root (oldest) anon_vma

Track the root (oldest) anon_vma in each anon_vma tree.  Because we only
take the lock on the root anon_vma, we cannot use the lock on higher-up
anon_vmas to lock anything.  This makes it impossible to do an indirect
lookup of the root anon_vma, since the data structures could go away from
under us.

However, a direct pointer is safe because the root anon_vma is always the
last one that gets freed on munmap or exit, by virtue of the same_vma list
order and unlink_anon_vmas walking the list forward.

[akpm@linux-foundation.org: fix typo]
Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Larry Woodman <lwoodman@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  1 +
 mm/rmap.c            | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 5f981be61416..41fa6ddc6214 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -26,6 +26,7 @@
  */
 struct anon_vma {
 	spinlock_t lock;	/* Serialize access to vma list */
+	struct anon_vma *root;	/* Root of this anon_vma tree */
 #if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
 
 	/*
diff --git a/mm/rmap.c b/mm/rmap.c
index b65f00d1707f..caa48b27371b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -132,6 +132,11 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			if (unlikely(!anon_vma))
 				goto out_enomem_free_avc;
 			allocated = anon_vma;
+			/*
+			 * This VMA had no anon_vma yet.  This anon_vma is
+			 * the root of any anon_vma tree that might form.
+			 */
+			anon_vma->root = anon_vma;
 		}
 
 		anon_vma_lock(anon_vma);
@@ -224,9 +229,15 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	avc = anon_vma_chain_alloc();
 	if (!avc)
 		goto out_error_free_anon_vma;
-	anon_vma_chain_link(vma, avc, anon_vma);
+
+	/*
+	 * The root anon_vma's spinlock is the lock actually used when we
+	 * lock any of the anon_vmas in this anon_vma tree.
+	 */
+	anon_vma->root = pvma->anon_vma->root;
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
+	anon_vma_chain_link(vma, avc, anon_vma);
 
 	return 0;
 
@@ -261,7 +272,10 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
 {
 	struct anon_vma_chain *avc, *next;
 
-	/* Unlink each anon_vma chained to the VMA. */
+	/*
+	 * Unlink each anon_vma chained to the VMA.  This list is ordered
+	 * from newest to oldest, ensuring the root anon_vma gets freed last.
+	 */
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 		anon_vma_unlink(avc);
 		list_del(&avc->same_vma);
-- 
cgit v1.2.3-59-g8ed1b


From 012f18004da33ba672e3c60838cc4898126174d3 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 9 Aug 2010 17:18:40 -0700
Subject: mm: always lock the root (oldest) anon_vma

Always (and only) lock the root (oldest) anon_vma whenever we do something
in an anon_vma.  The recently introduced anon_vma scalability is due to
the rmap code scanning only the VMAs that need to be scanned.  Many common
operations still took the anon_vma lock on the root anon_vma, so always
taking that lock is not expected to introduce any scalability issues.

However, always taking the same lock does mean we only need to take one
lock, which means rmap_walk on pages from any anon_vma in the vma is
excluded from occurring during an munmap, expand_stack or other operation
that needs to exclude rmap_walk and similar functions.

Also add the proper locking to vma_adjust.

Signed-off-by: Rik van Riel <riel@redhat.com>
Tested-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Larry Woodman <lwoodman@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  8 ++++----
 mm/ksm.c             |  2 +-
 mm/migrate.c         |  2 +-
 mm/mmap.c            | 30 ++++++++++++++++++++++--------
 4 files changed, 28 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 41fa6ddc6214..af43cb9a0506 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -104,24 +104,24 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	if (anon_vma)
-		spin_lock(&anon_vma->lock);
+		spin_lock(&anon_vma->root->lock);
 }
 
 static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	if (anon_vma)
-		spin_unlock(&anon_vma->lock);
+		spin_unlock(&anon_vma->root->lock);
 }
 
 static inline void anon_vma_lock(struct anon_vma *anon_vma)
 {
-	spin_lock(&anon_vma->lock);
+	spin_lock(&anon_vma->root->lock);
 }
 
 static inline void anon_vma_unlock(struct anon_vma *anon_vma)
 {
-	spin_unlock(&anon_vma->lock);
+	spin_unlock(&anon_vma->root->lock);
 }
 
 /*
diff --git a/mm/ksm.c b/mm/ksm.c
index eb9f6806ed51..da6037c261f1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -325,7 +325,7 @@ static void drop_anon_vma(struct rmap_item *rmap_item)
 {
 	struct anon_vma *anon_vma = rmap_item->anon_vma;
 
-	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
+	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
 		int empty = list_empty(&anon_vma->head);
 		anon_vma_unlock(anon_vma);
 		if (empty)
diff --git a/mm/migrate.c b/mm/migrate.c
index 1855f869917d..5208fa1d9712 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -682,7 +682,7 @@ skip_unmap:
 rcu_unlock:
 
 	/* Drop an anon_vma reference if we took one */
-	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
+	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
 		int empty = list_empty(&anon_vma->head);
 		anon_vma_unlock(anon_vma);
 		if (empty)
diff --git a/mm/mmap.c b/mm/mmap.c
index f5db18decc2e..fb89360a2120 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -506,6 +506,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	struct vm_area_struct *importer = NULL;
 	struct address_space *mapping = NULL;
 	struct prio_tree_root *root = NULL;
+	struct anon_vma *anon_vma = NULL;
 	struct file *file = vma->vm_file;
 	long adjust_next = 0;
 	int remove_next = 0;
@@ -578,6 +579,17 @@ again:			remove_next = 1 + (end > next->vm_end);
 		}
 	}
 
+	/*
+	 * When changing only vma->vm_end, we don't really need anon_vma
+	 * lock. This is a fairly rare case by itself, but the anon_vma
+	 * lock may be shared between many sibling processes.  Skipping
+	 * the lock for brk adjustments makes a difference sometimes.
+	 */
+	if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
+		anon_vma = vma->anon_vma;
+		anon_vma_lock(anon_vma);
+	}
+
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
 		vma_prio_tree_remove(vma, root);
@@ -617,6 +629,8 @@ again:			remove_next = 1 + (end > next->vm_end);
 		__insert_vm_struct(mm, insert);
 	}
 
+	if (anon_vma)
+		anon_vma_unlock(anon_vma);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
 
@@ -2470,23 +2484,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
 
 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 {
-	if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+	if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
 		/*
 		 * The LSB of head.next can't change from under us
 		 * because we hold the mm_all_locks_mutex.
 		 */
-		spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+		spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
 		/*
 		 * We can safely modify head.next after taking the
-		 * anon_vma->lock. If some other vma in this mm shares
+		 * anon_vma->root->lock. If some other vma in this mm shares
 		 * the same anon_vma we won't take it again.
 		 *
 		 * No need of atomic instructions here, head.next
 		 * can't change from under us thanks to the
-		 * anon_vma->lock.
+		 * anon_vma->root->lock.
 		 */
 		if (__test_and_set_bit(0, (unsigned long *)
-				       &anon_vma->head.next))
+				       &anon_vma->root->head.next))
 			BUG();
 	}
 }
@@ -2577,7 +2591,7 @@ out_unlock:
 
 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-	if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+	if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
 		/*
 		 * The LSB of head.next can't change to 0 from under
 		 * us because we hold the mm_all_locks_mutex.
@@ -2588,10 +2602,10 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 		 *
 		 * No need of atomic instructions here, head.next
 		 * can't change from under us until we release the
-		 * anon_vma->lock.
+		 * anon_vma->root->lock.
 		 */
 		if (!__test_and_clear_bit(0, (unsigned long *)
-					  &anon_vma->head.next))
+					  &anon_vma->root->head.next))
 			BUG();
 		anon_vma_unlock(anon_vma);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 76545066c8521f3e32c849744744842b4df25b79 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 9 Aug 2010 17:18:41 -0700
Subject: mm: extend KSM refcounts to the anon_vma root

KSM reference counts can cause an anon_vma to exist after the processe it
belongs to have already exited.  Because the anon_vma lock now lives in
the root anon_vma, we need to ensure that the root anon_vma stays around
until after all the "child" anon_vmas have been freed.

The obvious way to do this is to have a "child" anon_vma take a reference
to the root in anon_vma_fork.  When the anon_vma is freed at munmap or
process exit, we drop the refcount in anon_vma_unlink and possibly free
the root anon_vma.

The KSM anon_vma reference count function also needs to be modified to
deal with the possibility of freeing 2 levels of anon_vma.  The easiest
way to do this is to break out the KSM magic and make it generic.

When compiling without CONFIG_KSM, this code is compiled out.

Signed-off-by: Rik van Riel <riel@redhat.com>
Tested-by: Larry Woodman <lwoodman@redhat.com>
Acked-by: Larry Woodman <lwoodman@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 15 +++++++++++++++
 mm/ksm.c             | 17 ++++++-----------
 mm/migrate.c         | 10 +++-------
 mm/rmap.c            | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 69 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index af43cb9a0506..dc9b3c0bf5d4 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -81,6 +81,13 @@ static inline int anonvma_external_refcount(struct anon_vma *anon_vma)
 {
 	return atomic_read(&anon_vma->external_refcount);
 }
+
+static inline void get_anon_vma(struct anon_vma *anon_vma)
+{
+	atomic_inc(&anon_vma->external_refcount);
+}
+
+void drop_anon_vma(struct anon_vma *);
 #else
 static inline void anonvma_external_refcount_init(struct anon_vma *anon_vma)
 {
@@ -90,6 +97,14 @@ static inline int anonvma_external_refcount(struct anon_vma *anon_vma)
 {
 	return 0;
 }
+
+static inline void get_anon_vma(struct anon_vma *anon_vma)
+{
+}
+
+static inline void drop_anon_vma(struct anon_vma *anon_vma)
+{
+}
 #endif /* CONFIG_KSM */
 
 static inline struct anon_vma *page_anon_vma(struct page *page)
diff --git a/mm/ksm.c b/mm/ksm.c
index da6037c261f1..9f2acc998a37 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -318,19 +318,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
 			  struct anon_vma *anon_vma)
 {
 	rmap_item->anon_vma = anon_vma;
-	atomic_inc(&anon_vma->external_refcount);
+	get_anon_vma(anon_vma);
 }
 
-static void drop_anon_vma(struct rmap_item *rmap_item)
+static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
 {
 	struct anon_vma *anon_vma = rmap_item->anon_vma;
 
-	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
-		int empty = list_empty(&anon_vma->head);
-		anon_vma_unlock(anon_vma);
-		if (empty)
-			anon_vma_free(anon_vma);
-	}
+	drop_anon_vma(anon_vma);
 }
 
 /*
@@ -415,7 +410,7 @@ static void break_cow(struct rmap_item *rmap_item)
 	 * It is not an accident that whenever we want to break COW
 	 * to undo, we also need to drop a reference to the anon_vma.
 	 */
-	drop_anon_vma(rmap_item);
+	ksm_drop_anon_vma(rmap_item);
 
 	down_read(&mm->mmap_sem);
 	if (ksm_test_exit(mm))
@@ -470,7 +465,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
 			ksm_pages_sharing--;
 		else
 			ksm_pages_shared--;
-		drop_anon_vma(rmap_item);
+		ksm_drop_anon_vma(rmap_item);
 		rmap_item->address &= PAGE_MASK;
 		cond_resched();
 	}
@@ -558,7 +553,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 		else
 			ksm_pages_shared--;
 
-		drop_anon_vma(rmap_item);
+		ksm_drop_anon_vma(rmap_item);
 		rmap_item->address &= PAGE_MASK;
 
 	} else if (rmap_item->address & UNSTABLE_FLAG) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 5208fa1d9712..38e7cad782f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 			 * exist when the page is remapped later
 			 */
 			anon_vma = page_anon_vma(page);
-			atomic_inc(&anon_vma->external_refcount);
+			get_anon_vma(anon_vma);
 		}
 	}
 
@@ -682,12 +682,8 @@ skip_unmap:
 rcu_unlock:
 
 	/* Drop an anon_vma reference if we took one */
-	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
-		int empty = list_empty(&anon_vma->head);
-		anon_vma_unlock(anon_vma);
-		if (empty)
-			anon_vma_free(anon_vma);
-	}
+	if (anon_vma)
+		drop_anon_vma(anon_vma);
 
 	if (rcu_locked)
 		rcu_read_unlock();
diff --git a/mm/rmap.c b/mm/rmap.c
index caa48b27371b..07e9814c7a41 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -235,6 +235,12 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	 * lock any of the anon_vmas in this anon_vma tree.
 	 */
 	anon_vma->root = pvma->anon_vma->root;
+	/*
+	 * With KSM refcounts, an anon_vma can stay around longer than the
+	 * process it belongs to.  The root anon_vma needs to be pinned
+	 * until this anon_vma is freed, because the lock lives in the root.
+	 */
+	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
 	anon_vma_chain_link(vma, avc, anon_vma);
@@ -264,8 +270,12 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 	empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
 	anon_vma_unlock(anon_vma);
 
-	if (empty)
+	if (empty) {
+		/* We no longer need the root anon_vma */
+		if (anon_vma->root != anon_vma)
+			drop_anon_vma(anon_vma->root);
 		anon_vma_free(anon_vma);
+	}
 }
 
 void unlink_anon_vmas(struct vm_area_struct *vma)
@@ -1382,6 +1392,40 @@ int try_to_munlock(struct page *page)
 		return try_to_unmap_file(page, TTU_MUNLOCK);
 }
 
+#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
+/*
+ * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
+ * if necessary.  Be careful to do all the tests under the lock.  Once
+ * we know we are the last user, nobody else can get a reference and we
+ * can do the freeing without the lock.
+ */
+void drop_anon_vma(struct anon_vma *anon_vma)
+{
+	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
+		struct anon_vma *root = anon_vma->root;
+		int empty = list_empty(&anon_vma->head);
+		int last_root_user = 0;
+		int root_empty = 0;
+
+		/*
+		 * The refcount on a non-root anon_vma got dropped.  Drop
+		 * the refcount on the root and check if we need to free it.
+		 */
+		if (empty && anon_vma != root) {
+			last_root_user = atomic_dec_and_test(&root->external_refcount);
+			root_empty = list_empty(&root->head);
+		}
+		anon_vma_unlock(anon_vma);
+
+		if (empty) {
+			anon_vma_free(anon_vma);
+			if (root_empty && last_root_user)
+				anon_vma_free(root);
+		}
+	}
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 /*
  * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
-- 
cgit v1.2.3-59-g8ed1b


From a9877cc2937889e5669114f28612b494384152a4 Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Mon, 9 Aug 2010 17:18:42 -0700
Subject: buffer_head: remove redundant test from wait_on_buffer

The comment suggests that when b_count equals zero it is calling
__wait_no_buffer to trigger some debug, but as there is no debug in
__wait_on_buffer the whole thing is redundant.

AFAICT from the git log this has been the case for at least 5 years, so
it seems safe just to remove this.

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buffer_head.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1b9ba193b789..2ce51fac7d3d 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -314,15 +314,10 @@ map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
 	bh->b_size = sb->s_blocksize;
 }
 
-/*
- * Calling wait_on_buffer() for a zero-ref buffer is illegal, so we call into
- * __wait_on_buffer() just to trip a debug check.  Because debug code in inline
- * functions is bloaty.
- */
 static inline void wait_on_buffer(struct buffer_head *bh)
 {
 	might_sleep();
-	if (buffer_locked(bh) || atomic_read(&bh->b_count) == 0)
+	if (buffer_locked(bh))
 		__wait_on_buffer(bh);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 6f48d0ebd907ae419387f27b602ee98870cfa7bb Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 9 Aug 2010 17:18:52 -0700
Subject: oom: select task from tasklist for mempolicy ooms

The oom killer presently kills current whenever there is no more memory
free or reclaimable on its mempolicy's nodes.  There is no guarantee that
current is a memory-hogging task or that killing it will free any
substantial amount of memory, however.

In such situations, it is better to scan the tasklist for nodes that are
allowed to allocate on current's set of nodes and kill the task with the
highest badness() score.  This ensures that the most memory-hogging task,
or the one configured by the user with /proc/pid/oom_adj, is always
selected in such scenarios.

Signed-off-by: David Rientjes <rientjes@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h |  13 +++++-
 mm/mempolicy.c            |  44 ++++++++++++++++++++
 mm/oom_kill.c             | 104 ++++++++++++++++++++++++++++++----------------
 3 files changed, 124 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 7b9ef6bf45aa..31ac26ca4acf 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -210,6 +210,8 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
 				struct mempolicy **mpol, nodemask_t **nodemask);
 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
+extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+				const nodemask_t *mask);
 extern unsigned slab_node(struct mempolicy *policy);
 
 extern enum zone_type policy_zone;
@@ -338,7 +340,16 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 	return node_zonelist(0, gfp_flags);
 }
 
-static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; }
+static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
+{
+	return false;
+}
+
+static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+			const nodemask_t *mask)
+{
+	return false;
+}
 
 static inline int do_migrate_pages(struct mm_struct *mm,
 			const nodemask_t *from_nodes,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb51..8a73708d59bb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1712,6 +1712,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 }
 #endif
 
+/*
+ * mempolicy_nodemask_intersects
+ *
+ * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
+ * policy.  Otherwise, check for intersection between mask and the policy
+ * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
+ * policy, always return true since it may allocate elsewhere on fallback.
+ *
+ * Takes task_lock(tsk) to prevent freeing of its mempolicy.
+ */
+bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+					const nodemask_t *mask)
+{
+	struct mempolicy *mempolicy;
+	bool ret = true;
+
+	if (!mask)
+		return ret;
+	task_lock(tsk);
+	mempolicy = tsk->mempolicy;
+	if (!mempolicy)
+		goto out;
+
+	switch (mempolicy->mode) {
+	case MPOL_PREFERRED:
+		/*
+		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
+		 * allocate from, they may fallback to other nodes when oom.
+		 * Thus, it's possible for tsk to have allocated memory from
+		 * nodes in mask.
+		 */
+		break;
+	case MPOL_BIND:
+	case MPOL_INTERLEAVE:
+		ret = nodes_intersects(mempolicy->v.nodes, *mask);
+		break;
+	default:
+		BUG();
+	}
+out:
+	task_unlock(tsk);
+	return ret;
+}
+
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7c8488f6a3f5..13ceed78bc45 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
 #include <linux/security.h>
 
 int sysctl_panic_on_oom;
@@ -35,23 +36,57 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
-/*
- * Is all threads of the target process nodes overlap ours?
+#ifdef CONFIG_NUMA
+/**
+ * has_intersects_mems_allowed() - check task eligiblity for kill
+ * @tsk: task struct of which task to consider
+ * @mask: nodemask passed to page allocator for mempolicy ooms
+ *
+ * Task eligibility is determined by whether or not a candidate task, @tsk,
+ * shares the same mempolicy nodes as current if it is bound by such a policy
+ * and whether or not it has the same set of allowed cpuset nodes.
  */
-static int has_intersects_mems_allowed(struct task_struct *tsk)
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+					const nodemask_t *mask)
 {
-	struct task_struct *t;
+	struct task_struct *start = tsk;
 
-	t = tsk;
 	do {
-		if (cpuset_mems_allowed_intersects(current, t))
-			return 1;
-		t = next_thread(t);
-	} while (t != tsk);
-
-	return 0;
+		if (mask) {
+			/*
+			 * If this is a mempolicy constrained oom, tsk's
+			 * cpuset is irrelevant.  Only return true if its
+			 * mempolicy intersects current, otherwise it may be
+			 * needlessly killed.
+			 */
+			if (mempolicy_nodemask_intersects(tsk, mask))
+				return true;
+		} else {
+			/*
+			 * This is not a mempolicy constrained oom, so only
+			 * check the mems of tsk's cpuset.
+			 */
+			if (cpuset_mems_allowed_intersects(current, tsk))
+				return true;
+		}
+		tsk = next_thread(tsk);
+	} while (tsk != start);
+	return false;
+}
+#else
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+					const nodemask_t *mask)
+{
+	return true;
 }
+#endif /* CONFIG_NUMA */
 
+/*
+ * The process p may have detached its own ->mm while exiting or through
+ * use_mm(), but one or more of its subthreads may still have a valid
+ * pointer.  Return p, or any of its subthreads with a valid ->mm, with
+ * task_lock() held.
+ */
 static struct task_struct *find_lock_task_mm(struct task_struct *p)
 {
 	struct task_struct *t = p;
@@ -106,10 +141,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	 * The memory size of the process is the basis for the badness.
 	 */
 	points = p->mm->total_vm;
-
-	/*
-	 * After this unlock we can no longer dereference local variable `mm'
-	 */
 	task_unlock(p);
 
 	/*
@@ -253,7 +284,8 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-						struct mem_cgroup *mem)
+		struct mem_cgroup *mem, enum oom_constraint constraint,
+		const nodemask_t *mask)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
@@ -269,7 +301,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 			continue;
 		if (mem && !task_in_mem_cgroup(p, mem))
 			continue;
-		if (!has_intersects_mems_allowed(p))
+		if (!has_intersects_mems_allowed(p,
+				constraint == CONSTRAINT_MEMORY_POLICY ? mask :
+									 NULL))
 			continue;
 
 		/*
@@ -497,7 +531,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 		panic("out of memory(memcg). panic_on_oom is selected.\n");
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem);
+	p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL);
 	if (!p || PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -576,7 +610,8 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order,
+			enum oom_constraint constraint, const nodemask_t *mask)
 {
 	struct task_struct *p;
 	unsigned long points;
@@ -590,7 +625,7 @@ retry:
 	 * Rambo mode: Shoot down a process and hope it solves whatever
 	 * issues we may have.
 	 */
-	p = select_bad_process(&points, NULL);
+	p = select_bad_process(&points, NULL, constraint, mask);
 
 	if (PTR_ERR(p) == -1UL)
 		return;
@@ -624,7 +659,8 @@ void pagefault_out_of_memory(void)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
 	read_lock(&tasklist_lock);
-	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	/* unknown gfp_mask and order */
+	__out_of_memory(0, 0, CONSTRAINT_NONE, NULL);
 	read_unlock(&tasklist_lock);
 
 	/*
@@ -640,6 +676,7 @@ void pagefault_out_of_memory(void)
  * @zonelist: zonelist pointer
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
  *
  * If we run out of memory, we have the choice between either
  * killing a random task (bad), letting the system crash (worse)
@@ -678,24 +715,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 */
 	constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
 	read_lock(&tasklist_lock);
-
-	switch (constraint) {
-	case CONSTRAINT_MEMORY_POLICY:
-		oom_kill_process(current, gfp_mask, order, 0, NULL,
-				"No available memory (MPOL_BIND)");
-		break;
-
-	case CONSTRAINT_NONE:
-		if (sysctl_panic_on_oom) {
+	if (unlikely(sysctl_panic_on_oom)) {
+		/*
+		 * panic_on_oom only affects CONSTRAINT_NONE, the kernel
+		 * should not panic for cpuset or mempolicy induced memory
+		 * failures.
+		 */
+		if (constraint == CONSTRAINT_NONE) {
 			dump_header(NULL, gfp_mask, order, NULL);
-			panic("out of memory. panic_on_oom is selected\n");
+			read_unlock(&tasklist_lock);
+			panic("Out of memory: panic_on_oom is enabled\n");
 		}
-		/* Fall-through */
-	case CONSTRAINT_CPUSET:
-		__out_of_memory(gfp_mask, order);
-		break;
 	}
-
+	__out_of_memory(gfp_mask, order, constraint, nodemask);
 	read_unlock(&tasklist_lock);
 
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 309ed882508cc471320ff79265e7340774d6746c Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 9 Aug 2010 17:18:54 -0700
Subject: oom: extract panic helper function

There are various points in the oom killer where the kernel must determine
whether to panic or not.  It's better to extract this to a helper function
to remove all the confusion as to its semantics.

Also fix a call to dump_header() where tasklist_lock is not read- locked,
as required.

There's no functional change with this patch.

Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h |  1 +
 mm/oom_kill.c       | 53 +++++++++++++++++++++++++++++------------------------
 2 files changed, 30 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 537662315627..3ae6d94d0540 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -22,6 +22,7 @@ enum oom_constraint {
 	CONSTRAINT_NONE,
 	CONSTRAINT_CPUSET,
 	CONSTRAINT_MEMORY_POLICY,
+	CONSTRAINT_MEMCG,
 };
 
 extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 01b5e01e52cb..fca886d8b5fb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -521,17 +521,40 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	return oom_kill_task(victim);
 }
 
+/*
+ * Determines whether the kernel must panic because of the panic_on_oom sysctl.
+ */
+static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
+				int order)
+{
+	if (likely(!sysctl_panic_on_oom))
+		return;
+	if (sysctl_panic_on_oom != 2) {
+		/*
+		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
+		 * does not panic for cpuset, mempolicy, or memcg allocation
+		 * failures.
+		 */
+		if (constraint != CONSTRAINT_NONE)
+			return;
+	}
+	read_lock(&tasklist_lock);
+	dump_header(NULL, gfp_mask, order, NULL);
+	read_unlock(&tasklist_lock);
+	panic("Out of memory: %s panic_on_oom is enabled\n",
+		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
+}
+
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 {
 	unsigned long points = 0;
 	struct task_struct *p;
 
-	if (sysctl_panic_on_oom == 2)
-		panic("out of memory(memcg). panic_on_oom is selected.\n");
+	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL);
+	p = select_bad_process(&points, mem, CONSTRAINT_MEMCG, NULL);
 	if (!p || PTR_ERR(p) == -1UL)
 		goto out;
 
@@ -632,8 +655,8 @@ retry:
 
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
-		read_unlock(&tasklist_lock);
 		dump_header(NULL, gfp_mask, order, NULL);
+		read_unlock(&tasklist_lock);
 		panic("Out of memory and no killable processes...\n");
 	}
 
@@ -655,9 +678,7 @@ void pagefault_out_of_memory(void)
 		/* Got some memory back in the last second. */
 		return;
 
-	if (sysctl_panic_on_oom)
-		panic("out of memory from page fault. panic_on_oom is selected.\n");
-
+	check_panic_on_oom(CONSTRAINT_NONE, 0, 0);
 	read_lock(&tasklist_lock);
 	/* unknown gfp_mask and order */
 	__out_of_memory(0, 0, CONSTRAINT_NONE, NULL);
@@ -704,29 +725,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		return;
 	}
 
-	if (sysctl_panic_on_oom == 2) {
-		dump_header(NULL, gfp_mask, order, NULL);
-		panic("out of memory. Compulsory panic_on_oom is selected.\n");
-	}
-
 	/*
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
 	constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
+	check_panic_on_oom(constraint, gfp_mask, order);
 	read_lock(&tasklist_lock);
-	if (unlikely(sysctl_panic_on_oom)) {
-		/*
-		 * panic_on_oom only affects CONSTRAINT_NONE, the kernel
-		 * should not panic for cpuset or mempolicy induced memory
-		 * failures.
-		 */
-		if (constraint == CONSTRAINT_NONE) {
-			dump_header(NULL, gfp_mask, order, NULL);
-			read_unlock(&tasklist_lock);
-			panic("Out of memory: panic_on_oom is enabled\n");
-		}
-	}
 	__out_of_memory(gfp_mask, order, constraint, nodemask);
 	read_unlock(&tasklist_lock);
 
-- 
cgit v1.2.3-59-g8ed1b


From 8e4228e1edb922afa83366803a1e5b3fa8e987c2 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 9 Aug 2010 17:18:56 -0700
Subject: oom: move sysctl declarations to oom.h

The three oom killer sysctl variables (sysctl_oom_dump_tasks,
sysctl_oom_kill_allocating_task, and sysctl_panic_on_oom) are better
declared in include/linux/oom.h rather than kernel/sysctl.c.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 5 +++++
 kernel/sysctl.c     | 4 +---
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 3ae6d94d0540..1a8e407a1a53 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -44,5 +44,10 @@ static inline void oom_killer_enable(void)
 {
 	oom_killer_disabled = false;
 }
+
+/* sysctls */
+extern int sysctl_oom_dump_tasks;
+extern int sysctl_oom_kill_allocating_task;
+extern int sysctl_panic_on_oom;
 #endif /* __KERNEL__*/
 #endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6d850bf0a517..6b005e4912b5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -53,6 +53,7 @@
 #include <linux/perf_event.h>
 #include <linux/kprobes.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -85,9 +86,6 @@
 /* External variables not in a header file. */
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
-extern int sysctl_panic_on_oom;
-extern int sysctl_oom_kill_allocating_task;
-extern int sysctl_oom_dump_tasks;
 extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
-- 
cgit v1.2.3-59-g8ed1b


From ff321feac22313cf53ffceb69224b09ac19ff22b Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Mon, 9 Aug 2010 17:18:57 -0700
Subject: mm: rename try_set_zone_oom() to try_set_zonelist_oom()

We have been used naming try_set_zone_oom and clear_zonelist_oom.
The role of functions is to lock of zonelist for preventing parallel
OOM. So clear_zonelist_oom makes sense but try_set_zone_oome is rather
awkward and unmatched with clear_zonelist_oom.

Let's change it with try_set_zonelist_oom.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 2 +-
 mm/oom_kill.c       | 4 ++--
 mm/page_alloc.c     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 1a8e407a1a53..9d7c34a741e7 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -25,7 +25,7 @@ enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
-extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags);
+extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2f37f6113d9e..c29cf00d9084 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -549,7 +549,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
  * if a parallel OOM killing is already taking place that includes a zone in
  * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
  */
-int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 {
 	struct zoneref *z;
 	struct zone *zone;
@@ -566,7 +566,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 		/*
 		 * Lock each zone in the zonelist under zone_scan_lock so a
-		 * parallel invocation of try_set_zone_oom() doesn't succeed
+		 * parallel invocation of try_set_zonelist_oom() doesn't succeed
 		 * when it shouldn't.
 		 */
 		zone_set_flag(zone, ZONE_OOM_LOCKED);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 527f73e4c63f..33c6b4c1277b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1738,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 
 	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zone_oom(zonelist, gfp_mask)) {
+	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From b645bd1286f2fbcd2eb4ab3bed5884f63c42e363 Mon Sep 17 00:00:00 2001
From: Alexander Nevenchannyy <a.nevenchannyy@gmail.com>
Date: Mon, 9 Aug 2010 17:19:00 -0700
Subject: mmzone.h: remove dead prototype

get_zone_counts() was dropped from kernel tree, see:
http://www.mail-archive.com/mm-commits@vger.kernel.org/msg07313.html but
its prototype remains.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b4d109e389b8..9ed9c459b14c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -651,8 +651,6 @@ typedef struct pglist_data {
 #include <linux/memory_hotplug.h>
 
 extern struct mutex zonelists_mutex;
-void get_zone_counts(unsigned long *active, unsigned long *inactive,
-			unsigned long *free);
 void build_all_zonelists(void *data);
 void wakeup_kswapd(struct zone *zone, int order);
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-- 
cgit v1.2.3-59-g8ed1b


From 251060006003b79b788f8ce5a827ee5354a42910 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Mon, 9 Aug 2010 17:19:00 -0700
Subject: topology: alternate fix for ia64 tiger_defconfig build breakage

Define stubs for the numa_*_id() generic percpu related functions for
non-NUMA configurations in <asm-generic/topology.h> where the other
non-numa stubs live.

Fixes ia64 !NUMA build breakage -- e.g., tiger_defconfig

Back out now unneeded '#ifndef CONFIG_NUMA' guards from ia64 smpboot.c

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/Kconfig              |  3 +--
 arch/ia64/kernel/smpboot.c     |  4 ----
 include/asm-generic/topology.h | 20 +++++++++++++++++++-
 include/linux/topology.h       |  4 ----
 4 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 8711d13cd79f..ba22849ee3ec 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -499,8 +499,7 @@ config USE_PERCPU_NUMA_NODE_ID
 	depends on NUMA
 
 config HAVE_MEMORYLESS_NODES
-	def_bool y
-	depends on NUMA
+	def_bool NUMA
 
 config ARCH_PROC_KCORE_TEXT
 	def_bool y
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 99dcc85193c9..1d85d8cfaa7d 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -390,13 +390,11 @@ smp_callin (void)
 
 	fix_b0_for_bsp();
 
-#ifdef CONFIG_NUMA
 	/*
 	 * numa_node_id() works after this.
 	 */
 	set_numa_node(cpu_to_node_map[cpuid]);
 	set_numa_mem(local_memory_node(cpu_to_node_map[cpuid]));
-#endif
 
 	ipi_call_lock_irq();
 	spin_lock(&vector_lock);
@@ -640,9 +638,7 @@ void __devinit smp_prepare_boot_cpu(void)
 {
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), cpu_callin_map);
-#ifdef CONFIG_NUMA
 	set_numa_node(cpu_to_node_map[smp_processor_id()]);
-#endif
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 	paravirt_post_smp_prepare_boot_cpu();
 }
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index fd60700503c8..fc824e2828f3 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -5,7 +5,7 @@
  *
  * Copyright (C) 2002, IBM Corp.
  *
- * All rights reserved.          
+ * All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,9 +34,16 @@
 #ifndef cpu_to_node
 #define cpu_to_node(cpu)	((void)(cpu),0)
 #endif
+#ifndef set_numa_node
+#define set_numa_node(node)
+#endif
+#ifndef set_cpu_numa_node
+#define set_cpu_numa_node(cpu, node)
+#endif
 #ifndef cpu_to_mem
 #define cpu_to_mem(cpu)		((void)(cpu),0)
 #endif
+
 #ifndef parent_node
 #define parent_node(node)	((void)(node),0)
 #endif
@@ -55,4 +62,15 @@
 
 #endif	/* CONFIG_NUMA */
 
+#if !defined(CONFIG_NUMA) || !defined(CONFIG_HAVE_MEMORYLESS_NODES)
+
+#ifndef set_numa_mem
+#define set_numa_mem(node)
+#endif
+#ifndef set_cpu_numa_mem
+#define set_cpu_numa_mem(cpu, node)
+#endif
+
+#endif	/* !CONFIG_NUMA || !CONFIG_HAVE_MEMORYLESS_NODES */
+
 #endif /* _ASM_GENERIC_TOPOLOGY_H */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index b572e432d2f3..64e084ff5e5c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -292,10 +292,6 @@ static inline void set_cpu_numa_mem(int cpu, int node)
 
 #else	/* !CONFIG_HAVE_MEMORYLESS_NODES */
 
-static inline void set_numa_mem(int node) {}
-
-static inline void set_cpu_numa_mem(int cpu, int node) {}
-
 #ifndef numa_mem_id
 /* Returns the number of the nearest Node with memory */
 static inline int numa_mem_id(void)
-- 
cgit v1.2.3-59-g8ed1b


From 627295e492638936e76f3d8fcb1e0a3367b88341 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 9 Aug 2010 17:19:02 -0700
Subject: gcc-4.6: pagemap: avoid unused-but-set variable

Avoid quite a lot of warnings in header files in a gcc 4.6 -Wall builds

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3c62ed408492..78a702ce4fcb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -423,8 +423,10 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size)
 		const char __user *end = uaddr + size - 1;
 
 		if (((unsigned long)uaddr & PAGE_MASK) !=
-				((unsigned long)end & PAGE_MASK))
+				((unsigned long)end & PAGE_MASK)) {
 		 	ret = __get_user(c, end);
+			(void)c;
+		}
 	}
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 4e60c86bd9e5a7110ed28874d0b6592186550ae8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 9 Aug 2010 17:19:03 -0700
Subject: gcc-4.6: mm: fix unused but set warnings

No real bugs, just some dead code and some fixups.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/pgtable_64.h | 4 ++--
 include/linux/highmem.h           | 6 +++++-
 include/linux/mmdebug.h           | 2 +-
 mm/filemap.c                      | 2 --
 mm/memory.c                       | 2 --
 mm/slab.c                         | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 181be528c612..076052cd62be 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 /* x86-64 always has all page tables mapped. */
 #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
 #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
-#define pte_unmap(pte) /* NOP */
-#define pte_unmap_nested(pte) /* NOP */
+#define pte_unmap(pte) ((void)(pte))/* NOP */
+#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */
 
 #define update_mmu_cache(vma, address, ptep) do { } while (0)
 
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 67460f010224..e3060ef85b6d 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -73,7 +73,11 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx)
 }
 #define kmap_atomic_prot(page, idx, prot)	kmap_atomic(page, idx)
 
-#define kunmap_atomic_notypecheck(addr, idx)	do { pagefault_enable(); } while (0)
+static inline void kunmap_atomic_notypecheck(void *addr, enum km_type idx)
+{
+	pagefault_enable();
+}
+
 #define kmap_atomic_pfn(pfn, idx)	kmap_atomic(pfn_to_page(pfn), (idx))
 #define kmap_atomic_to_page(ptr)	virt_to_page(ptr)
 
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index ee24ef8ab616..c04ecfe03f7f 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -4,7 +4,7 @@
 #ifdef CONFIG_DEBUG_VM
 #define VM_BUG_ON(cond) BUG_ON(cond)
 #else
-#define VM_BUG_ON(cond) do { } while (0)
+#define VM_BUG_ON(cond) do { (void)(cond); } while (0)
 #endif
 
 #ifdef CONFIG_DEBUG_VIRTUAL
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9f..3d4df44e4221 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
 
 	do {
 		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
 		unsigned long offset;	/* Offset into pagecache page */
 		unsigned long bytes;	/* Bytes to write to page */
 		size_t copied;		/* Bytes copied from user */
 		void *fsdata;
 
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
 						iov_iter_count(i));
 
diff --git a/mm/memory.c b/mm/memory.c
index bde42c6d3633..6b0c37dcfd16 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long start;
 
 	/*
 	 * The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
 	if (addr > end - 1)
 		return;
 
-	start = addr;
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
diff --git a/mm/slab.c b/mm/slab.c
index 736e497733d6..88435fcc8387 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -394,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
 #define	STATS_DEC_ACTIVE(x)	do { } while (0)
 #define	STATS_INC_ALLOCED(x)	do { } while (0)
 #define	STATS_INC_GROWN(x)	do { } while (0)
-#define	STATS_ADD_REAPED(x,y)	do { } while (0)
+#define	STATS_ADD_REAPED(x,y)	do { (void)(y); } while (0)
 #define	STATS_SET_HIGH(x)	do { } while (0)
 #define	STATS_INC_ERR(x)	do { } while (0)
 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
-- 
cgit v1.2.3-59-g8ed1b


From 27f5e0f694fd0600274a76854636c0749e3bb1f6 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Mon, 9 Aug 2010 17:19:04 -0700
Subject: tmpfs: add accurate compare function to percpu_counter library

Add percpu_counter_compare that allows for a quick but accurate comparison
of percpu_counter with a given value.

A rough count is provided by the count field in percpu_counter structure,
without accounting for the other values stored in individual cpu counters.

The actual count is a sum of count and the cpu counters.  However, count
field is never different from the actual value by a factor of
batch*num_online_cpu.  We do not need to get actual count for comparison
if count is different from the given value by this factor and allows for
quick comparison without summing up all the per cpu counters.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/percpu_counter.h | 11 +++++++++++
 lib/percpu_counter.c           | 27 +++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index c88d67b59394..8a7d510ffa9c 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -40,6 +40,7 @@ void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
+int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -98,6 +99,16 @@ static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 	fbc->count = amount;
 }
 
+static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
+{
+	if (fbc->count > rhs)
+		return 1;
+	else if (fbc->count < rhs)
+		return -1;
+	else
+		return 0;
+}
+
 static inline void
 percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index aeaa6d734447..ec9048e74f44 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -137,6 +137,33 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+/*
+ * Compare counter against given value.
+ * Return 1 if greater, 0 if equal and -1 if less
+ */
+int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
+{
+	s64	count;
+
+	count = percpu_counter_read(fbc);
+	/* Check to see if rough count will be sufficient for comparison */
+	if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
+		if (count > rhs)
+			return 1;
+		else
+			return -1;
+	}
+	/* Need to use precise count */
+	count = percpu_counter_sum(fbc);
+	if (count > rhs)
+		return 1;
+	else if (count < rhs)
+		return -1;
+	else
+		return 0;
+}
+EXPORT_SYMBOL(percpu_counter_compare);
+
 static int __init percpu_counter_startup(void)
 {
 	compute_batch_value();
-- 
cgit v1.2.3-59-g8ed1b


From 7e496299d4d2ad8083effed6c5a18313a919edc6 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Mon, 9 Aug 2010 17:19:05 -0700
Subject: tmpfs: make tmpfs scalable with percpu_counter for used blocks

The current implementation of tmpfs is not scalable.  We found that
stat_lock is contended by multiple threads when we need to get a new page,
leading to useless spinning inside this spin lock.

This patch makes use of the percpu_counter library to maintain local count
of used blocks to speed up getting and returning of pages.  So the
acquisition of stat_lock is unnecessary for getting and returning blocks,
improving the performance of tmpfs on system with large number of cpus.
On a 4 socket 32 core NHM-EX system, we saw improvement of 270%.

The implementation below has a slight chance of race between threads
causing a slight overshoot of the maximum configured blocks.  However, any
overshoot is small, and is bounded by the number of cpus.  This happens
when the number of used blocks is slightly below the maximum configured
blocks when a thread checks the used block count, and another thread
allocates the last block before the current thread does.  This should not
be a problem for tmpfs, as the overshoot is most likely to be a few blocks
and bounded.  If a strict limit is really desired, then configured the max
blocks to be the limit less the number of cpus in system.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shmem_fs.h |  3 ++-
 mm/shmem.c               | 40 +++++++++++++++++-----------------------
 2 files changed, 19 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index e164291fb3e7..399be5ad2f99 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -3,6 +3,7 @@
 
 #include <linux/swap.h>
 #include <linux/mempolicy.h>
+#include <linux/percpu_counter.h>
 
 /* inode in-kernel data */
 
@@ -23,7 +24,7 @@ struct shmem_inode_info {
 
 struct shmem_sb_info {
 	unsigned long max_blocks;   /* How many blocks are allowed */
-	unsigned long free_blocks;  /* How many are left for allocation */
+	struct percpu_counter used_blocks;  /* How many are allocated */
 	unsigned long max_inodes;   /* How many inodes are allowed */
 	unsigned long free_inodes;  /* How many are left for allocation */
 	spinlock_t stat_lock;	    /* Serialize shmem_sb_info changes */
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db5..0618fdad406c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/percpu_counter.h>
 #include <linux/swap.h>
 
 static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	if (sbinfo->max_blocks) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_blocks += pages;
+		percpu_counter_add(&sbinfo->used_blocks, -pages);
+		spin_lock(&inode->i_lock);
 		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
-		spin_unlock(&sbinfo->stat_lock);
+		spin_unlock(&inode->i_lock);
 	}
 }
 
@@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 		if (sgp == SGP_READ)
 			return shmem_swp_map(ZERO_PAGE(0));
 		/*
-		 * Test free_blocks against 1 not 0, since we have 1 data
+		 * Test used_blocks against 1 less max_blocks, since we have 1 data
 		 * page (and perhaps indirect index pages) yet to allocate:
 		 * a waste to allocate index if we cannot allocate data.
 		 */
 		if (sbinfo->max_blocks) {
-			spin_lock(&sbinfo->stat_lock);
-			if (sbinfo->free_blocks <= 1) {
-				spin_unlock(&sbinfo->stat_lock);
+			if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
 				return ERR_PTR(-ENOSPC);
-			}
-			sbinfo->free_blocks--;
+			percpu_counter_inc(&sbinfo->used_blocks);
+			spin_lock(&inode->i_lock);
 			inode->i_blocks += BLOCKS_PER_PAGE;
-			spin_unlock(&sbinfo->stat_lock);
+			spin_unlock(&inode->i_lock);
 		}
 
 		spin_unlock(&info->lock);
@@ -1387,17 +1386,16 @@ repeat:
 		shmem_swp_unmap(entry);
 		sbinfo = SHMEM_SB(inode->i_sb);
 		if (sbinfo->max_blocks) {
-			spin_lock(&sbinfo->stat_lock);
-			if (sbinfo->free_blocks == 0 ||
+			if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
 			    shmem_acct_block(info->flags)) {
-				spin_unlock(&sbinfo->stat_lock);
 				spin_unlock(&info->lock);
 				error = -ENOSPC;
 				goto failed;
 			}
-			sbinfo->free_blocks--;
+			percpu_counter_inc(&sbinfo->used_blocks);
+			spin_lock(&inode->i_lock);
 			inode->i_blocks += BLOCKS_PER_PAGE;
-			spin_unlock(&sbinfo->stat_lock);
+			spin_unlock(&inode->i_lock);
 		} else if (shmem_acct_block(info->flags)) {
 			spin_unlock(&info->lock);
 			error = -ENOSPC;
@@ -1791,17 +1789,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_type = TMPFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_namelen = NAME_MAX;
-	spin_lock(&sbinfo->stat_lock);
 	if (sbinfo->max_blocks) {
 		buf->f_blocks = sbinfo->max_blocks;
-		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+		buf->f_bavail = buf->f_bfree =
+				sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
 	}
 	if (sbinfo->max_inodes) {
 		buf->f_files = sbinfo->max_inodes;
 		buf->f_ffree = sbinfo->free_inodes;
 	}
 	/* else leave those fields 0 like simple_statfs */
-	spin_unlock(&sbinfo->stat_lock);
 	return 0;
 }
 
@@ -2242,7 +2239,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	struct shmem_sb_info config = *sbinfo;
-	unsigned long blocks;
 	unsigned long inodes;
 	int error = -EINVAL;
 
@@ -2250,9 +2246,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 		return error;
 
 	spin_lock(&sbinfo->stat_lock);
-	blocks = sbinfo->max_blocks - sbinfo->free_blocks;
 	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
-	if (config.max_blocks < blocks)
+	if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
 		goto out;
 	if (config.max_inodes < inodes)
 		goto out;
@@ -2269,7 +2264,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	error = 0;
 	sbinfo->max_blocks  = config.max_blocks;
-	sbinfo->free_blocks = config.max_blocks - blocks;
 	sbinfo->max_inodes  = config.max_inodes;
 	sbinfo->free_inodes = config.max_inodes - inodes;
 
@@ -2344,7 +2338,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 #endif
 
 	spin_lock_init(&sbinfo->stat_lock);
-	sbinfo->free_blocks = sbinfo->max_blocks;
+	percpu_counter_init(&sbinfo->used_blocks, 0);
 	sbinfo->free_inodes = sbinfo->max_inodes;
 
 	sb->s_maxbytes = SHMEM_MAX_BYTES;
-- 
cgit v1.2.3-59-g8ed1b


From ba6f0ff3981e6263ab81ac512f04cca55b85ec81 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Mon, 9 Aug 2010 17:19:08 -0700
Subject: ksm: fix ksm swapin time optimization

The new anon-vma code, was suboptimal and it lead to erratic invocation of
ksm_does_need_to_copy.  That leads to host hangs or guest vnc lockup, or
weird behavior.  It's unclear why ksm_does_need_to_copy is unstable but
the point is that when KSM is not in use, ksm_does_need_to_copy must never
run or we bounce pages for no good reason.  I suspect the same hangs will
happen with KVM swaps.  But this at least fixes the regression in the
new-anon-vma code and it only let KSM bugs triggers when KSM is in use.

The code in do_swap_page likely doesn't cope well with a not-swapcache,
especially the memcg code.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Izik Eidus <ieidus@yahoo.com>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ksm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 43bdab769fc3..74d691ee9121 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -78,7 +78,7 @@ static inline struct page *ksm_might_need_to_copy(struct page *page,
 	struct anon_vma *anon_vma = page_anon_vma(page);
 
 	if (!anon_vma ||
-	    (anon_vma == vma->anon_vma &&
+	    (anon_vma->root == vma->anon_vma->root &&
 	     page->index == linear_page_index(vma, address)))
 		return page;
 
-- 
cgit v1.2.3-59-g8ed1b


From ebf8aa44beed48cd17893a83d92a4403e5f9d9e2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 9 Aug 2010 17:19:11 -0700
Subject: radix-tree: omplement function radix_tree_range_tag_if_tagged

Implement function for setting one tag if another tag is set for each item
in given range.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h |  4 ++
 lib/radix-tree.c           | 94 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 55ca73cf25e5..a4b00e9cca90 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -192,6 +192,10 @@ unsigned int
 radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
 		unsigned long first_index, unsigned int max_items,
 		unsigned int tag);
+unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
+		unsigned long *first_indexp, unsigned long last_index,
+		unsigned long nr_to_tag,
+		unsigned int fromtag, unsigned int totag);
 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 05da38bcc298..e907858498a6 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -608,6 +608,100 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_tag_get);
 
+/**
+ * radix_tree_range_tag_if_tagged - for each item in given range set given
+ *				   tag if item has another tag set
+ * @root:		radix tree root
+ * @first_indexp:	pointer to a starting index of a range to scan
+ * @last_index:		last index of a range to scan
+ * @nr_to_tag:		maximum number items to tag
+ * @iftag:		tag index to test
+ * @settag:		tag index to set if tested tag is set
+ *
+ * This function scans range of radix tree from first_index to last_index
+ * (inclusive).  For each item in the range if iftag is set, the function sets
+ * also settag. The function stops either after tagging nr_to_tag items or
+ * after reaching last_index.
+ *
+ * The function returns number of leaves where the tag was set and sets
+ * *first_indexp to the first unscanned index.
+ */
+unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
+		unsigned long *first_indexp, unsigned long last_index,
+		unsigned long nr_to_tag,
+		unsigned int iftag, unsigned int settag)
+{
+	unsigned int height = root->height, shift;
+	unsigned long tagged = 0, index = *first_indexp;
+	struct radix_tree_node *open_slots[height], *slot;
+
+	last_index = min(last_index, radix_tree_maxindex(height));
+	if (index > last_index)
+		return 0;
+	if (!nr_to_tag)
+		return 0;
+	if (!root_tag_get(root, iftag)) {
+		*first_indexp = last_index + 1;
+		return 0;
+	}
+	if (height == 0) {
+		*first_indexp = last_index + 1;
+		root_tag_set(root, settag);
+		return 1;
+	}
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = radix_tree_indirect_to_ptr(root->rnode);
+
+	for (;;) {
+		int offset;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		if (!slot->slots[offset])
+			goto next;
+		if (!tag_get(slot, iftag, offset))
+			goto next;
+		tag_set(slot, settag, offset);
+		if (height == 1) {
+			tagged++;
+			goto next;
+		}
+		/* Go down one level */
+		height--;
+		shift -= RADIX_TREE_MAP_SHIFT;
+		open_slots[height] = slot;
+		slot = slot->slots[offset];
+		continue;
+next:
+		/* Go to next item at level determined by 'shift' */
+		index = ((index >> shift) + 1) << shift;
+		if (index > last_index)
+			break;
+		if (tagged >= nr_to_tag)
+			break;
+		while (((index >> shift) & RADIX_TREE_MAP_MASK) == 0) {
+			/*
+			 * We've fully scanned this node. Go up. Because
+			 * last_index is guaranteed to be in the tree, what
+			 * we do below cannot wander astray.
+			 */
+			slot = open_slots[height];
+			height++;
+			shift += RADIX_TREE_MAP_SHIFT;
+		}
+	}
+	/*
+	 * The iftag must have been set somewhere because otherwise
+	 * we would return immediated at the beginning of the function
+	 */
+	root_tag_set(root, settag);
+	*first_indexp = index;
+
+	return tagged;
+}
+EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
+
+
 /**
  *	radix_tree_next_hole    -    find the next hole (not-present entry)
  *	@root:		tree root
-- 
cgit v1.2.3-59-g8ed1b


From f446daaea9d4a420d16c606f755f3689dcb2d0ce Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 9 Aug 2010 17:19:12 -0700
Subject: mm: implement writeback livelock avoidance using page tagging

We try to avoid livelocks of writeback when some steadily creates dirty
pages in a mapping we are writing out.  For memory-cleaning writeback,
using nr_to_write works reasonably well but we cannot really use it for
data integrity writeback.  This patch tries to solve the problem.

The idea is simple: Tag all pages that should be written back with a
special tag (TOWRITE) in the radix tree.  This can be done rather quickly
and thus livelocks should not happen in practice.  Then we start doing the
hard work of locking pages and sending them to disk only for those pages
that have TOWRITE tag set.

Note: Adding new radix tree tag grows radix tree node from 288 to 296
bytes for 32-bit archs and from 552 to 560 bytes for 64-bit archs.
However, the number of slab/slub items per page remains the same (13 and 7
respectively).

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h         |  1 +
 include/linux/radix-tree.h |  2 +-
 mm/page-writeback.c        | 70 +++++++++++++++++++++++++++++++++++-----------
 3 files changed, 55 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e5106e49bd2c..488efec09d14 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -687,6 +687,7 @@ struct block_device {
  */
 #define PAGECACHE_TAG_DIRTY	0
 #define PAGECACHE_TAG_WRITEBACK	1
+#define PAGECACHE_TAG_TOWRITE	2
 
 int mapping_tagged(struct address_space *mapping, int tag);
 
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index a4b00e9cca90..634b8e674ac5 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -55,7 +55,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 
 /*** radix-tree API starts here ***/
 
-#define RADIX_TREE_MAX_TAGS 2
+#define RADIX_TREE_MAX_TAGS 3
 
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef61548..df8202ebc7b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -804,6 +804,41 @@ void __init page_writeback_init(void)
 	prop_descriptor_init(&vm_dirties, shift);
 }
 
+/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ */
+#define WRITEBACK_TAG_BATCH 4096
+void tag_pages_for_writeback(struct address_space *mapping,
+			     pgoff_t start, pgoff_t end)
+{
+	unsigned long tagged;
+
+	do {
+		spin_lock_irq(&mapping->tree_lock);
+		tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+				&start, end, WRITEBACK_TAG_BATCH,
+				PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+		spin_unlock_irq(&mapping->tree_lock);
+		WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+		cond_resched();
+	} while (tagged >= WRITEBACK_TAG_BATCH);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
@@ -818,6 +853,13 @@ void __init page_writeback_init(void)
  * the call was made get new I/O started against them.  If wbc->sync_mode is
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
  */
 int write_cache_pages(struct address_space *mapping,
 		      struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +875,7 @@ int write_cache_pages(struct address_space *mapping,
 	pgoff_t done_index;
 	int cycled;
 	int range_whole = 0;
+	int tag;
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
@@ -849,29 +892,19 @@ int write_cache_pages(struct address_space *mapping,
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
-
-		/*
-		 * If this is a data integrity sync, cap the writeback to the
-		 * current end of file. Any extension to the file that occurs
-		 * after this is a new write and we don't need to write those
-		 * pages out to fulfil our data integrity requirements. If we
-		 * try to write them out, we can get stuck in this scan until
-		 * the concurrent writer stops adding dirty pages and extending
-		 * EOF.
-		 */
-		if (wbc->sync_mode == WB_SYNC_ALL &&
-		    wbc->range_end == LLONG_MAX) {
-			end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
-		}
 	}
-
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
 retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && (index <= end)) {
 		int i;
 
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			      PAGECACHE_TAG_DIRTY,
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 			break;
@@ -1327,6 +1360,9 @@ int test_set_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+		radix_tree_tag_clear(&mapping->page_tree,
+				     page_index(page),
+				     PAGECACHE_TAG_TOWRITE);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
-- 
cgit v1.2.3-59-g8ed1b


From 25edde0332916ae706ccf83de688be57bcc844b7 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 9 Aug 2010 17:19:27 -0700
Subject: vmscan: kill prev_priority completely

Since 2.6.28 zone->prev_priority is unused. Then it can be removed
safely. It reduce stack usage slightly.

Now I have to say that I'm sorry. 2 years ago, I thought prev_priority
can be integrate again, it's useful. but four (or more) times trying
haven't got good performance number. Thus I give up such approach.

The rest of this changelog is notes on prev_priority and why it existed in
the first place and why it might be not necessary any more. This information
is based heavily on discussions between Andrew Morton, Rik van Riel and
Kosaki Motohiro who is heavily quotes from.

Historically prev_priority was important because it determined when the VM
would start unmapping PTE pages. i.e. there are no balances of note within
the VM, Anon vs File and Mapped vs Unmapped. Without prev_priority, there
is a potential risk of unnecessarily increasing minor faults as a large
amount of read activity of use-once pages could push mapped pages to the
end of the LRU and get unmapped.

There is no proof this is still a problem but currently it is not considered
to be. Active files are not deactivated if the active file list is smaller
than the inactive list reducing the liklihood that file-mapped pages are
being pushed off the LRU and referenced executable pages are kept on the
active list to avoid them getting pushed out by read activity.

Even if it is a problem, prev_priority prev_priority wouldn't works
nowadays. First of all, current vmscan still a lot of UP centric code. it
expose some weakness on some dozens CPUs machine. I think we need more and
more improvement.

The problem is, current vmscan mix up per-system-pressure, per-zone-pressure
and per-task-pressure a bit. example, prev_priority try to boost priority to
other concurrent priority. but if the another task have mempolicy restriction,
it is unnecessary, but also makes wrong big latency and exceeding reclaim.
per-task based priority + prev_priority adjustment make the emulation of
per-system pressure. but it have two issue 1) too rough and brutal emulation
2) we need per-zone pressure, not per-system.

Another example, currently DEF_PRIORITY is 12. it mean the lru rotate about
2 cycle (1/4096 + 1/2048 + 1/1024 + .. + 1) before invoking OOM-Killer.
but if 10,0000 thrreads enter DEF_PRIORITY reclaim at the same time, the
system have higher memory pressure than priority==0 (1/4096*10,000 > 2).
prev_priority can't solve such multithreads workload issue. In other word,
prev_priority concept assume the sysmtem don't have lots threads."

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michael Rubin <mrubin@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  5 ----
 include/linux/mmzone.h     | 15 ------------
 mm/memcontrol.c            | 31 -------------------------
 mm/page_alloc.c            |  2 --
 mm/vmscan.c                | 57 ----------------------------------------------
 mm/vmstat.c                |  2 --
 6 files changed, 112 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9411d32840b0..9f1afd361583 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -98,11 +98,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
 /*
  * For memory reclaim.
  */
-extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem);
-extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
-							int priority);
-extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
-							int priority);
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9ed9c459b14c..6e6e62648a4d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -347,21 +347,6 @@ struct zone {
 	/* Zone statistics */
 	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
 
-	/*
-	 * prev_priority holds the scanning priority for this zone.  It is
-	 * defined as the scanning priority at which we achieved our reclaim
-	 * target at the previous try_to_free_pages() or balance_pgdat()
-	 * invocation.
-	 *
-	 * We use prev_priority as a measure of how much stress page reclaim is
-	 * under - it drives the swappiness decision: whether to unmap mapped
-	 * pages.
-	 *
-	 * Access to both this field is quite racy even on uniprocessor.  But
-	 * it is expected to average out OK.
-	 */
-	int prev_priority;
-
 	/*
 	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
 	 * this zone's LRU.  Maintained by the pageout code.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..31abd1c2c0c5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -211,8 +211,6 @@ struct mem_cgroup {
 	*/
 	spinlock_t reclaim_param_lock;
 
-	int	prev_priority;	/* for recording reclaim priority */
-
 	/*
 	 * While reclaiming in a hierarchy, we cache the last child we
 	 * reclaimed from.
@@ -858,35 +856,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	return ret;
 }
 
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
-	int prev_priority;
-
-	spin_lock(&mem->reclaim_param_lock);
-	prev_priority = mem->prev_priority;
-	spin_unlock(&mem->reclaim_param_lock);
-
-	return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	if (priority < mem->prev_priority)
-		mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
 {
 	unsigned long active;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 33c6b4c1277b..a9649f4b261e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4100,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone_seqlock_init(zone);
 		zone->zone_pgdat = pgdat;
 
-		zone->prev_priority = DEF_PRIORITY;
-
 		zone_pcp_init(zone);
 		for_each_lru(l) {
 			INIT_LIST_HEAD(&zone->lru[l].list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7a4e6a3cf89..594eba8a44c0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1289,20 +1289,6 @@ done:
 	return nr_reclaimed;
 }
 
-/*
- * We are about to scan this zone at a certain priority level.  If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone.  This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
-	if (priority < zone->prev_priority)
-		zone->prev_priority = priority;
-}
-
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -1766,17 +1752,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 		if (scanning_global_lru(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			note_zone_scanning_priority(zone, priority);
-
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;	/* Let kswapd poll it */
-		} else {
-			/*
-			 * Ignore cpuset limitation here. We just want to reduce
-			 * # of used pages by us regardless of memory shortage.
-			 */
-			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
-							priority);
 		}
 
 		shrink_zone(priority, zone, sc);
@@ -1877,17 +1854,6 @@ out:
 	if (priority < 0)
 		priority = 0;
 
-	if (scanning_global_lru(sc)) {
-		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-				continue;
-
-			zone->prev_priority = priority;
-		}
-	} else
-		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
-
 	delayacct_freepages_end();
 	put_mems_allowed();
 
@@ -2053,22 +2019,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.order = order,
 		.mem_cgroup = NULL,
 	};
-	/*
-	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to
-	 * free_pages == high_wmark_pages(zone).
-	 */
-	int temp_priority[MAX_NR_ZONES];
-
 loop_again:
 	total_scanned = 0;
 	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
-	for (i = 0; i < pgdat->nr_zones; i++)
-		temp_priority[i] = DEF_PRIORITY;
-
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
@@ -2136,9 +2092,7 @@ loop_again:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
-			note_zone_scanning_priority(zone, priority);
 
 			nid = pgdat->node_id;
 			zid = zone_idx(zone);
@@ -2211,16 +2165,6 @@ loop_again:
 			break;
 	}
 out:
-	/*
-	 * Note within each zone the priority level at which this zone was
-	 * brought into a happy state.  So that the next thread which scans this
-	 * zone will start out at that priority level.
-	 */
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		zone->prev_priority = temp_priority[i];
-	}
 	if (!all_zones_ok) {
 		cond_resched();
 
@@ -2639,7 +2583,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 */
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
-			note_zone_scanning_priority(zone, priority);
 			shrink_zone(priority, zone, &sc);
 			priority--;
 		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 15a14b16e176..f389168f9a83 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	}
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
-		   "\n  prev_priority:     %i"
 		   "\n  start_pfn:         %lu"
 		   "\n  inactive_ratio:    %u",
 		   zone->all_unreclaimable,
-		   zone->prev_priority,
 		   zone->zone_start_pfn,
 		   zone->inactive_ratio);
 	seq_putc(m, '\n');
-- 
cgit v1.2.3-59-g8ed1b


From 74bcbf40546bb7500f2a7ba4ff3cc056a6bd004a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 9 Aug 2010 17:19:43 -0700
Subject: oom: move badness() declaration into oom.h

Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c      | 3 ---
 include/linux/oom.h | 6 ++++++
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index fc23f62bb0b8..5949d0ac30f2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -427,9 +427,6 @@ static const struct file_operations proc_lstats_operations = {
 
 #endif
 
-/* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
-		      nodemask_t *nodemask, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points = 0;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 9d7c34a741e7..40e5e3a6bc20 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -14,6 +14,8 @@
 
 struct zonelist;
 struct notifier_block;
+struct mem_cgroup;
+struct task_struct;
 
 /*
  * Types of limitations to the nodes from which allocations may occur
@@ -45,6 +47,10 @@ static inline void oom_killer_enable(void)
 	oom_killer_disabled = false;
 }
 
+/* The badness from the OOM killer */
+extern unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
+		      const nodemask_t *nodemask, unsigned long uptime);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
-- 
cgit v1.2.3-59-g8ed1b


From a63d83f427fbce97a6cea0db2e64b0eb8435cd10 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 9 Aug 2010 17:19:46 -0700
Subject: oom: badness heuristic rewrite

This a complete rewrite of the oom killer's badness() heuristic which is
used to determine which task to kill in oom conditions.  The goal is to
make it as simple and predictable as possible so the results are better
understood and we end up killing the task which will lead to the most
memory freeing while still respecting the fine-tuning from userspace.

Instead of basing the heuristic on mm->total_vm for each task, the task's
rss and swap space is used instead.  This is a better indication of the
amount of memory that will be freeable if the oom killed task is chosen
and subsequently exits.  This helps specifically in cases where KDE or
GNOME is chosen for oom kill on desktop systems instead of a memory
hogging task.

The baseline for the heuristic is a proportion of memory that each task is
currently using in memory plus swap compared to the amount of "allowable"
memory.  "Allowable," in this sense, means the system-wide resources for
unconstrained oom conditions, the set of mempolicy nodes, the mems
attached to current's cpuset, or a memory controller's limit.  The
proportion is given on a scale of 0 (never kill) to 1000 (always kill),
roughly meaning that if a task has a badness() score of 500 that the task
consumes approximately 50% of allowable memory resident in RAM or in swap
space.

The proportion is always relative to the amount of "allowable" memory and
not the total amount of RAM systemwide so that mempolicies and cpusets may
operate in isolation; they shall not need to know the true size of the
machine on which they are running if they are bound to a specific set of
nodes or mems, respectively.

Root tasks are given 3% extra memory just like __vm_enough_memory()
provides in LSMs.  In the event of two tasks consuming similar amounts of
memory, it is generally better to save root's task.

Because of the change in the badness() heuristic's baseline, it is also
necessary to introduce a new user interface to tune it.  It's not possible
to redefine the meaning of /proc/pid/oom_adj with a new scale since the
ABI cannot be changed for backward compatability.  Instead, a new tunable,
/proc/pid/oom_score_adj, is added that ranges from -1000 to +1000.  It may
be used to polarize the heuristic such that certain tasks are never
considered for oom kill while others may always be considered.  The value
is added directly into the badness() score so a value of -500, for
example, means to discount 50% of its memory consumption in comparison to
other tasks either on the system, bound to the mempolicy, in the cpuset,
or sharing the same memory controller.

/proc/pid/oom_adj is changed so that its meaning is rescaled into the
units used by /proc/pid/oom_score_adj, and vice versa.  Changing one of
these per-task tunables will rescale the value of the other to an
equivalent meaning.  Although /proc/pid/oom_adj was originally defined as
a bitshift on the badness score, it now shares the same linear growth as
/proc/pid/oom_score_adj but with different granularity.  This is required
so the ABI is not broken with userspace applications and allows oom_adj to
be deprecated for future removal.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  94 ++++++++------
 fs/proc/base.c                     |  94 +++++++++++++-
 include/linux/memcontrol.h         |   8 ++
 include/linux/oom.h                |  14 +-
 include/linux/sched.h              |   3 +-
 kernel/fork.c                      |   1 +
 mm/memcontrol.c                    |  18 +++
 mm/oom_kill.c                      | 259 ++++++++++++++++---------------------
 8 files changed, 300 insertions(+), 191 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 8fe8895894d8..cf1295c2bb66 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -33,7 +33,8 @@ Table of Contents
   2	Modifying System Parameters
 
   3	Per-Process Parameters
-  3.1	/proc/<pid>/oom_adj - Adjust the oom-killer score
+  3.1	/proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer
+								score
   3.2	/proc/<pid>/oom_score - Display current oom-killer score
   3.3	/proc/<pid>/io - Display the IO accounting fields
   3.4	/proc/<pid>/coredump_filter - Core dump filtering settings
@@ -1234,42 +1235,61 @@ of the kernel.
 CHAPTER 3: PER-PROCESS PARAMETERS
 ------------------------------------------------------------------------------
 
-3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
-------------------------------------------------------
-
-This file can be used to adjust the score used to select which processes
-should be killed in an  out-of-memory  situation.  Giving it a high score will
-increase the likelihood of this process being killed by the oom-killer.  Valid
-values are in the range -16 to +15, plus the special value -17, which disables
-oom-killing altogether for this process.
-
-The process to be killed in an out-of-memory situation is selected among all others
-based on its badness score. This value equals the original memory size of the process
-and is then updated according to its CPU time (utime + stime) and the
-run time (uptime - start time). The longer it runs the smaller is the score.
-Badness score is divided by the square root of the CPU time and then by
-the double square root of the run time.
-
-Swapped out tasks are killed first. Half of each child's memory size is added to
-the parent's score if they do not share the same memory. Thus forking servers
-are the prime candidates to be killed. Having only one 'hungry' child will make
-parent less preferable than the child.
-
-/proc/<pid>/oom_score shows process' current badness score.
-
-The following heuristics are then applied:
- * if the task was reniced, its score doubles
- * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE
- 	or CAP_SYS_RAWIO) have their score divided by 4
- * if oom condition happened in one cpuset and checked process does not belong
- 	to it, its score is divided by 8
- * the resulting score is multiplied by two to the power of oom_adj, i.e.
-	points <<= oom_adj when it is positive and
-	points >>= -(oom_adj) otherwise
-
-The task with the highest badness score is then selected and its children
-are killed, process itself will be killed in an OOM situation when it does
-not have children or some of them disabled oom like described above.
+3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score
+--------------------------------------------------------------------------------
+
+These file can be used to adjust the badness heuristic used to select which
+process gets killed in out of memory conditions.
+
+The badness heuristic assigns a value to each candidate task ranging from 0
+(never kill) to 1000 (always kill) to determine which process is targeted.  The
+units are roughly a proportion along that range of allowed memory the process
+may allocate from based on an estimation of its current memory and swap use.
+For example, if a task is using all allowed memory, its badness score will be
+1000.  If it is using half of its allowed memory, its score will be 500.
+
+There is an additional factor included in the badness score: root
+processes are given 3% extra memory over other tasks.
+
+The amount of "allowed" memory depends on the context in which the oom killer
+was called.  If it is due to the memory assigned to the allocating task's cpuset
+being exhausted, the allowed memory represents the set of mems assigned to that
+cpuset.  If it is due to a mempolicy's node(s) being exhausted, the allowed
+memory represents the set of mempolicy nodes.  If it is due to a memory
+limit (or swap limit) being reached, the allowed memory is that configured
+limit.  Finally, if it is due to the entire system being out of memory, the
+allowed memory represents all allocatable resources.
+
+The value of /proc/<pid>/oom_score_adj is added to the badness score before it
+is used to determine which task to kill.  Acceptable values range from -1000
+(OOM_SCORE_ADJ_MIN) to +1000 (OOM_SCORE_ADJ_MAX).  This allows userspace to
+polarize the preference for oom killing either by always preferring a certain
+task or completely disabling it.  The lowest possible value, -1000, is
+equivalent to disabling oom killing entirely for that task since it will always
+report a badness score of 0.
+
+Consequently, it is very simple for userspace to define the amount of memory to
+consider for each task.  Setting a /proc/<pid>/oom_score_adj value of +500, for
+example, is roughly equivalent to allowing the remainder of tasks sharing the
+same system, cpuset, mempolicy, or memory controller resources to use at least
+50% more memory.  A value of -500, on the other hand, would be roughly
+equivalent to discounting 50% of the task's allowed memory from being considered
+as scoring against the task.
+
+For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also
+be used to tune the badness score.  Its acceptable values range from -16
+(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17
+(OOM_DISABLE) to disable oom killing entirely for that task.  Its value is
+scaled linearly with /proc/<pid>/oom_score_adj.
+
+Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
+other with its scaled value.
+
+Caveat: when a parent task is selected, the oom killer will sacrifice any first
+generation children with seperate address spaces instead, if possible.  This
+avoids servers and important system daemons from being killed and loses the
+minimal amount of work.
+
 
 3.2 /proc/<pid>/oom_score - Display current oom-killer score
 -------------------------------------------------------------
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5949d0ac30f2..f923b728388a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -63,6 +63,7 @@
 #include <linux/namei.h>
 #include <linux/mnt_namespace.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
@@ -430,12 +431,11 @@ static const struct file_operations proc_lstats_operations = {
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points = 0;
-	struct timespec uptime;
 
-	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = badness(task, NULL, NULL, uptime.tv_sec);
+		points = oom_badness(task, NULL, NULL,
+					totalram_pages + total_swap_pages);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -1038,7 +1038,15 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	}
 
 	task->signal->oom_adj = oom_adjust;
-
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (task->signal->oom_adj == OOM_ADJUST_MAX)
+		task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
+	else
+		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
+								-OOM_DISABLE;
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 
@@ -1051,6 +1059,82 @@ static const struct file_operations proc_oom_adjust_operations = {
 	.llseek		= generic_file_llseek,
 };
 
+static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_score_adj = OOM_SCORE_ADJ_MIN;
+	unsigned long flags;
+	size_t len;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		oom_score_adj = task->signal->oom_score_adj;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	unsigned long flags;
+	long oom_score_adj;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
+	if (err)
+		return -EINVAL;
+	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			oom_score_adj > OOM_SCORE_ADJ_MAX)
+		return -EINVAL;
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	if (!lock_task_sighand(task, &flags)) {
+		put_task_struct(task);
+		return -ESRCH;
+	}
+	if (oom_score_adj < task->signal->oom_score_adj &&
+			!capable(CAP_SYS_RESOURCE)) {
+		unlock_task_sighand(task, &flags);
+		put_task_struct(task);
+		return -EACCES;
+	}
+
+	task->signal->oom_score_adj = oom_score_adj;
+	/*
+	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
+	 * always attainable.
+	 */
+	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		task->signal->oom_adj = OOM_DISABLE;
+	else
+		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
+							OOM_SCORE_ADJ_MAX;
+	unlock_task_sighand(task, &flags);
+	put_task_struct(task);
+	return count;
+}
+
+static const struct file_operations proc_oom_score_adj_operations = {
+	.read		= oom_score_adj_read,
+	.write		= oom_score_adj_write,
+};
+
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -2623,6 +2707,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 	INF("oom_score",  S_IRUGO, proc_oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
@@ -2957,6 +3042,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUSR, proc_sessionid_operations),
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9f1afd361583..73564cac38c7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -125,6 +125,8 @@ void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
 						int zid);
+u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
@@ -304,6 +306,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	return 0;
 }
 
+static inline
+u64 mem_cgroup_get_limit(struct mem_cgroup *mem)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 40e5e3a6bc20..73b8d7b6dd19 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -1,14 +1,24 @@
 #ifndef __INCLUDE_LINUX_OOM_H
 #define __INCLUDE_LINUX_OOM_H
 
-/* /proc/<pid>/oom_adj set to -17 protects from the oom-killer */
+/*
+ * /proc/<pid>/oom_adj set to -17 protects from the oom-killer
+ */
 #define OOM_DISABLE (-17)
 /* inclusive */
 #define OOM_ADJUST_MIN (-16)
 #define OOM_ADJUST_MAX 15
 
+/*
+ * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for
+ * pid.
+ */
+#define OOM_SCORE_ADJ_MIN	(-1000)
+#define OOM_SCORE_ADJ_MAX	1000
+
 #ifdef __KERNEL__
 
+#include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/nodemask.h>
 
@@ -27,6 +37,8 @@ enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
+extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+			const nodemask_t *nodemask, unsigned long totalpages);
 extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9591907c4f79..ce160d68f5e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -621,7 +621,8 @@ struct signal_struct {
 	struct tty_audit_buf *tty_audit_buf;
 #endif
 
-	int oom_adj;	/* OOM kill score adjustment (bit shift) */
+	int oom_adj;		/* OOM kill score adjustment (bit shift) */
+	int oom_score_adj;	/* OOM kill score adjustment */
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/kernel/fork.c b/kernel/fork.c
index a82a65cef741..98b450876f93 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 
 	sig->oom_adj = current->signal->oom_adj;
+	sig->oom_score_adj = current->signal->oom_score_adj;
 
 	return 0;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 31abd1c2c0c5..de54ea0094a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1126,6 +1126,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
 	return num;
 }
 
+/*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+	u64 limit;
+	u64 memsw;
+
+	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+			total_swap_pages;
+	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+	/*
+	 * If memsw is finite and limits the amount of swap space available
+	 * to this memcg, return that limit.
+	 */
+	return min(limit, memsw);
+}
+
 /*
  * Visit the first child (need not be the first child as per the ordering
  * of the cgroup list, since we track last_scanned_child) of @mem and use
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0a4ca8a0234b..d3def05a33d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
  *  Copyright (C)  1998,2000  Rik van Riel
  *	Thanks go out to Claus Fischer for some serious inspiration and
  *	for goading me into coding this file...
+ *  Copyright (C)  2010  Google, Inc.
+ *	Rewritten by David Rientjes
  *
  *  The routines in this file are used to kill a process when
  *  we're seriously out of memory. This gets called from __alloc_pages()
@@ -34,7 +36,6 @@ int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 static DEFINE_SPINLOCK(zone_scan_lock);
-/* #define DEBUG */
 
 #ifdef CONFIG_NUMA
 /**
@@ -140,137 +141,76 @@ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
 }
 
 /**
- * badness - calculate a numeric value for how bad this task has been
+ * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
- * @uptime: current uptime in seconds
+ * @totalpages: total present RAM allowed for page allocation
  *
- * The formula used is relatively simple and documented inline in the
- * function. The main rationale is that we want to select a good task
- * to kill when we run out of memory.
- *
- * Good in this context means that:
- * 1) we lose the minimum amount of work done
- * 2) we recover a large amount of memory
- * 3) we don't kill anything innocent of eating tons of memory
- * 4) we want to kill the minimum amount of processes (one)
- * 5) we try to kill the process the user expects us to kill, this
- *    algorithm has been meticulously tuned to meet the principle
- *    of least surprise ... (be careful when you change it)
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible.  The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
-		      const nodemask_t *nodemask, unsigned long uptime)
+unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+		      const nodemask_t *nodemask, unsigned long totalpages)
 {
-	unsigned long points, cpu_time, run_time;
-	struct task_struct *child;
-	struct task_struct *c, *t;
-	int oom_adj = p->signal->oom_adj;
-	struct task_cputime task_time;
-	unsigned long utime;
-	unsigned long stime;
+	int points;
 
 	if (oom_unkillable_task(p, mem, nodemask))
 		return 0;
-	if (oom_adj == OOM_DISABLE)
-		return 0;
 
 	p = find_lock_task_mm(p);
 	if (!p)
 		return 0;
 
 	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = p->mm->total_vm;
-	task_unlock(p);
-
-	/*
-	 * swapoff can easily use up all memory, so kill those first.
-	 */
-	if (p->flags & PF_OOM_ORIGIN)
-		return ULONG_MAX;
-
-	/*
-	 * Processes which fork a lot of child processes are likely
-	 * a good choice. We add half the vmsize of the children if they
-	 * have an own mm. This prevents forking servers to flood the
-	 * machine with an endless amount of children. In case a single
-	 * child is eating the vast majority of memory, adding only half
-	 * to the parents will make the child our kill candidate of choice.
+	 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
+	 * need to be executed for something that cannot be killed.
 	 */
-	t = p;
-	do {
-		list_for_each_entry(c, &t->children, sibling) {
-			child = find_lock_task_mm(c);
-			if (child) {
-				if (child->mm != p->mm)
-					points += child->mm->total_vm/2 + 1;
-				task_unlock(child);
-			}
-		}
-	} while_each_thread(p, t);
+	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+		task_unlock(p);
+		return 0;
+	}
 
 	/*
-	 * CPU time is in tens of seconds and run time is in thousands
-         * of seconds. There is no particular reason for this other than
-         * that it turned out to work very well in practice.
+	 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
+	 * priority for oom killing.
 	 */
-	thread_group_cputime(p, &task_time);
-	utime = cputime_to_jiffies(task_time.utime);
-	stime = cputime_to_jiffies(task_time.stime);
-	cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
-
-
-	if (uptime >= p->start_time.tv_sec)
-		run_time = (uptime - p->start_time.tv_sec) >> 10;
-	else
-		run_time = 0;
-
-	if (cpu_time)
-		points /= int_sqrt(cpu_time);
-	if (run_time)
-		points /= int_sqrt(int_sqrt(run_time));
+	if (p->flags & PF_OOM_ORIGIN) {
+		task_unlock(p);
+		return 1000;
+	}
 
 	/*
-	 * Niced processes are most likely less important, so double
-	 * their badness points.
+	 * The memory controller may have a limit of 0 bytes, so avoid a divide
+	 * by zero, if necessary.
 	 */
-	if (task_nice(p) > 0)
-		points *= 2;
+	if (!totalpages)
+		totalpages = 1;
 
 	/*
-	 * Superuser processes are usually more important, so we make it
-	 * less likely that we kill those.
+	 * The baseline for the badness score is the proportion of RAM that each
+	 * task's rss and swap space use.
 	 */
-	if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
-	    has_capability_noaudit(p, CAP_SYS_RESOURCE))
-		points /= 4;
+	points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
+			totalpages;
+	task_unlock(p);
 
 	/*
-	 * We don't want to kill a process with direct hardware access.
-	 * Not only could that mess up the hardware, but usually users
-	 * tend to only have this flag set on applications they think
-	 * of as important.
+	 * Root processes get 3% bonus, just like the __vm_enough_memory()
+	 * implementation used by LSMs.
 	 */
-	if (has_capability_noaudit(p, CAP_SYS_RAWIO))
-		points /= 4;
+	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
+		points -= 30;
 
 	/*
-	 * Adjust the score by oom_adj.
+	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
+	 * either completely disable oom killing or always prefer a certain
+	 * task.
 	 */
-	if (oom_adj) {
-		if (oom_adj > 0) {
-			if (!points)
-				points = 1;
-			points <<= oom_adj;
-		} else
-			points >>= -(oom_adj);
-	}
+	points += p->signal->oom_score_adj;
 
-#ifdef DEBUG
-	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
-	p->pid, p->comm, points);
-#endif
-	return points;
+	if (points < 0)
+		return 0;
+	return (points < 1000) ? points : 1000;
 }
 
 /*
@@ -278,12 +218,20 @@ unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
  */
 #ifdef CONFIG_NUMA
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-				    gfp_t gfp_mask, nodemask_t *nodemask)
+				gfp_t gfp_mask, nodemask_t *nodemask,
+				unsigned long *totalpages)
 {
 	struct zone *zone;
 	struct zoneref *z;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	bool cpuset_limited = false;
+	int nid;
 
+	/* Default to all available memory */
+	*totalpages = totalram_pages + total_swap_pages;
+
+	if (!zonelist)
+		return CONSTRAINT_NONE;
 	/*
 	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
 	 * to kill current.We have to random task kill in this case.
@@ -293,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 		return CONSTRAINT_NONE;
 
 	/*
-	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
-	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
-	 * feature. mempolicy is an only user of nodemask here.
-	 * check mempolicy's nodemask contains all N_HIGH_MEMORY
+	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+	 * the page allocator means a mempolicy is in effect.  Cpuset policy
+	 * is enforced in get_page_from_freelist().
 	 */
-	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
+	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+		*totalpages = total_swap_pages;
+		for_each_node_mask(nid, *nodemask)
+			*totalpages += node_spanned_pages(nid);
 		return CONSTRAINT_MEMORY_POLICY;
+	}
 
 	/* Check this allocation failure is caused by cpuset's wall function */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 			high_zoneidx, nodemask)
 		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
-			return CONSTRAINT_CPUSET;
+			cpuset_limited = true;
 
+	if (cpuset_limited) {
+		*totalpages = total_swap_pages;
+		for_each_node_mask(nid, cpuset_current_mems_allowed)
+			*totalpages += node_spanned_pages(nid);
+		return CONSTRAINT_CPUSET;
+	}
 	return CONSTRAINT_NONE;
 }
 #else
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-				gfp_t gfp_mask, nodemask_t *nodemask)
+				gfp_t gfp_mask, nodemask_t *nodemask,
+				unsigned long *totalpages)
 {
+	*totalpages = totalram_pages + total_swap_pages;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -323,17 +282,16 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned long *ppoints,
-		struct mem_cgroup *mem, const nodemask_t *nodemask)
+static struct task_struct *select_bad_process(unsigned int *ppoints,
+		unsigned long totalpages, struct mem_cgroup *mem,
+		const nodemask_t *nodemask)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
-	struct timespec uptime;
 	*ppoints = 0;
 
-	do_posix_clock_monotonic_gettime(&uptime);
 	for_each_process(p) {
-		unsigned long points;
+		unsigned int points;
 
 		if (oom_unkillable_task(p, mem, nodemask))
 			continue;
@@ -365,11 +323,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 				return ERR_PTR(-1UL);
 
 			chosen = p;
-			*ppoints = ULONG_MAX;
+			*ppoints = 1000;
 		}
 
-		points = badness(p, mem, nodemask, uptime.tv_sec);
-		if (points > *ppoints || !chosen) {
+		points = oom_badness(p, mem, nodemask, totalpages);
+		if (points > *ppoints) {
 			chosen = p;
 			*ppoints = points;
 		}
@@ -384,7 +342,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
  *
  * Dumps the current memory state of all system tasks, excluding kernel threads.
  * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
- * score, and name.
+ * value, oom_score_adj value, and name.
  *
  * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
  * shown.
@@ -396,8 +354,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	printk(KERN_INFO "[ pid ]   uid  tgid total_vm      rss cpu oom_adj "
-	       "name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss cpu oom_adj oom_score_adj name\n");
 	for_each_process(p) {
 		if (p->flags & PF_KTHREAD)
 			continue;
@@ -414,10 +371,11 @@ static void dump_tasks(const struct mem_cgroup *mem)
 			continue;
 		}
 
-		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3u     %3d %s\n",
-		       task->pid, __task_cred(task)->uid, task->tgid,
-		       task->mm->total_vm, get_mm_rss(task->mm),
-		       task_cpu(task), task->signal->oom_adj, task->comm);
+		pr_info("[%5d] %5d %5d %8lu %8lu %3u     %3d         %5d %s\n",
+			task->pid, __task_cred(task)->uid, task->tgid,
+			task->mm->total_vm, get_mm_rss(task->mm),
+			task_cpu(task), task->signal->oom_adj,
+			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
 	}
 }
@@ -427,8 +385,9 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 {
 	task_lock(current);
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-		"oom_adj=%d\n",
-		current->comm, gfp_mask, order, current->signal->oom_adj);
+		"oom_adj=%d, oom_score_adj=%d\n",
+		current->comm, gfp_mask, order, current->signal->oom_adj,
+		current->signal->oom_score_adj);
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
 	dump_stack();
@@ -468,14 +427,14 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 #undef K
 
 static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-			    unsigned long points, struct mem_cgroup *mem,
-			    nodemask_t *nodemask, const char *message)
+			    unsigned int points, unsigned long totalpages,
+			    struct mem_cgroup *mem, nodemask_t *nodemask,
+			    const char *message)
 {
 	struct task_struct *victim = p;
 	struct task_struct *child;
 	struct task_struct *t = p;
-	unsigned long victim_points = 0;
-	struct timespec uptime;
+	unsigned int victim_points = 0;
 
 	if (printk_ratelimit())
 		dump_header(p, gfp_mask, order, mem);
@@ -491,7 +450,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	}
 
 	task_lock(p);
-	pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n",
+	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
 		message, task_pid_nr(p), p->comm, points);
 	task_unlock(p);
 
@@ -501,14 +460,15 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 * parent.  This attempts to lose the minimal amount of work done while
 	 * still freeing memory.
 	 */
-	do_posix_clock_monotonic_gettime(&uptime);
 	do {
 		list_for_each_entry(child, &t->children, sibling) {
-			unsigned long child_points;
+			unsigned int child_points;
 
-			/* badness() returns 0 if the thread is unkillable */
-			child_points = badness(child, mem, nodemask,
-					       uptime.tv_sec);
+			/*
+			 * oom_badness() returns 0 if the thread is unkillable
+			 */
+			child_points = oom_badness(child, mem, nodemask,
+								totalpages);
 			if (child_points > victim_points) {
 				victim = child;
 				victim_points = child_points;
@@ -546,17 +506,19 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 {
-	unsigned long points = 0;
+	unsigned long limit;
+	unsigned int points = 0;
 	struct task_struct *p;
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+	limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem, NULL);
+	p = select_bad_process(&points, limit, mem, NULL);
 	if (!p || PTR_ERR(p) == -1UL)
 		goto out;
 
-	if (oom_kill_process(p, gfp_mask, 0, points, mem, NULL,
+	if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
 				"Memory cgroup out of memory"))
 		goto retry;
 out:
@@ -681,8 +643,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *nodemask)
 {
 	struct task_struct *p;
+	unsigned long totalpages;
 	unsigned long freed = 0;
-	unsigned long points;
+	unsigned int points;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
@@ -705,8 +668,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
-	if (zonelist)
-		constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
+	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+						&totalpages);
 	check_panic_on_oom(constraint, gfp_mask, order);
 
 	read_lock(&tasklist_lock);
@@ -718,14 +681,14 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		 * non-zero, current could not be killed so we must fallback to
 		 * the tasklist scan.
 		 */
-		if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
-				nodemask,
+		if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
+				NULL, nodemask,
 				"Out of memory (oom_kill_allocating_task)"))
 			return;
 	}
 
 retry:
-	p = select_bad_process(&points, NULL,
+	p = select_bad_process(&points, totalpages, NULL,
 			constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
 								 NULL);
 	if (PTR_ERR(p) == -1UL)
@@ -738,8 +701,8 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL, nodemask,
-			     "Out of memory"))
+	if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+				nodemask, "Out of memory"))
 		goto retry;
 	read_unlock(&tasklist_lock);
 
-- 
cgit v1.2.3-59-g8ed1b


From 51b1bd2ace1595b72956224deda349efa880b693 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 9 Aug 2010 17:19:47 -0700
Subject: oom: deprecate oom_adj tunable

/proc/pid/oom_adj is now deprecated so that that it may eventually be
removed.  The target date for removal is August 2012.

A warning will be printed to the kernel log if a task attempts to use this
interface.  Future warning will be suppressed until the kernel is rebooted
to prevent spamming the kernel log.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/feature-removal-schedule.txt | 25 +++++++++++++++++++++++++
 Documentation/filesystems/proc.txt         |  3 +++
 fs/proc/base.c                             |  8 ++++++++
 include/linux/oom.h                        |  3 +++
 4 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 71f0fea1058f..56cee4727b1a 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -151,6 +151,31 @@ Who:	Eric Biederman <ebiederm@xmission.com>
 
 ---------------------------
 
+What:	/proc/<pid>/oom_adj
+When:	August 2012
+Why:	/proc/<pid>/oom_adj allows userspace to influence the oom killer's
+	badness heuristic used to determine which task to kill when the kernel
+	is out of memory.
+
+	The badness heuristic has since been rewritten since the introduction of
+	this tunable such that its meaning is deprecated.  The value was
+	implemented as a bitshift on a score generated by the badness()
+	function that did not have any precise units of measure.  With the
+	rewrite, the score is given as a proportion of available memory to the
+	task allocating pages, so using a bitshift which grows the score
+	exponentially is, thus, impossible to tune with fine granularity.
+
+	A much more powerful interface, /proc/<pid>/oom_score_adj, was
+	introduced with the oom killer rewrite that allows users to increase or
+	decrease the badness() score linearly.  This interface will replace
+	/proc/<pid>/oom_adj.
+
+	A warning will be emitted to the kernel log if an application uses this
+	deprecated interface.  After it is printed once, future warnings will be
+	suppressed until the kernel is rebooted.
+
+---------------------------
+
 What:	remove EXPORT_SYMBOL(kernel_thread)
 When:	August 2006
 Files:	arch/*/kernel/*_ksyms.c
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index cf1295c2bb66..a6aca8740883 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1285,6 +1285,9 @@ scaled linearly with /proc/<pid>/oom_score_adj.
 Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
 other with its scaled value.
 
+NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
+Documentation/feature-removal-schedule.txt.
+
 Caveat: when a parent task is selected, the oom killer will sacrifice any first
 generation children with seperate address spaces instead, if possible.  This
 avoids servers and important system daemons from being killed and loses the
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f923b728388a..69254a365ce2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1037,6 +1037,14 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EACCES;
 	}
 
+	/*
+	 * Warn that /proc/pid/oom_adj is deprecated, see
+	 * Documentation/feature-removal-schedule.txt.
+	 */
+	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, "
+			"please use /proc/%d/oom_score_adj instead.\n",
+			current->comm, task_pid_nr(current),
+			task_pid_nr(task), task_pid_nr(task));
 	task->signal->oom_adj = oom_adjust;
 	/*
 	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 73b8d7b6dd19..f209b683e118 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -2,6 +2,9 @@
 #define __INCLUDE_LINUX_OOM_H
 
 /*
+ * /proc/<pid>/oom_adj is deprecated, see
+ * Documentation/feature-removal-schedule.txt.
+ *
  * /proc/<pid>/oom_adj set to -17 protects from the oom-killer
  */
 #define OOM_DISABLE (-17)
-- 
cgit v1.2.3-59-g8ed1b


From ad8c2ee801ad7a52d919b478d9b2c7b39a72d295 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 9 Aug 2010 17:19:48 -0700
Subject: rmap: add exclusive page to private anon_vma on swapin

On swapin it is fairly common for a page to be owned exclusively by one
process.  In that case we want to add the page to the anon_vma of that
process's VMA, instead of to the root anon_vma.

This will reduce the amount of rmap searching that the swapout code needs
to do.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  2 ++
 mm/memory.c          |  4 +++-
 mm/rmap.c            | 13 ++++++++++++-
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index dc9b3c0bf5d4..d6661de56f30 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -162,6 +162,8 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
  */
 void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
+			   unsigned long, int);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);
diff --git a/mm/memory.c b/mm/memory.c
index 6b0c37dcfd16..6bc039486e9f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2628,6 +2628,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	swp_entry_t entry;
 	pte_t pte;
 	struct mem_cgroup *ptr = NULL;
+	int exclusive = 0;
 	int ret = 0;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2722,10 +2723,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		flags &= ~FAULT_FLAG_WRITE;
+		exclusive = 1;
 	}
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
-	page_add_anon_rmap(page, vma, address);
+	do_page_add_anon_rmap(page, vma, address, exclusive);
 	/* It's better to call commit-charge after rmap is established */
 	mem_cgroup_commit_charge_swapin(page, ptr);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 4d152a6d3a89..a7d0f5482634 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -829,6 +829,17 @@ static void __page_check_anon_rmap(struct page *page,
  */
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
+{
+	do_page_add_anon_rmap(page, vma, address, 0);
+}
+
+/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
 {
 	int first = atomic_inc_and_test(&page->_mapcount);
 	if (first)
@@ -839,7 +850,7 @@ void page_add_anon_rmap(struct page *page,
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (first)
-		__page_set_anon_rmap(page, vma, address, 0);
+		__page_set_anon_rmap(page, vma, address, exclusive);
 	else
 		__page_check_anon_rmap(page, vma, address);
 }
-- 
cgit v1.2.3-59-g8ed1b


From d2997b1042ec150616c1963b5e5e919ffd0b0ebf Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 9 Aug 2010 17:20:11 -0700
Subject: hibernation: freeze swap at hibernation

When taking a memory snapshot in hibernate_snapshot(), all (directly
called) memory allocations use GFP_ATOMIC.  Hence swap misusage during
hibernation never occurs.

But from a pessimistic point of view, there is no guarantee that no page
allcation has __GFP_WAIT.  It is better to have a global indication "we
enter hibernation, don't use swap!".

This patch tries to freeze new-swap-allocation during hibernation.  (All
user processes are frozenm so swapin is not a concern).

This way, no updates will happen to swap_map[] between
hibernate_snapshot() and save_image().  Swap is thawed when swsusp_free()
is called.  We can be assured that swap corruption will not occur.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Hugh Dickins <hughd@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ondrej Zary <linux@rainbow-software.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h     |  8 ++++-
 kernel/power/hibernate.c |  1 +
 kernel/power/snapshot.c  |  1 +
 kernel/power/swap.c      |  6 ++--
 mm/swapfile.c            | 94 ++++++++++++++++++++++++++++++++++++------------
 5 files changed, 84 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ff4acea9bbdb..91c9d3fc8513 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -316,7 +316,6 @@ extern long nr_swap_pages;
 extern long total_swap_pages;
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
-extern swp_entry_t get_swap_page_of_type(int);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
@@ -333,6 +332,13 @@ extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
+#ifdef CONFIG_HIBERNATION
+void hibernation_freeze_swap(void);
+void hibernation_thaw_swap(void);
+swp_entry_t get_swap_for_hibernation(int type);
+void swap_free_for_hibernation(swp_entry_t val);
+#endif
+
 /* linux/mm/thrash.c */
 extern struct mm_struct *swap_token_mm;
 extern void grab_swap_token(struct mm_struct *);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8dc31e02ae12..c77963938bca 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -338,6 +338,7 @@ int hibernation_snapshot(int platform_mode)
 		goto Close;
 
 	suspend_console();
+	hibernation_freeze_swap();
 	saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
 	error = dpm_suspend_start(PMSG_FREEZE);
 	if (error)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f6cd6faf84fd..5e7edfb05e66 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1086,6 +1086,7 @@ void swsusp_free(void)
 	buffer = NULL;
 	alloc_normal = 0;
 	alloc_highmem = 0;
+	hibernation_thaw_swap();
 }
 
 /* Helper functions used for the shrinking of memory. */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e6a5bdf61a37..5d0059eed3e4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap)
 {
 	unsigned long offset;
 
-	offset = swp_offset(get_swap_page_of_type(swap));
+	offset = swp_offset(get_swap_for_hibernation(swap));
 	if (offset) {
 		if (swsusp_extents_insert(offset))
-			swap_free(swp_entry(swap, offset));
+			swap_free_for_hibernation(swp_entry(swap, offset));
 		else
 			return swapdev_block(swap, offset);
 	}
@@ -163,7 +163,7 @@ void free_all_swap_pages(int swap)
 		ext = container_of(node, struct swsusp_extent, node);
 		rb_erase(node, &swsusp_extents);
 		for (offset = ext->start; offset <= ext->end; offset++)
-			swap_free(swp_entry(swap, offset));
+			swap_free_for_hibernation(swp_entry(swap, offset));
 
 		kfree(ext);
 	}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f08d165871b3..1f3f9c59a73a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,6 +47,8 @@ long nr_swap_pages;
 long total_swap_pages;
 static int least_priority;
 
+static bool swap_for_hibernation;
+
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
@@ -451,6 +453,8 @@ swp_entry_t get_swap_page(void)
 	spin_lock(&swap_lock);
 	if (nr_swap_pages <= 0)
 		goto noswap;
+	if (swap_for_hibernation)
+		goto noswap;
 	nr_swap_pages--;
 
 	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -483,28 +487,6 @@ noswap:
 	return (swp_entry_t) {0};
 }
 
-/* The only caller of this function is now susupend routine */
-swp_entry_t get_swap_page_of_type(int type)
-{
-	struct swap_info_struct *si;
-	pgoff_t offset;
-
-	spin_lock(&swap_lock);
-	si = swap_info[type];
-	if (si && (si->flags & SWP_WRITEOK)) {
-		nr_swap_pages--;
-		/* This is called for allocating swap entry, not cache */
-		offset = scan_swap_map(si, 1);
-		if (offset) {
-			spin_unlock(&swap_lock);
-			return swp_entry(type, offset);
-		}
-		nr_swap_pages++;
-	}
-	spin_unlock(&swap_lock);
-	return (swp_entry_t) {0};
-}
-
 static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
@@ -764,6 +746,74 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
 #endif
 
 #ifdef CONFIG_HIBERNATION
+
+static pgoff_t hibernation_offset[MAX_SWAPFILES];
+/*
+ * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
+ * saved swap_map[] image to the disk will be an incomplete because it's
+ * changing without synchronization with hibernation snap shot.
+ * At resume, we just make swap_for_hibernation=false. We can forget
+ * used maps easily.
+ */
+void hibernation_freeze_swap(void)
+{
+	int i;
+
+	spin_lock(&swap_lock);
+
+	printk(KERN_INFO "PM: Freeze Swap\n");
+	swap_for_hibernation = true;
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		hibernation_offset[i] = 1;
+	spin_unlock(&swap_lock);
+}
+
+void hibernation_thaw_swap(void)
+{
+	spin_lock(&swap_lock);
+	if (swap_for_hibernation) {
+		printk(KERN_INFO "PM: Thaw Swap\n");
+		swap_for_hibernation = false;
+	}
+	spin_unlock(&swap_lock);
+}
+
+/*
+ * Because updateing swap_map[] can make not-saved-status-change,
+ * we use our own easy allocator.
+ * Please see kernel/power/swap.c, Used swaps are recorded into
+ * RB-tree.
+ */
+swp_entry_t get_swap_for_hibernation(int type)
+{
+	pgoff_t off;
+	swp_entry_t val = {0};
+	struct swap_info_struct *si;
+
+	spin_lock(&swap_lock);
+
+	si = swap_info[type];
+	if (!si || !(si->flags & SWP_WRITEOK))
+		goto done;
+
+	for (off = hibernation_offset[type]; off < si->max; ++off) {
+		if (!si->swap_map[off])
+			break;
+	}
+	if (off < si->max) {
+		val = swp_entry(type, off);
+		hibernation_offset[type] = off + 1;
+	}
+done:
+	spin_unlock(&swap_lock);
+	return val;
+}
+
+void swap_free_for_hibernation(swp_entry_t ent)
+{
+	/* Nothing to do */
+}
+
 /*
  * Find the swap type that corresponds to given device (if any).
  *
-- 
cgit v1.2.3-59-g8ed1b


From 71abbbf856a0e70ca478782505c800891260ba84 Mon Sep 17 00:00:00 2001
From: Ai Li <aili@codeaurora.org>
Date: Mon, 9 Aug 2010 17:20:13 -0700
Subject: cpuidle: extend cpuidle and menu governor to handle dynamic states

On some SoC chips, HW resources may be in use during any particular idle
period.  As a consequence, the cpuidle states that the SoC is safe to
enter can change from idle period to idle period.  In addition, the
latency and threshold of each cpuidle state can vary, depending on the
operating condition when the CPU becomes idle, e.g.  the current cpu
frequency, the current state of the HW blocks, etc.

cpuidle core and the menu governor, in the current form, are geared
towards cpuidle states that are static, i.e.  the availabiltiy of the
states, their latencies, their thresholds are non-changing during run
time.  cpuidle does not provide any hook that cpuidle drivers can use to
adjust those values on the fly for the current idle period before the menu
governor selects the target cpuidle state.

This patch extends cpuidle core and the menu governor to handle states
that are dynamic.  There are three additions in the patch and the patch
maintains backwards-compatibility with existing cpuidle drivers.

1) add prepare() to struct cpuidle_device.  A cpuidle driver can hook
   into the callback and cpuidle will call prepare() before calling the
   governor's select function.  The callback gives the cpuidle driver a
   chance to update the dynamic information of the cpuidle states for the
   current idle period, e.g.  state availability, latencies, thresholds,
   power values, etc.

2) add CPUIDLE_FLAG_IGNORE as one of the state flags.  In the prepare()
   function, a cpuidle driver can set/clear the flag to indicate to the
   menu governor whether a cpuidle state should be ignored, i.e.  not
   available, during the current idle period.

3) add power_specified bit to struct cpuidle_device.  The menu governor
   currently assumes that the cpuidle states are arranged in the order of
   increasing latency, threshold, and power savings.  This is true or can
   be made true for static states.  Once the state parameters are dynamic,
   the latencies, thresholds, and power savings for the cpuidle states can
   increase or decrease by different amounts from idle period to idle
   period.  So the assumption of increasing latency, threshold, and power
   savings from Cn to C(n+1) can no longer be guaranteed.

It can be straightforward to calculate the power consumption of each
available state and to specify it in power_usage for the idle period.
Using the power_usage fields, the menu governor then selects the state
that has the lowest power consumption and that still satisfies all other
critieria.  The power_specified bit defaults to 0.  For existing cpuidle
drivers, cpuidle detects that power_specified is 0 and fills in a dummy
set of power_usage values.

Signed-off-by: Ai Li <aili@codeaurora.org>
Cc: Len Brown <len.brown@intel.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/cpuidle/cpuidle.c        | 31 +++++++++++++++++++++++++++++++
 drivers/cpuidle/governors/menu.c | 23 ++++++++++++++++-------
 include/linux/cpuidle.h          |  4 ++++
 3 files changed, 51 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index dbefe15bd582..a50710843378 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -74,6 +74,17 @@ static void cpuidle_idle_call(void)
 	 */
 	hrtimer_peek_ahead_timers();
 #endif
+
+	/*
+	 * Call the device's prepare function before calling the
+	 * governor's select function.  ->prepare gives the device's
+	 * cpuidle driver a chance to update any dynamic information
+	 * of its cpuidle states for the current idle period, e.g.
+	 * state availability, latencies, residencies, etc.
+	 */
+	if (dev->prepare)
+		dev->prepare(dev);
+
 	/* ask the governor for the next state */
 	next_state = cpuidle_curr_governor->select(dev);
 	if (need_resched()) {
@@ -282,6 +293,26 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
 
 	poll_idle_init(dev);
 
+	/*
+	 * cpuidle driver should set the dev->power_specified bit
+	 * before registering the device if the driver provides
+	 * power_usage numbers.
+	 *
+	 * For those devices whose ->power_specified is not set,
+	 * we fill in power_usage with decreasing values as the
+	 * cpuidle code has an implicit assumption that state Cn
+	 * uses less power than C(n-1).
+	 *
+	 * With CONFIG_ARCH_HAS_CPU_RELAX, C0 is already assigned
+	 * an power value of -1.  So we use -2, -3, etc, for other
+	 * c-states.
+	 */
+	if (!dev->power_specified) {
+		int i;
+		for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++)
+			dev->states[i].power_usage = -1 - i;
+	}
+
 	per_cpu(cpuidle_devices, dev->cpu) = dev;
 	list_add(&dev->device_list, &cpuidle_detected_devices);
 	if ((ret = cpuidle_add_sysfs(sys_dev))) {
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 1b128702d300..c2408bbe9c2e 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -234,6 +234,7 @@ static int menu_select(struct cpuidle_device *dev)
 {
 	struct menu_device *data = &__get_cpu_var(menu_devices);
 	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
+	unsigned int power_usage = -1;
 	int i;
 	int multiplier;
 
@@ -278,19 +279,27 @@ static int menu_select(struct cpuidle_device *dev)
 	if (data->expected_us > 5)
 		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
 
-
-	/* find the deepest idle state that satisfies our constraints */
+	/*
+	 * Find the idle state with the lowest power while satisfying
+	 * our constraints.
+	 */
 	for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) {
 		struct cpuidle_state *s = &dev->states[i];
 
+		if (s->flags & CPUIDLE_FLAG_IGNORE)
+			continue;
 		if (s->target_residency > data->predicted_us)
-			break;
+			continue;
 		if (s->exit_latency > latency_req)
-			break;
+			continue;
 		if (s->exit_latency * multiplier > data->predicted_us)
-			break;
-		data->exit_us = s->exit_latency;
-		data->last_state_idx = i;
+			continue;
+
+		if (s->power_usage < power_usage) {
+			power_usage = s->power_usage;
+			data->last_state_idx = i;
+			data->exit_us = s->exit_latency;
+		}
 	}
 
 	return data->last_state_idx;
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 55215cce5005..36ca9721a0c2 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -52,6 +52,7 @@ struct cpuidle_state {
 #define CPUIDLE_FLAG_SHALLOW	(0x20) /* low latency, minimal savings */
 #define CPUIDLE_FLAG_BALANCED	(0x40) /* medium latency, moderate savings */
 #define CPUIDLE_FLAG_DEEP	(0x80) /* high latency, large savings */
+#define CPUIDLE_FLAG_IGNORE	(0x100) /* ignore during this idle period */
 
 #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000)
 
@@ -84,6 +85,7 @@ struct cpuidle_state_kobj {
 struct cpuidle_device {
 	unsigned int		registered:1;
 	unsigned int		enabled:1;
+	unsigned int		power_specified:1;
 	unsigned int		cpu;
 
 	int			last_residency;
@@ -97,6 +99,8 @@ struct cpuidle_device {
 	struct completion	kobj_unregister;
 	void			*governor_data;
 	struct cpuidle_state	*safe_state;
+
+	int (*prepare)		(struct cpuidle_device *dev);
 };
 
 DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
-- 
cgit v1.2.3-59-g8ed1b


From ea6b101d8a3ea4e1dec29df31188c2f9852296fe Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 9 Aug 2010 17:20:18 -0700
Subject: include/linux/compiler-gcc.h: use __same_type() in __must_be_array()

We should use the __same_type() helper in __must_be_array().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 0da5b187f124..16508bcddacc 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -35,8 +35,7 @@
     (typeof(ptr)) (__ptr + (off)); })
 
 /* &a[0] degrades to a pointer: a different type from an array */
-#define __must_be_array(a) \
-  BUILD_BUG_ON_ZERO(__builtin_types_compatible_p(typeof(a), typeof(&a[0])))
+#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
 /*
  * Force always-inline if the user requests it so via the .config,
-- 
cgit v1.2.3-59-g8ed1b


From cf4ca4874fc45166198424384275f443a672d0b7 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 9 Aug 2010 17:20:20 -0700
Subject: kernel.h: remove unused NIPQUAD and NIPQUAD_FMT

There are no more uses of NIPQUAD or NIPQUAD_FMT.  Remove the definitions.

Signed-off-by: Joe Perches <joe@perches.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7d5b10ff63e0..5b57236dfbd0 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -615,17 +615,6 @@ ftrace_vprintk(const char *fmt, va_list ap)
 static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #endif /* CONFIG_TRACING */
 
-/*
- *      Display an IP address in readable format.
- */
-
-#define NIPQUAD(addr) \
-	((unsigned char *)&addr)[0], \
-	((unsigned char *)&addr)[1], \
-	((unsigned char *)&addr)[2], \
-	((unsigned char *)&addr)[3]
-#define NIPQUAD_FMT "%u.%u.%u.%u"
-
 /*
  * min()/max()/clamp() macros that also do
  * strict type-checking.. See the
-- 
cgit v1.2.3-59-g8ed1b


From e269b085175acf03fc687a7416b9fd84aa9c6c23 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Mon, 9 Aug 2010 17:20:23 -0700
Subject: iommu: inline iommu_num_pages

A profile of a network benchmark showed iommu_num_pages rather high up:

     0.52%  iommu_num_pages

Looking at the profile, an integer divide is taking almost all of the time:

      %
         :      c000000000376ea4 <.iommu_num_pages>:
    1.93 :      c000000000376ea4:       fb e1 ff f8     std     r31,-8(r1)
    0.00 :      c000000000376ea8:       f8 21 ff c1     stdu    r1,-64(r1)
    0.00 :      c000000000376eac:       7c 3f 0b 78     mr      r31,r1
    3.86 :      c000000000376eb0:       38 84 ff ff     addi    r4,r4,-1
    0.00 :      c000000000376eb4:       38 05 ff ff     addi    r0,r5,-1
    0.00 :      c000000000376eb8:       7c 84 2a 14     add     r4,r4,r5
   46.95 :      c000000000376ebc:       7c 00 18 38     and     r0,r0,r3
   45.66 :      c000000000376ec0:       7c 84 02 14     add     r4,r4,r0
    0.00 :      c000000000376ec4:       7c 64 2b 92     divdu   r3,r4,r5
    0.00 :      c000000000376ec8:       38 3f 00 40     addi    r1,r31,64
    0.00 :      c000000000376ecc:       eb e1 ff f8     ld      r31,-8(r1)
    1.61 :      c000000000376ed0:       4e 80 00 20     blr

Since every caller of iommu_num_pages passes in a constant power of two
we can inline this such that the divide is replaced by a shift. The
entire function is only a few instructions once optimised, so it is
a good candidate for inlining overall.

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/iommu-helper.h | 12 ++++++++++--
 lib/iommu-helper.c           |  9 ---------
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h
index 64d1b638745d..86bdeffe43ad 100644
--- a/include/linux/iommu-helper.h
+++ b/include/linux/iommu-helper.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_IOMMU_HELPER_H
 #define _LINUX_IOMMU_HELPER_H
 
+#include <linux/kernel.h>
+
 static inline unsigned long iommu_device_max_index(unsigned long size,
 						   unsigned long offset,
 						   u64 dma_mask)
@@ -20,7 +22,13 @@ extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
 				      unsigned long boundary_size,
 				      unsigned long align_mask);
 
-extern unsigned long iommu_num_pages(unsigned long addr, unsigned long len,
-				     unsigned long io_page_size);
+static inline unsigned long iommu_num_pages(unsigned long addr,
+					    unsigned long len,
+					    unsigned long io_page_size)
+{
+	unsigned long size = (addr & (io_page_size - 1)) + len;
+
+	return DIV_ROUND_UP(size, io_page_size);
+}
 
 #endif
diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index c0251f4ad08b..da053313ee5c 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -38,12 +38,3 @@ again:
 	return -1;
 }
 EXPORT_SYMBOL(iommu_area_alloc);
-
-unsigned long iommu_num_pages(unsigned long addr, unsigned long len,
-			      unsigned long io_page_size)
-{
-	unsigned long size = (addr & (io_page_size - 1)) + len;
-
-	return DIV_ROUND_UP(size, io_page_size);
-}
-EXPORT_SYMBOL(iommu_num_pages);
-- 
cgit v1.2.3-59-g8ed1b


From ea98eed9bcb62d1319db8b1210712c6a110a886c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 9 Aug 2010 17:20:56 -0700
Subject: flex_array: add helpers to get and put to make pointers easy to use

Getting and putting arrays of pointers with flex arrays is a PITA.  You
have to remember to pass &ptr to the _put and you have to do weird and
wacky casting to get the ptr back from the _get.  Add two functions
flex_array_get_ptr() and flex_array_put_ptr() to handle all of the magic.

[akpm@linux-foundation.org: simplification suggested by Joe]
Signed-off-by: Eric Paris <eparis@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Joe Perches <joe@perches.com>
Cc: James Morris <jmorris@namei.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/flex_array.h |  5 +++++
 lib/flex_array.c           | 25 ++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index 1d747f72298b..631b77f2ac70 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -70,4 +70,9 @@ int flex_array_clear(struct flex_array *fa, unsigned int element_nr);
 void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
 int flex_array_shrink(struct flex_array *fa);
 
+#define flex_array_put_ptr(fa, nr, src, gfp) \
+	flex_array_put(fa, nr, &(void *)(src), gfp)
+
+void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr);
+
 #endif /* _FLEX_ARRAY_H */
diff --git a/lib/flex_array.c b/lib/flex_array.c
index 41b1804fa728..77a6fea7481e 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -171,6 +171,8 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags)
  * Note that this *copies* the contents of @src into
  * the array.  If you are trying to store an array of
  * pointers, make sure to pass in &ptr instead of ptr.
+ * You may instead wish to use the flex_array_put_ptr()
+ * helper function.
  *
  * Locking must be provided by the caller.
  */
@@ -265,7 +267,8 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start,
  *
  * Returns a pointer to the data at index @element_nr.  Note
  * that this is a copy of the data that was passed in.  If you
- * are using this to store pointers, you'll get back &ptr.
+ * are using this to store pointers, you'll get back &ptr.  You
+ * may instead wish to use the flex_array_get_ptr helper.
  *
  * Locking must be provided by the caller.
  */
@@ -286,6 +289,26 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr)
 	return &part->elements[index_inside_part(fa, element_nr)];
 }
 
+/**
+ * flex_array_get_ptr - pull a ptr back out of the array
+ * @fa:		the flex array from which to extract data
+ * @element_nr:	index of the element to fetch from the array
+ *
+ * Returns the pointer placed in the flex array at element_nr using
+ * flex_array_put_ptr().  This function should not be called if the
+ * element in question was not set using the _put_ptr() helper.
+ */
+void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr)
+{
+	void **tmp;
+
+	tmp = flex_array_get(fa, element_nr);
+	if (!tmp)
+		return NULL;
+
+	return *tmp;
+}
+
 static int part_is_free(struct flex_array_part *part)
 {
 	int i;
-- 
cgit v1.2.3-59-g8ed1b


From 7ed24e8da75615418cbf3417e421053e53a5f5b3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 9 Aug 2010 13:40:03 +0000
Subject: etherdevice.h: fix kernel-doc typo

Fix etherdevice.h parameter name typo in kernel-doc:

Warning(include/linux/etherdevice.h:138): No description found for parameter 'hwaddr'
Warning(include/linux/etherdevice.h:138): Excess function parameter 'addr' description in 'dev_hw_addr_random'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 848480bc2bf9..2308fbb4523a 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -129,7 +129,7 @@ static inline void random_ether_addr(u8 *addr)
 /**
  * dev_hw_addr_random - Create random MAC and set device flag
  * @dev: pointer to net_device structure
- * @addr: Pointer to a six-byte array containing the Ethernet address
+ * @hwaddr: Pointer to a six-byte array containing the Ethernet address
  *
  * Generate random MAC to be used by a device and set addr_assign_type
  * so the state can be read by sysfs and be used by udev.
-- 
cgit v1.2.3-59-g8ed1b


From de75d60d5ea235e6e09f4962ab22541ce0fe176a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 10 Aug 2010 12:14:27 -0400
Subject: block: make sure that REQ_* types are seen even with CONFIG_BLOCK=n

These form the basis of the basic WRITE etc primitives, so we
need them to be always visible. Otherwise we see errors like:

	mm/filemap.c:2164: error: 'REQ_WRITE' undeclared
	fs/read_write.c:362: error: 'REQ_WRITE' undeclared
	fs/splice.c:1108: error: 'REQ_WRITE' undeclared
	fs/aio.c:1496: error: 'REQ_WRITE' undeclared

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blk_types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 118523734af0..53691774d34e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -108,6 +108,8 @@ struct bio {
 #define BIO_POOL_MASK		(1UL << BIO_POOL_OFFSET)
 #define BIO_POOL_IDX(bio)	((bio)->bi_flags >> BIO_POOL_OFFSET)
 
+#endif /* CONFIG_BLOCK */
+
 /*
  * Request flags.  For use in the cmd_flags field of struct request, and in
  * bi_rw of struct bio.  Note that some flags are only valid in either one.
@@ -189,5 +191,4 @@ enum rq_flag_bits {
 #define REQ_IO_STAT		(1 << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
 
-#endif /* CONFIG_BLOCK */
 #endif /* __LINUX_BLK_TYPES_H */
-- 
cgit v1.2.3-59-g8ed1b


From 26df6d13406d1a53b0bda08bd712f1924affd7cd Mon Sep 17 00:00:00 2001
From: "hyc@symas.com" <hyc@symas.com>
Date: Tue, 22 Jun 2010 10:14:49 -0700
Subject: tty: Add EXTPROC support for LINEMODE

This patch is against the 2.6.34 source.

Paraphrased from the 1989 BSD patch by David Borman @ cray.com:

     These are the changes needed for the kernel to support
     LINEMODE in the server.

     There is a new bit in the termios local flag word, EXTPROC.
     When this bit is set, several aspects of the terminal driver
     are disabled.  Input line editing, character echo, and mapping
     of signals are all disabled.  This allows the telnetd to turn
     off these functions when in linemode, but still keep track of
     what state the user wants the terminal to be in.

     New ioctl:
         TIOCSIG         Generate a signal to processes in the
                         current process group of the pty.

     There is a new mode for packet driver, the TIOCPKT_IOCTL bit.
     When packet mode is turned on in the pty, and the EXTPROC bit
     is set, then whenever the state of the pty is changed, the
     next read on the master side of the pty will have the TIOCPKT_IOCTL
     bit set.  This allows the process on the server side of the pty
     to know when the state of the terminal has changed; it can then
     issue the appropriate ioctl to retrieve the new state.

Since the original BSD patches accompanied the source code for telnet
I've left that reference here, but obviously the feature is useful for
any remote terminal protocol, including ssh.

The corresponding feature has existed in the BSD tty driver since 1989.
For historical reference, a good copy of the relevant files can be found
here:

http://anonsvn.mit.edu/viewvc/krb5/trunk/src/appl/telnet/?pathrev=17741

Signed-off-by: Howard Chu <hyc@symas.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/alpha/include/asm/ioctls.h     |  2 ++
 arch/alpha/include/asm/termbits.h   |  1 +
 arch/arm/include/asm/ioctls.h       |  2 ++
 arch/arm/include/asm/termbits.h     |  1 +
 arch/avr32/include/asm/ioctls.h     |  2 ++
 arch/avr32/include/asm/termbits.h   |  1 +
 arch/cris/include/asm/ioctls.h      |  2 ++
 arch/cris/include/asm/termbits.h    |  1 +
 arch/frv/include/asm/ioctls.h       |  2 ++
 arch/frv/include/asm/termbits.h     |  1 +
 arch/h8300/include/asm/ioctls.h     |  2 ++
 arch/h8300/include/asm/termbits.h   |  1 +
 arch/ia64/include/asm/ioctls.h      |  2 ++
 arch/ia64/include/asm/termbits.h    |  1 +
 arch/m32r/include/asm/ioctls.h      |  2 ++
 arch/m32r/include/asm/termbits.h    |  1 +
 arch/m68k/include/asm/ioctls.h      |  2 ++
 arch/m68k/include/asm/termbits.h    |  1 +
 arch/mips/include/asm/ioctls.h      |  3 ++-
 arch/mips/include/asm/termbits.h    |  1 +
 arch/mn10300/include/asm/ioctls.h   |  2 ++
 arch/mn10300/include/asm/termbits.h |  1 +
 arch/parisc/include/asm/ioctls.h    |  2 ++
 arch/parisc/include/asm/termbits.h  |  1 +
 arch/powerpc/include/asm/ioctls.h   |  2 ++
 arch/powerpc/include/asm/termbits.h |  1 +
 arch/s390/include/asm/ioctls.h      |  2 ++
 arch/sh/include/asm/ioctls.h        |  2 ++
 arch/sparc/include/asm/ioctls.h     |  2 ++
 arch/sparc/include/asm/termbits.h   |  1 +
 arch/xtensa/include/asm/ioctls.h    |  2 ++
 arch/xtensa/include/asm/termbits.h  |  1 +
 drivers/char/n_tty.c                | 17 ++++++++++++++---
 drivers/char/pty.c                  | 21 +++++++++++++++++++++
 drivers/char/tty_ioctl.c            | 18 ++++++++++++------
 fs/compat_ioctl.c                   |  1 +
 include/asm-generic/ioctls.h        |  2 ++
 include/asm-generic/termbits.h      |  1 +
 include/linux/tty.h                 |  1 +
 39 files changed, 101 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/ioctls.h b/arch/alpha/include/asm/ioctls.h
index c7b0cc61ce5b..59617c3c2be6 100644
--- a/arch/alpha/include/asm/ioctls.h
+++ b/arch/alpha/include/asm/ioctls.h
@@ -80,6 +80,7 @@
 # define TIOCPKT_START		 8
 # define TIOCPKT_NOSTOP		16
 # define TIOCPKT_DOSTOP		32
+# define TIOCPKT_IOCTL		64
 
 
 #define TIOCNOTTY	0x5422
@@ -91,6 +92,7 @@
 #define TIOCGSID	0x5429  /* Return the session ID of FD */
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define TIOCSERCONFIG	0x5453
 #define TIOCSERGWILD	0x5454
diff --git a/arch/alpha/include/asm/termbits.h b/arch/alpha/include/asm/termbits.h
index ad854a4a3af6..879dd3589921 100644
--- a/arch/alpha/include/asm/termbits.h
+++ b/arch/alpha/include/asm/termbits.h
@@ -180,6 +180,7 @@ struct ktermios {
 #define FLUSHO	0x00800000
 #define PENDIN	0x20000000
 #define IEXTEN	0x00000400
+#define EXTPROC	0x10000000
 
 /* Values for the ACTION argument to `tcflow'.  */
 #define	TCOOFF		0
diff --git a/arch/arm/include/asm/ioctls.h b/arch/arm/include/asm/ioctls.h
index 7f0b6d13296a..0b30894b5482 100644
--- a/arch/arm/include/asm/ioctls.h
+++ b/arch/arm/include/asm/ioctls.h
@@ -52,6 +52,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define TIOCGRS485      0x542E
 #define TIOCSRS485      0x542F
@@ -81,6 +82,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT	0x01	/* Transmitter physically empty */
 
diff --git a/arch/arm/include/asm/termbits.h b/arch/arm/include/asm/termbits.h
index f784d11f40b5..704135d28d1d 100644
--- a/arch/arm/include/asm/termbits.h
+++ b/arch/arm/include/asm/termbits.h
@@ -177,6 +177,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 #define	TCOOFF		0
diff --git a/arch/avr32/include/asm/ioctls.h b/arch/avr32/include/asm/ioctls.h
index 143dafb3997e..b7dd324b46a9 100644
--- a/arch/avr32/include/asm/ioctls.h
+++ b/arch/avr32/include/asm/ioctls.h
@@ -53,6 +53,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define TIOCGRS485      0x542E
 #define TIOCSRS485      0x542F
@@ -82,6 +83,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/avr32/include/asm/termbits.h b/arch/avr32/include/asm/termbits.h
index db2daab31fdb..366adc5ebb10 100644
--- a/arch/avr32/include/asm/termbits.h
+++ b/arch/avr32/include/asm/termbits.h
@@ -175,6 +175,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 #define	TCOOFF		0
diff --git a/arch/cris/include/asm/ioctls.h b/arch/cris/include/asm/ioctls.h
index bb49142dc6ab..c9129ed37443 100644
--- a/arch/cris/include/asm/ioctls.h
+++ b/arch/cris/include/asm/ioctls.h
@@ -54,6 +54,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450  /* these numbers need to be adjusted. */
 #define FIOCLEX		0x5451
@@ -85,6 +86,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/cris/include/asm/termbits.h b/arch/cris/include/asm/termbits.h
index 66e1a7492a0c..1c43bc874ccf 100644
--- a/arch/cris/include/asm/termbits.h
+++ b/arch/cris/include/asm/termbits.h
@@ -214,6 +214,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 #define	TCOOFF		0
diff --git a/arch/frv/include/asm/ioctls.h b/arch/frv/include/asm/ioctls.h
index d0c30e31fbda..a993e3759ccf 100644
--- a/arch/frv/include/asm/ioctls.h
+++ b/arch/frv/include/asm/ioctls.h
@@ -53,6 +53,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450  /* these numbers need to be adjusted. */
 #define FIOCLEX		0x5451
@@ -79,6 +80,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/frv/include/asm/termbits.h b/arch/frv/include/asm/termbits.h
index 5568492b5086..7722e19cc349 100644
--- a/arch/frv/include/asm/termbits.h
+++ b/arch/frv/include/asm/termbits.h
@@ -180,6 +180,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 
 /* tcflow() and TCXONC use these */
diff --git a/arch/h8300/include/asm/ioctls.h b/arch/h8300/include/asm/ioctls.h
index 98a53d067269..b6b249f9f308 100644
--- a/arch/h8300/include/asm/ioctls.h
+++ b/arch/h8300/include/asm/ioctls.h
@@ -53,6 +53,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450  /* these numbers need to be adjusted. */
 #define FIOCLEX		0x5451
@@ -79,6 +80,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/h8300/include/asm/termbits.h b/arch/h8300/include/asm/termbits.h
index 31eca81db3f7..3287a6244d74 100644
--- a/arch/h8300/include/asm/termbits.h
+++ b/arch/h8300/include/asm/termbits.h
@@ -179,6 +179,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 
 /* tcflow() and TCXONC use these */
diff --git a/arch/ia64/include/asm/ioctls.h b/arch/ia64/include/asm/ioctls.h
index f0ee86c0b5f7..b79c385114ef 100644
--- a/arch/ia64/include/asm/ioctls.h
+++ b/arch/ia64/include/asm/ioctls.h
@@ -59,6 +59,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450  /* these numbers need to be adjusted. */
 #define FIOCLEX		0x5451
@@ -85,6 +86,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/ia64/include/asm/termbits.h b/arch/ia64/include/asm/termbits.h
index 9f162e0089ad..c009b94e58d9 100644
--- a/arch/ia64/include/asm/termbits.h
+++ b/arch/ia64/include/asm/termbits.h
@@ -187,6 +187,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 #define	TCOOFF		0
diff --git a/arch/m32r/include/asm/ioctls.h b/arch/m32r/include/asm/ioctls.h
index f4c13a52fe48..66288063a4c0 100644
--- a/arch/m32r/include/asm/ioctls.h
+++ b/arch/m32r/include/asm/ioctls.h
@@ -53,6 +53,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450
 #define FIOCLEX		0x5451
@@ -79,6 +80,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/m32r/include/asm/termbits.h b/arch/m32r/include/asm/termbits.h
index bc104008b55b..957a3c688549 100644
--- a/arch/m32r/include/asm/termbits.h
+++ b/arch/m32r/include/asm/termbits.h
@@ -179,6 +179,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 #define	TCOOFF		0
diff --git a/arch/m68k/include/asm/ioctls.h b/arch/m68k/include/asm/ioctls.h
index b8d2f4be7fd7..91a57d665460 100644
--- a/arch/m68k/include/asm/ioctls.h
+++ b/arch/m68k/include/asm/ioctls.h
@@ -52,6 +52,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450  /* these numbers need to be adjusted. */
 #define FIOCLEX		0x5451
@@ -78,6 +79,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/m68k/include/asm/termbits.h b/arch/m68k/include/asm/termbits.h
index 8c14170996bb..aea1e37b765a 100644
--- a/arch/m68k/include/asm/termbits.h
+++ b/arch/m68k/include/asm/termbits.h
@@ -179,6 +179,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 
 /* tcflow() and TCXONC use these */
diff --git a/arch/mips/include/asm/ioctls.h b/arch/mips/include/asm/ioctls.h
index 2fb9e1693bf7..d87cb0465693 100644
--- a/arch/mips/include/asm/ioctls.h
+++ b/arch/mips/include/asm/ioctls.h
@@ -41,7 +41,7 @@
 #define	 TIOCPKT_START		0x08	/* start output */
 #define	 TIOCPKT_NOSTOP		0x10	/* no more ^S, ^Q */
 #define	 TIOCPKT_DOSTOP		0x20	/* now do ^S ^Q */
-/* #define  TIOCPKT_IOCTL		0x40	state change of pty driver */
+#define  TIOCPKT_IOCTL		0x40	/* state change of pty driver */
 #define TIOCSWINSZ	_IOW('t', 103, struct winsize)	/* set window size */
 #define TIOCGWINSZ	_IOR('t', 104, struct winsize)	/* get window size */
 #define TIOCNOTTY	0x5471		/* void tty association */
@@ -83,6 +83,7 @@
 #define TCSETSF2	_IOW('T', 0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T', 0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T', 0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T', 0x36, int)  /* Generate signal on Pty slave */
 
 /* I hope the range from 0x5480 on is free ... */
 #define TIOCSCTTY	0x5480		/* become controlling tty */
diff --git a/arch/mips/include/asm/termbits.h b/arch/mips/include/asm/termbits.h
index c83c68444e86..76630b396fac 100644
--- a/arch/mips/include/asm/termbits.h
+++ b/arch/mips/include/asm/termbits.h
@@ -203,6 +203,7 @@ struct ktermios {
 #define PENDIN	0040000		/* Retype pending input (state).  */
 #define TOSTOP	0100000		/* Send SIGTTOU for background output.  */
 #define ITOSTOP	TOSTOP
+#define EXTPROC	0200000		/* External processing on pty */
 
 /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
diff --git a/arch/mn10300/include/asm/ioctls.h b/arch/mn10300/include/asm/ioctls.h
index 638219a99b1e..cb8cf1902234 100644
--- a/arch/mn10300/include/asm/ioctls.h
+++ b/arch/mn10300/include/asm/ioctls.h
@@ -54,6 +54,7 @@
 #define TIOCGPTN	_IOR('T', 0x30, unsigned int) /* Get Pty Number
 						       * (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T', 0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T', 0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450
 #define FIOCLEX		0x5451
@@ -80,6 +81,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/mn10300/include/asm/termbits.h b/arch/mn10300/include/asm/termbits.h
index eb2b0dc1f696..130d42495972 100644
--- a/arch/mn10300/include/asm/termbits.h
+++ b/arch/mn10300/include/asm/termbits.h
@@ -180,6 +180,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 #define	TCOOFF		0
diff --git a/arch/parisc/include/asm/ioctls.h b/arch/parisc/include/asm/ioctls.h
index 6cc497e52532..4e0614456bea 100644
--- a/arch/parisc/include/asm/ioctls.h
+++ b/arch/parisc/include/asm/ioctls.h
@@ -52,6 +52,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450  /* these numbers need to be adjusted. */
 #define FIOCLEX		0x5451
@@ -82,6 +83,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/parisc/include/asm/termbits.h b/arch/parisc/include/asm/termbits.h
index d8bbc73b16b7..d1ab92177a5c 100644
--- a/arch/parisc/include/asm/termbits.h
+++ b/arch/parisc/include/asm/termbits.h
@@ -180,6 +180,7 @@ struct ktermios {
 #define FLUSHO  0010000
 #define PENDIN  0040000
 #define IEXTEN  0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 #define	TCOOFF		0
diff --git a/arch/powerpc/include/asm/ioctls.h b/arch/powerpc/include/asm/ioctls.h
index 1842186d872c..851920052e08 100644
--- a/arch/powerpc/include/asm/ioctls.h
+++ b/arch/powerpc/include/asm/ioctls.h
@@ -80,6 +80,7 @@
 # define TIOCPKT_START		 8
 # define TIOCPKT_NOSTOP		16
 # define TIOCPKT_DOSTOP		32
+# define TIOCPKT_IOCTL		64
 
 
 #define TIOCNOTTY	0x5422
@@ -93,6 +94,7 @@
 #define TIOCSRS485	0x542f
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define TIOCSERCONFIG	0x5453
 #define TIOCSERGWILD	0x5454
diff --git a/arch/powerpc/include/asm/termbits.h b/arch/powerpc/include/asm/termbits.h
index 6698188ca550..549d700e18f2 100644
--- a/arch/powerpc/include/asm/termbits.h
+++ b/arch/powerpc/include/asm/termbits.h
@@ -189,6 +189,7 @@ struct ktermios {
 #define FLUSHO	0x00800000
 #define PENDIN	0x20000000
 #define IEXTEN	0x00000400
+#define EXTPROC	0x10000000
 
 /* Values for the ACTION argument to `tcflow'.  */
 #define	TCOOFF		0
diff --git a/arch/s390/include/asm/ioctls.h b/arch/s390/include/asm/ioctls.h
index 40e481b1b461..2f3d8736361f 100644
--- a/arch/s390/include/asm/ioctls.h
+++ b/arch/s390/include/asm/ioctls.h
@@ -60,6 +60,7 @@
 #define TCSETSF2	_IOW('T',0x2D, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define FIONCLEX	0x5450  /* these numbers need to be adjusted. */
 #define FIOCLEX		0x5451
@@ -86,6 +87,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
 
diff --git a/arch/sh/include/asm/ioctls.h b/arch/sh/include/asm/ioctls.h
index c212c371a4a5..eb6c4c687972 100644
--- a/arch/sh/include/asm/ioctls.h
+++ b/arch/sh/include/asm/ioctls.h
@@ -69,6 +69,7 @@
 # define TIOCPKT_START		 8
 # define TIOCPKT_NOSTOP		16
 # define TIOCPKT_DOSTOP		32
+# define TIOCPKT_IOCTL		64
 
 
 #define TIOCNOTTY	_IO('T', 34) /* 0x5422 */
@@ -84,6 +85,7 @@
 #define TCSETSF2	_IOW('T', 45, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define TIOCSERCONFIG	_IO('T', 83) /* 0x5453 */
 #define TIOCSERGWILD	_IOR('T', 84,  int) /* 0x5454 */
diff --git a/arch/sparc/include/asm/ioctls.h b/arch/sparc/include/asm/ioctls.h
index 1fe6855c5c18..53f4ee009bdd 100644
--- a/arch/sparc/include/asm/ioctls.h
+++ b/arch/sparc/include/asm/ioctls.h
@@ -80,6 +80,7 @@
 /* Get minor device of a pty master's FD -- Solaris equiv is ISPTM */
 #define TIOCGPTN	_IOR('t', 134, unsigned int) /* Get Pty Number */
 #define TIOCSPTLCK	_IOW('t', 135, int) /* Lock/unlock PTY */
+#define TIOCSIG		_IOW('t', 136, int) /* Generate signal on Pty slave */
 
 /* Little f */
 #define FIOCLEX		_IO('f', 1)
@@ -132,5 +133,6 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #endif /* !(_ASM_SPARC_IOCTLS_H) */
diff --git a/arch/sparc/include/asm/termbits.h b/arch/sparc/include/asm/termbits.h
index d72dfed1f9d7..23b10ff08df2 100644
--- a/arch/sparc/include/asm/termbits.h
+++ b/arch/sparc/include/asm/termbits.h
@@ -225,6 +225,7 @@ struct ktermios {
 #define FLUSHO	0x00002000
 #define PENDIN	0x00004000
 #define IEXTEN	0x00008000
+#define EXTPROC	0x00010000
 
 /* modem lines */
 #define TIOCM_LE	0x001
diff --git a/arch/xtensa/include/asm/ioctls.h b/arch/xtensa/include/asm/ioctls.h
index 0ffa942954b9..ab1800012ed9 100644
--- a/arch/xtensa/include/asm/ioctls.h
+++ b/arch/xtensa/include/asm/ioctls.h
@@ -81,6 +81,7 @@
 # define TIOCPKT_START		 8
 # define TIOCPKT_NOSTOP		16
 # define TIOCPKT_DOSTOP		32
+# define TIOCPKT_IOCTL		64
 
 
 #define TIOCNOTTY	_IO('T', 34)
@@ -97,6 +98,7 @@
 #define TCSETSF2	_IOW('T', 45, struct termios2)
 #define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
 #define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
 
 #define TIOCSERCONFIG	_IO('T', 83)
 #define TIOCSERGWILD	_IOR('T', 84,  int)
diff --git a/arch/xtensa/include/asm/termbits.h b/arch/xtensa/include/asm/termbits.h
index 85aa6a3c0b6e..0d6c8715b24f 100644
--- a/arch/xtensa/include/asm/termbits.h
+++ b/arch/xtensa/include/asm/termbits.h
@@ -196,6 +196,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index bdae8327143c..428f4fe0b5f7 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -1102,6 +1102,11 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 	if (I_IUCLC(tty) && L_IEXTEN(tty))
 		c = tolower(c);
 
+	if (L_EXTPROC(tty)) {
+		put_tty_queue(c, tty);
+		return;
+	}
+
 	if (tty->stopped && !tty->flow_stopped && I_IXON(tty) &&
 	    I_IXANY(tty) && c != START_CHAR(tty) && c != STOP_CHAR(tty) &&
 	    c != INTR_CHAR(tty) && c != QUIT_CHAR(tty) && c != SUSP_CHAR(tty)) {
@@ -1409,7 +1414,8 @@ static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 
 	n_tty_set_room(tty);
 
-	if (!tty->icanon && (tty->read_cnt >= tty->minimum_to_wake)) {
+	if ((!tty->icanon && (tty->read_cnt >= tty->minimum_to_wake)) ||
+		L_EXTPROC(tty)) {
 		kill_fasync(&tty->fasync, SIGIO, POLL_IN);
 		if (waitqueue_active(&tty->read_wait))
 			wake_up_interruptible(&tty->read_wait);
@@ -1585,7 +1591,7 @@ static int n_tty_open(struct tty_struct *tty)
 static inline int input_available_p(struct tty_struct *tty, int amt)
 {
 	tty_flush_to_ldisc(tty);
-	if (tty->icanon) {
+	if (tty->icanon && !L_EXTPROC(tty)) {
 		if (tty->canon_data)
 			return 1;
 	} else if (tty->read_cnt >= (amt ? amt : 1))
@@ -1632,6 +1638,11 @@ static int copy_from_read_buf(struct tty_struct *tty,
 		spin_lock_irqsave(&tty->read_lock, flags);
 		tty->read_tail = (tty->read_tail + n) & (N_TTY_BUF_SIZE-1);
 		tty->read_cnt -= n;
+		/* Turn single EOF into zero-length read */
+		if (L_EXTPROC(tty) && tty->icanon && n == 1) {
+			if (!tty->read_cnt && (*b)[n-1] == EOF_CHAR(tty))
+				n--;
+		}
 		spin_unlock_irqrestore(&tty->read_lock, flags);
 		*b += n;
 		*nr -= n;
@@ -1812,7 +1823,7 @@ do_it_again:
 			nr--;
 		}
 
-		if (tty->icanon) {
+		if (tty->icanon && !L_EXTPROC(tty)) {
 			/* N.B. avoid overrun if nr == 0 */
 			while (nr && tty->read_cnt) {
 				int eol;
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index d83a43130df4..b640ef29be1c 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -171,6 +171,23 @@ static int pty_set_lock(struct tty_struct *tty, int __user *arg)
 	return 0;
 }
 
+/* Send a signal to the slave */
+static int pty_signal(struct tty_struct *tty, int sig)
+{
+	unsigned long flags;
+	struct pid *pgrp;
+
+	if (tty->link) {
+		spin_lock_irqsave(&tty->link->ctrl_lock, flags);
+		pgrp = get_pid(tty->link->pgrp);
+		spin_unlock_irqrestore(&tty->link->ctrl_lock, flags);
+
+		kill_pgrp(pgrp, sig, 1);
+		put_pid(pgrp);
+	}
+	return 0;
+}
+
 static void pty_flush_buffer(struct tty_struct *tty)
 {
 	struct tty_struct *to = tty->link;
@@ -321,6 +338,8 @@ static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file,
 	switch (cmd) {
 	case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */
 		return pty_set_lock(tty, (int __user *) arg);
+	case TIOCSIG:    /* Send signal to other side of pty */
+		return pty_signal(tty, (int) arg);
 	}
 	return -ENOIOCTLCMD;
 }
@@ -476,6 +495,8 @@ static int pty_unix98_ioctl(struct tty_struct *tty, struct file *file,
 		return pty_set_lock(tty, (int __user *)arg);
 	case TIOCGPTN: /* Get PT Number */
 		return put_user(tty->index, (unsigned int __user *)arg);
+	case TIOCSIG:    /* Send signal to other side of pty */
+		return pty_signal(tty, (int) arg);
 	}
 
 	return -ENOIOCTLCMD;
diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index 6bd5f8866c74..0c1889971459 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -517,19 +517,25 @@ static void change_termios(struct tty_struct *tty, struct ktermios *new_termios)
 
 	/* See if packet mode change of state. */
 	if (tty->link && tty->link->packet) {
+		int extproc = (old_termios.c_lflag & EXTPROC) |
+				(tty->termios->c_lflag & EXTPROC);
 		int old_flow = ((old_termios.c_iflag & IXON) &&
 				(old_termios.c_cc[VSTOP] == '\023') &&
 				(old_termios.c_cc[VSTART] == '\021'));
 		int new_flow = (I_IXON(tty) &&
 				STOP_CHAR(tty) == '\023' &&
 				START_CHAR(tty) == '\021');
-		if (old_flow != new_flow) {
+		if ((old_flow != new_flow) || extproc) {
 			spin_lock_irqsave(&tty->ctrl_lock, flags);
-			tty->ctrl_status &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP);
-			if (new_flow)
-				tty->ctrl_status |= TIOCPKT_DOSTOP;
-			else
-				tty->ctrl_status |= TIOCPKT_NOSTOP;
+			if (old_flow != new_flow) {
+				tty->ctrl_status &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP);
+				if (new_flow)
+					tty->ctrl_status |= TIOCPKT_DOSTOP;
+				else
+					tty->ctrl_status |= TIOCPKT_NOSTOP;
+			}
+			if (extproc)
+				tty->ctrl_status |= TIOCPKT_IOCTL;
 			spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 			wake_up_interruptible(&tty->link->read_wait);
 		}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 63ae85831464..fa4bc48810fd 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -969,6 +969,7 @@ COMPATIBLE_IOCTL(TIOCGPGRP)
 COMPATIBLE_IOCTL(TIOCGPTN)
 COMPATIBLE_IOCTL(TIOCSPTLCK)
 COMPATIBLE_IOCTL(TIOCSERGETLSR)
+COMPATIBLE_IOCTL(TIOCSIG)
 #ifdef TCGETS2
 COMPATIBLE_IOCTL(TCGETS2)
 COMPATIBLE_IOCTL(TCSETS2)
diff --git a/include/asm-generic/ioctls.h b/include/asm-generic/ioctls.h
index 1375fa1a7a4d..8554cb6a81b9 100644
--- a/include/asm-generic/ioctls.h
+++ b/include/asm-generic/ioctls.h
@@ -69,6 +69,7 @@
 #define TCSETX		0x5433
 #define TCSETXF		0x5434
 #define TCSETXW		0x5435
+#define TIOCSIG		_IOW('T', 0x36, int)  /* pty: generate signal */
 
 #define FIONCLEX	0x5450
 #define FIOCLEX		0x5451
@@ -102,6 +103,7 @@
 #define TIOCPKT_START		 8
 #define TIOCPKT_NOSTOP		16
 #define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
 
 #define TIOCSER_TEMT	0x01	/* Transmitter physically empty */
 
diff --git a/include/asm-generic/termbits.h b/include/asm-generic/termbits.h
index 1c9773d48cb0..232b4781aef3 100644
--- a/include/asm-generic/termbits.h
+++ b/include/asm-generic/termbits.h
@@ -178,6 +178,7 @@ struct ktermios {
 #define FLUSHO	0010000
 #define PENDIN	0040000
 #define IEXTEN	0100000
+#define EXTPROC	0200000
 
 /* tcflow() and TCXONC use these */
 #define	TCOOFF		0
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 7802a243ee13..2df60e4ff40e 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -179,6 +179,7 @@ struct tty_bufhead {
 #define L_FLUSHO(tty)	_L_FLAG((tty), FLUSHO)
 #define L_PENDIN(tty)	_L_FLAG((tty), PENDIN)
 #define L_IEXTEN(tty)	_L_FLAG((tty), IEXTEN)
+#define L_EXTPROC(tty)	_L_FLAG((tty), EXTPROC)
 
 struct device;
 struct signal_struct;
-- 
cgit v1.2.3-59-g8ed1b


From 8fd4bd22350784d5b2fe9274f6790ba353976415 Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Wed, 23 Jun 2010 12:56:12 -0700
Subject: vt/console: try harder to print output when panicing

Jesse's initial patch commit said:

"At panic time (i.e.  when oops_in_progress is set) we should try a bit
harder to update the screen and make sure output gets to the VT, since
some drivers are capable of flipping back to it.

So make sure we try to unblank and update the display if called from a
panic context."

I've enhanced this to add a flag to the vc that console layer can set to
indicate they want this behaviour to occur.  This also adds support to
fbcon for that flag and adds an fb flag for drivers to indicate they want
to use the support.  It enables this for KMS drivers.

Signed-off-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Acked-by: James Simmons <jsimmons@infradead.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/vt.c                       | 13 +++++++++----
 drivers/gpu/drm/i915/intel_fb.c         |  4 +---
 drivers/gpu/drm/nouveau/nouveau_fbcon.c |  1 +
 drivers/gpu/drm/radeon/radeon_fb.c      |  2 +-
 drivers/video/console/fbcon.c           |  4 +++-
 include/linux/console_struct.h          |  1 +
 include/linux/fb.h                      |  4 ++++
 include/linux/vt_kern.h                 |  7 +++++++
 8 files changed, 27 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 34bfb056d7a6..82f64ac21191 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -705,7 +705,10 @@ void redraw_screen(struct vc_data *vc, int is_switch)
 			update_attr(vc);
 			clear_buffer_attributes(vc);
 		}
-		if (update && vc->vc_mode != KD_GRAPHICS)
+
+		/* Forcibly update if we're panicing */
+		if ((update && vc->vc_mode != KD_GRAPHICS) ||
+		    vt_force_oops_output(vc))
 			do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2);
 	}
 	set_cursor(vc);
@@ -743,6 +746,7 @@ static void visual_init(struct vc_data *vc, int num, int init)
 	vc->vc_hi_font_mask = 0;
 	vc->vc_complement_mask = 0;
 	vc->vc_can_do_color = 0;
+	vc->vc_panic_force_write = false;
 	vc->vc_sw->con_init(vc, init);
 	if (!vc->vc_complement_mask)
 		vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800;
@@ -2506,7 +2510,7 @@ static void vt_console_print(struct console *co, const char *b, unsigned count)
 		goto quit;
 	}
 
-	if (vc->vc_mode != KD_TEXT)
+	if (vc->vc_mode != KD_TEXT && !vt_force_oops_output(vc))
 		goto quit;
 
 	/* undraw cursor first */
@@ -3784,7 +3788,8 @@ void do_unblank_screen(int leaving_gfx)
 		return;
 	}
 	vc = vc_cons[fg_console].d;
-	if (vc->vc_mode != KD_TEXT)
+	/* Try to unblank in oops case too */
+	if (vc->vc_mode != KD_TEXT && !vt_force_oops_output(vc))
 		return; /* but leave console_blanked != 0 */
 
 	if (blankinterval) {
@@ -3793,7 +3798,7 @@ void do_unblank_screen(int leaving_gfx)
 	}
 
 	console_blanked = 0;
-	if (vc->vc_sw->con_blank(vc, 0, leaving_gfx))
+	if (vc->vc_sw->con_blank(vc, 0, leaving_gfx) || vt_force_oops_output(vc))
 		/* Low-level driver cannot restore -> do it ourselves */
 		update_screen(vc);
 	if (console_blank_hook)
diff --git a/drivers/gpu/drm/i915/intel_fb.c b/drivers/gpu/drm/i915/intel_fb.c
index 54acd8b534df..a79525f434a8 100644
--- a/drivers/gpu/drm/i915/intel_fb.c
+++ b/drivers/gpu/drm/i915/intel_fb.c
@@ -130,7 +130,7 @@ static int intelfb_create(struct intel_fbdev *ifbdev,
 
 	strcpy(info->fix.id, "inteldrmfb");
 
-	info->flags = FBINFO_DEFAULT;
+	info->flags = FBINFO_DEFAULT | FBINFO_CAN_FORCE_OUTPUT;
 	info->fbops = &intelfb_ops;
 
 	/* setup aperture base/size for vesafb takeover */
@@ -148,8 +148,6 @@ static int intelfb_create(struct intel_fbdev *ifbdev,
 	info->fix.smem_start = dev->mode_config.fb_base + obj_priv->gtt_offset;
 	info->fix.smem_len = size;
 
-	info->flags = FBINFO_DEFAULT;
-
 	info->screen_base = ioremap_wc(dev->agp->base + obj_priv->gtt_offset,
 				       size);
 	if (!info->screen_base) {
diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
index 2fb2444d2322..099f637264aa 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
@@ -250,6 +250,7 @@ nouveau_fbcon_create(struct nouveau_fbdev *nfbdev,
 		info->flags = FBINFO_DEFAULT | FBINFO_HWACCEL_COPYAREA |
 			      FBINFO_HWACCEL_FILLRECT |
 			      FBINFO_HWACCEL_IMAGEBLIT;
+	info->flags |= FBINFO_CAN_FORCE_OUTPUT;
 	info->fbops = &nouveau_fbcon_ops;
 	info->fix.smem_start = dev->mode_config.fb_base + nvbo->bo.offset -
 			       dev_priv->vm_vram_base;
diff --git a/drivers/gpu/drm/radeon/radeon_fb.c b/drivers/gpu/drm/radeon/radeon_fb.c
index dc1634bb0c11..dbf86962bdd1 100644
--- a/drivers/gpu/drm/radeon/radeon_fb.c
+++ b/drivers/gpu/drm/radeon/radeon_fb.c
@@ -224,7 +224,7 @@ static int radeonfb_create(struct radeon_fbdev *rfbdev,
 
 	drm_fb_helper_fill_fix(info, fb->pitch, fb->depth);
 
-	info->flags = FBINFO_DEFAULT;
+	info->flags = FBINFO_DEFAULT | FBINFO_CAN_FORCE_OUTPUT;
 	info->fbops = &radeonfb_ops;
 
 	tmp = radeon_bo_gpu_offset(rbo) - rdev->mc.vram_start;
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 3b3f5749af92..26bf7cbfecc2 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -283,7 +283,8 @@ static inline int fbcon_is_inactive(struct vc_data *vc, struct fb_info *info)
 	struct fbcon_ops *ops = info->fbcon_par;
 
 	return (info->state != FBINFO_STATE_RUNNING ||
-		vc->vc_mode != KD_TEXT || ops->graphics);
+		vc->vc_mode != KD_TEXT || ops->graphics) &&
+		!vt_force_oops_output(vc);
 }
 
 static inline int get_color(struct vc_data *vc, struct fb_info *info,
@@ -1073,6 +1074,7 @@ static void fbcon_init(struct vc_data *vc, int init)
 	if (p->userfont)
 		charcnt = FNTCHARCNT(p->fontdata);
 
+	vc->vc_panic_force_write = !!(info->flags & FBINFO_CAN_FORCE_OUTPUT);
 	vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix)!=1);
 	vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800;
 	if (charcnt == 256) {
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index 38fe59dc89ae..d7d9acdccffb 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -105,6 +105,7 @@ struct vc_data {
 	struct vc_data **vc_display_fg;		/* [!] Ptr to var holding fg console for this display */
 	unsigned long	vc_uni_pagedir;
 	unsigned long	*vc_uni_pagedir_loc;  /* [!] Location of uni_pagedir variable for this console */
+	bool vc_panic_force_write; /* when oops/panic this VC can accept forced output/blanking */
 	/* additional information is in vt_kern.h */
 };
 
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 0c5659c41b01..f0268deca658 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -825,6 +825,10 @@ struct fb_tile_ops {
  */
 #define FBINFO_BE_MATH  0x100000
 
+/* report to the VT layer that this fb driver can accept forced console
+   output like oopses */
+#define FBINFO_CAN_FORCE_OUTPUT     0x200000
+
 struct fb_info {
 	int node;
 	int flags;
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 7f56db4a79f0..56cce345aa8d 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -100,6 +100,13 @@ extern int unbind_con_driver(const struct consw *csw, int first, int last,
 			     int deflt);
 int vty_init(const struct file_operations *console_fops);
 
+static inline bool vt_force_oops_output(struct vc_data *vc)
+{
+	if (oops_in_progress && vc->vc_panic_force_write)
+		return true;
+	return false;
+}
+
 /*
  * vc_screen.c shares this temporary buffer with the console write code so that
  * we can easily avoid touching user space while holding the console spinlock.
-- 
cgit v1.2.3-59-g8ed1b


From 8a1e803d0148e320b9200a442dfb88f8cbde88e7 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 1 Jun 2010 22:52:42 +0200
Subject: istallion: use bit ops for the board flags

This lets us avoid problems with races on the flag changes

Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/istallion.c  | 36 ++++++++++++++++++------------------
 include/linux/istallion.h |  2 +-
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index 750650cbac11..5e9a81d8ebcf 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -14,7 +14,6 @@
  *	the Free Software Foundation; either version 2 of the License, or
  *	(at your option) any later version.
  *
- *	FIXME: brdp->state needs proper locking.
  */
 
 /*****************************************************************************/
@@ -204,9 +203,9 @@ static int		stli_shared;
  *	the board has been detected, and whether it is actually running a slave
  *	or not.
  */
-#define	BST_FOUND	0x1
-#define	BST_STARTED	0x2
-#define	BST_PROBED	0x4
+#define	BST_FOUND	0
+#define	BST_STARTED	1
+#define	BST_PROBED	2
 
 /*
  *	Define the set of port state flags. These are marked for internal
@@ -817,7 +816,7 @@ static int stli_open(struct tty_struct *tty, struct file *filp)
 	brdp = stli_brds[brdnr];
 	if (brdp == NULL)
 		return -ENODEV;
-	if ((brdp->state & BST_STARTED) == 0)
+	if (!test_bit(BST_STARTED, &brdp->state))
 		return -ENODEV;
 	portnr = MINOR2PORT(minordev);
 	if (portnr > brdp->nrports)
@@ -1847,7 +1846,7 @@ static void stli_portinfo(struct seq_file *m, struct stlibrd *brdp, struct stlip
 	rc = stli_portcmdstats(NULL, portp);
 
 	uart = "UNKNOWN";
-	if (brdp->state & BST_STARTED) {
+	if (test_bit(BST_STARTED, &brdp->state)) {
 		switch (stli_comstats.hwid) {
 		case 0:	uart = "2681"; break;
 		case 1:	uart = "SC26198"; break;
@@ -1856,7 +1855,7 @@ static void stli_portinfo(struct seq_file *m, struct stlibrd *brdp, struct stlip
 	}
 	seq_printf(m, "%d: uart:%s ", portnr, uart);
 
-	if ((brdp->state & BST_STARTED) && (rc >= 0)) {
+	if (test_bit(BST_STARTED, &brdp->state) && rc >= 0) {
 		char sep;
 
 		seq_printf(m, "tx:%d rx:%d", (int) stli_comstats.txtotal,
@@ -2356,7 +2355,7 @@ static void stli_poll(unsigned long arg)
 		brdp = stli_brds[brdnr];
 		if (brdp == NULL)
 			continue;
-		if ((brdp->state & BST_STARTED) == 0)
+		if (!test_bit(BST_STARTED, &brdp->state))
 			continue;
 
 		spin_lock(&brd_lock);
@@ -3141,7 +3140,7 @@ static int stli_initecp(struct stlibrd *brdp)
 	}
 
 
-	brdp->state |= BST_FOUND;
+	set_bit(BST_FOUND, &brdp->state);
 	return 0;
 err_unmap:
 	iounmap(brdp->membase);
@@ -3298,7 +3297,7 @@ static int stli_initonb(struct stlibrd *brdp)
 	brdp->panels[0] = brdp->nrports;
 
 
-	brdp->state |= BST_FOUND;
+	set_bit(BST_FOUND, &brdp->state);
 	return 0;
 err_unmap:
 	iounmap(brdp->membase);
@@ -3408,7 +3407,7 @@ stli_donestartup:
 	spin_unlock_irqrestore(&brd_lock, flags);
 
 	if (rc == 0)
-		brdp->state |= BST_STARTED;
+		set_bit(BST_STARTED, &brdp->state);
 
 	if (! stli_timeron) {
 		stli_timeron++;
@@ -3711,7 +3710,7 @@ static int __devinit stli_pciprobe(struct pci_dev *pdev,
 	if (retval)
 		goto err_null;
 
-	brdp->state |= BST_PROBED;
+	set_bit(BST_PROBED, &brdp->state);
 	pci_set_drvdata(pdev, brdp);
 
 	EBRDENABLE(brdp);
@@ -3842,7 +3841,7 @@ static int __init stli_initbrds(void)
 			brdp = stli_brds[i];
 			if (brdp == NULL)
 				continue;
-			if (brdp->state & BST_FOUND) {
+			if (test_bit(BST_FOUND, &brdp->state)) {
 				EBRDENABLE(brdp);
 				brdp->enable = NULL;
 				brdp->disable = NULL;
@@ -4079,7 +4078,7 @@ static int stli_portcmdstats(struct tty_struct *tty, struct stliport *portp)
 		return -ENODEV;
 
 	mutex_lock(&portp->port.mutex);
-	if (brdp->state & BST_STARTED) {
+	if (test_bit(BST_STARTED, &brdp->state)) {
 		if ((rc = stli_cmdwait(brdp, portp, A_GETSTATS,
 		    &stli_cdkstats, sizeof(asystats_t), 1)) < 0) {
 			mutex_unlock(&portp->port.mutex);
@@ -4194,7 +4193,7 @@ static int stli_clrportstats(struct stliport *portp, comstats_t __user *cp)
 
 	mutex_lock(&portp->port.mutex);
 
-	if (brdp->state & BST_STARTED) {
+	if (test_bit(BST_STARTED, &brdp->state)) {
 		if ((rc = stli_cmdwait(brdp, portp, A_CLEARSTATS, NULL, 0, 0)) < 0) {
 			mutex_unlock(&portp->port.mutex);
 			return rc;
@@ -4323,10 +4322,10 @@ static long stli_memioctl(struct file *fp, unsigned int cmd, unsigned long arg)
 		rc = stli_startbrd(brdp);
 		break;
 	case STL_BSTOP:
-		brdp->state &= ~BST_STARTED;
+		clear_bit(BST_STARTED, &brdp->state);
 		break;
 	case STL_BRESET:
-		brdp->state &= ~BST_STARTED;
+		clear_bit(BST_STARTED, &brdp->state);
 		EBRDRESET(brdp);
 		if (stli_shared == 0) {
 			if (brdp->reenable != NULL)
@@ -4382,7 +4381,8 @@ static void istallion_cleanup_isa(void)
 	unsigned int j;
 
 	for (j = 0; (j < stli_nrbrds); j++) {
-		if ((brdp = stli_brds[j]) == NULL || (brdp->state & BST_PROBED))
+		if ((brdp = stli_brds[j]) == NULL ||
+				test_bit(BST_PROBED, &brdp->state))
 			continue;
 
 		stli_cleanup_ports(brdp);
diff --git a/include/linux/istallion.h b/include/linux/istallion.h
index 7faca98c7d14..ad700a60c158 100644
--- a/include/linux/istallion.h
+++ b/include/linux/istallion.h
@@ -86,7 +86,7 @@ struct stlibrd {
 	unsigned long	magic;
 	unsigned int	brdnr;
 	unsigned int	brdtype;
-	unsigned int	state;
+	unsigned long	state;
 	unsigned int	nrpanels;
 	unsigned int	nrports;
 	unsigned int	nrdevs;
-- 
cgit v1.2.3-59-g8ed1b


From d87d9b7d19f04b16c4406d3c0feeca10090e0ada Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 1 Jun 2010 22:52:53 +0200
Subject: tty: serial - fix tty referencing in set_ldisc

Pass down the ldisc number so that the drivers don't have to peek into the
tty object themselves. This lets us get rid of another case of back referencing
port to tty which we don't want (because of races versus hangup/close).

Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/bfin_5xx.c    | 7 ++-----
 drivers/serial/serial_core.c | 2 +-
 include/linux/serial_core.h  | 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 511cbf687877..a9eff2b18eab 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -957,15 +957,12 @@ bfin_serial_verify_port(struct uart_port *port, struct serial_struct *ser)
  * Enable the IrDA function if tty->ldisc.num is N_IRDA.
  * In other cases, disable IrDA function.
  */
-static void bfin_serial_set_ldisc(struct uart_port *port)
+static void bfin_serial_set_ldisc(struct uart_port *port, int ld)
 {
 	int line = port->line;
 	unsigned short val;
 
-	if (line >= port->state->port.tty->driver->num)
-		return;
-
-	switch (port->state->port.tty->termios->c_line) {
+	switch (ld) {
 	case N_IRDA:
 		val = UART_GET_GCTL(&bfin_serial_ports[line]);
 		val |= (IREN | RPOLC);
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 12ee7e0f99ce..570dca2935e2 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -1194,7 +1194,7 @@ static void uart_set_ldisc(struct tty_struct *tty)
 	struct uart_port *uport = state->uart_port;
 
 	if (uport->ops->set_ldisc)
-		uport->ops->set_ldisc(uport);
+		uport->ops->set_ldisc(uport, tty->termios->c_line);
 }
 
 static void uart_set_termios(struct tty_struct *tty,
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index f10db6e5f3b5..32928161fab6 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -220,7 +220,7 @@ struct uart_ops {
 	void		(*flush_buffer)(struct uart_port *);
 	void		(*set_termios)(struct uart_port *, struct ktermios *new,
 				       struct ktermios *old);
-	void		(*set_ldisc)(struct uart_port *);
+	void		(*set_ldisc)(struct uart_port *, int new);
 	void		(*pm)(struct uart_port *, unsigned int state,
 			      unsigned int oldstate);
 	int		(*set_wake)(struct uart_port *, unsigned int state);
-- 
cgit v1.2.3-59-g8ed1b


From ff917ba4f1a6189f90ed2c975984d6a1a1dc553d Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 1 Jun 2010 22:52:55 +0200
Subject: tty: Make vt's have a tty_port

The vt layer isn't safely handling reference counts to tty object on the input
side. Add a tty port structure to the vt layer in order to implement this using
the standard helpers.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/vt.c              | 2 ++
 include/linux/console_struct.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 9f67ad919a4a..295af823a074 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -783,6 +783,7 @@ int vc_allocate(unsigned int currcons)	/* return 0 on success */
 	    if (!vc)
 		return -ENOMEM;
 	    vc_cons[currcons].d = vc;
+	    tty_port_init(&vc->port);
 	    INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
 	    visual_init(vc, currcons, 1);
 	    if (!*vc->vc_uni_pagedir_loc)
@@ -2921,6 +2922,7 @@ static int __init con_init(void)
 	for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
 		vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT);
 		INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
+		tty_port_init(&vc->port);
 		visual_init(vc, currcons, 1);
 		vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT);
 		vc_init(vc, vc->vc_rows, vc->vc_cols,
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index d7d9acdccffb..25bf67f541fc 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -21,6 +21,8 @@ struct vt_struct;
 #define NPAR 16
 
 struct vc_data {
+	struct tty_port port;			/* Upper level data */
+
 	unsigned short	vc_num;			/* Console number */
 	unsigned int	vc_cols;		/* [#] Console size */
 	unsigned int	vc_rows;
-- 
cgit v1.2.3-59-g8ed1b


From 8ce73264b75be4d5ed480440ac32dfc1f25ff678 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 1 Jun 2010 22:52:56 +0200
Subject: tty: Move the vt_tty field from the vc_data into the standard
 tty_port

This takes all the tty references through the expected interface points so
we can refcount them.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/keyboard.c        | 10 +++++-----
 drivers/char/vt.c              | 10 +++++-----
 drivers/char/vt_ioctl.c        |  2 +-
 include/linux/console_struct.h |  1 -
 4 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
index 25be2102a60a..a7ca75212bfe 100644
--- a/drivers/char/keyboard.c
+++ b/drivers/char/keyboard.c
@@ -299,7 +299,7 @@ int kbd_rate(struct kbd_repeat *rep)
  */
 static void put_queue(struct vc_data *vc, int ch)
 {
-	struct tty_struct *tty = vc->vc_tty;
+	struct tty_struct *tty = vc->port.tty;
 
 	if (tty) {
 		tty_insert_flip_char(tty, ch, 0);
@@ -309,7 +309,7 @@ static void put_queue(struct vc_data *vc, int ch)
 
 static void puts_queue(struct vc_data *vc, char *cp)
 {
-	struct tty_struct *tty = vc->vc_tty;
+	struct tty_struct *tty = vc->port.tty;
 
 	if (!tty)
 		return;
@@ -485,7 +485,7 @@ static void fn_show_ptregs(struct vc_data *vc)
 
 static void fn_hold(struct vc_data *vc)
 {
-	struct tty_struct *tty = vc->vc_tty;
+	struct tty_struct *tty = vc->port.tty;
 
 	if (rep || !tty)
 		return;
@@ -563,7 +563,7 @@ static void fn_inc_console(struct vc_data *vc)
 
 static void fn_send_intr(struct vc_data *vc)
 {
-	struct tty_struct *tty = vc->vc_tty;
+	struct tty_struct *tty = vc->port.tty;
 
 	if (!tty)
 		return;
@@ -1162,7 +1162,7 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw)
 	struct keyboard_notifier_param param = { .vc = vc, .value = keycode, .down = down };
 	int rc;
 
-	tty = vc->vc_tty;
+	tty = vc->port.tty;
 
 	if (tty && (!tty->driver_data)) {
 		/* No driver data? Strange. Okay we fix it then. */
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 295af823a074..c734f9b1263a 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -973,12 +973,12 @@ static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc,
  *	Resize a virtual console as seen from the console end of things. We
  *	use the common vc_do_resize methods to update the structures. The
  *	caller must hold the console sem to protect console internals and
- *	vc->vc_tty
+ *	vc->port.tty
  */
 
 int vc_resize(struct vc_data *vc, unsigned int cols, unsigned int rows)
 {
-	return vc_do_resize(vc->vc_tty, vc, cols, rows);
+	return vc_do_resize(vc->port.tty, vc, cols, rows);
 }
 
 /**
@@ -2807,12 +2807,12 @@ static int con_open(struct tty_struct *tty, struct file *filp)
 			struct vc_data *vc = vc_cons[currcons].d;
 
 			/* Still being freed */
-			if (vc->vc_tty) {
+			if (vc->port.tty) {
 				release_console_sem();
 				return -ERESTARTSYS;
 			}
 			tty->driver_data = vc;
-			vc->vc_tty = tty;
+			vc->port.tty = tty;
 
 			if (!tty->winsize.ws_row && !tty->winsize.ws_col) {
 				tty->winsize.ws_row = vc_cons[currcons].d->vc_rows;
@@ -2840,7 +2840,7 @@ static void con_shutdown(struct tty_struct *tty)
 	struct vc_data *vc = tty->driver_data;
 	BUG_ON(vc == NULL);
 	acquire_console_sem();
-	vc->vc_tty = NULL;
+	vc->port.tty = NULL;
 	release_console_sem();
 	tty_shutdown(tty);
 }
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index cb19dbc52136..72dcfb2dc126 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -1369,7 +1369,7 @@ void vc_SAK(struct work_struct *work)
 	acquire_console_sem();
 	vc = vc_con->d;
 	if (vc) {
-		tty = vc->vc_tty;
+		tty = vc->port.tty;
 		/*
 		 * SAK should also work in all raw modes and reset
 		 * them properly.
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index 25bf67f541fc..7f0c32908568 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -58,7 +58,6 @@ struct vc_data {
 	/* VT terminal data */
 	unsigned int	vc_state;		/* Escape sequence parser state */
 	unsigned int	vc_npar,vc_par[NPAR];	/* Parameters of current escape sequence */
-	struct tty_struct *vc_tty;		/* TTY we are attached to */
 	/* data for manual vt switching */
 	struct vt_mode	vt_mode;
 	struct pid 	*vt_pid;
-- 
cgit v1.2.3-59-g8ed1b


From ec79d6056de58511d8e46d9ae59d3878f958dc3e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 1 Jun 2010 22:53:01 +0200
Subject: tty: replace BKL with a new tty_lock

As a preparation for replacing the big kernel lock
in the TTY layer, wrap all the callers in new
macros tty_lock, tty_lock_nested and tty_unlock.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/amiserial.c       |  16 +++---
 drivers/char/briq_panel.c      |   6 +--
 drivers/char/n_hdlc.c          |  16 +++---
 drivers/char/n_r3964.c         |   8 +--
 drivers/char/pty.c             |   4 +-
 drivers/char/selection.c       |   4 +-
 drivers/char/serial167.c       |   4 +-
 drivers/char/sx.c              |  12 ++---
 drivers/char/tty_io.c          | 115 +++++++++++++++++++++++------------------
 drivers/char/tty_ldisc.c       |  24 +++++----
 drivers/char/vc_screen.c       |   4 +-
 drivers/char/vt_ioctl.c        |  10 ++--
 drivers/serial/68360serial.c   |   4 +-
 drivers/serial/crisv10.c       |   4 +-
 drivers/serial/serial_core.c   |  10 ++--
 drivers/video/console/vgacon.c |   4 +-
 include/linux/tty.h            |  31 +++++++++++
 17 files changed, 161 insertions(+), 115 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c
index 4f8d60c25a98..1b21a7adeb58 100644
--- a/drivers/char/amiserial.c
+++ b/drivers/char/amiserial.c
@@ -1072,7 +1072,7 @@ static int get_serial_info(struct async_struct * info,
 	if (!retinfo)
 		return -EFAULT;
 	memset(&tmp, 0, sizeof(tmp));
-	lock_kernel();
+	tty_lock();
 	tmp.type = state->type;
 	tmp.line = state->line;
 	tmp.port = state->port;
@@ -1083,7 +1083,7 @@ static int get_serial_info(struct async_struct * info,
 	tmp.close_delay = state->close_delay;
 	tmp.closing_wait = state->closing_wait;
 	tmp.custom_divisor = state->custom_divisor;
-	unlock_kernel();
+	tty_unlock();
 	if (copy_to_user(retinfo,&tmp,sizeof(*retinfo)))
 		return -EFAULT;
 	return 0;
@@ -1100,14 +1100,14 @@ static int set_serial_info(struct async_struct * info,
 	if (copy_from_user(&new_serial,new_info,sizeof(new_serial)))
 		return -EFAULT;
 
-	lock_kernel();
+	tty_lock();
 	state = info->state;
 	old_state = *state;
   
 	change_irq = new_serial.irq != state->irq;
 	change_port = (new_serial.port != state->port);
 	if(change_irq || change_port || (new_serial.xmit_fifo_size != state->xmit_fifo_size)) {
-	  unlock_kernel();
+	  tty_unlock();
 	  return -EINVAL;
 	}
   
@@ -1127,7 +1127,7 @@ static int set_serial_info(struct async_struct * info,
 	}
 
 	if (new_serial.baud_base < 9600) {
-		unlock_kernel();
+		tty_unlock();
 		return -EINVAL;
 	}
 
@@ -1163,7 +1163,7 @@ check_and_exit:
 		}
 	} else
 		retval = startup(info);
-	unlock_kernel();
+	tty_unlock();
 	return retval;
 }
 
@@ -1538,7 +1538,7 @@ static void rs_wait_until_sent(struct tty_struct *tty, int timeout)
 
 	orig_jiffies = jiffies;
 
-	lock_kernel();
+	tty_lock_nested(); /* tty_wait_until_sent is called from lots of places */
 	/*
 	 * Set the check interval to be 1/5 of the estimated time to
 	 * send a single character, and make it at least 1.  The check
@@ -1579,7 +1579,7 @@ static void rs_wait_until_sent(struct tty_struct *tty, int timeout)
 			break;
 	}
 	__set_current_state(TASK_RUNNING);
-	unlock_kernel();
+	tty_unlock();
 #ifdef SERIAL_DEBUG_RS_WAIT_UNTIL_SENT
 	printk("lsr = %d (jiff=%lu)...done\n", lsr, jiffies);
 #endif
diff --git a/drivers/char/briq_panel.c b/drivers/char/briq_panel.c
index 555cd93c2ee5..d5fa113afe37 100644
--- a/drivers/char/briq_panel.c
+++ b/drivers/char/briq_panel.c
@@ -67,15 +67,15 @@ static void set_led(char state)
 
 static int briq_panel_open(struct inode *ino, struct file *filep)
 {
-	lock_kernel();
+	tty_lock();
 	/* enforce single access, vfd_is_open is protected by BKL */
 	if (vfd_is_open) {
-		unlock_kernel();
+		tty_unlock();
 		return -EBUSY;
 	}
 	vfd_is_open = 1;
 
-	unlock_kernel();
+	tty_unlock();
 	return 0;
 }
 
diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c
index c68118efad84..47d32281032c 100644
--- a/drivers/char/n_hdlc.c
+++ b/drivers/char/n_hdlc.c
@@ -598,18 +598,18 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file,
 		return -EFAULT;
 	}
 
-	lock_kernel();
+	tty_lock();
 
 	for (;;) {
 		if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) {
-			unlock_kernel();
+			tty_unlock();
 			return -EIO;
 		}
 
 		n_hdlc = tty2n_hdlc (tty);
 		if (!n_hdlc || n_hdlc->magic != HDLC_MAGIC ||
 			 tty != n_hdlc->tty) {
-			unlock_kernel();
+			tty_unlock();
 			return 0;
 		}
 
@@ -619,13 +619,13 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file,
 			
 		/* no data */
 		if (file->f_flags & O_NONBLOCK) {
-			unlock_kernel();
+			tty_unlock();
 			return -EAGAIN;
 		}
 			
 		interruptible_sleep_on (&tty->read_wait);
 		if (signal_pending(current)) {
-			unlock_kernel();
+			tty_unlock();
 			return -EINTR;
 		}
 	}
@@ -648,7 +648,7 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file,
 		kfree(rbuf);
 	else	
 		n_hdlc_buf_put(&n_hdlc->rx_free_buf_list,rbuf);
-	unlock_kernel();
+	tty_unlock();
 	return ret;
 	
 }	/* end of n_hdlc_tty_read() */
@@ -691,7 +691,7 @@ static ssize_t n_hdlc_tty_write(struct tty_struct *tty, struct file *file,
 		count = maxframe;
 	}
 	
-	lock_kernel();
+	tty_lock();
 
 	add_wait_queue(&tty->write_wait, &wait);
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -731,7 +731,7 @@ static ssize_t n_hdlc_tty_write(struct tty_struct *tty, struct file *file,
 		n_hdlc_buf_put(&n_hdlc->tx_buf_list,tbuf);
 		n_hdlc_send_frames(n_hdlc,tty);
 	}
-	unlock_kernel();
+	tty_unlock();
 	return error;
 	
 }	/* end of n_hdlc_tty_write() */
diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c
index c1d8b54c816d..f4bd2591e39f 100644
--- a/drivers/char/n_r3964.c
+++ b/drivers/char/n_r3964.c
@@ -1067,7 +1067,7 @@ static ssize_t r3964_read(struct tty_struct *tty, struct file *file,
 
 	TRACE_L("read()");
 
-	lock_kernel();
+	tty_lock();
 
 	pClient = findClient(pInfo, task_pid(current));
 	if (pClient) {
@@ -1109,7 +1109,7 @@ static ssize_t r3964_read(struct tty_struct *tty, struct file *file,
 	}
 	ret = -EPERM;
 unlock:
-	unlock_kernel();
+	tty_unlock();
 	return ret;
 }
 
@@ -1158,7 +1158,7 @@ static ssize_t r3964_write(struct tty_struct *tty, struct file *file,
 	pHeader->locks = 0;
 	pHeader->owner = NULL;
 
-	lock_kernel();
+	tty_lock();
 
 	pClient = findClient(pInfo, task_pid(current));
 	if (pClient) {
@@ -1177,7 +1177,7 @@ static ssize_t r3964_write(struct tty_struct *tty, struct file *file,
 	add_tx_queue(pInfo, pHeader);
 	trigger_transmit(pInfo);
 
-	unlock_kernel();
+	tty_unlock();
 
 	return 0;
 }
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index b640ef29be1c..622e21ca9a5c 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -692,9 +692,9 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 {
 	int ret;
 
-	lock_kernel();
+	tty_lock();
 	ret = __ptmx_open(inode, filp);
-	unlock_kernel();
+	tty_unlock();
 	return ret;
 }
 
diff --git a/drivers/char/selection.c b/drivers/char/selection.c
index 6e79340d732f..85211a3a5811 100644
--- a/drivers/char/selection.c
+++ b/drivers/char/selection.c
@@ -313,7 +313,7 @@ int paste_selection(struct tty_struct *tty)
 	struct  tty_ldisc *ld;
 	DECLARE_WAITQUEUE(wait, current);
 
-	lock_kernel();
+	tty_lock_nested(); /* always called with BTM from vt_ioctl */
 
 	acquire_console_sem();
 	poke_blanked_console();
@@ -338,6 +338,6 @@ int paste_selection(struct tty_struct *tty)
 	__set_current_state(TASK_RUNNING);
 
 	tty_ldisc_deref(ld);
-	unlock_kernel();
+	tty_unlock();
 	return 0;
 }
diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c
index ecbe479c7d68..90b3ec0aabdd 100644
--- a/drivers/char/serial167.c
+++ b/drivers/char/serial167.c
@@ -1505,7 +1505,7 @@ cy_ioctl(struct tty_struct *tty, struct file *file,
 	printk("cy_ioctl %s, cmd = %x arg = %lx\n", tty->name, cmd, arg);	/* */
 #endif
 
-	lock_kernel();
+	tty_lock();
 
 	switch (cmd) {
 	case CYGETMON:
@@ -1561,7 +1561,7 @@ cy_ioctl(struct tty_struct *tty, struct file *file,
 	default:
 		ret_val = -ENOIOCTLCMD;
 	}
-	unlock_kernel();
+	tty_unlock();
 
 #ifdef SERIAL_DEBUG_OTHER
 	printk("cy_ioctl done\n");
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index a81ec4fcf6ff..5b24db4ff7f1 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -1699,7 +1699,7 @@ static long sx_fw_ioctl(struct file *filp, unsigned int cmd,
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
 
-	lock_kernel();
+	tty_lock();
 
 	sx_dprintk(SX_DEBUG_FIRMWARE, "IOCTL %x: %lx\n", cmd, arg);
 
@@ -1848,7 +1848,7 @@ static long sx_fw_ioctl(struct file *filp, unsigned int cmd,
 		break;
 	}
 out:
-	unlock_kernel();
+	tty_unlock();
 	func_exit();
 	return rc;
 }
@@ -1859,7 +1859,7 @@ static int sx_break(struct tty_struct *tty, int flag)
 	int rv;
 
 	func_enter();
-	lock_kernel();
+	tty_lock();
 
 	if (flag)
 		rv = sx_send_command(port, HS_START, -1, HS_IDLE_BREAK);
@@ -1868,7 +1868,7 @@ static int sx_break(struct tty_struct *tty, int flag)
 	if (rv != 1)
 		printk(KERN_ERR "sx: couldn't send break (%x).\n",
 			read_sx_byte(port->board, CHAN_OFFSET(port, hi_hstat)));
-	unlock_kernel();
+	tty_unlock();
 	func_exit();
 	return 0;
 }
@@ -1909,7 +1909,7 @@ static int sx_ioctl(struct tty_struct *tty, struct file *filp,
 	/* func_enter2(); */
 
 	rc = 0;
-	lock_kernel();
+	tty_lock();
 	switch (cmd) {
 	case TIOCGSERIAL:
 		rc = gs_getserial(&port->gs, argp);
@@ -1921,7 +1921,7 @@ static int sx_ioctl(struct tty_struct *tty, struct file *filp,
 		rc = -ENOIOCTLCMD;
 		break;
 	}
-	unlock_kernel();
+	tty_unlock();
 
 	/* func_exit(); */
 	return rc;
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 507441ac6edb..5ee9081a560f 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -149,6 +149,7 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd,
 #else
 #define tty_compat_ioctl NULL
 #endif
+static int __tty_fasync(int fd, struct file *filp, int on);
 static int tty_fasync(int fd, struct file *filp, int on);
 static void release_tty(struct tty_struct *tty, int idx);
 static void __proc_set_tty(struct task_struct *tsk, struct tty_struct *tty);
@@ -483,7 +484,7 @@ EXPORT_SYMBOL_GPL(tty_wakeup);
  *	remains intact.
  *
  *	Locking:
- *		BKL
+ *		BTM
  *		  redirect lock for undoing redirection
  *		  file list lock for manipulating list of ttys
  *		  tty_ldisc_lock from called functions
@@ -513,8 +514,11 @@ static void do_tty_hangup(struct work_struct *work)
 	}
 	spin_unlock(&redirect_lock);
 
-	/* inuse_filps is protected by the single kernel lock */
-	lock_kernel();
+	/* inuse_filps is protected by the single tty lock,
+	   this really needs to change if we want to flush the
+	   workqueue with the lock held */
+	tty_lock_nested(); /* called with BTM held from pty_close and
+				others */
 	check_tty_count(tty, "do_tty_hangup");
 
 	file_list_lock();
@@ -525,7 +529,7 @@ static void do_tty_hangup(struct work_struct *work)
 		if (filp->f_op->write != tty_write)
 			continue;
 		closecount++;
-		tty_fasync(-1, filp, 0);	/* can't block */
+		__tty_fasync(-1, filp, 0);	/* can't block */
 		filp->f_op = &hung_up_tty_fops;
 	}
 	file_list_unlock();
@@ -594,7 +598,7 @@ static void do_tty_hangup(struct work_struct *work)
 	 */
 	set_bit(TTY_HUPPED, &tty->flags);
 	tty_ldisc_enable(tty);
-	unlock_kernel();
+	tty_unlock();
 	if (f)
 		fput(f);
 }
@@ -696,7 +700,8 @@ static void session_clear_tty(struct pid *session)
  *	exiting; it is 0 if called by the ioctl TIOCNOTTY.
  *
  *	Locking:
- *		BKL is taken for hysterical raisins
+ *		BTM is taken for hysterical raisins, and held when
+ *		  called from no_tty().
  *		  tty_mutex is taken to protect tty
  *		  ->siglock is taken to protect ->signal/->sighand
  *		  tasklist_lock is taken to walk process list for sessions
@@ -714,10 +719,10 @@ void disassociate_ctty(int on_exit)
 	tty = get_current_tty();
 	if (tty) {
 		tty_pgrp = get_pid(tty->pgrp);
-		lock_kernel();
+		tty_lock_nested(); /* see above */
 		if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY)
 			tty_vhangup(tty);
-		unlock_kernel();
+		tty_unlock();
 		tty_kref_put(tty);
 	} else if (on_exit) {
 		struct pid *old_pgrp;
@@ -774,9 +779,9 @@ void disassociate_ctty(int on_exit)
 void no_tty(void)
 {
 	struct task_struct *tsk = current;
-	lock_kernel();
+	tty_lock();
 	disassociate_ctty(0);
-	unlock_kernel();
+	tty_unlock();
 	proc_clear_tty(tsk);
 }
 
@@ -1013,19 +1018,19 @@ out:
  * We don't put it into the syslog queue right now maybe in the future if
  * really needed.
  *
- * We must still hold the BKL and test the CLOSING flag for the moment.
+ * We must still hold the BTM and test the CLOSING flag for the moment.
  */
 
 void tty_write_message(struct tty_struct *tty, char *msg)
 {
 	if (tty) {
 		mutex_lock(&tty->atomic_write_lock);
-		lock_kernel();
+		tty_lock();
 		if (tty->ops->write && !test_bit(TTY_CLOSING, &tty->flags)) {
-			unlock_kernel();
+			tty_unlock();
 			tty->ops->write(tty, msg, strlen(msg));
 		} else
-			unlock_kernel();
+			tty_unlock();
 		tty_write_unlock(tty);
 	}
 	return;
@@ -1208,18 +1213,18 @@ static int tty_driver_install_tty(struct tty_driver *driver,
 	int ret;
 
 	if (driver->ops->install) {
-		lock_kernel();
+		tty_lock_nested(); /* already called with BTM held */
 		ret = driver->ops->install(driver, tty);
-		unlock_kernel();
+		tty_unlock();
 		return ret;
 	}
 
 	if (tty_init_termios(tty) == 0) {
-		lock_kernel();
+		tty_lock_nested();
 		tty_driver_kref_get(driver);
 		tty->count++;
 		driver->ttys[idx] = tty;
-		unlock_kernel();
+		tty_unlock();
 		return 0;
 	}
 	return -ENOMEM;
@@ -1312,14 +1317,15 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
 	struct tty_struct *tty;
 	int retval;
 
-	lock_kernel();
+	tty_lock_nested(); /* always called with tty lock held already */
+
 	/* Check if pty master is being opened multiple times */
 	if (driver->subtype == PTY_TYPE_MASTER &&
 		(driver->flags & TTY_DRIVER_DEVPTS_MEM) && !first_ok) {
-		unlock_kernel();
+		tty_unlock();
 		return ERR_PTR(-EIO);
 	}
-	unlock_kernel();
+	tty_unlock();
 
 	/*
 	 * First time open is complex, especially for PTY devices.
@@ -1363,9 +1369,9 @@ release_mem_out:
 	if (printk_ratelimit())
 		printk(KERN_INFO "tty_init_dev: ldisc open failed, "
 				 "clearing slot %d\n", idx);
-	lock_kernel();
+	tty_lock_nested();
 	release_tty(tty, idx);
-	unlock_kernel();
+	tty_unlock();
 	return ERR_PTR(retval);
 }
 
@@ -1512,10 +1518,10 @@ int tty_release(struct inode *inode, struct file *filp)
 	if (tty_paranoia_check(tty, inode, "tty_release_dev"))
 		return 0;
 
-	lock_kernel();
+	tty_lock();
 	check_tty_count(tty, "tty_release_dev");
 
-	tty_fasync(-1, filp, 0);
+	__tty_fasync(-1, filp, 0);
 
 	idx = tty->index;
 	pty_master = (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
@@ -1527,18 +1533,18 @@ int tty_release(struct inode *inode, struct file *filp)
 	if (idx < 0 || idx >= tty->driver->num) {
 		printk(KERN_DEBUG "tty_release_dev: bad idx when trying to "
 				  "free (%s)\n", tty->name);
-		unlock_kernel();
+		tty_unlock();
 		return 0;
 	}
 	if (!devpts) {
 		if (tty != tty->driver->ttys[idx]) {
-			unlock_kernel();
+			tty_unlock();
 			printk(KERN_DEBUG "tty_release_dev: driver.table[%d] not tty "
 			       "for (%s)\n", idx, tty->name);
 			return 0;
 		}
 		if (tty->termios != tty->driver->termios[idx]) {
-			unlock_kernel();
+			tty_unlock();
 			printk(KERN_DEBUG "tty_release_dev: driver.termios[%d] not termios "
 			       "for (%s)\n",
 			       idx, tty->name);
@@ -1556,21 +1562,21 @@ int tty_release(struct inode *inode, struct file *filp)
 	if (tty->driver->other &&
 	     !(tty->driver->flags & TTY_DRIVER_DEVPTS_MEM)) {
 		if (o_tty != tty->driver->other->ttys[idx]) {
-			unlock_kernel();
+			tty_unlock();
 			printk(KERN_DEBUG "tty_release_dev: other->table[%d] "
 					  "not o_tty for (%s)\n",
 			       idx, tty->name);
 			return 0 ;
 		}
 		if (o_tty->termios != tty->driver->other->termios[idx]) {
-			unlock_kernel();
+			tty_unlock();
 			printk(KERN_DEBUG "tty_release_dev: other->termios[%d] "
 					  "not o_termios for (%s)\n",
 			       idx, tty->name);
 			return 0;
 		}
 		if (o_tty->link != tty) {
-			unlock_kernel();
+			tty_unlock();
 			printk(KERN_DEBUG "tty_release_dev: bad pty pointers\n");
 			return 0;
 		}
@@ -1579,7 +1585,7 @@ int tty_release(struct inode *inode, struct file *filp)
 	if (tty->ops->close)
 		tty->ops->close(tty, filp);
 
-	unlock_kernel();
+	tty_unlock();
 	/*
 	 * Sanity check: if tty->count is going to zero, there shouldn't be
 	 * any waiters on tty->read_wait or tty->write_wait.  We test the
@@ -1602,7 +1608,7 @@ int tty_release(struct inode *inode, struct file *filp)
 		   opens on /dev/tty */
 
 		mutex_lock(&tty_mutex);
-		lock_kernel();
+		tty_lock();
 		tty_closing = tty->count <= 1;
 		o_tty_closing = o_tty &&
 			(o_tty->count <= (pty_master ? 1 : 0));
@@ -1633,7 +1639,7 @@ int tty_release(struct inode *inode, struct file *filp)
 
 		printk(KERN_WARNING "tty_release_dev: %s: read/write wait queue "
 				    "active!\n", tty_name(tty, buf));
-		unlock_kernel();
+		tty_unlock();
 		mutex_unlock(&tty_mutex);
 		schedule();
 	}
@@ -1698,7 +1704,7 @@ int tty_release(struct inode *inode, struct file *filp)
 
 	/* check whether both sides are closing ... */
 	if (!tty_closing || (o_tty && !o_tty_closing)) {
-		unlock_kernel();
+		tty_unlock();
 		return 0;
 	}
 
@@ -1718,7 +1724,7 @@ int tty_release(struct inode *inode, struct file *filp)
 	/* Make this pty number available for reallocation */
 	if (devpts)
 		devpts_kill_index(inode, idx);
-	unlock_kernel();
+	tty_unlock();
 	return 0;
 }
 
@@ -1760,12 +1766,12 @@ retry_open:
 	retval = 0;
 
 	mutex_lock(&tty_mutex);
-	lock_kernel();
+	tty_lock();
 
 	if (device == MKDEV(TTYAUX_MAJOR, 0)) {
 		tty = get_current_tty();
 		if (!tty) {
-			unlock_kernel();
+			tty_unlock();
 			mutex_unlock(&tty_mutex);
 			return -ENXIO;
 		}
@@ -1797,14 +1803,14 @@ retry_open:
 				goto got_driver;
 			}
 		}
-		unlock_kernel();
+		tty_unlock();
 		mutex_unlock(&tty_mutex);
 		return -ENODEV;
 	}
 
 	driver = get_tty_driver(device, &index);
 	if (!driver) {
-		unlock_kernel();
+		tty_unlock();
 		mutex_unlock(&tty_mutex);
 		return -ENODEV;
 	}
@@ -1814,7 +1820,7 @@ got_driver:
 		tty = tty_driver_lookup_tty(driver, inode, index);
 
 		if (IS_ERR(tty)) {
-			unlock_kernel();
+			tty_unlock();
 			mutex_unlock(&tty_mutex);
 			return PTR_ERR(tty);
 		}
@@ -1830,7 +1836,7 @@ got_driver:
 	mutex_unlock(&tty_mutex);
 	tty_driver_kref_put(driver);
 	if (IS_ERR(tty)) {
-		unlock_kernel();
+		tty_unlock();
 		return PTR_ERR(tty);
 	}
 
@@ -1862,11 +1868,11 @@ got_driver:
 #endif
 		tty_release(inode, filp);
 		if (retval != -ERESTARTSYS) {
-			unlock_kernel();
+			tty_unlock();
 			return retval;
 		}
 		if (signal_pending(current)) {
-			unlock_kernel();
+			tty_unlock();
 			return retval;
 		}
 		schedule();
@@ -1875,14 +1881,14 @@ got_driver:
 		 */
 		if (filp->f_op == &hung_up_tty_fops)
 			filp->f_op = &tty_fops;
-		unlock_kernel();
+		tty_unlock();
 		goto retry_open;
 	}
-	unlock_kernel();
+	tty_unlock();
 
 
 	mutex_lock(&tty_mutex);
-	lock_kernel();
+	tty_lock();
 	spin_lock_irq(&current->sighand->siglock);
 	if (!noctty &&
 	    current->signal->leader &&
@@ -1890,7 +1896,7 @@ got_driver:
 	    tty->session == NULL)
 		__proc_set_tty(current, tty);
 	spin_unlock_irq(&current->sighand->siglock);
-	unlock_kernel();
+	tty_unlock();
 	mutex_unlock(&tty_mutex);
 	return 0;
 }
@@ -1926,13 +1932,12 @@ static unsigned int tty_poll(struct file *filp, poll_table *wait)
 	return ret;
 }
 
-static int tty_fasync(int fd, struct file *filp, int on)
+static int __tty_fasync(int fd, struct file *filp, int on)
 {
 	struct tty_struct *tty;
 	unsigned long flags;
 	int retval = 0;
 
-	lock_kernel();
 	tty = (struct tty_struct *)filp->private_data;
 	if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_fasync"))
 		goto out;
@@ -1966,7 +1971,15 @@ static int tty_fasync(int fd, struct file *filp, int on)
 	}
 	retval = 0;
 out:
-	unlock_kernel();
+	return retval;
+}
+
+static int tty_fasync(int fd, struct file *filp, int on)
+{
+	int retval;
+	tty_lock();
+	retval = __tty_fasync(fd, filp, on);
+	tty_unlock();
 	return retval;
 }
 
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index 500e740ec5e4..97681ffd6cbd 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -440,6 +440,8 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
  *
  *	A helper opening method. Also a convenient debugging and check
  *	point.
+ *
+ *	Locking: always called with BTM already held.
  */
 
 static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
@@ -447,10 +449,10 @@ static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
 	WARN_ON(test_and_set_bit(TTY_LDISC_OPEN, &tty->flags));
 	if (ld->ops->open) {
 		int ret;
-                /* BKL here locks verus a hangup event */
-		lock_kernel();
+                /* BTM here locks versus a hangup event */
+		tty_lock_nested(); /* always held here already */
 		ret = ld->ops->open(tty);
-		unlock_kernel();
+		tty_unlock();
 		return ret;
 	}
 	return 0;
@@ -553,7 +555,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 	if (IS_ERR(new_ldisc))
 		return PTR_ERR(new_ldisc);
 
-	lock_kernel();
+	tty_lock();
 	/*
 	 *	We need to look at the tty locking here for pty/tty pairs
 	 *	when both sides try to change in parallel.
@@ -567,12 +569,12 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 	 */
 
 	if (tty->ldisc->ops->num == ldisc) {
-		unlock_kernel();
+		tty_unlock();
 		tty_ldisc_put(new_ldisc);
 		return 0;
 	}
 
-	unlock_kernel();
+	tty_unlock();
 	/*
 	 *	Problem: What do we do if this blocks ?
 	 *	We could deadlock here
@@ -594,7 +596,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 		mutex_lock(&tty->ldisc_mutex);
 	}
 
-	lock_kernel();
+	tty_lock();
 
 	set_bit(TTY_LDISC_CHANGING, &tty->flags);
 
@@ -607,7 +609,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 
 	o_ldisc = tty->ldisc;
 
-	unlock_kernel();
+	tty_unlock();
 	/*
 	 *	Make sure we don't change while someone holds a
 	 *	reference to the line discipline. The TTY_LDISC bit
@@ -633,14 +635,14 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 	flush_scheduled_work();
 
 	mutex_lock(&tty->ldisc_mutex);
-	lock_kernel();
+	tty_lock();
 	if (test_bit(TTY_HUPPED, &tty->flags)) {
 		/* We were raced by the hangup method. It will have stomped
 		   the ldisc data and closed the ldisc down */
 		clear_bit(TTY_LDISC_CHANGING, &tty->flags);
 		mutex_unlock(&tty->ldisc_mutex);
 		tty_ldisc_put(new_ldisc);
-		unlock_kernel();
+		tty_unlock();
 		return -EIO;
 	}
 
@@ -682,7 +684,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 	if (o_work)
 		schedule_delayed_work(&o_tty->buf.work, 1);
 	mutex_unlock(&tty->ldisc_mutex);
-	unlock_kernel();
+	tty_unlock();
 	return retval;
 }
 
diff --git a/drivers/char/vc_screen.c b/drivers/char/vc_screen.c
index c1791a63d99d..bcce46c96b88 100644
--- a/drivers/char/vc_screen.c
+++ b/drivers/char/vc_screen.c
@@ -463,10 +463,10 @@ vcs_open(struct inode *inode, struct file *filp)
 	unsigned int currcons = iminor(inode) & 127;
 	int ret = 0;
 	
-	lock_kernel();
+	tty_lock();
 	if(currcons && !vc_cons_allocated(currcons-1))
 		ret = -ENXIO;
-	unlock_kernel();
+	tty_unlock();
 	return ret;
 }
 
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index 72dcfb2dc126..cf87c5336229 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -509,7 +509,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 
 	console = vc->vc_num;
 
-	lock_kernel();
+	tty_lock();
 
 	if (!vc_cons_allocated(console)) { 	/* impossible? */
 		ret = -ENOIOCTLCMD;
@@ -1336,7 +1336,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		ret = -ENOIOCTLCMD;
 	}
 out:
-	unlock_kernel();
+	tty_unlock();
 	return ret;
 eperm:
 	ret = -EPERM;
@@ -1503,7 +1503,7 @@ long vt_compat_ioctl(struct tty_struct *tty, struct file * file,
 
 	console = vc->vc_num;
 
-	lock_kernel();
+	tty_lock();
 
 	if (!vc_cons_allocated(console)) { 	/* impossible? */
 		ret = -ENOIOCTLCMD;
@@ -1571,11 +1571,11 @@ long vt_compat_ioctl(struct tty_struct *tty, struct file * file,
 		goto fallback;
 	}
 out:
-	unlock_kernel();
+	tty_unlock();
 	return ret;
 
 fallback:
-	unlock_kernel();
+	tty_unlock();
 	return vt_ioctl(tty, file, cmd, arg);
 }
 
diff --git a/drivers/serial/68360serial.c b/drivers/serial/68360serial.c
index 768612f8e41e..16f5f2fab032 100644
--- a/drivers/serial/68360serial.c
+++ b/drivers/serial/68360serial.c
@@ -1705,7 +1705,7 @@ static void rs_360_wait_until_sent(struct tty_struct *tty, int timeout)
 	printk("jiff=%lu...", jiffies);
 #endif
 
-	lock_kernel();
+	tty_lock_nested(); /* always held already since we come from ->close */
 	/* We go through the loop at least once because we can't tell
 	 * exactly when the last character exits the shifter.  There can
 	 * be at least two characters waiting to be sent after the buffers
@@ -1734,7 +1734,7 @@ static void rs_360_wait_until_sent(struct tty_struct *tty, int timeout)
 			bdp--;
 	} while (bdp->status & BD_SC_READY);
 	current->state = TASK_RUNNING;
-	unlock_kernel();
+	tty_unlock();
 #ifdef SERIAL_DEBUG_RS_WAIT_UNTIL_SENT
 	printk("lsr = %d (jiff=%lu)...done\n", lsr, jiffies);
 #endif
diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c
index 30626440a062..f848e188deae 100644
--- a/drivers/serial/crisv10.c
+++ b/drivers/serial/crisv10.c
@@ -3935,7 +3935,7 @@ static void rs_wait_until_sent(struct tty_struct *tty, int timeout)
 	 * Check R_DMA_CHx_STATUS bit 0-6=number of available bytes in FIFO
 	 * R_DMA_CHx_HWSW bit 31-16=nbr of bytes left in DMA buffer (0=64k)
 	 */
-	lock_kernel();
+	tty_lock_nested(); /* locked already when coming from close */
 	orig_jiffies = jiffies;
 	while (info->xmit.head != info->xmit.tail || /* More in send queue */
 	       (*info->ostatusadr & 0x007f) ||  /* more in FIFO */
@@ -3952,7 +3952,7 @@ static void rs_wait_until_sent(struct tty_struct *tty, int timeout)
 			curr_time_usec - info->last_tx_active_usec;
 	}
 	set_current_state(TASK_RUNNING);
-	unlock_kernel();
+	tty_unlock();
 }
 
 /*
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 3d2acc2265f7..851d7c29132b 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -1274,7 +1274,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	struct uart_port *uport;
 	unsigned long flags;
 
-	BUG_ON(!kernel_locked());
+	BUG_ON(!tty_locked());
 
 	if (!state)
 		return;
@@ -1382,7 +1382,7 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 	if (port->type == PORT_UNKNOWN || port->fifosize == 0)
 		return;
 
-	lock_kernel();
+	tty_lock_nested(); /* already locked when coming from close */
 
 	/*
 	 * Set the check interval to be 1/5 of the estimated time to
@@ -1429,7 +1429,7 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 			break;
 	}
 	set_current_state(TASK_RUNNING); /* might not be needed */
-	unlock_kernel();
+	tty_unlock();
 }
 
 /*
@@ -1444,7 +1444,7 @@ static void uart_hangup(struct tty_struct *tty)
 	struct tty_port *port = &state->port;
 	unsigned long flags;
 
-	BUG_ON(!kernel_locked());
+	BUG_ON(!tty_locked());
 	pr_debug("uart_hangup(%d)\n", state->uart_port->line);
 
 	mutex_lock(&port->mutex);
@@ -1578,7 +1578,7 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	struct tty_port *port;
 	int retval, line = tty->index;
 
-	BUG_ON(!kernel_locked());
+	BUG_ON(!tty_locked());
 	pr_debug("uart_open(%d) called\n", line);
 
 	/*
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 182dd6f8aadd..71970057c118 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -1108,7 +1108,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512)
 		charmap += 4 * cmapsz;
 #endif
 
-	unlock_kernel();
+	tty_unlock();
 	spin_lock_irq(&vga_lock);
 	/* First, the Sequencer */
 	vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x1);
@@ -1192,7 +1192,7 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512)
 		vga_wattr(state->vgabase, VGA_AR_ENABLE_DISPLAY, 0);	
 	}
 	spin_unlock_irq(&vga_lock);
-	lock_kernel();
+	tty_lock();
 	return 0;
 }
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 2df60e4ff40e..6ead6b60c743 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -13,6 +13,7 @@
 #include <linux/tty_driver.h>
 #include <linux/tty_ldisc.h>
 #include <linux/mutex.h>
+#include <linux/smp_lock.h>
 
 #include <asm/system.h>
 
@@ -576,5 +577,35 @@ extern int vt_ioctl(struct tty_struct *tty, struct file *file,
 extern long vt_compat_ioctl(struct tty_struct *tty, struct file * file,
 		     unsigned int cmd, unsigned long arg);
 
+/* functions for preparation of BKL removal */
+
+/*
+ * tty_lock_nested get the tty_lock while potentially holding it
+ *
+ * The Big TTY Mutex is a recursive lock, meaning you can take it
+ * from a thread that is already holding it.
+ * This is bad for a number of reasons, so tty_lock_nested should
+ * really be used as rarely as possible. If a code location can
+ * be shown to never get called with this held already, it should
+ * use tty_lock() instead.
+ */
+static inline void __lockfunc tty_lock_nested(void) __acquires(kernel_lock)
+{
+	lock_kernel();
+}
+static inline void tty_lock(void) __acquires(kernel_lock)
+{
+#ifdef CONFIG_LOCK_KERNEL
+	/* kernel_locked is 1 for !CONFIG_LOCK_KERNEL */
+	WARN_ON(kernel_locked());
+#endif
+	lock_kernel();
+}
+static inline void tty_unlock(void) __releases(kernel_lock)
+{
+	unlock_kernel();
+}
+#define tty_locked()		(kernel_locked())
+
 #endif /* __KERNEL__ */
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From be1bc2889a4db4961ef69f47fb471ecae9f23ade Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 1 Jun 2010 22:53:05 +0200
Subject: tty: introduce wait_event_interruptible_tty

Calling wait_event_interruptible implicitly
releases the BKL when it sleeps, but we need
to do this explcitly when we have converted
it to a mutex.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/cyclades.c  |  2 +-
 drivers/char/istallion.c | 12 ++++++++----
 drivers/char/n_r3964.c   |  2 +-
 drivers/char/tty_port.c  |  2 +-
 drivers/char/vt_ioctl.c  |  5 ++++-
 drivers/serial/crisv10.c |  4 ++--
 include/linux/tty.h      | 42 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 59 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 51acfe39a438..27aad9422332 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -1607,7 +1607,7 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 	 * If the port is the middle of closing, bail out now
 	 */
 	if (tty_hung_up_p(filp) || (info->port.flags & ASYNC_CLOSING)) {
-		wait_event_interruptible(info->port.close_wait,
+		wait_event_interruptible_tty(info->port.close_wait,
 				!(info->port.flags & ASYNC_CLOSING));
 		return (info->port.flags & ASYNC_HUP_NOTIFY) ? -EAGAIN: -ERESTARTSYS;
 	}
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index 5e9a81d8ebcf..be28391adb79 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -954,7 +954,7 @@ static int stli_rawopen(struct stlibrd *brdp, struct stliport *portp, unsigned l
  *	order of opens and closes may not be preserved across shared
  *	memory, so we must wait until it is complete.
  */
-	wait_event_interruptible(portp->raw_wait,
+	wait_event_interruptible_tty(portp->raw_wait,
 			!test_bit(ST_CLOSING, &portp->state));
 	if (signal_pending(current)) {
 		return -ERESTARTSYS;
@@ -989,7 +989,7 @@ static int stli_rawopen(struct stlibrd *brdp, struct stliport *portp, unsigned l
 	set_bit(ST_OPENING, &portp->state);
 	spin_unlock_irqrestore(&brd_lock, flags);
 
-	wait_event_interruptible(portp->raw_wait,
+	wait_event_interruptible_tty(portp->raw_wait,
 			!test_bit(ST_OPENING, &portp->state));
 	if (signal_pending(current))
 		rc = -ERESTARTSYS;
@@ -1020,7 +1020,7 @@ static int stli_rawclose(struct stlibrd *brdp, struct stliport *portp, unsigned
  *	occurs on this port.
  */
 	if (wait) {
-		wait_event_interruptible(portp->raw_wait,
+		wait_event_interruptible_tty(portp->raw_wait,
 				!test_bit(ST_CLOSING, &portp->state));
 		if (signal_pending(current)) {
 			return -ERESTARTSYS;
@@ -1052,7 +1052,7 @@ static int stli_rawclose(struct stlibrd *brdp, struct stliport *portp, unsigned
  *	to come back.
  */
 	rc = 0;
-	wait_event_interruptible(portp->raw_wait,
+	wait_event_interruptible_tty(portp->raw_wait,
 			!test_bit(ST_CLOSING, &portp->state));
 	if (signal_pending(current))
 		rc = -ERESTARTSYS;
@@ -1073,6 +1073,10 @@ static int stli_rawclose(struct stlibrd *brdp, struct stliport *portp, unsigned
 
 static int stli_cmdwait(struct stlibrd *brdp, struct stliport *portp, unsigned long cmd, void *arg, int size, int copyback)
 {
+	/*
+	 * no need for wait_event_tty because clearing ST_CMDING cannot block
+	 * on BTM
+	 */
 	wait_event_interruptible(portp->raw_wait,
 			!test_bit(ST_CMDING, &portp->state));
 	if (signal_pending(current))
diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c
index f4bd2591e39f..a98290d7a2c5 100644
--- a/drivers/char/n_r3964.c
+++ b/drivers/char/n_r3964.c
@@ -1079,7 +1079,7 @@ static ssize_t r3964_read(struct tty_struct *tty, struct file *file,
 				goto unlock;
 			}
 			/* block until there is a message: */
-			wait_event_interruptible(pInfo->read_wait,
+			wait_event_interruptible_tty(pInfo->read_wait,
 					(pMsg = remove_msg(pInfo, pClient)));
 		}
 
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index a3bd1d0b66cf..35eb30402f18 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -231,7 +231,7 @@ int tty_port_block_til_ready(struct tty_port *port,
 
 	/* block if port is in the process of being closed */
 	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
-		wait_event_interruptible(port->close_wait,
+		wait_event_interruptible_tty(port->close_wait,
 				!(port->flags & ASYNC_CLOSING));
 		if (port->flags & ASYNC_HUP_NOTIFY)
 			return -EAGAIN;
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index cf87c5336229..2bbeaaea46e9 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -133,7 +133,7 @@ static void vt_event_wait(struct vt_event_wait *vw)
 	list_add(&vw->list, &vt_events);
 	spin_unlock_irqrestore(&vt_event_lock, flags);
 	/* Wait for it to pass */
-	wait_event_interruptible(vt_event_waitqueue, vw->done);
+	wait_event_interruptible_tty(vt_event_waitqueue, vw->done);
 	/* Dequeue it */
 	spin_lock_irqsave(&vt_event_lock, flags);
 	list_del(&vw->list);
@@ -1761,10 +1761,13 @@ int vt_move_to_console(unsigned int vt, int alloc)
 		return -EIO;
 	}
 	release_console_sem();
+	tty_lock();
 	if (vt_waitactive(vt + 1)) {
 		pr_debug("Suspend: Can't switch VCs.");
+		tty_unlock();
 		return -EINTR;
 	}
+	tty_unlock();
 	return prev;
 }
 
diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c
index f848e188deae..94bfb9f238e1 100644
--- a/drivers/serial/crisv10.c
+++ b/drivers/serial/crisv10.c
@@ -3992,7 +3992,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp,
 	 */
 	if (tty_hung_up_p(filp) ||
 	    (info->flags & ASYNC_CLOSING)) {
-		wait_event_interruptible(info->close_wait,
+		wait_event_interruptible_tty(info->close_wait,
 			!(info->flags & ASYNC_CLOSING));
 #ifdef SERIAL_DO_RESTART
 		if (info->flags & ASYNC_HUP_NOTIFY)
@@ -4150,7 +4150,7 @@ rs_open(struct tty_struct *tty, struct file * filp)
 	 */
 	if (tty_hung_up_p(filp) ||
 	    (info->flags & ASYNC_CLOSING)) {
-		wait_event_interruptible(info->close_wait,
+		wait_event_interruptible_tty(info->close_wait,
 			!(info->flags & ASYNC_CLOSING));
 #ifdef SERIAL_DO_RESTART
 		return ((info->flags & ASYNC_HUP_NOTIFY) ?
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 6ead6b60c743..955d72ea71c0 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -607,5 +607,47 @@ static inline void tty_unlock(void) __releases(kernel_lock)
 }
 #define tty_locked()		(kernel_locked())
 
+/*
+ * wait_event_interruptible_tty -- wait for a condition with the tty lock held
+ *
+ * The condition we are waiting for might take a long time to
+ * become true, or might depend on another thread taking the
+ * BTM. In either case, we need to drop the BTM to guarantee
+ * forward progress. This is a leftover from the conversion
+ * from the BKL and should eventually get removed as the BTM
+ * falls out of use.
+ *
+ * Do not use in new code.
+ */
+#define wait_event_interruptible_tty(wq, condition)			\
+({									\
+	int __ret = 0;							\
+	if (!(condition)) {						\
+		__wait_event_interruptible_tty(wq, condition, __ret);	\
+	}								\
+	__ret;								\
+})
+
+#define __wait_event_interruptible_tty(wq, condition, ret)		\
+do {									\
+	DEFINE_WAIT(__wait);						\
+									\
+	for (;;) {							\
+		prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);	\
+		if (condition)						\
+			break;						\
+		if (!signal_pending(current)) {				\
+			tty_unlock();					\
+			schedule();					\
+			tty_lock();					\
+			continue;					\
+		}							\
+		ret = -ERESTARTSYS;					\
+		break;							\
+	}								\
+	finish_wait(&wq, &__wait);					\
+} while (0)
+
+
 #endif /* __KERNEL__ */
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From ddcd9fb66ae7f448b517242c10a31d4e17bcad45 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 1 Jun 2010 22:53:08 +0200
Subject: tty: remove tty_lock_nested

This changes all remaining users of tty_lock_nested
to be non-recursive, which lets us kill this function.
As a consequence, we won't need to keep the lock count
any more, which allows more simplifications later.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/pty.c       |  2 +-
 drivers/char/selection.c |  4 ++--
 drivers/char/tty_io.c    | 41 ++++++++++++++++++++---------------------
 drivers/char/tty_ldisc.c |  3 +--
 include/linux/tty.h      | 16 +---------------
 5 files changed, 25 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index de22ea952832..f2d7a76fab54 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -62,7 +62,7 @@ static void pty_close(struct tty_struct *tty, struct file *filp)
 		if (tty->driver == ptm_driver)
 			devpts_pty_kill(tty->link);
 #endif
-		tty_vhangup(tty->link);
+		tty_vhangup_locked(tty->link);
 	}
 }
 
diff --git a/drivers/char/selection.c b/drivers/char/selection.c
index 75889cd9375f..ebae344ce910 100644
--- a/drivers/char/selection.c
+++ b/drivers/char/selection.c
@@ -313,7 +313,8 @@ int paste_selection(struct tty_struct *tty)
 	struct  tty_ldisc *ld;
 	DECLARE_WAITQUEUE(wait, current);
 
-	tty_lock_nested(); /* always called with BTM from vt_ioctl */
+	/* always called with BTM from vt_ioctl */
+	WARN_ON(!tty_locked());
 
 	acquire_console_sem();
 	poke_blanked_console();
@@ -343,6 +344,5 @@ int paste_selection(struct tty_struct *tty)
 	__set_current_state(TASK_RUNNING);
 
 	tty_ldisc_deref(ld);
-	tty_unlock();
 	return 0;
 }
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index bb22cf495435..dcf7d368639e 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -492,10 +492,8 @@ EXPORT_SYMBOL_GPL(tty_wakeup);
  *		  tasklist_lock to walk task list for hangup event
  *		    ->siglock to protect ->signal/->sighand
  */
-static void do_tty_hangup(struct work_struct *work)
+void tty_vhangup_locked(struct tty_struct *tty)
 {
-	struct tty_struct *tty =
-		container_of(work, struct tty_struct, hangup_work);
 	struct file *cons_filp = NULL;
 	struct file *filp, *f = NULL;
 	struct task_struct *p;
@@ -517,8 +515,6 @@ static void do_tty_hangup(struct work_struct *work)
 	/* inuse_filps is protected by the single tty lock,
 	   this really needs to change if we want to flush the
 	   workqueue with the lock held */
-	tty_lock_nested(); /* called with BTM held from pty_close and
-				others */
 	check_tty_count(tty, "do_tty_hangup");
 
 	file_list_lock();
@@ -598,11 +594,20 @@ static void do_tty_hangup(struct work_struct *work)
 	 */
 	set_bit(TTY_HUPPED, &tty->flags);
 	tty_ldisc_enable(tty);
-	tty_unlock();
 	if (f)
 		fput(f);
 }
 
+static void do_tty_hangup(struct work_struct *work)
+{
+	struct tty_struct *tty =
+		container_of(work, struct tty_struct, hangup_work);
+
+	tty_lock();
+	tty_vhangup_locked(tty);
+	tty_unlock();
+}
+
 /**
  *	tty_hangup		-	trigger a hangup event
  *	@tty: tty to hangup
@@ -638,7 +643,9 @@ void tty_vhangup(struct tty_struct *tty)
 
 	printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty, buf));
 #endif
-	do_tty_hangup(&tty->hangup_work);
+	tty_lock();
+	tty_vhangup_locked(tty);
+	tty_unlock();
 }
 
 EXPORT_SYMBOL(tty_vhangup);
@@ -719,10 +726,12 @@ void disassociate_ctty(int on_exit)
 	tty = get_current_tty();
 	if (tty) {
 		tty_pgrp = get_pid(tty->pgrp);
-		tty_lock_nested(); /* see above */
-		if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY)
-			tty_vhangup(tty);
-		tty_unlock();
+		if (on_exit) {
+			tty_lock();
+			if (tty->driver->type != TTY_DRIVER_TYPE_PTY)
+				tty_vhangup_locked(tty);
+			tty_unlock();
+		}
 		tty_kref_put(tty);
 	} else if (on_exit) {
 		struct pid *old_pgrp;
@@ -1213,18 +1222,14 @@ static int tty_driver_install_tty(struct tty_driver *driver,
 	int ret;
 
 	if (driver->ops->install) {
-		tty_lock_nested(); /* already called with BTM held */
 		ret = driver->ops->install(driver, tty);
-		tty_unlock();
 		return ret;
 	}
 
 	if (tty_init_termios(tty) == 0) {
-		tty_lock_nested();
 		tty_driver_kref_get(driver);
 		tty->count++;
 		driver->ttys[idx] = tty;
-		tty_unlock();
 		return 0;
 	}
 	return -ENOMEM;
@@ -1317,15 +1322,11 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
 	struct tty_struct *tty;
 	int retval;
 
-	tty_lock_nested(); /* always called with tty lock held already */
-
 	/* Check if pty master is being opened multiple times */
 	if (driver->subtype == PTY_TYPE_MASTER &&
 		(driver->flags & TTY_DRIVER_DEVPTS_MEM) && !first_ok) {
-		tty_unlock();
 		return ERR_PTR(-EIO);
 	}
-	tty_unlock();
 
 	/*
 	 * First time open is complex, especially for PTY devices.
@@ -1369,9 +1370,7 @@ release_mem_out:
 	if (printk_ratelimit())
 		printk(KERN_INFO "tty_init_dev: ldisc open failed, "
 				 "clearing slot %d\n", idx);
-	tty_lock_nested();
 	release_tty(tty, idx);
-	tty_unlock();
 	return ERR_PTR(retval);
 }
 
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index 0f494799da89..412f9775d19c 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -450,9 +450,8 @@ static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
 	if (ld->ops->open) {
 		int ret;
                 /* BTM here locks versus a hangup event */
-		tty_lock_nested(); /* always held here already */
+		WARN_ON(!tty_locked());
 		ret = ld->ops->open(tty);
-		tty_unlock();
 		return ret;
 	}
 	return 0;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 955d72ea71c0..0fbafb0b69bf 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -417,6 +417,7 @@ extern int is_ignored(int sig);
 extern int tty_signal(int sig, struct tty_struct *tty);
 extern void tty_hangup(struct tty_struct *tty);
 extern void tty_vhangup(struct tty_struct *tty);
+extern void tty_vhangup_locked(struct tty_struct *tty);
 extern void tty_vhangup_self(void);
 extern void tty_unhangup(struct file *filp);
 extern int tty_hung_up_p(struct file *filp);
@@ -578,21 +579,6 @@ extern long vt_compat_ioctl(struct tty_struct *tty, struct file * file,
 		     unsigned int cmd, unsigned long arg);
 
 /* functions for preparation of BKL removal */
-
-/*
- * tty_lock_nested get the tty_lock while potentially holding it
- *
- * The Big TTY Mutex is a recursive lock, meaning you can take it
- * from a thread that is already holding it.
- * This is bad for a number of reasons, so tty_lock_nested should
- * really be used as rarely as possible. If a code location can
- * be shown to never get called with this held already, it should
- * use tty_lock() instead.
- */
-static inline void __lockfunc tty_lock_nested(void) __acquires(kernel_lock)
-{
-	lock_kernel();
-}
 static inline void tty_lock(void) __acquires(kernel_lock)
 {
 #ifdef CONFIG_LOCK_KERNEL
-- 
cgit v1.2.3-59-g8ed1b


From b07471fa51358ce64cc25e1501544502362e4404 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 6 Aug 2010 21:40:30 +0200
Subject: tty: implement BTM as mutex instead of BKL

The tty locking now follows the rules for mutexes, so
we can replace the BKL usage with a new subsystem
wide mutex.

Using a regular mutex here will change the behaviour
when blocked on the BTM from spinning to sleeping,
but that should not be visible to the user.

Using the mutex also means that all the BTM is now
covered by lockdep.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/Makefile    |  1 +
 drivers/char/tty_mutex.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/tty.h      | 18 +++++-------------
 3 files changed, 53 insertions(+), 13 deletions(-)
 create mode 100644 drivers/char/tty_mutex.c

(limited to 'include/linux')

diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 273cee1cc77b..dc9641660605 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -9,6 +9,7 @@ FONTMAPFILE = cp437.uni
 
 obj-y	 += mem.o random.o tty_io.o n_tty.o tty_ioctl.o tty_ldisc.o tty_buffer.o tty_port.o
 
+obj-y				+= tty_mutex.o
 obj-$(CONFIG_LEGACY_PTYS)	+= pty.o
 obj-$(CONFIG_UNIX98_PTYS)	+= pty.o
 obj-y				+= misc.o
diff --git a/drivers/char/tty_mutex.c b/drivers/char/tty_mutex.c
new file mode 100644
index 000000000000..133697540c73
--- /dev/null
+++ b/drivers/char/tty_mutex.c
@@ -0,0 +1,47 @@
+/*
+ * drivers/char/tty_lock.c
+ */
+#include <linux/tty.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/semaphore.h>
+#include <linux/sched.h>
+
+/*
+ * The 'big tty mutex'
+ *
+ * This mutex is taken and released by tty_lock() and tty_unlock(),
+ * replacing the older big kernel lock.
+ * It can no longer be taken recursively, and does not get
+ * released implicitly while sleeping.
+ *
+ * Don't use in new code.
+ */
+static DEFINE_MUTEX(big_tty_mutex);
+struct task_struct *__big_tty_mutex_owner;
+EXPORT_SYMBOL_GPL(__big_tty_mutex_owner);
+
+/*
+ * Getting the big tty mutex.
+ */
+void __lockfunc tty_lock(void)
+{
+	struct task_struct *task = current;
+
+	WARN_ON(__big_tty_mutex_owner == task);
+
+	mutex_lock(&big_tty_mutex);
+	__big_tty_mutex_owner = task;
+}
+EXPORT_SYMBOL(tty_lock);
+
+void __lockfunc tty_unlock(void)
+{
+	struct task_struct *task = current;
+
+	WARN_ON(__big_tty_mutex_owner != task);
+	__big_tty_mutex_owner = NULL;
+
+	mutex_unlock(&big_tty_mutex);
+}
+EXPORT_SYMBOL(tty_unlock);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 0fbafb0b69bf..1437da3ddc62 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -578,20 +578,12 @@ extern int vt_ioctl(struct tty_struct *tty, struct file *file,
 extern long vt_compat_ioctl(struct tty_struct *tty, struct file * file,
 		     unsigned int cmd, unsigned long arg);
 
+/* tty_mutex.c */
 /* functions for preparation of BKL removal */
-static inline void tty_lock(void) __acquires(kernel_lock)
-{
-#ifdef CONFIG_LOCK_KERNEL
-	/* kernel_locked is 1 for !CONFIG_LOCK_KERNEL */
-	WARN_ON(kernel_locked());
-#endif
-	lock_kernel();
-}
-static inline void tty_unlock(void) __releases(kernel_lock)
-{
-	unlock_kernel();
-}
-#define tty_locked()		(kernel_locked())
+extern void __lockfunc tty_lock(void) __acquires(tty_lock);
+extern void __lockfunc tty_unlock(void) __releases(tty_lock);
+extern struct task_struct *__big_tty_mutex_owner;
+#define tty_locked()		(current == __big_tty_mutex_owner)
 
 /*
  * wait_event_interruptible_tty -- wait for a condition with the tty lock held
-- 
cgit v1.2.3-59-g8ed1b


From 61fd15262bb9c88a05fd89af22add9317dc1b1f4 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 30 Jun 2010 17:58:38 +0100
Subject: serial: max3107: Abstract out the platform specific bits

At the moment there is only one platform type supported and there is is
hard wired, but with these changes the infrastructure is now there for
anyone else to provide methods for their hardware.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/Kconfig        |  21 ++-
 drivers/serial/Makefile       |   1 +
 drivers/serial/max3107-aava.c | 344 +++++++++++++++++++++++++++++++++++
 drivers/serial/max3107.c      | 413 +++++++++---------------------------------
 drivers/serial/max3107.h      |  85 ++++++++-
 include/linux/serial_core.h   |   4 +
 6 files changed, 531 insertions(+), 337 deletions(-)
 create mode 100644 drivers/serial/max3107-aava.c

(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index fd406273cb71..c34c217878b3 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -542,20 +542,29 @@ config SERIAL_S5PV210
 	help
 	  Serial port support for Samsung's S5P Family of SoC's
 
+
+config SERIAL_MAX3100
+	tristate "MAX3100 support"
+	depends on SPI
+	select SERIAL_CORE
+	help
+	  MAX3100 chip support
+
 config SERIAL_MAX3107
 	tristate "MAX3107 support"
-	depends on SPI && GPIOLIB
+	depends on SPI
 	select SERIAL_CORE
-	default y
 	help
 	  MAX3107 chip support
 
-config SERIAL_MAX3100
-	tristate "MAX3100 support"
-	depends on SPI
+config SERIAL_MAX3107_AAVA
+	tristate "MAX3107 AAVA platform support"
+	depends on X86_MRST && SERIAL_MAX3107 && GPIOLIB
 	select SERIAL_CORE
 	help
-	  MAX3100 chip support
+	  Support for the MAX3107 chip configuration found on the AAVA
+	  platform. Includes the extra initialisation and GPIO support
+	  neded for this device.
 
 config SERIAL_DZ
 	bool "DECstation DZ serial driver"
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 4cd0c0694917..424067ac3347 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_SERIAL_S3C6400) += s3c6400.o
 obj-$(CONFIG_SERIAL_S5PV210) += s5pv210.o
 obj-$(CONFIG_SERIAL_MAX3100) += max3100.o
 obj-$(CONFIG_SERIAL_MAX3107) += max3107.o
+obj-$(CONFIG_SERIAL_MAX3107_AAVA) += max3107-aava.o
 obj-$(CONFIG_SERIAL_IP22_ZILOG) += ip22zilog.o
 obj-$(CONFIG_SERIAL_MUX) += mux.o
 obj-$(CONFIG_SERIAL_68328) += 68328serial.o
diff --git a/drivers/serial/max3107-aava.c b/drivers/serial/max3107-aava.c
new file mode 100644
index 000000000000..a1fe304f2f52
--- /dev/null
+++ b/drivers/serial/max3107-aava.c
@@ -0,0 +1,344 @@
+/*
+ *  max3107.c - spi uart protocol driver for Maxim 3107
+ *  Based on max3100.c
+ *	by Christian Pellegrin <chripell@evolware.org>
+ *  and	max3110.c
+ *	by Feng Tang <feng.tang@intel.com>
+ *
+ *  Copyright (C) Aavamobile 2009
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/serial_core.h>
+#include <linux/serial.h>
+#include <linux/spi/spi.h>
+#include <linux/freezer.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/sfi.h>
+#include <asm/mrst.h>
+#include "max3107.h"
+
+/* GPIO direction to input function */
+static int max3107_gpio_direction_in(struct gpio_chip *chip, unsigned offset)
+{
+	struct max3107_port *s = container_of(chip, struct max3107_port, chip);
+	u16 buf[1];		/* Buffer for SPI transfer */
+
+	if (offset >= MAX3107_GPIO_COUNT) {
+		dev_err(&s->spi->dev, "Invalid GPIO\n");
+		return -EINVAL;
+	}
+
+	/* Read current GPIO configuration register */
+	buf[0] = MAX3107_GPIOCFG_REG;
+	/* Perform SPI transfer */
+	if (max3107_rw(s, (u8 *)buf, (u8 *)buf, 2)) {
+		dev_err(&s->spi->dev, "SPI transfer GPIO read failed\n");
+		return -EIO;
+	}
+	buf[0] &= MAX3107_SPI_RX_DATA_MASK;
+
+	/* Set GPIO to input */
+	buf[0] &= ~(0x0001 << offset);
+
+	/* Write new GPIO configuration register value */
+	buf[0] |= (MAX3107_WRITE_BIT | MAX3107_GPIOCFG_REG);
+	/* Perform SPI transfer */
+	if (max3107_rw(s, (u8 *)buf, NULL, 2)) {
+		dev_err(&s->spi->dev, "SPI transfer GPIO write failed\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+/* GPIO direction to output function */
+static int max3107_gpio_direction_out(struct gpio_chip *chip, unsigned offset,
+					int value)
+{
+	struct max3107_port *s = container_of(chip, struct max3107_port, chip);
+	u16 buf[2];	/* Buffer for SPI transfers */
+
+	if (offset >= MAX3107_GPIO_COUNT) {
+		dev_err(&s->spi->dev, "Invalid GPIO\n");
+		return -EINVAL;
+	}
+
+	/* Read current GPIO configuration and data registers */
+	buf[0] = MAX3107_GPIOCFG_REG;
+	buf[1] = MAX3107_GPIODATA_REG;
+	/* Perform SPI transfer */
+	if (max3107_rw(s, (u8 *)buf, (u8 *)buf, 4)) {
+		dev_err(&s->spi->dev, "SPI transfer gpio failed\n");
+		return -EIO;
+	}
+	buf[0] &= MAX3107_SPI_RX_DATA_MASK;
+	buf[1] &= MAX3107_SPI_RX_DATA_MASK;
+
+	/* Set GPIO to output */
+	buf[0] |= (0x0001 << offset);
+	/* Set value */
+	if (value)
+		buf[1] |= (0x0001 << offset);
+	else
+		buf[1] &= ~(0x0001 << offset);
+
+	/* Write new GPIO configuration and data register values */
+	buf[0] |= (MAX3107_WRITE_BIT | MAX3107_GPIOCFG_REG);
+	buf[1] |= (MAX3107_WRITE_BIT | MAX3107_GPIODATA_REG);
+	/* Perform SPI transfer */
+	if (max3107_rw(s, (u8 *)buf, NULL, 4)) {
+		dev_err(&s->spi->dev,
+			"SPI transfer for GPIO conf data w failed\n");
+		return -EIO;
+	}
+	return 0;
+}
+
+/* GPIO value query function */
+static int max3107_gpio_get(struct gpio_chip *chip, unsigned offset)
+{
+	struct max3107_port *s = container_of(chip, struct max3107_port, chip);
+	u16 buf[1];	/* Buffer for SPI transfer */
+
+	if (offset >= MAX3107_GPIO_COUNT) {
+		dev_err(&s->spi->dev, "Invalid GPIO\n");
+		return -EINVAL;
+	}
+
+	/* Read current GPIO data register */
+	buf[0] = MAX3107_GPIODATA_REG;
+	/* Perform SPI transfer */
+	if (max3107_rw(s, (u8 *)buf, (u8 *)buf, 2)) {
+		dev_err(&s->spi->dev, "SPI transfer GPIO data r failed\n");
+		return -EIO;
+	}
+	buf[0] &= MAX3107_SPI_RX_DATA_MASK;
+
+	/* Return value */
+	return buf[0] & (0x0001 << offset);
+}
+
+/* GPIO value set function */
+static void max3107_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct max3107_port *s = container_of(chip, struct max3107_port, chip);
+	u16 buf[2];	/* Buffer for SPI transfers */
+
+	if (offset >= MAX3107_GPIO_COUNT) {
+		dev_err(&s->spi->dev, "Invalid GPIO\n");
+		return;
+	}
+
+	/* Read current GPIO configuration registers*/
+	buf[0] = MAX3107_GPIODATA_REG;
+	buf[1] = MAX3107_GPIOCFG_REG;
+	/* Perform SPI transfer */
+	if (max3107_rw(s, (u8 *)buf, (u8 *)buf, 4)) {
+		dev_err(&s->spi->dev,
+			"SPI transfer for GPIO data and config read failed\n");
+		return;
+	}
+	buf[0] &= MAX3107_SPI_RX_DATA_MASK;
+	buf[1] &= MAX3107_SPI_RX_DATA_MASK;
+
+	if (!(buf[1] & (0x0001 << offset))) {
+		/* Configured as input, can't set value */
+		dev_warn(&s->spi->dev,
+				"Trying to set value for input GPIO\n");
+		return;
+	}
+
+	/* Set value */
+	if (value)
+		buf[0] |= (0x0001 << offset);
+	else
+		buf[0] &= ~(0x0001 << offset);
+
+	/* Write new GPIO data register value */
+	buf[0] |= (MAX3107_WRITE_BIT | MAX3107_GPIODATA_REG);
+	/* Perform SPI transfer */
+	if (max3107_rw(s, (u8 *)buf, NULL, 2))
+		dev_err(&s->spi->dev, "SPI transfer GPIO data w failed\n");
+}
+
+/* GPIO chip data */
+static struct gpio_chip max3107_gpio_chip = {
+	.owner			= THIS_MODULE,
+	.direction_input	= max3107_gpio_direction_in,
+	.direction_output	= max3107_gpio_direction_out,
+	.get			= max3107_gpio_get,
+	.set			= max3107_gpio_set,
+	.can_sleep		= 1,
+	.base			= MAX3107_GPIO_BASE,
+	.ngpio			= MAX3107_GPIO_COUNT,
+};
+
+/**
+ *	max3107_aava_reset	-	reset on AAVA systems
+ *	@spi: The SPI device we are probing
+ *
+ *	Reset the device ready for probing.
+ */
+
+static int max3107_aava_reset(struct spi_device *spi)
+{
+	/* Reset the chip */
+	if (gpio_request(MAX3107_RESET_GPIO, "max3107")) {
+		pr_err("Requesting RESET GPIO failed\n");
+		return -EIO;
+	}
+	if (gpio_direction_output(MAX3107_RESET_GPIO, 0)) {
+		pr_err("Setting RESET GPIO to 0 failed\n");
+		gpio_free(MAX3107_RESET_GPIO);
+		return -EIO;
+	}
+	msleep(MAX3107_RESET_DELAY);
+	if (gpio_direction_output(MAX3107_RESET_GPIO, 1)) {
+		pr_err("Setting RESET GPIO to 1 failed\n");
+		gpio_free(MAX3107_RESET_GPIO);
+		return -EIO;
+	}
+	gpio_free(MAX3107_RESET_GPIO);
+	msleep(MAX3107_WAKEUP_DELAY);
+	return 0;
+}
+
+static int max3107_aava_configure(struct max3107_port *s)
+{
+	int retval;
+
+	/* Initialize GPIO chip data */
+	s->chip = max3107_gpio_chip;
+	s->chip.label = s->spi->modalias;
+	s->chip.dev = &s->spi->dev;
+
+	/* Add GPIO chip */
+	retval = gpiochip_add(&s->chip);
+	if (retval) {
+		dev_err(&s->spi->dev, "Adding GPIO chip failed\n");
+		return retval;
+	}
+
+	/* Temporary fix for EV2 boot problems, set modem reset to 0 */
+	max3107_gpio_direction_out(&s->chip, 3, 0);
+	return 0;
+}
+
+#if 0
+/* This will get enabled once we have the board stuff merged for this
+   specific case */
+
+static const struct baud_table brg13_ext[] = {
+	{ 300,    MAX3107_BRG13_B300 },
+	{ 600,    MAX3107_BRG13_B600 },
+	{ 1200,   MAX3107_BRG13_B1200 },
+	{ 2400,   MAX3107_BRG13_B2400 },
+	{ 4800,   MAX3107_BRG13_B4800 },
+	{ 9600,   MAX3107_BRG13_B9600 },
+	{ 19200,  MAX3107_BRG13_B19200 },
+	{ 57600,  MAX3107_BRG13_B57600 },
+	{ 115200, MAX3107_BRG13_B115200 },
+	{ 230400, MAX3107_BRG13_B230400 },
+	{ 460800, MAX3107_BRG13_B460800 },
+	{ 921600, MAX3107_BRG13_B921600 },
+	{ 0, 0 }
+};
+
+static void max3107_aava_init(struct max3107_port *s)
+{
+	/*override for AAVA SC specific*/
+	if (mrst_platform_id() == MRST_PLATFORM_AAVA_SC) {
+		if (get_koski_build_id() <= KOSKI_EV2)
+			if (s->ext_clk) {
+				s->brg_cfg = MAX3107_BRG13_B9600;
+				s->baud_tbl = (struct baud_table *)brg13_ext;
+			}
+	}
+}
+#endif
+
+static int __devexit max3107_aava_remove(struct spi_device *spi)
+{
+	struct max3107_port *s = dev_get_drvdata(&spi->dev);
+
+	/* Remove GPIO chip */
+	if (gpiochip_remove(&s->chip))
+		dev_warn(&spi->dev, "Removing GPIO chip failed\n");
+
+	/* Then do the default remove */
+	return max3107_remove(spi);
+}
+
+/* Platform data */
+static struct max3107_plat aava_plat_data = {
+	.loopback               = 0,
+	.ext_clk                = 1,
+/*	.init			= max3107_aava_init, */
+	.configure		= max3107_aava_configure,
+	.hw_suspend		= max3107_hw_susp,
+	.polled_mode            = 0,
+	.poll_time              = 0,
+};
+
+
+static int __devinit max3107_probe_aava(struct spi_device *spi)
+{
+	int err = max3107_aava_reset(spi);
+	if (err < 0)
+		return err;
+	return max3107_probe(spi, &aava_plat_data);
+}
+
+/* Spi driver data */
+static struct spi_driver max3107_driver = {
+	.driver = {
+		.name		= "aava-max3107",
+		.bus		= &spi_bus_type,
+		.owner		= THIS_MODULE,
+	},
+	.probe		= max3107_probe_aava,
+	.remove		= __devexit_p(max3107_aava_remove),
+	.suspend	= max3107_suspend,
+	.resume		= max3107_resume,
+};
+
+/* Driver init function */
+static int __init max3107_init(void)
+{
+	return spi_register_driver(&max3107_driver);
+}
+
+/* Driver exit function */
+static void __exit max3107_exit(void)
+{
+	spi_unregister_driver(&max3107_driver);
+}
+
+module_init(max3107_init);
+module_exit(max3107_exit);
+
+MODULE_DESCRIPTION("MAX3107 driver");
+MODULE_AUTHOR("Aavamobile");
+MODULE_ALIAS("aava-max3107-spi");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/serial/max3107.c b/drivers/serial/max3107.c
index a96ddd388ffd..67283c1a57ff 100644
--- a/drivers/serial/max3107.c
+++ b/drivers/serial/max3107.c
@@ -31,98 +31,12 @@
 #include <linux/device.h>
 #include <linux/serial_core.h>
 #include <linux/serial.h>
+#include <linux/gpio.h>
 #include <linux/spi/spi.h>
 #include <linux/freezer.h>
-#include <linux/platform_device.h>
-#include <linux/gpio.h>
-#include <linux/sfi.h>
-#include <asm/mrst.h>
 #include "max3107.h"
 
-struct baud_table {
-	int baud;
-	u32 new_brg;
-};
-
-struct max3107_port {
-	/* UART port structure */
-	struct uart_port port;
-
-	/* SPI device structure */
-	struct spi_device *spi;
-
-	/* GPIO chip stucture */
-	struct gpio_chip chip;
-
-	/* Workqueue that does all the magic */
-	struct workqueue_struct *workqueue;
-	struct work_struct work;
-
-	/* Lock for shared data */
-	spinlock_t data_lock;
-
-	/* Device configuration */
-	int ext_clk;		/* 1 if external clock used */
-	int loopback;		/* Current loopback mode state */
-	int baud;			/* Current baud rate */
-
-	/* State flags */
-	int suspended;		/* Indicates suspend mode */
-	int tx_fifo_empty;	/* Flag for TX FIFO state */
-	int rx_enabled;		/* Flag for receiver state */
-	int tx_enabled;		/* Flag for transmitter state */
-
-	u16 irqen_reg;		/* Current IRQ enable register value */
-	/* Shared data */
-	u16 mode1_reg;		/* Current mode1 register value*/
-	int mode1_commit;	/* Flag for setting new mode1 register value */
-	u16 lcr_reg;		/* Current LCR register value */
-	int lcr_commit;		/* Flag for setting new LCR register value */
-	u32 brg_cfg;		/* Current Baud rate generator config  */
-	int brg_commit;		/* Flag for setting new baud rate generator
-				 * config
-				 */
-	struct baud_table *baud_tbl;
-	int handle_irq;		/* Indicates that IRQ should be handled */
-
-	/* Rx buffer and str*/
-	u16 *rxbuf;
-	u8  *rxstr;
-	/* Tx buffer*/
-	u16 *txbuf;
-};
-
-/* Platform data structure */
-struct max3107_plat {
-	/* Loopback mode enable */
-	int loopback;
-	/* External clock enable */
-	int ext_clk;
-	/* HW suspend function */
-	void (*max3107_hw_suspend) (struct max3107_port *s, int suspend);
-	/* Polling mode enable */
-	int polled_mode;
-	/* Polling period if polling mode enabled */
-	int poll_time;
-};
-
-static struct baud_table brg13_ext[] = {
-	{ 300,    MAX3107_BRG13_B300 },
-	{ 600,    MAX3107_BRG13_B600 },
-	{ 1200,   MAX3107_BRG13_B1200 },
-	{ 2400,   MAX3107_BRG13_B2400 },
-	{ 4800,   MAX3107_BRG13_B4800 },
-	{ 9600,   MAX3107_BRG13_B9600 },
-	{ 19200,  MAX3107_BRG13_B19200 },
-	{ 57600,  MAX3107_BRG13_B57600 },
-	{ 115200, MAX3107_BRG13_B115200 },
-	{ 230400, MAX3107_BRG13_B230400 },
-	{ 460800, MAX3107_BRG13_B460800 },
-	{ 921600, MAX3107_BRG13_B921600 },
-	{ 0, 0 }
-};
-
-static struct baud_table brg26_ext[] = {
+static const struct baud_table brg26_ext[] = {
 	{ 300,    MAX3107_BRG26_B300 },
 	{ 600,    MAX3107_BRG26_B600 },
 	{ 1200,   MAX3107_BRG26_B1200 },
@@ -138,7 +52,7 @@ static struct baud_table brg26_ext[] = {
 	{ 0, 0 }
 };
 
-static struct baud_table brg13_int[] = {
+static const struct baud_table brg13_int[] = {
 	{ 300,    MAX3107_BRG13_IB300 },
 	{ 600,    MAX3107_BRG13_IB600 },
 	{ 1200,   MAX3107_BRG13_IB1200 },
@@ -157,7 +71,7 @@ static struct baud_table brg13_int[] = {
 static u32 get_new_brg(int baud, struct max3107_port *s)
 {
 	int i;
-	struct baud_table *baud_tbl = s->baud_tbl;
+	const struct baud_table *baud_tbl = s->baud_tbl;
 
 	for (i = 0; i < 13; i++) {
 		if (baud == baud_tbl[i].baud)
@@ -168,7 +82,7 @@ static u32 get_new_brg(int baud, struct max3107_port *s)
 }
 
 /* Perform SPI transfer for write/read of device register(s) */
-static int max3107_rw(struct max3107_port *s, u8 *tx, u8 *rx, int len)
+int max3107_rw(struct max3107_port *s, u8 *tx, u8 *rx, int len)
 {
 	struct spi_message spi_msg;
 	struct spi_transfer spi_xfer;
@@ -213,6 +127,7 @@ static int max3107_rw(struct max3107_port *s, u8 *tx, u8 *rx, int len)
 #endif
 	return 0;
 }
+EXPORT_SYMBOL_GPL(max3107_rw);
 
 /* Puts received data to circular buffer */
 static void put_data_to_circ_buf(struct max3107_port *s, unsigned char *data,
@@ -611,16 +526,10 @@ static void max3107_register_init(struct max3107_port *s)
 		s->brg_cfg = MAX3107_BRG13_IB9600;
 		s->baud_tbl = (struct baud_table *)brg13_int;
 	}
-#if 0
-	/*override for AAVA SC specific*/
-	if (mrst_platform_id() == MRST_PLATFORM_AAVA_SC) {
-		if (get_koski_build_id() <= KOSKI_EV2)
-			if (s->ext_clk) {
-				s->brg_cfg = MAX3107_BRG13_B9600;
-				s->baud_tbl = (struct baud_table *)brg13_ext;
-			}
-	}
-#endif
+
+	if (s->pdata->init)
+		s->pdata->init(s);
+
 	buf[0] = (MAX3107_WRITE_BIT | MAX3107_BRGDIVMSB_REG)
 		| ((s->brg_cfg >> 16) & MAX3107_SPI_TX_DATA_MASK);
 	buf[1] = (MAX3107_WRITE_BIT | MAX3107_BRGDIVLSB_REG)
@@ -734,7 +643,7 @@ static irqreturn_t max3107_irq(int irqno, void *dev_id)
  * used but that would mess the GPIOs
  *
  */
-static void max3107_hw_susp(struct max3107_port *s, int suspend)
+void max3107_hw_susp(struct max3107_port *s, int suspend)
 {
 	pr_debug("enter, suspend %d\n", suspend);
 
@@ -752,6 +661,7 @@ static void max3107_hw_susp(struct max3107_port *s, int suspend)
 		max3107_set_sleep(s, MAX3107_DISABLE_AUTOSLEEP);
 	}
 }
+EXPORT_SYMBOL_GPL(max3107_hw_susp);
 
 /* Modem status IRQ enabling */
 static void max3107_enable_ms(struct uart_port *port)
@@ -899,10 +809,8 @@ static void max3107_shutdown(struct uart_port *port)
 {
 	struct max3107_port *s = container_of(port, struct max3107_port, port);
 
-	if (s->suspended) {
-		/* Resume HW */
-		max3107_hw_susp(s, 0);
-	}
+	if (s->suspended && s->pdata->hw_suspend)
+		s->pdata->hw_suspend(s, 0);
 
 	/* Free the interrupt */
 	free_irq(s->spi->irq, s);
@@ -915,7 +823,8 @@ static void max3107_shutdown(struct uart_port *port)
 	}
 
 	/* Suspend HW */
-	max3107_hw_susp(s, 1);
+	if (s->pdata->hw_suspend)
+		s->pdata->hw_suspend(s, 1);
 }
 
 /* Port startup function */
@@ -941,7 +850,8 @@ static int max3107_startup(struct uart_port *port)
 	}
 
 	/* Resume HW */
-	max3107_hw_susp(s, 0);
+	if (s->pdata->hw_suspend)
+		s->pdata->hw_suspend(s, 0);
 
 	/* Init registers */
 	max3107_register_init(s);
@@ -973,16 +883,14 @@ static int max3107_request_port(struct uart_port *port)
 static void max3107_config_port(struct uart_port *port, int flags)
 {
 	struct max3107_port *s = container_of(port, struct max3107_port, port);
-
-	/* Use PORT_MAX3100 since we are at least int the same series */
-	s->port.type = PORT_MAX3100;
+	s->port.type = PORT_MAX3107;
 }
 
 /* Port verify function */
 static int max3107_verify_port(struct uart_port *port,
 				struct serial_struct *ser)
 {
-	if (ser->type == PORT_UNKNOWN || ser->type == PORT_MAX3100)
+	if (ser->type == PORT_UNKNOWN || ser->type == PORT_MAX3107)
 		return 0;
 
 	return -EINVAL;
@@ -1000,157 +908,6 @@ static void max3107_break_ctl(struct uart_port *port, int break_state)
 	/* We don't support break control, do nothing */
 }
 
-/* GPIO direction to input function */
-static int max3107_gpio_direction_in(struct gpio_chip *chip, unsigned offset)
-{
-	struct max3107_port *s = container_of(chip, struct max3107_port, chip);
-	u16 buf[1];		/* Buffer for SPI transfer */
-
-	if (offset >= MAX3107_GPIO_COUNT) {
-		dev_err(&s->spi->dev, "Invalid GPIO\n");
-		return -EINVAL;
-	}
-
-	/* Read current GPIO configuration register */
-	buf[0] = MAX3107_GPIOCFG_REG;
-	/* Perform SPI transfer */
-	if (max3107_rw(s, (u8 *)buf, (u8 *)buf, 2)) {
-		dev_err(&s->spi->dev, "SPI transfer GPIO read failed\n");
-		return -EIO;
-	}
-	buf[0] &= MAX3107_SPI_RX_DATA_MASK;
-
-	/* Set GPIO to input */
-	buf[0] &= ~(0x0001 << offset);
-
-	/* Write new GPIO configuration register value */
-	buf[0] |= (MAX3107_WRITE_BIT | MAX3107_GPIOCFG_REG);
-	/* Perform SPI transfer */
-	if (max3107_rw(s, (u8 *)buf, NULL, 2)) {
-		dev_err(&s->spi->dev, "SPI transfer GPIO write failed\n");
-		return -EIO;
-	}
-	return 0;
-}
-
-/* GPIO direction to output function */
-static int max3107_gpio_direction_out(struct gpio_chip *chip, unsigned offset,
-					int value)
-{
-	struct max3107_port *s = container_of(chip, struct max3107_port, chip);
-	u16 buf[2];	/* Buffer for SPI transfers */
-
-	if (offset >= MAX3107_GPIO_COUNT) {
-		dev_err(&s->spi->dev, "Invalid GPIO\n");
-		return -EINVAL;
-	}
-
-	/* Read current GPIO configuration and data registers */
-	buf[0] = MAX3107_GPIOCFG_REG;
-	buf[1] = MAX3107_GPIODATA_REG;
-	/* Perform SPI transfer */
-	if (max3107_rw(s, (u8 *)buf, (u8 *)buf, 4)) {
-		dev_err(&s->spi->dev, "SPI transfer gpio failed\n");
-		return -EIO;
-	}
-	buf[0] &= MAX3107_SPI_RX_DATA_MASK;
-	buf[1] &= MAX3107_SPI_RX_DATA_MASK;
-
-	/* Set GPIO to output */
-	buf[0] |= (0x0001 << offset);
-	/* Set value */
-	if (value)
-		buf[1] |= (0x0001 << offset);
-	else
-		buf[1] &= ~(0x0001 << offset);
-
-	/* Write new GPIO configuration and data register values */
-	buf[0] |= (MAX3107_WRITE_BIT | MAX3107_GPIOCFG_REG);
-	buf[1] |= (MAX3107_WRITE_BIT | MAX3107_GPIODATA_REG);
-	/* Perform SPI transfer */
-	if (max3107_rw(s, (u8 *)buf, NULL, 4)) {
-		dev_err(&s->spi->dev,
-			"SPI transfer for GPIO conf data w failed\n");
-		return -EIO;
-	}
-	return 0;
-}
-
-/* GPIO value query function */
-static int max3107_gpio_get(struct gpio_chip *chip, unsigned offset)
-{
-	struct max3107_port *s = container_of(chip, struct max3107_port, chip);
-	u16 buf[1];	/* Buffer for SPI transfer */
-
-	if (offset >= MAX3107_GPIO_COUNT) {
-		dev_err(&s->spi->dev, "Invalid GPIO\n");
-		return -EINVAL;
-	}
-
-	/* Read current GPIO data register */
-	buf[0] = MAX3107_GPIODATA_REG;
-	/* Perform SPI transfer */
-	if (max3107_rw(s, (u8 *)buf, (u8 *)buf, 2)) {
-		dev_err(&s->spi->dev, "SPI transfer GPIO data r failed\n");
-		return -EIO;
-	}
-	buf[0] &= MAX3107_SPI_RX_DATA_MASK;
-
-	/* Return value */
-	return buf[0] & (0x0001 << offset);
-}
-
-/* GPIO value set function */
-static void max3107_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-	struct max3107_port *s = container_of(chip, struct max3107_port, chip);
-	u16 buf[2];	/* Buffer for SPI transfers */
-
-	if (offset >= MAX3107_GPIO_COUNT) {
-		dev_err(&s->spi->dev, "Invalid GPIO\n");
-		return;
-	}
-
-	/* Read current GPIO configuration registers*/
-	buf[0] = MAX3107_GPIODATA_REG;
-	buf[1] = MAX3107_GPIOCFG_REG;
-	/* Perform SPI transfer */
-	if (max3107_rw(s, (u8 *)buf, (u8 *)buf, 4)) {
-		dev_err(&s->spi->dev,
-			"SPI transfer for GPIO data and config read failed\n");
-		return;
-	}
-	buf[0] &= MAX3107_SPI_RX_DATA_MASK;
-	buf[1] &= MAX3107_SPI_RX_DATA_MASK;
-
-	if (!(buf[1] & (0x0001 << offset))) {
-		/* Configured as input, can't set value */
-		dev_warn(&s->spi->dev,
-				"Trying to set value for input GPIO\n");
-		return;
-	}
-
-	/* Set value */
-	if (value)
-		buf[0] |= (0x0001 << offset);
-	else
-		buf[0] &= ~(0x0001 << offset);
-
-	/* Write new GPIO data register value */
-	buf[0] |= (MAX3107_WRITE_BIT | MAX3107_GPIODATA_REG);
-	/* Perform SPI transfer */
-	if (max3107_rw(s, (u8 *)buf, NULL, 2))
-		dev_err(&s->spi->dev, "SPI transfer GPIO data w failed\n");
-}
-
-/* Platform data */
-static struct max3107_plat max3107_plat_data = {
-	.loopback               = 0,
-	.ext_clk                = 1,
-	.max3107_hw_suspend     = &max3107_hw_susp,
-	.polled_mode            = 0,
-	.poll_time              = 0,
-};
 
 /* Port functions */
 static struct uart_ops max3107_ops = {
@@ -1180,52 +937,48 @@ static struct uart_driver max3107_uart_driver = {
 	.nr             = 1,
 };
 
-/* GPIO chip data */
-static struct gpio_chip max3107_gpio_chip = {
-	.owner			= THIS_MODULE,
-	.direction_input	= max3107_gpio_direction_in,
-	.direction_output	= max3107_gpio_direction_out,
-	.get			= max3107_gpio_get,
-	.set			= max3107_gpio_set,
-	.can_sleep		= 1,
-	.base			= MAX3107_GPIO_BASE,
-	.ngpio			= MAX3107_GPIO_COUNT,
+static int driver_registered = 0;
+
+
+
+/* 'Generic' platform data */
+static struct max3107_plat generic_plat_data = {
+	.loopback               = 0,
+	.ext_clk                = 1,
+	.hw_suspend		= max3107_hw_susp,
+	.polled_mode            = 0,
+	.poll_time              = 0,
 };
-/* Device probe function */
-static int __devinit max3107_probe(struct spi_device *spi)
+
+
+/*******************************************************************/
+
+/**
+ *	max3107_probe		-	SPI bus probe entry point
+ *	@spi: the spi device
+ *
+ *	SPI wants us to probe this device and if appropriate claim it.
+ *	Perform any platform specific requirements and then initialise
+ *	the device.
+ */
+
+int max3107_probe(struct spi_device *spi, struct max3107_plat *pdata)
 {
 	struct max3107_port *s;
-	struct max3107_plat *pdata = &max3107_plat_data;
 	u16 buf[2];	/* Buffer for SPI transfers */
 	int retval;
 
 	pr_info("enter max3107 probe\n");
 
-	/* Reset the chip */
-	if (gpio_request(MAX3107_RESET_GPIO, "max3107")) {
-		pr_err("Requesting RESET GPIO failed\n");
-		return -EIO;
-	}
-	if (gpio_direction_output(MAX3107_RESET_GPIO, 0)) {
-		pr_err("Setting RESET GPIO to 0 failed\n");
-		gpio_free(MAX3107_RESET_GPIO);
-		return -EIO;
-	}
-	msleep(MAX3107_RESET_DELAY);
-	if (gpio_direction_output(MAX3107_RESET_GPIO, 1)) {
-		pr_err("Setting RESET GPIO to 1 failed\n");
-		gpio_free(MAX3107_RESET_GPIO);
-		return -EIO;
-	}
-	gpio_free(MAX3107_RESET_GPIO);
-	msleep(MAX3107_WAKEUP_DELAY);
-
 	/* Allocate port structure */
 	s = kzalloc(sizeof(*s), GFP_KERNEL);
 	if (!s) {
 		pr_err("Allocating port structure failed\n");
 		return -ENOMEM;
 	}
+
+	s->pdata = pdata;
+
 	/* SPI Rx buffer
 	 * +2 for RX FIFO interrupt
 	 * disabling and RX level query
@@ -1298,10 +1051,13 @@ static int __devinit max3107_probe(struct spi_device *spi)
 	}
 
 	/* Register UART driver */
-	retval = uart_register_driver(&max3107_uart_driver);
-	if (retval) {
-		dev_err(&s->spi->dev, "Registering UART driver failed\n");
-		return retval;
+	if (!driver_registered) {
+		retval = uart_register_driver(&max3107_uart_driver);
+		if (retval) {
+			dev_err(&s->spi->dev, "Registering UART driver failed\n");
+			return retval;
+		}
+		driver_registered = 1;
 	}
 
 	/* Initialize UART port data */
@@ -1312,8 +1068,7 @@ static int __devinit max3107_probe(struct spi_device *spi)
 	s->port.uartclk = 9600;
 	s->port.flags = UPF_SKIP_TEST | UPF_BOOT_AUTOCONF;
 	s->port.irq = s->spi->irq;
-	/* Use PORT_MAX3100 since we are at least in the same series */
-	s->port.type = PORT_MAX3100;
+	s->port.type = PORT_MAX3107;
 
 	/* Add UART port */
 	retval = uart_add_one_port(&max3107_uart_driver, &s->port);
@@ -1322,44 +1077,31 @@ static int __devinit max3107_probe(struct spi_device *spi)
 		return retval;
 	}
 
-	/* Initialize GPIO chip data */
-	s->chip = max3107_gpio_chip;
-	s->chip.label = spi->modalias;
-	s->chip.dev = &spi->dev;
-
-	/* Add GPIO chip */
-	retval = gpiochip_add(&s->chip);
-	if (retval) {
-		dev_err(&s->spi->dev, "Adding GPIO chip failed\n");
-		return retval;
+	if (pdata->configure) {
+		retval = pdata->configure(s);
+		if (retval < 0)
+			return retval;
 	}
 
-	/* Temporary fix for EV2 boot problems, set modem reset to 0 */
-	max3107_gpio_direction_out(&s->chip, 3, 0);
-
 	/* Go to suspend mode */
-	max3107_hw_susp(s, 1);
+	if (pdata->hw_suspend)
+		pdata->hw_suspend(s, 1);
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(max3107_probe);
 
 /* Driver remove function */
-static int __devexit max3107_remove(struct spi_device *spi)
+int max3107_remove(struct spi_device *spi)
 {
 	struct max3107_port *s = dev_get_drvdata(&spi->dev);
 
 	pr_info("enter max3107 remove\n");
 
-	/* Remove GPIO chip */
-	if (gpiochip_remove(&s->chip))
-		dev_warn(&s->spi->dev, "Removing GPIO chip failed\n");
-
 	/* Remove port */
 	if (uart_remove_one_port(&max3107_uart_driver, &s->port))
 		dev_warn(&s->spi->dev, "Removing UART port failed\n");
 
-	/* Unregister UART driver */
-	uart_unregister_driver(&max3107_uart_driver);
 
 	/* Free TxRx buffer */
 	kfree(s->rxbuf);
@@ -1371,9 +1113,10 @@ static int __devexit max3107_remove(struct spi_device *spi)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(max3107_remove);
 
 /* Driver suspend function */
-static int max3107_suspend(struct spi_device *spi, pm_message_t state)
+int max3107_suspend(struct spi_device *spi, pm_message_t state)
 {
 #ifdef CONFIG_PM
 	struct max3107_port *s = dev_get_drvdata(&spi->dev);
@@ -1384,13 +1127,15 @@ static int max3107_suspend(struct spi_device *spi, pm_message_t state)
 	uart_suspend_port(&max3107_uart_driver, &s->port);
 
 	/* Go to suspend mode */
-	max3107_hw_susp(s, 1);
+	if (s->pdata->hw_suspend)
+		s->pdata->hw_suspend(s, 1);
 #endif	/* CONFIG_PM */
 	return 0;
 }
+EXPORT_SYMBOL_GPL(max3107_suspend);
 
 /* Driver resume function */
-static int max3107_resume(struct spi_device *spi)
+int max3107_resume(struct spi_device *spi)
 {
 #ifdef CONFIG_PM
 	struct max3107_port *s = dev_get_drvdata(&spi->dev);
@@ -1398,13 +1143,20 @@ static int max3107_resume(struct spi_device *spi)
 	pr_debug("enter resume\n");
 
 	/* Resume from suspend */
-	max3107_hw_susp(s, 0);
+	if (s->pdata->hw_suspend)
+		s->pdata->hw_suspend(s, 0);
 
 	/* Resume UART port */
 	uart_resume_port(&max3107_uart_driver, &s->port);
 #endif	/* CONFIG_PM */
 	return 0;
 }
+EXPORT_SYMBOL_GPL(max3107_resume);
+
+static int max3107_probe_generic(struct spi_device *spi)
+{
+	return max3107_probe(spi, &generic_plat_data);
+}
 
 /* Spi driver data */
 static struct spi_driver max3107_driver = {
@@ -1413,7 +1165,7 @@ static struct spi_driver max3107_driver = {
 		.bus		= &spi_bus_type,
 		.owner		= THIS_MODULE,
 	},
-	.probe		= max3107_probe,
+	.probe		= max3107_probe_generic,
 	.remove		= __devexit_p(max3107_remove),
 	.suspend	= max3107_suspend,
 	.resume		= max3107_resume,
@@ -1430,6 +1182,9 @@ static int __init max3107_init(void)
 static void __exit max3107_exit(void)
 {
 	pr_info("enter max3107 exit\n");
+	/* Unregister UART driver */
+	if (driver_registered)
+		uart_unregister_driver(&max3107_uart_driver);
 	spi_unregister_driver(&max3107_driver);
 }
 
@@ -1438,5 +1193,5 @@ module_exit(max3107_exit);
 
 MODULE_DESCRIPTION("MAX3107 driver");
 MODULE_AUTHOR("Aavamobile");
-MODULE_ALIAS("max3107-spi-uart");
-MODULE_LICENSE("GPLv2");
+MODULE_ALIAS("max3107-spi");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/serial/max3107.h b/drivers/serial/max3107.h
index a5625d1f263d..72b30415f417 100644
--- a/drivers/serial/max3107.h
+++ b/drivers/serial/max3107.h
@@ -10,8 +10,8 @@
  * (at your option) any later version.
  */
 
-#ifndef _LINUX_SERIAL_MAX3107_H
-#define _LINUX_SERIAL_MAX3107_H
+#ifndef _MAX3107_H
+#define _MAX3107_H
 
 /* Serial error status definitions */
 #define MAX3107_PARITY_ERROR	1
@@ -355,4 +355,85 @@
 #define MAX3107_BRG13_IB460800	(0x000000 | 0x00)
 #define MAX3107_BRG13_IB921600	(0x000000 | 0x00)
 
+
+struct baud_table {
+	int baud;
+	u32 new_brg;
+};
+
+struct max3107_port {
+	/* UART port structure */
+	struct uart_port port;
+
+	/* SPI device structure */
+	struct spi_device *spi;
+
+	/* GPIO chip stucture */
+	struct gpio_chip chip;
+
+	/* Workqueue that does all the magic */
+	struct workqueue_struct *workqueue;
+	struct work_struct work;
+
+	/* Lock for shared data */
+	spinlock_t data_lock;
+
+	/* Device configuration */
+	int ext_clk;		/* 1 if external clock used */
+	int loopback;		/* Current loopback mode state */
+	int baud;			/* Current baud rate */
+
+	/* State flags */
+	int suspended;		/* Indicates suspend mode */
+	int tx_fifo_empty;	/* Flag for TX FIFO state */
+	int rx_enabled;		/* Flag for receiver state */
+	int tx_enabled;		/* Flag for transmitter state */
+
+	u16 irqen_reg;		/* Current IRQ enable register value */
+	/* Shared data */
+	u16 mode1_reg;		/* Current mode1 register value*/
+	int mode1_commit;	/* Flag for setting new mode1 register value */
+	u16 lcr_reg;		/* Current LCR register value */
+	int lcr_commit;		/* Flag for setting new LCR register value */
+	u32 brg_cfg;		/* Current Baud rate generator config  */
+	int brg_commit;		/* Flag for setting new baud rate generator
+				 * config
+				 */
+	struct baud_table *baud_tbl;
+	int handle_irq;		/* Indicates that IRQ should be handled */
+
+	/* Rx buffer and str*/
+	u16 *rxbuf;
+	u8  *rxstr;
+	/* Tx buffer*/
+	u16 *txbuf;
+
+	struct max3107_plat *pdata;	/* Platform data */
+};
+
+/* Platform data structure */
+struct max3107_plat {
+	/* Loopback mode enable */
+	int loopback;
+	/* External clock enable */
+	int ext_clk;
+	/* Called during the register initialisation */
+	void (*init)(struct max3107_port *s);
+	/* Called when the port is found and configured */
+	int (*configure)(struct max3107_port *s);
+	/* HW suspend function */
+	void (*hw_suspend) (struct max3107_port *s, int suspend);
+	/* Polling mode enable */
+	int polled_mode;
+	/* Polling period if polling mode enabled */
+	int poll_time;
+};
+
+extern int max3107_rw(struct max3107_port *s, u8 *tx, u8 *rx, int len);
+extern void max3107_hw_susp(struct max3107_port *s, int suspend);
+extern int max3107_probe(struct spi_device *spi, struct max3107_plat *pdata);
+extern int max3107_remove(struct spi_device *spi);
+extern int max3107_suspend(struct spi_device *spi, pm_message_t state);
+extern int max3107_resume(struct spi_device *spi);
+
 #endif /* _LINUX_SERIAL_MAX3107_H */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 32928161fab6..9ddc866ccc09 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -186,6 +186,10 @@
 #define PORT_ALTERA_JTAGUART	91
 #define PORT_ALTERA_UART	92
 
+/* MAX3107 */
+#define PORT_MAX3107	94
+
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-- 
cgit v1.2.3-59-g8ed1b


From 75e0b946cf2fef14236ff999b6d7eacbae2034b0 Mon Sep 17 00:00:00 2001
From: Kevin Winchester <kjwinchester@gmail.com>
Date: Sat, 10 Jul 2010 18:57:56 -0300
Subject: vt: Fix warning: statement with no effect due to vt_kern.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using:

	gcc (GCC) 4.5.0 20100610 (prerelease)

with CONFIG_CONSOLE_TRANSLATIONS=n, the following warnings are seen:

	drivers/char/vt_ioctl.c: In function ‘vt_ioctl’:
	drivers/char/vt_ioctl.c:1309:4: warning: statement with no effect
	drivers/char/vt.c: In function ‘vc_allocate’:
	drivers/char/vt.c:774:3: warning: statement with no effect
	drivers/video/console/vgacon.c: In function ‘vgacon_init’:
	drivers/video/console/vgacon.c:587:3: warning: statement with no effect
	drivers/video/console/vgacon.c: In function ‘vgacon_deinit’:
	drivers/video/console/vgacon.c:606:2: warning: statement with no effect
	drivers/video/console/fbcon.c: In function ‘fbcon_init’:
	drivers/video/console/fbcon.c:1087:3: warning: statement with no effect
	drivers/video/console/fbcon.c:1089:3: warning: statement with no effect
	drivers/video/console/fbcon.c: In function ‘fbcon_set_disp’:
	drivers/video/console/fbcon.c:1369:3: warning: statement with no effect
	drivers/video/console/fbcon.c:1371:3: warning: statement with no effect

This is because several functions in include/linux/vt_kern.h are
defined to (0).  Convert them to static inline functions to
silence the warnings and gain a bit of type safety.

Signed-off-by: Kevin Winchester <kjwinchester@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/vt_kern.h | 57 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 56cce345aa8d..6625cc1ab758 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -76,17 +76,52 @@ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc);
 #define vc_translate(vc, c) ((vc)->vc_translate[(c) |			\
 					((vc)->vc_toggle_meta ? 0x80 : 0)])
 #else
-#define con_set_trans_old(arg) (0)
-#define con_get_trans_old(arg) (-EINVAL)
-#define con_set_trans_new(arg) (0)
-#define con_get_trans_new(arg) (-EINVAL)
-#define con_clear_unimap(vc, ui) (0)
-#define con_set_unimap(vc, ct, list) (0)
-#define con_set_default_unimap(vc) (0)
-#define con_copy_unimap(d, s) (0)
-#define con_get_unimap(vc, ct, uct, list) (-EINVAL)
-#define con_free_unimap(vc) do { ; } while (0)
-#define con_protect_unimap(vc, rdonly) do { ; } while (0)
+static inline int con_set_trans_old(unsigned char __user *table)
+{
+	return 0;
+}
+static inline int con_get_trans_old(unsigned char __user *table)
+{
+	return -EINVAL;
+}
+static inline int con_set_trans_new(unsigned short __user *table)
+{
+	return 0;
+}
+static inline int con_get_trans_new(unsigned short __user *table)
+{
+	return -EINVAL;
+}
+static inline int con_clear_unimap(struct vc_data *vc, struct unimapinit *ui)
+{
+	return 0;
+}
+static inline
+int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
+{
+	return 0;
+}
+static inline
+int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct,
+		   struct unipair __user *list)
+{
+	return -EINVAL;
+}
+static inline int con_set_default_unimap(struct vc_data *vc)
+{
+	return 0;
+}
+static inline void con_free_unimap(struct vc_data *vc)
+{
+}
+static inline void con_protect_unimap(struct vc_data *vc, int rdonly)
+{
+}
+static inline
+int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc)
+{
+	return 0;
+}
 
 #define vc_translate(vc, c) (c)
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 93e3d58284626ff6466f9c3dac8800cd6f8079c6 Mon Sep 17 00:00:00 2001
From: John Villalovos <jvillalo@redhat.com>
Date: Tue, 20 Jul 2010 15:26:46 -0700
Subject: serial: fix missing bit coverage of ASYNC_FLAGS

It seems that currently ASYNC_FLAGS is one bit short of covering all the
bits of the ASYNC user flags.  In particular it does not cover the
ASYNC_AUTOPROBE bit.

ASYNCB_LAST_USER and ASYNCB_AUTOPROBE are both equal to 15.

Therefore:
ASYNC_AUTOPROBE = 1000 0000 0000 0000
ASYNC_FLAGS     = 0111 1111 1111 1111

So ASYNC_FLAGS is not covering the ASYNC_AUTOPROBE bit.

This patch fixes the issue and with the patch the values will be:
ASYNC_AUTOPROBE = 1000 0000 0000 0000
ASYNC_FLAGS     = 1111 1111 1111 1111

As a side note, doing a "git grep" I didn't find any use of
ASYNC_AUTOPROBE or ASYNCB_AUTOPROBE in the kernel, besides this include
file.

Signed-off-by: John Villalovos <john.l.villalovos@intel.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/serial.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/serial.h b/include/linux/serial.h
index c8613c3ff9d3..c3b45add494d 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -151,7 +151,7 @@ struct serial_uart_config {
 #define ASYNC_BUGGY_UART	(1U << ASYNCB_BUGGY_UART)
 #define ASYNC_AUTOPROBE		(1U << ASYNCB_AUTOPROBE)
 
-#define ASYNC_FLAGS		((1U << ASYNCB_LAST_USER) - 1)
+#define ASYNC_FLAGS		((1U << (ASYNCB_LAST_USER + 1)) - 1)
 #define ASYNC_USR_MASK		(ASYNC_SPD_HI|ASYNC_SPD_VHI| \
 		ASYNC_CALLOUT_NOHUP|ASYNC_SPD_SHI|ASYNC_LOW_LATENCY)
 #define ASYNC_SPD_CUST		(ASYNC_SPD_HI|ASYNC_SPD_VHI)
-- 
cgit v1.2.3-59-g8ed1b


From 1b6331848b69d1ed165a6bdc75c4046d68767563 Mon Sep 17 00:00:00 2001
From: Claudio Scordino <claudio@evidence.eu.com>
Date: Tue, 20 Jul 2010 15:26:47 -0700
Subject: serial: general fixes in the serial_rs485 structure

Fix several issues related to the RS485 interface:

 - It adds the flag SER_RS485_RTS_BEFORE_SEND that was missing from the
   serial_rs485 structure (even if "delay_rts_before_send" was existing)

 - It adds a further "delay_rts_after_send" field for those drivers that
   can have a delay after send (e.g., atmel_serial)

 - It fixes the usage of the structure in the atmel_serial driver (where
   "delay_rts_before_send" should be used instead of "delay_rts_after_send").

Signed-off-by: Claudio Scordino <claudio@evidence.eu.com>
Signed-off-by: Bernhard Roth <br@pwrnet.de>
Cc: Philippe De Muyter <phdm@macqel.be>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/atmel_serial.c | 11 ++++++++---
 include/linux/serial.h        |  4 +++-
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/atmel_serial.c b/drivers/serial/atmel_serial.c
index a182def7007d..3892666b5fbd 100644
--- a/drivers/serial/atmel_serial.c
+++ b/drivers/serial/atmel_serial.c
@@ -217,7 +217,8 @@ void atmel_config_rs485(struct uart_port *port, struct serial_rs485 *rs485conf)
 	if (rs485conf->flags & SER_RS485_ENABLED) {
 		dev_dbg(port->dev, "Setting UART to RS485\n");
 		atmel_port->tx_done_mask = ATMEL_US_TXEMPTY;
-		UART_PUT_TTGR(port, rs485conf->delay_rts_before_send);
+		if (rs485conf->flags & SER_RS485_RTS_AFTER_SEND)
+			UART_PUT_TTGR(port, rs485conf->delay_rts_after_send);
 		mode |= ATMEL_US_USMODE_RS485;
 	} else {
 		dev_dbg(port->dev, "Setting UART to RS232\n");
@@ -292,7 +293,9 @@ static void atmel_set_mctrl(struct uart_port *port, u_int mctrl)
 
 	if (atmel_port->rs485.flags & SER_RS485_ENABLED) {
 		dev_dbg(port->dev, "Setting UART to RS485\n");
-		UART_PUT_TTGR(port, atmel_port->rs485.delay_rts_before_send);
+		if (atmel_port->rs485.flags & SER_RS485_RTS_AFTER_SEND)
+			UART_PUT_TTGR(port,
+					atmel_port->rs485.delay_rts_after_send);
 		mode |= ATMEL_US_USMODE_RS485;
 	} else {
 		dev_dbg(port->dev, "Setting UART to RS232\n");
@@ -1211,7 +1214,9 @@ static void atmel_set_termios(struct uart_port *port, struct ktermios *termios,
 
 	if (atmel_port->rs485.flags & SER_RS485_ENABLED) {
 		dev_dbg(port->dev, "Setting UART to RS485\n");
-		UART_PUT_TTGR(port, atmel_port->rs485.delay_rts_before_send);
+		if (atmel_port->rs485.flags & SER_RS485_RTS_AFTER_SEND)
+			UART_PUT_TTGR(port,
+					atmel_port->rs485.delay_rts_after_send);
 		mode |= ATMEL_US_USMODE_RS485;
 	} else {
 		dev_dbg(port->dev, "Setting UART to RS232\n");
diff --git a/include/linux/serial.h b/include/linux/serial.h
index c3b45add494d..ef914061511e 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -210,8 +210,10 @@ struct serial_rs485 {
 #define SER_RS485_ENABLED		(1 << 0)
 #define SER_RS485_RTS_ON_SEND		(1 << 1)
 #define SER_RS485_RTS_AFTER_SEND	(1 << 2)
+#define SER_RS485_RTS_BEFORE_SEND	(1 << 3)
 	__u32	delay_rts_before_send;	/* Milliseconds */
-	__u32	padding[6];		/* Memory is cheap, new structs
+	__u32	delay_rts_after_send;	/* Milliseconds */
+	__u32	padding[5];		/* Memory is cheap, new structs
 					   are a royal PITA .. */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From d843fc6e9dc9bee7061b6833594860ea93ad98e1 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Tue, 27 Jul 2010 08:20:22 +0100
Subject: hsu: driver for Medfield High Speed UART device

This is a PCI & UART driver, which suppors both PIO and DMA mode
UART operation. It has 3 identical UART ports and one internal
DMA controller.

Current FW will export 4 pci devices for hsu: 3 uart ports and 1
dma controller, each has one IRQ line. And we need to discuss the
device model, one PCI device covering whole HSU should be a better
model, but there is a problem of how to export the 4 IRQs info

Current driver set the highest baud rate to 2746800bps, which is
easy to scale down to 115200/230400.... To suport higher baud rate,
we need add special process, change DLAB/DLH/PS/DIV/MUL registers
all together.

921600 is the highest baud rate that has been tested with Bluetooth
modem connected to HSU port 0. Will test more when there is right
BT firmware.

Current version contains several work around for A0's Silicon bugs

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/Kconfig      |   10 +
 drivers/serial/Makefile     |    1 +
 drivers/serial/mfd.c        | 1488 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/serial_core.h |    2 +
 include/linux/serial_mfd.h  |   47 ++
 include/linux/serial_reg.h  |   16 +
 6 files changed, 1564 insertions(+)
 create mode 100644 drivers/serial/mfd.c
 create mode 100644 include/linux/serial_mfd.h

(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index c34c217878b3..a22e60c06f48 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -724,6 +724,16 @@ config MRST_MAX3110_IRQ
 	help
 	  This has to be enabled after Moorestown GPIO driver is loaded
 
+config SERIAL_MFD_HSU
+	tristate "Medfield High Speed UART support"
+	depends on PCI
+	select SERIAL_CORE
+
+config SERIAL_MFD_HSU_CONSOLE
+	boolean "Medfile HSU serial console support"
+	depends on SERIAL_MFD_HSU=y
+	select SERIAL_CORE_CONSOLE
+
 config SERIAL_BFIN
 	tristate "Blackfin serial port support"
 	depends on BLACKFIN
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 424067ac3347..1ca4fd599ffe 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -87,3 +87,4 @@ obj-$(CONFIG_SERIAL_GRLIB_GAISLER_APBUART) += apbuart.o
 obj-$(CONFIG_SERIAL_ALTERA_JTAGUART) += altera_jtaguart.o
 obj-$(CONFIG_SERIAL_ALTERA_UART) += altera_uart.o
 obj-$(CONFIG_SERIAL_MRST_MAX3110)	+= mrst_max3110.o
+obj-$(CONFIG_SERIAL_MFD_HSU)	+= mfd.o
diff --git a/drivers/serial/mfd.c b/drivers/serial/mfd.c
new file mode 100644
index 000000000000..300dcb134e07
--- /dev/null
+++ b/drivers/serial/mfd.c
@@ -0,0 +1,1488 @@
+/*
+ * mfd.c: driver for High Speed UART device of Intel Medfield platform
+ *
+ * Refer pxa.c, 8250.c and some other drivers in drivers/serial/
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+
+/* Notes:
+ * 1. there should be 2 types of register access method, one for
+ *    UART ports, the other for the general purpose registers
+ *
+ * 2. It used to have a Irda port, but was defeatured recently
+ *
+ * 3. Based on the info from HSU MAS, 0/1 channel are assigned to
+ *    port0, 2/3 chan to port 1, 4/5 chan to port 3. Even number
+ *    chan will be read, odd chan for write
+ *
+ * 4. HUS supports both the 64B and 16B FIFO version, but this driver
+ *    will only use 64B version
+ *
+ * 5. In A0 stepping, UART will not support TX half empty flag, thus
+ *    need add a #ifdef judgement
+ *
+ * 6. One more bug for A0, the loopback mode won't support AFC
+ *    auto-flow control
+ *
+ * 7. HSU has some special FCR control bits, we add it to serial_reg.h
+ *
+ * 8. The RI/DSR/DCD/DTR are not pinned out, DCD & DSR are always asserted,
+ *    only when the HW is reset the DDCD and DDSR will be triggered
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/sysrq.h>
+#include <linux/serial_reg.h>
+#include <linux/circ_buf.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial_core.h>
+#include <linux/serial_mfd.h>
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/debugfs.h>
+
+#define  MFD_HSU_A0_STEPPING	1
+
+#define HSU_DMA_BUF_SIZE	2048
+
+#define chan_readl(chan, offset)	readl(chan->reg + offset)
+#define chan_writel(chan, offset, val)	writel(val, chan->reg + offset)
+
+#define mfd_readl(obj, offset)		readl(obj->reg + offset)
+#define mfd_writel(obj, offset, val)	writel(val, obj->reg + offset)
+
+struct hsu_dma_buffer {
+	u8		*buf;
+	dma_addr_t	dma_addr;
+	u32		dma_size;
+	u32		ofs;
+};
+
+struct hsu_dma_chan {
+	u32	id;
+	u32	dirt;	/* to or from device */
+	struct uart_hsu_port	*uport;
+	void __iomem	*reg;
+};
+
+struct uart_hsu_port {
+	struct uart_port        port;
+	unsigned char           ier;
+	unsigned char           lcr;
+	unsigned char           mcr;
+	unsigned int            lsr_break_flag;
+	char			name[12];
+	int			index;
+	struct device		*dev;
+
+	struct hsu_dma_chan	*txc;
+	struct hsu_dma_chan	*rxc;
+	struct hsu_dma_buffer	txbuf;
+	struct hsu_dma_buffer	rxbuf;
+	int			use_dma;	/* flag for DMA/PIO */
+	int			running;
+	int			dma_tx_on;
+};
+
+/* Top level data structure of HSU */
+struct hsu_port {
+	struct pci_device	*pdev;
+
+	void __iomem	*reg;
+	unsigned long	paddr;
+	unsigned long	iolen;
+	u32		irq;
+
+	struct uart_hsu_port	port[3];
+	struct hsu_dma_chan	chans[10];
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs;
+#endif
+};
+
+static inline void hexdump(char *str, u8 *addr, int cnt)
+{
+	int i;
+
+	for (i = 0; i < cnt; i += 8) {
+		printk("0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x",
+			addr[i], addr[i+1], addr[i+2], addr[i+3],
+			addr[i+4], addr[i+5], addr[i+6], addr[i+7]);
+		printk("\n");
+	}
+}
+
+static inline unsigned int serial_in(struct uart_hsu_port *up, int offset)
+{
+	unsigned int val;
+
+	if (offset > UART_MSR) {
+		offset <<= 2;
+		val = readl(up->port.membase + offset);
+	} else
+		val = (unsigned int)readb(up->port.membase + offset);
+
+	return val;
+}
+
+static inline void serial_out(struct uart_hsu_port *up, int offset, int value)
+{
+	if (offset > UART_MSR) {
+		offset <<= 2;
+		writel(value, up->port.membase + offset);
+	} else {
+		unsigned char val = value & 0xff;
+		writeb(val, up->port.membase + offset);
+	}
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+#define HSU_REGS_BUFSIZE	1024
+
+static int hsu_show_regs_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+static ssize_t port_show_regs(struct file *file, char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	struct uart_hsu_port *up = file->private_data;
+	char *buf;
+	u32 len = 0;
+	ssize_t ret;
+
+	buf = kzalloc(HSU_REGS_BUFSIZE, GFP_KERNEL);
+	if (!buf)
+		return 0;
+
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"MFD HSU port[%d] regs:\n", up->index);
+
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"=================================\n");
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"IER: \t\t0x%08x\n", serial_in(up, UART_IER));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"IIR: \t\t0x%08x\n", serial_in(up, UART_IIR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"LCR: \t\t0x%08x\n", serial_in(up, UART_LCR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"MCR: \t\t0x%08x\n", serial_in(up, UART_MCR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"LSR: \t\t0x%08x\n", serial_in(up, UART_LSR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"MSR: \t\t0x%08x\n", serial_in(up, UART_MSR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"FOR: \t\t0x%08x\n", serial_in(up, UART_FOR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"PS: \t\t0x%08x\n", serial_in(up, UART_PS));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"MUL: \t\t0x%08x\n", serial_in(up, UART_MUL));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"DIV: \t\t0x%08x\n", serial_in(up, UART_DIV));
+
+	ret =  simple_read_from_buffer(user_buf, count, ppos, buf, len);
+	kfree(buf);
+	return ret;
+}
+
+static ssize_t dma_show_regs(struct file *file, char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	struct hsu_dma_chan *chan = file->private_data;
+	char *buf;
+	u32 len = 0;
+	ssize_t ret;
+
+	buf = kzalloc(HSU_REGS_BUFSIZE, GFP_KERNEL);
+	if (!buf)
+		return 0;
+
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"MFD HSU DMA channel [%d] regs:\n", chan->id);
+
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"=================================\n");
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"CR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_CR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"DCR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_DCR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"BSR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_BSR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"MOTSR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_MOTSR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"D0SAR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_D0SAR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"D0TSR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_D0TSR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"D0SAR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_D1SAR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"D0TSR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_D1TSR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"D0SAR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_D2SAR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"D0TSR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_D2TSR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"D0SAR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_D3SAR));
+	len += snprintf(buf + len, HSU_REGS_BUFSIZE - len,
+			"D0TSR: \t\t0x%08x\n", chan_readl(chan, HSU_CH_D3TSR));
+
+	ret =  simple_read_from_buffer(user_buf, count, ppos, buf, len);
+	kfree(buf);
+	return ret;
+}
+
+static const struct file_operations port_regs_ops = {
+	.owner		= THIS_MODULE,
+	.open		= hsu_show_regs_open,
+	.read		= port_show_regs,
+};
+
+static const struct file_operations dma_regs_ops = {
+	.owner		= THIS_MODULE,
+	.open		= hsu_show_regs_open,
+	.read		= dma_show_regs,
+};
+
+static int hsu_debugfs_init(struct hsu_port *hsu)
+{
+	int i;
+	char name[32];
+
+	hsu->debugfs = debugfs_create_dir("hsu", NULL);
+	if (!hsu->debugfs)
+		return -ENOMEM;
+
+	for (i = 0; i < 3; i++) {
+		snprintf(name, sizeof(name), "port_%d_regs", i);
+		debugfs_create_file(name, S_IFREG | S_IRUGO,
+			hsu->debugfs, (void *)(&hsu->port[i]), &port_regs_ops);
+	}
+
+	for (i = 0; i < 6; i++) {
+		snprintf(name, sizeof(name), "dma_chan_%d_regs", i);
+		debugfs_create_file(name, S_IFREG | S_IRUGO,
+			hsu->debugfs, (void *)&hsu->chans[i], &dma_regs_ops);
+	}
+
+	return 0;
+}
+
+static void hsu_debugfs_remove(struct hsu_port *hsu)
+{
+	if (hsu->debugfs)
+		debugfs_remove_recursive(hsu->debugfs);
+}
+
+#else
+static inline int hsu_debugfs_init(struct hsu_port *hsu)
+{
+	return 0;
+}
+
+static inline void hsu_debugfs_remove(struct hsu_port *hsu)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static void serial_hsu_enable_ms(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+
+	up->ier |= UART_IER_MSI;
+	serial_out(up, UART_IER, up->ier);
+}
+
+void hsu_dma_tx(struct uart_hsu_port *up)
+{
+	struct circ_buf *xmit = &up->port.state->xmit;
+	struct hsu_dma_buffer *dbuf = &up->txbuf;
+	int count;
+
+	/* test_and_set_bit may be better, but anyway it's in lock protected mode */
+	if (up->dma_tx_on)
+		return;
+
+	/* Update the circ buf info */
+	xmit->tail += dbuf->ofs;
+	xmit->tail &= UART_XMIT_SIZE - 1;
+
+	up->port.icount.tx += dbuf->ofs;
+	dbuf->ofs = 0;
+
+	/* Disable the channel */
+	chan_writel(up->txc, HSU_CH_CR, 0x0);
+
+	if (!uart_circ_empty(xmit) && !uart_tx_stopped(&up->port)) {
+		dma_sync_single_for_device(up->port.dev,
+					   dbuf->dma_addr,
+					   dbuf->dma_size,
+					   DMA_TO_DEVICE);
+
+		count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+		dbuf->ofs = count;
+
+		/* Reprogram the channel */
+		chan_writel(up->txc, HSU_CH_D0SAR, dbuf->dma_addr + xmit->tail);
+		chan_writel(up->txc, HSU_CH_D0TSR, count);
+
+		/* Reenable the channel */
+		chan_writel(up->txc, HSU_CH_DCR, 0x1
+						 | (0x1 << 8)
+						 | (0x1 << 16)
+						 | (0x1 << 24));
+
+		WARN(chan_readl(up->txc, HSU_CH_CR) & 0x1,
+			"TX channel has already be started!!\n");
+		up->dma_tx_on = 1;
+		chan_writel(up->txc, HSU_CH_CR, 0x1);
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(&up->port);
+}
+
+/* The buffer is already cache coherent */
+void hsu_dma_start_rx_chan(struct hsu_dma_chan *rxc, struct hsu_dma_buffer *dbuf)
+{
+	/* Need start RX dma channel here */
+	dbuf->ofs = 0;
+
+	chan_writel(rxc, HSU_CH_BSR, 32);
+	chan_writel(rxc, HSU_CH_MOTSR, 4);
+
+	chan_writel(rxc, HSU_CH_D0SAR, dbuf->dma_addr);
+	chan_writel(rxc, HSU_CH_D0TSR, dbuf->dma_size);
+	chan_writel(rxc, HSU_CH_DCR, 0x1 | (0x1 << 8)
+					 | (0x1 << 16)
+					 | (0x1 << 24)	/* timeout bit, see HSU Errata 1 */
+					 );
+	chan_writel(rxc, HSU_CH_CR, 0x3);
+}
+
+/* Protected by spin_lock_irqsave(port->lock) */
+static void serial_hsu_start_tx(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+
+	if (up->use_dma) {
+		hsu_dma_tx(up);
+	} else if (!(up->ier & UART_IER_THRI)) {
+		up->ier |= UART_IER_THRI;
+		serial_out(up, UART_IER, up->ier);
+	}
+}
+
+static void serial_hsu_stop_tx(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	struct hsu_dma_chan *txc = up->txc;
+
+	if (up->use_dma)
+		chan_writel(txc, HSU_CH_CR, 0x0);
+	else if (up->ier & UART_IER_THRI) {
+		up->ier &= ~UART_IER_THRI;
+		serial_out(up, UART_IER, up->ier);
+	}
+}
+
+/* This is always called in spinlock protected mode, so
+ * modify timeout timer is safe here */
+void hsu_dma_rx(struct uart_hsu_port *up, u32 int_sts)
+{
+	struct hsu_dma_buffer *dbuf = &up->rxbuf;
+	struct hsu_dma_chan *chan = up->rxc;
+	struct uart_port *port = &up->port;
+	struct tty_struct *tty = port->state->port.tty;
+	int count;
+
+	if (!tty)
+		return;
+
+	/*
+	 * first need to know how many is already transferred,
+	 * then check if its a timeout DMA irq, and return
+	 * the trail bytes out, push them up and reenable the
+	 * channel, better to use 2 descriptors at the same time
+	 */
+
+	/* timeout IRQ, need wait some time, see Errata 2 */
+	if (int_sts & 0xf00)
+		udelay(2);
+
+	/* Stop the channel */
+	chan_writel(chan, HSU_CH_CR, 0x0);
+
+	/* We can use 2 ways to calc the actual transfer len */
+	count = chan_readl(chan, HSU_CH_D0SAR) - dbuf->dma_addr;
+
+	if (!count)
+		return;
+
+	dma_sync_single_for_cpu(port->dev, dbuf->dma_addr,
+			dbuf->dma_size, DMA_FROM_DEVICE);
+
+	/*
+	 * head will only wrap around when we recycle
+	 * the DMA buffer, and when that happens, we
+	 * explicitly set tail to 0. So head will
+	 * always be greater than tail.
+	 */
+	tty_insert_flip_string(tty, dbuf->buf, count);
+	port->icount.rx += count;
+
+	dma_sync_single_for_device(up->port.dev, dbuf->dma_addr,
+			dbuf->dma_size, DMA_FROM_DEVICE);
+
+	/* Reprogram the channel */
+	chan_writel(chan, HSU_CH_D0SAR, dbuf->dma_addr);
+	chan_writel(chan, HSU_CH_D0TSR, dbuf->dma_size);
+	chan_writel(chan, HSU_CH_DCR, 0x1
+					 | (0x1 << 8)
+					 | (0x1 << 16)
+					 | (0x1 << 24)	/* timeout bit, see HSU Errata 1 */
+					 );
+	chan_writel(chan, HSU_CH_CR, 0x3);
+
+	tty_flip_buffer_push(tty);
+}
+
+static void serial_hsu_stop_rx(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	struct hsu_dma_chan *chan = up->rxc;
+
+	if (up->use_dma)
+		chan_writel(chan, HSU_CH_CR, 0x2);
+	else {
+		up->ier &= ~UART_IER_RLSI;
+		up->port.read_status_mask &= ~UART_LSR_DR;
+		serial_out(up, UART_IER, up->ier);
+	}
+}
+
+/*
+ * if there is error flag, should we just reset the FIFO or keeps
+ * working on it
+ */
+static inline void receive_chars(struct uart_hsu_port *up, int *status)
+{
+	struct tty_struct *tty = up->port.state->port.tty;
+	unsigned int ch, flag;
+	unsigned int max_count = 256;
+
+	if (!tty)
+		return;
+
+	do {
+		ch = serial_in(up, UART_RX);
+		flag = TTY_NORMAL;
+		up->port.icount.rx++;
+
+		if (unlikely(*status & (UART_LSR_BI | UART_LSR_PE |
+				       UART_LSR_FE | UART_LSR_OE))) {
+
+			dev_warn(up->dev, "We really rush into ERR/BI case"
+				"status = 0x%02x", *status);
+			/* For statistics only */
+			if (*status & UART_LSR_BI) {
+				*status &= ~(UART_LSR_FE | UART_LSR_PE);
+				up->port.icount.brk++;
+				/*
+				 * We do the SysRQ and SAK checking
+				 * here because otherwise the break
+				 * may get masked by ignore_status_mask
+				 * or read_status_mask.
+				 */
+				if (uart_handle_break(&up->port))
+					goto ignore_char;
+			} else if (*status & UART_LSR_PE)
+				up->port.icount.parity++;
+			else if (*status & UART_LSR_FE)
+				up->port.icount.frame++;
+			if (*status & UART_LSR_OE)
+				up->port.icount.overrun++;
+
+			/* Mask off conditions which should be ignored. */
+			*status &= up->port.read_status_mask;
+
+#ifdef CONFIG_SERIAL_MFD_HSU_CONSOLE
+			if (up->port.cons &&
+				up->port.cons->index == up->port.line) {
+				/* Recover the break flag from console xmit */
+				*status |= up->lsr_break_flag;
+				up->lsr_break_flag = 0;
+			}
+#endif
+			if (*status & UART_LSR_BI) {
+				flag = TTY_BREAK;
+			} else if (*status & UART_LSR_PE)
+				flag = TTY_PARITY;
+			else if (*status & UART_LSR_FE)
+				flag = TTY_FRAME;
+		}
+
+		if (uart_handle_sysrq_char(&up->port, ch))
+			goto ignore_char;
+
+		uart_insert_char(&up->port, *status, UART_LSR_OE, ch, flag);
+	ignore_char:
+		*status = serial_in(up, UART_LSR);
+	} while ((*status & UART_LSR_DR) && max_count--);
+	tty_flip_buffer_push(tty);
+}
+
+static void transmit_chars(struct uart_hsu_port *up)
+{
+	struct circ_buf *xmit = &up->port.state->xmit;
+	int count;
+	int i = 0; /* for debug use */
+
+	if (up->port.x_char) {
+		serial_out(up, UART_TX, up->port.x_char);
+		up->port.icount.tx++;
+		up->port.x_char = 0;
+		return;
+	}
+	if (uart_circ_empty(xmit) || uart_tx_stopped(&up->port)) {
+		serial_hsu_stop_tx(&up->port);
+		return;
+	}
+
+#ifndef MFD_HSU_A0_STEPPING
+	count = up->port.fifosize / 2;
+#else
+	/*
+	 * A0 only supports fully empty IRQ, and the first char written
+	 * into it won't clear the EMPT bit, so we may need be cautious
+	 * by useing a shorter buffer
+	 */
+	/* count = up->port.fifosize; */
+	count = up->port.fifosize - 4;
+#endif
+	do {
+		serial_out(up, UART_TX, xmit->buf[xmit->tail]);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		i++;
+
+		up->port.icount.tx++;
+		if (uart_circ_empty(xmit))
+			break;
+	} while (--count > 0);
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(&up->port);
+
+	if (uart_circ_empty(xmit))
+		serial_hsu_stop_tx(&up->port);
+}
+
+static inline void check_modem_status(struct uart_hsu_port *up)
+{
+	int status;
+
+	status = serial_in(up, UART_MSR);
+
+	if ((status & UART_MSR_ANY_DELTA) == 0)
+		return;
+
+	if (status & UART_MSR_TERI)
+		up->port.icount.rng++;
+	if (status & UART_MSR_DDSR)
+		up->port.icount.dsr++;
+	/* We may only get DDCD when HW init and reset */
+	if (status & UART_MSR_DDCD)
+		uart_handle_dcd_change(&up->port, status & UART_MSR_DCD);
+	/* will start/stop_tx accordingly */
+	if (status & UART_MSR_DCTS)
+		uart_handle_cts_change(&up->port, status & UART_MSR_CTS);
+
+	wake_up_interruptible(&up->port.state->port.delta_msr_wait);
+}
+
+/*
+ * This handles the interrupt from one port.
+ */
+static irqreturn_t port_irq(int irq, void *dev_id)
+{
+	struct uart_hsu_port *up = dev_id;
+	unsigned int iir, lsr;
+	unsigned long flags;
+
+	if (unlikely(!up->running))
+		return IRQ_NONE;
+
+	if (up->use_dma) {
+		lsr = serial_in(up, UART_LSR);
+		if (unlikely(lsr & (UART_LSR_BI | UART_LSR_PE |
+				       UART_LSR_FE | UART_LSR_OE)))
+			dev_warn(up->dev,
+				"Got lsr irq while using DMA, lsr = 0x%2x\n",
+				lsr);
+		check_modem_status(up);
+		return IRQ_HANDLED;
+	}
+
+	spin_lock_irqsave(&up->port.lock, flags);
+	iir = serial_in(up, UART_IIR);
+	if (iir & UART_IIR_NO_INT) {
+		spin_unlock_irqrestore(&up->port.lock, flags);
+		return IRQ_NONE;
+	}
+
+	lsr = serial_in(up, UART_LSR);
+
+	if (lsr & UART_LSR_DR)
+		receive_chars(up, &lsr);
+
+	/* lsr will be renewed during the receive_chars */
+	if (lsr & UART_LSR_THRE)
+		transmit_chars(up);
+
+	spin_unlock_irqrestore(&up->port.lock, flags);
+	return IRQ_HANDLED;
+}
+
+static inline void dma_chan_irq(struct hsu_dma_chan *chan)
+{
+	struct uart_hsu_port *up = chan->uport;
+	unsigned long flags;
+	u32 int_sts;
+
+	spin_lock_irqsave(&up->port.lock, flags);
+
+	if (!up->use_dma || !up->running)
+		goto exit;
+
+	/*
+	 * No matter what situation, need read clear the IRQ status
+	 * There is a bug, see Errata 5, HSD 2900918
+	 */
+	int_sts = chan_readl(chan, HSU_CH_SR);
+
+	/* Rx channel */
+	if (chan->dirt == DMA_FROM_DEVICE)
+		hsu_dma_rx(up, int_sts);
+
+	/* Tx channel */
+	if (chan->dirt == DMA_TO_DEVICE) {
+		/* dma for irq should be done */
+		chan_writel(chan, HSU_CH_CR, 0x0);
+		up->dma_tx_on = 0;
+		hsu_dma_tx(up);
+	}
+
+exit:
+	spin_unlock_irqrestore(&up->port.lock, flags);
+	return;
+}
+
+static irqreturn_t dma_irq(int irq, void *dev_id)
+{
+	struct hsu_port *hsu = dev_id;
+	u32 int_sts, i;
+
+	int_sts = mfd_readl(hsu, HSU_GBL_DMAISR);
+
+	/* Currently we only have 6 channels may be used */
+	for (i = 0; i < 6; i++) {
+		if (int_sts & 0x1)
+			dma_chan_irq(&hsu->chans[i]);
+		int_sts >>= 1;
+	}
+
+	return IRQ_HANDLED;
+}
+
+static unsigned int serial_hsu_tx_empty(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	unsigned long flags;
+	unsigned int ret;
+
+	spin_lock_irqsave(&up->port.lock, flags);
+	ret = serial_in(up, UART_LSR) & UART_LSR_TEMT ? TIOCSER_TEMT : 0;
+	spin_unlock_irqrestore(&up->port.lock, flags);
+
+	return ret;
+}
+
+static unsigned int serial_hsu_get_mctrl(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	unsigned char status;
+	unsigned int ret;
+
+	status = serial_in(up, UART_MSR);
+
+	ret = 0;
+	if (status & UART_MSR_DCD)
+		ret |= TIOCM_CAR;
+	if (status & UART_MSR_RI)
+		ret |= TIOCM_RNG;
+	if (status & UART_MSR_DSR)
+		ret |= TIOCM_DSR;
+	if (status & UART_MSR_CTS)
+		ret |= TIOCM_CTS;
+	return ret;
+}
+
+static void serial_hsu_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	unsigned char mcr = 0;
+
+	if (mctrl & TIOCM_RTS)
+		mcr |= UART_MCR_RTS;
+	if (mctrl & TIOCM_DTR)
+		mcr |= UART_MCR_DTR;
+	if (mctrl & TIOCM_OUT1)
+		mcr |= UART_MCR_OUT1;
+	if (mctrl & TIOCM_OUT2)
+		mcr |= UART_MCR_OUT2;
+	if (mctrl & TIOCM_LOOP)
+		mcr |= UART_MCR_LOOP;
+
+	mcr |= up->mcr;
+
+	serial_out(up, UART_MCR, mcr);
+}
+
+static void serial_hsu_break_ctl(struct uart_port *port, int break_state)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	unsigned long flags;
+
+	spin_lock_irqsave(&up->port.lock, flags);
+	if (break_state == -1)
+		up->lcr |= UART_LCR_SBC;
+	else
+		up->lcr &= ~UART_LCR_SBC;
+	serial_out(up, UART_LCR, up->lcr);
+	spin_unlock_irqrestore(&up->port.lock, flags);
+}
+
+/*
+ * What special to do:
+ * 1. chose the 64B fifo mode
+ * 2. make sure not to select half empty mode for A0 stepping
+ * 3. start dma or pio depends on configuration
+ * 4. we only allocate dma memory when needed
+ */
+static int serial_hsu_startup(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	unsigned long flags;
+
+	/*
+	 * Clear the FIFO buffers and disable them.
+	 * (they will be reenabled in set_termios())
+	 */
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
+			UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
+	serial_out(up, UART_FCR, 0);
+
+	/* Clear the interrupt registers. */
+	(void) serial_in(up, UART_LSR);
+	(void) serial_in(up, UART_RX);
+	(void) serial_in(up, UART_IIR);
+	(void) serial_in(up, UART_MSR);
+
+	/* Now, initialize the UART, default is 8n1 */
+	serial_out(up, UART_LCR, UART_LCR_WLEN8);
+
+	spin_lock_irqsave(&up->port.lock, flags);
+
+	up->port.mctrl |= TIOCM_OUT2;
+	serial_hsu_set_mctrl(&up->port, up->port.mctrl);
+
+	/*
+	 * Finally, enable interrupts.  Note: Modem status interrupts
+	 * are set via set_termios(), which will be occurring imminently
+	 * anyway, so we don't enable them here.
+	 */
+	if (!up->use_dma)
+		up->ier = UART_IER_RLSI | UART_IER_RDI | UART_IER_RTOIE;
+	else
+		up->ier = 0;
+	serial_out(up, UART_IER, up->ier);
+
+	spin_unlock_irqrestore(&up->port.lock, flags);
+
+	/* DMA init */
+	/* When use DMA, TX/RX's FIFO and IRQ should be disabled */
+	if (up->use_dma) {
+		struct hsu_dma_buffer *dbuf;
+		struct circ_buf *xmit = &port->state->xmit;
+
+		up->dma_tx_on = 0;
+
+		/* First allocate the RX buffer */
+		dbuf = &up->rxbuf;
+		dbuf->buf = kzalloc(HSU_DMA_BUF_SIZE, GFP_KERNEL);
+		if (!dbuf->buf) {
+			up->use_dma = 0;
+			goto exit;
+		}
+		dbuf->dma_addr = dma_map_single(port->dev,
+						dbuf->buf,
+						HSU_DMA_BUF_SIZE,
+						DMA_FROM_DEVICE);
+		dbuf->dma_size = HSU_DMA_BUF_SIZE;
+
+		/* Start the RX channel right now */
+		hsu_dma_start_rx_chan(up->rxc, dbuf);
+
+		/* Next init the TX DMA */
+		dbuf = &up->txbuf;
+		dbuf->buf = xmit->buf;
+		dbuf->dma_addr = dma_map_single(port->dev,
+					       dbuf->buf,
+					       UART_XMIT_SIZE,
+					       DMA_TO_DEVICE);
+		dbuf->dma_size = UART_XMIT_SIZE;
+
+		/* This should not be changed all around */
+		chan_writel(up->txc, HSU_CH_BSR, 32);
+		chan_writel(up->txc, HSU_CH_MOTSR, 4);
+		dbuf->ofs = 0;
+	}
+
+exit:
+	 /* And clear the interrupt registers again for luck. */
+	(void) serial_in(up, UART_LSR);
+	(void) serial_in(up, UART_RX);
+	(void) serial_in(up, UART_IIR);
+	(void) serial_in(up, UART_MSR);
+
+	up->running = 1;
+	return 0;
+}
+
+static void serial_hsu_shutdown(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	unsigned long flags;
+
+	/* Disable interrupts from this port */
+	up->ier = 0;
+	serial_out(up, UART_IER, 0);
+	up->running = 0;
+
+	spin_lock_irqsave(&up->port.lock, flags);
+	up->port.mctrl &= ~TIOCM_OUT2;
+	serial_hsu_set_mctrl(&up->port, up->port.mctrl);
+	spin_unlock_irqrestore(&up->port.lock, flags);
+
+	/* Disable break condition and FIFOs */
+	serial_out(up, UART_LCR, serial_in(up, UART_LCR) & ~UART_LCR_SBC);
+	serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
+				  UART_FCR_CLEAR_RCVR |
+				  UART_FCR_CLEAR_XMIT);
+	serial_out(up, UART_FCR, 0);
+}
+
+static void
+serial_hsu_set_termios(struct uart_port *port, struct ktermios *termios,
+		       struct ktermios *old)
+{
+	struct uart_hsu_port *up =
+			container_of(port, struct uart_hsu_port, port);
+	struct tty_struct *tty = port->state->port.tty;
+	unsigned char cval, fcr = 0;
+	unsigned long flags;
+	unsigned int baud, quot;
+	u32 mul = 0x3600;
+	u32 ps = 0x10;
+
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+		cval = UART_LCR_WLEN5;
+		break;
+	case CS6:
+		cval = UART_LCR_WLEN6;
+		break;
+	case CS7:
+		cval = UART_LCR_WLEN7;
+		break;
+	default:
+	case CS8:
+		cval = UART_LCR_WLEN8;
+		break;
+	}
+
+	/* CMSPAR isn't supported by this driver */
+	if (tty)
+		tty->termios->c_cflag &= ~CMSPAR;
+
+	if (termios->c_cflag & CSTOPB)
+		cval |= UART_LCR_STOP;
+	if (termios->c_cflag & PARENB)
+		cval |= UART_LCR_PARITY;
+	if (!(termios->c_cflag & PARODD))
+		cval |= UART_LCR_EPAR;
+
+	/*
+	 * For those basic low baud rate we can get the direct
+	 * scalar from 2746800, like 115200 = 2746800/24, for those
+	 * higher baud rate, we have to handle them case by case,
+	 * but DIV reg is never touched as its default value 0x3d09
+	 */
+	baud = uart_get_baud_rate(port, termios, old, 0, 4000000);
+	quot = uart_get_divisor(port, baud);
+
+	switch (baud) {
+	case 3500000:
+		mul = 0x3345;
+		ps = 0xC;
+		quot = 1;
+		break;
+	case 2500000:
+		mul = 0x2710;
+		ps = 0x10;
+		quot = 1;
+		break;
+	case 18432000:
+		mul = 0x2400;
+		ps = 0x10;
+		quot = 1;
+		break;
+	case 1500000:
+		mul = 0x1D4C;
+		ps = 0xc;
+		quot = 1;
+		break;
+	default:
+		;
+	}
+
+	if ((up->port.uartclk / quot) < (2400 * 16))
+		fcr = UART_FCR_ENABLE_FIFO | UART_FCR_HSU_64_1B;
+	else if ((up->port.uartclk / quot) < (230400 * 16))
+		fcr = UART_FCR_ENABLE_FIFO | UART_FCR_HSU_64_16B;
+	else
+		fcr = UART_FCR_ENABLE_FIFO | UART_FCR_HSU_64_32B;
+
+	fcr |= UART_FCR_HSU_64B_FIFO;
+#ifdef MFD_HSU_A0_STEPPING
+	/* A0 doesn't support half empty IRQ */
+	fcr |= UART_FCR_FULL_EMPT_TXI;
+#endif
+
+	/*
+	 * Ok, we're now changing the port state.  Do it with
+	 * interrupts disabled.
+	 */
+	spin_lock_irqsave(&up->port.lock, flags);
+
+	/* Update the per-port timeout */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	up->port.read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR;
+	if (termios->c_iflag & INPCK)
+		up->port.read_status_mask |= UART_LSR_FE | UART_LSR_PE;
+	if (termios->c_iflag & (BRKINT | PARMRK))
+		up->port.read_status_mask |= UART_LSR_BI;
+
+	/* Characters to ignore */
+	up->port.ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		up->port.ignore_status_mask |= UART_LSR_PE | UART_LSR_FE;
+	if (termios->c_iflag & IGNBRK) {
+		up->port.ignore_status_mask |= UART_LSR_BI;
+		/*
+		 * If we're ignoring parity and break indicators,
+		 * ignore overruns too (for real raw support).
+		 */
+		if (termios->c_iflag & IGNPAR)
+			up->port.ignore_status_mask |= UART_LSR_OE;
+	}
+
+	/* Ignore all characters if CREAD is not set */
+	if ((termios->c_cflag & CREAD) == 0)
+		up->port.ignore_status_mask |= UART_LSR_DR;
+
+	/*
+	 * CTS flow control flag and modem status interrupts, disable
+	 * MSI by default
+	 */
+	up->ier &= ~UART_IER_MSI;
+	if (UART_ENABLE_MS(&up->port, termios->c_cflag))
+		up->ier |= UART_IER_MSI;
+
+	serial_out(up, UART_IER, up->ier);
+
+	if (termios->c_cflag & CRTSCTS)
+		up->mcr |= UART_MCR_AFE | UART_MCR_RTS;
+	else
+		up->mcr &= ~UART_MCR_AFE;
+
+	serial_out(up, UART_LCR, cval | UART_LCR_DLAB);	/* set DLAB */
+	serial_out(up, UART_DLL, quot & 0xff);		/* LS of divisor */
+	serial_out(up, UART_DLM, quot >> 8);		/* MS of divisor */
+	serial_out(up, UART_LCR, cval);			/* reset DLAB */
+	serial_out(up, UART_MUL, mul);			/* set MUL */
+	serial_out(up, UART_PS, ps);			/* set PS */
+	up->lcr = cval;					/* Save LCR */
+	serial_hsu_set_mctrl(&up->port, up->port.mctrl);
+	serial_out(up, UART_FCR, fcr);
+	spin_unlock_irqrestore(&up->port.lock, flags);
+}
+
+static void
+serial_hsu_pm(struct uart_port *port, unsigned int state,
+	      unsigned int oldstate)
+{
+}
+
+static void serial_hsu_release_port(struct uart_port *port)
+{
+}
+
+static int serial_hsu_request_port(struct uart_port *port)
+{
+	return 0;
+}
+
+static void serial_hsu_config_port(struct uart_port *port, int flags)
+{
+#if 0
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	up->port.type = PORT_MFD;
+#endif
+}
+
+static int
+serial_hsu_verify_port(struct uart_port *port, struct serial_struct *ser)
+{
+	/* We don't want the core code to modify any port params */
+	return -EINVAL;
+}
+
+static const char *
+serial_hsu_type(struct uart_port *port)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+	return up->name;
+}
+
+/* Mainly for uart console use */
+static struct uart_hsu_port *serial_hsu_ports[3];
+static struct uart_driver serial_hsu_reg;
+
+#ifdef CONFIG_SERIAL_MFD_HSU_CONSOLE
+
+#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
+
+/* Wait for transmitter & holding register to empty */
+static inline void wait_for_xmitr(struct uart_hsu_port *up)
+{
+	unsigned int status, tmout = 1000;
+
+	/* Wait up to 1ms for the character to be sent. */
+	do {
+		status = serial_in(up, UART_LSR);
+
+		if (status & UART_LSR_BI)
+			up->lsr_break_flag = UART_LSR_BI;
+
+		if (--tmout == 0)
+			break;
+		udelay(1);
+	} while (!(status & BOTH_EMPTY));
+
+	/* Wait up to 1s for flow control if necessary */
+	if (up->port.flags & UPF_CONS_FLOW) {
+		tmout = 1000000;
+		while (--tmout &&
+		       ((serial_in(up, UART_MSR) & UART_MSR_CTS) == 0))
+			udelay(1);
+	}
+}
+
+static void serial_hsu_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_hsu_port *up =
+		container_of(port, struct uart_hsu_port, port);
+
+	wait_for_xmitr(up);
+	serial_out(up, UART_TX, ch);
+}
+
+/*
+ * Print a string to the serial port trying not to disturb
+ * any possible real use of the port...
+ *
+ *	The console_lock must be held when we get here.
+ */
+static void
+serial_hsu_console_write(struct console *co, const char *s, unsigned int count)
+{
+	struct uart_hsu_port *up = serial_hsu_ports[co->index];
+	unsigned long flags;
+	unsigned int ier;
+	int locked = 1;
+
+	local_irq_save(flags);
+	if (up->port.sysrq)
+		locked = 0;
+	else if (oops_in_progress) {
+		locked = spin_trylock(&up->port.lock);
+	} else
+		spin_lock(&up->port.lock);
+
+	/* First save the IER then disable the interrupts */
+	ier = serial_in(up, UART_IER);
+	serial_out(up, UART_IER, 0);
+
+	uart_console_write(&up->port, s, count, serial_hsu_console_putchar);
+
+	/*
+	 * Finally, wait for transmitter to become empty
+	 * and restore the IER
+	 */
+	wait_for_xmitr(up);
+	serial_out(up, UART_IER, ier);
+
+	if (locked)
+		spin_unlock(&up->port.lock);
+	local_irq_restore(flags);
+}
+
+static struct console serial_hsu_console;
+
+static int __init
+serial_hsu_console_setup(struct console *co, char *options)
+{
+	struct uart_hsu_port *up;
+	int baud = 115200;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+	int ret;
+
+	if (co->index == -1 || co->index >= serial_hsu_reg.nr)
+		co->index = 0;
+	up = serial_hsu_ports[co->index];
+	if (!up)
+		return -ENODEV;
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+	ret = uart_set_options(&up->port, co, baud, parity, bits, flow);
+
+	return ret;
+}
+
+static struct console serial_hsu_console = {
+	.name		= "ttyMFD",
+	.write		= serial_hsu_console_write,
+	.device		= uart_console_device,
+	.setup		= serial_hsu_console_setup,
+	.flags		= CON_PRINTBUFFER,
+	.index		= 2,
+	.data		= &serial_hsu_reg,
+};
+#endif
+
+struct uart_ops serial_hsu_pops = {
+	.tx_empty	= serial_hsu_tx_empty,
+	.set_mctrl	= serial_hsu_set_mctrl,
+	.get_mctrl	= serial_hsu_get_mctrl,
+	.stop_tx	= serial_hsu_stop_tx,
+	.start_tx	= serial_hsu_start_tx,
+	.stop_rx	= serial_hsu_stop_rx,
+	.enable_ms	= serial_hsu_enable_ms,
+	.break_ctl	= serial_hsu_break_ctl,
+	.startup	= serial_hsu_startup,
+	.shutdown	= serial_hsu_shutdown,
+	.set_termios	= serial_hsu_set_termios,
+	.pm		= serial_hsu_pm,
+	.type		= serial_hsu_type,
+	.release_port	= serial_hsu_release_port,
+	.request_port	= serial_hsu_request_port,
+	.config_port	= serial_hsu_config_port,
+	.verify_port	= serial_hsu_verify_port,
+};
+
+static struct uart_driver serial_hsu_reg = {
+	.owner		= THIS_MODULE,
+	.driver_name	= "MFD serial",
+	.dev_name	= "ttyMFD",
+	.major		= TTY_MAJOR,
+	.minor		= 128,
+	.nr		= 3,
+};
+
+#ifdef CONFIG_PM
+static int serial_hsu_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	struct uart_hsu_port *up;
+
+	up = pci_get_drvdata(pdev);
+	if (!up)
+		return 0;
+
+	uart_suspend_port(&serial_hsu_reg, &up->port);
+
+        return 0;
+}
+
+static int serial_hsu_resume(struct pci_dev *pdev)
+{
+	struct uart_hsu_port *up;
+
+	up = pci_get_drvdata(pdev);
+	if (!up)
+		return 0;
+	uart_resume_port(&serial_hsu_reg, &up->port);
+	return 0;
+}
+#else
+#define serial_hsu_suspend	NULL
+#define serial_hsu_resume	NULL
+#endif
+
+/* temp global pointer before we settle down on using one or four PCI dev */
+static struct hsu_port *phsu;
+
+static int serial_hsu_probe(struct pci_dev *pdev,
+				const struct pci_device_id *ent)
+{
+	struct uart_hsu_port *uport;
+	int index, ret;
+
+	printk(KERN_INFO "HSU: found PCI Serial controller(ID: %04x:%04x)\n",
+		pdev->vendor, pdev->device);
+
+	switch (pdev->device) {
+	case 0x081B:
+		index = 0;
+		break;
+	case 0x081C:
+		index = 1;
+		break;
+	case 0x081D:
+		index = 2;
+		break;
+	case 0x081E:
+		/* internal DMA controller */
+		index = 3;
+		break;
+	default:
+		dev_err(&pdev->dev, "HSU: out of index!");
+		return -ENODEV;
+	}
+
+	ret = pci_enable_device(pdev);
+	if (ret)
+		return ret;
+
+	if (index == 3) {
+		/* DMA controller */
+		ret = request_irq(pdev->irq, dma_irq, 0, "hsu_dma", phsu);
+		if (ret) {
+			dev_err(&pdev->dev, "can not get IRQ\n");
+			goto err_disable;
+		}
+		pci_set_drvdata(pdev, phsu);
+	} else {
+		/* UART port 0~2 */
+		uport = &phsu->port[index];
+		uport->port.irq = pdev->irq;
+		uport->port.dev = &pdev->dev;
+		uport->dev = &pdev->dev;
+
+		ret = request_irq(pdev->irq, port_irq, 0, uport->name, uport);
+		if (ret) {
+			dev_err(&pdev->dev, "can not get IRQ\n");
+			goto err_disable;
+		}
+		uart_add_one_port(&serial_hsu_reg, &uport->port);
+
+#ifdef CONFIG_SERIAL_MFD_HSU_CONSOLE
+		if (index == 2) {
+			register_console(&serial_hsu_console);
+			uport->port.cons = &serial_hsu_console;
+		}
+#endif
+		pci_set_drvdata(pdev, uport);
+	}
+
+	return 0;
+
+err_disable:
+	pci_disable_device(pdev);
+	return ret;
+}
+
+static void hsu_global_init(void)
+{
+	struct hsu_port *hsu;
+	struct uart_hsu_port *uport;
+	struct hsu_dma_chan *dchan;
+	int i, ret;
+
+	hsu = kzalloc(sizeof(struct hsu_port), GFP_KERNEL);
+	if (!hsu)
+		return;
+
+	/* Get basic io resource and map it */
+	hsu->paddr = 0xffa28000;
+	hsu->iolen = 0x1000;
+
+	if (!(request_mem_region(hsu->paddr, hsu->iolen, "HSU global")))
+		pr_warning("HSU: error in request mem region\n");
+
+	hsu->reg = ioremap_nocache((unsigned long)hsu->paddr, hsu->iolen);
+	if (!hsu->reg) {
+		pr_err("HSU: error in ioremap\n");
+		ret = -ENOMEM;
+		goto err_free_region;
+	}
+
+	/* Initialise the 3 UART ports */
+	uport = hsu->port;
+	for (i = 0; i < 3; i++) {
+		uport->port.type = PORT_MFD;
+		uport->port.iotype = UPIO_MEM;
+		uport->port.mapbase = (resource_size_t)hsu->paddr
+					+ HSU_PORT_REG_OFFSET
+					+ i * HSU_PORT_REG_LENGTH;
+		uport->port.membase = hsu->reg + HSU_PORT_REG_OFFSET
+					+ i * HSU_PORT_REG_LENGTH;
+
+		sprintf(uport->name, "hsu_port%d", i);
+		uport->port.fifosize = 64;
+		uport->port.ops = &serial_hsu_pops;
+		uport->port.line = i;
+		uport->port.flags = UPF_IOREMAP;
+		/* make the maxim support rate to 2746800 bps */
+		uport->port.uartclk = 115200 * 24 * 16;
+
+		uport->running = 0;
+		uport->txc = &hsu->chans[i * 2];
+		uport->rxc = &hsu->chans[i * 2 + 1];
+
+		serial_hsu_ports[i] = uport;
+		uport->index = i;
+		uport++;
+	}
+
+	/* Initialise 6 dma channels */
+	dchan = hsu->chans;
+	for (i = 0; i < 6; i++) {
+		dchan->id = i;
+		dchan->dirt = (i & 0x1) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+		dchan->uport = &hsu->port[i/2];
+		dchan->reg = hsu->reg + HSU_DMA_CHANS_REG_OFFSET +
+				i * HSU_DMA_CHANS_REG_LENGTH;
+		dchan++;
+	}
+
+	phsu = hsu;
+
+	hsu_debugfs_init(hsu);
+	return;
+
+err_free_region:
+	release_mem_region(hsu->paddr, hsu->iolen);
+	kfree(hsu);
+	return;
+}
+
+static void serial_hsu_remove(struct pci_dev *pdev)
+{
+	struct hsu_port *hsu;
+	int i;
+
+	hsu = pci_get_drvdata(pdev);
+	if (!hsu)
+		return;
+
+	for (i = 0; i < 3; i++)
+		uart_remove_one_port(&serial_hsu_reg, &hsu->port[i].port);
+
+	pci_set_drvdata(pdev, NULL);
+	free_irq(hsu->irq, hsu);
+	pci_disable_device(pdev);
+}
+
+/* First 3 are UART ports, and the 4th is the DMA */
+static const struct pci_device_id pci_ids[] __devinitdata = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x081B) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x081C) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x081D) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x081E) },
+	{},
+};
+
+static struct pci_driver hsu_pci_driver = {
+	.name =		"HSU serial",
+	.id_table =	pci_ids,
+	.probe =	serial_hsu_probe,
+	.remove =	__devexit_p(serial_hsu_remove),
+	.suspend =	serial_hsu_suspend,
+	.resume	=	serial_hsu_resume,
+};
+
+static int __init hsu_pci_init(void)
+{
+	int ret;
+
+	hsu_global_init();
+
+	ret = uart_register_driver(&serial_hsu_reg);
+	if (ret)
+		return ret;
+
+	return pci_register_driver(&hsu_pci_driver);
+}
+
+static void __exit hsu_pci_exit(void)
+{
+	pci_unregister_driver(&hsu_pci_driver);
+	uart_unregister_driver(&serial_hsu_reg);
+
+	hsu_debugfs_remove(phsu);
+
+	kfree(phsu);
+}
+
+module_init(hsu_pci_init);
+module_exit(hsu_pci_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:medfield-hsu");
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 9ddc866ccc09..f8fce351463d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -189,6 +189,8 @@
 /* MAX3107 */
 #define PORT_MAX3107	94
 
+/* High Speed UART for Medfield */
+#define PORT_MFD	95
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/serial_mfd.h b/include/linux/serial_mfd.h
new file mode 100644
index 000000000000..2b071e0b034d
--- /dev/null
+++ b/include/linux/serial_mfd.h
@@ -0,0 +1,47 @@
+#ifndef _SERIAL_MFD_H_
+#define _SERIAL_MFD_H_
+
+/* HW register offset definition */
+#define UART_FOR	0x08
+#define UART_PS		0x0C
+#define UART_MUL	0x0D
+#define UART_DIV	0x0E
+
+#define HSU_GBL_IEN	0x0
+#define HSU_GBL_IST	0x4
+
+#define HSU_GBL_INT_BIT_PORT0	0x0
+#define HSU_GBL_INT_BIT_PORT1	0x1
+#define HSU_GBL_INT_BIT_PORT2	0x2
+#define HSU_GBL_INT_BIT_IRI	0x3
+#define HSU_GBL_INT_BIT_HDLC	0x4
+#define HSU_GBL_INT_BIT_DMA	0x5
+
+#define HSU_GBL_ISR	0x8
+#define HSU_GBL_DMASR	0x400
+#define HSU_GBL_DMAISR	0x404
+
+#define HSU_PORT_REG_OFFSET	0x80
+#define HSU_PORT0_REG_OFFSET	0x80
+#define HSU_PORT1_REG_OFFSET	0x100
+#define HSU_PORT2_REG_OFFSET	0x180
+#define HSU_PORT_REG_LENGTH	0x80
+
+#define HSU_DMA_CHANS_REG_OFFSET	0x500
+#define HSU_DMA_CHANS_REG_LENGTH	0x40
+
+#define HSU_CH_SR		0x0	/* channel status reg */
+#define HSU_CH_CR		0x4	/* control reg */
+#define HSU_CH_DCR		0x8	/* descriptor control reg */
+#define HSU_CH_BSR		0x10	/* max fifo buffer size reg */
+#define HSU_CH_MOTSR		0x14	/* minimum ocp transfer size */
+#define HSU_CH_D0SAR		0x20	/* desc 0 start addr */
+#define HSU_CH_D0TSR		0x24	/* desc 0 transfer size */
+#define HSU_CH_D1SAR		0x28
+#define HSU_CH_D1TSR		0x2C
+#define HSU_CH_D2SAR		0x30
+#define HSU_CH_D2TSR		0x34
+#define HSU_CH_D3SAR		0x38
+#define HSU_CH_D3TSR		0x3C
+
+#endif
diff --git a/include/linux/serial_reg.h b/include/linux/serial_reg.h
index cf9327c051ad..c7a0ce11cd47 100644
--- a/include/linux/serial_reg.h
+++ b/include/linux/serial_reg.h
@@ -221,8 +221,24 @@
 #define UART_FCR_PXAR16	0x80	/* receive FIFO threshold = 16 */
 #define UART_FCR_PXAR32	0xc0	/* receive FIFO threshold = 32 */
 
+/*
+ * Intel MID on-chip HSU (High Speed UART) defined bits
+ */
+#define UART_FCR_HSU_64_1B	0x00	/* receive FIFO treshold = 1 */
+#define UART_FCR_HSU_64_16B	0x40	/* receive FIFO treshold = 16 */
+#define UART_FCR_HSU_64_32B	0x80	/* receive FIFO treshold = 32 */
+#define UART_FCR_HSU_64_56B	0xc0	/* receive FIFO treshold = 56 */
+
+#define UART_FCR_HSU_16_1B	0x00	/* receive FIFO treshold = 1 */
+#define UART_FCR_HSU_16_4B	0x40	/* receive FIFO treshold = 4 */
+#define UART_FCR_HSU_16_8B	0x80	/* receive FIFO treshold = 8 */
+#define UART_FCR_HSU_16_14B	0xc0	/* receive FIFO treshold = 14 */
 
+#define UART_FCR_HSU_64B_FIFO	0x20	/* chose 64 bytes FIFO */
+#define UART_FCR_HSU_16B_FIFO	0x00	/* chose 16 bytes FIFO */
 
+#define UART_FCR_HALF_EMPT_TXI	0x00	/* trigger TX_EMPT IRQ for half empty */
+#define UART_FCR_FULL_EMPT_TXI	0x08	/* trigger TX_EMPT IRQ for full empty */
 
 /*
  * These register definitions are for the 16C950
-- 
cgit v1.2.3-59-g8ed1b


From 235dae5d094c415fcf0fc79fa637f1901bc8afe2 Mon Sep 17 00:00:00 2001
From: Philippe Langlais <philippe.langlais@stericsson.com>
Date: Thu, 29 Jul 2010 17:13:57 +0200
Subject: U6715 16550A serial driver support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

UART Features extract from STEricsson U6715 data-sheet (arm926 SoC for mobile phone):
* Fully compatible with industry standard 16C550 and 16C450 from various
manufacturers
* RX and TX 64 byte FIFO reduces CPU interrupts
* Full double buffering
* Modem control signals include CTS, RTS, (and DSR, DTR on UART1 only)
* Automatic baud rate selection
* Manual or automatic RTS/CTS smart hardware flow control
* Programmable serial characteristics:
– Baud rate generation (50 to 3.25M baud)
– 5, 6, 7 or 8-bit characters
– Even, odd or no-parity bit generation and detection
– 1, 1.5 or 2 stop bit generation
* Independent control of transmit, receive, line status, data set interrupts and FIFOs
* Full status-reporting capabilities
* Separate DMA signaling for RX and TX
* Timed interrupt to spread receive interrupt on known duration
* DMA time-out interrupt to allow detection of end of reception
* Carkit pulse coding and decoding compliant with USB carkit control interface [40]

In 16550A auto-configuration, if the fifo size is 64 then it's an U6 16550A port
Add set_termios hook & export serial8250_do_set_termios to change uart
clock following baudrate

Signed-off-by: Philippe Langlais <philippe.langlais@stericsson.com>
Acked-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/8250.c       | 37 ++++++++++++++++++++++++++++++++++---
 include/linux/serial.h      |  3 ++-
 include/linux/serial_8250.h |  5 +++++
 include/linux/serial_core.h |  3 +++
 4 files changed, 44 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 355148dc085e..24110f6f61e0 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -300,6 +300,13 @@ static const struct serial8250_config uart_config[] = {
 		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00,
 		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
 	},
+	[PORT_U6_16550A] = {
+		.name		= "U6_16550A",
+		.fifo_size	= 64,
+		.tx_loadsz	= 64,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
+	},
 };
 
 #if defined(CONFIG_MIPS_ALCHEMY)
@@ -1070,6 +1077,15 @@ static void autoconfig_16550a(struct uart_8250_port *up)
 		DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 ");
 	}
 	serial_outp(up, UART_IER, iersave);
+
+	/*
+	 * We distinguish between 16550A and U6 16550A by counting
+	 * how many bytes are in the FIFO.
+	 */
+	if (up->port.type == PORT_16550A && size_fifo(up) == 64) {
+		up->port.type = PORT_U6_16550A;
+		up->capabilities |= UART_CAP_AFE;
+	}
 }
 
 /*
@@ -2224,9 +2240,9 @@ static unsigned int serial8250_get_divisor(struct uart_port *port, unsigned int
 	return quot;
 }
 
-static void
-serial8250_set_termios(struct uart_port *port, struct ktermios *termios,
-		       struct ktermios *old)
+void
+serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
+		          struct ktermios *old)
 {
 	struct uart_8250_port *up = (struct uart_8250_port *)port;
 	unsigned char cval, fcr = 0;
@@ -2402,6 +2418,17 @@ serial8250_set_termios(struct uart_port *port, struct ktermios *termios,
 	if (tty_termios_baud_rate(termios))
 		tty_termios_encode_baud_rate(termios, baud, baud);
 }
+EXPORT_SYMBOL(serial8250_do_set_termios);
+
+static void
+serial8250_set_termios(struct uart_port *port, struct ktermios *termios,
+		       struct ktermios *old)
+{
+	if (port->set_termios)
+		port->set_termios(port, termios, old);
+	else
+		serial8250_do_set_termios(port, termios, old);
+}
 
 static void
 serial8250_set_ldisc(struct uart_port *port, int new)
@@ -2982,6 +3009,7 @@ static int __devinit serial8250_probe(struct platform_device *dev)
 		port.type		= p->type;
 		port.serial_in		= p->serial_in;
 		port.serial_out		= p->serial_out;
+		port.set_termios	= p->set_termios;
 		port.dev		= &dev->dev;
 		port.irqflags		|= irqflag;
 		ret = serial8250_register_port(&port);
@@ -3145,6 +3173,9 @@ int serial8250_register_port(struct uart_port *port)
 			uart->port.serial_in = port->serial_in;
 		if (port->serial_out)
 			uart->port.serial_out = port->serial_out;
+		/*  Possibly override set_termios call */
+		if (port->set_termios)
+			uart->port.set_termios = port->set_termios;
 
 		ret = uart_add_one_port(&serial8250_reg, &uart->port);
 		if (ret == 0)
diff --git a/include/linux/serial.h b/include/linux/serial.h
index ef914061511e..1ebc694a6d52 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -77,7 +77,8 @@ struct serial_struct {
 #define PORT_16654	11
 #define PORT_16850	12
 #define PORT_RSA	13	/* RSA-DV II/S card */
-#define PORT_MAX	13
+#define PORT_U6_16550A	14
+#define PORT_MAX	14
 
 #define SERIAL_IO_PORT	0
 #define SERIAL_IO_HUB6	1
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index fb46aba11fb5..7638deaaba65 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -32,6 +32,9 @@ struct plat_serial8250_port {
 	unsigned int	type;		/* If UPF_FIXED_TYPE */
 	unsigned int	(*serial_in)(struct uart_port *, int);
 	void		(*serial_out)(struct uart_port *, int, int);
+	void		(*set_termios)(struct uart_port *,
+			               struct ktermios *new,
+			               struct ktermios *old);
 };
 
 /*
@@ -71,5 +74,7 @@ extern int early_serial_setup(struct uart_port *port);
 extern int serial8250_find_port(struct uart_port *p);
 extern int serial8250_find_port_for_earlycon(void);
 extern int setup_early_serial8250_console(char *cmdline);
+extern void serial8250_do_set_termios(struct uart_port *port,
+		struct ktermios *termios, struct ktermios *old);
 
 #endif
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index f8fce351463d..8129ca2d57e3 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -282,6 +282,9 @@ struct uart_port {
 	unsigned char __iomem	*membase;		/* read/write[bwl] */
 	unsigned int		(*serial_in)(struct uart_port *, int);
 	void			(*serial_out)(struct uart_port *, int, int);
+	void			(*set_termios)(struct uart_port *,
+				               struct ktermios *new,
+				               struct ktermios *old);
 	unsigned int		irq;			/* irq number */
 	unsigned long		irqflags;		/* irq flags  */
 	unsigned int		uartclk;		/* base uart clock */
-- 
cgit v1.2.3-59-g8ed1b


From 6d88e6792574497bfac9a81403cc47712040636f Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 9 Jun 2010 17:34:17 -0400
Subject: USB: don't stop root-hub status polls too soon

This patch (as1390) fixes a problem that crops up when a UHCI host
controller is unbound from uhci-hcd while there are still some active
URBs.  The URBs have to be unlinked when the root hub is unregistered,
and uhci-hcd relies upon root-hub status polls as part of its
unlinking procedure.  But usb_hcd_poll_rh_status() won't make those
status calls if hcd->rh_registered is clear, and the flag is cleared
_before_ the unregistration takes place.

Since hcd->rh_registered is used for other things and needs to be
cleared early, the solution is to add a new flag (rh_pollable) and use
it instead.  It gets cleared _after_ the root hub is unregistered.

Now that the status polls don't end too soon, we have to make sure
they also don't occur too late -- after the root hub's usb_device
structure or the HCD's private structures are deallocated.  Therefore
the patch adds usb_get_device() and usb_put_device() calls to protect
the root hub structure, and it adds an extra del_timer_sync() to
prevent the root-hub timer from causing an unexpected status poll.

This additional complexity would not be needed if the HCD framework
had provided separate stop() and release() callbacks instead of just
stop().  This lack could be fixed at some future time (although it
would require changes to every host controller driver); when that
happens this patch won't be needed any more.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/hcd.c  | 32 +++++++++++++++++++++++++-------
 include/linux/usb/hcd.h |  1 +
 2 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index caae4625a1f1..53f14c82ff2e 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -667,7 +667,7 @@ void usb_hcd_poll_rh_status(struct usb_hcd *hcd)
 	unsigned long	flags;
 	char		buffer[6];	/* Any root hubs with > 31 ports? */
 
-	if (unlikely(!hcd->rh_registered))
+	if (unlikely(!hcd->rh_pollable))
 		return;
 	if (!hcd->uses_new_polling && !hcd->status_urb)
 		return;
@@ -2217,6 +2217,7 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		retval = -ENOMEM;
 		goto err_allocate_root_hub;
 	}
+	hcd->self.root_hub = rhdev;
 
 	switch (hcd->driver->flags & HCD_MASK) {
 	case HCD_USB11:
@@ -2231,7 +2232,6 @@ int usb_add_hcd(struct usb_hcd *hcd,
 	default:
 		goto err_set_rh_speed;
 	}
-	hcd->self.root_hub = rhdev;
 
 	/* wakeup flag init defaults to "everything works" for root hubs,
 	 * but drivers can override it in reset() if needed, along with
@@ -2246,6 +2246,7 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		dev_err(hcd->self.controller, "can't setup\n");
 		goto err_hcd_driver_setup;
 	}
+	hcd->rh_pollable = 1;
 
 	/* NOTE: root hub and controller capabilities may not be the same */
 	if (device_can_wakeup(hcd->self.controller)
@@ -2315,9 +2316,12 @@ error_create_attr_group:
 	cancel_work_sync(&hcd->wakeup_work);
 #endif
 	mutex_lock(&usb_bus_list_lock);
-	usb_disconnect(&hcd->self.root_hub);
+	usb_disconnect(&rhdev);		/* Sets rhdev to NULL */
 	mutex_unlock(&usb_bus_list_lock);
 err_register_root_hub:
+	hcd->rh_pollable = 0;
+	hcd->poll_rh = 0;
+	del_timer_sync(&hcd->rh_timer);
 	hcd->driver->stop(hcd);
 	hcd->state = HC_STATE_HALT;
 	hcd->poll_rh = 0;
@@ -2328,8 +2332,7 @@ err_hcd_driver_start:
 err_request_irq:
 err_hcd_driver_setup:
 err_set_rh_speed:
-	hcd->self.root_hub = NULL;
-	usb_put_dev(rhdev);
+	usb_put_dev(hcd->self.root_hub);
 err_allocate_root_hub:
 	usb_deregister_bus(&hcd->self);
 err_register_bus:
@@ -2348,9 +2351,12 @@ EXPORT_SYMBOL_GPL(usb_add_hcd);
  */
 void usb_remove_hcd(struct usb_hcd *hcd)
 {
+	struct usb_device *rhdev = hcd->self.root_hub;
+
 	dev_info(hcd->self.controller, "remove, state %x\n", hcd->state);
 
-	sysfs_remove_group(&hcd->self.root_hub->dev.kobj, &usb_bus_attr_group);
+	usb_get_dev(rhdev);
+	sysfs_remove_group(&rhdev->dev.kobj, &usb_bus_attr_group);
 
 	if (HC_IS_RUNNING (hcd->state))
 		hcd->state = HC_STATE_QUIESCING;
@@ -2365,17 +2371,29 @@ void usb_remove_hcd(struct usb_hcd *hcd)
 #endif
 
 	mutex_lock(&usb_bus_list_lock);
-	usb_disconnect(&hcd->self.root_hub);
+	usb_disconnect(&rhdev);		/* Sets rhdev to NULL */
 	mutex_unlock(&usb_bus_list_lock);
 
+	/* Prevent any more root-hub status calls from the timer.
+	 * The HCD might still restart the timer (if a port status change
+	 * interrupt occurs), but usb_hcd_poll_rh_status() won't invoke
+	 * the hub_status_data() callback.
+	 */
+	hcd->rh_pollable = 0;
+	hcd->poll_rh = 0;
+	del_timer_sync(&hcd->rh_timer);
+
 	hcd->driver->stop(hcd);
 	hcd->state = HC_STATE_HALT;
 
+	/* In case the HCD restarted the timer, stop it again. */
 	hcd->poll_rh = 0;
 	del_timer_sync(&hcd->rh_timer);
 
 	if (hcd->irq >= 0)
 		free_irq(hcd->irq, hcd);
+
+	usb_put_dev(hcd->self.root_hub);
 	usb_deregister_bus(&hcd->self);
 	hcd_buffer_destroy(hcd);
 }
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 2e3a4ea1a3da..11b638195901 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -95,6 +95,7 @@ struct usb_hcd {
 #define HCD_FLAG_SAW_IRQ	0x00000002
 
 	unsigned		rh_registered:1;/* is root hub registered? */
+	unsigned		rh_pollable:1;	/* may we poll the root hub? */
 
 	/* The next flag is a stopgap, to be removed when all the HCDs
 	 * support the new root-hub polling mechanism. */
-- 
cgit v1.2.3-59-g8ed1b


From 6e1c3b467ffd9d6eb725dda544f6fd10e471ea71 Mon Sep 17 00:00:00 2001
From: Igor Grinberg <grinberg@compulab.co.il>
Date: Thu, 27 May 2010 09:32:13 +0300
Subject: USB: otg.h: Fix the mixup in parameters order.

otg_io_write() function does not follow the declaration of
struct otg_io_access_ops.

Signed-off-by: Igor Grinberg <grinberg@compulab.co.il>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/otg.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h
index f8302d036a76..54b2c5e48b9d 100644
--- a/include/linux/usb/otg.h
+++ b/include/linux/usb/otg.h
@@ -146,10 +146,10 @@ static inline int otg_io_read(struct otg_transceiver *otg, u32 reg)
 	return -EINVAL;
 }
 
-static inline int otg_io_write(struct otg_transceiver *otg, u32 reg, u32 val)
+static inline int otg_io_write(struct otg_transceiver *otg, u32 val, u32 reg)
 {
 	if (otg->io_ops && otg->io_ops->write)
-		return otg->io_ops->write(otg, reg, val);
+		return otg->io_ops->write(otg, val, reg);
 
 	return -EINVAL;
 }
-- 
cgit v1.2.3-59-g8ed1b


From aa4d8342988d0c1a79ff19b2ede1e81dfbb16ea5 Mon Sep 17 00:00:00 2001
From: Alek Du <alek.du@intel.com>
Date: Fri, 4 Jun 2010 15:47:54 +0800
Subject: USB: EHCI: EHCI 1.1 addendum: preparation

EHCI 1.1 addendum introduced several energy efficiency extensions for
EHCI USB host controllers:
1. LPM (link power management)
2. Per-port change
3. Shorter periodic frame list
4. Hardware prefetching

This patch is intended to define the HW bits and debug interface for
EHCI 1.1 addendum. The LPM and Per-port change patches will be sent out
after this patch.

Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com>
Signed-off-by: Alek Du <alek.du@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/host/ehci-dbg.c  | 144 ++++++++++++++++++++++++++++++++++++++++---
 drivers/usb/host/ehci-hcd.c  |   1 +
 drivers/usb/host/ehci.h      |   1 +
 include/linux/usb/ehci_def.h |  23 +++++++
 4 files changed, 162 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c
index 874d2000bf92..df5546bb8367 100644
--- a/drivers/usb/host/ehci-dbg.c
+++ b/drivers/usb/host/ehci-dbg.c
@@ -98,13 +98,18 @@ static void dbg_hcc_params (struct ehci_hcd *ehci, char *label)
 			HCC_64BIT_ADDR(params) ? " 64 bit addr" : "");
 	} else {
 		ehci_dbg (ehci,
-			"%s hcc_params %04x thresh %d uframes %s%s%s\n",
+			"%s hcc_params %04x thresh %d uframes %s%s%s%s%s%s%s\n",
 			label,
 			params,
 			HCC_ISOC_THRES(params),
 			HCC_PGM_FRAMELISTLEN(params) ? "256/512/1024" : "1024",
 			HCC_CANPARK(params) ? " park" : "",
-			HCC_64BIT_ADDR(params) ? " 64 bit addr" : "");
+			HCC_64BIT_ADDR(params) ? " 64 bit addr" : "",
+			HCC_LPM(params) ? " LPM" : "",
+			HCC_PER_PORT_CHANGE_EVENT(params) ? " ppce" : "",
+			HCC_HW_PREFETCH(params) ? " hw prefetch" : "",
+			HCC_32FRAME_PERIODIC_LIST(params) ?
+				" 32 peridic list" : "");
 	}
 }
 #else
@@ -191,8 +196,9 @@ static int __maybe_unused
 dbg_status_buf (char *buf, unsigned len, const char *label, u32 status)
 {
 	return scnprintf (buf, len,
-		"%s%sstatus %04x%s%s%s%s%s%s%s%s%s%s",
+		"%s%sstatus %04x%s%s%s%s%s%s%s%s%s%s%s",
 		label, label [0] ? " " : "", status,
+		(status & STS_PPCE_MASK) ? " PPCE" : "",
 		(status & STS_ASS) ? " Async" : "",
 		(status & STS_PSS) ? " Periodic" : "",
 		(status & STS_RECL) ? " Recl" : "",
@@ -210,8 +216,9 @@ static int __maybe_unused
 dbg_intr_buf (char *buf, unsigned len, const char *label, u32 enable)
 {
 	return scnprintf (buf, len,
-		"%s%sintrenable %02x%s%s%s%s%s%s",
+		"%s%sintrenable %02x%s%s%s%s%s%s%s",
 		label, label [0] ? " " : "", enable,
+		(enable & STS_PPCE_MASK) ? " PPCE" : "",
 		(enable & STS_IAA) ? " IAA" : "",
 		(enable & STS_FATAL) ? " FATAL" : "",
 		(enable & STS_FLR) ? " FLR" : "",
@@ -228,9 +235,15 @@ static int
 dbg_command_buf (char *buf, unsigned len, const char *label, u32 command)
 {
 	return scnprintf (buf, len,
-		"%s%scommand %06x %s=%d ithresh=%d%s%s%s%s period=%s%s %s",
+		"%s%scommand %07x %s%s%s%s%s%s=%d ithresh=%d%s%s%s%s "
+		"period=%s%s %s",
 		label, label [0] ? " " : "", command,
-		(command & CMD_PARK) ? "park" : "(park)",
+		(command & CMD_HIRD) ? " HIRD" : "",
+		(command & CMD_PPCEE) ? " PPCEE" : "",
+		(command & CMD_FSP) ? " FSP" : "",
+		(command & CMD_ASPE) ? " ASPE" : "",
+		(command & CMD_PSPE) ? " PSPE" : "",
+		(command & CMD_PARK) ? " park" : "(park)",
 		CMD_PARK_CNT (command),
 		(command >> 16) & 0x3f,
 		(command & CMD_LRESET) ? " LReset" : "",
@@ -257,11 +270,22 @@ dbg_port_buf (char *buf, unsigned len, const char *label, int port, u32 status)
 	}
 
 	return scnprintf (buf, len,
-		"%s%sport %d status %06x%s%s sig=%s%s%s%s%s%s%s%s%s%s",
+		"%s%sport:%d status %06x %d %s%s%s%s%s%s "
+		"sig=%s%s%s%s%s%s%s%s%s%s%s",
 		label, label [0] ? " " : "", port, status,
+		status>>25,/*device address */
+		(status & PORT_SSTS)>>23 == PORTSC_SUSPEND_STS_ACK ?
+						" ACK" : "",
+		(status & PORT_SSTS)>>23 == PORTSC_SUSPEND_STS_NYET ?
+						" NYET" : "",
+		(status & PORT_SSTS)>>23 == PORTSC_SUSPEND_STS_STALL ?
+						" STALL" : "",
+		(status & PORT_SSTS)>>23 == PORTSC_SUSPEND_STS_ERR ?
+						" ERR" : "",
 		(status & PORT_POWER) ? " POWER" : "",
 		(status & PORT_OWNER) ? " OWNER" : "",
 		sig,
+		(status & PORT_LPM) ? " LPM" : "",
 		(status & PORT_RESET) ? " RESET" : "",
 		(status & PORT_SUSPEND) ? " SUSPEND" : "",
 		(status & PORT_RESUME) ? " RESUME" : "",
@@ -330,6 +354,13 @@ static int debug_async_open(struct inode *, struct file *);
 static int debug_periodic_open(struct inode *, struct file *);
 static int debug_registers_open(struct inode *, struct file *);
 static int debug_async_open(struct inode *, struct file *);
+static int debug_lpm_open(struct inode *, struct file *);
+static ssize_t debug_lpm_read(struct file *file, char __user *user_buf,
+				   size_t count, loff_t *ppos);
+static ssize_t debug_lpm_write(struct file *file, const char __user *buffer,
+			      size_t count, loff_t *ppos);
+static int debug_lpm_close(struct inode *inode, struct file *file);
+
 static ssize_t debug_output(struct file*, char __user*, size_t, loff_t*);
 static int debug_close(struct inode *, struct file *);
 
@@ -351,6 +382,13 @@ static const struct file_operations debug_registers_fops = {
 	.read		= debug_output,
 	.release	= debug_close,
 };
+static const struct file_operations debug_lpm_fops = {
+	.owner		= THIS_MODULE,
+	.open		= debug_lpm_open,
+	.read		= debug_lpm_read,
+	.write		= debug_lpm_write,
+	.release	= debug_lpm_close,
+};
 
 static struct dentry *ehci_debug_root;
 
@@ -917,6 +955,94 @@ static int debug_registers_open(struct inode *inode, struct file *file)
 	return file->private_data ? 0 : -ENOMEM;
 }
 
+static int debug_lpm_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+static int debug_lpm_close(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static ssize_t debug_lpm_read(struct file *file, char __user *user_buf,
+				   size_t count, loff_t *ppos)
+{
+	/* TODO: show lpm stats */
+	return 0;
+}
+
+static ssize_t debug_lpm_write(struct file *file, const char __user *user_buf,
+			      size_t count, loff_t *ppos)
+{
+	struct usb_hcd		*hcd;
+	struct ehci_hcd		*ehci;
+	char buf[50];
+	size_t len;
+	u32 temp;
+	unsigned long port;
+	u32 __iomem	*portsc ;
+	u32 params;
+
+	hcd = bus_to_hcd(file->private_data);
+	ehci = hcd_to_ehci(hcd);
+
+	len = min(count, sizeof(buf) - 1);
+	if (copy_from_user(buf, user_buf, len))
+		return -EFAULT;
+	buf[len] = '\0';
+	if (len > 0 && buf[len - 1] == '\n')
+		buf[len - 1] = '\0';
+
+	if (strncmp(buf, "enable", 5) == 0) {
+		if (strict_strtoul(buf + 7, 10, &port))
+			return -EINVAL;
+		params = ehci_readl(ehci, &ehci->caps->hcs_params);
+		if (port > HCS_N_PORTS(params)) {
+			ehci_dbg(ehci, "ERR: LPM on bad port %lu\n", port);
+			return -ENODEV;
+		}
+		portsc = &ehci->regs->port_status[port-1];
+		temp = ehci_readl(ehci, portsc);
+		if (!(temp & PORT_DEV_ADDR)) {
+			ehci_dbg(ehci, "LPM: no device attached\n");
+			return -ENODEV;
+		}
+		temp |= PORT_LPM;
+		ehci_writel(ehci, temp, portsc);
+		printk(KERN_INFO "force enable LPM for port %lu\n", port);
+	} else if (strncmp(buf, "hird=", 5) == 0) {
+		unsigned long hird;
+		if (strict_strtoul(buf + 5, 16, &hird))
+			return -EINVAL;
+		printk(KERN_INFO "setting hird %s %lu\n", buf + 6, hird);
+		temp = ehci_readl(ehci, &ehci->regs->command);
+		temp &= ~CMD_HIRD;
+		temp |= hird << 24;
+		ehci_writel(ehci, temp, &ehci->regs->command);
+	} else if (strncmp(buf, "disable", 7) == 0) {
+		if (strict_strtoul(buf + 8, 10, &port))
+			return -EINVAL;
+		params = ehci_readl(ehci, &ehci->caps->hcs_params);
+		if (port > HCS_N_PORTS(params)) {
+			ehci_dbg(ehci, "ERR: LPM off bad port %lu\n", port);
+			return -ENODEV;
+		}
+		portsc = &ehci->regs->port_status[port-1];
+		temp = ehci_readl(ehci, portsc);
+		if (!(temp & PORT_DEV_ADDR)) {
+			ehci_dbg(ehci, "ERR: no device attached\n");
+			return -ENODEV;
+		}
+		temp &= ~PORT_LPM;
+		ehci_writel(ehci, temp, portsc);
+		printk(KERN_INFO "disabled LPM for port %lu\n", port);
+	} else
+		return -EOPNOTSUPP;
+	return count;
+}
+
 static inline void create_debug_files (struct ehci_hcd *ehci)
 {
 	struct usb_bus *bus = &ehci_to_hcd(ehci)->self;
@@ -940,6 +1066,10 @@ static inline void create_debug_files (struct ehci_hcd *ehci)
 	ehci->debug_registers = debugfs_create_file("registers", S_IRUGO,
 						    ehci->debug_dir, bus,
 						    &debug_registers_fops);
+
+	ehci->debug_registers = debugfs_create_file("lpm", S_IRUGO|S_IWUGO,
+						    ehci->debug_dir, bus,
+						    &debug_lpm_fops);
 	if (!ehci->debug_registers)
 		goto registers_error;
 	return;
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index a3ef2a9d9dc2..20ca6a9ff213 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -36,6 +36,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/debugfs.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/byteorder.h>
 #include <asm/io.h>
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index 650a687f2854..bfaac1646365 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -157,6 +157,7 @@ struct ehci_hcd {			/* one per controller */
 	struct dentry		*debug_async;
 	struct dentry		*debug_periodic;
 	struct dentry		*debug_registers;
+	struct dentry		*debug_lpm;
 #endif
 };
 
diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h
index 80287af2a738..2e262cb15425 100644
--- a/include/linux/usb/ehci_def.h
+++ b/include/linux/usb/ehci_def.h
@@ -39,6 +39,12 @@ struct ehci_caps {
 #define HCS_N_PORTS(p)		(((p)>>0)&0xf)	/* bits 3:0, ports on HC */
 
 	u32		hcc_params;      /* HCCPARAMS - offset 0x8 */
+/* EHCI 1.1 addendum */
+#define HCC_32FRAME_PERIODIC_LIST(p)	((p)&(1 << 19))
+#define HCC_PER_PORT_CHANGE_EVENT(p)	((p)&(1 << 18))
+#define HCC_LPM(p)			((p)&(1 << 17))
+#define HCC_HW_PREFETCH(p)		((p)&(1 << 16))
+
 #define HCC_EXT_CAPS(p)		(((p)>>8)&0xff)	/* for pci extended caps */
 #define HCC_ISOC_CACHE(p)       ((p)&(1 << 7))  /* true: can cache isoc frame */
 #define HCC_ISOC_THRES(p)       (((p)>>4)&0x7)  /* bits 6:4, uframes cached */
@@ -54,6 +60,13 @@ struct ehci_regs {
 
 	/* USBCMD: offset 0x00 */
 	u32		command;
+
+/* EHCI 1.1 addendum */
+#define CMD_HIRD	(0xf<<24)	/* host initiated resume duration */
+#define CMD_PPCEE	(1<<15)		/* per port change event enable */
+#define CMD_FSP		(1<<14)		/* fully synchronized prefetch */
+#define CMD_ASPE	(1<<13)		/* async schedule prefetch enable */
+#define CMD_PSPE	(1<<12)		/* periodic schedule prefetch enable */
 /* 23:16 is r/w intr rate, in microframes; default "8" == 1/msec */
 #define CMD_PARK	(1<<11)		/* enable "park" on async qh */
 #define CMD_PARK_CNT(c)	(((c)>>8)&3)	/* how many transfers to park for */
@@ -67,6 +80,7 @@ struct ehci_regs {
 
 	/* USBSTS: offset 0x04 */
 	u32		status;
+#define STS_PPCE_MASK	(0xff<<16)	/* Per-Port change event 1-16 */
 #define STS_ASS		(1<<15)		/* Async Schedule Status */
 #define STS_PSS		(1<<14)		/* Periodic Schedule Status */
 #define STS_RECL	(1<<13)		/* Reclamation */
@@ -100,6 +114,14 @@ struct ehci_regs {
 
 	/* PORTSC: offset 0x44 */
 	u32		port_status[0];	/* up to N_PORTS */
+/* EHCI 1.1 addendum */
+#define PORTSC_SUSPEND_STS_ACK 0
+#define PORTSC_SUSPEND_STS_NYET 1
+#define PORTSC_SUSPEND_STS_STALL 2
+#define PORTSC_SUSPEND_STS_ERR 3
+
+#define PORT_DEV_ADDR	(0x7f<<25)		/* device address */
+#define PORT_SSTS	(0x3<<23)		/* suspend status */
 /* 31:23 reserved */
 #define PORT_WKOC_E	(1<<22)		/* wake on overcurrent (enable) */
 #define PORT_WKDISC_E	(1<<21)		/* wake on disconnect (enable) */
@@ -115,6 +137,7 @@ struct ehci_regs {
 #define PORT_USB11(x) (((x)&(3<<10)) == (1<<10))	/* USB 1.1 device */
 /* 11:10 for detecting lowspeed devices (reset vs release ownership) */
 /* 9 reserved */
+#define PORT_LPM	(1<<9)		/* LPM transaction */
 #define PORT_RESET	(1<<8)		/* reset port */
 #define PORT_SUSPEND	(1<<7)		/* suspend port */
 #define PORT_RESUME	(1<<6)		/* resume it */
-- 
cgit v1.2.3-59-g8ed1b


From 48f24970144479c29b8cee6d2e1dbedf6dcf9cfb Mon Sep 17 00:00:00 2001
From: Alek Du <alek.du@intel.com>
Date: Fri, 4 Jun 2010 15:47:55 +0800
Subject: USB: EHCI: EHCI 1.1 addendum: Basic LPM feature support

With this patch, the LPM capable EHCI host controller can put device
into L1 sleep state which is a mode that can enter/exit quickly, and
reduce power consumption.

Signed-off-by: Jacob Pan <jacob.jun.pan@intel.com>
Signed-off-by: Alek Du <alek.du@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/hub.c      |  4 ++-
 drivers/usb/host/ehci-hcd.c | 17 ++++++++++
 drivers/usb/host/ehci-hub.c |  5 +++
 drivers/usb/host/ehci-lpm.c | 83 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/usb/host/ehci-pci.c | 21 ++++++++++++
 drivers/usb/host/ehci.h     |  2 +-
 include/linux/usb/hcd.h     |  4 +++
 7 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 drivers/usb/host/ehci-lpm.c

(limited to 'include/linux')

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 70cccc75a362..9cd77a2af821 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2880,7 +2880,9 @@ hub_port_init (struct usb_hub *hub, struct usb_device *udev, int port1,
 	}
 
 	retval = 0;
-
+	/* notify HCD that we have a device connected and addressed */
+	if (hcd->driver->update_device)
+		hcd->driver->update_device(hcd, udev);
 fail:
 	if (retval) {
 		hub_port_disable(hub, port1, 0);
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 20ca6a9ff213..baf9b648bb1f 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -101,6 +101,11 @@ static int ignore_oc = 0;
 module_param (ignore_oc, bool, S_IRUGO);
 MODULE_PARM_DESC (ignore_oc, "ignore bogus hardware overcurrent indications");
 
+/* for link power management(LPM) feature */
+static unsigned int hird;
+module_param(hird, int, S_IRUGO);
+MODULE_PARM_DESC(hird, "host initiated resume duration, +1 for each 75us\n");
+
 #define	INTR_MASK (STS_IAA | STS_FATAL | STS_PCD | STS_ERR | STS_INT)
 
 /*-------------------------------------------------------------------------*/
@@ -305,6 +310,7 @@ static void end_unlink_async(struct ehci_hcd *ehci);
 static void ehci_work(struct ehci_hcd *ehci);
 
 #include "ehci-hub.c"
+#include "ehci-lpm.c"
 #include "ehci-mem.c"
 #include "ehci-q.c"
 #include "ehci-sched.c"
@@ -604,6 +610,17 @@ static int ehci_init(struct usb_hcd *hcd)
 		default:	BUG();
 		}
 	}
+	if (HCC_LPM(hcc_params)) {
+		/* support link power management EHCI 1.1 addendum */
+		ehci_dbg(ehci, "support lpm\n");
+		ehci->has_lpm = 1;
+		if (hird > 0xf) {
+			ehci_dbg(ehci, "hird %d invalid, use default 0",
+			hird);
+			hird = 0;
+		}
+		temp |= hird << 24;
+	}
 	ehci->command = temp;
 
 	/* Accept arbitrarily long scatter-gather lists */
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index e7d3d8def282..8a28dae8a375 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -790,6 +790,11 @@ static int ehci_hub_control (
 					  status_reg);
 			break;
 		case USB_PORT_FEAT_C_CONNECTION:
+			if (ehci->has_lpm) {
+				/* clear PORTSC bits on disconnect */
+				temp &= ~PORT_LPM;
+				temp &= ~PORT_DEV_ADDR;
+			}
 			ehci_writel(ehci, (temp & ~PORT_RWC_BITS) | PORT_CSC,
 					status_reg);
 			break;
diff --git a/drivers/usb/host/ehci-lpm.c b/drivers/usb/host/ehci-lpm.c
new file mode 100644
index 000000000000..b4d4d63c13ed
--- /dev/null
+++ b/drivers/usb/host/ehci-lpm.c
@@ -0,0 +1,83 @@
+/* ehci-lpm.c EHCI HCD LPM support code
+ * Copyright (c) 2008 - 2010,  Intel Corporation.
+ * Author: Jacob Pan <jacob.jun.pan@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/* this file is part of ehci-hcd.c */
+static int ehci_lpm_set_da(struct ehci_hcd *ehci, int dev_addr, int port_num)
+{
+	u32 __iomem portsc;
+
+	ehci_dbg(ehci, "set dev address %d for port %d\n", dev_addr, port_num);
+	if (port_num > HCS_N_PORTS(ehci->hcs_params)) {
+		ehci_dbg(ehci, "invalid port number %d\n", port_num);
+		return -ENODEV;
+	}
+	portsc = ehci_readl(ehci, &ehci->regs->port_status[port_num-1]);
+	portsc &= ~PORT_DEV_ADDR;
+	portsc |= dev_addr<<25;
+	ehci_writel(ehci, portsc, &ehci->regs->port_status[port_num-1]);
+	return 0;
+}
+
+/*
+ * this function is used to check if the device support LPM
+ * if yes, mark the PORTSC register with PORT_LPM bit
+ */
+static int ehci_lpm_check(struct ehci_hcd *ehci, int port)
+{
+	u32 __iomem	*portsc ;
+	u32 val32;
+	int retval;
+
+	portsc = &ehci->regs->port_status[port-1];
+	val32 = ehci_readl(ehci, portsc);
+	if (!(val32 & PORT_DEV_ADDR)) {
+		ehci_dbg(ehci, "LPM: no device attached\n");
+		return -ENODEV;
+	}
+	val32 |= PORT_LPM;
+	ehci_writel(ehci, val32, portsc);
+	msleep(5);
+	val32 |= PORT_SUSPEND;
+	ehci_dbg(ehci, "Sending LPM 0x%08x to port %d\n", val32, port);
+	ehci_writel(ehci, val32, portsc);
+	/* wait for ACK */
+	msleep(10);
+	retval = handshake(ehci, &ehci->regs->port_status[port-1], PORT_SSTS,
+			PORTSC_SUSPEND_STS_ACK, 125);
+	dbg_port(ehci, "LPM", port, val32);
+	if (retval != -ETIMEDOUT) {
+		ehci_dbg(ehci, "LPM: device ACK for LPM\n");
+		val32 |= PORT_LPM;
+		/*
+		 * now device should be in L1 sleep, let's wake up the device
+		 * so that we can complete enumeration.
+		 */
+		ehci_writel(ehci, val32, portsc);
+		msleep(10);
+		val32 |= PORT_RESUME;
+		ehci_writel(ehci, val32, portsc);
+	} else {
+		ehci_dbg(ehci, "LPM: device does not ACK, disable LPM %d\n",
+			retval);
+		val32 &= ~PORT_LPM;
+		retval = -ETIMEDOUT;
+		ehci_writel(ehci, val32, portsc);
+	}
+
+	return retval;
+}
diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c
index d43d176161aa..a307d550bdaf 100644
--- a/drivers/usb/host/ehci-pci.c
+++ b/drivers/usb/host/ehci-pci.c
@@ -361,6 +361,22 @@ static int ehci_pci_resume(struct usb_hcd *hcd, bool hibernated)
 }
 #endif
 
+static int ehci_update_device(struct usb_hcd *hcd, struct usb_device *udev)
+{
+	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
+	int rc = 0;
+
+	if (!udev->parent) /* udev is root hub itself, impossible */
+		rc = -1;
+	/* we only support lpm device connected to root hub yet */
+	if (ehci->has_lpm && !udev->parent->parent) {
+		rc = ehci_lpm_set_da(ehci, udev->devnum, udev->portnum);
+		if (!rc)
+			rc = ehci_lpm_check(ehci, udev->portnum);
+	}
+	return rc;
+}
+
 static const struct hc_driver ehci_pci_hc_driver = {
 	.description =		hcd_name,
 	.product_desc =		"EHCI Host Controller",
@@ -407,6 +423,11 @@ static const struct hc_driver ehci_pci_hc_driver = {
 	.relinquish_port =	ehci_relinquish_port,
 	.port_handed_over =	ehci_port_handed_over,
 
+	/*
+	 * call back when device connected and addressed
+	 */
+	.update_device =	ehci_update_device,
+
 	.clear_tt_buffer_complete	= ehci_clear_tt_buffer_complete,
 };
 
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index bfaac1646365..21f30a0c3d2f 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -140,7 +140,7 @@ struct ehci_hcd {			/* one per controller */
 	#define OHCI_HCCTRL_LEN         0x4
 	__hc32			*ohci_hcctrl_reg;
 	unsigned		has_hostpc:1;
-
+	unsigned		has_lpm:1;  /* support link power management */
 	u8			sbrn;		/* packed release number */
 
 	/* irq statistics */
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 11b638195901..9b867e64a0f4 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -300,6 +300,10 @@ struct hc_driver {
 	int	(*update_hub_device)(struct usb_hcd *, struct usb_device *hdev,
 			struct usb_tt *tt, gfp_t mem_flags);
 	int	(*reset_device)(struct usb_hcd *, struct usb_device *);
+		/* Notifies the HCD after a device is connected and its
+		 * address is set
+		 */
+	int	(*update_device)(struct usb_hcd *, struct usb_device *);
 };
 
 extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb);
-- 
cgit v1.2.3-59-g8ed1b


From c532b29a6f6d31e84a7c88f995eebdc75ebd4248 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 1 Jun 2010 23:04:41 +0200
Subject: USB-BKL: Convert usb_driver ioctl to unlocked_ioctl

And audit all the users. None needed the BKL.  That was easy
because there was only very few around.

Tested with allmodconfig build on x86-64

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
From: Andi Kleen <ak@linux.intel.com>
---
 drivers/usb/core/devio.c   | 7 ++-----
 drivers/usb/core/hub.c     | 3 ++-
 drivers/usb/misc/usbtest.c | 3 ++-
 include/linux/usb.h        | 2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index c2f62a3993d2..f1aaff6202a5 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -1668,13 +1668,10 @@ static int proc_ioctl(struct dev_state *ps, struct usbdevfs_ioctl *ctl)
 	default:
 		if (intf->dev.driver)
 			driver = to_usb_driver(intf->dev.driver);
-		if (driver == NULL || driver->ioctl == NULL) {
+		if (driver == NULL || driver->unlocked_ioctl == NULL) {
 			retval = -ENOTTY;
 		} else {
-			/* keep API that guarantees BKL */
-			lock_kernel();
-			retval = driver->ioctl(intf, ctl->ioctl_code, buf);
-			unlock_kernel();
+			retval = driver->unlocked_ioctl(intf, ctl->ioctl_code, buf);
 			if (retval == -ENOIOCTLCMD)
 				retval = -ENOTTY;
 		}
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 9cd77a2af821..d337ef80bf43 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -1294,6 +1294,7 @@ descriptor_error:
 	return -ENODEV;
 }
 
+/* No BKL needed */
 static int
 hub_ioctl(struct usb_interface *intf, unsigned int code, void *user_data)
 {
@@ -3465,7 +3466,7 @@ static struct usb_driver hub_driver = {
 	.reset_resume =	hub_reset_resume,
 	.pre_reset =	hub_pre_reset,
 	.post_reset =	hub_post_reset,
-	.ioctl =	hub_ioctl,
+	.unlocked_ioctl = hub_ioctl,
 	.id_table =	hub_id_table,
 	.supports_autosuspend =	1,
 };
diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
index 16dffe99d9f1..0cfbd789ddf2 100644
--- a/drivers/usb/misc/usbtest.c
+++ b/drivers/usb/misc/usbtest.c
@@ -1548,6 +1548,7 @@ fail:
  * off just killing the userspace task and waiting for it to exit.
  */
 
+/* No BKL needed */
 static int
 usbtest_ioctl (struct usb_interface *intf, unsigned int code, void *buf)
 {
@@ -2170,7 +2171,7 @@ static struct usb_driver usbtest_driver = {
 	.name =		"usbtest",
 	.id_table =	id_table,
 	.probe =	usbtest_probe,
-	.ioctl =	usbtest_ioctl,
+	.unlocked_ioctl = usbtest_ioctl,
 	.disconnect =	usbtest_disconnect,
 	.suspend =	usbtest_suspend,
 	.resume =	usbtest_resume,
diff --git a/include/linux/usb.h b/include/linux/usb.h
index d5922a877994..e6cbc34901f4 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -843,7 +843,7 @@ struct usb_driver {
 
 	void (*disconnect) (struct usb_interface *intf);
 
-	int (*ioctl) (struct usb_interface *intf, unsigned int code,
+	int (*unlocked_ioctl) (struct usb_interface *intf, unsigned int code,
 			void *buf);
 
 	int (*suspend) (struct usb_interface *intf, pm_message_t message);
-- 
cgit v1.2.3-59-g8ed1b


From 7898aee1dacbb246fee958f0a6102320b61768d9 Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz <m.nazarewicz@samsung.com>
Date: Wed, 16 Jun 2010 12:07:58 +0200
Subject: USB: gadget: f_fs: functionfs_add() renamed to
 functionfs_bind_config()

FunctionFS had a bit unique name for function used to add it
to USB configuration.  Renamed as to match naming convention
of other functions.

Signed-off-by: Michal Nazarewicz <m.nazarewicz@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/f_fs.c      | 6 +++---
 drivers/usb/gadget/g_ffs.c     | 2 +-
 include/linux/usb/functionfs.h | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c
index c51c21314076..282b49e336be 100644
--- a/drivers/usb/gadget/f_fs.c
+++ b/drivers/usb/gadget/f_fs.c
@@ -1478,9 +1478,9 @@ static void ffs_epfiles_destroy(struct ffs_epfile *epfiles, unsigned count)
 }
 
 
-static int functionfs_add(struct usb_composite_dev *cdev,
-			  struct usb_configuration *c,
-			  struct ffs_data *ffs)
+static int functionfs_bind_config(struct usb_composite_dev *cdev,
+				  struct usb_configuration *c,
+				  struct ffs_data *ffs)
 {
 	struct ffs_function *func;
 	int ret;
diff --git a/drivers/usb/gadget/g_ffs.c b/drivers/usb/gadget/g_ffs.c
index d1af253a9105..da3a9e403497 100644
--- a/drivers/usb/gadget/g_ffs.c
+++ b/drivers/usb/gadget/g_ffs.c
@@ -388,7 +388,7 @@ static int __gfs_do_config(struct usb_configuration *c,
 			return ret;
 	}
 
-	ret = functionfs_add(c->cdev, c, gfs_ffs_data);
+	ret = functionfs_bind_config(c->cdev, c, gfs_ffs_data);
 	if (unlikely(ret < 0))
 		return ret;
 
diff --git a/include/linux/usb/functionfs.h b/include/linux/usb/functionfs.h
index a34a2a043b21..6f649c13193b 100644
--- a/include/linux/usb/functionfs.h
+++ b/include/linux/usb/functionfs.h
@@ -180,9 +180,9 @@ static int functionfs_bind(struct ffs_data *ffs, struct usb_composite_dev *cdev)
 static void functionfs_unbind(struct ffs_data *ffs)
 	__attribute__((nonnull));
 
-static int functionfs_add(struct usb_composite_dev *cdev,
-			  struct usb_configuration *c,
-			  struct ffs_data *ffs)
+static int functionfs_bind_config(struct usb_composite_dev *cdev,
+				  struct usb_configuration *c,
+				  struct ffs_data *ffs)
 	__attribute__((warn_unused_result, nonnull));
 
 
-- 
cgit v1.2.3-59-g8ed1b


From f2adc4f8aaf272de9ac71dcb18d95ebe05fc3f94 Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz <m.nazarewicz@samsung.com>
Date: Wed, 16 Jun 2010 12:07:59 +0200
Subject: USB: gadget: composite: usb_string_ids_*() functions added

usb_string_ids_tab() and usb_string_ids_n() functions added to
the composite framework.  The first accepts an array of
usb_string object and for each registeres a string id and the
second registeres a given number of ids and returns the first.

This may simplify string ids registration since gadgets and
composite functions won't have to call usb_string_id() several
times and each time check for errer status -- all this will be
done with a single call.

Signed-off-by: Michal Nazarewicz <m.nazarewicz@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/composite.c | 71 +++++++++++++++++++++++++++++++++++++++---
 include/linux/usb/composite.h  |  4 +++
 2 files changed, 71 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 391d169f8d07..125167e17ce5 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -673,20 +673,83 @@ static int get_string(struct usb_composite_dev *cdev,
  * string IDs.  Drivers for functions, configurations, or gadgets will
  * then store that ID in the appropriate descriptors and string table.
  *
- * All string identifier should be allocated using this routine, to
- * ensure that for example different functions don't wrongly assign
- * different meanings to the same identifier.
+ * All string identifier should be allocated using this,
+ * @usb_string_ids_tab() or @usb_string_ids_n() routine, to ensure
+ * that for example different functions don't wrongly assign different
+ * meanings to the same identifier.
  */
 int usb_string_id(struct usb_composite_dev *cdev)
 {
 	if (cdev->next_string_id < 254) {
-		/* string id 0 is reserved */
+		/* string id 0 is reserved by USB spec for list of
+		 * supported languages */
+		/* 255 reserved as well? -- mina86 */
 		cdev->next_string_id++;
 		return cdev->next_string_id;
 	}
 	return -ENODEV;
 }
 
+/**
+ * usb_string_ids() - allocate unused string IDs in batch
+ * @cdev: the device whose string descriptor IDs are being allocated
+ * @str: an array of usb_string objects to assign numbers to
+ * Context: single threaded during gadget setup
+ *
+ * @usb_string_ids() is called from bind() callbacks to allocate
+ * string IDs.  Drivers for functions, configurations, or gadgets will
+ * then copy IDs from the string table to the appropriate descriptors
+ * and string table for other languages.
+ *
+ * All string identifier should be allocated using this,
+ * @usb_string_id() or @usb_string_ids_n() routine, to ensure that for
+ * example different functions don't wrongly assign different meanings
+ * to the same identifier.
+ */
+int usb_string_ids_tab(struct usb_composite_dev *cdev, struct usb_string *str)
+{
+	int next = cdev->next_string_id;
+
+	for (; str->s; ++str) {
+		if (unlikely(next >= 254))
+			return -ENODEV;
+		str->id = ++next;
+	}
+
+	cdev->next_string_id = next;
+
+	return 0;
+}
+
+/**
+ * usb_string_ids_n() - allocate unused string IDs in batch
+ * @cdev: the device whose string descriptor IDs are being allocated
+ * @n: number of string IDs to allocate
+ * Context: single threaded during gadget setup
+ *
+ * Returns the first requested ID.  This ID and next @n-1 IDs are now
+ * valid IDs.  At least providind that @n is non zore because if it
+ * is, returns last requested ID which is now very useful information.
+ *
+ * @usb_string_ids_n() is called from bind() callbacks to allocate
+ * string IDs.  Drivers for functions, configurations, or gadgets will
+ * then store that ID in the appropriate descriptors and string table.
+ *
+ * All string identifier should be allocated using this,
+ * @usb_string_id() or @usb_string_ids_n() routine, to ensure that for
+ * example different functions don't wrongly assign different meanings
+ * to the same identifier.
+ */
+int usb_string_ids_n(struct usb_composite_dev *c, unsigned n)
+{
+	unsigned next = c->next_string_id;
+	if (unlikely(n > 254 || (unsigned)next + n > 254))
+		return -ENODEV;
+	c->next_string_id += n;
+	return next + 1;
+}
+
+
 /*-------------------------------------------------------------------------*/
 
 static void composite_setup_complete(struct usb_ep *ep, struct usb_request *req)
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index 139353efad34..f378075c839a 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -342,6 +342,10 @@ struct usb_composite_dev {
 };
 
 extern int usb_string_id(struct usb_composite_dev *c);
+extern int usb_string_ids_tab(struct usb_composite_dev *c,
+			      struct usb_string *str);
+extern int usb_string_ids_n(struct usb_composite_dev *c, unsigned n);
+
 
 /* messaging utils */
 #define DBG(d, fmt, args...) \
-- 
cgit v1.2.3-59-g8ed1b


From 3f3e12d050052032a51f75e72e540322e2a7da2b Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz <m.nazarewicz@samsung.com>
Date: Mon, 21 Jun 2010 13:57:08 +0200
Subject: USB: gadget: composite: added disconnect callback

Added a disconnect() callback to composite devices which
is called by composite glue when its disconnect callback
is called by gadget.

Signed-off-by: Michal Nazarewicz <m.nazarewicz@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/composite.c | 2 ++
 include/linux/usb/composite.h  | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 125167e17ce5..e483f80822d2 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -956,6 +956,8 @@ static void composite_disconnect(struct usb_gadget *gadget)
 	spin_lock_irqsave(&cdev->lock, flags);
 	if (cdev->config)
 		reset_config(cdev);
+	if (composite->disconnect)
+		composite->disconnect(cdev);
 	spin_unlock_irqrestore(&cdev->lock, flags);
 }
 
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index f378075c839a..890bc1472190 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -276,6 +276,8 @@ struct usb_composite_driver {
 	int			(*bind)(struct usb_composite_dev *);
 	int			(*unbind)(struct usb_composite_dev *);
 
+	void			(*disconnect)(struct usb_composite_dev *);
+
 	/* global suspend hooks */
 	void			(*suspend)(struct usb_composite_dev *);
 	void			(*resume)(struct usb_composite_dev *);
-- 
cgit v1.2.3-59-g8ed1b


From 541c7d432f76771079e7c295d596ea47cc6a3030 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Tue, 22 Jun 2010 16:39:10 -0400
Subject: USB: convert usb_hcd bitfields into atomic flags

This patch (as1393) converts several of the single-bit fields in
struct usb_hcd to atomic flags.  This is for safety's sake; not all
CPUs can update bitfield values atomically, and these flags are used
in multiple contexts.

The flag fields that are set only during registration or removal can
remain as they are, since non-atomic accesses at those times will not
cause any problems.

(Strictly speaking, the authorized_default flag should become atomic
as well.  I didn't bother with it because it gets changed only via
sysfs.  It can be done later, if anyone wants.)

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/staging/usbip/vhci_hcd.c |  6 +++---
 drivers/usb/c67x00/c67x00-hcd.c  |  4 ++--
 drivers/usb/core/hcd.c           | 26 ++++++++++++--------------
 drivers/usb/gadget/dummy_hcd.c   |  6 +++---
 drivers/usb/host/ehci-dbg.c      |  2 +-
 drivers/usb/host/ehci-hcd.c      |  1 -
 drivers/usb/host/ehci-hub.c      |  2 +-
 drivers/usb/host/ehci-q.c        |  3 +--
 drivers/usb/host/ehci-sched.c    |  9 +++------
 drivers/usb/host/hwa-hc.c        |  4 ++--
 drivers/usb/host/isp1760-hcd.c   |  3 +--
 drivers/usb/host/ohci-dbg.c      |  4 ++--
 drivers/usb/host/ohci-hcd.c      |  6 +++---
 drivers/usb/host/ohci-hub.c      | 16 ++++++++++------
 drivers/usb/host/oxu210hp-hcd.c  |  7 ++-----
 drivers/usb/host/uhci-hcd.c      | 21 ++++++++++++---------
 drivers/usb/host/uhci-hub.c      |  4 ++--
 drivers/usb/host/whci/hcd.c      |  2 +-
 drivers/usb/host/xhci.c          |  3 +--
 drivers/usb/musb/musb_virthub.c  |  2 +-
 include/linux/usb/hcd.h          | 22 +++++++++++++++++-----
 21 files changed, 80 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/usbip/vhci_hcd.c b/drivers/staging/usbip/vhci_hcd.c
index be5d8db98165..0574d848b900 100644
--- a/drivers/staging/usbip/vhci_hcd.c
+++ b/drivers/staging/usbip/vhci_hcd.c
@@ -215,7 +215,7 @@ static int vhci_hub_status(struct usb_hcd *hcd, char *buf)
 	vhci = hcd_to_vhci(hcd);
 
 	spin_lock_irqsave(&vhci->lock, flags);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
 		usbip_dbg_vhci_rh("hw accessible flag in on?\n");
 		goto done;
 	}
@@ -269,7 +269,7 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
 
 	u32 prev_port_status[VHCI_NPORTS];
 
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))
+	if (!HCD_HW_ACCESSIBLE(hcd))
 		return -ETIMEDOUT;
 
 	/*
@@ -1041,7 +1041,7 @@ static int vhci_bus_resume(struct usb_hcd *hcd)
 	dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__);
 
 	spin_lock_irq(&vhci->lock);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
 		rc = -ESHUTDOWN;
 	} else {
 		/* vhci->rh_state = DUMMY_RH_RUNNING;
diff --git a/drivers/usb/c67x00/c67x00-hcd.c b/drivers/usb/c67x00/c67x00-hcd.c
index a22b887f4e9e..d3e1356d091e 100644
--- a/drivers/usb/c67x00/c67x00-hcd.c
+++ b/drivers/usb/c67x00/c67x00-hcd.c
@@ -264,7 +264,7 @@ static void c67x00_hcd_irq(struct c67x00_sie *sie, u16 int_status, u16 msg)
 	if (unlikely(hcd->state == HC_STATE_HALT))
 		return;
 
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))
+	if (!HCD_HW_ACCESSIBLE(hcd))
 		return;
 
 	/* Handle Start of frame events */
@@ -282,7 +282,7 @@ static int c67x00_hcd_start(struct usb_hcd *hcd)
 {
 	hcd->uses_new_polling = 1;
 	hcd->state = HC_STATE_RUNNING;
-	hcd->poll_rh = 1;
+	set_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 
 	return 0;
 }
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 53f14c82ff2e..f2fe7c8e991d 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -679,7 +679,7 @@ void usb_hcd_poll_rh_status(struct usb_hcd *hcd)
 		spin_lock_irqsave(&hcd_root_hub_lock, flags);
 		urb = hcd->status_urb;
 		if (urb) {
-			hcd->poll_pending = 0;
+			clear_bit(HCD_FLAG_POLL_PENDING, &hcd->flags);
 			hcd->status_urb = NULL;
 			urb->actual_length = length;
 			memcpy(urb->transfer_buffer, buffer, length);
@@ -690,7 +690,7 @@ void usb_hcd_poll_rh_status(struct usb_hcd *hcd)
 			spin_lock(&hcd_root_hub_lock);
 		} else {
 			length = 0;
-			hcd->poll_pending = 1;
+			set_bit(HCD_FLAG_POLL_PENDING, &hcd->flags);
 		}
 		spin_unlock_irqrestore(&hcd_root_hub_lock, flags);
 	}
@@ -699,7 +699,7 @@ void usb_hcd_poll_rh_status(struct usb_hcd *hcd)
 	 * exceed that limit if HZ is 100. The math is more clunky than
 	 * maybe expected, this is to make sure that all timers for USB devices
 	 * fire at the same time to give the CPU a break inbetween */
-	if (hcd->uses_new_polling ? hcd->poll_rh :
+	if (hcd->uses_new_polling ? HCD_POLL_RH(hcd) :
 			(length == 0 && hcd->status_urb != NULL))
 		mod_timer (&hcd->rh_timer, (jiffies/(HZ/4) + 1) * (HZ/4));
 }
@@ -736,7 +736,7 @@ static int rh_queue_status (struct usb_hcd *hcd, struct urb *urb)
 		mod_timer(&hcd->rh_timer, (jiffies/(HZ/4) + 1) * (HZ/4));
 
 	/* If a status change has already occurred, report it ASAP */
-	else if (hcd->poll_pending)
+	else if (HCD_POLL_PENDING(hcd))
 		mod_timer(&hcd->rh_timer, jiffies);
 	retval = 0;
  done:
@@ -1150,8 +1150,7 @@ int usb_hcd_check_unlink_urb(struct usb_hcd *hcd, struct urb *urb,
 	 * finish unlinking the initial failed usb_set_address()
 	 * or device descriptor fetch.
 	 */
-	if (!test_bit(HCD_FLAG_SAW_IRQ, &hcd->flags) &&
-			!is_root_hub(urb->dev)) {
+	if (!HCD_SAW_IRQ(hcd) && !is_root_hub(urb->dev)) {
 		dev_warn(hcd->self.controller, "Unlink after no-IRQ?  "
 			"Controller is probably using the wrong IRQ.\n");
 		set_bit(HCD_FLAG_SAW_IRQ, &hcd->flags);
@@ -2063,8 +2062,7 @@ irqreturn_t usb_hcd_irq (int irq, void *__hcd)
 	 */
 	local_irq_save(flags);
 
-	if (unlikely(hcd->state == HC_STATE_HALT ||
-		     !test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) {
+	if (unlikely(hcd->state == HC_STATE_HALT || !HCD_HW_ACCESSIBLE(hcd))) {
 		rc = IRQ_NONE;
 	} else if (hcd->driver->irq(hcd) == IRQ_NONE) {
 		rc = IRQ_NONE;
@@ -2098,7 +2096,7 @@ void usb_hc_died (struct usb_hcd *hcd)
 
 	spin_lock_irqsave (&hcd_root_hub_lock, flags);
 	if (hcd->rh_registered) {
-		hcd->poll_rh = 0;
+		clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 
 		/* make khubd clean up old urbs and devices */
 		usb_set_device_state (hcd->self.root_hub,
@@ -2301,7 +2299,7 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		       retval);
 		goto error_create_attr_group;
 	}
-	if (hcd->uses_new_polling && hcd->poll_rh)
+	if (hcd->uses_new_polling && HCD_POLL_RH(hcd))
 		usb_hcd_poll_rh_status(hcd);
 	return retval;
 
@@ -2320,11 +2318,11 @@ error_create_attr_group:
 	mutex_unlock(&usb_bus_list_lock);
 err_register_root_hub:
 	hcd->rh_pollable = 0;
-	hcd->poll_rh = 0;
+	clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 	del_timer_sync(&hcd->rh_timer);
 	hcd->driver->stop(hcd);
 	hcd->state = HC_STATE_HALT;
-	hcd->poll_rh = 0;
+	clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 	del_timer_sync(&hcd->rh_timer);
 err_hcd_driver_start:
 	if (hcd->irq >= 0)
@@ -2380,14 +2378,14 @@ void usb_remove_hcd(struct usb_hcd *hcd)
 	 * the hub_status_data() callback.
 	 */
 	hcd->rh_pollable = 0;
-	hcd->poll_rh = 0;
+	clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 	del_timer_sync(&hcd->rh_timer);
 
 	hcd->driver->stop(hcd);
 	hcd->state = HC_STATE_HALT;
 
 	/* In case the HCD restarted the timer, stop it again. */
-	hcd->poll_rh = 0;
+	clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 	del_timer_sync(&hcd->rh_timer);
 
 	if (hcd->irq >= 0)
diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c
index 4f9e578cde9d..dc6546248ed9 100644
--- a/drivers/usb/gadget/dummy_hcd.c
+++ b/drivers/usb/gadget/dummy_hcd.c
@@ -1542,7 +1542,7 @@ static int dummy_hub_status (struct usb_hcd *hcd, char *buf)
 	dum = hcd_to_dummy (hcd);
 
 	spin_lock_irqsave (&dum->lock, flags);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))
+	if (!HCD_HW_ACCESSIBLE(hcd))
 		goto done;
 
 	if (dum->resuming && time_after_eq (jiffies, dum->re_timeout)) {
@@ -1588,7 +1588,7 @@ static int dummy_hub_control (
 	int		retval = 0;
 	unsigned long	flags;
 
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))
+	if (!HCD_HW_ACCESSIBLE(hcd))
 		return -ETIMEDOUT;
 
 	dum = hcd_to_dummy (hcd);
@@ -1739,7 +1739,7 @@ static int dummy_bus_resume (struct usb_hcd *hcd)
 	dev_dbg (&hcd->self.root_hub->dev, "%s\n", __func__);
 
 	spin_lock_irq (&dum->lock);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
 		rc = -ESHUTDOWN;
 	} else {
 		dum->rh_state = DUMMY_RH_RUNNING;
diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c
index df5546bb8367..4498efb49b95 100644
--- a/drivers/usb/host/ehci-dbg.c
+++ b/drivers/usb/host/ehci-dbg.c
@@ -712,7 +712,7 @@ static ssize_t fill_registers_buffer(struct debug_buffer *buf)
 
 	spin_lock_irqsave (&ehci->lock, flags);
 
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
 		size = scnprintf (next, size,
 			"bus %s, device %s\n"
 			"%s\n"
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 8697ad19f313..2a19336c9824 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -642,7 +642,6 @@ static int ehci_run (struct usb_hcd *hcd)
 	u32			hcc_params;
 
 	hcd->uses_new_polling = 1;
-	hcd->poll_rh = 0;
 
 	/* EHCI spec section 4.1 */
 	if ((retval = ehci_reset(ehci)) != 0) {
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index 84e792d71c22..0931f5a7dec4 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -316,7 +316,7 @@ static int ehci_bus_resume (struct usb_hcd *hcd)
 	if (time_before (jiffies, ehci->next_statechange))
 		msleep(5);
 	spin_lock_irq (&ehci->lock);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
 		spin_unlock_irq(&ehci->lock);
 		return -ESHUTDOWN;
 	}
diff --git a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c
index 11a79c4f4a9d..233c288e3f93 100644
--- a/drivers/usb/host/ehci-q.c
+++ b/drivers/usb/host/ehci-q.c
@@ -1126,8 +1126,7 @@ submit_async (
 #endif
 
 	spin_lock_irqsave (&ehci->lock, flags);
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE,
-			       &ehci_to_hcd(ehci)->flags))) {
+	if (unlikely(!HCD_HW_ACCESSIBLE(ehci_to_hcd(ehci)))) {
 		rc = -ESHUTDOWN;
 		goto done;
 	}
diff --git a/drivers/usb/host/ehci-sched.c b/drivers/usb/host/ehci-sched.c
index 805ec633a652..d640346f9b56 100644
--- a/drivers/usb/host/ehci-sched.c
+++ b/drivers/usb/host/ehci-sched.c
@@ -880,8 +880,7 @@ static int intr_submit (
 
 	spin_lock_irqsave (&ehci->lock, flags);
 
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE,
-			&ehci_to_hcd(ehci)->flags))) {
+	if (unlikely(!HCD_HW_ACCESSIBLE(ehci_to_hcd(ehci)))) {
 		status = -ESHUTDOWN;
 		goto done_not_linked;
 	}
@@ -1815,8 +1814,7 @@ static int itd_submit (struct ehci_hcd *ehci, struct urb *urb,
 
 	/* schedule ... need to lock */
 	spin_lock_irqsave (&ehci->lock, flags);
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE,
-			       &ehci_to_hcd(ehci)->flags))) {
+	if (unlikely(!HCD_HW_ACCESSIBLE(ehci_to_hcd(ehci)))) {
 		status = -ESHUTDOWN;
 		goto done_not_linked;
 	}
@@ -2201,8 +2199,7 @@ static int sitd_submit (struct ehci_hcd *ehci, struct urb *urb,
 
 	/* schedule ... need to lock */
 	spin_lock_irqsave (&ehci->lock, flags);
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE,
-			       &ehci_to_hcd(ehci)->flags))) {
+	if (unlikely(!HCD_HW_ACCESSIBLE(ehci_to_hcd(ehci)))) {
 		status = -ESHUTDOWN;
 		goto done_not_linked;
 	}
diff --git a/drivers/usb/host/hwa-hc.c b/drivers/usb/host/hwa-hc.c
index 35742f8c7cda..9bfac657572e 100644
--- a/drivers/usb/host/hwa-hc.c
+++ b/drivers/usb/host/hwa-hc.c
@@ -159,7 +159,7 @@ static int hwahc_op_start(struct usb_hcd *usb_hcd)
 		goto error_set_cluster_id;
 
 	usb_hcd->uses_new_polling = 1;
-	usb_hcd->poll_rh = 1;
+	set_bit(HCD_FLAG_POLL_RH, &usb_hcd->flags);
 	usb_hcd->state = HC_STATE_RUNNING;
 	result = 0;
 out:
@@ -776,7 +776,7 @@ static int hwahc_probe(struct usb_interface *usb_iface,
 		goto error_alloc;
 	}
 	usb_hcd->wireless = 1;
-	usb_hcd->flags |= HCD_FLAG_SAW_IRQ;
+	set_bit(HCD_FLAG_SAW_IRQ, &usb_hcd->flags);
 	wusbhc = usb_hcd_to_wusbhc(usb_hcd);
 	hwahc = container_of(wusbhc, struct hwahc, wusbhc);
 	hwahc_init(hwahc);
diff --git a/drivers/usb/host/isp1760-hcd.c b/drivers/usb/host/isp1760-hcd.c
index dbcafa29c775..d1a3dfc9a408 100644
--- a/drivers/usb/host/isp1760-hcd.c
+++ b/drivers/usb/host/isp1760-hcd.c
@@ -482,7 +482,6 @@ static int isp1760_run(struct usb_hcd *hcd)
 	u32 chipid;
 
 	hcd->uses_new_polling = 1;
-	hcd->poll_rh = 0;
 
 	hcd->state = HC_STATE_RUNNING;
 	isp1760_enable_interrupts(hcd);
@@ -1450,7 +1449,7 @@ static int isp1760_prepare_enqueue(struct isp1760_hcd *priv, struct urb *urb,
 	epnum = urb->ep->desc.bEndpointAddress;
 
 	spin_lock_irqsave(&priv->lock, flags);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &priv_to_hcd(priv)->flags)) {
+	if (!HCD_HW_ACCESSIBLE(priv_to_hcd(priv))) {
 		rc = -ESHUTDOWN;
 		goto done;
 	}
diff --git a/drivers/usb/host/ohci-dbg.c b/drivers/usb/host/ohci-dbg.c
index 8ad2441b0284..36abd2baa3ea 100644
--- a/drivers/usb/host/ohci-dbg.c
+++ b/drivers/usb/host/ohci-dbg.c
@@ -645,7 +645,7 @@ static ssize_t fill_registers_buffer(struct debug_buffer *buf)
 		hcd->product_desc,
 		hcd_name);
 
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
 		size -= scnprintf (next, size,
 			"SUSPENDED (no register access)\n");
 		goto done;
@@ -687,7 +687,7 @@ static ssize_t fill_registers_buffer(struct debug_buffer *buf)
 	next += temp;
 
 	temp = scnprintf (next, size, "hub poll timer %s\n",
-			ohci_to_hcd(ohci)->poll_rh ? "ON" : "off");
+			HCD_POLL_RH(ohci_to_hcd(ohci)) ? "ON" : "off");
 	size -= temp;
 	next += temp;
 
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index 02864a237a2c..c3b4ccc7337b 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -212,7 +212,7 @@ static int ohci_urb_enqueue (
 	spin_lock_irqsave (&ohci->lock, flags);
 
 	/* don't submit to a dead HC */
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
 		retval = -ENODEV;
 		goto fail;
 	}
@@ -685,7 +685,7 @@ retry:
 	}
 
 	/* use rhsc irqs after khubd is fully initialized */
-	hcd->poll_rh = 1;
+	set_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 	hcd->uses_new_polling = 1;
 
 	/* start controller operations */
@@ -822,7 +822,7 @@ static irqreturn_t ohci_irq (struct usb_hcd *hcd)
 	else if (ints & OHCI_INTR_RD) {
 		ohci_vdbg(ohci, "resume detect\n");
 		ohci_writel(ohci, OHCI_INTR_RD, &regs->intrstatus);
-		hcd->poll_rh = 1;
+		set_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 		if (ohci->autostop) {
 			spin_lock (&ohci->lock);
 			ohci_rh_resume (ohci);
diff --git a/drivers/usb/host/ohci-hub.c b/drivers/usb/host/ohci-hub.c
index 65cac8cc8921..4dd39022c388 100644
--- a/drivers/usb/host/ohci-hub.c
+++ b/drivers/usb/host/ohci-hub.c
@@ -284,7 +284,7 @@ static int ohci_bus_suspend (struct usb_hcd *hcd)
 
 	spin_lock_irq (&ohci->lock);
 
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)))
+	if (unlikely(!HCD_HW_ACCESSIBLE(hcd)))
 		rc = -ESHUTDOWN;
 	else
 		rc = ohci_rh_suspend (ohci, 0);
@@ -302,7 +302,7 @@ static int ohci_bus_resume (struct usb_hcd *hcd)
 
 	spin_lock_irq (&ohci->lock);
 
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)))
+	if (unlikely(!HCD_HW_ACCESSIBLE(hcd)))
 		rc = -ESHUTDOWN;
 	else
 		rc = ohci_rh_resume (ohci);
@@ -489,7 +489,7 @@ ohci_hub_status_data (struct usb_hcd *hcd, char *buf)
 	unsigned long	flags;
 
 	spin_lock_irqsave (&ohci->lock, flags);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))
+	if (!HCD_HW_ACCESSIBLE(hcd))
 		goto done;
 
 	/* undocumented erratum seen on at least rev D */
@@ -533,8 +533,12 @@ ohci_hub_status_data (struct usb_hcd *hcd, char *buf)
 		}
 	}
 
-	hcd->poll_rh = ohci_root_hub_state_changes(ohci, changed,
-			any_connected, rhsc_status);
+	if (ohci_root_hub_state_changes(ohci, changed,
+			any_connected, rhsc_status))
+		set_bit(HCD_FLAG_POLL_RH, &hcd->flags);
+	else
+		clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
+
 
 done:
 	spin_unlock_irqrestore (&ohci->lock, flags);
@@ -701,7 +705,7 @@ static int ohci_hub_control (
 	u32		temp;
 	int		retval = 0;
 
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)))
+	if (unlikely(!HCD_HW_ACCESSIBLE(hcd)))
 		return -ESHUTDOWN;
 
 	switch (typeReq) {
diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c
index f608dfd09a8a..d9c85a292737 100644
--- a/drivers/usb/host/oxu210hp-hcd.c
+++ b/drivers/usb/host/oxu210hp-hcd.c
@@ -1641,8 +1641,7 @@ static int submit_async(struct oxu_hcd	*oxu, struct urb *urb,
 #endif
 
 	spin_lock_irqsave(&oxu->lock, flags);
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE,
-			       &oxu_to_hcd(oxu)->flags))) {
+	if (unlikely(!HCD_HW_ACCESSIBLE(oxu_to_hcd(oxu)))) {
 		rc = -ESHUTDOWN;
 		goto done;
 	}
@@ -2209,8 +2208,7 @@ static int intr_submit(struct oxu_hcd *oxu, struct urb *urb,
 
 	spin_lock_irqsave(&oxu->lock, flags);
 
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE,
-			       &oxu_to_hcd(oxu)->flags))) {
+	if (unlikely(!HCD_HW_ACCESSIBLE(oxu_to_hcd(oxu)))) {
 		status = -ESHUTDOWN;
 		goto done;
 	}
@@ -2715,7 +2713,6 @@ static int oxu_run(struct usb_hcd *hcd)
 	u32 temp, hcc_params;
 
 	hcd->uses_new_polling = 1;
-	hcd->poll_rh = 0;
 
 	/* EHCI spec section 4.1 */
 	retval = ehci_reset(oxu);
diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c
index d1dce2166eff..2743ec770f0c 100644
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@@ -140,7 +140,7 @@ static void finish_reset(struct uhci_hcd *uhci)
 	uhci->rh_state = UHCI_RH_RESET;
 	uhci->is_stopped = UHCI_IS_STOPPED;
 	uhci_to_hcd(uhci)->state = HC_STATE_HALT;
-	uhci_to_hcd(uhci)->poll_rh = 0;
+	clear_bit(HCD_FLAG_POLL_RH, &uhci_to_hcd(uhci)->flags);
 
 	uhci->dead = 0;		/* Full reset resurrects the controller */
 }
@@ -344,7 +344,10 @@ __acquires(uhci->lock)
 	/* If interrupts don't work and remote wakeup is enabled then
 	 * the suspended root hub needs to be polled.
 	 */
-	uhci_to_hcd(uhci)->poll_rh = (!int_enable && wakeup_enable);
+	if (!int_enable && wakeup_enable)
+		set_bit(HCD_FLAG_POLL_RH, &uhci_to_hcd(uhci)->flags);
+	else
+		clear_bit(HCD_FLAG_POLL_RH, &uhci_to_hcd(uhci)->flags);
 
 	uhci_scan_schedule(uhci);
 	uhci_fsbr_off(uhci);
@@ -363,7 +366,7 @@ static void start_rh(struct uhci_hcd *uhci)
 			uhci->io_addr + USBINTR);
 	mb();
 	uhci->rh_state = UHCI_RH_RUNNING;
-	uhci_to_hcd(uhci)->poll_rh = 1;
+	set_bit(HCD_FLAG_POLL_RH, &uhci_to_hcd(uhci)->flags);
 }
 
 static void wakeup_rh(struct uhci_hcd *uhci)
@@ -733,7 +736,7 @@ static void uhci_stop(struct usb_hcd *hcd)
 	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
 
 	spin_lock_irq(&uhci->lock);
-	if (test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags) && !uhci->dead)
+	if (HCD_HW_ACCESSIBLE(hcd) && !uhci->dead)
 		uhci_hc_died(uhci);
 	uhci_scan_schedule(uhci);
 	spin_unlock_irq(&uhci->lock);
@@ -750,7 +753,7 @@ static int uhci_rh_suspend(struct usb_hcd *hcd)
 	int rc = 0;
 
 	spin_lock_irq(&uhci->lock);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))
+	if (!HCD_HW_ACCESSIBLE(hcd))
 		rc = -ESHUTDOWN;
 	else if (uhci->dead)
 		;		/* Dead controllers tell no tales */
@@ -777,7 +780,7 @@ static int uhci_rh_resume(struct usb_hcd *hcd)
 	int rc = 0;
 
 	spin_lock_irq(&uhci->lock);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))
+	if (!HCD_HW_ACCESSIBLE(hcd))
 		rc = -ESHUTDOWN;
 	else if (!uhci->dead)
 		wakeup_rh(uhci);
@@ -793,7 +796,7 @@ static int uhci_pci_suspend(struct usb_hcd *hcd)
 	dev_dbg(uhci_dev(uhci), "%s\n", __func__);
 
 	spin_lock_irq(&uhci->lock);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags) || uhci->dead)
+	if (!HCD_HW_ACCESSIBLE(hcd) || uhci->dead)
 		goto done_okay;		/* Already suspended or dead */
 
 	if (uhci->rh_state > UHCI_RH_SUSPENDED) {
@@ -807,7 +810,7 @@ static int uhci_pci_suspend(struct usb_hcd *hcd)
 	 */
 	pci_write_config_word(to_pci_dev(uhci_dev(uhci)), USBLEGSUP, 0);
 	mb();
-	hcd->poll_rh = 0;
+	clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 
 	/* FIXME: Enable non-PME# remote wakeup? */
 
@@ -860,7 +863,7 @@ static int uhci_pci_resume(struct usb_hcd *hcd, bool hibernated)
 	 * the suspended root hub needs to be polled.
 	 */
 	if (!uhci->RD_enable && hcd->self.root_hub->do_remote_wakeup) {
-		hcd->poll_rh = 1;
+		set_bit(HCD_FLAG_POLL_RH, &hcd->flags);
 		usb_hcd_poll_rh_status(hcd);
 	}
 	return 0;
diff --git a/drivers/usb/host/uhci-hub.c b/drivers/usb/host/uhci-hub.c
index 8270055848ca..f0c58116c0ad 100644
--- a/drivers/usb/host/uhci-hub.c
+++ b/drivers/usb/host/uhci-hub.c
@@ -190,7 +190,7 @@ static int uhci_hub_status_data(struct usb_hcd *hcd, char *buf)
 	spin_lock_irqsave(&uhci->lock, flags);
 
 	uhci_scan_schedule(uhci);
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags) || uhci->dead)
+	if (!HCD_HW_ACCESSIBLE(hcd) || uhci->dead)
 		goto done;
 	uhci_check_ports(uhci);
 
@@ -246,7 +246,7 @@ static int uhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
 	u16 wPortChange, wPortStatus;
 	unsigned long flags;
 
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags) || uhci->dead)
+	if (!HCD_HW_ACCESSIBLE(hcd) || uhci->dead)
 		return -ETIMEDOUT;
 
 	spin_lock_irqsave(&uhci->lock, flags);
diff --git a/drivers/usb/host/whci/hcd.c b/drivers/usb/host/whci/hcd.c
index e0d3401285c8..72b6892fda67 100644
--- a/drivers/usb/host/whci/hcd.c
+++ b/drivers/usb/host/whci/hcd.c
@@ -68,7 +68,7 @@ static int whc_start(struct usb_hcd *usb_hcd)
 	whc_write_wusbcmd(whc, WUSBCMD_RUN, WUSBCMD_RUN);
 
 	usb_hcd->uses_new_polling = 1;
-	usb_hcd->poll_rh = 1;
+	set_bit(HCD_FLAG_POLL_RH, &usb_hcd->flags);
 	usb_hcd->state = HC_STATE_RUNNING;
 
 out:
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 343f1047f5d0..5e73386b3899 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -427,7 +427,6 @@ int xhci_run(struct usb_hcd *hcd)
 	void (*doorbell)(struct xhci_hcd *) = NULL;
 
 	hcd->uses_new_polling = 1;
-	hcd->poll_rh = 0;
 
 	xhci_dbg(xhci, "xhci_run\n");
 #if 0	/* FIXME: MSI not setup yet */
@@ -733,7 +732,7 @@ int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags)
 		ret = -EINVAL;
 		goto exit;
 	}
-	if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+	if (!HCD_HW_ACCESSIBLE(hcd)) {
 		if (!in_interrupt())
 			xhci_dbg(xhci, "urb submitted during PCI suspend\n");
 		ret = -ESHUTDOWN;
diff --git a/drivers/usb/musb/musb_virthub.c b/drivers/usb/musb/musb_virthub.c
index 92e85e027cfb..43233c397b6e 100644
--- a/drivers/usb/musb/musb_virthub.c
+++ b/drivers/usb/musb/musb_virthub.c
@@ -244,7 +244,7 @@ int musb_hub_control(
 
 	spin_lock_irqsave(&musb->lock, flags);
 
-	if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) {
+	if (unlikely(!HCD_HW_ACCESSIBLE(hcd))) {
 		spin_unlock_irqrestore(&musb->lock, flags);
 		return -ESHUTDOWN;
 	}
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 9b867e64a0f4..f8f8fa7a56e8 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -89,19 +89,31 @@ struct usb_hcd {
 	 */
 	const struct hc_driver	*driver;	/* hw-specific hooks */
 
-	/* Flags that need to be manipulated atomically */
+	/* Flags that need to be manipulated atomically because they can
+	 * change while the host controller is running.  Always use
+	 * set_bit() or clear_bit() to change their values.
+	 */
 	unsigned long		flags;
-#define HCD_FLAG_HW_ACCESSIBLE	0x00000001
-#define HCD_FLAG_SAW_IRQ	0x00000002
+#define HCD_FLAG_HW_ACCESSIBLE		0	/* at full power */
+#define HCD_FLAG_SAW_IRQ		1
+#define HCD_FLAG_POLL_RH		2	/* poll for rh status? */
+#define HCD_FLAG_POLL_PENDING		3	/* status has changed? */
+
+	/* The flags can be tested using these macros; they are likely to
+	 * be slightly faster than test_bit().
+	 */
+#define HCD_HW_ACCESSIBLE(hcd)	((hcd)->flags & (1U << HCD_FLAG_HW_ACCESSIBLE))
+#define HCD_SAW_IRQ(hcd)	((hcd)->flags & (1U << HCD_FLAG_SAW_IRQ))
+#define HCD_POLL_RH(hcd)	((hcd)->flags & (1U << HCD_FLAG_POLL_RH))
+#define HCD_POLL_PENDING(hcd)	((hcd)->flags & (1U << HCD_FLAG_POLL_PENDING))
 
+	/* Flags that get set only during HCD registration or removal. */
 	unsigned		rh_registered:1;/* is root hub registered? */
 	unsigned		rh_pollable:1;	/* may we poll the root hub? */
 
 	/* The next flag is a stopgap, to be removed when all the HCDs
 	 * support the new root-hub polling mechanism. */
 	unsigned		uses_new_polling:1;
-	unsigned		poll_rh:1;	/* poll for rh status? */
-	unsigned		poll_pending:1;	/* status has changed? */
 	unsigned		wireless:1;	/* Wireless USB HCD */
 	unsigned		authorized_default:1;
 	unsigned		has_tt:1;	/* Integrated TT in root hub */
-- 
cgit v1.2.3-59-g8ed1b


From 4147200d25c423e627ab4487530b3d9f2ef829c8 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 25 Jun 2010 14:02:14 -0400
Subject: USB: add do_wakeup parameter for PCI HCD suspend

This patch (as1385) adds a "do_wakeup" parameter to the pci_suspend
method used by PCI-based host controller drivers.  ehci-hcd in
particular needs to know whether or not to enable wakeup when
suspending a controller.  Although that information is currently
available through device_may_wakeup(), when support is added for
runtime suspend this will no longer be true.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/hcd-pci.c     | 4 +++-
 drivers/usb/host/ehci-au1xxx.c | 2 +-
 drivers/usb/host/ehci-fsl.c    | 3 ++-
 drivers/usb/host/ehci-hub.c    | 5 ++---
 drivers/usb/host/ehci-pci.c    | 4 ++--
 drivers/usb/host/ehci.h        | 8 ++++----
 drivers/usb/host/ohci-pci.c    | 2 +-
 drivers/usb/host/uhci-hcd.c    | 2 +-
 include/linux/usb/hcd.h        | 2 +-
 9 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c
index f0156de8db67..e387e394f876 100644
--- a/drivers/usb/core/hcd-pci.c
+++ b/drivers/usb/core/hcd-pci.c
@@ -386,7 +386,9 @@ static int hcd_pci_suspend(struct device *dev)
 		return retval;
 
 	if (hcd->driver->pci_suspend) {
-		retval = hcd->driver->pci_suspend(hcd);
+		bool	do_wakeup = device_may_wakeup(dev);
+
+		retval = hcd->driver->pci_suspend(hcd, do_wakeup);
 		suspend_report_result(hcd->driver->pci_suspend, retval);
 		if (retval)
 			return retval;
diff --git a/drivers/usb/host/ehci-au1xxx.c b/drivers/usb/host/ehci-au1xxx.c
index faa61748db70..2baf8a849086 100644
--- a/drivers/usb/host/ehci-au1xxx.c
+++ b/drivers/usb/host/ehci-au1xxx.c
@@ -228,7 +228,7 @@ static int ehci_hcd_au1xxx_drv_suspend(struct device *dev)
 	 * the root hub is either suspended or stopped.
 	 */
 	spin_lock_irqsave(&ehci->lock, flags);
-	ehci_prepare_ports_for_controller_suspend(ehci);
+	ehci_prepare_ports_for_controller_suspend(ehci, device_may_wakeup(dev));
 	ehci_writel(ehci, 0, &ehci->regs->intr_enable);
 	(void)ehci_readl(ehci, &ehci->regs->intr_enable);
 
diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c
index 5cd967d28938..a416421abfa2 100644
--- a/drivers/usb/host/ehci-fsl.c
+++ b/drivers/usb/host/ehci-fsl.c
@@ -313,7 +313,8 @@ static int ehci_fsl_drv_suspend(struct device *dev)
 	struct ehci_fsl *ehci_fsl = hcd_to_ehci_fsl(hcd);
 	void __iomem *non_ehci = hcd->regs;
 
-	ehci_prepare_ports_for_controller_suspend(hcd_to_ehci(hcd));
+	ehci_prepare_ports_for_controller_suspend(hcd_to_ehci(hcd),
+			device_may_wakeup(dev));
 	if (!fsl_deep_sleep())
 		return 0;
 
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index 0931f5a7dec4..1292a5b2197a 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -107,7 +107,7 @@ static void ehci_handover_companion_ports(struct ehci_hcd *ehci)
 }
 
 static void ehci_adjust_port_wakeup_flags(struct ehci_hcd *ehci,
-		bool suspending)
+		bool suspending, bool do_wakeup)
 {
 	int		port;
 	u32		temp;
@@ -117,8 +117,7 @@ static void ehci_adjust_port_wakeup_flags(struct ehci_hcd *ehci,
 	 * when the controller is suspended or resumed.  In all other
 	 * cases they don't need to be changed.
 	 */
-	if (!ehci_to_hcd(ehci)->self.root_hub->do_remote_wakeup ||
-			device_may_wakeup(ehci_to_hcd(ehci)->self.controller))
+	if (!ehci_to_hcd(ehci)->self.root_hub->do_remote_wakeup || do_wakeup)
 		return;
 
 	/* clear phy low-power mode before changing wakeup flags */
diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c
index a307d550bdaf..f555e4f35a04 100644
--- a/drivers/usb/host/ehci-pci.c
+++ b/drivers/usb/host/ehci-pci.c
@@ -277,7 +277,7 @@ done:
  * Also they depend on separate root hub suspend/resume.
  */
 
-static int ehci_pci_suspend(struct usb_hcd *hcd)
+static int ehci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
 {
 	struct ehci_hcd		*ehci = hcd_to_ehci(hcd);
 	unsigned long		flags;
@@ -291,7 +291,7 @@ static int ehci_pci_suspend(struct usb_hcd *hcd)
 	 * the root hub is either suspended or stopped.
 	 */
 	spin_lock_irqsave (&ehci->lock, flags);
-	ehci_prepare_ports_for_controller_suspend(ehci);
+	ehci_prepare_ports_for_controller_suspend(ehci, do_wakeup);
 	ehci_writel(ehci, 0, &ehci->regs->intr_enable);
 	(void)ehci_readl(ehci, &ehci->regs->intr_enable);
 
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index e6c57cc416f6..a4a63ce290e9 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -540,11 +540,11 @@ struct ehci_fstn {
 
 /* Prepare the PORTSC wakeup flags during controller suspend/resume */
 
-#define ehci_prepare_ports_for_controller_suspend(ehci)		\
-		ehci_adjust_port_wakeup_flags(ehci, true);
+#define ehci_prepare_ports_for_controller_suspend(ehci, do_wakeup)	\
+		ehci_adjust_port_wakeup_flags(ehci, true, do_wakeup);
 
-#define ehci_prepare_ports_for_controller_resume(ehci)		\
-		ehci_adjust_port_wakeup_flags(ehci, false);
+#define ehci_prepare_ports_for_controller_resume(ehci)			\
+		ehci_adjust_port_wakeup_flags(ehci, false, false);
 
 /*-------------------------------------------------------------------------*/
 
diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c
index b8a1148f248e..6bdc8b25a6a1 100644
--- a/drivers/usb/host/ohci-pci.c
+++ b/drivers/usb/host/ohci-pci.c
@@ -392,7 +392,7 @@ static int __devinit ohci_pci_start (struct usb_hcd *hcd)
 
 #ifdef	CONFIG_PM
 
-static int ohci_pci_suspend(struct usb_hcd *hcd)
+static int ohci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
 {
 	struct ohci_hcd	*ohci = hcd_to_ohci (hcd);
 	unsigned long	flags;
diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c
index 2743ec770f0c..a7850f51fdc5 100644
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@@ -788,7 +788,7 @@ static int uhci_rh_resume(struct usb_hcd *hcd)
 	return rc;
 }
 
-static int uhci_pci_suspend(struct usb_hcd *hcd)
+static int uhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
 {
 	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
 	int rc = 0;
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index f8f8fa7a56e8..ae10020b4023 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -211,7 +211,7 @@ struct hc_driver {
 	 * a whole, not just the root hub; they're for PCI bus glue.
 	 */
 	/* called after suspending the hub, before entering D3 etc */
-	int	(*pci_suspend)(struct usb_hcd *hcd);
+	int	(*pci_suspend)(struct usb_hcd *hcd, bool do_wakeup);
 
 	/* called after entering D0 (etc), before resuming the hub */
 	int	(*pci_resume)(struct usb_hcd *hcd, bool hibernated);
-- 
cgit v1.2.3-59-g8ed1b


From ff2f07874362d34684296f2bd5547a099f33c6d4 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 25 Jun 2010 14:02:35 -0400
Subject: USB: fix race between root-hub wakeup & controller suspend

This patch (as1395) adds code to hcd_pci_suspend() for handling wakeup
races.  This is another general race pattern, similar to the "open
vs. unregister" race we're all familiar with.  Here, the race is
between suspending a device and receiving a wakeup request from one of
the device's suspended children.

In particular, if a root-hub wakeup is requested at about the same
time as the corresponding USB controller is suspended, and if the
controller is enabled for wakeup, then the controller should either
fail to suspend or else wake right back up again.

During system sleep this won't happen very much, especially since host
controllers generally aren't enabled for wakeup during sleep.  However
it is definitely an issue for runtime PM.  Something like this will be
needed to prevent the controller from autosuspending while waiting for
a root-hub resume to take place.  (That is, in fact, the common case,
for which there is an extra test.)

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/hcd-pci.c | 12 ++++++++++++
 drivers/usb/core/hcd.c     |  5 ++++-
 include/linux/usb/hcd.h    |  2 ++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c
index e387e394f876..352577baa53d 100644
--- a/drivers/usb/core/hcd-pci.c
+++ b/drivers/usb/core/hcd-pci.c
@@ -388,8 +388,20 @@ static int hcd_pci_suspend(struct device *dev)
 	if (hcd->driver->pci_suspend) {
 		bool	do_wakeup = device_may_wakeup(dev);
 
+		/* Optimization: Don't suspend if a root-hub wakeup is
+		 * pending and it would cause the HCD to wake up anyway.
+		 */
+		if (do_wakeup && HCD_WAKEUP_PENDING(hcd))
+			return -EBUSY;
 		retval = hcd->driver->pci_suspend(hcd, do_wakeup);
 		suspend_report_result(hcd->driver->pci_suspend, retval);
+
+		/* Check again in case wakeup raced with pci_suspend */
+		if (retval == 0 && do_wakeup && HCD_WAKEUP_PENDING(hcd)) {
+			if (hcd->driver->pci_resume)
+				hcd->driver->pci_resume(hcd, false);
+			retval = -EBUSY;
+		}
 		if (retval)
 			return retval;
 	}
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index f2fe7c8e991d..0358c05e6e8a 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1940,6 +1940,7 @@ int hcd_bus_resume(struct usb_device *rhdev, pm_message_t msg)
 
 	dev_dbg(&rhdev->dev, "usb %s%s\n",
 			(msg.event & PM_EVENT_AUTO ? "auto-" : ""), "resume");
+	clear_bit(HCD_FLAG_WAKEUP_PENDING, &hcd->flags);
 	if (!hcd->driver->bus_resume)
 		return -ENOENT;
 	if (hcd->state == HC_STATE_RUNNING)
@@ -1993,8 +1994,10 @@ void usb_hcd_resume_root_hub (struct usb_hcd *hcd)
 	unsigned long flags;
 
 	spin_lock_irqsave (&hcd_root_hub_lock, flags);
-	if (hcd->rh_registered)
+	if (hcd->rh_registered) {
+		set_bit(HCD_FLAG_WAKEUP_PENDING, &hcd->flags);
 		queue_work(pm_wq, &hcd->wakeup_work);
+	}
 	spin_unlock_irqrestore (&hcd_root_hub_lock, flags);
 }
 EXPORT_SYMBOL_GPL(usb_hcd_resume_root_hub);
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index ae10020b4023..3b571f1ffbb3 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -98,6 +98,7 @@ struct usb_hcd {
 #define HCD_FLAG_SAW_IRQ		1
 #define HCD_FLAG_POLL_RH		2	/* poll for rh status? */
 #define HCD_FLAG_POLL_PENDING		3	/* status has changed? */
+#define HCD_FLAG_WAKEUP_PENDING		4	/* root hub is resuming? */
 
 	/* The flags can be tested using these macros; they are likely to
 	 * be slightly faster than test_bit().
@@ -106,6 +107,7 @@ struct usb_hcd {
 #define HCD_SAW_IRQ(hcd)	((hcd)->flags & (1U << HCD_FLAG_SAW_IRQ))
 #define HCD_POLL_RH(hcd)	((hcd)->flags & (1U << HCD_FLAG_POLL_RH))
 #define HCD_POLL_PENDING(hcd)	((hcd)->flags & (1U << HCD_FLAG_POLL_PENDING))
+#define HCD_WAKEUP_PENDING(hcd)	((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING))
 
 	/* Flags that get set only during HCD registration or removal. */
 	unsigned		rh_registered:1;/* is root hub registered? */
-- 
cgit v1.2.3-59-g8ed1b


From 5128993b6f5f38bc567f3c246248ca29fd599132 Mon Sep 17 00:00:00 2001
From: Ajay Kumar Gupta <ajay.gupta@ti.com>
Date: Thu, 8 Jul 2010 14:03:01 +0530
Subject: USB: ulpi: fix compilation warning

Fixes below compilation warning from ulpi.h

include/linux/usb/ulpi.h:145:
        warning: 'struct otg_io_access_ops' declared inside parameter list
include/linux/usb/ulpi.h:145:
         warning: its scope is only this definition or declaration,
         which is probably not what you want

Signed-off-by: Ajay Kumar Gupta <ajay.gupta@ti.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/ulpi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h
index 2369d07c3c87..900d97b7096a 100644
--- a/include/linux/usb/ulpi.h
+++ b/include/linux/usb/ulpi.h
@@ -11,6 +11,7 @@
 #ifndef __LINUX_USB_ULPI_H
 #define __LINUX_USB_ULPI_H
 
+#include <linux/usb/otg.h>
 /*-------------------------------------------------------------------------*/
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 13dd0c9767349b280cf131c34461f85e5effc42a Mon Sep 17 00:00:00 2001
From: Igor Grinberg <grinberg@compulab.co.il>
Date: Thu, 15 Jul 2010 16:00:16 +0300
Subject: USB: otg/ulpi: extend the generic ulpi driver.

1) Introduce ulpi specific flags for control of the ulpi phy
2) Extend the generic ulpi driver with support for Function and
Interface control of upli phy
3) Update the platforms using the generic ulpi driver with new ulpi
flags
4) Remove the otg control flags not in use

Signed-off-by: Igor Grinberg <grinberg@compulab.co.il>
Signed-off-by: Mike Rapoport <mike@compulab.co.il>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/arm/mach-mx3/mach-armadillo5x0.c    |   4 +-
 arch/arm/mach-mx3/mach-mx31lilly.c       |   4 +-
 arch/arm/mach-mx3/mach-mx31lite.c        |   2 +-
 arch/arm/mach-mx3/mach-mx31moboard.c     |   2 +-
 arch/arm/mach-mx3/mach-pcm037.c          |   4 +-
 arch/arm/mach-mx3/mach-pcm043.c          |   2 +-
 arch/arm/mach-mx3/mx31moboard-smartbot.c |   2 +-
 drivers/usb/otg/ulpi.c                   | 127 ++++++++++++++++++++++++++++---
 include/linux/usb/otg.h                  |   7 --
 include/linux/usb/ulpi.h                 |  39 ++++++++++
 10 files changed, 166 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-mx3/mach-armadillo5x0.c b/arch/arm/mach-mx3/mach-armadillo5x0.c
index 96aadcadb4ff..68879c996a55 100644
--- a/arch/arm/mach-mx3/mach-armadillo5x0.c
+++ b/arch/arm/mach-mx3/mach-armadillo5x0.c
@@ -551,9 +551,9 @@ static void __init armadillo5x0_init(void)
 	/* USB */
 #if defined(CONFIG_USB_ULPI)
 	usbotg_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-			USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+			ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 	usbh2_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-			USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+			ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 
 	mxc_register_device(&mxc_otg_host, &usbotg_pdata);
 	mxc_register_device(&mxc_usbh2, &usbh2_pdata);
diff --git a/arch/arm/mach-mx3/mach-mx31lilly.c b/arch/arm/mach-mx3/mach-mx31lilly.c
index 8f66f65e80e2..7c37daabb757 100644
--- a/arch/arm/mach-mx3/mach-mx31lilly.c
+++ b/arch/arm/mach-mx3/mach-mx31lilly.c
@@ -245,9 +245,9 @@ static struct mxc_usbh_platform_data usbh2_pdata = {
 static void lilly1131_usb_init(void)
 {
 	usbotg_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-				USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+				ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 	usbh2_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-				USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+				ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 
 	mxc_register_device(&mxc_usbh1, &usbh1_pdata);
 	mxc_register_device(&mxc_usbh2, &usbh2_pdata);
diff --git a/arch/arm/mach-mx3/mach-mx31lite.c b/arch/arm/mach-mx3/mach-mx31lite.c
index da236c497d2a..f66a9576d8c2 100644
--- a/arch/arm/mach-mx3/mach-mx31lite.c
+++ b/arch/arm/mach-mx3/mach-mx31lite.c
@@ -256,7 +256,7 @@ static void __init mxc_board_init(void)
 #if defined(CONFIG_USB_ULPI)
 	/* USB */
 	usbh2_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-				USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+				ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 
 	mxc_register_device(&mxc_usbh2, &usbh2_pdata);
 #endif
diff --git a/arch/arm/mach-mx3/mach-mx31moboard.c b/arch/arm/mach-mx3/mach-mx31moboard.c
index 67776bc61c33..7a075e8bf2d4 100644
--- a/arch/arm/mach-mx3/mach-mx31moboard.c
+++ b/arch/arm/mach-mx3/mach-mx31moboard.c
@@ -412,7 +412,7 @@ static struct mxc_usbh_platform_data usbh2_pdata = {
 static int __init moboard_usbh2_init(void)
 {
 	usbh2_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-			USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+			ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 
 	return mxc_register_device(&mxc_usbh2, &usbh2_pdata);
 }
diff --git a/arch/arm/mach-mx3/mach-pcm037.c b/arch/arm/mach-mx3/mach-pcm037.c
index 8a292dd1a714..214de11b20b9 100644
--- a/arch/arm/mach-mx3/mach-pcm037.c
+++ b/arch/arm/mach-mx3/mach-pcm037.c
@@ -654,13 +654,13 @@ static void __init mxc_board_init(void)
 #if defined(CONFIG_USB_ULPI)
 	if (otg_mode_host) {
 		otg_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-				USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+				ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 
 		mxc_register_device(&mxc_otg_host, &otg_pdata);
 	}
 
 	usbh2_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-				USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+				ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 
 	mxc_register_device(&mxc_usbh2, &usbh2_pdata);
 #endif
diff --git a/arch/arm/mach-mx3/mach-pcm043.c b/arch/arm/mach-mx3/mach-pcm043.c
index 47f5311b301a..28886f0e62f9 100644
--- a/arch/arm/mach-mx3/mach-pcm043.c
+++ b/arch/arm/mach-mx3/mach-pcm043.c
@@ -378,7 +378,7 @@ static void __init mxc_board_init(void)
 #if defined(CONFIG_USB_ULPI)
 	if (otg_mode_host) {
 		otg_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-				USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+				ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 
 		mxc_register_device(&mxc_otg_host, &otg_pdata);
 	}
diff --git a/arch/arm/mach-mx3/mx31moboard-smartbot.c b/arch/arm/mach-mx3/mx31moboard-smartbot.c
index 40c3e7564cb6..417757e78c65 100644
--- a/arch/arm/mach-mx3/mx31moboard-smartbot.c
+++ b/arch/arm/mach-mx3/mx31moboard-smartbot.c
@@ -134,7 +134,7 @@ static struct mxc_usbh_platform_data otg_host_pdata = {
 static int __init smartbot_otg_host_init(void)
 {
 	otg_host_pdata.otg = otg_ulpi_create(&mxc_ulpi_access_ops,
-			USB_OTG_DRV_VBUS | USB_OTG_DRV_VBUS_EXT);
+			ULPI_OTG_DRVVBUS | ULPI_OTG_DRVVBUS_EXT);
 
 	return mxc_register_device(&mxc_otg_host, &otg_host_pdata);
 }
diff --git a/drivers/usb/otg/ulpi.c b/drivers/usb/otg/ulpi.c
index ef7dbe40f111..ccc81950822b 100644
--- a/drivers/usb/otg/ulpi.c
+++ b/drivers/usb/otg/ulpi.c
@@ -37,25 +37,106 @@ static unsigned int ulpi_ids[] = {
 	ULPI_ID(0x0424, 0x0006),        /* SMSC USB3319 */
 };
 
-static int ulpi_set_flags(struct otg_transceiver *otg)
+static int ulpi_set_otg_flags(struct otg_transceiver *otg)
 {
-	unsigned int flags = 0;
+	unsigned int flags = ULPI_OTG_CTRL_DP_PULLDOWN |
+			     ULPI_OTG_CTRL_DM_PULLDOWN;
 
-	if (otg->flags & USB_OTG_PULLUP_ID)
+	if (otg->flags & ULPI_OTG_ID_PULLUP)
 		flags |= ULPI_OTG_CTRL_ID_PULLUP;
 
-	if (otg->flags & USB_OTG_PULLDOWN_DM)
-		flags |= ULPI_OTG_CTRL_DM_PULLDOWN;
+	/*
+	 * ULPI Specification rev.1.1 default
+	 * for Dp/DmPulldown is enabled.
+	 */
+	if (otg->flags & ULPI_OTG_DP_PULLDOWN_DIS)
+		flags &= ~ULPI_OTG_CTRL_DP_PULLDOWN;
 
-	if (otg->flags & USB_OTG_PULLDOWN_DP)
-		flags |= ULPI_OTG_CTRL_DP_PULLDOWN;
+	if (otg->flags & ULPI_OTG_DM_PULLDOWN_DIS)
+		flags &= ~ULPI_OTG_CTRL_DM_PULLDOWN;
 
-	if (otg->flags & USB_OTG_EXT_VBUS_INDICATOR)
+	if (otg->flags & ULPI_OTG_EXTVBUSIND)
 		flags |= ULPI_OTG_CTRL_EXTVBUSIND;
 
 	return otg_io_write(otg, flags, ULPI_OTG_CTRL);
 }
 
+static int ulpi_set_fc_flags(struct otg_transceiver *otg)
+{
+	unsigned int flags = 0;
+
+	/*
+	 * ULPI Specification rev.1.1 default
+	 * for XcvrSelect is Full Speed.
+	 */
+	if (otg->flags & ULPI_FC_HS)
+		flags |= ULPI_FUNC_CTRL_HIGH_SPEED;
+	else if (otg->flags & ULPI_FC_LS)
+		flags |= ULPI_FUNC_CTRL_LOW_SPEED;
+	else if (otg->flags & ULPI_FC_FS4LS)
+		flags |= ULPI_FUNC_CTRL_FS4LS;
+	else
+		flags |= ULPI_FUNC_CTRL_FULL_SPEED;
+
+	if (otg->flags & ULPI_FC_TERMSEL)
+		flags |= ULPI_FUNC_CTRL_TERMSELECT;
+
+	/*
+	 * ULPI Specification rev.1.1 default
+	 * for OpMode is Normal Operation.
+	 */
+	if (otg->flags & ULPI_FC_OP_NODRV)
+		flags |= ULPI_FUNC_CTRL_OPMODE_NONDRIVING;
+	else if (otg->flags & ULPI_FC_OP_DIS_NRZI)
+		flags |= ULPI_FUNC_CTRL_OPMODE_DISABLE_NRZI;
+	else if (otg->flags & ULPI_FC_OP_NSYNC_NEOP)
+		flags |= ULPI_FUNC_CTRL_OPMODE_NOSYNC_NOEOP;
+	else
+		flags |= ULPI_FUNC_CTRL_OPMODE_NORMAL;
+
+	/*
+	 * ULPI Specification rev.1.1 default
+	 * for SuspendM is Powered.
+	 */
+	flags |= ULPI_FUNC_CTRL_SUSPENDM;
+
+	return otg_io_write(otg, flags, ULPI_FUNC_CTRL);
+}
+
+static int ulpi_set_ic_flags(struct otg_transceiver *otg)
+{
+	unsigned int flags = 0;
+
+	if (otg->flags & ULPI_IC_AUTORESUME)
+		flags |= ULPI_IFC_CTRL_AUTORESUME;
+
+	if (otg->flags & ULPI_IC_EXTVBUS_INDINV)
+		flags |= ULPI_IFC_CTRL_EXTERNAL_VBUS;
+
+	if (otg->flags & ULPI_IC_IND_PASSTHRU)
+		flags |= ULPI_IFC_CTRL_PASSTHRU;
+
+	if (otg->flags & ULPI_IC_PROTECT_DIS)
+		flags |= ULPI_IFC_CTRL_PROTECT_IFC_DISABLE;
+
+	return otg_io_write(otg, flags, ULPI_IFC_CTRL);
+}
+
+static int ulpi_set_flags(struct otg_transceiver *otg)
+{
+	int ret;
+
+	ret = ulpi_set_otg_flags(otg);
+	if (ret)
+		return ret;
+
+	ret = ulpi_set_ic_flags(otg);
+	if (ret)
+		return ret;
+
+	return ulpi_set_fc_flags(otg);
+}
+
 static int ulpi_init(struct otg_transceiver *otg)
 {
 	int i, vid, pid, ret;
@@ -80,6 +161,31 @@ static int ulpi_init(struct otg_transceiver *otg)
 	return -ENODEV;
 }
 
+static int ulpi_set_host(struct otg_transceiver *otg, struct usb_bus *host)
+{
+	unsigned int flags = otg_io_read(otg, ULPI_IFC_CTRL);
+
+	if (!host) {
+		otg->host = NULL;
+		return 0;
+	}
+
+	otg->host = host;
+
+	flags &= ~(ULPI_IFC_CTRL_6_PIN_SERIAL_MODE |
+		   ULPI_IFC_CTRL_3_PIN_SERIAL_MODE |
+		   ULPI_IFC_CTRL_CARKITMODE);
+
+	if (otg->flags & ULPI_IC_6PIN_SERIAL)
+		flags |= ULPI_IFC_CTRL_6_PIN_SERIAL_MODE;
+	else if (otg->flags & ULPI_IC_3PIN_SERIAL)
+		flags |= ULPI_IFC_CTRL_3_PIN_SERIAL_MODE;
+	else if (otg->flags & ULPI_IC_CARKIT)
+		flags |= ULPI_IFC_CTRL_CARKITMODE;
+
+	return otg_io_write(otg, flags, ULPI_IFC_CTRL);
+}
+
 static int ulpi_set_vbus(struct otg_transceiver *otg, bool on)
 {
 	unsigned int flags = otg_io_read(otg, ULPI_OTG_CTRL);
@@ -87,10 +193,10 @@ static int ulpi_set_vbus(struct otg_transceiver *otg, bool on)
 	flags &= ~(ULPI_OTG_CTRL_DRVVBUS | ULPI_OTG_CTRL_DRVVBUS_EXT);
 
 	if (on) {
-		if (otg->flags & USB_OTG_DRV_VBUS)
+		if (otg->flags & ULPI_OTG_DRVVBUS)
 			flags |= ULPI_OTG_CTRL_DRVVBUS;
 
-		if (otg->flags & USB_OTG_DRV_VBUS_EXT)
+		if (otg->flags & ULPI_OTG_DRVVBUS_EXT)
 			flags |= ULPI_OTG_CTRL_DRVVBUS_EXT;
 	}
 
@@ -111,6 +217,7 @@ otg_ulpi_create(struct otg_io_access_ops *ops,
 	otg->flags	= flags;
 	otg->io_ops	= ops;
 	otg->init	= ulpi_init;
+	otg->set_host	= ulpi_set_host;
 	otg->set_vbus	= ulpi_set_vbus;
 
 	return otg;
diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h
index 54b2c5e48b9d..545cba73ccaf 100644
--- a/include/linux/usb/otg.h
+++ b/include/linux/usb/otg.h
@@ -43,13 +43,6 @@ enum usb_xceiv_events {
 	USB_EVENT_ENUMERATED,   /* gadget driver enumerated */
 };
 
-#define USB_OTG_PULLUP_ID		(1 << 0)
-#define USB_OTG_PULLDOWN_DP		(1 << 1)
-#define USB_OTG_PULLDOWN_DM		(1 << 2)
-#define USB_OTG_EXT_VBUS_INDICATOR	(1 << 3)
-#define USB_OTG_DRV_VBUS		(1 << 4)
-#define USB_OTG_DRV_VBUS_EXT		(1 << 5)
-
 struct otg_transceiver;
 
 /* for transceivers connected thru an ULPI interface, the user must
diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h
index 900d97b7096a..82b1507f4735 100644
--- a/include/linux/usb/ulpi.h
+++ b/include/linux/usb/ulpi.h
@@ -14,6 +14,41 @@
 #include <linux/usb/otg.h>
 /*-------------------------------------------------------------------------*/
 
+/*
+ * ULPI Flags
+ */
+#define ULPI_OTG_ID_PULLUP		(1 << 0)
+#define ULPI_OTG_DP_PULLDOWN_DIS	(1 << 1)
+#define ULPI_OTG_DM_PULLDOWN_DIS	(1 << 2)
+#define ULPI_OTG_DISCHRGVBUS		(1 << 3)
+#define ULPI_OTG_CHRGVBUS		(1 << 4)
+#define ULPI_OTG_DRVVBUS		(1 << 5)
+#define ULPI_OTG_DRVVBUS_EXT		(1 << 6)
+#define ULPI_OTG_EXTVBUSIND		(1 << 7)
+
+#define ULPI_IC_6PIN_SERIAL		(1 << 8)
+#define ULPI_IC_3PIN_SERIAL		(1 << 9)
+#define ULPI_IC_CARKIT			(1 << 10)
+#define ULPI_IC_CLKSUSPM		(1 << 11)
+#define ULPI_IC_AUTORESUME		(1 << 12)
+#define ULPI_IC_EXTVBUS_INDINV		(1 << 13)
+#define ULPI_IC_IND_PASSTHRU		(1 << 14)
+#define ULPI_IC_PROTECT_DIS		(1 << 15)
+
+#define ULPI_FC_HS			(1 << 16)
+#define ULPI_FC_FS			(1 << 17)
+#define ULPI_FC_LS			(1 << 18)
+#define ULPI_FC_FS4LS			(1 << 19)
+#define ULPI_FC_TERMSEL			(1 << 20)
+#define ULPI_FC_OP_NORM			(1 << 21)
+#define ULPI_FC_OP_NODRV		(1 << 22)
+#define ULPI_FC_OP_DIS_NRZI		(1 << 23)
+#define ULPI_FC_OP_NSYNC_NEOP		(1 << 24)
+#define ULPI_FC_RST			(1 << 25)
+#define ULPI_FC_SUSPM			(1 << 26)
+
+/*-------------------------------------------------------------------------*/
+
 /*
  * Macros for Set and Clear
  * See ULPI 1.1 specification to find the registers with Set and Clear offsets
@@ -59,6 +94,10 @@
 
 /*-------------------------------------------------------------------------*/
 
+/*
+ * Register Bits
+ */
+
 /* Function Control */
 #define ULPI_FUNC_CTRL_XCVRSEL			(1 << 0)
 #define  ULPI_FUNC_CTRL_XCVRSEL_MASK		(3 << 0)
-- 
cgit v1.2.3-59-g8ed1b


From 93362a875fc69881ae69299efaf19a55a1f57db0 Mon Sep 17 00:00:00 2001
From: Phil Dibowitz <phil@ipom.com>
Date: Thu, 22 Jul 2010 00:05:01 +0200
Subject: USB delay init quirk for logitech Harmony 700-series devices

The Logitech Harmony 700 series needs an extra delay during
initialization.  This patch adds a USB quirk which enables such a delay
and adds the device to the quirks list.

Signed-off-by: Phil Dibowitz <phil@ipom.com>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/hub.c     | 6 +++++-
 drivers/usb/core/quirks.c  | 3 +++
 include/linux/usb/quirks.h | 4 ++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index d337ef80bf43..84c1897188d2 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -20,6 +20,7 @@
 #include <linux/usb.h>
 #include <linux/usbdevice_fs.h>
 #include <linux/usb/hcd.h>
+#include <linux/usb/quirks.h>
 #include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/freezer.h>
@@ -1802,7 +1803,6 @@ int usb_new_device(struct usb_device *udev)
 	pm_runtime_set_active(&udev->dev);
 	pm_runtime_enable(&udev->dev);
 
-	usb_detect_quirks(udev);
 	err = usb_enumerate_device(udev);	/* Read descriptors */
 	if (err < 0)
 		goto fail;
@@ -3114,6 +3114,10 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1,
 		if (status < 0)
 			goto loop;
 
+		usb_detect_quirks(udev);
+		if (udev->quirks & USB_QUIRK_DELAY_INIT)
+			msleep(1000);
+
 		/* consecutive bus-powered hubs aren't reliable; they can
 		 * violate the voltage drop budget.  if the new child has
 		 * a "powered" LED, users should notice we didn't enable it
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index db99c084df92..25719da45e33 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -38,6 +38,9 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Creative SB Audigy 2 NX */
 	{ USB_DEVICE(0x041e, 0x3020), .driver_info = USB_QUIRK_RESET_RESUME },
 
+	/* Logitech Harmony 700-series */
+	{ USB_DEVICE(0x046d, 0xc122), .driver_info = USB_QUIRK_DELAY_INIT },
+
 	/* Philips PSC805 audio device */
 	{ USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME },
 
diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index 16b7f3347545..3e93de7ecbc3 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h
@@ -26,4 +26,8 @@
    and can't handle talking to these interfaces */
 #define USB_QUIRK_HONOR_BNUMINTERFACES	0x00000020
 
+/* device needs a pause during initialization, after we read the device
+   descriptor */
+#define USB_QUIRK_DELAY_INIT		0x00000040
+
 #endif /* __LINUX_USB_QUIRKS_H */
-- 
cgit v1.2.3-59-g8ed1b


From c6ba1c2af2da31ffb57949edbd1dba34f97d1d4b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 29 Jul 2010 15:54:38 -0700
Subject: USB:: fix linux/usb.h kernel-doc warnings

Fix kernel-doc warnings in linux/usb.h:

Warning(include/linux/usb.h:185): No description found for parameter 'resetting_device'
Warning(include/linux/usb.h:1212): No description found for parameter 'stream_id'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index e6cbc34901f4..35fe6ab222bb 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -127,6 +127,8 @@ enum usb_interface_condition {
  *      queued reset so that usb_cancel_queued_reset() doesn't try to
  *      remove from the workqueue when running inside the worker
  *      thread. See __usb_queue_reset_device().
+ * @resetting_device: USB core reset the device, so use alt setting 0 as
+ *	current; needs bandwidth alloc after reset.
  *
  * USB device drivers attach to interfaces on a physical device.  Each
  * interface encapsulates a single high level function, such as feeding
@@ -1015,6 +1017,7 @@ typedef void (*usb_complete_t)(struct urb *);
  *	is a different endpoint (and pipe) from "out" endpoint two.
  *	The current configuration controls the existence, type, and
  *	maximum packet size of any given endpoint.
+ * @stream_id: the endpoint's stream ID for bulk streams
  * @dev: Identifies the USB device to perform the request.
  * @status: This is read in non-iso completion functions to get the
  *	status of the particular request.  ISO requests only use it
-- 
cgit v1.2.3-59-g8ed1b


From 72d2e9f9f90ccafdce1f0a4a9eaabfb031f86def Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 9 Aug 2010 16:37:16 -0700
Subject: i2c.h: fix kernel-doc warnings

Fix kernel-doc warnings in linux/i2c.h:

  Warning(include/linux/i2c.h:176): No description found for parameter 'alert'
  Warning(include/linux/i2c.h:259): No description found for parameter 'of_node'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/i2c.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 21067b418536..38dd4025aa4e 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -108,6 +108,7 @@ extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client,
  * @shutdown: Callback for device shutdown
  * @suspend: Callback for device suspend
  * @resume: Callback for device resume
+ * @alert: Alert callback, for example for the SMBus alert protocol
  * @command: Callback for bus-wide signaling (optional)
  * @driver: Device driver model driver
  * @id_table: List of I2C devices supported by this driver
@@ -233,6 +234,7 @@ static inline void i2c_set_clientdata(struct i2c_client *dev, void *data)
  * @addr: stored in i2c_client.addr
  * @platform_data: stored in i2c_client.dev.platform_data
  * @archdata: copied into i2c_client.dev.archdata
+ * @of_node: pointer to OpenFirmware device node
  * @irq: stored in i2c_client.irq
  *
  * I2C doesn't actually support hardware probing, although controllers and
-- 
cgit v1.2.3-59-g8ed1b


From 969a6e521730153380ad7781095f503c040b684c Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Tue, 10 Aug 2010 16:24:41 -0700
Subject: net: make netpoll_rx return bool for !CONFIG_NETPOLL

"netpoll: Use 'bool' for netpoll_rx() return type." missed the case when
CONFIG_NETPOLL is disabled.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 413742c92d14..791d5109f34c 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -122,7 +122,7 @@ static inline int netpoll_tx_running(struct net_device *dev)
 }
 
 #else
-static inline int netpoll_rx(struct sk_buff *skb)
+static inline bool netpoll_rx(struct sk_buff *skb)
 {
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 496ee9b8f349a8ae2065114c414a47e89bdeb930 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Thu, 22 Jul 2010 03:11:48 +0200
Subject: V7: Adjust sanity checks for some volumes

Newly mkfs-ed filesystems from Seventh Edition have last modification
time set to zero, but are otherwise perfectly valid.

Also, tighten up other sanity checks to filter out most filesystems with
different bytesex than we're using.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysv/super.c         |  6 ++++--
 include/linux/sysv_fs.h | 11 +++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 2da3075aff78..5c0aab0b7e18 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -469,7 +469,7 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 	v7sb = (struct v7_super_block *) bh->b_data;
 	if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
 	    fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
-	    fs32_to_cpu(sbi, v7sb->s_time) == 0)
+	    fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
 		goto failed;
 
 	/* plausibility check on root inode: it is a directory,
@@ -479,7 +479,9 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 	v7i = (struct sysv_inode *)(bh2->b_data + 64);
 	if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
 	    (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
-	    (fs32_to_cpu(sbi, v7i->i_size) & 017) != 0)
+	    (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
+	    (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
+	     sizeof (struct sysv_dir_entry)))
 		goto failed;
 	brelse(bh2);
 	bh2 = NULL;
diff --git a/include/linux/sysv_fs.h b/include/linux/sysv_fs.h
index 96411306eec6..e47d6d90023d 100644
--- a/include/linux/sysv_fs.h
+++ b/include/linux/sysv_fs.h
@@ -148,6 +148,17 @@ struct v7_super_block {
 	char    s_fname[6];     /* file system name */
 	char    s_fpack[6];     /* file system pack name */
 };
+/* Constants to aid sanity checking */
+/* This is not a hard limit, nor enforced by v7 kernel. It's actually just
+ * the limit used by Seventh Edition's ls, though is high enough to assume
+ * that no reasonable file system would have that much entries in root
+ * directory. Thus, if we see anything higher, we just probably got the
+ * endiannes wrong. */
+#define V7_NFILES	1024
+/* The disk addresses are three-byte (despite direct block addresses being
+ * aligned word-wise in inode). If the most significant byte is non-zero,
+ * something is most likely wrong (not a filesystem, bad bytesex). */
+#define V7_MAXSIZE	0x00ffffff
 
 /* Coherent super-block data on disk */
 #define COH_NICINOD	100	/* number of inode cache entries */
-- 
cgit v1.2.3-59-g8ed1b


From f7ad3c6be90809b53b7f0ae9d4eaa45ce2564a79 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 10 Aug 2010 11:41:36 +0200
Subject: vfs: add helpers to get root and pwd

Add three helpers that retrieve a refcounted copy of the root and cwd
from the supplied fs_struct.

 get_fs_root()
 get_fs_pwd()
 get_fs_root_and_pwd()

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cachefiles/daemon.c    | 14 ++------------
 fs/dcache.c               | 12 ++----------
 fs/fs_struct.c            |  7 +------
 fs/namei.c                | 15 +++------------
 fs/namespace.c            |  5 +----
 fs/proc/base.c            | 22 +++++++++++-----------
 include/linux/fs_struct.h | 27 +++++++++++++++++++++++++++
 kernel/auditsc.c          |  9 ++-------
 8 files changed, 49 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 72d4d0042826..727caedcdd92 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -552,7 +552,6 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
  */
 static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 {
-	struct fs_struct *fs;
 	struct path path;
 	const struct cred *saved_cred;
 	int ret;
@@ -573,11 +572,7 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 	}
 
 	/* extract the directory dentry from the cwd */
-	fs = current->fs;
-	read_lock(&fs->lock);
-	path = fs->pwd;
-	path_get(&path);
-	read_unlock(&fs->lock);
+	get_fs_pwd(current->fs, &path);
 
 	if (!S_ISDIR(path.dentry->d_inode->i_mode))
 		goto notdir;
@@ -629,7 +624,6 @@ inval:
  */
 static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 {
-	struct fs_struct *fs;
 	struct path path;
 	const struct cred *saved_cred;
 	int ret;
@@ -650,11 +644,7 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 	}
 
 	/* extract the directory dentry from the cwd */
-	fs = current->fs;
-	read_lock(&fs->lock);
-	path = fs->pwd;
-	path_get(&path);
-	read_unlock(&fs->lock);
+	get_fs_pwd(current->fs, &path);
 
 	if (!S_ISDIR(path.dentry->d_inode->i_mode))
 		goto notdir;
diff --git a/fs/dcache.c b/fs/dcache.c
index 9f2c13417969..995d08069d26 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2014,10 +2014,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	if (path->dentry->d_op && path->dentry->d_op->d_dname)
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
-	read_lock(&current->fs->lock);
-	root = current->fs->root;
-	path_get(&root);
-	read_unlock(&current->fs->lock);
+	get_fs_root(current->fs, &root);
 	spin_lock(&dcache_lock);
 	tmp = root;
 	res = __d_path(path, &tmp, buf, buflen);
@@ -2129,12 +2126,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	if (!page)
 		return -ENOMEM;
 
-	read_lock(&current->fs->lock);
-	pwd = current->fs->pwd;
-	path_get(&pwd);
-	root = current->fs->root;
-	path_get(&root);
-	read_unlock(&current->fs->lock);
+	get_fs_root_and_pwd(current->fs, &root, &pwd);
 
 	error = -ENOENT;
 	spin_lock(&dcache_lock);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee059052db5..1ee40eb9a2c0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -106,12 +106,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 		fs->in_exec = 0;
 		rwlock_init(&fs->lock);
 		fs->umask = old->umask;
-		read_lock(&old->lock);
-		fs->root = old->root;
-		path_get(&old->root);
-		fs->pwd = old->pwd;
-		path_get(&old->pwd);
-		read_unlock(&old->lock);
+		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
 	}
 	return fs;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 13ff4abdbdca..17ea76bf2fbe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -483,13 +483,8 @@ ok:
 
 static __always_inline void set_root(struct nameidata *nd)
 {
-	if (!nd->root.mnt) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->root = fs->root;
-		path_get(&nd->root);
-		read_unlock(&fs->lock);
-	}
+	if (!nd->root.mnt)
+		get_fs_root(current->fs, &nd->root);
 }
 
 static int link_path_walk(const char *, struct nameidata *);
@@ -1015,11 +1010,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 		nd->path = nd->root;
 		path_get(&nd->root);
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		read_lock(&fs->lock);
-		nd->path = fs->pwd;
-		path_get(&fs->pwd);
-		read_unlock(&fs->lock);
+		get_fs_pwd(current->fs, &nd->path);
 	} else {
 		struct dentry *dentry;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 66c4f7e781cb..7972e51ccc06 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2213,10 +2213,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out1;
 	}
 
-	read_lock(&current->fs->lock);
-	root = current->fs->root;
-	path_get(&current->fs->root);
-	read_unlock(&current->fs->lock);
+	get_fs_root(current->fs, &root);
 	down_write(&namespace_sem);
 	mutex_lock(&old.dentry->d_inode->i_mutex);
 	error = -EINVAL;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c806dfb24e08..4cf5abf77ddf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -149,18 +149,13 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
 	return count;
 }
 
-static int get_fs_path(struct task_struct *task, struct path *path, bool root)
+static int get_task_root(struct task_struct *task, struct path *root)
 {
-	struct fs_struct *fs;
 	int result = -ENOENT;
 
 	task_lock(task);
-	fs = task->fs;
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = root ? fs->root : fs->pwd;
-		path_get(path);
-		read_unlock(&fs->lock);
+	if (task->fs) {
+		get_fs_root(task->fs, root);
 		result = 0;
 	}
 	task_unlock(task);
@@ -173,7 +168,12 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 0);
+		task_lock(task);
+		if (task->fs) {
+			get_fs_pwd(task->fs, path);
+			result = 0;
+		}
+		task_unlock(task);
 		put_task_struct(task);
 	}
 	return result;
@@ -185,7 +185,7 @@ static int proc_root_link(struct inode *inode, struct path *path)
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 1);
+		result = get_task_root(task, path);
 		put_task_struct(task);
 	}
 	return result;
@@ -597,7 +597,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 				get_mnt_ns(ns);
 		}
 		rcu_read_unlock();
-		if (ns && get_fs_path(task, &root, 1) == 0)
+		if (ns && get_task_root(task, &root) == 0)
 			ret = 0;
 		put_task_struct(task);
 	}
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 78a05bfcd8eb..eca3d5202138 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -21,4 +21,31 @@ extern void free_fs_struct(struct fs_struct *);
 extern void daemonize_fs_struct(void);
 extern int unshare_fs_struct(void);
 
+static inline void get_fs_root(struct fs_struct *fs, struct path *root)
+{
+	read_lock(&fs->lock);
+	*root = fs->root;
+	path_get(root);
+	read_unlock(&fs->lock);
+}
+
+static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
+{
+	read_lock(&fs->lock);
+	*pwd = fs->pwd;
+	path_get(pwd);
+	read_unlock(&fs->lock);
+}
+
+static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root,
+				       struct path *pwd)
+{
+	read_lock(&fs->lock);
+	*root = fs->root;
+	path_get(root);
+	*pwd = fs->pwd;
+	path_get(pwd);
+	read_unlock(&fs->lock);
+}
+
 #endif /* _LINUX_FS_STRUCT_H */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b87a63beb66c..1b31c130d034 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1835,13 +1835,8 @@ void __audit_getname(const char *name)
 	context->names[context->name_count].ino  = (unsigned long)-1;
 	context->names[context->name_count].osid = 0;
 	++context->name_count;
-	if (!context->pwd.dentry) {
-		read_lock(&current->fs->lock);
-		context->pwd = current->fs->pwd;
-		path_get(&current->fs->pwd);
-		read_unlock(&current->fs->lock);
-	}
-
+	if (!context->pwd.dentry)
+		get_fs_pwd(current->fs, &context->pwd);
 }
 
 /* audit_putname - intercept a putname request
-- 
cgit v1.2.3-59-g8ed1b


From 8df9d1a4142311c084ffeeacb67cd34d190eff74 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 10 Aug 2010 11:41:41 +0200
Subject: vfs: show unreachable paths in getcwd and proc

Prepend "(unreachable)" to path strings if the path is not reachable
from the current root.

Two places updated are
 - the return string from getcwd()
 - and symlinks under /proc/$PID.

Other uses of d_path() are left unchanged (we know that some old
software crashes if /proc/mounts is changed).

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 54 ++++++++++++++++++++++++++++++++++++++++++++++----
 fs/proc/base.c         |  2 +-
 include/linux/dcache.h |  1 +
 include/linux/path.h   |  5 +++++
 4 files changed, 57 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 7fccb00f498d..166d35d56868 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2019,6 +2019,11 @@ static int path_with_deleted(const struct path *path, struct path *root,
 	return prepend_path(path, root, buf, buflen);
 }
 
+static int prepend_unreachable(char **buffer, int *buflen)
+{
+	return prepend(buffer, buflen, "(unreachable)", 13);
+}
+
 /**
  * d_path - return the path of a dentry
  * @path: path to report
@@ -2064,6 +2069,39 @@ char *d_path(const struct path *path, char *buf, int buflen)
 }
 EXPORT_SYMBOL(d_path);
 
+/**
+ * d_path_with_unreachable - return the path of a dentry
+ * @path: path to report
+ * @buf: buffer to return value in
+ * @buflen: buffer length
+ *
+ * The difference from d_path() is that this prepends "(unreachable)"
+ * to paths which are unreachable from the current process' root.
+ */
+char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
+{
+	char *res = buf + buflen;
+	struct path root;
+	struct path tmp;
+	int error;
+
+	if (path->dentry->d_op && path->dentry->d_op->d_dname)
+		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
+
+	get_fs_root(current->fs, &root);
+	spin_lock(&dcache_lock);
+	tmp = root;
+	error = path_with_deleted(path, &tmp, &res, &buflen);
+	if (!error && !path_equal(&tmp, &root))
+		error = prepend_unreachable(&res, &buflen);
+	spin_unlock(&dcache_lock);
+	path_put(&root);
+	if (error)
+		res =  ERR_PTR(error);
+
+	return res;
+}
+
 /*
  * Helper function for dentry_operations.d_dname() members
  */
@@ -2173,15 +2211,23 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
 		struct path tmp = root;
-		char * cwd;
+		char *cwd = page + PAGE_SIZE;
+		int buflen = PAGE_SIZE;
 
-		cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE);
+		prepend(&cwd, &buflen, "\0", 1);
+		error = prepend_path(&pwd, &tmp, &cwd, &buflen);
 		spin_unlock(&dcache_lock);
 
-		error = PTR_ERR(cwd);
-		if (IS_ERR(cwd))
+		if (error)
 			goto out;
 
+		/* Unreachable from current root */
+		if (!path_equal(&tmp, &root)) {
+			error = prepend_unreachable(&cwd, &buflen);
+			if (error)
+				goto out;
+		}
+
 		error = -ERANGE;
 		len = PAGE_SIZE + page - cwd;
 		if (len <= size) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4cf5abf77ddf..a1c43e7c8a7b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1526,7 +1526,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
 	if (!tmp)
 		return -ENOMEM;
 
-	pathname = d_path(path, tmp, PAGE_SIZE);
+	pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
 	len = PTR_ERR(pathname);
 	if (IS_ERR(pathname))
 		goto out;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d23be0386e2d..6a4aea30aa09 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -315,6 +315,7 @@ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 
 extern char *__d_path(const struct path *path, struct path *root, char *, int);
 extern char *d_path(const struct path *, char *, int);
+extern char *d_path_with_unreachable(const struct path *, char *, int);
 extern char *__dentry_path(struct dentry *, char *, int);
 extern char *dentry_path(struct dentry *, char *, int);
 
diff --git a/include/linux/path.h b/include/linux/path.h
index 915e0c382a51..edc98dec6266 100644
--- a/include/linux/path.h
+++ b/include/linux/path.h
@@ -12,4 +12,9 @@ struct path {
 extern void path_get(struct path *);
 extern void path_put(struct path *);
 
+static inline int path_equal(const struct path *path1, const struct path *path2)
+{
+	return path1->mnt == path2->mnt && path1->dentry == path2->dentry;
+}
+
 #endif  /* _LINUX_PATH_H */
-- 
cgit v1.2.3-59-g8ed1b


From 532490f0a5350fd92d838b7430a4c846bc8eac3f Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 2 Aug 2010 13:46:56 +0200
Subject: vfs: remove unused MNT_STRICTATIME

Commit d0adde574b8487ef30f69e2d08bba769e4be513f added MNT_STRICTATIME
but it isn't actually used (MS_STRICTATIME clears MNT_RELATIME and
MNT_NOATIME rather than setting any mount flag).

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 1 -
 include/linux/mount.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7972e51ccc06..2e10cb19c5b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -788,7 +788,6 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		{ MNT_NOATIME, ",noatime" },
 		{ MNT_NODIRATIME, ",nodiratime" },
 		{ MNT_RELATIME, ",relatime" },
-		{ MNT_STRICTATIME, ",strictatime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 907210bd9f9c..5e7a59408dd4 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -27,7 +27,6 @@ struct mnt_namespace;
 #define MNT_NODIRATIME	0x10
 #define MNT_RELATIME	0x20
 #define MNT_READONLY	0x40	/* does the user want this to be r/o? */
-#define MNT_STRICTATIME 0x80
 
 #define MNT_SHRINKABLE	0x100
 #define MNT_WRITE_HOLD	0x200
-- 
cgit v1.2.3-59-g8ed1b


From 8edf344c66a3f214d709dad1421c29d678915b3f Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 28 May 2010 09:29:15 +0900
Subject: hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h

is_vm_hugetlb_page() is a widely used inline function to insert hooks
into hugetlb code.
But we can't use it in pagemap.h because of circular dependency of
the header files. This patch removes this limitation.

Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/hugetlb.h        | 11 +----------
 include/linux/hugetlb_inline.h | 22 ++++++++++++++++++++++
 include/linux/pagemap.h        |  1 +
 3 files changed, 24 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/hugetlb_inline.h

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 78b4bc64c006..d47a7c41745d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -2,6 +2,7 @@
 #define _LINUX_HUGETLB_H
 
 #include <linux/fs.h>
+#include <linux/hugetlb_inline.h>
 
 struct ctl_table;
 struct user_struct;
@@ -14,11 +15,6 @@ struct user_struct;
 
 int PageHuge(struct page *page);
 
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
-	return vma->vm_flags & VM_HUGETLB;
-}
-
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -77,11 +73,6 @@ static inline int PageHuge(struct page *page)
 	return 0;
 }
 
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
-	return 0;
-}
-
 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 }
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
new file mode 100644
index 000000000000..cf00b6df53dc
--- /dev/null
+++ b/include/linux/hugetlb_inline.h
@@ -0,0 +1,22 @@
+#ifndef _LINUX_HUGETLB_INLINE_H
+#define _LINUX_HUGETLB_INLINE_H 1
+
+#ifdef CONFIG_HUGETLBFS
+
+#include <linux/mm.h>
+
+static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_HUGETLB;
+}
+
+#else
+
+static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+	return 0;
+}
+
+#endif
+
+#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3c62ed408492..b2bd2bae9775 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -13,6 +13,7 @@
 #include <linux/gfp.h>
 #include <linux/bitops.h>
 #include <linux/hardirq.h> /* for in_interrupt() */
+#include <linux/hugetlb_inline.h>
 
 /*
  * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
-- 
cgit v1.2.3-59-g8ed1b


From 0fe6e20b9c4c53b3e97096ee73a0857f60aad43f Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 28 May 2010 09:29:16 +0900
Subject: hugetlb, rmap: add reverse mapping for hugepage

This patch adds reverse mapping feature for hugepage by introducing
mapcount for shared/private-mapped hugepage and anon_vma for
private-mapped hugepage.

While hugepage is not currently swappable, reverse mapping can be useful
for memory error handler.

Without this patch, memory error handler cannot identify processes
using the bad hugepage nor unmap it from them. That is:
- for shared hugepage:
  we can collect processes using a hugepage through pagecache,
  but can not unmap the hugepage because of the lack of mapcount.
- for privately mapped hugepage:
  we can neither collect processes nor unmap the hugepage.
This patch solves these problems.

This patch include the bug fix given by commit 23be7468e8, so reverts it.

Dependency:
  "hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h"

ChangeLog since May 24.
- create hugetlb_inline.h and move is_vm_hugetlb_index() in it.
- move functions setting up anon_vma for hugepage into mm/rmap.c.

ChangeLog since May 13.
- rebased to 2.6.34
- fix logic error (in case that private mapping and shared mapping coexist)
- move is_vm_hugetlb_page() into include/linux/mm.h to use this function
  from linear_page_index()
- define and use linear_hugepage_index() instead of compound_order()
- use page_move_anon_rmap() in hugetlb_cow()
- copy exclusive switch of __set_page_anon_rmap() into hugepage counterpart.
- revert commit 24be7468 completely

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Acked-by: Fengguang Wu <fengguang.wu@intel.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/hugetlb.h |  1 +
 include/linux/pagemap.h |  8 ++++++-
 include/linux/poison.h  |  9 --------
 include/linux/rmap.h    |  5 +++++
 mm/hugetlb.c            | 44 ++++++++++++++++++++++++++++++++++--
 mm/rmap.c               | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 114 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d47a7c41745d..e688fd89354d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -99,6 +99,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
+#define huge_pte_offset(mm, address)	0
 
 #define hugetlb_change_protection(vma, address, end, newprot)
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b2bd2bae9775..a547d9689170 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -282,10 +282,16 @@ static inline loff_t page_offset(struct page *page)
 	return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
 }
 
+extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+				     unsigned long address);
+
 static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 					unsigned long address)
 {
-	pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
+	pgoff_t pgoff;
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		return linear_hugepage_index(vma, address);
+	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
 	pgoff += vma->vm_pgoff;
 	return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 }
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 34066ffd893d..2110a81c5e2a 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -48,15 +48,6 @@
 #define POISON_FREE	0x6b	/* for use-after-free poisoning */
 #define	POISON_END	0xa5	/* end-byte of poisoning */
 
-/********** mm/hugetlb.c **********/
-/*
- * Private mappings of hugetlb pages use this poisoned value for
- * page->mapping. The core VM should not be doing anything with this mapping
- * but futex requires the existence of some page->mapping value even though it
- * is unused if PAGE_MAPPING_ANON is set.
- */
-#define HUGETLB_POISON	((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON))
-
 /********** arch/$ARCH/mm/init.c **********/
 #define POISON_FREE_INITMEM	0xcc
 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 77216742c178..9d50e7ef5f5a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -140,6 +140,11 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned lon
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);
 
+void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
+			    unsigned long);
+void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+				unsigned long);
+
 static inline void page_dup_rmap(struct page *page)
 {
 	atomic_inc(&page->_mapcount);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009dbe..aa3c51739378 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/rmap.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 			(vma->vm_pgoff >> huge_page_order(h));
 }
 
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+				     unsigned long address)
+{
+	return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority
  * cases this will be same size as used by the page table entries.
@@ -552,6 +559,7 @@ static void free_huge_page(struct page *page)
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
+	BUG_ON(page_mapcount(page));
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
@@ -2129,6 +2137,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
 			get_page(ptepage);
+			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
@@ -2207,6 +2216,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
+		page_remove_rmap(page);
 		list_del(&page->lru);
 		put_page(page);
 	}
@@ -2272,6 +2282,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	return 1;
 }
 
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
 			struct page *pagecache_page)
@@ -2286,8 +2299,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 retry_avoidcopy:
 	/* If no-one else is actually using this page, avoid the copy
 	 * and just make the page writable */
-	avoidcopy = (page_count(old_page) == 1);
+	avoidcopy = (page_mapcount(old_page) == 1);
 	if (avoidcopy) {
+		if (!trylock_page(old_page))
+			if (PageAnon(old_page))
+				page_move_anon_rmap(old_page, vma, address);
 		set_huge_ptep_writable(vma, address, ptep);
 		return 0;
 	}
@@ -2338,6 +2354,13 @@ retry_avoidcopy:
 		return -PTR_ERR(new_page);
 	}
 
+	/*
+	 * When the original hugepage is shared one, it does not have
+	 * anon_vma prepared.
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+
 	copy_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
@@ -2352,6 +2375,8 @@ retry_avoidcopy:
 		huge_ptep_clear_flush(vma, address, ptep);
 		set_huge_pte_at(mm, address, ptep,
 				make_huge_pte(vma, new_page, 1));
+		page_remove_rmap(old_page);
+		hugepage_add_anon_rmap(new_page, vma, address);
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
@@ -2452,10 +2477,17 @@ retry:
 			spin_lock(&inode->i_lock);
 			inode->i_blocks += blocks_per_huge_page(h);
 			spin_unlock(&inode->i_lock);
+			page_dup_rmap(page);
 		} else {
 			lock_page(page);
-			page->mapping = HUGETLB_POISON;
+			if (unlikely(anon_vma_prepare(vma))) {
+				ret = VM_FAULT_OOM;
+				goto backout_unlocked;
+			}
+			hugepage_add_new_anon_rmap(page, vma, address);
 		}
+	} else {
+		page_dup_rmap(page);
 	}
 
 	/*
@@ -2507,6 +2539,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
@@ -2548,6 +2581,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 								vma, address);
 	}
 
+	if (!pagecache_page) {
+		page = pte_page(entry);
+		lock_page(page);
+	}
+
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2573,6 +2611,8 @@ out_page_table_lock:
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
+	} else {
+		unlock_page(page);
 	}
 
 out_mutex:
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..0ad53572eaf2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h>
 
@@ -326,6 +327,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	unsigned long address;
 
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		pgoff = page->index << huge_page_order(page_hstate(page));
 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 		/* page should be within @vma mapping range */
@@ -369,6 +372,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	pte_t *pte;
 	spinlock_t *ptl;
 
+	if (unlikely(PageHuge(page))) {
+		pte = huge_pte_offset(mm, address);
+		ptl = &mm->page_table_lock;
+		goto check;
+	}
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		return NULL;
@@ -389,6 +398,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	}
 
 	ptl = pte_lockptr(mm, pmd);
+check:
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
@@ -873,6 +883,12 @@ void page_remove_rmap(struct page *page)
 		page_clear_dirty(page);
 		set_page_dirty(page);
 	}
+	/*
+	 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+	 * and not charged by memcg for now.
+	 */
+	if (unlikely(PageHuge(page)))
+		return;
 	if (PageAnon(page)) {
 		mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1445,3 +1461,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
 		return rmap_walk_file(page, rmap_one, arg);
 }
 #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLBFS
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	BUG_ON(!anon_vma);
+	if (!exclusive) {
+		struct anon_vma_chain *avc;
+		avc = list_entry(vma->anon_vma_chain.prev,
+				 struct anon_vma_chain, same_vma);
+		anon_vma = avc->anon_vma;
+	}
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+	page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+			    struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	int first;
+	BUG_ON(!anon_vma);
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	first = atomic_inc_and_test(&page->_mapcount);
+	if (first)
+		__hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	atomic_set(&page->_mapcount, 0);
+	__hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLBFS */
-- 
cgit v1.2.3-59-g8ed1b


From 93f70f900da36fbc19c13c2aa04b2e468c8d00fb Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 28 May 2010 09:29:20 +0900
Subject: HWPOISON, hugetlb: isolate corrupted hugepage

If error hugepage is not in-use, we can fully recovery from error
by dequeuing it from freelist, so return RECOVERY.
Otherwise whether or not we can recovery depends on user processes,
so return DELAYED.

Dependency:
  "HWPOISON, hugetlb: enable error handling path for hugepage"

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/hugetlb.h |  2 ++
 mm/hugetlb.c            | 16 ++++++++++++++++
 mm/memory-failure.c     | 28 ++++++++++++++++++++--------
 3 files changed, 38 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e688fd89354d..f479700df61b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,6 +43,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+void __isolate_hwpoisoned_huge_page(struct page *page);
 
 extern unsigned long hugepages_treat_as_movable;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -100,6 +101,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
 #define huge_pte_offset(mm, address)	0
+#define __isolate_hwpoisoned_huge_page(page)	0
 
 #define hugetlb_change_protection(vma, address, end, newprot)
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aa3c51739378..8c163f64cf10 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2825,3 +2825,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	hugetlb_put_quota(inode->i_mapping, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
+
+/*
+ * This function is called from memory failure code.
+ * Assume the caller holds page lock of the head page.
+ */
+void __isolate_hwpoisoned_huge_page(struct page *hpage)
+{
+	struct hstate *h = page_hstate(hpage);
+	int nid = page_to_nid(hpage);
+
+	spin_lock(&hugetlb_lock);
+	list_del(&hpage->lru);
+	h->free_huge_pages--;
+	h->free_huge_pages_node[nid]--;
+	spin_unlock(&hugetlb_lock);
+}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 473f15a3356d..d0b420aba726 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -690,17 +690,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
 /*
  * Huge pages. Needs work.
  * Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ *   To narrow down kill region to one page, we need to break up pmd.
+ * - To support soft-offlining for hugepage, we need to support hugepage
+ *   migration.
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
-	return FAILED;
+	struct page *hpage = compound_head(p);
+	/*
+	 * We can safely recover from error on free or reserved (i.e.
+	 * not in-use) hugepage by dequeuing it from freelist.
+	 * To check whether a hugepage is in-use or not, we can't use
+	 * page->lru because it can be used in other hugepage operations,
+	 * such as __unmap_hugepage_range() and gather_surplus_pages().
+	 * So instead we use page_mapping() and PageAnon().
+	 * We assume that this function is called with page lock held,
+	 * so there is no race between isolation and mapping/unmapping.
+	 */
+	if (!(page_mapping(hpage) || PageAnon(hpage))) {
+		__isolate_hwpoisoned_huge_page(hpage);
+		return RECOVERED;
+	}
+	return DELAYED;
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From e3390f67a7267daa227380b6f1bbf13c7ddd4aff Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Tue, 15 Jun 2010 13:18:13 +0900
Subject: hwpoison: rename CONFIG

CONFIG_HUGETLBFS controls hugetlbfs interface code.
OTOH, CONFIG_HUGETLB_PAGE controls hugepage management code.
So we should use CONFIG_HUGETLB_PAGE here.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/hugetlb_inline.h | 4 ++--
 mm/rmap.c                      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index cf00b6df53dc..6931489a5c14 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_HUGETLB_INLINE_H
-#define _LINUX_HUGETLB_INLINE_H 1
+#define _LINUX_HUGETLB_INLINE_H
 
-#ifdef CONFIG_HUGETLBFS
+#ifdef CONFIG_HUGETLB_PAGE
 
 #include <linux/mm.h>
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 0ad53572eaf2..71bd30a147cf 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1462,7 +1462,7 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
 }
 #endif /* CONFIG_MIGRATION */
 
-#ifdef CONFIG_HUGETLBFS
+#ifdef CONFIG_HUGETLB_PAGE
 /*
  * The following three functions are for anonymous (private mapped) hugepages.
  * Unlike common anonymous pages, anonymous hugepages have no accounting code
@@ -1503,4 +1503,4 @@ void hugepage_add_new_anon_rmap(struct page *page,
 	atomic_set(&page->_mapcount, 0);
 	__hugepage_set_anon_rmap(page, vma, address, 1);
 }
-#endif /* CONFIG_HUGETLBFS */
+#endif /* CONFIG_HUGETLB_PAGE */
-- 
cgit v1.2.3-59-g8ed1b


From 156f252857dfd81f03d77d09e33b5f7d2b113e2b Mon Sep 17 00:00:00 2001
From: Kyungmin Park <kyungmin.park@samsung.com>
Date: Wed, 16 Jun 2010 09:04:16 +0200
Subject: drivers: regulator: add Maxim 8998 driver

Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>

This patch adds voltage regulator driver for Maxim 8998 chip. This chip
is used on Samsung Aquila and GONI boards and provides following
functionalities:
- 4 BUCK voltage converters, 17 LDO power regulators and 5 other power
  controllers
- battery charger

This patch adds basic driver for voltage regulators and MAX 8998 MFD core.

Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Acked-by: Samuel Ortiz <sameo@linux.intel.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 drivers/mfd/Kconfig                 |  10 +
 drivers/mfd/Makefile                |   1 +
 drivers/mfd/max8998.c               | 160 ++++++++++
 drivers/regulator/Kconfig           |   9 +
 drivers/regulator/Makefile          |   1 +
 drivers/regulator/max8998.c         | 610 ++++++++++++++++++++++++++++++++++++
 include/linux/mfd/max8998-private.h | 112 +++++++
 include/linux/mfd/max8998.h         |  78 +++++
 8 files changed, 981 insertions(+)
 create mode 100644 drivers/mfd/max8998.c
 create mode 100644 drivers/regulator/max8998.c
 create mode 100644 include/linux/mfd/max8998-private.h
 create mode 100644 include/linux/mfd/max8998.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 9da0e504bbe9..ad61a9e8e04e 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -252,6 +252,16 @@ config MFD_MAX8925
 	  accessing the device, additional drivers must be enabled in order
 	  to use the functionality of the device.
 
+config MFD_MAX8998
+	bool "Maxim Semiconductor MAX8998 PMIC Support"
+	depends on I2C=y
+	select MFD_CORE
+	help
+	  Say yes here to support for Maxim Semiconductor MAX8998. This is
+	  a Power Management IC. This driver provies common support for
+	  accessing the device, additional drivers must be enabled in order
+	  to use the functionality of the device.
+
 config MFD_WM8400
 	tristate "Support Wolfson Microelectronics WM8400"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index fb503e77dc60..a362ccfe8997 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_UCB1400_CORE)	+= ucb1400_core.o
 obj-$(CONFIG_PMIC_DA903X)	+= da903x.o
 max8925-objs			:= max8925-core.o max8925-i2c.o
 obj-$(CONFIG_MFD_MAX8925)	+= max8925.o
+obj-$(CONFIG_MFD_MAX8998)	+= max8998.o
 
 pcf50633-objs			:= pcf50633-core.o pcf50633-irq.o
 obj-$(CONFIG_MFD_PCF50633)	+= pcf50633.o
diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c
new file mode 100644
index 000000000000..0d68de21ea9e
--- /dev/null
+++ b/drivers/mfd/max8998.c
@@ -0,0 +1,160 @@
+/*
+ * max8698.c - mfd core driver for the Maxim 8998
+ *
+ *  Copyright (C) 2009-2010 Samsung Electronics
+ *  Kyungmin Park <kyungmin.park@samsung.com>
+ *  Marek Szyprowski <m.szyprowski@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/mutex.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/max8998.h>
+#include <linux/mfd/max8998-private.h>
+
+static struct mfd_cell max8998_devs[] = {
+	{
+		.name = "max8998-pmic",
+	}
+};
+
+static int max8998_i2c_device_read(struct max8998_dev *max8998, u8 reg, u8 *dest)
+{
+	struct i2c_client *client = max8998->i2c_client;
+	int ret;
+
+	mutex_lock(&max8998->iolock);
+	ret = i2c_smbus_read_byte_data(client, reg);
+	mutex_unlock(&max8998->iolock);
+	if (ret < 0)
+		return ret;
+
+	ret &= 0xff;
+	*dest = ret;
+	return 0;
+}
+
+static int max8998_i2c_device_write(struct max8998_dev *max8998, u8 reg, u8 value)
+{
+	struct i2c_client *client = max8998->i2c_client;
+	int ret;
+
+	mutex_lock(&max8998->iolock);
+	ret = i2c_smbus_write_byte_data(client, reg, value);
+	mutex_unlock(&max8998->iolock);
+	return ret;
+}
+
+static int max8998_i2c_device_update(struct max8998_dev *max8998, u8 reg,
+				     u8 val, u8 mask)
+{
+	struct i2c_client *client = max8998->i2c_client;
+	int ret;
+
+	mutex_lock(&max8998->iolock);
+	ret = i2c_smbus_read_byte_data(client, reg);
+	if (ret >= 0) {
+		u8 old_val = ret & 0xff;
+		u8 new_val = (val & mask) | (old_val & (~mask));
+		ret = i2c_smbus_write_byte_data(client, reg, new_val);
+		if (ret >= 0)
+			ret = 0;
+	}
+	mutex_unlock(&max8998->iolock);
+	return ret;
+}
+
+static int max8998_i2c_probe(struct i2c_client *i2c,
+			    const struct i2c_device_id *id)
+{
+	struct max8998_dev *max8998;
+	int ret = 0;
+
+	max8998 = kzalloc(sizeof(struct max8998_dev), GFP_KERNEL);
+	if (max8998 == NULL) {
+		kfree(i2c);
+		return -ENOMEM;
+	}
+
+	i2c_set_clientdata(i2c, max8998);
+	max8998->dev = &i2c->dev;
+	max8998->i2c_client = i2c;
+	max8998->dev_read = max8998_i2c_device_read;
+	max8998->dev_write = max8998_i2c_device_write;
+	max8998->dev_update = max8998_i2c_device_update;
+	mutex_init(&max8998->iolock);
+
+	ret = mfd_add_devices(max8998->dev, -1,
+			      max8998_devs, ARRAY_SIZE(max8998_devs),
+			      NULL, 0);
+	if (ret < 0)
+		goto err;
+
+	return ret;
+
+err:
+	mfd_remove_devices(max8998->dev);
+	kfree(max8998);
+	return ret;
+}
+
+static int max8998_i2c_remove(struct i2c_client *i2c)
+{
+	struct max8998_dev *max8998 = i2c_get_clientdata(i2c);
+
+	mfd_remove_devices(max8998->dev);
+	kfree(max8998);
+
+	return 0;
+}
+
+static const struct i2c_device_id max8998_i2c_id[] = {
+       { "max8998", 0 },
+       { }
+};
+MODULE_DEVICE_TABLE(i2c, max8998_i2c_id);
+
+static struct i2c_driver max8998_i2c_driver = {
+	.driver = {
+		   .name = "max8998",
+		   .owner = THIS_MODULE,
+	},
+	.probe = max8998_i2c_probe,
+	.remove = max8998_i2c_remove,
+	.id_table = max8998_i2c_id,
+};
+
+static int __init max8998_i2c_init(void)
+{
+	return i2c_add_driver(&max8998_i2c_driver);
+}
+/* init early so consumer devices can complete system boot */
+subsys_initcall(max8998_i2c_init);
+
+static void __exit max8998_i2c_exit(void)
+{
+	i2c_del_driver(&max8998_i2c_driver);
+}
+module_exit(max8998_i2c_exit);
+
+MODULE_DESCRIPTION("MAXIM 8998 multi-function core driver");
+MODULE_AUTHOR("Kyungmin Park <kyungmin.park@samsung.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 6bf56c93a336..671b81a39482 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -100,6 +100,15 @@ config REGULATOR_MAX8925
 	help
 	  Say y here to support the voltage regulaltor of Maxim MAX8925 PMIC.
 
+config REGULATOR_MAX8998
+	tristate "Maxim 8998 voltage regulator"
+	depends on I2C
+	default n
+	help
+	  This driver controls a Maxim 8998 voltage output regulator
+	  via I2C bus. The provided regulator is suitable for S3C6410
+	  and S5PC1XX chips to control VCC_CORE and VCC_USIM voltages.
+
 config REGULATOR_TWL4030
 	bool "TI TWL4030/TWL5030/TWL6030/TPS695x0 PMIC"
 	depends on TWL4030_CORE
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index 451a14a873f6..74a4638bd9a6 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_REGULATOR_TWL4030) += twl-regulator.o
 obj-$(CONFIG_REGULATOR_MAX8649)	+= max8649.o
 obj-$(CONFIG_REGULATOR_MAX8660) += max8660.o
 obj-$(CONFIG_REGULATOR_MAX8925) += max8925-regulator.o
+obj-$(CONFIG_REGULATOR_MAX8998) += max8998.o
 obj-$(CONFIG_REGULATOR_WM831X) += wm831x-dcdc.o
 obj-$(CONFIG_REGULATOR_WM831X) += wm831x-isink.o
 obj-$(CONFIG_REGULATOR_WM831X) += wm831x-ldo.o
diff --git a/drivers/regulator/max8998.c b/drivers/regulator/max8998.c
new file mode 100644
index 000000000000..174fd1957ae4
--- /dev/null
+++ b/drivers/regulator/max8998.c
@@ -0,0 +1,610 @@
+/*
+ * max8998.c - Voltage regulator driver for the Maxim 8998
+ *
+ *  Copyright (C) 2009-2010 Samsung Electronics
+ *  Kyungmin Park <kyungmin.park@samsung.com>
+ *  Marek Szyprowski <m.szyprowski@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+#include <linux/err.h>
+#include <linux/gpio.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/driver.h>
+#include <linux/mfd/max8998.h>
+#include <linux/mfd/max8998-private.h>
+
+struct max8998_data {
+	struct device		*dev;
+	struct max8998_dev	*iodev;
+	int			num_regulators;
+	struct regulator_dev	**rdev;
+};
+
+struct voltage_map_desc {
+	int min;
+	int max;
+	int step;
+};
+
+/* Voltage maps */
+static const struct voltage_map_desc ldo23_voltage_map_desc = {
+	.min = 800,	.step = 50,	.max = 1300,
+};
+static const struct voltage_map_desc ldo456711_voltage_map_desc = {
+	.min = 1600,	.step = 100,	.max = 3600,
+};
+static const struct voltage_map_desc ldo8_voltage_map_desc = {
+	.min = 3000,	.step = 100,	.max = 3600,
+};
+static const struct voltage_map_desc ldo9_voltage_map_desc = {
+	.min = 2800,	.step = 100,	.max = 3100,
+};
+static const struct voltage_map_desc ldo10_voltage_map_desc = {
+	.min = 950,	.step = 50,	.max = 1300,
+};
+static const struct voltage_map_desc ldo1213_voltage_map_desc = {
+	.min = 800,	.step = 100,	.max = 3300,
+};
+static const struct voltage_map_desc ldo1415_voltage_map_desc = {
+	.min = 1200,	.step = 100,	.max = 3300,
+};
+static const struct voltage_map_desc ldo1617_voltage_map_desc = {
+	.min = 1600,	.step = 100,	.max = 3600,
+};
+static const struct voltage_map_desc buck12_voltage_map_desc = {
+	.min = 750,	.step = 25,	.max = 1525,
+};
+static const struct voltage_map_desc buck3_voltage_map_desc = {
+	.min = 1600,	.step = 100,	.max = 3600,
+};
+static const struct voltage_map_desc buck4_voltage_map_desc = {
+	.min = 800,	.step = 100,	.max = 2300,
+};
+
+static const struct voltage_map_desc *ldo_voltage_map[] = {
+	NULL,
+	NULL,
+	&ldo23_voltage_map_desc,	/* LDO2 */
+	&ldo23_voltage_map_desc,	/* LDO3 */
+	&ldo456711_voltage_map_desc,	/* LDO4 */
+	&ldo456711_voltage_map_desc,	/* LDO5 */
+	&ldo456711_voltage_map_desc,	/* LDO6 */
+	&ldo456711_voltage_map_desc,	/* LDO7 */
+	&ldo8_voltage_map_desc,		/* LDO8 */
+	&ldo9_voltage_map_desc,		/* LDO9 */
+	&ldo10_voltage_map_desc,	/* LDO10 */
+	&ldo456711_voltage_map_desc,	/* LDO11 */
+	&ldo1213_voltage_map_desc,	/* LDO12 */
+	&ldo1213_voltage_map_desc,	/* LDO13 */
+	&ldo1415_voltage_map_desc,	/* LDO14 */
+	&ldo1415_voltage_map_desc,	/* LDO15 */
+	&ldo1617_voltage_map_desc,	/* LDO16 */
+	&ldo1617_voltage_map_desc,	/* LDO17 */
+	&buck12_voltage_map_desc,	/* BUCK1 */
+	&buck12_voltage_map_desc,	/* BUCK2 */
+	&buck3_voltage_map_desc,	/* BUCK3 */
+	&buck4_voltage_map_desc,	/* BUCK4 */
+};
+
+static inline int max8998_get_ldo(struct regulator_dev *rdev)
+{
+	return rdev_get_id(rdev);
+}
+
+static int max8998_list_voltage(struct regulator_dev *rdev,
+				unsigned int selector)
+{
+	const struct voltage_map_desc *desc;
+	int ldo = max8998_get_ldo(rdev);
+	int val;
+
+	if (ldo > ARRAY_SIZE(ldo_voltage_map))
+		return -EINVAL;
+
+	desc = ldo_voltage_map[ldo];
+	if (desc == NULL)
+		return -EINVAL;
+
+	val = desc->min + desc->step * selector;
+	if (val > desc->max)
+		return -EINVAL;
+
+	return val * 1000;
+}
+
+static int max8998_get_enable_register(struct regulator_dev *rdev,
+					int *reg, int *shift)
+{
+	int ldo = max8998_get_ldo(rdev);
+
+	switch (ldo) {
+	case MAX8998_LDO2 ... MAX8998_LDO5:
+		*reg = MAX8998_REG_ONOFF1;
+		*shift = 3 - (ldo - MAX8998_LDO2);
+		break;
+	case MAX8998_LDO6 ... MAX8998_LDO13:
+		*reg = MAX8998_REG_ONOFF2;
+		*shift = 7 - (ldo - MAX8998_LDO6);
+		break;
+	case MAX8998_LDO14 ... MAX8998_LDO17:
+		*reg = MAX8998_REG_ONOFF3;
+		*shift = 7 - (ldo - MAX8998_LDO14);
+		break;
+	case MAX8998_BUCK1 ... MAX8998_BUCK4:
+		*reg = MAX8998_REG_ONOFF1;
+		*shift = 7 - (ldo - MAX8998_BUCK1);
+		break;
+	case MAX8998_EN32KHZ_AP ... MAX8998_ENVICHG:
+		*reg = MAX8998_REG_ONOFF4;
+		*shift = 7 - (ldo - MAX8998_EN32KHZ_AP);
+		break;
+	case MAX8998_ESAFEOUT1 ... MAX8998_ESAFEOUT2:
+		*reg = MAX8998_REG_CHGR2;
+		*shift = 7 - (ldo - MAX8998_ESAFEOUT1);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int max8998_ldo_is_enabled(struct regulator_dev *rdev)
+{
+	struct max8998_data *max8998 = rdev_get_drvdata(rdev);
+	int ret, reg, shift = 8;
+	u8 val;
+
+	ret = max8998_get_enable_register(rdev, &reg, &shift);
+	if (ret)
+		return ret;
+
+	ret = max8998_read_reg(max8998->iodev, reg, &val);
+	if (ret)
+		return ret;
+
+	return val & (1 << shift);
+}
+
+static int max8998_ldo_enable(struct regulator_dev *rdev)
+{
+	struct max8998_data *max8998 = rdev_get_drvdata(rdev);
+	int reg, shift = 8, ret;
+
+	ret = max8998_get_enable_register(rdev, &reg, &shift);
+	if (ret)
+		return ret;
+
+	return max8998_update_reg(max8998->iodev, reg, 1<<shift, 1<<shift);
+}
+
+static int max8998_ldo_disable(struct regulator_dev *rdev)
+{
+	struct max8998_data *max8998 = rdev_get_drvdata(rdev);
+	int reg, shift = 8, ret;
+
+	ret = max8998_get_enable_register(rdev, &reg, &shift);
+	if (ret)
+		return ret;
+
+	return max8998_update_reg(max8998->iodev, reg, 0, 1<<shift);
+}
+
+static int max8998_get_voltage_register(struct regulator_dev *rdev,
+				int *_reg, int *_shift, int *_mask)
+{
+	int ldo = max8998_get_ldo(rdev);
+	int reg, shift = 0, mask = 0xff;
+
+	switch (ldo) {
+	case MAX8998_LDO2 ... MAX8998_LDO3:
+		reg = MAX8998_REG_LDO2_LDO3;
+		mask = 0xf;
+		if (ldo == MAX8998_LDO2)
+			shift = 4;
+		else
+			shift = 0;
+		break;
+	case MAX8998_LDO4 ... MAX8998_LDO7:
+		reg = MAX8998_REG_LDO4 + (ldo - MAX8998_LDO4);
+		break;
+	case MAX8998_LDO8 ... MAX8998_LDO9:
+		reg = MAX8998_REG_LDO8_LDO9;
+		mask = 0xf;
+		if (ldo == MAX8998_LDO8)
+			shift = 4;
+		else
+			shift = 0;
+		break;
+	case MAX8998_LDO10 ... MAX8998_LDO11:
+		reg = MAX8998_REG_LDO10_LDO11;
+		if (ldo == MAX8998_LDO10) {
+			shift = 5;
+			mask = 0x7;
+		} else {
+			shift = 0;
+			mask = 0x1f;
+		}
+		break;
+	case MAX8998_LDO12 ... MAX8998_LDO17:
+		reg = MAX8998_REG_LDO12 + (ldo - MAX8998_LDO12);
+		break;
+	case MAX8998_BUCK1:
+		reg = MAX8998_REG_BUCK1_DVSARM1;
+		break;
+	case MAX8998_BUCK2:
+		reg = MAX8998_REG_BUCK2_DVSINT1;
+		break;
+	case MAX8998_BUCK3:
+		reg = MAX8998_REG_BUCK3;
+		break;
+	case MAX8998_BUCK4:
+		reg = MAX8998_REG_BUCK4;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*_reg = reg;
+	*_shift = shift;
+	*_mask = mask;
+
+	return 0;
+}
+
+static int max8998_get_voltage(struct regulator_dev *rdev)
+{
+	struct max8998_data *max8998 = rdev_get_drvdata(rdev);
+	int reg, shift = 0, mask, ret;
+	u8 val;
+
+	ret = max8998_get_voltage_register(rdev, &reg, &shift, &mask);
+	if (ret)
+		return ret;
+
+	ret = max8998_read_reg(max8998->iodev, reg, &val);
+	if (ret)
+		return ret;
+
+	val >>= shift;
+	val &= mask;
+
+	return max8998_list_voltage(rdev, val);
+}
+
+static int max8998_set_voltage(struct regulator_dev *rdev,
+				int min_uV, int max_uV)
+{
+	struct max8998_data *max8998 = rdev_get_drvdata(rdev);
+	int min_vol = min_uV / 1000, max_vol = max_uV / 1000;
+	const struct voltage_map_desc *desc;
+	int ldo = max8998_get_ldo(rdev);
+	int reg, shift = 0, mask, ret;
+	int i = 0;
+
+	if (ldo > ARRAY_SIZE(ldo_voltage_map))
+		return -EINVAL;
+
+	desc = ldo_voltage_map[ldo];
+	if (desc == NULL)
+		return -EINVAL;
+
+	if (max_vol < desc->min || min_vol > desc->max)
+		return -EINVAL;
+
+	while (desc->min + desc->step*i < max_vol &&
+	       desc->min + desc->step*i < desc->max)
+		i++;
+
+	ret = max8998_get_voltage_register(rdev, &reg, &shift, &mask);
+	if (ret)
+		return ret;
+
+	return max8998_update_reg(max8998->iodev, reg, i<<shift, mask<<shift);
+}
+
+static struct regulator_ops max8998_ldo_ops = {
+	.list_voltage		= max8998_list_voltage,
+	.is_enabled		= max8998_ldo_is_enabled,
+	.enable			= max8998_ldo_enable,
+	.disable		= max8998_ldo_disable,
+	.get_voltage		= max8998_get_voltage,
+	.set_voltage		= max8998_set_voltage,
+	.set_suspend_enable	= max8998_ldo_enable,
+	.set_suspend_disable	= max8998_ldo_disable,
+};
+
+static struct regulator_ops max8998_buck_ops = {
+	.list_voltage		= max8998_list_voltage,
+	.is_enabled		= max8998_ldo_is_enabled,
+	.enable			= max8998_ldo_enable,
+	.disable		= max8998_ldo_disable,
+	.get_voltage		= max8998_get_voltage,
+	.set_voltage		= max8998_set_voltage,
+	.set_suspend_enable	= max8998_ldo_enable,
+	.set_suspend_disable	= max8998_ldo_disable,
+};
+
+static struct regulator_ops max8998_others_ops = {
+	.is_enabled		= max8998_ldo_is_enabled,
+	.enable			= max8998_ldo_enable,
+	.disable		= max8998_ldo_disable,
+	.set_suspend_enable	= max8998_ldo_enable,
+	.set_suspend_disable	= max8998_ldo_disable,
+};
+
+static struct regulator_desc regulators[] = {
+	{
+		.name		= "LDO2",
+		.id		= MAX8998_LDO2,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO3",
+		.id		= MAX8998_LDO3,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO4",
+		.id		= MAX8998_LDO4,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO5",
+		.id		= MAX8998_LDO5,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO6",
+		.id		= MAX8998_LDO6,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO7",
+		.id		= MAX8998_LDO7,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO8",
+		.id		= MAX8998_LDO8,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO9",
+		.id		= MAX8998_LDO9,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO10",
+		.id		= MAX8998_LDO10,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO11",
+		.id		= MAX8998_LDO11,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO12",
+		.id		= MAX8998_LDO12,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO13",
+		.id		= MAX8998_LDO13,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO14",
+		.id		= MAX8998_LDO14,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO15",
+		.id		= MAX8998_LDO15,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO16",
+		.id		= MAX8998_LDO16,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "LDO17",
+		.id		= MAX8998_LDO17,
+		.ops		= &max8998_ldo_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "BUCK1",
+		.id		= MAX8998_BUCK1,
+		.ops		= &max8998_buck_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "BUCK2",
+		.id		= MAX8998_BUCK2,
+		.ops		= &max8998_buck_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "BUCK3",
+		.id		= MAX8998_BUCK3,
+		.ops		= &max8998_buck_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "BUCK4",
+		.id		= MAX8998_BUCK4,
+		.ops		= &max8998_buck_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "EN32KHz AP",
+		.id		= MAX8998_EN32KHZ_AP,
+		.ops		= &max8998_others_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "EN32KHz CP",
+		.id		= MAX8998_EN32KHZ_CP,
+		.ops		= &max8998_others_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "ENVICHG",
+		.id		= MAX8998_ENVICHG,
+		.ops		= &max8998_others_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "ESAFEOUT1",
+		.id		= MAX8998_ESAFEOUT1,
+		.ops		= &max8998_others_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}, {
+		.name		= "ESAFEOUT2",
+		.id		= MAX8998_ESAFEOUT2,
+		.ops		= &max8998_others_ops,
+		.type		= REGULATOR_VOLTAGE,
+		.owner		= THIS_MODULE,
+	}
+};
+
+static __devinit int max8998_pmic_probe(struct platform_device *pdev)
+{
+	struct max8998_dev *iodev = dev_get_drvdata(pdev->dev.parent);
+	struct max8998_platform_data *pdata = dev_get_platdata(iodev->dev);
+	struct regulator_dev **rdev;
+	struct max8998_data *max8998;
+	int i, ret, size;
+
+	if (!pdata) {
+		dev_err(pdev->dev.parent, "No platform init data supplied\n");
+		return -ENODEV;
+	}
+
+	max8998 = kzalloc(sizeof(struct max8998_data), GFP_KERNEL);
+	if (!max8998)
+		return -ENOMEM;
+
+	size = sizeof(struct regulator_dev *) * (pdata->num_regulators + 1);
+	max8998->rdev = kzalloc(size, GFP_KERNEL);
+	if (!max8998->rdev) {
+		kfree(max8998);
+		return -ENOMEM;
+	}
+
+	rdev = max8998->rdev;
+	max8998->iodev = iodev;
+	platform_set_drvdata(pdev, max8998);
+
+	for (i = 0; i < pdata->num_regulators; i++) {
+		const struct voltage_map_desc *desc;
+		int id = pdata->regulators[i].id;
+		int index = id - MAX8998_LDO2;
+
+		desc = ldo_voltage_map[id];
+		if (desc && regulators[index].ops != &max8998_others_ops) {
+			int count = (desc->max - desc->min) / desc->step + 1;
+			regulators[index].n_voltages = count;
+		}
+		rdev[i] = regulator_register(&regulators[index], max8998->dev,
+				pdata->regulators[i].initdata, max8998);
+		if (IS_ERR(rdev[i])) {
+			ret = PTR_ERR(rdev[i]);
+			dev_err(max8998->dev, "regulator init failed\n");
+			rdev[i] = NULL;
+			goto err;
+		}
+	}
+
+
+	return 0;
+err:
+	for (i = 0; i <= max8998->num_regulators; i++)
+		if (rdev[i])
+			regulator_unregister(rdev[i]);
+
+	kfree(max8998->rdev);
+	kfree(max8998);
+
+	return ret;
+}
+
+static int __devexit max8998_pmic_remove(struct platform_device *pdev)
+{
+	struct max8998_data *max8998 = platform_get_drvdata(pdev);
+	struct regulator_dev **rdev = max8998->rdev;
+	int i;
+
+	for (i = 0; i <= max8998->num_regulators; i++)
+		if (rdev[i])
+			regulator_unregister(rdev[i]);
+
+	kfree(max8998->rdev);
+	kfree(max8998);
+
+	return 0;
+}
+
+static struct platform_driver max8998_pmic_driver = {
+	.driver = {
+		.name = "max8998-pmic",
+		.owner = THIS_MODULE,
+	},
+	.probe = max8998_pmic_probe,
+	.remove = __devexit_p(max8998_pmic_remove),
+};
+
+static int __init max8998_pmic_init(void)
+{
+	return platform_driver_register(&max8998_pmic_driver);
+}
+subsys_initcall(max8998_pmic_init);
+
+static void __exit max8998_pmic_cleanup(void)
+{
+	platform_driver_unregister(&max8998_pmic_driver);
+}
+module_exit(max8998_pmic_cleanup);
+
+MODULE_DESCRIPTION("MAXIM 8998 voltage regulator driver");
+MODULE_AUTHOR("Kyungmin Park <kyungmin.park@samsung.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/max8998-private.h b/include/linux/mfd/max8998-private.h
new file mode 100644
index 000000000000..6dc75b3e2d33
--- /dev/null
+++ b/include/linux/mfd/max8998-private.h
@@ -0,0 +1,112 @@
+/*
+ * max8698.h - Voltage regulator driver for the Maxim 8998
+ *
+ *  Copyright (C) 2009-2010 Samsung Electrnoics
+ *  Kyungmin Park <kyungmin.park@samsung.com>
+ *  Marek Szyprowski <m.szyprowski@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __LINUX_MFD_MAX8998_PRIV_H
+#define __LINUX_MFD_MAX8998_PRIV_H
+
+/* MAX 8998 registers */
+enum {
+	MAX8998_REG_IRQ1,
+	MAX8998_REG_IRQ2,
+	MAX8998_REG_IRQ3,
+	MAX8998_REG_IRQ4,
+	MAX8998_REG_IRQM1,
+	MAX8998_REG_IRQM2,
+	MAX8998_REG_IRQM3,
+	MAX8998_REG_IRQM4,
+	MAX8998_REG_STATUS1,
+	MAX8998_REG_STATUS2,
+	MAX8998_REG_STATUSM1,
+	MAX8998_REG_STATUSM2,
+	MAX8998_REG_CHGR1,
+	MAX8998_REG_CHGR2,
+	MAX8998_REG_LDO_ACTIVE_DISCHARGE1,
+	MAX8998_REG_LDO_ACTIVE_DISCHARGE2,
+	MAX8998_REG_BUCK_ACTIVE_DISCHARGE3,
+	MAX8998_REG_ONOFF1,
+	MAX8998_REG_ONOFF2,
+	MAX8998_REG_ONOFF3,
+	MAX8998_REG_ONOFF4,
+	MAX8998_REG_BUCK1_DVSARM1,
+	MAX8998_REG_BUCK1_DVSARM2,
+	MAX8998_REG_BUCK1_DVSARM3,
+	MAX8998_REG_BUCK1_DVSARM4,
+	MAX8998_REG_BUCK2_DVSINT1,
+	MAX8998_REG_BUCK2_DVSINT2,
+	MAX8998_REG_BUCK3,
+	MAX8998_REG_BUCK4,
+	MAX8998_REG_LDO2_LDO3,
+	MAX8998_REG_LDO4,
+	MAX8998_REG_LDO5,
+	MAX8998_REG_LDO6,
+	MAX8998_REG_LDO7,
+	MAX8998_REG_LDO8_LDO9,
+	MAX8998_REG_LDO10_LDO11,
+	MAX8998_REG_LDO12,
+	MAX8998_REG_LDO13,
+	MAX8998_REG_LDO14,
+	MAX8998_REG_LDO15,
+	MAX8998_REG_LDO16,
+	MAX8998_REG_LDO17,
+	MAX8998_REG_BKCHR,
+	MAX8998_REG_LBCNFG1,
+	MAX8998_REG_LBCNFG2,
+};
+
+/**
+ * struct max8998_dev - max8998 master device for sub-drivers
+ * @dev: master device of the chip (can be used to access platform data)
+ * @i2c_client: i2c client private data
+ * @dev_read():	chip register read function
+ * @dev_write(): chip register write function
+ * @dev_update(): chip register update function
+ * @iolock: mutex for serializing io access
+ */
+
+struct max8998_dev {
+	struct device *dev;
+	struct i2c_client *i2c_client;
+	int (*dev_read)(struct max8998_dev *max8998, u8 reg, u8 *dest);
+	int (*dev_write)(struct max8998_dev *max8998, u8 reg, u8 val);
+	int (*dev_update)(struct max8998_dev *max8998, u8 reg, u8 val, u8 mask);
+	struct mutex iolock;
+};
+
+static inline int max8998_read_reg(struct max8998_dev *max8998, u8 reg,
+				   u8 *value)
+{
+	return max8998->dev_read(max8998, reg, value);
+}
+
+static inline int max8998_write_reg(struct max8998_dev *max8998, u8 reg,
+				    u8 value)
+{
+	return max8998->dev_write(max8998, reg, value);
+}
+
+static inline int max8998_update_reg(struct max8998_dev *max8998, u8 reg,
+				     u8 value, u8 mask)
+{
+	return max8998->dev_update(max8998, reg, value, mask);
+}
+
+#endif /*  __LINUX_MFD_MAX8998_PRIV_H */
diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h
new file mode 100644
index 000000000000..1d3601a2d853
--- /dev/null
+++ b/include/linux/mfd/max8998.h
@@ -0,0 +1,78 @@
+/*
+ * max8698.h - Voltage regulator driver for the Maxim 8998
+ *
+ *  Copyright (C) 2009-2010 Samsung Electrnoics
+ *  Kyungmin Park <kyungmin.park@samsung.com>
+ *  Marek Szyprowski <m.szyprowski@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __LINUX_MFD_MAX8998_H
+#define __LINUX_MFD_MAX8998_H
+
+#include <linux/regulator/machine.h>
+
+/* MAX 8998 regulator ids */
+enum {
+	MAX8998_LDO2 = 2,
+	MAX8998_LDO3,
+	MAX8998_LDO4,
+	MAX8998_LDO5,
+	MAX8998_LDO6,
+	MAX8998_LDO7,
+	MAX8998_LDO8,
+	MAX8998_LDO9,
+	MAX8998_LDO10,
+	MAX8998_LDO11,
+	MAX8998_LDO12,
+	MAX8998_LDO13,
+	MAX8998_LDO14,
+	MAX8998_LDO15,
+	MAX8998_LDO16,
+	MAX8998_LDO17,
+	MAX8998_BUCK1,
+	MAX8998_BUCK2,
+	MAX8998_BUCK3,
+	MAX8998_BUCK4,
+	MAX8998_EN32KHZ_AP,
+	MAX8998_EN32KHZ_CP,
+	MAX8998_ENVICHG,
+	MAX8998_ESAFEOUT1,
+	MAX8998_ESAFEOUT2,
+};
+
+/**
+ * max8998_regulator_data - regulator data
+ * @id: regulator id
+ * @initdata: regulator init data (contraints, supplies, ...)
+ */
+struct max8998_regulator_data {
+	int				id;
+	struct regulator_init_data	*initdata;
+};
+
+/**
+ * struct max8998_board - packages regulator init data
+ * @num_regulators: number of regultors used
+ * @regulators: array of defined regulators
+ */
+
+struct max8998_platform_data {
+	int				num_regulators;
+	struct max8998_regulator_data	*regulators;
+};
+
+#endif /*  __LINUX_MFD_MAX8998_H */
-- 
cgit v1.2.3-59-g8ed1b


From 549931f99e030d63a437c23943fd8dc9b7c0e41c Mon Sep 17 00:00:00 2001
From: Sundar R Iyer <sundar.iyer@stericsson.com>
Date: Tue, 13 Jul 2010 11:51:28 +0530
Subject: ab8500-mfd: add regulator support to ab8500 mfd device

Acked-by: Linus Walleij <linus.walleij@stericsson.com>
Acked-By: Mattias Wallin <mattias.wallin@stericsson.com>
Acked-By: Bengt JONSSON <bengt.g.jonsson@stericsson.com>
Signed-off-by: Sundar R Iyer <sundar.iyer@stericsson.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 drivers/mfd/ab8500-core.c        |  4 +++-
 include/linux/mfd/ab8500.h       |  6 ++++++
 include/linux/regulator/ab8500.h | 25 +++++++++++++++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/regulator/ab8500.h

(limited to 'include/linux')

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index f3d26fa9c34d..defa786dee34 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -16,6 +16,7 @@
 #include <linux/platform_device.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/ab8500.h>
+#include <linux/regulator/ab8500.h>
 
 /*
  * Interrupt register offsets
@@ -352,6 +353,7 @@ static struct mfd_cell ab8500_devs[] = {
 	{ .name = "ab8500-audio", },
 	{ .name = "ab8500-usb", },
 	{ .name = "ab8500-pwm", },
+	{ .name = "ab8500-regulator", },
 };
 
 int __devinit ab8500_init(struct ab8500 *ab8500)
@@ -411,7 +413,7 @@ int __devinit ab8500_init(struct ab8500 *ab8500)
 			goto out_removeirq;
 	}
 
-	ret = mfd_add_devices(ab8500->dev, -1, ab8500_devs,
+	ret = mfd_add_devices(ab8500->dev, 0, ab8500_devs,
 			      ARRAY_SIZE(ab8500_devs), NULL,
 			      ab8500->irq_base);
 	if (ret)
diff --git a/include/linux/mfd/ab8500.h b/include/linux/mfd/ab8500.h
index b63ff3ba3351..f5cec4500f38 100644
--- a/include/linux/mfd/ab8500.h
+++ b/include/linux/mfd/ab8500.h
@@ -76,6 +76,8 @@
 #define AB8500_NR_IRQS			104
 #define AB8500_NUM_IRQ_REGS		13
 
+#define AB8500_NUM_REGULATORS   15
+
 /**
  * struct ab8500 - ab8500 internal structure
  * @dev: parent device
@@ -108,14 +110,18 @@ struct ab8500 {
 	u8 oldmask[AB8500_NUM_IRQ_REGS];
 };
 
+struct regulator_init_data;
+
 /**
  * struct ab8500_platform_data - AB8500 platform data
  * @irq_base: start of AB8500 IRQs, AB8500_NR_IRQS will be used
  * @init: board-specific initialization after detection of ab8500
+ * @regulator: machine-specific constraints for regulators
  */
 struct ab8500_platform_data {
 	int irq_base;
 	void (*init) (struct ab8500 *);
+	struct regulator_init_data *regulator[AB8500_NUM_REGULATORS];
 };
 
 extern int ab8500_write(struct ab8500 *a8500, u16 addr, u8 data);
diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h
new file mode 100644
index 000000000000..f509877c2ed4
--- /dev/null
+++ b/include/linux/regulator/ab8500.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) ST-Ericsson SA 2010
+ *
+ * License Terms: GNU General Public License v2
+ *
+ * Author: Sundar Iyer <sundar.iyer@stericsson.com> for ST-Ericsson
+ *
+ */
+
+#ifndef __LINUX_MFD_AB8500_REGULATOR_H
+#define __LINUX_MFD_AB8500_REGULATOR_H
+
+/* AB8500 regulators */
+#define AB8500_LDO_AUX1         0
+#define AB8500_LDO_AUX2         1
+#define AB8500_LDO_AUX3         2
+#define AB8500_LDO_INTCORE      3
+#define AB8500_LDO_TVOUT        4
+#define AB8500_LDO_AUDIO	5
+#define AB8500_LDO_ANAMIC1      6
+#define AB8500_LDO_ANAMIC2      7
+#define AB8500_LDO_DMIC         8
+#define AB8500_LDO_ANA          9
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 9bbb9e5a33109b2832e2e63dcc7a132924ab374b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 11 Aug 2010 23:04:12 -0600
Subject: param: use ops in struct kernel_param, rather than get and set fns
 directly

This is more kernel-ish, saves some space, and also allows us to
expand the ops without breaking all the callers who are happy for the
new members to be NULL.

The few places which defined their own param types are changed to the
new scheme (more which crept in recently fixed in following patches).

Since we're touching them anyway, we change get() and set() to take a
const struct kernel_param (which they really are).  This causes some
harmless warnings until we fix them (in following patches).

To reduce churn, module_param_call creates the ops struct so the callers
don't have to change (and casts the functions to reduce warnings).
The modern version which takes an ops struct is called module_param_cb.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ville Syrjala <syrjala@sci.fi>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Alessandro Rubini <rubini@ipvvis.unipv.it>
Cc: Michal Januszewski <spock@gentoo.org>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Neil Brown <neilb@suse.de>
Cc: linux-kernel@vger.kernel.org
Cc: linux-input@vger.kernel.org
Cc: linux-fbdev-devel@lists.sourceforge.net
Cc: linux-nfs@vger.kernel.org
Cc: netdev@vger.kernel.org
---
 drivers/input/misc/ati_remote2.c   |  26 +++++---
 drivers/input/mouse/psmouse-base.c |  14 +++--
 drivers/video/uvesafb.c            |   7 ++-
 fs/nfs/callback.c                  |  11 ++--
 include/linux/moduleparam.h        | 123 ++++++++++++++++++++++---------------
 kernel/params.c                    |  90 ++++++++++++++++++---------
 net/sunrpc/xprtsock.c              |  26 ++++----
 7 files changed, 186 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/misc/ati_remote2.c b/drivers/input/misc/ati_remote2.c
index e148749b5851..23257652b8e8 100644
--- a/drivers/input/misc/ati_remote2.c
+++ b/drivers/input/misc/ati_remote2.c
@@ -38,7 +38,8 @@ enum {
 };
 
 static int ati_remote2_set_mask(const char *val,
-				struct kernel_param *kp, unsigned int max)
+				const struct kernel_param *kp,
+				unsigned int max)
 {
 	unsigned long mask;
 	int ret;
@@ -59,28 +60,31 @@ static int ati_remote2_set_mask(const char *val,
 }
 
 static int ati_remote2_set_channel_mask(const char *val,
-					struct kernel_param *kp)
+					const struct kernel_param *kp)
 {
 	pr_debug("%s()\n", __func__);
 
 	return ati_remote2_set_mask(val, kp, ATI_REMOTE2_MAX_CHANNEL_MASK);
 }
 
-static int ati_remote2_get_channel_mask(char *buffer, struct kernel_param *kp)
+static int ati_remote2_get_channel_mask(char *buffer,
+					const struct kernel_param *kp)
 {
 	pr_debug("%s()\n", __func__);
 
 	return sprintf(buffer, "0x%04x", *(unsigned int *)kp->arg);
 }
 
-static int ati_remote2_set_mode_mask(const char *val, struct kernel_param *kp)
+static int ati_remote2_set_mode_mask(const char *val,
+				     const struct kernel_param *kp)
 {
 	pr_debug("%s()\n", __func__);
 
 	return ati_remote2_set_mask(val, kp, ATI_REMOTE2_MAX_MODE_MASK);
 }
 
-static int ati_remote2_get_mode_mask(char *buffer, struct kernel_param *kp)
+static int ati_remote2_get_mode_mask(char *buffer,
+				     const struct kernel_param *kp)
 {
 	pr_debug("%s()\n", __func__);
 
@@ -89,15 +93,19 @@ static int ati_remote2_get_mode_mask(char *buffer, struct kernel_param *kp)
 
 static unsigned int channel_mask = ATI_REMOTE2_MAX_CHANNEL_MASK;
 #define param_check_channel_mask(name, p) __param_check(name, p, unsigned int)
-#define param_set_channel_mask ati_remote2_set_channel_mask
-#define param_get_channel_mask ati_remote2_get_channel_mask
+static struct kernel_param_ops param_ops_channel_mask = {
+	.set = ati_remote2_set_channel_mask,
+	.get = ati_remote2_get_channel_mask,
+};
 module_param(channel_mask, channel_mask, 0644);
 MODULE_PARM_DESC(channel_mask, "Bitmask of channels to accept <15:Channel16>...<1:Channel2><0:Channel1>");
 
 static unsigned int mode_mask = ATI_REMOTE2_MAX_MODE_MASK;
 #define param_check_mode_mask(name, p) __param_check(name, p, unsigned int)
-#define param_set_mode_mask ati_remote2_set_mode_mask
-#define param_get_mode_mask ati_remote2_get_mode_mask
+static struct kernel_param_ops param_ops_mode_mask = {
+	.set = ati_remote2_set_mode_mask,
+	.get = ati_remote2_get_mode_mask,
+};
 module_param(mode_mask, mode_mask, 0644);
 MODULE_PARM_DESC(mode_mask, "Bitmask of modes to accept <4:PC><3:AUX4><2:AUX3><1:AUX2><0:AUX1>");
 
diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c
index 979c50215282..73a7af2542a8 100644
--- a/drivers/input/mouse/psmouse-base.c
+++ b/drivers/input/mouse/psmouse-base.c
@@ -39,11 +39,13 @@ MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_LICENSE("GPL");
 
 static unsigned int psmouse_max_proto = PSMOUSE_AUTO;
-static int psmouse_set_maxproto(const char *val, struct kernel_param *kp);
-static int psmouse_get_maxproto(char *buffer, struct kernel_param *kp);
+static int psmouse_set_maxproto(const char *val, const struct kernel_param *);
+static int psmouse_get_maxproto(char *buffer, const struct kernel_param *kp);
+static struct kernel_param_ops param_ops_proto_abbrev = {
+	.set = psmouse_set_maxproto,
+	.get = psmouse_get_maxproto,
+};
 #define param_check_proto_abbrev(name, p)	__param_check(name, p, unsigned int)
-#define param_set_proto_abbrev			psmouse_set_maxproto
-#define param_get_proto_abbrev			psmouse_get_maxproto
 module_param_named(proto, psmouse_max_proto, proto_abbrev, 0644);
 MODULE_PARM_DESC(proto, "Highest protocol extension to probe (bare, imps, exps, any). Useful for KVM switches.");
 
@@ -1679,7 +1681,7 @@ static ssize_t psmouse_attr_set_resolution(struct psmouse *psmouse, void *data,
 }
 
 
-static int psmouse_set_maxproto(const char *val, struct kernel_param *kp)
+static int psmouse_set_maxproto(const char *val, const struct kernel_param *kp)
 {
 	const struct psmouse_protocol *proto;
 
@@ -1696,7 +1698,7 @@ static int psmouse_set_maxproto(const char *val, struct kernel_param *kp)
 	return 0;
 }
 
-static int psmouse_get_maxproto(char *buffer, struct kernel_param *kp)
+static int psmouse_get_maxproto(char *buffer, const struct kernel_param *kp)
 {
 	int type = *((unsigned int *)kp->arg);
 
diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c
index 7b8839ebf3c4..52ec0959d462 100644
--- a/drivers/video/uvesafb.c
+++ b/drivers/video/uvesafb.c
@@ -1977,8 +1977,7 @@ static void __devexit uvesafb_exit(void)
 
 module_exit(uvesafb_exit);
 
-#define param_get_scroll NULL
-static int param_set_scroll(const char *val, struct kernel_param *kp)
+static int param_set_scroll(const char *val, const struct kernel_param *kp)
 {
 	ypan = 0;
 
@@ -1993,7 +1992,9 @@ static int param_set_scroll(const char *val, struct kernel_param *kp)
 
 	return 0;
 }
-
+static struct kernel_param_ops param_ops_scroll = {
+	.set = param_set_scroll,
+};
 #define param_check_scroll(name, p) __param_check(name, p, void)
 
 module_param_named(scroll, ypan, scroll, 0);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 36dfdae95123..e17b49e2eabd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -45,7 +45,7 @@ unsigned short nfs_callback_tcpport;
 unsigned short nfs_callback_tcpport6;
 #define NFS_CALLBACK_MAXPORTNR (65535U)
 
-static int param_set_portnr(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
 	unsigned long num;
 	int ret;
@@ -58,11 +58,10 @@ static int param_set_portnr(const char *val, struct kernel_param *kp)
 	*((unsigned int *)kp->arg) = num;
 	return 0;
 }
-
-static int param_get_portnr(char *buffer, struct kernel_param *kp)
-{
-	return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_portnr = {
+	.set = param_set_portnr,
+	.get = param_get_uint,
+};
 #define param_check_portnr(name, p) __param_check(name, p, unsigned int);
 
 module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 82a9124f7d75..02e5090ce32f 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -31,20 +31,21 @@ static const char __module_cat(name,__LINE__)[]				  \
 
 struct kernel_param;
 
-/* Returns 0, or -errno.  arg is in kp->arg. */
-typedef int (*param_set_fn)(const char *val, struct kernel_param *kp);
-/* Returns length written or -errno.  Buffer is 4k (ie. be short!) */
-typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp);
+struct kernel_param_ops {
+	/* Returns 0, or -errno.  arg is in kp->arg. */
+	int (*set)(const char *val, const struct kernel_param *kp);
+	/* Returns length written or -errno.  Buffer is 4k (ie. be short!) */
+	int (*get)(char *buffer, const struct kernel_param *kp);
+};
 
 /* Flag bits for kernel_param.flags */
 #define KPARAM_ISBOOL		2
 
 struct kernel_param {
 	const char *name;
+	const struct kernel_param_ops *ops;
 	u16 perm;
 	u16 flags;
-	param_set_fn set;
-	param_get_fn get;
 	union {
 		void *arg;
 		const struct kparam_string *str;
@@ -63,8 +64,7 @@ struct kparam_array
 {
 	unsigned int max;
 	unsigned int *num;
-	param_set_fn set;
-	param_get_fn get;
+	const struct kernel_param_ops *ops;
 	unsigned int elemsize;
 	void *elem;
 };
@@ -83,7 +83,7 @@ struct kparam_array
    parameters.  perm sets the visibility in sysfs: 000 means it's
    not there, read bits mean it's readable, write bits mean it's
    writable. */
-#define __module_param_call(prefix, name, set, get, arg, isbool, perm)	\
+#define __module_param_call(prefix, name, ops, arg, isbool, perm)	\
 	/* Default value instead of permissions? */			\
 	static int __param_perm_check_##name __attribute__((unused)) =	\
 	BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2))	\
@@ -92,20 +92,37 @@ struct kparam_array
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used								\
     __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
-	= { __param_str_##name, perm, isbool ? KPARAM_ISBOOL : 0,	\
-	    set, get, { arg } }
+	= { __param_str_##name, ops, perm, isbool ? KPARAM_ISBOOL : 0,	\
+	    { arg } }
+
+/* Obsolete - use module_param_cb() */
+#define module_param_call(name, set, get, arg, perm)			\
+	static struct kernel_param_ops __param_ops_##name =		\
+		 { (void *)set, (void *)get };				\
+	__module_param_call(MODULE_PARAM_PREFIX,			\
+			    name, &__param_ops_##name, arg,		\
+			    __same_type(*(arg), bool),			\
+			    (perm) + sizeof(__check_old_set_param(set))*0)
+
+/* We don't get oldget: it's often a new-style param_get_uint, etc. */
+static inline int
+__check_old_set_param(int (*oldset)(const char *, struct kernel_param *))
+{
+	return 0;
+}
 
-#define module_param_call(name, set, get, arg, perm)			      \
+#define module_param_cb(name, ops, arg, perm)				      \
 	__module_param_call(MODULE_PARAM_PREFIX,			      \
-			    name, set, get, arg,			      \
-			    __same_type(*(arg), bool), perm)
+			    name, ops, arg, __same_type(*(arg), bool), perm)
 
-/* Helper functions: type is byte, short, ushort, int, uint, long,
-   ulong, charp, bool or invbool, or XXX if you define param_get_XXX,
-   param_set_XXX and param_check_XXX. */
+/*
+ * Helper functions: type is byte, short, ushort, int, uint, long,
+ * ulong, charp, bool or invbool, or XXX if you define param_ops_XXX
+ * and param_check_XXX.
+ */
 #define module_param_named(name, value, type, perm)			   \
 	param_check_##type(name, &(value));				   \
-	module_param_call(name, param_set_##type, param_get_##type, &value, perm); \
+	module_param_cb(name, &param_ops_##type, &value, perm);		   \
 	__MODULE_PARM_TYPE(name, #type)
 
 #define module_param(name, type, perm)				\
@@ -126,7 +143,7 @@ struct kparam_array
  */
 #define core_param(name, var, type, perm)				\
 	param_check_##type(name, &(var));				\
-	__module_param_call("", name, param_set_##type, param_get_##type, \
+	__module_param_call("", name, &param_ops_##type,		\
 			    &var, __same_type(var, bool), perm)
 #endif /* !MODULE */
 
@@ -135,7 +152,7 @@ struct kparam_array
 	static const struct kparam_string __param_string_##name		\
 		= { len, string };					\
 	__module_param_call(MODULE_PARAM_PREFIX, name,			\
-			    param_set_copystring, param_get_string,	\
+			    &param_ops_string,				\
 			    .str = &__param_string_##name, 0, perm);	\
 	__MODULE_PARM_TYPE(name, "string")
 
@@ -162,41 +179,50 @@ static inline void destroy_params(const struct kernel_param *params,
 #define __param_check(name, p, type) \
 	static inline type *__check_##name(void) { return(p); }
 
-extern int param_set_byte(const char *val, struct kernel_param *kp);
-extern int param_get_byte(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_byte;
+extern int param_set_byte(const char *val, const struct kernel_param *kp);
+extern int param_get_byte(char *buffer, const struct kernel_param *kp);
 #define param_check_byte(name, p) __param_check(name, p, unsigned char)
 
-extern int param_set_short(const char *val, struct kernel_param *kp);
-extern int param_get_short(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_short;
+extern int param_set_short(const char *val, const struct kernel_param *kp);
+extern int param_get_short(char *buffer, const struct kernel_param *kp);
 #define param_check_short(name, p) __param_check(name, p, short)
 
-extern int param_set_ushort(const char *val, struct kernel_param *kp);
-extern int param_get_ushort(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_ushort;
+extern int param_set_ushort(const char *val, const struct kernel_param *kp);
+extern int param_get_ushort(char *buffer, const struct kernel_param *kp);
 #define param_check_ushort(name, p) __param_check(name, p, unsigned short)
 
-extern int param_set_int(const char *val, struct kernel_param *kp);
-extern int param_get_int(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_int;
+extern int param_set_int(const char *val, const struct kernel_param *kp);
+extern int param_get_int(char *buffer, const struct kernel_param *kp);
 #define param_check_int(name, p) __param_check(name, p, int)
 
-extern int param_set_uint(const char *val, struct kernel_param *kp);
-extern int param_get_uint(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_uint;
+extern int param_set_uint(const char *val, const struct kernel_param *kp);
+extern int param_get_uint(char *buffer, const struct kernel_param *kp);
 #define param_check_uint(name, p) __param_check(name, p, unsigned int)
 
-extern int param_set_long(const char *val, struct kernel_param *kp);
-extern int param_get_long(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_long;
+extern int param_set_long(const char *val, const struct kernel_param *kp);
+extern int param_get_long(char *buffer, const struct kernel_param *kp);
 #define param_check_long(name, p) __param_check(name, p, long)
 
-extern int param_set_ulong(const char *val, struct kernel_param *kp);
-extern int param_get_ulong(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_ulong;
+extern int param_set_ulong(const char *val, const struct kernel_param *kp);
+extern int param_get_ulong(char *buffer, const struct kernel_param *kp);
 #define param_check_ulong(name, p) __param_check(name, p, unsigned long)
 
-extern int param_set_charp(const char *val, struct kernel_param *kp);
-extern int param_get_charp(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_charp;
+extern int param_set_charp(const char *val, const struct kernel_param *kp);
+extern int param_get_charp(char *buffer, const struct kernel_param *kp);
 #define param_check_charp(name, p) __param_check(name, p, char *)
 
 /* For historical reasons "bool" parameters can be (unsigned) "int". */
-extern int param_set_bool(const char *val, struct kernel_param *kp);
-extern int param_get_bool(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_bool;
+extern int param_set_bool(const char *val, const struct kernel_param *kp);
+extern int param_get_bool(char *buffer, const struct kernel_param *kp);
 #define param_check_bool(name, p)					\
 	static inline void __check_##name(void)				\
 	{								\
@@ -205,17 +231,18 @@ extern int param_get_bool(char *buffer, struct kernel_param *kp);
 			     !__same_type(*(p), int));			\
 	}
 
-extern int param_set_invbool(const char *val, struct kernel_param *kp);
-extern int param_get_invbool(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_invbool;
+extern int param_set_invbool(const char *val, const struct kernel_param *kp);
+extern int param_get_invbool(char *buffer, const struct kernel_param *kp);
 #define param_check_invbool(name, p) __param_check(name, p, bool)
 
 /* Comma-separated array: *nump is set to number they actually specified. */
 #define module_param_array_named(name, array, type, nump, perm)		\
 	static const struct kparam_array __param_arr_##name		\
-	= { ARRAY_SIZE(array), nump, param_set_##type, param_get_##type,\
+	= { ARRAY_SIZE(array), nump, &param_ops_##type,			\
 	    sizeof(array[0]), array };					\
 	__module_param_call(MODULE_PARAM_PREFIX, name,			\
-			    param_array_set, param_array_get,		\
+			    &param_array_ops,				\
 			    .arr = &__param_arr_##name,			\
 			    __same_type(array[0], bool), perm);		\
 	__MODULE_PARM_TYPE(name, "array of " #type)
@@ -223,11 +250,11 @@ extern int param_get_invbool(char *buffer, struct kernel_param *kp);
 #define module_param_array(name, type, nump, perm)		\
 	module_param_array_named(name, name, type, nump, perm)
 
-extern int param_array_set(const char *val, struct kernel_param *kp);
-extern int param_array_get(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_array_ops;
 
-extern int param_set_copystring(const char *val, struct kernel_param *kp);
-extern int param_get_string(char *buffer, struct kernel_param *kp);
+extern struct kernel_param_ops param_ops_string;
+extern int param_set_copystring(const char *val, const struct kernel_param *);
+extern int param_get_string(char *buffer, const struct kernel_param *kp);
 
 /* for exporting parameters in /sys/parameters */
 
@@ -235,13 +262,13 @@ struct module;
 
 #if defined(CONFIG_SYSFS) && defined(CONFIG_MODULES)
 extern int module_param_sysfs_setup(struct module *mod,
-				    struct kernel_param *kparam,
+				    const struct kernel_param *kparam,
 				    unsigned int num_params);
 
 extern void module_param_sysfs_remove(struct module *mod);
 #else
 static inline int module_param_sysfs_setup(struct module *mod,
-			     struct kernel_param *kparam,
+			     const struct kernel_param *kparam,
 			     unsigned int num_params)
 {
 	return 0;
diff --git a/kernel/params.c b/kernel/params.c
index 3e78fdb445e7..a550698ae02d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -59,11 +59,11 @@ static int parse_one(char *param,
 	for (i = 0; i < num_params; i++) {
 		if (parameq(param, params[i].name)) {
 			/* Noone handled NULL, so do it here. */
-			if (!val && params[i].set != param_set_bool)
+			if (!val && params[i].ops->set != param_set_bool)
 				return -EINVAL;
 			DEBUGP("They are equal!  Calling %p\n",
-			       params[i].set);
-			return params[i].set(val, &params[i]);
+			       params[i].ops->set);
+			return params[i].ops->set(val, &params[i]);
 		}
 	}
 
@@ -179,7 +179,7 @@ int parse_args(const char *name,
 
 /* Lazy bastard, eh? */
 #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn)      	\
-	int param_set_##name(const char *val, struct kernel_param *kp)	\
+	int param_set_##name(const char *val, const struct kernel_param *kp) \
 	{								\
 		tmptype l;						\
 		int ret;						\
@@ -190,12 +190,18 @@ int parse_args(const char *name,
 		*((type *)kp->arg) = l;					\
 		return 0;						\
 	}								\
-	int param_get_##name(char *buffer, struct kernel_param *kp)	\
+	int param_get_##name(char *buffer, const struct kernel_param *kp) \
 	{								\
 		return sprintf(buffer, format, *((type *)kp->arg));	\
 	}								\
+	struct kernel_param_ops param_ops_##name = {			\
+		.set = param_set_##name,				\
+		.get = param_get_##name,				\
+	};								\
 	EXPORT_SYMBOL(param_set_##name);				\
-	EXPORT_SYMBOL(param_get_##name)
+	EXPORT_SYMBOL(param_get_##name);				\
+	EXPORT_SYMBOL(param_ops_##name)
+
 
 STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
 STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
@@ -205,7 +211,7 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
 STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
 STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
 
-int param_set_charp(const char *val, struct kernel_param *kp)
+int param_set_charp(const char *val, const struct kernel_param *kp)
 {
 	if (strlen(val) > 1024) {
 		printk(KERN_ERR "%s: string parameter too long\n",
@@ -226,14 +232,20 @@ int param_set_charp(const char *val, struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_set_charp);
 
-int param_get_charp(char *buffer, struct kernel_param *kp)
+int param_get_charp(char *buffer, const struct kernel_param *kp)
 {
 	return sprintf(buffer, "%s", *((char **)kp->arg));
 }
 EXPORT_SYMBOL(param_get_charp);
 
+struct kernel_param_ops param_ops_charp = {
+	.set = param_set_charp,
+	.get = param_get_charp,
+};
+EXPORT_SYMBOL(param_ops_charp);
+
 /* Actually could be a bool or an int, for historical reasons. */
-int param_set_bool(const char *val, struct kernel_param *kp)
+int param_set_bool(const char *val, const struct kernel_param *kp)
 {
 	bool v;
 
@@ -260,7 +272,7 @@ int param_set_bool(const char *val, struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_set_bool);
 
-int param_get_bool(char *buffer, struct kernel_param *kp)
+int param_get_bool(char *buffer, const struct kernel_param *kp)
 {
 	bool val;
 	if (kp->flags & KPARAM_ISBOOL)
@@ -273,8 +285,14 @@ int param_get_bool(char *buffer, struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_get_bool);
 
+struct kernel_param_ops param_ops_bool = {
+	.set = param_set_bool,
+	.get = param_get_bool,
+};
+EXPORT_SYMBOL(param_ops_bool);
+
 /* This one must be bool. */
-int param_set_invbool(const char *val, struct kernel_param *kp)
+int param_set_invbool(const char *val, const struct kernel_param *kp)
 {
 	int ret;
 	bool boolval;
@@ -289,18 +307,24 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_set_invbool);
 
-int param_get_invbool(char *buffer, struct kernel_param *kp)
+int param_get_invbool(char *buffer, const struct kernel_param *kp)
 {
 	return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
 }
 EXPORT_SYMBOL(param_get_invbool);
 
+struct kernel_param_ops param_ops_invbool = {
+	.set = param_set_invbool,
+	.get = param_get_invbool,
+};
+EXPORT_SYMBOL(param_ops_invbool);
+
 /* We break the rule and mangle the string. */
 static int param_array(const char *name,
 		       const char *val,
 		       unsigned int min, unsigned int max,
 		       void *elem, int elemsize,
-		       int (*set)(const char *, struct kernel_param *kp),
+		       int (*set)(const char *, const struct kernel_param *kp),
 		       u16 flags,
 		       unsigned int *num)
 {
@@ -345,18 +369,17 @@ static int param_array(const char *name,
 	return 0;
 }
 
-int param_array_set(const char *val, struct kernel_param *kp)
+static int param_array_set(const char *val, const struct kernel_param *kp)
 {
 	const struct kparam_array *arr = kp->arr;
 	unsigned int temp_num;
 
 	return param_array(kp->name, val, 1, arr->max, arr->elem,
-			   arr->elemsize, arr->set, kp->flags,
+			   arr->elemsize, arr->ops->set, kp->flags,
 			   arr->num ?: &temp_num);
 }
-EXPORT_SYMBOL(param_array_set);
 
-int param_array_get(char *buffer, struct kernel_param *kp)
+static int param_array_get(char *buffer, const struct kernel_param *kp)
 {
 	int i, off, ret;
 	const struct kparam_array *arr = kp->arr;
@@ -367,7 +390,7 @@ int param_array_get(char *buffer, struct kernel_param *kp)
 		if (i)
 			buffer[off++] = ',';
 		p.arg = arr->elem + arr->elemsize * i;
-		ret = arr->get(buffer + off, &p);
+		ret = arr->ops->get(buffer + off, &p);
 		if (ret < 0)
 			return ret;
 		off += ret;
@@ -375,9 +398,14 @@ int param_array_get(char *buffer, struct kernel_param *kp)
 	buffer[off] = '\0';
 	return off;
 }
-EXPORT_SYMBOL(param_array_get);
 
-int param_set_copystring(const char *val, struct kernel_param *kp)
+struct kernel_param_ops param_array_ops = {
+	.set = param_array_set,
+	.get = param_array_get,
+};
+EXPORT_SYMBOL(param_array_ops);
+
+int param_set_copystring(const char *val, const struct kernel_param *kp)
 {
 	const struct kparam_string *kps = kp->str;
 
@@ -391,13 +419,19 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_set_copystring);
 
-int param_get_string(char *buffer, struct kernel_param *kp)
+int param_get_string(char *buffer, const struct kernel_param *kp)
 {
 	const struct kparam_string *kps = kp->str;
 	return strlcpy(buffer, kps->string, kps->maxlen);
 }
 EXPORT_SYMBOL(param_get_string);
 
+struct kernel_param_ops param_ops_string = {
+	.set = param_set_copystring,
+	.get = param_get_string,
+};
+EXPORT_SYMBOL(param_ops_string);
+
 /* sysfs output in /sys/modules/XYZ/parameters/ */
 #define to_module_attr(n) container_of(n, struct module_attribute, attr)
 #define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
@@ -407,7 +441,7 @@ extern struct kernel_param __start___param[], __stop___param[];
 struct param_attribute
 {
 	struct module_attribute mattr;
-	struct kernel_param *param;
+	const struct kernel_param *param;
 };
 
 struct module_param_attrs
@@ -426,10 +460,10 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 	int count;
 	struct param_attribute *attribute = to_param_attr(mattr);
 
-	if (!attribute->param->get)
+	if (!attribute->param->ops->get)
 		return -EPERM;
 
-	count = attribute->param->get(buf, attribute->param);
+	count = attribute->param->ops->get(buf, attribute->param);
 	if (count > 0) {
 		strcat(buf, "\n");
 		++count;
@@ -445,10 +479,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
  	int err;
 	struct param_attribute *attribute = to_param_attr(mattr);
 
-	if (!attribute->param->set)
+	if (!attribute->param->ops->set)
 		return -EPERM;
 
-	err = attribute->param->set(buf, attribute->param);
+	err = attribute->param->ops->set(buf, attribute->param);
 	if (!err)
 		return len;
 	return err;
@@ -473,7 +507,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
  * if there's an error.
  */
 static __modinit int add_sysfs_param(struct module_kobject *mk,
-				     struct kernel_param *kp,
+				     const struct kernel_param *kp,
 				     const char *name)
 {
 	struct module_param_attrs *new;
@@ -555,7 +589,7 @@ static void free_module_param_attrs(struct module_kobject *mk)
  * /sys/module/[mod->name]/parameters/
  */
 int module_param_sysfs_setup(struct module *mod,
-			     struct kernel_param *kparam,
+			     const struct kernel_param *kparam,
 			     unsigned int num_params)
 {
 	int i, err;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7ca65c7005ea..49a62f0c4b87 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2577,7 +2577,8 @@ void cleanup_socket_xprt(void)
 	xprt_unregister_transport(&xs_bc_tcp_transport);
 }
 
-static int param_set_uint_minmax(const char *val, struct kernel_param *kp,
+static int param_set_uint_minmax(const char *val,
+		const struct kernel_param *kp,
 		unsigned int min, unsigned int max)
 {
 	unsigned long num;
@@ -2592,34 +2593,37 @@ static int param_set_uint_minmax(const char *val, struct kernel_param *kp,
 	return 0;
 }
 
-static int param_set_portnr(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
 	return param_set_uint_minmax(val, kp,
 			RPC_MIN_RESVPORT,
 			RPC_MAX_RESVPORT);
 }
 
-static int param_get_portnr(char *buffer, struct kernel_param *kp)
-{
-	return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_portnr = {
+	.set = param_set_portnr,
+	.get = param_get_uint,
+};
+
 #define param_check_portnr(name, p) \
 	__param_check(name, p, unsigned int);
 
 module_param_named(min_resvport, xprt_min_resvport, portnr, 0644);
 module_param_named(max_resvport, xprt_max_resvport, portnr, 0644);
 
-static int param_set_slot_table_size(const char *val, struct kernel_param *kp)
+static int param_set_slot_table_size(const char *val,
+				     const struct kernel_param *kp)
 {
 	return param_set_uint_minmax(val, kp,
 			RPC_MIN_SLOT_TABLE,
 			RPC_MAX_SLOT_TABLE);
 }
 
-static int param_get_slot_table_size(char *buffer, struct kernel_param *kp)
-{
-	return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_slot_table_size = {
+	.set = param_set_slot_table_size,
+	.get = param_get_uint,
+};
+
 #define param_check_slot_table_size(name, p) \
 	__param_check(name, p, unsigned int);
 
-- 
cgit v1.2.3-59-g8ed1b


From e6df34a4429b77fdffb6e05adf263468a3dcda33 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 11 Aug 2010 23:04:17 -0600
Subject: param: add a free hook to kernel_param_ops.

This allows us to generalize the KPARAM_KMALLOCED flag, by calling a function
on every parameter when a module is unloaded.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
---
 include/linux/moduleparam.h |  2 ++
 kernel/params.c             | 17 ++++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 02e5090ce32f..9f51568f51c8 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -36,6 +36,8 @@ struct kernel_param_ops {
 	int (*set)(const char *val, const struct kernel_param *kp);
 	/* Returns length written or -errno.  Buffer is 4k (ie. be short!) */
 	int (*get)(char *buffer, const struct kernel_param *kp);
+	/* Optional function to free kp->arg when module unloaded. */
+	void (*free)(void *arg);
 };
 
 /* Flag bits for kernel_param.flags */
diff --git a/kernel/params.c b/kernel/params.c
index a550698ae02d..458a09b886c4 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -399,9 +399,20 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
 	return off;
 }
 
+static void param_array_free(void *arg)
+{
+	unsigned int i;
+	const struct kparam_array *arr = arg;
+
+	if (arr->ops->free)
+		for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
+			arr->ops->free(arr->elem + arr->elemsize * i);
+}
+
 struct kernel_param_ops param_array_ops = {
 	.set = param_array_set,
 	.get = param_array_get,
+	.free = param_array_free,
 };
 EXPORT_SYMBOL(param_array_ops);
 
@@ -634,7 +645,11 @@ void module_param_sysfs_remove(struct module *mod)
 
 void destroy_params(const struct kernel_param *params, unsigned num)
 {
-	/* FIXME: This should free kmalloced charp parameters.  It doesn't. */
+	unsigned int i;
+
+	for (i = 0; i < num; i++)
+		if (params[i].ops->free)
+			params[i].ops->free(params[i].arg);
 }
 
 static void __init kernel_add_sysfs_param(const char *name,
-- 
cgit v1.2.3-59-g8ed1b


From 914dcaa84c53f2c3efa6016efcae13fd92a8a17c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 11 Aug 2010 23:04:18 -0600
Subject: param: make param sections const.

Since this section can be read-only (they're in .rodata), they should
always have been const.  Minor flow-through various functions.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Tested-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
---
 include/linux/moduleparam.h | 2 +-
 init/main.c                 | 8 ++++----
 kernel/params.c             | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 9f51568f51c8..6d48831fe7d2 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -161,7 +161,7 @@ __check_old_set_param(int (*oldset)(const char *, struct kernel_param *))
 /* Called on module insert or kernel boot */
 extern int parse_args(const char *name,
 		      char *args,
-		      struct kernel_param *params,
+		      const struct kernel_param *params,
 		      unsigned num,
 		      int (*unknown)(char *param, char *val));
 
diff --git a/init/main.c b/init/main.c
index 86cbfd085b01..22d61cb06f98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -201,11 +201,11 @@ static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
 char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
 static const char *panic_later, *panic_param;
 
-extern struct obs_kernel_param __setup_start[], __setup_end[];
+extern const struct obs_kernel_param __setup_start[], __setup_end[];
 
 static int __init obsolete_checksetup(char *line)
 {
-	struct obs_kernel_param *p;
+	const struct obs_kernel_param *p;
 	int had_early_param = 0;
 
 	p = __setup_start;
@@ -458,7 +458,7 @@ static noinline void __init_refok rest_init(void)
 /* Check for early params. */
 static int __init do_early_param(char *param, char *val)
 {
-	struct obs_kernel_param *p;
+	const struct obs_kernel_param *p;
 
 	for (p = __setup_start; p < __setup_end; p++) {
 		if ((p->early && strcmp(param, p->str) == 0) ||
@@ -536,7 +536,7 @@ static void __init mm_init(void)
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
-	extern struct kernel_param __start___param[], __stop___param[];
+	extern const struct kernel_param __start___param[], __stop___param[];
 
 	smp_setup_processor_id();
 
diff --git a/kernel/params.c b/kernel/params.c
index ef60db14fae0..a3eeeefc9472 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -88,7 +88,7 @@ static inline int parameq(const char *input, const char *paramname)
 
 static int parse_one(char *param,
 		     char *val,
-		     struct kernel_param *params, 
+		     const struct kernel_param *params,
 		     unsigned num_params,
 		     int (*handle_unknown)(char *param, char *val))
 {
@@ -170,7 +170,7 @@ static char *next_arg(char *args, char **param, char **val)
 /* Args looks like "foo=bar,bar2 baz=fuz wiz". */
 int parse_args(const char *name,
 	       char *args,
-	       struct kernel_param *params,
+	       const struct kernel_param *params,
 	       unsigned num,
 	       int (*unknown)(char *param, char *val))
 {
-- 
cgit v1.2.3-59-g8ed1b


From 907b29eb41aa604477a655bff7345731da94514d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 11 Aug 2010 23:04:19 -0600
Subject: param: locking for kernel parameters

There may be cases (most obviously, sysfs-writable charp parameters) where
a module needs to prevent sysfs access to parameters.

Rather than express this in terms of a big lock, the functions are
expressed in terms of what they protect against.  This is clearer, esp.
if the implementation changes to a module-level or even param-level lock.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
---
 include/linux/moduleparam.h | 56 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/params.c             | 33 ++++++++++++++++++++------
 2 files changed, 82 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 6d48831fe7d2..ca74a3402d63 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -130,6 +130,62 @@ __check_old_set_param(int (*oldset)(const char *, struct kernel_param *))
 #define module_param(name, type, perm)				\
 	module_param_named(name, name, type, perm)
 
+/**
+ * kparam_block_sysfs_write - make sure a parameter isn't written via sysfs.
+ * @name: the name of the parameter
+ *
+ * There's no point blocking write on a paramter that isn't writable via sysfs!
+ */
+#define kparam_block_sysfs_write(name)			\
+	do {						\
+		BUG_ON(!(__param_##name.perm & 0222));	\
+		__kernel_param_lock();			\
+	} while (0)
+
+/**
+ * kparam_unblock_sysfs_write - allows sysfs to write to a parameter again.
+ * @name: the name of the parameter
+ */
+#define kparam_unblock_sysfs_write(name)		\
+	do {						\
+		BUG_ON(!(__param_##name.perm & 0222));	\
+		__kernel_param_unlock();		\
+	} while (0)
+
+/**
+ * kparam_block_sysfs_read - make sure a parameter isn't read via sysfs.
+ * @name: the name of the parameter
+ *
+ * This also blocks sysfs writes.
+ */
+#define kparam_block_sysfs_read(name)			\
+	do {						\
+		BUG_ON(!(__param_##name.perm & 0444));	\
+		__kernel_param_lock();			\
+	} while (0)
+
+/**
+ * kparam_unblock_sysfs_read - allows sysfs to read a parameter again.
+ * @name: the name of the parameter
+ */
+#define kparam_unblock_sysfs_read(name)			\
+	do {						\
+		BUG_ON(!(__param_##name.perm & 0444));	\
+		__kernel_param_unlock();		\
+	} while (0)
+
+#ifdef CONFIG_SYSFS
+extern void __kernel_param_lock(void);
+extern void __kernel_param_unlock(void);
+#else
+static inline void __kernel_param_lock(void)
+{
+}
+static inline void __kernel_param_unlock(void)
+{
+}
+#endif
+
 #ifndef MODULE
 /**
  * core_param - define a historical core kernel parameter.
diff --git a/kernel/params.c b/kernel/params.c
index a3eeeefc9472..08107d181758 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,12 +31,14 @@
 #define DEBUGP(fmt, a...)
 #endif
 
+/* Protects all parameters, and incidentally kmalloced_param list. */
+static DEFINE_MUTEX(param_lock);
+
 /* This just allows us to keep track of which parameters are kmalloced. */
 struct kmalloced_param {
 	struct list_head list;
 	char val[];
 };
-static DEFINE_MUTEX(param_lock);
 static LIST_HEAD(kmalloced_params);
 
 static void *kmalloc_parameter(unsigned int size)
@@ -47,10 +49,7 @@ static void *kmalloc_parameter(unsigned int size)
 	if (!p)
 		return NULL;
 
-	mutex_lock(&param_lock);
 	list_add(&p->list, &kmalloced_params);
-	mutex_unlock(&param_lock);
-
 	return p->val;
 }
 
@@ -59,7 +58,6 @@ static void maybe_kfree_parameter(void *param)
 {
 	struct kmalloced_param *p;
 
-	mutex_lock(&param_lock);
 	list_for_each_entry(p, &kmalloced_params, list) {
 		if (p->val == param) {
 			list_del(&p->list);
@@ -67,7 +65,6 @@ static void maybe_kfree_parameter(void *param)
 			break;
 		}
 	}
-	mutex_unlock(&param_lock);
 }
 
 static inline char dash2underscore(char c)
@@ -93,6 +90,7 @@ static int parse_one(char *param,
 		     int (*handle_unknown)(char *param, char *val))
 {
 	unsigned int i;
+	int err;
 
 	/* Find parameter */
 	for (i = 0; i < num_params; i++) {
@@ -102,7 +100,10 @@ static int parse_one(char *param,
 				return -EINVAL;
 			DEBUGP("They are equal!  Calling %p\n",
 			       params[i].ops->set);
-			return params[i].ops->set(val, &params[i]);
+			mutex_lock(&param_lock);
+			err = params[i].ops->set(val, &params[i]);
+			mutex_unlock(&param_lock);
+			return err;
 		}
 	}
 
@@ -400,6 +401,7 @@ static int param_array(const char *name,
 		/* nul-terminate and parse */
 		save = val[len];
 		((char *)val)[len] = '\0';
+		BUG_ON(!mutex_is_locked(&param_lock));
 		ret = set(val, &kp);
 
 		if (ret != 0)
@@ -438,6 +440,7 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
 		if (i)
 			buffer[off++] = ',';
 		p.arg = arr->elem + arr->elemsize * i;
+		BUG_ON(!mutex_is_locked(&param_lock));
 		ret = arr->ops->get(buffer + off, &p);
 		if (ret < 0)
 			return ret;
@@ -522,7 +525,9 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 	if (!attribute->param->ops->get)
 		return -EPERM;
 
+	mutex_lock(&param_lock);
 	count = attribute->param->ops->get(buf, attribute->param);
+	mutex_unlock(&param_lock);
 	if (count > 0) {
 		strcat(buf, "\n");
 		++count;
@@ -541,7 +546,9 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
 	if (!attribute->param->ops->set)
 		return -EPERM;
 
+	mutex_lock(&param_lock);
 	err = attribute->param->ops->set(buf, attribute->param);
+	mutex_unlock(&param_lock);
 	if (!err)
 		return len;
 	return err;
@@ -555,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
 #endif
 
 #ifdef CONFIG_SYSFS
+void __kernel_param_lock(void)
+{
+	mutex_lock(&param_lock);
+}
+EXPORT_SYMBOL(__kernel_param_lock);
+
+void __kernel_param_unlock(void)
+{
+	mutex_unlock(&param_lock);
+}
+EXPORT_SYMBOL(__kernel_param_unlock);
+
 /*
  * add_sysfs_param - add a parameter to sysfs
  * @mk: struct module_kobject
-- 
cgit v1.2.3-59-g8ed1b


From 546970bc6afc7fb37447fbac09b82c7884662c21 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 11 Aug 2010 23:04:20 -0600
Subject: param: add kerneldoc to moduleparam.h

Also reorders the macros with the most common ones at the top.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
---
 include/linux/moduleparam.h | 121 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 95 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index ca74a3402d63..893549c04265 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -71,6 +71,62 @@ struct kparam_array
 	void *elem;
 };
 
+/**
+ * module_param - typesafe helper for a module/cmdline parameter
+ * @value: the variable to alter, and exposed parameter name.
+ * @type: the type of the parameter
+ * @perm: visibility in sysfs.
+ *
+ * @value becomes the module parameter, or (prefixed by KBUILD_MODNAME and a
+ * ".") the kernel commandline parameter.  Note that - is changed to _, so
+ * the user can use "foo-bar=1" even for variable "foo_bar".
+ *
+ * @perm is 0 if the the variable is not to appear in sysfs, or 0444
+ * for world-readable, 0644 for root-writable, etc.  Note that if it
+ * is writable, you may need to use kparam_block_sysfs_write() around
+ * accesses (esp. charp, which can be kfreed when it changes).
+ *
+ * The @type is simply pasted to refer to a param_ops_##type and a
+ * param_check_##type: for convenience many standard types are provided but
+ * you can create your own by defining those variables.
+ *
+ * Standard types are:
+ *	byte, short, ushort, int, uint, long, ulong
+ *	charp: a character pointer
+ *	bool: a bool, values 0/1, y/n, Y/N.
+ *	invbool: the above, only sense-reversed (N = true).
+ */
+#define module_param(name, type, perm)				\
+	module_param_named(name, name, type, perm)
+
+/**
+ * module_param_named - typesafe helper for a renamed module/cmdline parameter
+ * @name: a valid C identifier which is the parameter name.
+ * @value: the actual lvalue to alter.
+ * @type: the type of the parameter
+ * @perm: visibility in sysfs.
+ *
+ * Usually it's a good idea to have variable names and user-exposed names the
+ * same, but that's harder if the variable must be non-static or is inside a
+ * structure.  This allows exposure under a different name.
+ */
+#define module_param_named(name, value, type, perm)			   \
+	param_check_##type(name, &(value));				   \
+	module_param_cb(name, &param_ops_##type, &value, perm);		   \
+	__MODULE_PARM_TYPE(name, #type)
+
+/**
+ * module_param_cb - general callback for a module/cmdline parameter
+ * @name: a valid C identifier which is the parameter name.
+ * @ops: the set & get operations for this parameter.
+ * @perm: visibility in sysfs.
+ *
+ * The ops can have NULL set or get functions.
+ */
+#define module_param_cb(name, ops, arg, perm)				      \
+	__module_param_call(MODULE_PARAM_PREFIX,			      \
+			    name, ops, arg, __same_type(*(arg), bool), perm)
+
 /* On alpha, ia64 and ppc64 relocations to global data cannot go into
    read-only sections (which is part of respective UNIX ABI on these
    platforms). So 'const' makes no sense and even causes compile failures
@@ -82,9 +138,7 @@ struct kparam_array
 #endif
 
 /* This is the fundamental function for registering boot/module
-   parameters.  perm sets the visibility in sysfs: 000 means it's
-   not there, read bits mean it's readable, write bits mean it's
-   writable. */
+   parameters. */
 #define __module_param_call(prefix, name, ops, arg, isbool, perm)	\
 	/* Default value instead of permissions? */			\
 	static int __param_perm_check_##name __attribute__((unused)) =	\
@@ -113,23 +167,6 @@ __check_old_set_param(int (*oldset)(const char *, struct kernel_param *))
 	return 0;
 }
 
-#define module_param_cb(name, ops, arg, perm)				      \
-	__module_param_call(MODULE_PARAM_PREFIX,			      \
-			    name, ops, arg, __same_type(*(arg), bool), perm)
-
-/*
- * Helper functions: type is byte, short, ushort, int, uint, long,
- * ulong, charp, bool or invbool, or XXX if you define param_ops_XXX
- * and param_check_XXX.
- */
-#define module_param_named(name, value, type, perm)			   \
-	param_check_##type(name, &(value));				   \
-	module_param_cb(name, &param_ops_##type, &value, perm);		   \
-	__MODULE_PARM_TYPE(name, #type)
-
-#define module_param(name, type, perm)				\
-	module_param_named(name, name, type, perm)
-
 /**
  * kparam_block_sysfs_write - make sure a parameter isn't written via sysfs.
  * @name: the name of the parameter
@@ -191,7 +228,7 @@ static inline void __kernel_param_unlock(void)
  * core_param - define a historical core kernel parameter.
  * @name: the name of the cmdline and sysfs parameter (often the same as var)
  * @var: the variable
- * @type: the type (for param_set_##type and param_get_##type)
+ * @type: the type of the parameter
  * @perm: visibility in sysfs
  *
  * core_param is just like module_param(), but cannot be modular and
@@ -205,7 +242,16 @@ static inline void __kernel_param_unlock(void)
 			    &var, __same_type(var, bool), perm)
 #endif /* !MODULE */
 
-/* Actually copy string: maxlen param is usually sizeof(string). */
+/**
+ * module_param_string - a char array parameter
+ * @name: the name of the parameter
+ * @string: the string variable
+ * @len: the maximum length of the string, incl. terminator
+ * @perm: visibility in sysfs.
+ *
+ * This actually copies the string when it's set (unlike type charp).
+ * @len is usually just sizeof(string).
+ */
 #define module_param_string(name, string, len, perm)			\
 	static const struct kparam_string __param_string_##name		\
 		= { len, string };					\
@@ -294,7 +340,33 @@ extern int param_set_invbool(const char *val, const struct kernel_param *kp);
 extern int param_get_invbool(char *buffer, const struct kernel_param *kp);
 #define param_check_invbool(name, p) __param_check(name, p, bool)
 
-/* Comma-separated array: *nump is set to number they actually specified. */
+/**
+ * module_param_array - a parameter which is an array of some type
+ * @name: the name of the array variable
+ * @type: the type, as per module_param()
+ * @nump: optional pointer filled in with the number written
+ * @perm: visibility in sysfs
+ *
+ * Input and output are as comma-separated values.  Commas inside values
+ * don't work properly (eg. an array of charp).
+ *
+ * ARRAY_SIZE(@name) is used to determine the number of elements in the
+ * array, so the definition must be visible.
+ */
+#define module_param_array(name, type, nump, perm)		\
+	module_param_array_named(name, name, type, nump, perm)
+
+/**
+ * module_param_array_named - renamed parameter which is an array of some type
+ * @name: a valid C identifier which is the parameter name
+ * @array: the name of the array variable
+ * @type: the type, as per module_param()
+ * @nump: optional pointer filled in with the number written
+ * @perm: visibility in sysfs
+ *
+ * This exposes a different name than the actual variable name.  See
+ * module_param_named() for why this might be necessary.
+ */
 #define module_param_array_named(name, array, type, nump, perm)		\
 	static const struct kparam_array __param_arr_##name		\
 	= { ARRAY_SIZE(array), nump, &param_ops_##type,			\
@@ -305,9 +377,6 @@ extern int param_get_invbool(char *buffer, const struct kernel_param *kp);
 			    __same_type(array[0], bool), perm);		\
 	__MODULE_PARM_TYPE(name, "array of " #type)
 
-#define module_param_array(name, type, nump, perm)		\
-	module_param_array_named(name, name, type, nump, perm)
-
 extern struct kernel_param_ops param_array_ops;
 
 extern struct kernel_param_ops param_ops_string;
-- 
cgit v1.2.3-59-g8ed1b


From a6de51b2787012ba3ab62c7d50df1b749b83d5f0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 11 Aug 2010 23:04:40 -0600
Subject: param: don't deref arg in __same_type() checks

gcc allows this when arg is a function, but sparse complains:

drivers/char/ipmi/ipmi_watchdog.c:303:1: error: cannot dereference this type
drivers/char/ipmi/ipmi_watchdog.c:307:1: error: cannot dereference this type
drivers/char/ipmi/ipmi_watchdog.c:311:1: error: cannot dereference this type

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Tested-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/moduleparam.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 893549c04265..9d2f1837b3d8 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -125,7 +125,7 @@ struct kparam_array
  */
 #define module_param_cb(name, ops, arg, perm)				      \
 	__module_param_call(MODULE_PARAM_PREFIX,			      \
-			    name, ops, arg, __same_type(*(arg), bool), perm)
+			    name, ops, arg, __same_type((arg), bool *), perm)
 
 /* On alpha, ia64 and ppc64 relocations to global data cannot go into
    read-only sections (which is part of respective UNIX ABI on these
@@ -157,7 +157,7 @@ struct kparam_array
 		 { (void *)set, (void *)get };				\
 	__module_param_call(MODULE_PARAM_PREFIX,			\
 			    name, &__param_ops_##name, arg,		\
-			    __same_type(*(arg), bool),			\
+			    __same_type(arg, bool *),			\
 			    (perm) + sizeof(__check_old_set_param(set))*0)
 
 /* We don't get oldget: it's often a new-style param_get_uint, etc. */
@@ -330,9 +330,9 @@ extern int param_get_bool(char *buffer, const struct kernel_param *kp);
 #define param_check_bool(name, p)					\
 	static inline void __check_##name(void)				\
 	{								\
-		BUILD_BUG_ON(!__same_type(*(p), bool) &&		\
-			     !__same_type(*(p), unsigned int) &&	\
-			     !__same_type(*(p), int));			\
+		BUILD_BUG_ON(!__same_type((p), bool *) &&		\
+			     !__same_type((p), unsigned int *) &&	\
+			     !__same_type((p), int *));			\
 	}
 
 extern struct kernel_param_ops param_ops_invbool;
-- 
cgit v1.2.3-59-g8ed1b


From 13bcbc008790b05413c9a16763b423c206528c0a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 10 Aug 2010 18:01:09 -0700
Subject: include/linux/fs.h: complete hexification of FMODE_* constants

One straggler which was missed due to merge ordering issues.

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1542e0e52b2e..267d02630517 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -93,7 +93,7 @@ struct inodes_stat_t {
 #define FMODE_RANDOM		((__force fmode_t)0x1000)
 
 /* File was opened by fanotify and shouldn't generate fanotify events */
-#define FMODE_NONOTIFY		((__force fmode_t)16777216) /* 0x1000000 */
+#define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
 /*
  * The below are the various read and write types that we support. Some of
-- 
cgit v1.2.3-59-g8ed1b


From 5a19ae4bb003a428b9c8367daf05eed5029dc4cd Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 10 Aug 2010 18:01:28 -0700
Subject: virtio_9p.h: include linux/types.h

Add <linux/types.h> to <linux/virtio_9p.h> so that types are explicitly
defined:
linux/virtio_9p.h:15: found __[us]{8,16,32,64} type without #include <linux/types.h>

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/virtio_9p.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h
index 395c38a47adb..1faa80d92f05 100644
--- a/include/linux/virtio_9p.h
+++ b/include/linux/virtio_9p.h
@@ -2,6 +2,7 @@
 #define _LINUX_VIRTIO_9P_H
 /* This header is BSD licensed so anyone can use the definitions to implement
  * compatible drivers/servers. */
+#include <linux/types.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
 #include <linux/types.h>
-- 
cgit v1.2.3-59-g8ed1b


From 6da24b786ed1963a7f872c1899627968c76d17d7 Mon Sep 17 00:00:00 2001
From: Kyungmin Park <kmpark@infradead.org>
Date: Tue, 10 Aug 2010 18:01:36 -0700
Subject: mmc: recognize CSD structure

The eMMC spec 4.4 and 4.3 + additional feature chips has CSD structure
version 3 and version 3 have to check the CSD_STRUCTURE byte in the
EXT_CSD register.

Also fix EXT_CSD revision message.

[akpm@linux-foundation.org: fix comment, per Chris Ball]
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Adrian Hunter <adrian.hunter@nokia.com>
Cc: Chris Ball <cjb@laptop.org>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/mmc.c   | 26 +++++++++++++++++++-------
 include/linux/mmc/card.h |  1 +
 include/linux/mmc/mmc.h  |  1 +
 3 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 89f7a25b7ac1..cd8d3d2ea901 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -114,17 +114,18 @@ static int mmc_decode_cid(struct mmc_card *card)
 static int mmc_decode_csd(struct mmc_card *card)
 {
 	struct mmc_csd *csd = &card->csd;
-	unsigned int e, m, csd_struct;
+	unsigned int e, m;
 	u32 *resp = card->raw_csd;
 
 	/*
 	 * We only understand CSD structure v1.1 and v1.2.
 	 * v1.2 has extra information in bits 15, 11 and 10.
+	 * We also support eMMC v4.4 & v4.41.
 	 */
-	csd_struct = UNSTUFF_BITS(resp, 126, 2);
-	if (csd_struct != 1 && csd_struct != 2) {
+	csd->structure = UNSTUFF_BITS(resp, 126, 2);
+	if (csd->structure == 0) {
 		printk(KERN_ERR "%s: unrecognised CSD structure version %d\n",
-			mmc_hostname(card->host), csd_struct);
+			mmc_hostname(card->host), csd->structure);
 		return -EINVAL;
 	}
 
@@ -207,11 +208,22 @@ static int mmc_read_ext_csd(struct mmc_card *card)
 		goto out;
 	}
 
+	/* Version is coded in the CSD_STRUCTURE byte in the EXT_CSD register */
+	if (card->csd.structure == 3) {
+		int ext_csd_struct = ext_csd[EXT_CSD_STRUCTURE];
+		if (ext_csd_struct > 2) {
+			printk(KERN_ERR "%s: unrecognised EXT_CSD structure "
+				"version %d\n", mmc_hostname(card->host),
+					ext_csd_struct);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
 	card->ext_csd.rev = ext_csd[EXT_CSD_REV];
 	if (card->ext_csd.rev > 5) {
-		printk(KERN_ERR "%s: unrecognised EXT_CSD structure "
-			"version %d\n", mmc_hostname(card->host),
-			card->ext_csd.rev);
+		printk(KERN_ERR "%s: unrecognised EXT_CSD revision %d\n",
+			mmc_hostname(card->host), card->ext_csd.rev);
 		err = -EINVAL;
 		goto out;
 	}
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index d02d2c6e0cfe..c83c7a7303fd 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -24,6 +24,7 @@ struct mmc_cid {
 };
 
 struct mmc_csd {
+	unsigned char		structure;
 	unsigned char		mmca_vsn;
 	unsigned short		cmdclass;
 	unsigned short		tacc_clks;
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 8a49cbf0376d..52ce98866287 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -254,6 +254,7 @@ struct _mmc_csd {
 #define EXT_CSD_BUS_WIDTH	183	/* R/W */
 #define EXT_CSD_HS_TIMING	185	/* R/W */
 #define EXT_CSD_CARD_TYPE	196	/* RO */
+#define EXT_CSD_STRUCTURE	194	/* RO */
 #define EXT_CSD_REV		192	/* RO */
 #define EXT_CSD_SEC_CNT		212	/* RO, 4 bytes */
 #define EXT_CSD_S_A_TIMEOUT	217
-- 
cgit v1.2.3-59-g8ed1b


From 7310ece86ad7da027f85a37a0638164118a5d12f Mon Sep 17 00:00:00 2001
From: Michal Miroslaw <mirq-linux@rere.qmqm.pl>
Date: Tue, 10 Aug 2010 18:01:40 -0700
Subject: mmc: implement SD-combo (IO+mem) support

Signed-off-by: Michal Miroslaw <mirq-linux@rere.qmqm.pl>
Cc: Adrian Hunter <adrian.hunter@nokia.com>
Cc: Chris Ball <cjb@laptop.org>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/bus.c   |   9 ++++
 drivers/mmc/core/core.c  |  12 ++++-
 drivers/mmc/core/sdio.c  | 135 +++++++++++++++++++++++++++++++++++++++--------
 include/linux/mmc/card.h |   1 +
 4 files changed, 134 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 49d9dcaeca49..7cd9749dc21d 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -37,6 +37,8 @@ static ssize_t mmc_type_show(struct device *dev,
 		return sprintf(buf, "SD\n");
 	case MMC_TYPE_SDIO:
 		return sprintf(buf, "SDIO\n");
+	case MMC_TYPE_SD_COMBO:
+		return sprintf(buf, "SDcombo\n");
 	default:
 		return -EFAULT;
 	}
@@ -74,6 +76,9 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 	case MMC_TYPE_SDIO:
 		type = "SDIO";
 		break;
+	case MMC_TYPE_SD_COMBO:
+		type = "SDcombo";
+		break;
 	default:
 		type = NULL;
 	}
@@ -239,6 +244,10 @@ int mmc_add_card(struct mmc_card *card)
 	case MMC_TYPE_SDIO:
 		type = "SDIO";
 		break;
+	case MMC_TYPE_SD_COMBO:
+		type = "SD-combo";
+		if (mmc_card_blockaddr(card))
+			type = "SDHC-combo";
 	default:
 		type = "?";
 		break;
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 569e94da844c..b69ce91b11e1 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1099,8 +1099,15 @@ void mmc_rescan(struct work_struct *work)
 	 */
 	err = mmc_send_io_op_cond(host, 0, &ocr);
 	if (!err) {
-		if (mmc_attach_sdio(host, ocr))
-			mmc_power_off(host);
+		if (mmc_attach_sdio(host, ocr)) {
+			mmc_claim_host(host);
+			/* try SDMEM (but not MMC) even if SDIO is broken */
+			if (mmc_send_app_op_cond(host, 0, &ocr))
+				goto out_fail;
+
+			if (mmc_attach_sd(host, ocr))
+				mmc_power_off(host);
+		}
 		goto out;
 	}
 
@@ -1124,6 +1131,7 @@ void mmc_rescan(struct work_struct *work)
 		goto out;
 	}
 
+out_fail:
 	mmc_release_host(host);
 	mmc_power_off(host);
 
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 47d1708810bd..b0b6ce93e519 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -160,9 +160,7 @@ static int sdio_enable_wide(struct mmc_card *card)
 	if (ret)
 		return ret;
 
-	mmc_set_bus_width(card->host, MMC_BUS_WIDTH_4);
-
-	return 0;
+	return 1;
 }
 
 /*
@@ -222,10 +220,34 @@ static int sdio_disable_wide(struct mmc_card *card)
 	return 0;
 }
 
+
+static int sdio_enable_4bit_bus(struct mmc_card *card)
+{
+	int err;
+
+	if (card->type == MMC_TYPE_SDIO)
+		return sdio_enable_wide(card);
+
+	if ((card->host->caps & MMC_CAP_4_BIT_DATA) &&
+		(card->scr.bus_widths & SD_SCR_BUS_WIDTH_4)) {
+		err = mmc_app_set_bus_width(card, MMC_BUS_WIDTH_4);
+		if (err)
+			return err;
+	} else
+		return 0;
+
+	err = sdio_enable_wide(card);
+	if (err <= 0)
+		mmc_app_set_bus_width(card, MMC_BUS_WIDTH_1);
+
+	return err;
+}
+
+
 /*
  * Test if the card supports high-speed mode and, if so, switch to it.
  */
-static int sdio_enable_hs(struct mmc_card *card)
+static int mmc_sdio_switch_hs(struct mmc_card *card, int enable)
 {
 	int ret;
 	u8 speed;
@@ -240,7 +262,10 @@ static int sdio_enable_hs(struct mmc_card *card)
 	if (ret)
 		return ret;
 
-	speed |= SDIO_SPEED_EHS;
+	if (enable)
+		speed |= SDIO_SPEED_EHS;
+	else
+		speed &= ~SDIO_SPEED_EHS;
 
 	ret = mmc_io_rw_direct(card, 1, 0, SDIO_CCCR_SPEED, speed, NULL);
 	if (ret)
@@ -249,6 +274,24 @@ static int sdio_enable_hs(struct mmc_card *card)
 	return 1;
 }
 
+/*
+ * Enable SDIO/combo card's high-speed mode. Return 0/1 if [not]supported.
+ */
+static int sdio_enable_hs(struct mmc_card *card)
+{
+	int ret;
+
+	ret = mmc_sdio_switch_hs(card, true);
+	if (ret <= 0 || card->type == MMC_TYPE_SDIO)
+		return ret;
+
+	ret = mmc_sd_switch_hs(card);
+	if (ret <= 0)
+		mmc_sdio_switch_hs(card, false);
+
+	return ret;
+}
+
 static unsigned mmc_sdio_get_max_clock(struct mmc_card *card)
 {
 	unsigned max_dtr;
@@ -265,6 +308,9 @@ static unsigned mmc_sdio_get_max_clock(struct mmc_card *card)
 		max_dtr = card->cis.max_dtr;
 	}
 
+	if (card->type == MMC_TYPE_SD_COMBO)
+		max_dtr = min(max_dtr, mmc_sd_get_max_clock(card));
+
 	return max_dtr;
 }
 
@@ -310,7 +356,24 @@ static int mmc_sdio_init_card(struct mmc_host *host, u32 ocr,
 		goto err;
 	}
 
-	card->type = MMC_TYPE_SDIO;
+	err = mmc_sd_get_cid(host, host->ocr & ocr, card->raw_cid);
+
+	if (!err) {
+		card->type = MMC_TYPE_SD_COMBO;
+
+		if (oldcard && (oldcard->type != MMC_TYPE_SD_COMBO ||
+		    memcmp(card->raw_cid, oldcard->raw_cid, sizeof(card->raw_cid)) != 0)) {
+			mmc_remove_card(card);
+			return -ENOENT;
+		}
+	} else {
+		card->type = MMC_TYPE_SDIO;
+
+		if (oldcard && oldcard->type != MMC_TYPE_SDIO) {
+			mmc_remove_card(card);
+			return -ENOENT;
+		}
+	}
 
 	/*
 	 * Call the optional HC's init_card function to handle quirks.
@@ -329,6 +392,17 @@ static int mmc_sdio_init_card(struct mmc_host *host, u32 ocr,
 		mmc_set_bus_mode(host, MMC_BUSMODE_PUSHPULL);
 	}
 
+	/*
+	 * Read CSD, before selecting the card
+	 */
+	if (!oldcard && card->type == MMC_TYPE_SD_COMBO) {
+		err = mmc_sd_get_csd(host, card);
+		if (err)
+			return err;
+
+		mmc_decode_cid(card);
+	}
+
 	/*
 	 * Select card, as all following commands rely on that.
 	 */
@@ -356,14 +430,33 @@ static int mmc_sdio_init_card(struct mmc_host *host, u32 ocr,
 		int same = (card->cis.vendor == oldcard->cis.vendor &&
 			    card->cis.device == oldcard->cis.device);
 		mmc_remove_card(card);
-		if (!same) {
-			err = -ENOENT;
-			goto err;
-		}
+		if (!same)
+			return -ENOENT;
+
 		card = oldcard;
 		return 0;
 	}
 
+	if (card->type == MMC_TYPE_SD_COMBO) {
+		err = mmc_sd_setup_card(host, card, oldcard != NULL);
+		/* handle as SDIO-only card if memory init failed */
+		if (err) {
+			mmc_go_idle(host);
+			if (mmc_host_is_spi(host))
+				/* should not fail, as it worked previously */
+				mmc_spi_set_crc(host, use_spi_crc);
+			card->type = MMC_TYPE_SDIO;
+		} else
+			card->dev.type = &sd_type;
+	}
+
+	/*
+	 * If needed, disconnect card detection pull-up resistor.
+	 */
+	err = sdio_disable_cd(card);
+	if (err)
+		goto remove;
+
 	/*
 	 * Switch to high-speed (if supported).
 	 */
@@ -381,8 +474,10 @@ static int mmc_sdio_init_card(struct mmc_host *host, u32 ocr,
 	/*
 	 * Switch to wider bus (if supported).
 	 */
-	err = sdio_enable_wide(card);
-	if (err)
+	err = sdio_enable_4bit_bus(card);
+	if (err > 0)
+		mmc_set_bus_width(card->host, MMC_BUS_WIDTH_4);
+	else if (err)
 		goto remove;
 
 	if (!oldcard)
@@ -496,9 +591,14 @@ static int mmc_sdio_resume(struct mmc_host *host)
 	mmc_claim_host(host);
 	err = mmc_sdio_init_card(host, host->ocr, host->card,
 				 (host->pm_flags & MMC_PM_KEEP_POWER));
-	if (!err)
+	if (!err) {
 		/* We may have switched to 1-bit mode during suspend. */
-		err = sdio_enable_wide(host->card);
+		err = sdio_enable_4bit_bus(host->card);
+		if (err > 0) {
+			mmc_set_bus_width(host, MMC_BUS_WIDTH_4);
+			err = 0;
+		}
+	}
 	if (!err && host->sdio_irqs)
 		mmc_signal_sdio_irq(host);
 	mmc_release_host(host);
@@ -582,13 +682,6 @@ int mmc_attach_sdio(struct mmc_host *host, u32 ocr)
 	funcs = (ocr & 0x70000000) >> 28;
 	card->sdio_funcs = 0;
 
-	/*
-	 * If needed, disconnect card detection pull-up resistor.
-	 */
-	err = sdio_disable_cd(card);
-	if (err)
-		goto remove;
-
 	/*
 	 * Initialize (but don't add) all present functions.
 	 */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index c83c7a7303fd..340d391aecbb 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -93,6 +93,7 @@ struct mmc_card {
 #define MMC_TYPE_MMC		0		/* MMC card */
 #define MMC_TYPE_SD		1		/* SD card */
 #define MMC_TYPE_SDIO		2		/* SDIO card */
+#define MMC_TYPE_SD_COMBO	3		/* SD combo (IO+mem) card */
 	unsigned int		state;		/* (our) card state */
 #define MMC_STATE_PRESENT	(1<<0)		/* present in sysfs */
 #define MMC_STATE_READONLY	(1<<1)		/* card is read-only */
-- 
cgit v1.2.3-59-g8ed1b


From 4c2ef25fe0b847d2ae818f74758ddb0be1c27d8e Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <maximlevitsky@gmail.com>
Date: Tue, 10 Aug 2010 18:01:41 -0700
Subject: mmc: fix all hangs related to mmc/sd card insert/removal during
 suspend/resume

If you don't use CONFIG_MMC_UNSAFE_RESUME, as soon as you attempt to
suspend, the card will be removed, therefore this patch doesn't change the
behavior of this option.

However the removal will be done by pm notifier, which runs while
userspace is still not frozen and thus can freely use del_gendisk, without
the risk of deadlock which would happen otherwise.

Card detect workqueue is now disabled while userspace is frozen, Therefore
if you do use CONFIG_MMC_UNSAFE_RESUME, and remove the card during
suspend, the removal will be detected as soon as userspace is unfrozen,
again at the moment it is safe to call del_gendisk.

Tested with and without CONFIG_MMC_UNSAFE_RESUME with suspend and hibernate.

[akpm@linux-foundation.org: clean up function prototype]
[akpm@linux-foundation.org: fix CONFIG_PM-n linkage, small cleanups]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Maxim Levitsky <maximlevitsky@gmail.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/core.c  | 83 +++++++++++++++++++++++++++++++++---------------
 drivers/mmc/core/host.c  |  4 +++
 include/linux/mmc/host.h |  3 ++
 3 files changed, 64 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index b69ce91b11e1..83240faa1dc8 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1057,6 +1057,17 @@ void mmc_rescan(struct work_struct *work)
 		container_of(work, struct mmc_host, detect.work);
 	u32 ocr;
 	int err;
+	unsigned long flags;
+
+	spin_lock_irqsave(&host->lock, flags);
+
+	if (host->rescan_disable) {
+		spin_unlock_irqrestore(&host->lock, flags);
+		return;
+	}
+
+	spin_unlock_irqrestore(&host->lock, flags);
+
 
 	mmc_bus_get(host);
 
@@ -1274,19 +1285,6 @@ int mmc_suspend_host(struct mmc_host *host)
 	if (host->bus_ops && !host->bus_dead) {
 		if (host->bus_ops->suspend)
 			err = host->bus_ops->suspend(host);
-		if (err == -ENOSYS || !host->bus_ops->resume) {
-			/*
-			 * We simply "remove" the card in this case.
-			 * It will be redetected on resume.
-			 */
-			if (host->bus_ops->remove)
-				host->bus_ops->remove(host);
-			mmc_claim_host(host);
-			mmc_detach_bus(host);
-			mmc_release_host(host);
-			host->pm_flags = 0;
-			err = 0;
-		}
 	}
 	mmc_bus_put(host);
 
@@ -1318,28 +1316,61 @@ int mmc_resume_host(struct mmc_host *host)
 			printk(KERN_WARNING "%s: error %d during resume "
 					    "(card was removed?)\n",
 					    mmc_hostname(host), err);
-			if (host->bus_ops->remove)
-				host->bus_ops->remove(host);
-			mmc_claim_host(host);
-			mmc_detach_bus(host);
-			mmc_release_host(host);
-			/* no need to bother upper layers */
 			err = 0;
 		}
 	}
 	mmc_bus_put(host);
 
-	/*
-	 * We add a slight delay here so that resume can progress
-	 * in parallel.
-	 */
-	mmc_detect_change(host, 1);
-
 	return err;
 }
-
 EXPORT_SYMBOL(mmc_resume_host);
 
+/* Do the card removal on suspend if card is assumed removeable
+ * Do that in pm notifier while userspace isn't yet frozen, so we will be able
+   to sync the card.
+*/
+int mmc_pm_notify(struct notifier_block *notify_block,
+					unsigned long mode, void *unused)
+{
+	struct mmc_host *host = container_of(
+		notify_block, struct mmc_host, pm_notify);
+	unsigned long flags;
+
+
+	switch (mode) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+
+		spin_lock_irqsave(&host->lock, flags);
+		host->rescan_disable = 1;
+		spin_unlock_irqrestore(&host->lock, flags);
+		cancel_delayed_work_sync(&host->detect);
+
+		if (!host->bus_ops || host->bus_ops->suspend)
+			break;
+
+		mmc_claim_host(host);
+
+		if (host->bus_ops->remove)
+			host->bus_ops->remove(host);
+
+		mmc_detach_bus(host);
+		mmc_release_host(host);
+		host->pm_flags = 0;
+		break;
+
+	case PM_POST_SUSPEND:
+	case PM_POST_HIBERNATION:
+
+		spin_lock_irqsave(&host->lock, flags);
+		host->rescan_disable = 0;
+		spin_unlock_irqrestore(&host->lock, flags);
+		mmc_detect_change(host, 0);
+
+	}
+
+	return 0;
+}
 #endif
 
 static int __init mmc_init(void)
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 47353909e345..0efe631e50ca 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -17,6 +17,7 @@
 #include <linux/pagemap.h>
 #include <linux/leds.h>
 #include <linux/slab.h>
+#include <linux/suspend.h>
 
 #include <linux/mmc/host.h>
 
@@ -85,6 +86,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
 	init_waitqueue_head(&host->wq);
 	INIT_DELAYED_WORK(&host->detect, mmc_rescan);
 	INIT_DELAYED_WORK_DEFERRABLE(&host->disable, mmc_host_deeper_disable);
+	host->pm_notify.notifier_call = mmc_pm_notify;
 
 	/*
 	 * By default, hosts do not support SGIO or large requests.
@@ -133,6 +135,7 @@ int mmc_add_host(struct mmc_host *host)
 #endif
 
 	mmc_start_host(host);
+	register_pm_notifier(&host->pm_notify);
 
 	return 0;
 }
@@ -149,6 +152,7 @@ EXPORT_SYMBOL(mmc_add_host);
  */
 void mmc_remove_host(struct mmc_host *host)
 {
+	unregister_pm_notifier(&host->pm_notify);
 	mmc_stop_host(host);
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index f65913c9f5a4..513ff0376b09 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -124,6 +124,7 @@ struct mmc_host {
 	unsigned int		f_min;
 	unsigned int		f_max;
 	u32			ocr_avail;
+	struct notifier_block	pm_notify;
 
 #define MMC_VDD_165_195		0x00000080	/* VDD voltage 1.65 - 1.95 */
 #define MMC_VDD_20_21		0x00000100	/* VDD voltage 2.0 ~ 2.1 */
@@ -183,6 +184,7 @@ struct mmc_host {
 
 	/* Only used with MMC_CAP_DISABLE */
 	int			enabled;	/* host is enabled */
+	int			rescan_disable;	/* disable card detection */
 	int			nesting_cnt;	/* "enable" nesting count */
 	int			en_dis_recurs;	/* detect recursion */
 	unsigned int		disable_delay;	/* disable delay in msecs */
@@ -257,6 +259,7 @@ int mmc_card_can_sleep(struct mmc_host *host);
 int mmc_host_enable(struct mmc_host *host);
 int mmc_host_disable(struct mmc_host *host);
 int mmc_host_lazy_disable(struct mmc_host *host);
+int mmc_pm_notify(struct notifier_block *notify_block, unsigned long, void *);
 
 static inline void mmc_set_disable_delay(struct mmc_host *host,
 					 unsigned int disable_delay)
-- 
cgit v1.2.3-59-g8ed1b


From 6f51be3d37dff73cf8db771df4169f4c2f1cbf66 Mon Sep 17 00:00:00 2001
From: Grazvydas Ignotas <notasas@gmail.com>
Date: Tue, 10 Aug 2010 18:01:50 -0700
Subject: sdio: allow non-standard SDIO cards

There are some chips (like TI WL12xx series) that can be interfaced over
SDIO but don't support the SDIO specification, meaning that they are
missing CIA (Common I/O Area) with all it's registers.  Current Linux SDIO
implementation relies on those registers to identify and configure the
card, so non-standard cards can not function and cause lots of warnings
from the core when it reads invalid data from non-existent registers.

After this patch, init_card() host callback can now set new quirk
MMC_QUIRK_NONSTD_SDIO, which means that SDIO core should not try to access
any standard SDIO registers and rely on init_card() to fill all SDIO
structures instead.  As those cards are usually embedded chips, all the
required information can be obtained from machine board files by the host
driver when it's called through init_card() callback.

Signed-off-by: Grazvydas Ignotas <notasas@gmail.com>
Cc: Adrian Hunter <adrian.hunter@nokia.com>
Cc: Tony Lindgren <tony@atomide.com>
Cc: Bob Copeland <me@bobcopeland.com>
Cc: Kalle Valo <kvalo@adurom.com>
Cc: Madhusudhan Chikkature <madhu.cr@ti.com>
Cc: Kishore Kadiyala <kishore.kadiyala@ti.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/core/sdio.c  | 36 ++++++++++++++++++++++++++++++------
 include/linux/mmc/card.h |  2 ++
 2 files changed, 32 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index b0b6ce93e519..bd2755e8d9a3 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -63,13 +63,19 @@ static int sdio_init_func(struct mmc_card *card, unsigned int fn)
 
 	func->num = fn;
 
-	ret = sdio_read_fbr(func);
-	if (ret)
-		goto fail;
+	if (!(card->quirks & MMC_QUIRK_NONSTD_SDIO)) {
+		ret = sdio_read_fbr(func);
+		if (ret)
+			goto fail;
 
-	ret = sdio_read_func_cis(func);
-	if (ret)
-		goto fail;
+		ret = sdio_read_func_cis(func);
+		if (ret)
+			goto fail;
+	} else {
+		func->vendor = func->card->cis.vendor;
+		func->device = func->card->cis.device;
+		func->max_blksize = func->card->cis.blksize;
+	}
 
 	card->sdio_func[fn - 1] = func;
 
@@ -412,6 +418,23 @@ static int mmc_sdio_init_card(struct mmc_host *host, u32 ocr,
 			goto remove;
 	}
 
+	if (card->quirks & MMC_QUIRK_NONSTD_SDIO) {
+		/*
+		 * This is non-standard SDIO device, meaning it doesn't
+		 * have any CIA (Common I/O area) registers present.
+		 * It's host's responsibility to fill cccr and cis
+		 * structures in init_card().
+		 */
+		mmc_set_clock(host, card->cis.max_dtr);
+
+		if (card->cccr.high_speed) {
+			mmc_card_set_highspeed(card);
+			mmc_set_timing(card->host, MMC_TIMING_SD_HS);
+		}
+
+		goto finish;
+	}
+
 	/*
 	 * Read the common registers.
 	 */
@@ -480,6 +503,7 @@ static int mmc_sdio_init_card(struct mmc_host *host, u32 ocr,
 	else if (err)
 		goto remove;
 
+finish:
 	if (!oldcard)
 		host->card = card;
 	return 0;
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 340d391aecbb..4d893eaf8174 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -103,6 +103,8 @@ struct mmc_card {
 #define MMC_QUIRK_LENIENT_FN0	(1<<0)		/* allow SDIO FN0 writes outside of the VS CCCR range */
 #define MMC_QUIRK_BLKSZ_FOR_BYTE_MODE (1<<1)	/* use func->cur_blksize */
 						/* for byte mode */
+#define MMC_QUIRK_NONSTD_SDIO	(1<<2)		/* non-standard SDIO card attached */
+						/* (missing CIA registers) */
 
 	u32			raw_cid[4];	/* raw card CID */
 	u32			raw_csd[4];	/* raw card CSD */
-- 
cgit v1.2.3-59-g8ed1b


From 4a22b8a4ad5561436b16f5278d2f9e406ffb8705 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Tue, 10 Aug 2010 18:02:23 -0700
Subject: gpio: max730x: make pullups configurable via platformdata

The gpios on the max730x chips have support for internal pullups while in
input mode.

This patch adds support for configuring these pullups via platform data.
A new member ("input_pullup_active") to the platform data struct is
introduced.  A set bit in this variable activates the pullups while the
respective port is in input mode.  This is a compatible enhancement since
unset bits lead to disables pullups which was the default in the original
driver.

_Note_: the 4 lowest bits in "input_pullup_active" are unused because the
first 4 ports of the controller are not used, too.

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Reviewed-by: Wolfram Sang <w.sang@pengutronix.de>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/max730x.c      | 22 +++++++++++++++-------
 include/linux/spi/max7301.h |  8 ++++++++
 2 files changed, 23 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/max730x.c b/drivers/gpio/max730x.c
index 7696a5625d58..94ce773f95f8 100644
--- a/drivers/gpio/max730x.c
+++ b/drivers/gpio/max730x.c
@@ -54,7 +54,7 @@ static int max7301_direction_input(struct gpio_chip *chip, unsigned offset)
 {
 	struct max7301 *ts = container_of(chip, struct max7301, chip);
 	u8 *config;
-	u8 offset_bits;
+	u8 offset_bits, pin_config;
 	int ret;
 
 	/* First 4 pins are unused in the controller */
@@ -63,12 +63,15 @@ static int max7301_direction_input(struct gpio_chip *chip, unsigned offset)
 
 	config = &ts->port_config[offset >> 2];
 
+	if (ts->input_pullup_active & BIT(offset))
+		pin_config = PIN_CONFIG_IN_PULLUP;
+	else
+		pin_config = PIN_CONFIG_IN_WO_PULLUP;
+
 	mutex_lock(&ts->lock);
 
-	/* Standard GPIO API doesn't support pull-ups, has to be extended.
-	 * Hard-coding no pollup for now. */
 	*config = (*config & ~(PIN_CONFIG_MASK << offset_bits))
-			   | (PIN_CONFIG_IN_WO_PULLUP << offset_bits);
+			   | (pin_config << offset_bits);
 
 	ret = ts->write(ts->dev, 0x08 + (offset >> 2), *config);
 
@@ -177,6 +180,7 @@ int __devinit __max730x_probe(struct max7301 *ts)
 	/* Power up the chip and disable IRQ output */
 	ts->write(dev, 0x04, 0x01);
 
+	ts->input_pullup_active = pdata->input_pullup_active;
 	ts->chip.label = dev->driver->name;
 
 	ts->chip.direction_input = max7301_direction_input;
@@ -191,13 +195,17 @@ int __devinit __max730x_probe(struct max7301 *ts)
 	ts->chip.owner = THIS_MODULE;
 
 	/*
-	 * tristate all pins in hardware and cache the
+	 * initialize pullups according to platform data and cache the
 	 * register values for later use.
 	 */
 	for (i = 1; i < 8; i++) {
 		int j;
-		/* 0xAA means input with internal pullup disabled */
-		ts->write(dev, 0x08 + i, 0xAA);
+		/*
+		 * initialize port_config with "0xAA", which means
+		 * input with internal pullup disabled. This is needed
+		 * to avoid writing zeros (in the inner for loop),
+		 * which is not allowed according to the datasheet.
+		 */
 		ts->port_config[i] = 0xAA;
 		for (j = 0; j < 4; j++) {
 			int offset = (i - 1) * 4 + j;
diff --git a/include/linux/spi/max7301.h b/include/linux/spi/max7301.h
index 34af0a3477bf..bcaa2f762cc1 100644
--- a/include/linux/spi/max7301.h
+++ b/include/linux/spi/max7301.h
@@ -11,6 +11,7 @@ struct max7301 {
 	struct mutex	lock;
 	u8		port_config[8];	/* field 0 is unused */
 	u32		out_level;	/* cached output levels */
+	u32		input_pullup_active;
 	struct gpio_chip chip;
 	struct device *dev;
 	int (*write)(struct device *dev, unsigned int reg, unsigned int val);
@@ -20,6 +21,13 @@ struct max7301 {
 struct max7301_platform_data {
 	/* number assigned to the first GPIO */
 	unsigned	base;
+	/*
+	 * bitmask controlling the pullup configuration,
+	 *
+	 * _note_ the 4 lowest bits are unused, because the first 4
+	 * ports of the controller are not used, too.
+	 */
+	u32		input_pullup_active;
 };
 
 extern int __max730x_remove(struct device *dev);
-- 
cgit v1.2.3-59-g8ed1b


From c34f16b70a52e348a62944fe0d5c7c1eb9ad5b72 Mon Sep 17 00:00:00 2001
From: Gregory Bean <gbean@codeaurora.org>
Date: Tue, 10 Aug 2010 18:02:27 -0700
Subject: gpio: sx150x: add Semtech I2C sx150x gpio expander driver

Add support for Semtech SX150-series I2C GPIO expanders.  Compatible
models include:

8 bits:  sx1508q
16 bits: sx1509q

Signed-off-by: Gregory Bean <gbean@codeaurora.org>
Cc: David Brownell <david-b@pacbell.net>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Trilok Soni <tsoni@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Kconfig       |  11 +
 drivers/gpio/Makefile      |   1 +
 drivers/gpio/sx150x.c      | 645 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c/sx150x.h |  78 ++++++
 4 files changed, 735 insertions(+)
 create mode 100644 drivers/gpio/sx150x.c
 create mode 100644 include/linux/i2c/sx150x.h

(limited to 'include/linux')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 7face915b963..f623953b5797 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -195,6 +195,17 @@ config GPIO_PCF857X
 	  This driver provides an in-kernel interface to those GPIOs using
 	  platform-neutral GPIO calls.
 
+config GPIO_SX150X
+	bool "Semtech SX150x I2C GPIO expander"
+	depends on I2C=y
+	default n
+	help
+	  Say yes here to provide support for Semtech SX150-series I2C
+	  GPIO expanders. Compatible models include:
+
+	  8 bits:  sx1508q
+	  16 bits: sx1509q
+
 config GPIO_TC35892
 	bool "TC35892 GPIOs"
 	depends on MFD_TC35892
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index e53dcff49b4f..a69e0609ff7f 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -35,3 +35,4 @@ obj-$(CONFIG_GPIO_WM8994)	+= wm8994-gpio.o
 obj-$(CONFIG_GPIO_SCH)		+= sch_gpio.o
 obj-$(CONFIG_GPIO_RDC321X)	+= rdc321x-gpio.o
 obj-$(CONFIG_GPIO_JANZ_TTL)	+= janz-ttl.o
+obj-$(CONFIG_GPIO_SX150X)	+= sx150x.o
diff --git a/drivers/gpio/sx150x.c b/drivers/gpio/sx150x.c
new file mode 100644
index 000000000000..b42f42ca70c3
--- /dev/null
+++ b/drivers/gpio/sx150x.c
@@ -0,0 +1,645 @@
+/* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/i2c/sx150x.h>
+
+struct sx150x_device_data {
+	u8 reg_pullup;
+	u8 reg_pulldn;
+	u8 reg_drain;
+	u8 reg_polarity;
+	u8 reg_dir;
+	u8 reg_data;
+	u8 reg_irq_mask;
+	u8 reg_irq_src;
+	u8 reg_sense;
+	u8 reg_clock;
+	u8 reg_misc;
+	u8 reg_reset;
+	u8 ngpios;
+};
+
+struct sx150x_chip {
+	struct gpio_chip                 gpio_chip;
+	struct i2c_client               *client;
+	const struct sx150x_device_data *dev_cfg;
+	int                              irq_summary;
+	int                              irq_base;
+	u32                              irq_sense;
+	unsigned long                    irq_set_type_pending;
+	struct irq_chip                  irq_chip;
+	struct mutex                     lock;
+};
+
+static const struct sx150x_device_data sx150x_devices[] = {
+	[0] = { /* sx1508q */
+		.reg_pullup   = 0x03,
+		.reg_pulldn   = 0x04,
+		.reg_drain    = 0x05,
+		.reg_polarity = 0x06,
+		.reg_dir      = 0x07,
+		.reg_data     = 0x08,
+		.reg_irq_mask = 0x09,
+		.reg_irq_src  = 0x0c,
+		.reg_sense    = 0x0b,
+		.reg_clock    = 0x0f,
+		.reg_misc     = 0x10,
+		.reg_reset    = 0x7d,
+		.ngpios       = 8
+	},
+	[1] = { /* sx1509q */
+		.reg_pullup   = 0x07,
+		.reg_pulldn   = 0x09,
+		.reg_drain    = 0x0b,
+		.reg_polarity = 0x0d,
+		.reg_dir      = 0x0f,
+		.reg_data     = 0x11,
+		.reg_irq_mask = 0x13,
+		.reg_irq_src  = 0x19,
+		.reg_sense    = 0x17,
+		.reg_clock    = 0x1e,
+		.reg_misc     = 0x1f,
+		.reg_reset    = 0x7d,
+		.ngpios       = 16
+	},
+};
+
+static const struct i2c_device_id sx150x_id[] = {
+	{"sx1508q", 0},
+	{"sx1509q", 1},
+	{}
+};
+MODULE_DEVICE_TABLE(i2c, sx150x_id);
+
+static s32 sx150x_i2c_write(struct i2c_client *client, u8 reg, u8 val)
+{
+	s32 err = i2c_smbus_write_byte_data(client, reg, val);
+
+	if (err < 0)
+		dev_warn(&client->dev,
+			"i2c write fail: can't write %02x to %02x: %d\n",
+			val, reg, err);
+	return err;
+}
+
+static s32 sx150x_i2c_read(struct i2c_client *client, u8 reg, u8 *val)
+{
+	s32 err = i2c_smbus_read_byte_data(client, reg);
+
+	if (err >= 0)
+		*val = err;
+	else
+		dev_warn(&client->dev,
+			"i2c read fail: can't read from %02x: %d\n",
+			reg, err);
+	return err;
+}
+
+static inline bool offset_is_oscio(struct sx150x_chip *chip, unsigned offset)
+{
+	return (chip->dev_cfg->ngpios == offset);
+}
+
+/*
+ * These utility functions solve the common problem of locating and setting
+ * configuration bits.  Configuration bits are grouped into registers
+ * whose indexes increase downwards.  For example, with eight-bit registers,
+ * sixteen gpios would have their config bits grouped in the following order:
+ * REGISTER N-1 [ f e d c b a 9 8 ]
+ *          N   [ 7 6 5 4 3 2 1 0 ]
+ *
+ * For multi-bit configurations, the pattern gets wider:
+ * REGISTER N-3 [ f f e e d d c c ]
+ *          N-2 [ b b a a 9 9 8 8 ]
+ *          N-1 [ 7 7 6 6 5 5 4 4 ]
+ *          N   [ 3 3 2 2 1 1 0 0 ]
+ *
+ * Given the address of the starting register 'N', the index of the gpio
+ * whose configuration we seek to change, and the width in bits of that
+ * configuration, these functions allow us to locate the correct
+ * register and mask the correct bits.
+ */
+static inline void sx150x_find_cfg(u8 offset, u8 width,
+				u8 *reg, u8 *mask, u8 *shift)
+{
+	*reg   -= offset * width / 8;
+	*mask   = (1 << width) - 1;
+	*shift  = (offset * width) % 8;
+	*mask <<= *shift;
+}
+
+static s32 sx150x_write_cfg(struct sx150x_chip *chip,
+			u8 offset, u8 width, u8 reg, u8 val)
+{
+	u8  mask;
+	u8  data;
+	u8  shift;
+	s32 err;
+
+	sx150x_find_cfg(offset, width, &reg, &mask, &shift);
+	err = sx150x_i2c_read(chip->client, reg, &data);
+	if (err < 0)
+		return err;
+
+	data &= ~mask;
+	data |= (val << shift) & mask;
+	return sx150x_i2c_write(chip->client, reg, data);
+}
+
+static int sx150x_get_io(struct sx150x_chip *chip, unsigned offset)
+{
+	u8  reg = chip->dev_cfg->reg_data;
+	u8  mask;
+	u8  data;
+	u8  shift;
+	s32 err;
+
+	sx150x_find_cfg(offset, 1, &reg, &mask, &shift);
+	err = sx150x_i2c_read(chip->client, reg, &data);
+	if (err >= 0)
+		err = (data & mask) != 0 ? 1 : 0;
+
+	return err;
+}
+
+static void sx150x_set_oscio(struct sx150x_chip *chip, int val)
+{
+	sx150x_i2c_write(chip->client,
+			chip->dev_cfg->reg_clock,
+			(val ? 0x1f : 0x10));
+}
+
+static void sx150x_set_io(struct sx150x_chip *chip, unsigned offset, int val)
+{
+	sx150x_write_cfg(chip,
+			offset,
+			1,
+			chip->dev_cfg->reg_data,
+			(val ? 1 : 0));
+}
+
+static int sx150x_io_input(struct sx150x_chip *chip, unsigned offset)
+{
+	return sx150x_write_cfg(chip,
+				offset,
+				1,
+				chip->dev_cfg->reg_dir,
+				1);
+}
+
+static int sx150x_io_output(struct sx150x_chip *chip, unsigned offset, int val)
+{
+	int err;
+
+	err = sx150x_write_cfg(chip,
+			offset,
+			1,
+			chip->dev_cfg->reg_data,
+			(val ? 1 : 0));
+	if (err >= 0)
+		err = sx150x_write_cfg(chip,
+				offset,
+				1,
+				chip->dev_cfg->reg_dir,
+				0);
+	return err;
+}
+
+static int sx150x_gpio_get(struct gpio_chip *gc, unsigned offset)
+{
+	struct sx150x_chip *chip;
+	int status = -EINVAL;
+
+	chip = container_of(gc, struct sx150x_chip, gpio_chip);
+
+	if (!offset_is_oscio(chip, offset)) {
+		mutex_lock(&chip->lock);
+		status = sx150x_get_io(chip, offset);
+		mutex_unlock(&chip->lock);
+	}
+
+	return status;
+}
+
+static void sx150x_gpio_set(struct gpio_chip *gc, unsigned offset, int val)
+{
+	struct sx150x_chip *chip;
+
+	chip = container_of(gc, struct sx150x_chip, gpio_chip);
+
+	mutex_lock(&chip->lock);
+	if (offset_is_oscio(chip, offset))
+		sx150x_set_oscio(chip, val);
+	else
+		sx150x_set_io(chip, offset, val);
+	mutex_unlock(&chip->lock);
+}
+
+static int sx150x_gpio_direction_input(struct gpio_chip *gc, unsigned offset)
+{
+	struct sx150x_chip *chip;
+	int status = -EINVAL;
+
+	chip = container_of(gc, struct sx150x_chip, gpio_chip);
+
+	if (!offset_is_oscio(chip, offset)) {
+		mutex_lock(&chip->lock);
+		status = sx150x_io_input(chip, offset);
+		mutex_unlock(&chip->lock);
+	}
+	return status;
+}
+
+static int sx150x_gpio_direction_output(struct gpio_chip *gc,
+					unsigned offset,
+					int val)
+{
+	struct sx150x_chip *chip;
+	int status = 0;
+
+	chip = container_of(gc, struct sx150x_chip, gpio_chip);
+
+	if (!offset_is_oscio(chip, offset)) {
+		mutex_lock(&chip->lock);
+		status = sx150x_io_output(chip, offset, val);
+		mutex_unlock(&chip->lock);
+	}
+	return status;
+}
+
+static int sx150x_gpio_to_irq(struct gpio_chip *gc, unsigned offset)
+{
+	struct sx150x_chip *chip;
+
+	chip = container_of(gc, struct sx150x_chip, gpio_chip);
+
+	if (offset >= chip->dev_cfg->ngpios)
+		return -EINVAL;
+
+	if (chip->irq_base < 0)
+		return -EINVAL;
+
+	return chip->irq_base + offset;
+}
+
+static void sx150x_irq_mask(unsigned int irq)
+{
+	struct irq_chip *ic = get_irq_chip(irq);
+	struct sx150x_chip *chip;
+	unsigned n;
+
+	chip = container_of(ic, struct sx150x_chip, irq_chip);
+	n = irq - chip->irq_base;
+
+	sx150x_write_cfg(chip, n, 1, chip->dev_cfg->reg_irq_mask, 1);
+	sx150x_write_cfg(chip, n, 2, chip->dev_cfg->reg_sense, 0);
+}
+
+static void sx150x_irq_unmask(unsigned int irq)
+{
+	struct irq_chip *ic = get_irq_chip(irq);
+	struct sx150x_chip *chip;
+	unsigned n;
+
+	chip = container_of(ic, struct sx150x_chip, irq_chip);
+	n = irq - chip->irq_base;
+
+	sx150x_write_cfg(chip, n, 1, chip->dev_cfg->reg_irq_mask, 0);
+	sx150x_write_cfg(chip, n, 2, chip->dev_cfg->reg_sense,
+			 chip->irq_sense >> (n * 2));
+}
+
+static int sx150x_irq_set_type(unsigned int irq, unsigned int flow_type)
+{
+	struct irq_chip *ic = get_irq_chip(irq);
+	struct sx150x_chip *chip;
+	unsigned n, val = 0;
+
+	if (flow_type & (IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW))
+		return -EINVAL;
+
+	chip = container_of(ic, struct sx150x_chip, irq_chip);
+	n = irq - chip->irq_base;
+
+	if (flow_type & IRQ_TYPE_EDGE_RISING)
+		val |= 0x1;
+	if (flow_type & IRQ_TYPE_EDGE_FALLING)
+		val |= 0x2;
+
+	chip->irq_sense &= ~(3UL << (n * 2));
+	chip->irq_sense |= val << (n * 2);
+	chip->irq_set_type_pending |= BIT(n);
+	return 0;
+}
+
+static irqreturn_t sx150x_irq_thread_fn(int irq, void *dev_id)
+{
+	struct sx150x_chip *chip = (struct sx150x_chip *)dev_id;
+	unsigned nhandled = 0;
+	unsigned sub_irq;
+	unsigned n;
+	s32 err;
+	u8 val;
+	int i;
+
+	for (i = (chip->dev_cfg->ngpios / 8) - 1; i >= 0; --i) {
+		err = sx150x_i2c_read(chip->client,
+				      chip->dev_cfg->reg_irq_src - i,
+				      &val);
+		if (err < 0)
+			continue;
+
+		sx150x_i2c_write(chip->client,
+				chip->dev_cfg->reg_irq_src - i,
+				val);
+		for (n = 0; n < 8; ++n) {
+			if (val & (1 << n)) {
+				sub_irq = chip->irq_base + (i * 8) + n;
+				handle_nested_irq(sub_irq);
+				++nhandled;
+			}
+		}
+	}
+
+	return (nhandled > 0 ? IRQ_HANDLED : IRQ_NONE);
+}
+
+static void sx150x_irq_bus_lock(unsigned int irq)
+{
+	struct irq_chip *ic = get_irq_chip(irq);
+	struct sx150x_chip *chip;
+
+	chip = container_of(ic, struct sx150x_chip, irq_chip);
+
+	mutex_lock(&chip->lock);
+}
+
+static void sx150x_irq_bus_sync_unlock(unsigned int irq)
+{
+	struct irq_chip *ic = get_irq_chip(irq);
+	struct sx150x_chip *chip;
+	unsigned n;
+
+	chip = container_of(ic, struct sx150x_chip, irq_chip);
+
+	while (chip->irq_set_type_pending) {
+		n = __ffs(chip->irq_set_type_pending);
+		chip->irq_set_type_pending &= ~BIT(n);
+		if (!(irq_to_desc(n + chip->irq_base)->status & IRQ_MASKED))
+			sx150x_write_cfg(chip, n, 2,
+					chip->dev_cfg->reg_sense,
+					chip->irq_sense >> (n * 2));
+	}
+
+	mutex_unlock(&chip->lock);
+}
+
+static void sx150x_init_chip(struct sx150x_chip *chip,
+			struct i2c_client *client,
+			kernel_ulong_t driver_data,
+			struct sx150x_platform_data *pdata)
+{
+	mutex_init(&chip->lock);
+
+	chip->client                     = client;
+	chip->dev_cfg                    = &sx150x_devices[driver_data];
+	chip->gpio_chip.label            = client->name;
+	chip->gpio_chip.direction_input  = sx150x_gpio_direction_input;
+	chip->gpio_chip.direction_output = sx150x_gpio_direction_output;
+	chip->gpio_chip.get              = sx150x_gpio_get;
+	chip->gpio_chip.set              = sx150x_gpio_set;
+	chip->gpio_chip.to_irq           = sx150x_gpio_to_irq;
+	chip->gpio_chip.base             = pdata->gpio_base;
+	chip->gpio_chip.can_sleep        = 1;
+	chip->gpio_chip.ngpio            = chip->dev_cfg->ngpios;
+	if (pdata->oscio_is_gpo)
+		++chip->gpio_chip.ngpio;
+
+	chip->irq_chip.name            = client->name;
+	chip->irq_chip.mask            = sx150x_irq_mask;
+	chip->irq_chip.unmask          = sx150x_irq_unmask;
+	chip->irq_chip.set_type        = sx150x_irq_set_type;
+	chip->irq_chip.bus_lock        = sx150x_irq_bus_lock;
+	chip->irq_chip.bus_sync_unlock = sx150x_irq_bus_sync_unlock;
+	chip->irq_summary              = -1;
+	chip->irq_base                 = -1;
+	chip->irq_sense                = 0;
+	chip->irq_set_type_pending     = 0;
+}
+
+static int sx150x_init_io(struct sx150x_chip *chip, u8 base, u16 cfg)
+{
+	int err = 0;
+	unsigned n;
+
+	for (n = 0; err >= 0 && n < (chip->dev_cfg->ngpios / 8); ++n)
+		err = sx150x_i2c_write(chip->client, base - n, cfg >> (n * 8));
+	return err;
+}
+
+static int sx150x_init_hw(struct sx150x_chip *chip,
+			struct sx150x_platform_data *pdata)
+{
+	int err = 0;
+
+	err = i2c_smbus_write_word_data(chip->client,
+					chip->dev_cfg->reg_reset,
+					0x3412);
+	if (err < 0)
+		return err;
+
+	err = sx150x_i2c_write(chip->client,
+			chip->dev_cfg->reg_misc,
+			0x01);
+	if (err < 0)
+		return err;
+
+	err = sx150x_init_io(chip, chip->dev_cfg->reg_pullup,
+			pdata->io_pullup_ena);
+	if (err < 0)
+		return err;
+
+	err = sx150x_init_io(chip, chip->dev_cfg->reg_pulldn,
+			pdata->io_pulldn_ena);
+	if (err < 0)
+		return err;
+
+	err = sx150x_init_io(chip, chip->dev_cfg->reg_drain,
+			pdata->io_open_drain_ena);
+	if (err < 0)
+		return err;
+
+	err = sx150x_init_io(chip, chip->dev_cfg->reg_polarity,
+			pdata->io_polarity);
+	if (err < 0)
+		return err;
+
+	if (pdata->oscio_is_gpo)
+		sx150x_set_oscio(chip, 0);
+
+	return err;
+}
+
+static int sx150x_install_irq_chip(struct sx150x_chip *chip,
+				int irq_summary,
+				int irq_base)
+{
+	int err;
+	unsigned n;
+	unsigned irq;
+
+	chip->irq_summary = irq_summary;
+	chip->irq_base    = irq_base;
+
+	for (n = 0; n < chip->dev_cfg->ngpios; ++n) {
+		irq = irq_base + n;
+		set_irq_chip_and_handler(irq, &chip->irq_chip, handle_edge_irq);
+		set_irq_nested_thread(irq, 1);
+#ifdef CONFIG_ARM
+		set_irq_flags(irq, IRQF_VALID);
+#else
+		set_irq_noprobe(irq);
+#endif
+	}
+
+	err = request_threaded_irq(irq_summary,
+				NULL,
+				sx150x_irq_thread_fn,
+				IRQF_SHARED | IRQF_TRIGGER_FALLING,
+				chip->irq_chip.name,
+				chip);
+	if (err < 0) {
+		chip->irq_summary = -1;
+		chip->irq_base    = -1;
+	}
+
+	return err;
+}
+
+static void sx150x_remove_irq_chip(struct sx150x_chip *chip)
+{
+	unsigned n;
+	unsigned irq;
+
+	free_irq(chip->irq_summary, chip);
+
+	for (n = 0; n < chip->dev_cfg->ngpios; ++n) {
+		irq = chip->irq_base + n;
+		set_irq_handler(irq, NULL);
+		set_irq_chip(irq, NULL);
+	}
+}
+
+static int __devinit sx150x_probe(struct i2c_client *client,
+				const struct i2c_device_id *id)
+{
+	static const u32 i2c_funcs = I2C_FUNC_SMBUS_BYTE_DATA |
+				     I2C_FUNC_SMBUS_WRITE_WORD_DATA;
+	struct sx150x_platform_data *pdata;
+	struct sx150x_chip *chip;
+	int rc;
+
+	pdata = client->dev.platform_data;
+	if (!pdata)
+		return -EINVAL;
+
+	if (!i2c_check_functionality(client->adapter, i2c_funcs))
+		return -ENOSYS;
+
+	chip = kzalloc(sizeof(struct sx150x_chip), GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	sx150x_init_chip(chip, client, id->driver_data, pdata);
+	rc = sx150x_init_hw(chip, pdata);
+	if (rc < 0)
+		goto probe_fail_pre_gpiochip_add;
+
+	rc = gpiochip_add(&chip->gpio_chip);
+	if (rc < 0)
+		goto probe_fail_pre_gpiochip_add;
+
+	if (pdata->irq_summary >= 0) {
+		rc = sx150x_install_irq_chip(chip,
+					pdata->irq_summary,
+					pdata->irq_base);
+		if (rc < 0)
+			goto probe_fail_post_gpiochip_add;
+	}
+
+	i2c_set_clientdata(client, chip);
+
+	return 0;
+probe_fail_post_gpiochip_add:
+	WARN_ON(gpiochip_remove(&chip->gpio_chip) < 0);
+probe_fail_pre_gpiochip_add:
+	kfree(chip);
+	return rc;
+}
+
+static int __devexit sx150x_remove(struct i2c_client *client)
+{
+	struct sx150x_chip *chip;
+	int rc;
+
+	chip = i2c_get_clientdata(client);
+	rc = gpiochip_remove(&chip->gpio_chip);
+	if (rc < 0)
+		return rc;
+
+	if (chip->irq_summary >= 0)
+		sx150x_remove_irq_chip(chip);
+
+	kfree(chip);
+
+	return 0;
+}
+
+static struct i2c_driver sx150x_driver = {
+	.driver = {
+		.name = "sx150x",
+		.owner = THIS_MODULE
+	},
+	.probe    = sx150x_probe,
+	.remove   = __devexit_p(sx150x_remove),
+	.id_table = sx150x_id,
+};
+
+static int __init sx150x_init(void)
+{
+	return i2c_add_driver(&sx150x_driver);
+}
+subsys_initcall(sx150x_init);
+
+static void __exit sx150x_exit(void)
+{
+	return i2c_del_driver(&sx150x_driver);
+}
+module_exit(sx150x_exit);
+
+MODULE_AUTHOR("Gregory Bean <gbean@codeaurora.org>");
+MODULE_DESCRIPTION("Driver for Semtech SX150X I2C GPIO Expanders");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("i2c:sx150x");
diff --git a/include/linux/i2c/sx150x.h b/include/linux/i2c/sx150x.h
new file mode 100644
index 000000000000..ee3049cb9ba5
--- /dev/null
+++ b/include/linux/i2c/sx150x.h
@@ -0,0 +1,78 @@
+/*
+ * Driver for the Semtech SX150x I2C GPIO Expanders
+ *
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+#ifndef __LINUX_I2C_SX150X_H
+#define __LINUX_I2C_SX150X_H
+
+/**
+ * struct sx150x_platform_data - config data for SX150x driver
+ * @gpio_base: The index number of the first GPIO assigned to this
+ *             GPIO expander.  The expander will create a block of
+ *             consecutively numbered gpios beginning at the given base,
+ *             with the size of the block depending on the model of the
+ *             expander chip.
+ * @oscio_is_gpo: If set to true, the driver will configure OSCIO as a GPO
+ *                instead of as an oscillator, increasing the size of the
+ *                GP(I)O pool created by this expander by one.  The
+ *                output-only GPO pin will be added at the end of the block.
+ * @io_pullup_ena: A bit-mask which enables or disables the pull-up resistor
+ *                 for each IO line in the expander.  Setting the bit at
+ *                 position n will enable the pull-up for the IO at
+ *                 the corresponding offset.  For chips with fewer than
+ *                 16 IO pins, high-end bits are ignored.
+ * @io_pulldn_ena: A bit-mask which enables-or disables the pull-down
+ *                 resistor for each IO line in the expander. Setting the
+ *                 bit at position n will enable the pull-down for the IO at
+ *                 the corresponding offset.  For chips with fewer than
+ *                 16 IO pins, high-end bits are ignored.
+ * @io_open_drain_ena: A bit-mask which enables-or disables open-drain
+ *                     operation for each IO line in the expander. Setting the
+ *                     bit at position n enables open-drain operation for
+ *                     the IO at the corresponding offset.  Clearing the bit
+ *                     enables regular push-pull operation for that IO.
+ *                     For chips with fewer than 16 IO pins, high-end bits
+ *                     are ignored.
+ * @io_polarity: A bit-mask which enables polarity inversion for each IO line
+ *               in the expander.  Setting the bit at position n inverts
+ *               the polarity of that IO line, while clearing it results
+ *               in normal polarity. For chips with fewer than 16 IO pins,
+ *               high-end bits are ignored.
+ * @irq_summary: The 'summary IRQ' line to which the GPIO expander's INT line
+ *               is connected, via which it reports interrupt events
+ *               across all GPIO lines.  This must be a real,
+ *               pre-existing IRQ line.
+ *               Setting this value < 0 disables the irq_chip functionality
+ *               of the driver.
+ * @irq_base: The first 'virtual IRQ' line at which our block of GPIO-based
+ *            IRQ lines will appear.  Similarly to gpio_base, the expander
+ *            will create a block of irqs beginning at this number.
+ *            This value is ignored if irq_summary is < 0.
+ */
+struct sx150x_platform_data {
+	unsigned gpio_base;
+	bool     oscio_is_gpo;
+	u16      io_pullup_ena;
+	u16      io_pulldn_ena;
+	u16      io_open_drain_ena;
+	u16      io_polarity;
+	int      irq_summary;
+	unsigned irq_base;
+};
+
+#endif /* __LINUX_I2C_SX150X_H */
-- 
cgit v1.2.3-59-g8ed1b


From 158e0a2d1b3cffed8b46cbc56393a1394672ef79 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 10 Aug 2010 18:03:00 -0700
Subject: memcg: use find_lock_task_mm() in memory cgroups oom

When the OOM killer scans task, it check a task is under memcg or
not when it's called via memcg's context.

But, as Oleg pointed out, a thread group leader may have NULL ->mm
and task_in_mem_cgroup() may do wrong decision. We have to use
find_lock_task_mm() in memcg as generic OOM-Killer does.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h |  2 ++
 mm/memcontrol.c     | 10 +++++++---
 mm/oom_kill.c       |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index f209b683e118..5e3aa8311c5e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -66,6 +66,8 @@ static inline void oom_killer_enable(void)
 extern unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
 		      const nodemask_t *nodemask, unsigned long uptime);
 
+extern struct task_struct *find_lock_task_mm(struct task_struct *p);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ea5f5edf00b7..f52b0a1861c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,6 +47,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -838,10 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
 	struct mem_cgroup *curr = NULL;
+	struct task_struct *p;
 
-	task_lock(task);
-	curr = try_get_mem_cgroup_from_mm(task->mm);
-	task_unlock(task);
+	p = find_lock_task_mm(task);
+	if (!p)
+		return 0;
+	curr = try_get_mem_cgroup_from_mm(p->mm);
+	task_unlock(p);
 	if (!curr)
 		return 0;
 	/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d3def05a33d9..5014e50644d1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -106,7 +106,7 @@ static void boost_dying_task_prio(struct task_struct *p,
  * pointer.  Return p, or any of its subthreads with a valid ->mm, with
  * task_lock() held.
  */
-static struct task_struct *find_lock_task_mm(struct task_struct *p)
+struct task_struct *find_lock_task_mm(struct task_struct *p)
 {
 	struct task_struct *t = p;
 
-- 
cgit v1.2.3-59-g8ed1b


From 14fec79680f7cc4617d6ba69324e63d4a732986c Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 10 Aug 2010 18:03:05 -0700
Subject: memcg: mem_cgroup_shrink_node_zone() doesn't need sc.nodemask

Currently mem_cgroup_shrink_node_zone() call shrink_zone() directly.  thus
it doesn't need to initialize sc.nodemask because shrink_zone() doesn't
use it at all.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Nishimura Daisuke <d-nishimura@mtf.biglobe.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 3 +--
 mm/memcontrol.c      | 3 +--
 mm/vmscan.c          | 5 +----
 3 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 91c9d3fc8513..2fee51a11b73 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -244,8 +244,7 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						unsigned int swappiness,
-						struct zone *zone,
-						int nid);
+						struct zone *zone);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a1c3c317a4dd..8d0bfd7fe328 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1300,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		/* we use swappiness of local cgroup */
 		if (check_soft)
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, get_swappiness(victim), zone,
-				zone->zone_pgdat->node_id);
+				noswap, get_swappiness(victim), zone);
 		else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
 						noswap, get_swappiness(victim));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 291270496b6f..06ccda66dec0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1969,7 +1969,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						unsigned int swappiness,
-						struct zone *zone, int nid)
+						struct zone *zone)
 {
 	struct scan_control sc = {
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
@@ -1980,11 +1980,8 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 		.order = 0,
 		.mem_cgroup = mem,
 	};
-	nodemask_t nm  = nodemask_of_node(nid);
-
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
-	sc.nodemask = &nm;
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
 						      sc.may_writepage,
-- 
cgit v1.2.3-59-g8ed1b


From 00918b6ab89df8984ca06397cb77994dabd73f9b Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 10 Aug 2010 18:03:05 -0700
Subject: memcg: remove nid and zid argument from
 mem_cgroup_soft_limit_reclaim()

mem_cgroup_soft_limit_reclaim() has zone, nid and zid argument.  but nid
and zid can be calculated from zone.  So remove it.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Nishimura Daisuke <d-nishimura@mtf.biglobe.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 5 ++---
 mm/memcontrol.c            | 5 ++---
 mm/vmscan.c                | 7 ++-----
 3 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 73564cac38c7..159a0762aeaf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -123,8 +123,7 @@ static inline bool mem_cgroup_disabled(void)
 
 void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-						gfp_t gfp_mask, int nid,
-						int zid);
+						gfp_t gfp_mask);
 u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
@@ -301,7 +300,7 @@ static inline void mem_cgroup_update_file_mapped(struct page *page,
 
 static inline
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask, int nid, int zid)
+					    gfp_t gfp_mask)
 {
 	return 0;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8d0bfd7fe328..5e95996ddfb1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2865,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-						gfp_t gfp_mask, int nid,
-						int zid)
+					    gfp_t gfp_mask)
 {
 	unsigned long nr_reclaimed = 0;
 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2878,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	if (order > 0)
 		return 0;
 
-	mctz = soft_limit_tree_node_zone(nid, zid);
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
 	/*
 	 * This loop can run a while, specially if mem_cgroup's continuously
 	 * keep exceeding their soft limit and putting the system under
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 06ccda66dec0..c391c320dbaf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2168,7 +2168,6 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
-			int nid, zid;
 
 			if (!populated_zone(zone))
 				continue;
@@ -2178,14 +2177,12 @@ loop_again:
 
 			sc.nr_scanned = 0;
 
-			nid = pgdat->node_id;
-			zid = zone_idx(zone);
 			/*
 			 * Call soft limit reclaim before calling shrink_zone.
 			 * For now we ignore the return value
 			 */
-			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
-							nid, zid);
+			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
-- 
cgit v1.2.3-59-g8ed1b


From a6eb9fe105d5de0053b261148cee56c94b4720ca Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 10 Aug 2010 18:03:22 -0700
Subject: dma-mapping: rename ARCH_KMALLOC_MINALIGN to ARCH_DMA_MINALIGN

Now each architecture has the own dma_get_cache_alignment implementation.

dma_get_cache_alignment returns the minimum DMA alignment.  Architectures
define it as ARCH_KMALLOC_MINALIGN (it's used to make sure that malloc'ed
buffer is DMA-safe; the buffer doesn't share a cache with the others).  So
we can unify dma_get_cache_alignment implementations.

This patch:

dma_get_cache_alignment() needs to know if an architecture defines
ARCH_KMALLOC_MINALIGN or not (needs to know if architecture has DMA
alignment restriction).  However, slab.h define ARCH_KMALLOC_MINALIGN if
architectures doesn't define it.

Let's rename ARCH_KMALLOC_MINALIGN to ARCH_DMA_MINALIGN.
ARCH_KMALLOC_MINALIGN is used only in the internals of slab/slob/slub
(except for crypto).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/cache.h                 | 2 +-
 arch/avr32/include/asm/cache.h               | 2 +-
 arch/blackfin/include/asm/cache.h            | 2 +-
 arch/frv/include/asm/mem-layout.h            | 2 +-
 arch/m68k/include/asm/cache.h                | 2 +-
 arch/microblaze/include/asm/page.h           | 2 +-
 arch/mips/include/asm/mach-generic/kmalloc.h | 2 +-
 arch/mips/include/asm/mach-ip27/kmalloc.h    | 2 +-
 arch/mips/include/asm/mach-ip32/kmalloc.h    | 4 ++--
 arch/mn10300/include/asm/cache.h             | 2 +-
 arch/powerpc/include/asm/page_32.h           | 2 +-
 arch/sh/include/asm/page.h                   | 2 +-
 arch/xtensa/include/asm/cache.h              | 2 +-
 include/linux/slab_def.h                     | 4 +++-
 include/linux/slob_def.h                     | 4 +++-
 include/linux/slub_def.h                     | 8 +++++---
 16 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
index 66c160b8547f..9d6122096fbe 100644
--- a/arch/arm/include/asm/cache.h
+++ b/arch/arm/include/asm/cache.h
@@ -14,7 +14,7 @@
  * cache before the transfer is done, causing old data to be seen by
  * the CPU.
  */
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 /*
  * With EABI on ARMv5 and above we must have 64-bit aligned slab pointers.
diff --git a/arch/avr32/include/asm/cache.h b/arch/avr32/include/asm/cache.h
index d3cf35ab11ab..c3a58a189a91 100644
--- a/arch/avr32/include/asm/cache.h
+++ b/arch/avr32/include/asm/cache.h
@@ -11,7 +11,7 @@
  * cache before the transfer is done, causing old data to be seen by
  * the CPU.
  */
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 #ifndef __ASSEMBLER__
 struct cache_info {
diff --git a/arch/blackfin/include/asm/cache.h b/arch/blackfin/include/asm/cache.h
index 93f6c634fdf4..bd0641a267f1 100644
--- a/arch/blackfin/include/asm/cache.h
+++ b/arch/blackfin/include/asm/cache.h
@@ -15,7 +15,7 @@
 #define L1_CACHE_BYTES	(1 << L1_CACHE_SHIFT)
 #define SMP_CACHE_BYTES	L1_CACHE_BYTES
 
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 #ifdef CONFIG_SMP
 #define __cacheline_aligned
diff --git a/arch/frv/include/asm/mem-layout.h b/arch/frv/include/asm/mem-layout.h
index ccae981876fa..e9a0ec85a402 100644
--- a/arch/frv/include/asm/mem-layout.h
+++ b/arch/frv/include/asm/mem-layout.h
@@ -35,7 +35,7 @@
  * the slab must be aligned such that load- and store-double instructions don't
  * fault if used
  */
-#define	ARCH_KMALLOC_MINALIGN		L1_CACHE_BYTES
+#define	ARCH_DMA_MINALIGN		L1_CACHE_BYTES
 #define	ARCH_SLAB_MINALIGN		L1_CACHE_BYTES
 
 /*****************************************************************************/
diff --git a/arch/m68k/include/asm/cache.h b/arch/m68k/include/asm/cache.h
index ecafbe1718c3..0395c51e46a6 100644
--- a/arch/m68k/include/asm/cache.h
+++ b/arch/m68k/include/asm/cache.h
@@ -8,6 +8,6 @@
 #define        L1_CACHE_SHIFT  4
 #define        L1_CACHE_BYTES  (1<< L1_CACHE_SHIFT)
 
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 #endif
diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h
index 4f268faa0126..cf377d91da71 100644
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -40,7 +40,7 @@
 #ifndef __ASSEMBLY__
 
 /* MS be sure that SLAB allocates aligned objects */
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 #define ARCH_SLAB_MINALIGN	L1_CACHE_BYTES
 
diff --git a/arch/mips/include/asm/mach-generic/kmalloc.h b/arch/mips/include/asm/mach-generic/kmalloc.h
index b8e6deba352f..a5d669086ed9 100644
--- a/arch/mips/include/asm/mach-generic/kmalloc.h
+++ b/arch/mips/include/asm/mach-generic/kmalloc.h
@@ -7,7 +7,7 @@
  * Total overkill for most systems but need as a safe default.
  * Set this one if any device in the system might do non-coherent DMA.
  */
-#define ARCH_KMALLOC_MINALIGN	128
+#define ARCH_DMA_MINALIGN	128
 #endif
 
 #endif /* __ASM_MACH_GENERIC_KMALLOC_H */
diff --git a/arch/mips/include/asm/mach-ip27/kmalloc.h b/arch/mips/include/asm/mach-ip27/kmalloc.h
index 426bd049b2d7..82c23ce2afa7 100644
--- a/arch/mips/include/asm/mach-ip27/kmalloc.h
+++ b/arch/mips/include/asm/mach-ip27/kmalloc.h
@@ -2,7 +2,7 @@
 #define __ASM_MACH_IP27_KMALLOC_H
 
 /*
- * All happy, no need to define ARCH_KMALLOC_MINALIGN
+ * All happy, no need to define ARCH_DMA_MINALIGN
  */
 
 #endif /* __ASM_MACH_IP27_KMALLOC_H */
diff --git a/arch/mips/include/asm/mach-ip32/kmalloc.h b/arch/mips/include/asm/mach-ip32/kmalloc.h
index b1e0be60f720..042ca926c48f 100644
--- a/arch/mips/include/asm/mach-ip32/kmalloc.h
+++ b/arch/mips/include/asm/mach-ip32/kmalloc.h
@@ -3,9 +3,9 @@
 
 
 #if defined(CONFIG_CPU_R5000) || defined(CONFIG_CPU_RM7000)
-#define ARCH_KMALLOC_MINALIGN	32
+#define ARCH_DMA_MINALIGN	32
 #else
-#define ARCH_KMALLOC_MINALIGN	128
+#define ARCH_DMA_MINALIGN	128
 #endif
 
 #endif /* __ASM_MACH_IP32_KMALLOC_H */
diff --git a/arch/mn10300/include/asm/cache.h b/arch/mn10300/include/asm/cache.h
index 6e2fe28dde4e..781bf613366d 100644
--- a/arch/mn10300/include/asm/cache.h
+++ b/arch/mn10300/include/asm/cache.h
@@ -21,7 +21,7 @@
 #define L1_CACHE_DISPARITY	L1_CACHE_NENTRIES * L1_CACHE_BYTES
 #endif
 
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 /* data cache purge registers
  * - read from the register to unconditionally purge that cache line
diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index bd0849dbcaaa..68d73b2a7bfc 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -10,7 +10,7 @@
 #define VM_DATA_DEFAULT_FLAGS	VM_DATA_DEFAULT_FLAGS32
 
 #ifdef CONFIG_NOT_COHERENT_CACHE
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 #endif
 
 #ifdef CONFIG_PTE_64BIT
diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h
index fb703d120d09..c4e0b3d472b9 100644
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -180,7 +180,7 @@ typedef struct page *pgtable_t;
  * Some drivers need to perform DMA into kmalloc'ed buffers
  * and so we have to increase the kmalloc minalign for this.
  */
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 #ifdef CONFIG_SUPERH64
 /*
diff --git a/arch/xtensa/include/asm/cache.h b/arch/xtensa/include/asm/cache.h
index ed8cd3cbd499..d2fd932fdb4d 100644
--- a/arch/xtensa/include/asm/cache.h
+++ b/arch/xtensa/include/asm/cache.h
@@ -29,6 +29,6 @@
 # define CACHE_WAY_SIZE ICACHE_WAY_SIZE
 #endif
 
-#define ARCH_KMALLOC_MINALIGN	L1_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 #endif	/* _XTENSA_CACHE_H */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 1acfa73ce2ac..791a502f6906 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -17,7 +17,6 @@
 
 #include <trace/events/kmem.h>
 
-#ifndef ARCH_KMALLOC_MINALIGN
 /*
  * Enforce a minimum alignment for the kmalloc caches.
  * Usually, the kmalloc caches are cache_line_size() aligned, except when
@@ -27,6 +26,9 @@
  * ARCH_KMALLOC_MINALIGN allows that.
  * Note that increasing this value may disable some debug features.
  */
+#ifdef ARCH_DMA_MINALIGN
+#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
+#else
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 #endif
 
diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
index 62667f72c2ef..4382db09df4f 100644
--- a/include/linux/slob_def.h
+++ b/include/linux/slob_def.h
@@ -1,7 +1,9 @@
 #ifndef __LINUX_SLOB_DEF_H
 #define __LINUX_SLOB_DEF_H
 
-#ifndef ARCH_KMALLOC_MINALIGN
+#ifdef ARCH_DMA_MINALIGN
+#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
+#else
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
 #endif
 
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 6447a723ecb1..6d14409c4d9a 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -106,15 +106,17 @@ struct kmem_cache {
 /*
  * Kmalloc subsystem.
  */
-#if defined(ARCH_KMALLOC_MINALIGN) && ARCH_KMALLOC_MINALIGN > 8
-#define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN
+#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
+#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
 #else
 #define KMALLOC_MIN_SIZE 8
 #endif
 
 #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE)
 
-#ifndef ARCH_KMALLOC_MINALIGN
+#ifdef ARCH_DMA_MINALIGN
+#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
+#else
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From 4565f0170dfc849b3629c27d769db800467baa62 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 10 Aug 2010 18:03:22 -0700
Subject: dma-mapping: unify dma_get_cache_alignment implementations

dma_get_cache_alignment returns the minimum DMA alignment.  Architectures
defines it as ARCH_DMA_MINALIGN (formally ARCH_KMALLOC_MINALIGN).  So we
can unify dma_get_cache_alignment implementations.

Note that some architectures implement dma_get_cache_alignment wrongly.
dma_get_cache_alignment() should return the minimum DMA alignment.  So
fully-coherent architectures should return 1.  This patch also fixes this
issue.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/dma-mapping.h      |  1 -
 arch/arm/include/asm/dma-mapping.h        |  5 -----
 arch/avr32/include/asm/dma-mapping.h      |  5 -----
 arch/blackfin/include/asm/dma-mapping.h   |  1 -
 arch/cris/include/asm/dma-mapping.h       |  6 ------
 arch/frv/include/asm/dma-mapping.h        |  6 ------
 arch/ia64/include/asm/dma-mapping.h       |  2 --
 arch/ia64/kernel/setup.c                  |  6 ------
 arch/m68k/include/asm/dma-mapping.h       |  5 -----
 arch/microblaze/include/asm/dma-mapping.h |  5 -----
 arch/mips/include/asm/dma-mapping.h       |  7 -------
 arch/mn10300/include/asm/dma-mapping.h    |  6 ------
 arch/parisc/include/asm/dma-mapping.h     |  6 ------
 arch/powerpc/include/asm/dma-mapping.h    | 15 ---------------
 arch/sh/include/asm/dma-mapping.h         |  9 ---------
 arch/sparc/include/asm/dma-mapping.h      |  9 ---------
 arch/tile/include/asm/dma-mapping.h       |  7 -------
 arch/x86/include/asm/dma-mapping.h        |  7 -------
 arch/xtensa/include/asm/dma-mapping.h     |  6 ------
 include/linux/dma-mapping.h               |  8 ++++++++
 20 files changed, 8 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index 1bce8169733c..b3423d96acc7 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -44,6 +44,5 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
 #define dma_is_consistent(d, h)			(1)
 
 #define dma_cache_sync(dev, va, size, dir)		  ((void)0)
-#define dma_get_cache_alignment()			  L1_CACHE_BYTES
 
 #endif	/* _ALPHA_DMA_MAPPING_H */
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 69ce0727edb5..f4a996d5ae96 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -144,11 +144,6 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask)
 	return 0;
 }
 
-static inline int dma_get_cache_alignment(void)
-{
-	return 32;
-}
-
 static inline int dma_is_consistent(struct device *dev, dma_addr_t handle)
 {
 	return !!arch_is_coherent();
diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h
index 0399359ab5d8..af6b81655074 100644
--- a/arch/avr32/include/asm/dma-mapping.h
+++ b/arch/avr32/include/asm/dma-mapping.h
@@ -341,9 +341,4 @@ static inline int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 	return 1;
 }
 
-static inline int dma_get_cache_alignment(void)
-{
-	return boot_cpu_data.dcache.linesz;
-}
-
 #endif /* __ASM_AVR32_DMA_MAPPING_H */
diff --git a/arch/blackfin/include/asm/dma-mapping.h b/arch/blackfin/include/asm/dma-mapping.h
index 212cb80fd74b..6694a0f55de9 100644
--- a/arch/blackfin/include/asm/dma-mapping.h
+++ b/arch/blackfin/include/asm/dma-mapping.h
@@ -21,7 +21,6 @@ void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 #define dma_supported(d, m)         (1)
-#define dma_get_cache_alignment()   (32)
 #define dma_is_consistent(d, h)     (1)
 
 static inline int
diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h
index da8ef8e8f842..fc30fd0b2a04 100644
--- a/arch/cris/include/asm/dma-mapping.h
+++ b/arch/cris/include/asm/dma-mapping.h
@@ -152,12 +152,6 @@ dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline int
-dma_get_cache_alignment(void)
-{
-	return (1 << INTERNODE_CACHE_SHIFT);
-}
-
 #define dma_is_consistent(d, h)	(1)
 
 static inline void
diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h
index 6af5d83e2fb2..7b05ce14177e 100644
--- a/arch/frv/include/asm/dma-mapping.h
+++ b/arch/frv/include/asm/dma-mapping.h
@@ -125,12 +125,6 @@ int dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline
-int dma_get_cache_alignment(void)
-{
-	return 1 << L1_CACHE_SHIFT;
-}
-
 #define dma_is_consistent(d, h)	(1)
 
 static inline
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 7d09a09cdaad..8d52deed3750 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -86,8 +86,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 	return daddr;
 }
 
-extern int dma_get_cache_alignment(void);
-
 static inline void
 dma_cache_sync (struct device *dev, void *vaddr, size_t size,
 	enum dma_data_direction dir)
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 41ae6a596b50..8fb958abf8d0 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -98,12 +98,6 @@ static struct resource bss_resource = {
 
 unsigned long ia64_max_cacheline_size;
 
-int dma_get_cache_alignment(void)
-{
-        return ia64_max_cacheline_size;
-}
-EXPORT_SYMBOL(dma_get_cache_alignment);
-
 unsigned long ia64_iobase;	/* virtual address for I/O accesses */
 EXPORT_SYMBOL(ia64_iobase);
 struct io_space io_space[MAX_IO_SPACES];
diff --git a/arch/m68k/include/asm/dma-mapping.h b/arch/m68k/include/asm/dma-mapping.h
index 26f505488c11..a1ae732c7247 100644
--- a/arch/m68k/include/asm/dma-mapping.h
+++ b/arch/m68k/include/asm/dma-mapping.h
@@ -16,11 +16,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline int dma_get_cache_alignment(void)
-{
-	return 1 << L1_CACHE_SHIFT;
-}
-
 static inline int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h
index 507389580709..21df7cbae65e 100644
--- a/arch/microblaze/include/asm/dma-mapping.h
+++ b/arch/microblaze/include/asm/dma-mapping.h
@@ -132,11 +132,6 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
 	ops->free_coherent(dev, size, cpu_addr, dma_handle);
 }
 
-static inline int dma_get_cache_alignment(void)
-{
-	return L1_CACHE_BYTES;
-}
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		enum dma_data_direction direction)
 {
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 664ba53dc32a..d724a15f0438 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -62,13 +62,6 @@ dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline int
-dma_get_cache_alignment(void)
-{
-	/* XXX Largest on any MIPS */
-	return 128;
-}
-
 extern int dma_is_consistent(struct device *dev, dma_addr_t dma_addr);
 
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
diff --git a/arch/mn10300/include/asm/dma-mapping.h b/arch/mn10300/include/asm/dma-mapping.h
index 4ed1522b38d2..8d452a657795 100644
--- a/arch/mn10300/include/asm/dma-mapping.h
+++ b/arch/mn10300/include/asm/dma-mapping.h
@@ -161,12 +161,6 @@ int dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline
-int dma_get_cache_alignment(void)
-{
-	return 1 << L1_CACHE_SHIFT;
-}
-
 #define dma_is_consistent(d)	(1)
 
 static inline
diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h
index da6943380908..44d3f62ccf9d 100644
--- a/arch/parisc/include/asm/dma-mapping.h
+++ b/arch/parisc/include/asm/dma-mapping.h
@@ -184,12 +184,6 @@ dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline int
-dma_get_cache_alignment(void)
-{
-	return dcache_stride;
-}
-
 static inline int
 dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
 {
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index c85ef230135b..a77ba280af04 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -215,21 +215,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 #define dma_is_consistent(d, h)	(1)
 #endif
 
-static inline int dma_get_cache_alignment(void)
-{
-#ifdef CONFIG_PPC64
-	/* no easy way to get cache size on all processors, so return
-	 * the maximum possible, to be safe */
-	return (1 << INTERNODE_CACHE_SHIFT);
-#else
-	/*
-	 * Each processor family will define its own L1_CACHE_SHIFT,
-	 * L1_CACHE_BYTES wraps to this, so this is always safe.
-	 */
-	return L1_CACHE_BYTES;
-#endif
-}
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		enum dma_data_direction direction)
 {
diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h
index bea3337a426a..6bb5cc9decf8 100644
--- a/arch/sh/include/asm/dma-mapping.h
+++ b/arch/sh/include/asm/dma-mapping.h
@@ -48,15 +48,6 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 #define dma_is_consistent(d, h) (0)
 #endif
 
-static inline int dma_get_cache_alignment(void)
-{
-	/*
-	 * Each processor family will define its own L1_CACHE_SHIFT,
-	 * L1_CACHE_BYTES wraps to this, so this is always safe.
-	 */
-	return L1_CACHE_BYTES;
-}
-
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index 4b4a0c0b0ccd..74db853ec2cf 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -52,15 +52,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return (dma_addr == DMA_ERROR_CODE);
 }
 
-static inline int dma_get_cache_alignment(void)
-{
-	/*
-	 * no easy way to get cache size on all processors, so return
-	 * the maximum possible, to be safe
-	 */
-	return (1 << INTERNODE_CACHE_SHIFT);
-}
-
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
 #ifdef CONFIG_PCI
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index cf466b39aa13..1326b910fec6 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -90,13 +90,6 @@ dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline int
-dma_get_cache_alignment(void)
-{
-	return L2_CACHE_BYTES;
-}
-
 #define dma_is_consistent(d, h)	(1)
 
-
 #endif /* _ASM_TILE_DMA_MAPPING_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index ac91eed21061..f9c67e83f648 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -87,13 +87,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	flush_write_buffers();
 }
 
-static inline int dma_get_cache_alignment(void)
-{
-	/* no easy way to get cache size on all x86, so return the
-	 * maximum possible, to be safe */
-	return boot_cpu_data.x86_clflush_size;
-}
-
 static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
 						    gfp_t gfp)
 {
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index 51882ae3db4d..7104f2f9823e 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -161,12 +161,6 @@ dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
-static inline int
-dma_get_cache_alignment(void)
-{
-	return L1_CACHE_BYTES;
-}
-
 #define dma_is_consistent(d, h)	(1)
 
 static inline void
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 89b7e1a605b8..e0670a512056 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -142,6 +142,14 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
 		return -EIO;
 }
 
+static inline int dma_get_cache_alignment(void)
+{
+#ifdef ARCH_DMA_MINALIGN
+	return ARCH_DMA_MINALIGN;
+#endif
+	return 1;
+}
+
 /* flags for the coherent memory api */
 #define	DMA_MEMORY_MAP			0x01
 #define DMA_MEMORY_IO			0x02
-- 
cgit v1.2.3-59-g8ed1b


From c7ff0d9c92435e836e13aaa8d0e56d4000424bcc Mon Sep 17 00:00:00 2001
From: TAMUKI Shoichi <tamuki@linet.gr.jp>
Date: Tue, 10 Aug 2010 18:03:28 -0700
Subject: panic: keep blinking in spite of long spin timer mode

To keep panic_timeout accuracy when running under a hypervisor, the
current implementation only spins on long time (1 second) calls to mdelay.
 That brings a good effect, but the problem is the keyboard LEDs don't
blink at all on that situation.

This patch changes to call to panic_blink_enter() between every mdelay and
keeps blinking in spite of long spin timer mode.

The time to call to mdelay is now 100ms.  Even this change will keep
panic_timeout accuracy enough when running under a hypervisor.

Signed-off-by: TAMUKI Shoichi <tamuki@linet.gr.jp>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Russell King <linux@arm.linux.org.uk>
Acked-by: Dmitry Torokhov <dtor@mail.ru>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kernel-parameters.txt |  3 --
 arch/arm/mach-s3c2440/mach-gta02.c  | 17 ++++-------
 drivers/input/serio/i8042.c         | 25 ++++------------
 include/linux/kernel.h              |  2 +-
 kernel/panic.c                      | 58 +++++++++++++++++--------------------
 5 files changed, 37 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d529b1363e95..873b68090098 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -915,9 +915,6 @@ and is between 256 and 4096 characters. It is defined in the file
 			     controller
 	i8042.nopnp	[HW] Don't use ACPIPnP / PnPBIOS to discover KBD/AUX
 			     controllers
-	i8042.panicblink=
-			[HW] Frequency with which keyboard LEDs should blink
-			     when kernel panics (default is 0.5 sec)
 	i8042.reset	[HW] Reset the controller during init and cleanup
 	i8042.unlock	[HW] Unlock (ignore) the keylock
 
diff --git a/arch/arm/mach-s3c2440/mach-gta02.c b/arch/arm/mach-s3c2440/mach-gta02.c
index 9e39faa283b9..deaabe86741d 100644
--- a/arch/arm/mach-s3c2440/mach-gta02.c
+++ b/arch/arm/mach-s3c2440/mach-gta02.c
@@ -90,24 +90,17 @@
 static struct pcf50633 *gta02_pcf;
 
 /*
- * This gets called every 1ms when we paniced.
+ * This gets called frequently when we paniced.
  */
 
-static long gta02_panic_blink(long count)
+static long gta02_panic_blink(int state)
 {
 	long delay = 0;
-	static long last_blink;
-	static char led;
+	char led;
 
-	/* Fast blink: 200ms period. */
-	if (count - last_blink < 100)
-		return 0;
-
-	led ^= 1;
+	led = (state) ? 1 : 0;
 	gpio_direction_output(GTA02_GPIO_AUX_LED, led);
 
-	last_blink = count;
-
 	return delay;
 }
 
@@ -556,7 +549,7 @@ static void gta02_poweroff(void)
 
 static void __init gta02_machine_init(void)
 {
-	/* Set the panic callback to make AUX LED blink at ~5Hz. */
+	/* Set the panic callback to turn AUX LED on or off. */
 	panic_blink = gta02_panic_blink;
 
 	s3c_pm_init();
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index 258b98b9d7c2..46e4ba0b9246 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -61,10 +61,6 @@ static bool i8042_noloop;
 module_param_named(noloop, i8042_noloop, bool, 0);
 MODULE_PARM_DESC(noloop, "Disable the AUX Loopback command while probing for the AUX port");
 
-static unsigned int i8042_blink_frequency = 500;
-module_param_named(panicblink, i8042_blink_frequency, uint, 0600);
-MODULE_PARM_DESC(panicblink, "Frequency with which keyboard LEDs should blink when kernel panics");
-
 #ifdef CONFIG_X86
 static bool i8042_dritek;
 module_param_named(dritek, i8042_dritek, bool, 0);
@@ -1030,8 +1026,8 @@ static void i8042_controller_reset(void)
 
 
 /*
- * i8042_panic_blink() will flash the keyboard LEDs and is called when
- * kernel panics. Flashing LEDs is useful for users running X who may
+ * i8042_panic_blink() will turn the keyboard LEDs on or off and is called
+ * when kernel panics. Flashing LEDs is useful for users running X who may
  * not see the console and will help distingushing panics from "real"
  * lockups.
  *
@@ -1041,22 +1037,12 @@ static void i8042_controller_reset(void)
 
 #define DELAY do { mdelay(1); if (++delay > 10) return delay; } while(0)
 
-static long i8042_panic_blink(long count)
+static long i8042_panic_blink(int state)
 {
 	long delay = 0;
-	static long last_blink;
-	static char led;
-
-	/*
-	 * We expect frequency to be about 1/2s. KDB uses about 1s.
-	 * Make sure they are different.
-	 */
-	if (!i8042_blink_frequency)
-		return 0;
-	if (count - last_blink < i8042_blink_frequency)
-		return 0;
+	char led;
 
-	led ^= 0x01 | 0x04;
+	led = (state) ? 0x01 | 0x04 : 0;
 	while (i8042_read_status() & I8042_STR_IBF)
 		DELAY;
 	dbg("%02x -> i8042 (panic blink)", 0xed);
@@ -1069,7 +1055,6 @@ static long i8042_panic_blink(long count)
 	dbg("%02x -> i8042 (panic blink)", led);
 	i8042_write_data(led);
 	DELAY;
-	last_blink = count;
 	return delay;
 }
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5b57236dfbd0..452833d67b21 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -177,7 +177,7 @@ struct va_format {
 };
 
 extern struct atomic_notifier_head panic_notifier_list;
-extern long (*panic_blink)(long time);
+extern long (*panic_blink)(int state);
 NORET_TYPE void panic(const char * fmt, ...)
 	__attribute__ ((NORET_AND format (printf, 1, 2))) __cold;
 extern void oops_enter(void);
diff --git a/kernel/panic.c b/kernel/panic.c
index 3b16cd93fa7d..3e9037ae10e1 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,9 @@
 #include <linux/nmi.h>
 #include <linux/dmi.h>
 
+#define PANIC_TIMER_STEP 100
+#define PANIC_BLINK_SPD 18
+
 int panic_on_oops;
 static unsigned long tainted_mask;
 static int pause_on_oops;
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
 EXPORT_SYMBOL(panic_notifier_list);
 
-/* Returns how long it waited in ms */
-long (*panic_blink)(long time);
-EXPORT_SYMBOL(panic_blink);
-
-static void panic_blink_one_second(void)
+static long no_blink(int state)
 {
-	static long i = 0, end;
-
-	if (panic_blink) {
-		end = i + MSEC_PER_SEC;
-
-		while (i < end) {
-			i += panic_blink(i);
-			mdelay(1);
-			i++;
-		}
-	} else {
-		/*
-		 * When running under a hypervisor a small mdelay may get
-		 * rounded up to the hypervisor timeslice. For example, with
-		 * a 1ms in 10ms hypervisor timeslice we might inflate a
-		 * mdelay(1) loop by 10x.
-		 *
-		 * If we have nothing to blink, spin on 1 second calls to
-		 * mdelay to avoid this.
-		 */
-		mdelay(MSEC_PER_SEC);
-	}
+	return 0;
 }
 
+/* Returns how long it waited in ms */
+long (*panic_blink)(int state);
+EXPORT_SYMBOL(panic_blink);
+
 /**
  *	panic - halt the system
  *	@fmt: The text string to print
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...)
 {
 	static char buf[1024];
 	va_list args;
-	long i;
+	long i, i_next = 0;
+	int state = 0;
 
 	/*
 	 * It's possible to come here directly from a panic-assertion and
@@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...)
 
 	bust_spinlocks(0);
 
+	if (!panic_blink)
+		panic_blink = no_blink;
+
 	if (panic_timeout > 0) {
 		/*
 		 * Delay timeout seconds before rebooting the machine.
@@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...)
 		 */
 		printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
 
-		for (i = 0; i < panic_timeout; i++) {
+		for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
 			touch_nmi_watchdog();
-			panic_blink_one_second();
+			if (i >= i_next) {
+				i += panic_blink(state ^= 1);
+				i_next = i + 3600 / PANIC_BLINK_SPD;
+			}
+			mdelay(PANIC_TIMER_STEP);
 		}
 		/*
 		 * This will not be a clean reboot, with everything
@@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...)
 	}
 #endif
 	local_irq_enable();
-	while (1) {
+	for (i = 0; ; i += PANIC_TIMER_STEP) {
 		touch_softlockup_watchdog();
-		panic_blink_one_second();
+		if (i >= i_next) {
+			i += panic_blink(state ^= 1);
+			i_next = i + 3600 / PANIC_BLINK_SPD;
+		}
+		mdelay(PANIC_TIMER_STEP);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 863a6049202412a6d655d052eb1c45ca7dd74a83 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 10 Aug 2010 18:03:30 -0700
Subject: lib/bug.c: add oops end marker to WARN implementation

We are missing the oops end marker for the exception based WARN implementation
in lib/bug.c. This is useful for logfile analysis tools.

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 1 +
 kernel/panic.c         | 2 +-
 lib/bug.c              | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 452833d67b21..d848cb854655 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -182,6 +182,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	__attribute__ ((NORET_AND format (printf, 1, 2))) __cold;
 extern void oops_enter(void);
 extern void oops_exit(void);
+void print_oops_end_marker(void);
 extern int oops_may_print(void);
 NORET_TYPE void do_exit(long error_code)
 	ATTRIB_NORET;
diff --git a/kernel/panic.c b/kernel/panic.c
index 3e9037ae10e1..4c13b1a88ebb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -338,7 +338,7 @@ static int init_oops_id(void)
 }
 late_initcall(init_oops_id);
 
-static void print_oops_end_marker(void)
+void print_oops_end_marker(void)
 {
 	init_oops_id();
 	printk(KERN_WARNING "---[ end trace %016llx ]---\n",
diff --git a/lib/bug.c b/lib/bug.c
index 6c5b30cf3f0f..7cdfad88128f 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -166,6 +166,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 
 		print_modules();
 		show_regs(regs);
+		print_oops_end_marker();
 		add_taint(BUG_GET_TAINT(bug));
 		return BUG_TRAP_TYPE_WARN;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 0bcaa65a56ab74003666cf741b0bfa1e9263a11c Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Tue, 10 Aug 2010 18:03:33 -0700
Subject: fs/sysv: v7: adjust sanity checks for some volumes

Newly mkfs-ed filesystems from Seventh Edition have last modification time
set to zero, but are otherwise perfectly valid.

Also, tighten up other sanity checks to filter out most filesystems with

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sysv/super.c         |  6 ++++--
 include/linux/sysv_fs.h | 11 +++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 2da3075aff78..3785262b98e0 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -469,7 +469,7 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 	v7sb = (struct v7_super_block *) bh->b_data;
 	if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
 	    fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
-	    fs32_to_cpu(sbi, v7sb->s_time) == 0)
+	    fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
 		goto failed;
 
 	/* plausibility check on root inode: it is a directory,
@@ -479,7 +479,9 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 	v7i = (struct sysv_inode *)(bh2->b_data + 64);
 	if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
 	    (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
-	    (fs32_to_cpu(sbi, v7i->i_size) & 017) != 0)
+	    (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
+	    (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
+	     sizeof(struct sysv_dir_entry)))
 		goto failed;
 	brelse(bh2);
 	bh2 = NULL;
diff --git a/include/linux/sysv_fs.h b/include/linux/sysv_fs.h
index 96411306eec6..e47d6d90023d 100644
--- a/include/linux/sysv_fs.h
+++ b/include/linux/sysv_fs.h
@@ -148,6 +148,17 @@ struct v7_super_block {
 	char    s_fname[6];     /* file system name */
 	char    s_fpack[6];     /* file system pack name */
 };
+/* Constants to aid sanity checking */
+/* This is not a hard limit, nor enforced by v7 kernel. It's actually just
+ * the limit used by Seventh Edition's ls, though is high enough to assume
+ * that no reasonable file system would have that much entries in root
+ * directory. Thus, if we see anything higher, we just probably got the
+ * endiannes wrong. */
+#define V7_NFILES	1024
+/* The disk addresses are three-byte (despite direct block addresses being
+ * aligned word-wise in inode). If the most significant byte is non-zero,
+ * something is most likely wrong (not a filesystem, bad bytesex). */
+#define V7_MAXSIZE	0x00ffffff
 
 /* Coherent super-block data on disk */
 #define COH_NICINOD	100	/* number of inode cache entries */
-- 
cgit v1.2.3-59-g8ed1b


From ad9c7ed0685406fe3cd7f0749b82bf433a39dd9c Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Tue, 10 Aug 2010 18:03:34 -0700
Subject: kfifo: kfifo_is_{full,empty} should return bools, not ints

For consistency with other kfifo routines, return bool, not int.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Cc: Stefani Seibold <stefani@seibold.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kfifo.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 9fad0527344f..57c4eedf4dd6 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -171,7 +171,7 @@ static inline unsigned int kfifo_len(struct kfifo *fifo)
  * kfifo_is_empty - returns true if the fifo is empty
  * @fifo: the fifo to be used.
  */
-static inline __must_check int kfifo_is_empty(struct kfifo *fifo)
+static inline __must_check bool kfifo_is_empty(struct kfifo *fifo)
 {
 	return fifo->in == fifo->out;
 }
@@ -180,7 +180,7 @@ static inline __must_check int kfifo_is_empty(struct kfifo *fifo)
  * kfifo_is_full - returns true if the fifo is full
  * @fifo: the fifo to be used.
  */
-static inline __must_check int kfifo_is_full(struct kfifo *fifo)
+static inline __must_check bool kfifo_is_full(struct kfifo *fifo)
 {
 	return kfifo_len(fifo) == kfifo_size(fifo);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 4201d9a8e86b51dd40aa8a0dabd093376c859985 Mon Sep 17 00:00:00 2001
From: Stefani Seibold <stefani@seibold.net>
Date: Tue, 10 Aug 2010 18:03:38 -0700
Subject: kfifo: add the new generic kfifo API

Add the new version of the kfifo API files kfifo.c and kfifo.h.

Signed-off-by: Stefani Seibold <stefani@seibold.net>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kfifo-new.h | 844 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/kfifo-new.c        | 602 +++++++++++++++++++++++++++++++++
 2 files changed, 1446 insertions(+)
 create mode 100644 include/linux/kfifo-new.h
 create mode 100644 kernel/kfifo-new.c

(limited to 'include/linux')

diff --git a/include/linux/kfifo-new.h b/include/linux/kfifo-new.h
new file mode 100644
index 000000000000..311f8753d713
--- /dev/null
+++ b/include/linux/kfifo-new.h
@@ -0,0 +1,844 @@
+/*
+ * A generic kernel FIFO implementation
+ *
+ * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef _LINUX_KFIFO_H
+#define _LINUX_KFIFO_H
+
+/*
+ * How to porting drivers to the new generic FIFO API:
+ *
+ * - Modify the declaration of the "struct kfifo *" object into a
+ *   in-place "struct kfifo" object
+ * - Init the in-place object with kfifo_alloc() or kfifo_init()
+ *   Note: The address of the in-place "struct kfifo" object must be
+ *   passed as the first argument to this functions
+ * - Replace the use of __kfifo_put into kfifo_in and __kfifo_get
+ *   into kfifo_out
+ * - Replace the use of kfifo_put into kfifo_in_spinlocked and kfifo_get
+ *   into kfifo_out_spinlocked
+ *   Note: the spinlock pointer formerly passed to kfifo_init/kfifo_alloc
+ *   must be passed now to the kfifo_in_spinlocked and kfifo_out_spinlocked
+ *   as the last parameter
+ * - The formerly __kfifo_* functions are renamed into kfifo_*
+ */
+
+/*
+ * Note about locking : There is no locking required until only * one reader
+ * and one writer is using the fifo and no kfifo_reset() will be * called
+ *  kfifo_reset_out() can be safely used, until it will be only called
+ * in the reader thread.
+ *  For multiple writer and one reader there is only a need to lock the writer.
+ * And vice versa for only one writer and multiple reader there is only a need
+ * to lock the reader.
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/scatterlist.h>
+
+struct __kfifo {
+	unsigned int	in;
+	unsigned int	out;
+	unsigned int	mask;
+	unsigned int	esize;
+	void		*data;
+};
+
+#define __STRUCT_KFIFO_COMMON(datatype, recsize, ptrtype) \
+	union { \
+		struct __kfifo	kfifo; \
+		datatype	*type; \
+		char		(*rectype)[recsize]; \
+		ptrtype		*ptr; \
+		const ptrtype	*ptr_const; \
+	}
+
+#define __STRUCT_KFIFO(type, size, recsize, ptrtype) \
+{ \
+	__STRUCT_KFIFO_COMMON(type, recsize, ptrtype); \
+	type		buf[((size < 2) || (size & (size - 1))) ? -1 : size]; \
+}
+
+#define STRUCT_KFIFO(type, size) \
+	struct __STRUCT_KFIFO(type, size, 0, type)
+
+#define __STRUCT_KFIFO_PTR(type, recsize, ptrtype) \
+{ \
+	__STRUCT_KFIFO_COMMON(type, recsize, ptrtype); \
+	type		buf[0]; \
+}
+
+#define STRUCT_KFIFO_PTR(type) \
+	struct __STRUCT_KFIFO_PTR(type, 0, type)
+
+/*
+ * define compatibility "struct kfifo" for dynamic allocated fifos
+ */
+struct kfifo __STRUCT_KFIFO_PTR(unsigned char, 0, void);
+
+#define STRUCT_KFIFO_REC_1(size) \
+	struct __STRUCT_KFIFO(unsigned char, size, 1, void)
+
+#define STRUCT_KFIFO_REC_2(size) \
+	struct __STRUCT_KFIFO(unsigned char, size, 2, void)
+
+/*
+ * define kfifo_rec types
+ */
+struct kfifo_rec_ptr_1 __STRUCT_KFIFO_PTR(unsigned char, 1, void);
+struct kfifo_rec_ptr_2 __STRUCT_KFIFO_PTR(unsigned char, 2, void);
+
+/*
+ * helper macro to distinguish between real in place fifo where the fifo
+ * array is a part of the structure and the fifo type where the array is
+ * outside of the fifo structure.
+ */
+#define	__is_kfifo_ptr(fifo)	(sizeof(*fifo) == sizeof(struct __kfifo))
+
+/**
+ * DECLARE_KFIFO_PTR - macro to declare a fifo pointer object
+ * @fifo: name of the declared fifo
+ * @type: type of the fifo elements
+ */
+#define DECLARE_KFIFO_PTR(fifo, type)	STRUCT_KFIFO_PTR(type) fifo
+
+/**
+ * DECLARE_KFIFO - macro to declare a fifo object
+ * @fifo: name of the declared fifo
+ * @type: type of the fifo elements
+ * @size: the number of elements in the fifo, this must be a power of 2
+ */
+#define DECLARE_KFIFO(fifo, type, size)	STRUCT_KFIFO(type, size) fifo
+
+/**
+ * INIT_KFIFO - Initialize a fifo declared by DECLARE_KFIFO
+ * @fifo: name of the declared fifo datatype
+ */
+#define INIT_KFIFO(fifo) \
+(void)({ \
+	typeof(&(fifo)) __tmp = &(fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__kfifo->in = 0; \
+	__kfifo->out = 0; \
+	__kfifo->mask = __is_kfifo_ptr(__tmp) ? 0 : ARRAY_SIZE(__tmp->buf) - 1;\
+	__kfifo->esize = sizeof(*__tmp->buf); \
+	__kfifo->data = __is_kfifo_ptr(__tmp) ?  NULL : __tmp->buf; \
+})
+
+/**
+ * DEFINE_KFIFO - macro to define and initialize a fifo
+ * @fifo: name of the declared fifo datatype
+ * @type: type of the fifo elements
+ * @size: the number of elements in the fifo, this must be a power of 2
+ *
+ * Note: the macro can be used for global and local fifo data type variables.
+ */
+#define DEFINE_KFIFO(fifo, type, size) \
+	DECLARE_KFIFO(fifo, type, size) = \
+	(typeof(fifo)) { \
+		{ \
+			{ \
+			.in	= 0, \
+			.out	= 0, \
+			.mask	= __is_kfifo_ptr(&(fifo)) ? \
+				  0 : \
+				  ARRAY_SIZE((fifo).buf) - 1, \
+			.esize	= sizeof(*(fifo).buf), \
+			.data	= __is_kfifo_ptr(&(fifo)) ? \
+				NULL : \
+				(fifo).buf, \
+			} \
+		} \
+	}
+
+
+static inline unsigned int __must_check
+__kfifo_must_check_helper(unsigned int val)
+{
+	return val;
+}
+
+/**
+ * kfifo_initialized - Check if the fifo is initialized
+ * @fifo: address of the fifo to check
+ *
+ * Return %true if fifo is initialized, otherwise %false.
+ * Assumes the fifo was 0 before.
+ */
+#define kfifo_initialized(fifo) ((fifo)->kfifo.mask)
+
+/**
+ * kfifo_esize - returns the size of the element managed by the fifo
+ * @fifo: address of the fifo to be used
+ */
+#define kfifo_esize(fifo)	((fifo)->kfifo.esize)
+
+/**
+ * kfifo_recsize - returns the size of the record length field
+ * @fifo: address of the fifo to be used
+ */
+#define kfifo_recsize(fifo)	(sizeof(*(fifo)->rectype))
+
+/**
+ * kfifo_size - returns the size of the fifo in elements
+ * @fifo: address of the fifo to be used
+ */
+#define kfifo_size(fifo)	((fifo)->kfifo.mask + 1)
+
+/**
+ * kfifo_reset - removes the entire fifo content
+ * @fifo: address of the fifo to be used
+ *
+ * Note: usage of kfifo_reset() is dangerous. It should be only called when the
+ * fifo is exclusived locked or when it is secured that no other thread is
+ * accessing the fifo.
+ */
+#define kfifo_reset(fifo) \
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	__tmp->kfifo.in = __tmp->kfifo.out = 0; \
+})
+
+/**
+ * kfifo_reset_out - skip fifo content
+ * @fifo: address of the fifo to be used
+ *
+ * Note: The usage of kfifo_reset_out() is safe until it will be only called
+ * from the reader thread and there is only one concurrent reader. Otherwise
+ * it is dangerous and must be handled in the same way as kfifo_reset().
+ */
+#define kfifo_reset_out(fifo)	\
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	__tmp->kfifo.out = __tmp->kfifo.in; \
+})
+
+/**
+ * kfifo_len - returns the number of used elements in the fifo
+ * @fifo: address of the fifo to be used
+ */
+#define kfifo_len(fifo) \
+({ \
+	typeof(fifo + 1) __tmpl = (fifo); \
+	__tmpl->kfifo.in - __tmpl->kfifo.out; \
+})
+
+/**
+ * kfifo_is_empty - returns true if the fifo is empty
+ * @fifo: address of the fifo to be used
+ */
+#define	kfifo_is_empty(fifo) \
+({ \
+	typeof(fifo + 1) __tmpq = (fifo); \
+	__tmpq->kfifo.in == __tmpq->kfifo.out; \
+})
+
+/**
+ * kfifo_is_full - returns true if the fifo is full
+ * @fifo: address of the fifo to be used
+ */
+#define	kfifo_is_full(fifo) \
+({ \
+	typeof(fifo + 1) __tmpq = (fifo); \
+	kfifo_len(__tmpq) > __tmpq->kfifo.mask; \
+})
+
+/**
+ * kfifo_avail - returns the number of unused elements in the fifo
+ * @fifo: address of the fifo to be used
+ */
+#define	kfifo_avail(fifo) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmpq = (fifo); \
+	const size_t __recsize = sizeof(*__tmpq->rectype); \
+	unsigned int __avail = kfifo_size(__tmpq) - kfifo_len(__tmpq); \
+	(__recsize) ? ((__avail <= __recsize) ? 0 : \
+	__kfifo_max_r(__avail - __recsize, __recsize)) : \
+	__avail; \
+}) \
+)
+
+/**
+ * kfifo_skip - skip output data
+ * @fifo: address of the fifo to be used
+ */
+#define	kfifo_skip(fifo) \
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (__recsize) \
+		__kfifo_skip_r(__kfifo, __recsize); \
+	else \
+		__kfifo->out++; \
+})
+
+/**
+ * kfifo_peek_len - gets the size of the next fifo record
+ * @fifo: address of the fifo to be used
+ *
+ * This function returns the size of the next fifo record in number of bytes.
+ */
+#define kfifo_peek_len(fifo) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(!__recsize) ? kfifo_len(__tmp) * sizeof(*__tmp->type) : \
+	__kfifo_len_r(__kfifo, __recsize); \
+}) \
+)
+
+/**
+ * kfifo_alloc - dynamically allocates a new fifo buffer
+ * @fifo: pointer to the fifo
+ * @size: the number of elements in the fifo, this must be a power of 2
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ *
+ * This macro dynamically allocates a new fifo buffer.
+ *
+ * The numer of elements will be rounded-up to a power of 2.
+ * The fifo will be release with kfifo_free().
+ * Return 0 if no error, otherwise an error code.
+ */
+#define kfifo_alloc(fifo, size, gfp_mask) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__is_kfifo_ptr(__tmp) ? \
+	__kfifo_alloc(__kfifo, size, sizeof(*__tmp->type), gfp_mask) : \
+	-EINVAL; \
+}) \
+)
+
+/**
+ * kfifo_free - frees the fifo
+ * @fifo: the fifo to be freed
+ */
+#define kfifo_free(fifo) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (__is_kfifo_ptr(__tmp)) \
+		__kfifo_free(__kfifo); \
+})
+
+/**
+ * kfifo_init - initialize a fifo using a preallocated buffer
+ * @fifo: the fifo to assign the buffer
+ * @buffer: the preallocated buffer to be used
+ * @size: the size of the internal buffer, this have to be a power of 2
+ *
+ * This macro initialize a fifo using a preallocated buffer.
+ *
+ * The numer of elements will be rounded-up to a power of 2.
+ * Return 0 if no error, otherwise an error code.
+ */
+#define kfifo_init(fifo, buffer, size) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__is_kfifo_ptr(__tmp) ? \
+	__kfifo_init(__kfifo, buffer, size, sizeof(*__tmp->type)) : \
+	-EINVAL; \
+})
+
+/**
+ * kfifo_put - put data into the fifo
+ * @fifo: address of the fifo to be used
+ * @val: the data to be added
+ *
+ * This macro copies the given value into the fifo.
+ * It returns 0 if the fifo was full. Otherwise it returns the number
+ * processed elements.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_put(fifo, val) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(val + 1) __val = (val); \
+	unsigned int __ret; \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) { \
+		typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \
+		__dummy = (typeof(__val))NULL; \
+	} \
+	if (__recsize) \
+		__ret = __kfifo_in_r(__kfifo, __val, sizeof(*__val), \
+			__recsize); \
+	else { \
+		__ret = !kfifo_is_full(__tmp); \
+		if (__ret) { \
+			(__is_kfifo_ptr(__tmp) ? \
+			((typeof(__tmp->type))__kfifo->data) : \
+			(__tmp->buf) \
+			)[__kfifo->in & __tmp->kfifo.mask] = \
+				*(typeof(__tmp->type))__val; \
+			smp_wmb(); \
+			__kfifo->in++; \
+		} \
+	} \
+	__ret; \
+})
+
+/**
+ * kfifo_get - get data from the fifo
+ * @fifo: address of the fifo to be used
+ * @val: the var where to store the data to be added
+ *
+ * This macro reads the data from the fifo.
+ * It returns 0 if the fifo was empty. Otherwise it returns the number
+ * processed elements.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_get(fifo, val) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(val + 1) __val = (val); \
+	unsigned int __ret; \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) \
+		__val = (typeof(__tmp->ptr))0; \
+	if (__recsize) \
+		__ret = __kfifo_out_r(__kfifo, __val, sizeof(*__val), \
+			__recsize); \
+	else { \
+		__ret = !kfifo_is_empty(__tmp); \
+		if (__ret) { \
+			*(typeof(__tmp->type))__val = \
+				(__is_kfifo_ptr(__tmp) ? \
+				((typeof(__tmp->type))__kfifo->data) : \
+				(__tmp->buf) \
+				)[__kfifo->out & __tmp->kfifo.mask]; \
+			smp_wmb(); \
+			__kfifo->out++; \
+		} \
+	} \
+	__ret; \
+}) \
+)
+
+/**
+ * kfifo_peek - get data from the fifo without removing
+ * @fifo: address of the fifo to be used
+ * @val: the var where to store the data to be added
+ *
+ * This reads the data from the fifo without removing it from the fifo.
+ * It returns 0 if the fifo was empty. Otherwise it returns the number
+ * processed elements.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_peek(fifo, val) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(val + 1) __val = (val); \
+	unsigned int __ret; \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) \
+		__val = (typeof(__tmp->ptr))NULL; \
+	if (__recsize) \
+		__ret = __kfifo_out_peek_r(__kfifo, __val, sizeof(*__val), \
+			__recsize); \
+	else { \
+		__ret = !kfifo_is_empty(__tmp); \
+		if (__ret) { \
+			*(typeof(__tmp->type))__val = \
+				(__is_kfifo_ptr(__tmp) ? \
+				((typeof(__tmp->type))__kfifo->data) : \
+				(__tmp->buf) \
+				)[__kfifo->out & __tmp->kfifo.mask]; \
+			smp_wmb(); \
+		} \
+	} \
+	__ret; \
+}) \
+)
+
+/**
+ * kfifo_in - put data into the fifo
+ * @fifo: address of the fifo to be used
+ * @buf: the data to be added
+ * @n: number of elements to be added
+ *
+ * This macro copies the given buffer into the fifo and returns the
+ * number of copied elements.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_in(fifo, buf, n) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(buf + 1) __buf = (buf); \
+	unsigned long __n = (n); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) { \
+		typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \
+		__dummy = (typeof(__buf))NULL; \
+	} \
+	(__recsize) ?\
+	__kfifo_in_r(__kfifo, __buf, __n, __recsize) : \
+	__kfifo_in(__kfifo, __buf, __n); \
+})
+
+/**
+ * kfifo_in_spinlocked - put data into the fifo using a spinlock for locking
+ * @fifo: address of the fifo to be used
+ * @buf: the data to be added
+ * @n: number of elements to be added
+ * @lock: pointer to the spinlock to use for locking
+ *
+ * This macro copies the given values buffer into the fifo and returns the
+ * number of copied elements.
+ */
+#define	kfifo_in_spinlocked(fifo, buf, n, lock) \
+({ \
+	unsigned long __flags; \
+	unsigned int __ret; \
+	spin_lock_irqsave(lock, __flags); \
+	__ret = kfifo_in(fifo, buf, n); \
+	spin_unlock_irqrestore(lock, __flags); \
+	__ret; \
+})
+
+/* alias for kfifo_in_spinlocked, will be removed in a future release */
+#define kfifo_in_locked(fifo, buf, n, lock) \
+		kfifo_in_spinlocked(fifo, buf, n, lock)
+
+/**
+ * kfifo_out - get data from the fifo
+ * @fifo: address of the fifo to be used
+ * @buf: pointer to the storage buffer
+ * @n: max. number of elements to get
+ *
+ * This macro get some data from the fifo and return the numbers of elements
+ * copied.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_out(fifo, buf, n) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(buf + 1) __buf = (buf); \
+	unsigned long __n = (n); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) { \
+		typeof(__tmp->ptr) __dummy = NULL; \
+		__buf = __dummy; \
+	} \
+	(__recsize) ?\
+	__kfifo_out_r(__kfifo, __buf, __n, __recsize) : \
+	__kfifo_out(__kfifo, __buf, __n); \
+}) \
+)
+
+/**
+ * kfifo_out_spinlocked - get data from the fifo using a spinlock for locking
+ * @fifo: address of the fifo to be used
+ * @buf: pointer to the storage buffer
+ * @n: max. number of elements to get
+ * @lock: pointer to the spinlock to use for locking
+ *
+ * This macro get the data from the fifo and return the numbers of elements
+ * copied.
+ */
+#define	kfifo_out_spinlocked(fifo, buf, n, lock) \
+__kfifo_must_check_helper( \
+({ \
+	unsigned long __flags; \
+	unsigned int __ret; \
+	spin_lock_irqsave(lock, __flags); \
+	__ret = kfifo_out(fifo, buf, n); \
+	spin_unlock_irqrestore(lock, __flags); \
+	__ret; \
+}) \
+)
+
+/* alias for kfifo_out_spinlocked, will be removed in a future release */
+#define kfifo_out_locked(fifo, buf, n, lock) \
+		kfifo_out_spinlocked(fifo, buf, n, lock)
+
+/**
+ * kfifo_from_user - puts some data from user space into the fifo
+ * @fifo: address of the fifo to be used
+ * @from: pointer to the data to be added
+ * @len: the length of the data to be added
+ * @copied: pointer to output variable to store the number of copied bytes
+ *
+ * This macro copies at most @len bytes from the @from into the
+ * fifo, depending of the available space and returns -EFAULT/0.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_from_user(fifo, from, len, copied) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	const void __user *__from = (from); \
+	unsigned int __len = (len); \
+	unsigned int *__copied = (copied); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_from_user_r(__kfifo, __from, __len,  __copied, __recsize) : \
+	__kfifo_from_user(__kfifo, __from, __len, __copied); \
+}) \
+)
+
+/**
+ * kfifo_to_user - copies data from the fifo into user space
+ * @fifo: address of the fifo to be used
+ * @to: where the data must be copied
+ * @len: the size of the destination buffer
+ * @copied: pointer to output variable to store the number of copied bytes
+ *
+ * This macro copies at most @len bytes from the fifo into the
+ * @to buffer and returns -EFAULT/0.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_to_user(fifo, to, len, copied) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	void __user *__to = (to); \
+	unsigned int __len = (len); \
+	unsigned int *__copied = (copied); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_to_user_r(__kfifo, __to, __len, __copied, __recsize) : \
+	__kfifo_to_user(__kfifo, __to, __len, __copied); \
+}) \
+)
+
+/**
+ * kfifo_dma_in_prepare - setup a scatterlist for DMA input
+ * @fifo: address of the fifo to be used
+ * @sgl: pointer to the scatterlist array
+ * @nents: number of entries in the scatterlist array
+ * @len: number of elements to transfer
+ *
+ * This macro fills a scatterlist for DMA input.
+ * It returns the number entries in the scatterlist array.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macros.
+ */
+#define	kfifo_dma_in_prepare(fifo, sgl, nents, len) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct scatterlist *__sgl = (sgl); \
+	int __nents = (nents); \
+	unsigned int __len = (len); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_dma_in_prepare_r(__kfifo, __sgl, __nents, __len, __recsize) : \
+	__kfifo_dma_in_prepare(__kfifo, __sgl, __nents, __len); \
+})
+
+/**
+ * kfifo_dma_in_finish - finish a DMA IN operation
+ * @fifo: address of the fifo to be used
+ * @len: number of bytes to received
+ *
+ * This macro finish a DMA IN operation. The in counter will be updated by
+ * the len parameter. No error checking will be done.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macros.
+ */
+#define kfifo_dma_in_finish(fifo, len) \
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	unsigned int __len = (len); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (__recsize) \
+		__kfifo_dma_in_finish_r(__kfifo, __len, __recsize); \
+	else \
+		__kfifo->in += __len / sizeof(*__tmp->type); \
+})
+
+/**
+ * kfifo_dma_out_prepare - setup a scatterlist for DMA output
+ * @fifo: address of the fifo to be used
+ * @sgl: pointer to the scatterlist array
+ * @nents: number of entries in the scatterlist array
+ * @len: number of elements to transfer
+ *
+ * This macro fills a scatterlist for DMA output which at most @len bytes
+ * to transfer.
+ * It returns the number entries in the scatterlist array.
+ * A zero means there is no space available and the scatterlist is not filled.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macros.
+ */
+#define	kfifo_dma_out_prepare(fifo, sgl, nents, len) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct scatterlist *__sgl = (sgl); \
+	int __nents = (nents); \
+	unsigned int __len = (len); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_dma_out_prepare_r(__kfifo, __sgl, __nents, __len, __recsize) : \
+	__kfifo_dma_out_prepare(__kfifo, __sgl, __nents, __len); \
+})
+
+/**
+ * kfifo_dma_out_finish - finish a DMA OUT operation
+ * @fifo: address of the fifo to be used
+ * @len: number of bytes transferd
+ *
+ * This macro finish a DMA OUT operation. The out counter will be updated by
+ * the len parameter. No error checking will be done.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macros.
+ */
+#define kfifo_dma_out_finish(fifo, len) \
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	unsigned int __len = (len); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (__recsize) \
+		__kfifo_dma_out_finish_r(__kfifo, __recsize); \
+	else \
+		__kfifo->out += __len / sizeof(*__tmp->type); \
+})
+
+/**
+ * kfifo_out_peek - gets some data from the fifo
+ * @fifo: address of the fifo to be used
+ * @buf: pointer to the storage buffer
+ * @n: max. number of elements to get
+ *
+ * This macro get the data from the fifo and return the numbers of elements
+ * copied. The data is not removed from the fifo.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_out_peek(fifo, buf, n) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(buf + 1) __buf = (buf); \
+	unsigned long __n = (n); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) { \
+		typeof(__tmp->ptr) __dummy __attribute__ ((unused)) = NULL; \
+		__buf = __dummy; \
+	} \
+	(__recsize) ? \
+	__kfifo_out_peek_r(__kfifo, __buf, __n, __recsize) : \
+	__kfifo_out_peek(__kfifo, __buf, __n); \
+}) \
+)
+
+extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
+	size_t esize, gfp_t gfp_mask);
+
+extern void __kfifo_free(struct __kfifo *fifo);
+
+extern int __kfifo_init(struct __kfifo *fifo, void *buffer,
+	unsigned int size, size_t esize);
+
+extern unsigned int __kfifo_in(struct __kfifo *fifo,
+	const void *buf, unsigned int len);
+
+extern unsigned int __kfifo_out(struct __kfifo *fifo,
+	void *buf, unsigned int len);
+
+extern int __kfifo_from_user(struct __kfifo *fifo,
+	const void __user *from, unsigned long len, unsigned int *copied);
+
+extern int __kfifo_to_user(struct __kfifo *fifo,
+	void __user *to, unsigned long len, unsigned int *copied);
+
+extern unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len);
+
+extern unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len);
+
+extern unsigned int __kfifo_out_peek(struct __kfifo *fifo,
+	void *buf, unsigned int len);
+
+extern unsigned int __kfifo_in_r(struct __kfifo *fifo,
+	const void *buf, unsigned int len, size_t recsize);
+
+extern unsigned int __kfifo_out_r(struct __kfifo *fifo,
+	void *buf, unsigned int len, size_t recsize);
+
+extern int __kfifo_from_user_r(struct __kfifo *fifo,
+	const void __user *from, unsigned long len, unsigned int *copied,
+	size_t recsize);
+
+extern int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
+	unsigned long len, unsigned int *copied, size_t recsize);
+
+extern unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
+
+extern void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
+	unsigned int len, size_t recsize);
+
+extern unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
+
+extern void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize);
+
+extern unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize);
+
+extern unsigned int __kfifo_out_peek_r(struct __kfifo *fifo,
+	void *buf, unsigned int len, size_t recsize);
+
+extern unsigned int __kfifo_max_r(unsigned int len, size_t recsize);
+
+#endif
diff --git a/kernel/kfifo-new.c b/kernel/kfifo-new.c
new file mode 100644
index 000000000000..02192dd905cc
--- /dev/null
+++ b/kernel/kfifo-new.c
@@ -0,0 +1,602 @@
+/*
+ * A generic kernel FIFO implementation
+ *
+ * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/log2.h>
+#include <linux/uaccess.h>
+#include <linux/kfifo.h>
+
+/*
+ * internal helper to calculate the unused elements in a fifo
+ */
+static inline unsigned int kfifo_unused(struct __kfifo *fifo)
+{
+	return (fifo->mask + 1) - (fifo->in - fifo->out);
+}
+
+int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
+		size_t esize, gfp_t gfp_mask)
+{
+	/*
+	 * round down to the next power of 2, since our 'let the indices
+	 * wrap' technique works only in this case.
+	 */
+	if (!is_power_of_2(size))
+		size = rounddown_pow_of_two(size);
+
+	fifo->in = 0;
+	fifo->out = 0;
+	fifo->esize = esize;
+
+	if (size < 2) {
+		fifo->data = NULL;
+		fifo->mask = 0;
+		return -EINVAL;
+	}
+
+	fifo->data = kmalloc(size * esize, gfp_mask);
+
+	if (!fifo->data) {
+		fifo->mask = 0;
+		return -ENOMEM;
+	}
+	fifo->mask = size - 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(__kfifo_alloc);
+
+void __kfifo_free(struct __kfifo *fifo)
+{
+	kfree(fifo->data);
+	fifo->in = 0;
+	fifo->out = 0;
+	fifo->esize = 0;
+	fifo->data = NULL;
+	fifo->mask = 0;
+}
+EXPORT_SYMBOL(__kfifo_free);
+
+int __kfifo_init(struct __kfifo *fifo, void *buffer,
+		unsigned int size, size_t esize)
+{
+	size /= esize;
+
+	if (!is_power_of_2(size))
+		size = rounddown_pow_of_two(size);
+
+	fifo->in = 0;
+	fifo->out = 0;
+	fifo->esize = esize;
+	fifo->data = buffer;
+
+	if (size < 2) {
+		fifo->mask = 0;
+		return -EINVAL;
+	}
+	fifo->mask = size - 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(__kfifo_init);
+
+static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
+		unsigned int len, unsigned int off)
+{
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
+	unsigned int l;
+
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	memcpy(fifo->data + off, src, l);
+	memcpy(fifo->data, src + l, len - l);
+	/*
+	 * make sure that the data in the fifo is up to date before
+	 * incrementing the fifo->in index counter
+	 */
+	smp_wmb();
+}
+
+unsigned int __kfifo_in(struct __kfifo *fifo,
+		const void *buf, unsigned int len)
+{
+	unsigned int l;
+
+	l = kfifo_unused(fifo);
+	if (len > l)
+		len = l;
+
+	kfifo_copy_in(fifo, buf, len, fifo->in);
+	fifo->in += len;
+	return len;
+}
+EXPORT_SYMBOL(__kfifo_in);
+
+static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
+		unsigned int len, unsigned int off)
+{
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
+	unsigned int l;
+
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	memcpy(dst, fifo->data + off, l);
+	memcpy(dst + l, fifo->data, len - l);
+	/*
+	 * make sure that the data is copied before
+	 * incrementing the fifo->out index counter
+	 */
+	smp_wmb();
+}
+
+unsigned int __kfifo_out_peek(struct __kfifo *fifo,
+		void *buf, unsigned int len)
+{
+	unsigned int l;
+
+	l = fifo->in - fifo->out;
+	if (len > l)
+		len = l;
+
+	kfifo_copy_out(fifo, buf, len, fifo->out);
+	return len;
+}
+EXPORT_SYMBOL(__kfifo_out_peek);
+
+unsigned int __kfifo_out(struct __kfifo *fifo,
+		void *buf, unsigned int len)
+{
+	len = __kfifo_out_peek(fifo, buf, len);
+	fifo->out += len;
+	return len;
+}
+EXPORT_SYMBOL(__kfifo_out);
+
+static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
+	const void __user *from, unsigned int len, unsigned int off,
+	unsigned int *copied)
+{
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
+	unsigned int l;
+	unsigned long ret;
+
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	ret = copy_from_user(fifo->data + off, from, l);
+	if (unlikely(ret))
+		ret = DIV_ROUND_UP(ret + len - l, esize);
+	else {
+		ret = copy_from_user(fifo->data, from + l, len - l);
+		if (unlikely(ret))
+			ret = DIV_ROUND_UP(ret, esize);
+	}
+	/*
+	 * make sure that the data in the fifo is up to date before
+	 * incrementing the fifo->in index counter
+	 */
+	smp_wmb();
+	*copied = len - ret;
+	/* return the number of elements which are not copied */
+	return ret;
+}
+
+int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
+		unsigned long len, unsigned int *copied)
+{
+	unsigned int l;
+	unsigned long ret;
+	unsigned int esize = fifo->esize;
+	int err;
+
+	if (esize != 1)
+		len /= esize;
+
+	l = kfifo_unused(fifo);
+	if (len > l)
+		len = l;
+
+	ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
+	if (unlikely(ret)) {
+		len -= ret;
+		err = -EFAULT;
+	} else
+		err = 0;
+	fifo->in += len;
+	return err;
+}
+EXPORT_SYMBOL(__kfifo_from_user);
+
+static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
+		unsigned int len, unsigned int off, unsigned int *copied)
+{
+	unsigned int l;
+	unsigned long ret;
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
+
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	ret = copy_to_user(to, fifo->data + off, l);
+	if (unlikely(ret))
+		ret = DIV_ROUND_UP(ret + len - l, esize);
+	else {
+		ret = copy_to_user(to + l, fifo->data, len - l);
+		if (unlikely(ret))
+			ret = DIV_ROUND_UP(ret, esize);
+	}
+	/*
+	 * make sure that the data is copied before
+	 * incrementing the fifo->out index counter
+	 */
+	smp_wmb();
+	*copied = len - ret;
+	/* return the number of elements which are not copied */
+	return ret;
+}
+
+int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
+		unsigned long len, unsigned int *copied)
+{
+	unsigned int l;
+	unsigned long ret;
+	unsigned int esize = fifo->esize;
+	int err;
+
+	if (esize != 1)
+		len /= esize;
+
+	l = fifo->in - fifo->out;
+	if (len > l)
+		len = l;
+	ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
+	if (unlikely(ret)) {
+		len -= ret;
+		err = -EFAULT;
+	} else
+		err = 0;
+	fifo->out += len;
+	return err;
+}
+EXPORT_SYMBOL(__kfifo_to_user);
+
+static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
+		int nents, unsigned int len)
+{
+	int n;
+	unsigned int l;
+	unsigned int off;
+	struct page *page;
+
+	if (!nents)
+		return 0;
+
+	if (!len)
+		return 0;
+
+	n = 0;
+	page = virt_to_page(buf);
+	off = offset_in_page(buf);
+	l = 0;
+
+	while (len >= l + PAGE_SIZE - off) {
+		struct page *npage;
+
+		l += PAGE_SIZE;
+		buf += PAGE_SIZE;
+		npage = virt_to_page(buf);
+		if (page_to_phys(page) != page_to_phys(npage) - l) {
+			sgl->page_link = 0;
+			sg_set_page(sgl++, page, l - off, off);
+			if (++n == nents)
+				return n;
+			page = npage;
+			len -= l - off;
+			l = off = 0;
+		}
+	}
+	sgl->page_link = 0;
+	sg_set_page(sgl++, page, len, off);
+	return n + 1;
+}
+
+static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
+		int nents, unsigned int len, unsigned int off)
+{
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
+	unsigned int l;
+	unsigned int n;
+
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	n  = setup_sgl_buf(sgl, fifo->data + off, nents, l);
+	n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
+
+	if (n)
+		sg_mark_end(sgl + n - 1);
+	return n;
+}
+
+unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
+		struct scatterlist *sgl, int nents, unsigned int len)
+{
+	unsigned int l;
+
+	l = kfifo_unused(fifo);
+	if (len > l)
+		len = l;
+
+	return setup_sgl(fifo, sgl, nents, len, fifo->in);
+}
+EXPORT_SYMBOL(__kfifo_dma_in_prepare);
+
+unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
+		struct scatterlist *sgl, int nents, unsigned int len)
+{
+	unsigned int l;
+
+	l = fifo->in - fifo->out;
+	if (len > l)
+		len = l;
+
+	return setup_sgl(fifo, sgl, nents, len, fifo->out);
+}
+EXPORT_SYMBOL(__kfifo_dma_out_prepare);
+
+unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
+{
+	unsigned int max = (1 << (recsize << 3)) - 1;
+
+	if (len > max)
+		return max;
+	return len;
+}
+
+#define	__KFIFO_PEEK(data, out, mask) \
+	((data)[(out) & (mask)])
+/*
+ * __kfifo_peek_n internal helper function for determinate the length of
+ * the next record in the fifo
+ */
+static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
+{
+	unsigned int l;
+	unsigned int mask = fifo->mask;
+	unsigned char *data = fifo->data;
+
+	l = __KFIFO_PEEK(data, fifo->out, mask);
+
+	if (--recsize)
+		l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
+
+	return l;
+}
+
+#define	__KFIFO_POKE(data, in, mask, val) \
+	( \
+	(data)[(in) & (mask)] = (unsigned char)(val) \
+	)
+
+/*
+ * __kfifo_poke_n internal helper function for storeing the length of
+ * the record into the fifo
+ */
+static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
+{
+	unsigned int mask = fifo->mask;
+	unsigned char *data = fifo->data;
+
+	__KFIFO_POKE(data, fifo->in, mask, n);
+
+	if (recsize > 1)
+		__KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
+}
+
+unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
+{
+	return __kfifo_peek_n(fifo, recsize);
+}
+EXPORT_SYMBOL(__kfifo_len_r);
+
+unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
+		unsigned int len, size_t recsize)
+{
+	if (len + recsize > kfifo_unused(fifo))
+		return 0;
+
+	__kfifo_poke_n(fifo, len, recsize);
+
+	kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
+	fifo->in += len + recsize;
+	return len;
+}
+EXPORT_SYMBOL(__kfifo_in_r);
+
+static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
+	void *buf, unsigned int len, size_t recsize, unsigned int *n)
+{
+	*n = __kfifo_peek_n(fifo, recsize);
+
+	if (len > *n)
+		len = *n;
+
+	kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
+	return len;
+}
+
+unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
+		unsigned int len, size_t recsize)
+{
+	unsigned int n;
+
+	if (fifo->in == fifo->out)
+		return 0;
+
+	return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
+}
+EXPORT_SYMBOL(__kfifo_out_peek_r);
+
+unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
+		unsigned int len, size_t recsize)
+{
+	unsigned int n;
+
+	if (fifo->in == fifo->out)
+		return 0;
+
+	len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
+	fifo->out += n + recsize;
+	return len;
+}
+EXPORT_SYMBOL(__kfifo_out_r);
+
+int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
+	unsigned long len, unsigned int *copied, size_t recsize)
+{
+	unsigned long ret;
+
+	len = __kfifo_max_r(len, recsize);
+
+	if (len + recsize > kfifo_unused(fifo)) {
+		*copied = 0;
+		return 0;
+	}
+
+	__kfifo_poke_n(fifo, len, recsize);
+
+	ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
+	if (unlikely(ret)) {
+		*copied = 0;
+		return -EFAULT;
+	}
+	fifo->in += len + recsize;
+	return 0;
+}
+EXPORT_SYMBOL(__kfifo_from_user_r);
+
+int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
+	unsigned long len, unsigned int *copied, size_t recsize)
+{
+	unsigned long ret;
+	unsigned int n;
+
+	if (fifo->in == fifo->out) {
+		*copied = 0;
+		return 0;
+	}
+
+	n = __kfifo_peek_n(fifo, recsize);
+	if (len > n)
+		len = n;
+
+	ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
+	if (unlikely(ret)) {
+		*copied = 0;
+		return -EFAULT;
+	}
+	fifo->out += n + recsize;
+	return 0;
+}
+EXPORT_SYMBOL(__kfifo_to_user_r);
+
+unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
+{
+	if (!nents)
+		BUG();
+
+	len = __kfifo_max_r(len, recsize);
+
+	if (len + recsize > kfifo_unused(fifo))
+		return 0;
+
+	return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
+}
+EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
+
+void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
+	unsigned int len, size_t recsize)
+{
+	len = __kfifo_max_r(len, recsize);
+	__kfifo_poke_n(fifo, len, recsize);
+	fifo->in += len + recsize;
+}
+EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
+
+unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
+{
+	if (!nents)
+		BUG();
+
+	len = __kfifo_max_r(len, recsize);
+
+	if (len + recsize > fifo->in - fifo->out)
+		return 0;
+
+	return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
+}
+EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
+
+void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
+{
+	unsigned int len;
+
+	len = __kfifo_peek_n(fifo, recsize);
+	fifo->out += len + recsize;
+}
+EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
-- 
cgit v1.2.3-59-g8ed1b


From 2e956fb320568cc70861761483e2f0e2db75fd66 Mon Sep 17 00:00:00 2001
From: Stefani Seibold <stefani@seibold.net>
Date: Tue, 10 Aug 2010 18:03:38 -0700
Subject: kfifo: replace the old non generic API

Simply replace the whole kfifo.c and kfifo.h files with the new generic
version and fix the kerneldoc API template file.

Signed-off-by: Stefani Seibold <stefani@seibold.net>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DocBook/kernel-api.tmpl |    1 -
 include/linux/kfifo-new.h             |  844 -----------------------
 include/linux/kfifo.h                 | 1193 ++++++++++++++++++++-------------
 kernel/kfifo-new.c                    |  602 -----------------
 kernel/kfifo.c                        |  749 +++++++++++++--------
 5 files changed, 1164 insertions(+), 2225 deletions(-)
 delete mode 100644 include/linux/kfifo-new.h
 delete mode 100644 kernel/kfifo-new.c

(limited to 'include/linux')

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 44b3def961a2..a20c6f6fffc3 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -132,7 +132,6 @@ X!Ilib/string.c
      <title>FIFO Buffer</title>
      <sect1><title>kfifo interface</title>
 !Iinclude/linux/kfifo.h
-!Ekernel/kfifo.c
      </sect1>
   </chapter>
 
diff --git a/include/linux/kfifo-new.h b/include/linux/kfifo-new.h
deleted file mode 100644
index 311f8753d713..000000000000
--- a/include/linux/kfifo-new.h
+++ /dev/null
@@ -1,844 +0,0 @@
-/*
- * A generic kernel FIFO implementation
- *
- * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#ifndef _LINUX_KFIFO_H
-#define _LINUX_KFIFO_H
-
-/*
- * How to porting drivers to the new generic FIFO API:
- *
- * - Modify the declaration of the "struct kfifo *" object into a
- *   in-place "struct kfifo" object
- * - Init the in-place object with kfifo_alloc() or kfifo_init()
- *   Note: The address of the in-place "struct kfifo" object must be
- *   passed as the first argument to this functions
- * - Replace the use of __kfifo_put into kfifo_in and __kfifo_get
- *   into kfifo_out
- * - Replace the use of kfifo_put into kfifo_in_spinlocked and kfifo_get
- *   into kfifo_out_spinlocked
- *   Note: the spinlock pointer formerly passed to kfifo_init/kfifo_alloc
- *   must be passed now to the kfifo_in_spinlocked and kfifo_out_spinlocked
- *   as the last parameter
- * - The formerly __kfifo_* functions are renamed into kfifo_*
- */
-
-/*
- * Note about locking : There is no locking required until only * one reader
- * and one writer is using the fifo and no kfifo_reset() will be * called
- *  kfifo_reset_out() can be safely used, until it will be only called
- * in the reader thread.
- *  For multiple writer and one reader there is only a need to lock the writer.
- * And vice versa for only one writer and multiple reader there is only a need
- * to lock the reader.
- */
-
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/stddef.h>
-#include <linux/scatterlist.h>
-
-struct __kfifo {
-	unsigned int	in;
-	unsigned int	out;
-	unsigned int	mask;
-	unsigned int	esize;
-	void		*data;
-};
-
-#define __STRUCT_KFIFO_COMMON(datatype, recsize, ptrtype) \
-	union { \
-		struct __kfifo	kfifo; \
-		datatype	*type; \
-		char		(*rectype)[recsize]; \
-		ptrtype		*ptr; \
-		const ptrtype	*ptr_const; \
-	}
-
-#define __STRUCT_KFIFO(type, size, recsize, ptrtype) \
-{ \
-	__STRUCT_KFIFO_COMMON(type, recsize, ptrtype); \
-	type		buf[((size < 2) || (size & (size - 1))) ? -1 : size]; \
-}
-
-#define STRUCT_KFIFO(type, size) \
-	struct __STRUCT_KFIFO(type, size, 0, type)
-
-#define __STRUCT_KFIFO_PTR(type, recsize, ptrtype) \
-{ \
-	__STRUCT_KFIFO_COMMON(type, recsize, ptrtype); \
-	type		buf[0]; \
-}
-
-#define STRUCT_KFIFO_PTR(type) \
-	struct __STRUCT_KFIFO_PTR(type, 0, type)
-
-/*
- * define compatibility "struct kfifo" for dynamic allocated fifos
- */
-struct kfifo __STRUCT_KFIFO_PTR(unsigned char, 0, void);
-
-#define STRUCT_KFIFO_REC_1(size) \
-	struct __STRUCT_KFIFO(unsigned char, size, 1, void)
-
-#define STRUCT_KFIFO_REC_2(size) \
-	struct __STRUCT_KFIFO(unsigned char, size, 2, void)
-
-/*
- * define kfifo_rec types
- */
-struct kfifo_rec_ptr_1 __STRUCT_KFIFO_PTR(unsigned char, 1, void);
-struct kfifo_rec_ptr_2 __STRUCT_KFIFO_PTR(unsigned char, 2, void);
-
-/*
- * helper macro to distinguish between real in place fifo where the fifo
- * array is a part of the structure and the fifo type where the array is
- * outside of the fifo structure.
- */
-#define	__is_kfifo_ptr(fifo)	(sizeof(*fifo) == sizeof(struct __kfifo))
-
-/**
- * DECLARE_KFIFO_PTR - macro to declare a fifo pointer object
- * @fifo: name of the declared fifo
- * @type: type of the fifo elements
- */
-#define DECLARE_KFIFO_PTR(fifo, type)	STRUCT_KFIFO_PTR(type) fifo
-
-/**
- * DECLARE_KFIFO - macro to declare a fifo object
- * @fifo: name of the declared fifo
- * @type: type of the fifo elements
- * @size: the number of elements in the fifo, this must be a power of 2
- */
-#define DECLARE_KFIFO(fifo, type, size)	STRUCT_KFIFO(type, size) fifo
-
-/**
- * INIT_KFIFO - Initialize a fifo declared by DECLARE_KFIFO
- * @fifo: name of the declared fifo datatype
- */
-#define INIT_KFIFO(fifo) \
-(void)({ \
-	typeof(&(fifo)) __tmp = &(fifo); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	__kfifo->in = 0; \
-	__kfifo->out = 0; \
-	__kfifo->mask = __is_kfifo_ptr(__tmp) ? 0 : ARRAY_SIZE(__tmp->buf) - 1;\
-	__kfifo->esize = sizeof(*__tmp->buf); \
-	__kfifo->data = __is_kfifo_ptr(__tmp) ?  NULL : __tmp->buf; \
-})
-
-/**
- * DEFINE_KFIFO - macro to define and initialize a fifo
- * @fifo: name of the declared fifo datatype
- * @type: type of the fifo elements
- * @size: the number of elements in the fifo, this must be a power of 2
- *
- * Note: the macro can be used for global and local fifo data type variables.
- */
-#define DEFINE_KFIFO(fifo, type, size) \
-	DECLARE_KFIFO(fifo, type, size) = \
-	(typeof(fifo)) { \
-		{ \
-			{ \
-			.in	= 0, \
-			.out	= 0, \
-			.mask	= __is_kfifo_ptr(&(fifo)) ? \
-				  0 : \
-				  ARRAY_SIZE((fifo).buf) - 1, \
-			.esize	= sizeof(*(fifo).buf), \
-			.data	= __is_kfifo_ptr(&(fifo)) ? \
-				NULL : \
-				(fifo).buf, \
-			} \
-		} \
-	}
-
-
-static inline unsigned int __must_check
-__kfifo_must_check_helper(unsigned int val)
-{
-	return val;
-}
-
-/**
- * kfifo_initialized - Check if the fifo is initialized
- * @fifo: address of the fifo to check
- *
- * Return %true if fifo is initialized, otherwise %false.
- * Assumes the fifo was 0 before.
- */
-#define kfifo_initialized(fifo) ((fifo)->kfifo.mask)
-
-/**
- * kfifo_esize - returns the size of the element managed by the fifo
- * @fifo: address of the fifo to be used
- */
-#define kfifo_esize(fifo)	((fifo)->kfifo.esize)
-
-/**
- * kfifo_recsize - returns the size of the record length field
- * @fifo: address of the fifo to be used
- */
-#define kfifo_recsize(fifo)	(sizeof(*(fifo)->rectype))
-
-/**
- * kfifo_size - returns the size of the fifo in elements
- * @fifo: address of the fifo to be used
- */
-#define kfifo_size(fifo)	((fifo)->kfifo.mask + 1)
-
-/**
- * kfifo_reset - removes the entire fifo content
- * @fifo: address of the fifo to be used
- *
- * Note: usage of kfifo_reset() is dangerous. It should be only called when the
- * fifo is exclusived locked or when it is secured that no other thread is
- * accessing the fifo.
- */
-#define kfifo_reset(fifo) \
-(void)({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	__tmp->kfifo.in = __tmp->kfifo.out = 0; \
-})
-
-/**
- * kfifo_reset_out - skip fifo content
- * @fifo: address of the fifo to be used
- *
- * Note: The usage of kfifo_reset_out() is safe until it will be only called
- * from the reader thread and there is only one concurrent reader. Otherwise
- * it is dangerous and must be handled in the same way as kfifo_reset().
- */
-#define kfifo_reset_out(fifo)	\
-(void)({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	__tmp->kfifo.out = __tmp->kfifo.in; \
-})
-
-/**
- * kfifo_len - returns the number of used elements in the fifo
- * @fifo: address of the fifo to be used
- */
-#define kfifo_len(fifo) \
-({ \
-	typeof(fifo + 1) __tmpl = (fifo); \
-	__tmpl->kfifo.in - __tmpl->kfifo.out; \
-})
-
-/**
- * kfifo_is_empty - returns true if the fifo is empty
- * @fifo: address of the fifo to be used
- */
-#define	kfifo_is_empty(fifo) \
-({ \
-	typeof(fifo + 1) __tmpq = (fifo); \
-	__tmpq->kfifo.in == __tmpq->kfifo.out; \
-})
-
-/**
- * kfifo_is_full - returns true if the fifo is full
- * @fifo: address of the fifo to be used
- */
-#define	kfifo_is_full(fifo) \
-({ \
-	typeof(fifo + 1) __tmpq = (fifo); \
-	kfifo_len(__tmpq) > __tmpq->kfifo.mask; \
-})
-
-/**
- * kfifo_avail - returns the number of unused elements in the fifo
- * @fifo: address of the fifo to be used
- */
-#define	kfifo_avail(fifo) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmpq = (fifo); \
-	const size_t __recsize = sizeof(*__tmpq->rectype); \
-	unsigned int __avail = kfifo_size(__tmpq) - kfifo_len(__tmpq); \
-	(__recsize) ? ((__avail <= __recsize) ? 0 : \
-	__kfifo_max_r(__avail - __recsize, __recsize)) : \
-	__avail; \
-}) \
-)
-
-/**
- * kfifo_skip - skip output data
- * @fifo: address of the fifo to be used
- */
-#define	kfifo_skip(fifo) \
-(void)({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (__recsize) \
-		__kfifo_skip_r(__kfifo, __recsize); \
-	else \
-		__kfifo->out++; \
-})
-
-/**
- * kfifo_peek_len - gets the size of the next fifo record
- * @fifo: address of the fifo to be used
- *
- * This function returns the size of the next fifo record in number of bytes.
- */
-#define kfifo_peek_len(fifo) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	(!__recsize) ? kfifo_len(__tmp) * sizeof(*__tmp->type) : \
-	__kfifo_len_r(__kfifo, __recsize); \
-}) \
-)
-
-/**
- * kfifo_alloc - dynamically allocates a new fifo buffer
- * @fifo: pointer to the fifo
- * @size: the number of elements in the fifo, this must be a power of 2
- * @gfp_mask: get_free_pages mask, passed to kmalloc()
- *
- * This macro dynamically allocates a new fifo buffer.
- *
- * The numer of elements will be rounded-up to a power of 2.
- * The fifo will be release with kfifo_free().
- * Return 0 if no error, otherwise an error code.
- */
-#define kfifo_alloc(fifo, size, gfp_mask) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	__is_kfifo_ptr(__tmp) ? \
-	__kfifo_alloc(__kfifo, size, sizeof(*__tmp->type), gfp_mask) : \
-	-EINVAL; \
-}) \
-)
-
-/**
- * kfifo_free - frees the fifo
- * @fifo: the fifo to be freed
- */
-#define kfifo_free(fifo) \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (__is_kfifo_ptr(__tmp)) \
-		__kfifo_free(__kfifo); \
-})
-
-/**
- * kfifo_init - initialize a fifo using a preallocated buffer
- * @fifo: the fifo to assign the buffer
- * @buffer: the preallocated buffer to be used
- * @size: the size of the internal buffer, this have to be a power of 2
- *
- * This macro initialize a fifo using a preallocated buffer.
- *
- * The numer of elements will be rounded-up to a power of 2.
- * Return 0 if no error, otherwise an error code.
- */
-#define kfifo_init(fifo, buffer, size) \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	__is_kfifo_ptr(__tmp) ? \
-	__kfifo_init(__kfifo, buffer, size, sizeof(*__tmp->type)) : \
-	-EINVAL; \
-})
-
-/**
- * kfifo_put - put data into the fifo
- * @fifo: address of the fifo to be used
- * @val: the data to be added
- *
- * This macro copies the given value into the fifo.
- * It returns 0 if the fifo was full. Otherwise it returns the number
- * processed elements.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macro.
- */
-#define	kfifo_put(fifo, val) \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	typeof(val + 1) __val = (val); \
-	unsigned int __ret; \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (0) { \
-		typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \
-		__dummy = (typeof(__val))NULL; \
-	} \
-	if (__recsize) \
-		__ret = __kfifo_in_r(__kfifo, __val, sizeof(*__val), \
-			__recsize); \
-	else { \
-		__ret = !kfifo_is_full(__tmp); \
-		if (__ret) { \
-			(__is_kfifo_ptr(__tmp) ? \
-			((typeof(__tmp->type))__kfifo->data) : \
-			(__tmp->buf) \
-			)[__kfifo->in & __tmp->kfifo.mask] = \
-				*(typeof(__tmp->type))__val; \
-			smp_wmb(); \
-			__kfifo->in++; \
-		} \
-	} \
-	__ret; \
-})
-
-/**
- * kfifo_get - get data from the fifo
- * @fifo: address of the fifo to be used
- * @val: the var where to store the data to be added
- *
- * This macro reads the data from the fifo.
- * It returns 0 if the fifo was empty. Otherwise it returns the number
- * processed elements.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macro.
- */
-#define	kfifo_get(fifo, val) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	typeof(val + 1) __val = (val); \
-	unsigned int __ret; \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (0) \
-		__val = (typeof(__tmp->ptr))0; \
-	if (__recsize) \
-		__ret = __kfifo_out_r(__kfifo, __val, sizeof(*__val), \
-			__recsize); \
-	else { \
-		__ret = !kfifo_is_empty(__tmp); \
-		if (__ret) { \
-			*(typeof(__tmp->type))__val = \
-				(__is_kfifo_ptr(__tmp) ? \
-				((typeof(__tmp->type))__kfifo->data) : \
-				(__tmp->buf) \
-				)[__kfifo->out & __tmp->kfifo.mask]; \
-			smp_wmb(); \
-			__kfifo->out++; \
-		} \
-	} \
-	__ret; \
-}) \
-)
-
-/**
- * kfifo_peek - get data from the fifo without removing
- * @fifo: address of the fifo to be used
- * @val: the var where to store the data to be added
- *
- * This reads the data from the fifo without removing it from the fifo.
- * It returns 0 if the fifo was empty. Otherwise it returns the number
- * processed elements.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macro.
- */
-#define	kfifo_peek(fifo, val) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	typeof(val + 1) __val = (val); \
-	unsigned int __ret; \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (0) \
-		__val = (typeof(__tmp->ptr))NULL; \
-	if (__recsize) \
-		__ret = __kfifo_out_peek_r(__kfifo, __val, sizeof(*__val), \
-			__recsize); \
-	else { \
-		__ret = !kfifo_is_empty(__tmp); \
-		if (__ret) { \
-			*(typeof(__tmp->type))__val = \
-				(__is_kfifo_ptr(__tmp) ? \
-				((typeof(__tmp->type))__kfifo->data) : \
-				(__tmp->buf) \
-				)[__kfifo->out & __tmp->kfifo.mask]; \
-			smp_wmb(); \
-		} \
-	} \
-	__ret; \
-}) \
-)
-
-/**
- * kfifo_in - put data into the fifo
- * @fifo: address of the fifo to be used
- * @buf: the data to be added
- * @n: number of elements to be added
- *
- * This macro copies the given buffer into the fifo and returns the
- * number of copied elements.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macro.
- */
-#define	kfifo_in(fifo, buf, n) \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	typeof(buf + 1) __buf = (buf); \
-	unsigned long __n = (n); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (0) { \
-		typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \
-		__dummy = (typeof(__buf))NULL; \
-	} \
-	(__recsize) ?\
-	__kfifo_in_r(__kfifo, __buf, __n, __recsize) : \
-	__kfifo_in(__kfifo, __buf, __n); \
-})
-
-/**
- * kfifo_in_spinlocked - put data into the fifo using a spinlock for locking
- * @fifo: address of the fifo to be used
- * @buf: the data to be added
- * @n: number of elements to be added
- * @lock: pointer to the spinlock to use for locking
- *
- * This macro copies the given values buffer into the fifo and returns the
- * number of copied elements.
- */
-#define	kfifo_in_spinlocked(fifo, buf, n, lock) \
-({ \
-	unsigned long __flags; \
-	unsigned int __ret; \
-	spin_lock_irqsave(lock, __flags); \
-	__ret = kfifo_in(fifo, buf, n); \
-	spin_unlock_irqrestore(lock, __flags); \
-	__ret; \
-})
-
-/* alias for kfifo_in_spinlocked, will be removed in a future release */
-#define kfifo_in_locked(fifo, buf, n, lock) \
-		kfifo_in_spinlocked(fifo, buf, n, lock)
-
-/**
- * kfifo_out - get data from the fifo
- * @fifo: address of the fifo to be used
- * @buf: pointer to the storage buffer
- * @n: max. number of elements to get
- *
- * This macro get some data from the fifo and return the numbers of elements
- * copied.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macro.
- */
-#define	kfifo_out(fifo, buf, n) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	typeof(buf + 1) __buf = (buf); \
-	unsigned long __n = (n); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (0) { \
-		typeof(__tmp->ptr) __dummy = NULL; \
-		__buf = __dummy; \
-	} \
-	(__recsize) ?\
-	__kfifo_out_r(__kfifo, __buf, __n, __recsize) : \
-	__kfifo_out(__kfifo, __buf, __n); \
-}) \
-)
-
-/**
- * kfifo_out_spinlocked - get data from the fifo using a spinlock for locking
- * @fifo: address of the fifo to be used
- * @buf: pointer to the storage buffer
- * @n: max. number of elements to get
- * @lock: pointer to the spinlock to use for locking
- *
- * This macro get the data from the fifo and return the numbers of elements
- * copied.
- */
-#define	kfifo_out_spinlocked(fifo, buf, n, lock) \
-__kfifo_must_check_helper( \
-({ \
-	unsigned long __flags; \
-	unsigned int __ret; \
-	spin_lock_irqsave(lock, __flags); \
-	__ret = kfifo_out(fifo, buf, n); \
-	spin_unlock_irqrestore(lock, __flags); \
-	__ret; \
-}) \
-)
-
-/* alias for kfifo_out_spinlocked, will be removed in a future release */
-#define kfifo_out_locked(fifo, buf, n, lock) \
-		kfifo_out_spinlocked(fifo, buf, n, lock)
-
-/**
- * kfifo_from_user - puts some data from user space into the fifo
- * @fifo: address of the fifo to be used
- * @from: pointer to the data to be added
- * @len: the length of the data to be added
- * @copied: pointer to output variable to store the number of copied bytes
- *
- * This macro copies at most @len bytes from the @from into the
- * fifo, depending of the available space and returns -EFAULT/0.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macro.
- */
-#define	kfifo_from_user(fifo, from, len, copied) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	const void __user *__from = (from); \
-	unsigned int __len = (len); \
-	unsigned int *__copied = (copied); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	(__recsize) ? \
-	__kfifo_from_user_r(__kfifo, __from, __len,  __copied, __recsize) : \
-	__kfifo_from_user(__kfifo, __from, __len, __copied); \
-}) \
-)
-
-/**
- * kfifo_to_user - copies data from the fifo into user space
- * @fifo: address of the fifo to be used
- * @to: where the data must be copied
- * @len: the size of the destination buffer
- * @copied: pointer to output variable to store the number of copied bytes
- *
- * This macro copies at most @len bytes from the fifo into the
- * @to buffer and returns -EFAULT/0.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macro.
- */
-#define	kfifo_to_user(fifo, to, len, copied) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	void __user *__to = (to); \
-	unsigned int __len = (len); \
-	unsigned int *__copied = (copied); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	(__recsize) ? \
-	__kfifo_to_user_r(__kfifo, __to, __len, __copied, __recsize) : \
-	__kfifo_to_user(__kfifo, __to, __len, __copied); \
-}) \
-)
-
-/**
- * kfifo_dma_in_prepare - setup a scatterlist for DMA input
- * @fifo: address of the fifo to be used
- * @sgl: pointer to the scatterlist array
- * @nents: number of entries in the scatterlist array
- * @len: number of elements to transfer
- *
- * This macro fills a scatterlist for DMA input.
- * It returns the number entries in the scatterlist array.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macros.
- */
-#define	kfifo_dma_in_prepare(fifo, sgl, nents, len) \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	struct scatterlist *__sgl = (sgl); \
-	int __nents = (nents); \
-	unsigned int __len = (len); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	(__recsize) ? \
-	__kfifo_dma_in_prepare_r(__kfifo, __sgl, __nents, __len, __recsize) : \
-	__kfifo_dma_in_prepare(__kfifo, __sgl, __nents, __len); \
-})
-
-/**
- * kfifo_dma_in_finish - finish a DMA IN operation
- * @fifo: address of the fifo to be used
- * @len: number of bytes to received
- *
- * This macro finish a DMA IN operation. The in counter will be updated by
- * the len parameter. No error checking will be done.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macros.
- */
-#define kfifo_dma_in_finish(fifo, len) \
-(void)({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	unsigned int __len = (len); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (__recsize) \
-		__kfifo_dma_in_finish_r(__kfifo, __len, __recsize); \
-	else \
-		__kfifo->in += __len / sizeof(*__tmp->type); \
-})
-
-/**
- * kfifo_dma_out_prepare - setup a scatterlist for DMA output
- * @fifo: address of the fifo to be used
- * @sgl: pointer to the scatterlist array
- * @nents: number of entries in the scatterlist array
- * @len: number of elements to transfer
- *
- * This macro fills a scatterlist for DMA output which at most @len bytes
- * to transfer.
- * It returns the number entries in the scatterlist array.
- * A zero means there is no space available and the scatterlist is not filled.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macros.
- */
-#define	kfifo_dma_out_prepare(fifo, sgl, nents, len) \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	struct scatterlist *__sgl = (sgl); \
-	int __nents = (nents); \
-	unsigned int __len = (len); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	(__recsize) ? \
-	__kfifo_dma_out_prepare_r(__kfifo, __sgl, __nents, __len, __recsize) : \
-	__kfifo_dma_out_prepare(__kfifo, __sgl, __nents, __len); \
-})
-
-/**
- * kfifo_dma_out_finish - finish a DMA OUT operation
- * @fifo: address of the fifo to be used
- * @len: number of bytes transferd
- *
- * This macro finish a DMA OUT operation. The out counter will be updated by
- * the len parameter. No error checking will be done.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macros.
- */
-#define kfifo_dma_out_finish(fifo, len) \
-(void)({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	unsigned int __len = (len); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (__recsize) \
-		__kfifo_dma_out_finish_r(__kfifo, __recsize); \
-	else \
-		__kfifo->out += __len / sizeof(*__tmp->type); \
-})
-
-/**
- * kfifo_out_peek - gets some data from the fifo
- * @fifo: address of the fifo to be used
- * @buf: pointer to the storage buffer
- * @n: max. number of elements to get
- *
- * This macro get the data from the fifo and return the numbers of elements
- * copied. The data is not removed from the fifo.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these macro.
- */
-#define	kfifo_out_peek(fifo, buf, n) \
-__kfifo_must_check_helper( \
-({ \
-	typeof(fifo + 1) __tmp = (fifo); \
-	typeof(buf + 1) __buf = (buf); \
-	unsigned long __n = (n); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (0) { \
-		typeof(__tmp->ptr) __dummy __attribute__ ((unused)) = NULL; \
-		__buf = __dummy; \
-	} \
-	(__recsize) ? \
-	__kfifo_out_peek_r(__kfifo, __buf, __n, __recsize) : \
-	__kfifo_out_peek(__kfifo, __buf, __n); \
-}) \
-)
-
-extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
-	size_t esize, gfp_t gfp_mask);
-
-extern void __kfifo_free(struct __kfifo *fifo);
-
-extern int __kfifo_init(struct __kfifo *fifo, void *buffer,
-	unsigned int size, size_t esize);
-
-extern unsigned int __kfifo_in(struct __kfifo *fifo,
-	const void *buf, unsigned int len);
-
-extern unsigned int __kfifo_out(struct __kfifo *fifo,
-	void *buf, unsigned int len);
-
-extern int __kfifo_from_user(struct __kfifo *fifo,
-	const void __user *from, unsigned long len, unsigned int *copied);
-
-extern int __kfifo_to_user(struct __kfifo *fifo,
-	void __user *to, unsigned long len, unsigned int *copied);
-
-extern unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len);
-
-extern unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len);
-
-extern unsigned int __kfifo_out_peek(struct __kfifo *fifo,
-	void *buf, unsigned int len);
-
-extern unsigned int __kfifo_in_r(struct __kfifo *fifo,
-	const void *buf, unsigned int len, size_t recsize);
-
-extern unsigned int __kfifo_out_r(struct __kfifo *fifo,
-	void *buf, unsigned int len, size_t recsize);
-
-extern int __kfifo_from_user_r(struct __kfifo *fifo,
-	const void __user *from, unsigned long len, unsigned int *copied,
-	size_t recsize);
-
-extern int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
-	unsigned long len, unsigned int *copied, size_t recsize);
-
-extern unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
-
-extern void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
-	unsigned int len, size_t recsize);
-
-extern unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
-
-extern void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize);
-
-extern unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize);
-
-extern unsigned int __kfifo_out_peek_r(struct __kfifo *fifo,
-	void *buf, unsigned int len, size_t recsize);
-
-extern unsigned int __kfifo_max_r(unsigned int len, size_t recsize);
-
-#endif
diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 57c4eedf4dd6..311f8753d713 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -1,8 +1,7 @@
 /*
- * A generic kernel FIFO implementation.
+ * A generic kernel FIFO implementation
  *
- * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
- * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
+ * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -20,8 +19,11 @@
  *
  */
 
+#ifndef _LINUX_KFIFO_H
+#define _LINUX_KFIFO_H
+
 /*
- * Howto porting drivers to the new generic fifo API:
+ * How to porting drivers to the new generic FIFO API:
  *
  * - Modify the declaration of the "struct kfifo *" object into a
  *   in-place "struct kfifo" object
@@ -30,586 +32,813 @@
  *   passed as the first argument to this functions
  * - Replace the use of __kfifo_put into kfifo_in and __kfifo_get
  *   into kfifo_out
- * - Replace the use of kfifo_put into kfifo_in_locked and kfifo_get
- *   into kfifo_out_locked
+ * - Replace the use of kfifo_put into kfifo_in_spinlocked and kfifo_get
+ *   into kfifo_out_spinlocked
  *   Note: the spinlock pointer formerly passed to kfifo_init/kfifo_alloc
- *   must be passed now to the kfifo_in_locked and kfifo_out_locked
- *   as the last parameter.
- * - All formerly name __kfifo_* functions has been renamed into kfifo_*
+ *   must be passed now to the kfifo_in_spinlocked and kfifo_out_spinlocked
+ *   as the last parameter
+ * - The formerly __kfifo_* functions are renamed into kfifo_*
  */
 
-#ifndef _LINUX_KFIFO_H
-#define _LINUX_KFIFO_H
+/*
+ * Note about locking : There is no locking required until only * one reader
+ * and one writer is using the fifo and no kfifo_reset() will be * called
+ *  kfifo_reset_out() can be safely used, until it will be only called
+ * in the reader thread.
+ *  For multiple writer and one reader there is only a need to lock the writer.
+ * And vice versa for only one writer and multiple reader there is only a need
+ * to lock the reader.
+ */
 
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
-
-struct kfifo {
-	unsigned char *buffer;	/* the buffer holding the data */
-	unsigned int size;	/* the size of the allocated buffer */
-	unsigned int in;	/* data is added at offset (in % size) */
-	unsigned int out;	/* data is extracted from off. (out % size) */
+#include <linux/stddef.h>
+#include <linux/scatterlist.h>
+
+struct __kfifo {
+	unsigned int	in;
+	unsigned int	out;
+	unsigned int	mask;
+	unsigned int	esize;
+	void		*data;
 };
 
-/*
- * Macros for declaration and initialization of the kfifo datatype
- */
-
-/* helper macro */
-#define __kfifo_initializer(s, b) \
-	(struct kfifo) { \
-		.size	= s, \
-		.in	= 0, \
-		.out	= 0, \
-		.buffer = b \
+#define __STRUCT_KFIFO_COMMON(datatype, recsize, ptrtype) \
+	union { \
+		struct __kfifo	kfifo; \
+		datatype	*type; \
+		char		(*rectype)[recsize]; \
+		ptrtype		*ptr; \
+		const ptrtype	*ptr_const; \
 	}
 
-/**
- * DECLARE_KFIFO - macro to declare a kfifo and the associated buffer
- * @name: name of the declared kfifo datatype
- * @size: size of the fifo buffer. Must be a power of two.
- *
- * Note1: the macro can be used inside struct or union declaration
- * Note2: the macro creates two objects:
- *  A kfifo object with the given name and a buffer for the kfifo
- *  object named name##kfifo_buffer
- */
-#define DECLARE_KFIFO(name, size) \
-union { \
-	struct kfifo name; \
-	unsigned char name##kfifo_buffer[size + sizeof(struct kfifo)]; \
+#define __STRUCT_KFIFO(type, size, recsize, ptrtype) \
+{ \
+	__STRUCT_KFIFO_COMMON(type, recsize, ptrtype); \
+	type		buf[((size < 2) || (size & (size - 1))) ? -1 : size]; \
 }
 
-/**
- * INIT_KFIFO - Initialize a kfifo declared by DECLARE_KFIFO
- * @name: name of the declared kfifo datatype
+#define STRUCT_KFIFO(type, size) \
+	struct __STRUCT_KFIFO(type, size, 0, type)
+
+#define __STRUCT_KFIFO_PTR(type, recsize, ptrtype) \
+{ \
+	__STRUCT_KFIFO_COMMON(type, recsize, ptrtype); \
+	type		buf[0]; \
+}
+
+#define STRUCT_KFIFO_PTR(type) \
+	struct __STRUCT_KFIFO_PTR(type, 0, type)
+
+/*
+ * define compatibility "struct kfifo" for dynamic allocated fifos
  */
-#define INIT_KFIFO(name) \
-	name = __kfifo_initializer(sizeof(name##kfifo_buffer) - \
-				sizeof(struct kfifo), \
-				name##kfifo_buffer + sizeof(struct kfifo))
+struct kfifo __STRUCT_KFIFO_PTR(unsigned char, 0, void);
 
-/**
- * DEFINE_KFIFO - macro to define and initialize a kfifo
- * @name: name of the declared kfifo datatype
- * @size: size of the fifo buffer. Must be a power of two.
- *
- * Note1: the macro can be used for global and local kfifo data type variables
- * Note2: the macro creates two objects:
- *  A kfifo object with the given name and a buffer for the kfifo
- *  object named name##kfifo_buffer
+#define STRUCT_KFIFO_REC_1(size) \
+	struct __STRUCT_KFIFO(unsigned char, size, 1, void)
+
+#define STRUCT_KFIFO_REC_2(size) \
+	struct __STRUCT_KFIFO(unsigned char, size, 2, void)
+
+/*
+ * define kfifo_rec types
  */
-#define DEFINE_KFIFO(name, size) \
-	unsigned char name##kfifo_buffer[size]; \
-	struct kfifo name = __kfifo_initializer(size, name##kfifo_buffer)
+struct kfifo_rec_ptr_1 __STRUCT_KFIFO_PTR(unsigned char, 1, void);
+struct kfifo_rec_ptr_2 __STRUCT_KFIFO_PTR(unsigned char, 2, void);
 
-extern void kfifo_init(struct kfifo *fifo, void *buffer,
-			unsigned int size);
-extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size,
-			gfp_t gfp_mask);
-extern void kfifo_free(struct kfifo *fifo);
-extern unsigned int kfifo_in(struct kfifo *fifo,
-				const void *from, unsigned int len);
-extern __must_check unsigned int kfifo_out(struct kfifo *fifo,
-				void *to, unsigned int len);
-extern __must_check unsigned int kfifo_out_peek(struct kfifo *fifo,
-				void *to, unsigned int len, unsigned offset);
+/*
+ * helper macro to distinguish between real in place fifo where the fifo
+ * array is a part of the structure and the fifo type where the array is
+ * outside of the fifo structure.
+ */
+#define	__is_kfifo_ptr(fifo)	(sizeof(*fifo) == sizeof(struct __kfifo))
 
 /**
- * kfifo_initialized - Check if kfifo is initialized.
- * @fifo: fifo to check
- * Return %true if FIFO is initialized, otherwise %false.
- * Assumes the fifo was 0 before.
+ * DECLARE_KFIFO_PTR - macro to declare a fifo pointer object
+ * @fifo: name of the declared fifo
+ * @type: type of the fifo elements
  */
-static inline bool kfifo_initialized(struct kfifo *fifo)
-{
-	return fifo->buffer != NULL;
-}
+#define DECLARE_KFIFO_PTR(fifo, type)	STRUCT_KFIFO_PTR(type) fifo
 
 /**
- * kfifo_reset - removes the entire FIFO contents
- * @fifo: the fifo to be emptied.
+ * DECLARE_KFIFO - macro to declare a fifo object
+ * @fifo: name of the declared fifo
+ * @type: type of the fifo elements
+ * @size: the number of elements in the fifo, this must be a power of 2
  */
-static inline void kfifo_reset(struct kfifo *fifo)
-{
-	fifo->in = fifo->out = 0;
-}
+#define DECLARE_KFIFO(fifo, type, size)	STRUCT_KFIFO(type, size) fifo
 
 /**
- * kfifo_reset_out - skip FIFO contents
- * @fifo: the fifo to be emptied.
- */
-static inline void kfifo_reset_out(struct kfifo *fifo)
-{
-	smp_mb();
-	fifo->out = fifo->in;
-}
+ * INIT_KFIFO - Initialize a fifo declared by DECLARE_KFIFO
+ * @fifo: name of the declared fifo datatype
+ */
+#define INIT_KFIFO(fifo) \
+(void)({ \
+	typeof(&(fifo)) __tmp = &(fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__kfifo->in = 0; \
+	__kfifo->out = 0; \
+	__kfifo->mask = __is_kfifo_ptr(__tmp) ? 0 : ARRAY_SIZE(__tmp->buf) - 1;\
+	__kfifo->esize = sizeof(*__tmp->buf); \
+	__kfifo->data = __is_kfifo_ptr(__tmp) ?  NULL : __tmp->buf; \
+})
 
 /**
- * kfifo_size - returns the size of the fifo in bytes
- * @fifo: the fifo to be used.
- */
-static inline __must_check unsigned int kfifo_size(struct kfifo *fifo)
+ * DEFINE_KFIFO - macro to define and initialize a fifo
+ * @fifo: name of the declared fifo datatype
+ * @type: type of the fifo elements
+ * @size: the number of elements in the fifo, this must be a power of 2
+ *
+ * Note: the macro can be used for global and local fifo data type variables.
+ */
+#define DEFINE_KFIFO(fifo, type, size) \
+	DECLARE_KFIFO(fifo, type, size) = \
+	(typeof(fifo)) { \
+		{ \
+			{ \
+			.in	= 0, \
+			.out	= 0, \
+			.mask	= __is_kfifo_ptr(&(fifo)) ? \
+				  0 : \
+				  ARRAY_SIZE((fifo).buf) - 1, \
+			.esize	= sizeof(*(fifo).buf), \
+			.data	= __is_kfifo_ptr(&(fifo)) ? \
+				NULL : \
+				(fifo).buf, \
+			} \
+		} \
+	}
+
+
+static inline unsigned int __must_check
+__kfifo_must_check_helper(unsigned int val)
 {
-	return fifo->size;
+	return val;
 }
 
 /**
- * kfifo_len - returns the number of used bytes in the FIFO
- * @fifo: the fifo to be used.
+ * kfifo_initialized - Check if the fifo is initialized
+ * @fifo: address of the fifo to check
+ *
+ * Return %true if fifo is initialized, otherwise %false.
+ * Assumes the fifo was 0 before.
  */
-static inline unsigned int kfifo_len(struct kfifo *fifo)
-{
-	register unsigned int	out;
-
-	out = fifo->out;
-	smp_rmb();
-	return fifo->in - out;
-}
+#define kfifo_initialized(fifo) ((fifo)->kfifo.mask)
 
 /**
- * kfifo_is_empty - returns true if the fifo is empty
- * @fifo: the fifo to be used.
+ * kfifo_esize - returns the size of the element managed by the fifo
+ * @fifo: address of the fifo to be used
  */
-static inline __must_check bool kfifo_is_empty(struct kfifo *fifo)
-{
-	return fifo->in == fifo->out;
-}
+#define kfifo_esize(fifo)	((fifo)->kfifo.esize)
 
 /**
- * kfifo_is_full - returns true if the fifo is full
- * @fifo: the fifo to be used.
+ * kfifo_recsize - returns the size of the record length field
+ * @fifo: address of the fifo to be used
  */
-static inline __must_check bool kfifo_is_full(struct kfifo *fifo)
-{
-	return kfifo_len(fifo) == kfifo_size(fifo);
-}
+#define kfifo_recsize(fifo)	(sizeof(*(fifo)->rectype))
 
 /**
- * kfifo_avail - returns the number of bytes available in the FIFO
- * @fifo: the fifo to be used.
+ * kfifo_size - returns the size of the fifo in elements
+ * @fifo: address of the fifo to be used
  */
-static inline __must_check unsigned int kfifo_avail(struct kfifo *fifo)
-{
-	return kfifo_size(fifo) - kfifo_len(fifo);
-}
+#define kfifo_size(fifo)	((fifo)->kfifo.mask + 1)
 
 /**
- * kfifo_in_locked - puts some data into the FIFO using a spinlock for locking
- * @fifo: the fifo to be used.
- * @from: the data to be added.
- * @n: the length of the data to be added.
- * @lock: pointer to the spinlock to use for locking.
+ * kfifo_reset - removes the entire fifo content
+ * @fifo: address of the fifo to be used
  *
- * This function copies at most @n bytes from the @from buffer into
- * the FIFO depending on the free space, and returns the number of
- * bytes copied.
+ * Note: usage of kfifo_reset() is dangerous. It should be only called when the
+ * fifo is exclusived locked or when it is secured that no other thread is
+ * accessing the fifo.
  */
-static inline unsigned int kfifo_in_locked(struct kfifo *fifo,
-		const void *from, unsigned int n, spinlock_t *lock)
-{
-	unsigned long flags;
-	unsigned int ret;
-
-	spin_lock_irqsave(lock, flags);
-
-	ret = kfifo_in(fifo, from, n);
-
-	spin_unlock_irqrestore(lock, flags);
-
-	return ret;
-}
+#define kfifo_reset(fifo) \
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	__tmp->kfifo.in = __tmp->kfifo.out = 0; \
+})
 
 /**
- * kfifo_out_locked - gets some data from the FIFO using a spinlock for locking
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @n: the size of the destination buffer.
- * @lock: pointer to the spinlock to use for locking.
+ * kfifo_reset_out - skip fifo content
+ * @fifo: address of the fifo to be used
  *
- * This function copies at most @n bytes from the FIFO into the
- * @to buffer and returns the number of copied bytes.
- */
-static inline __must_check unsigned int kfifo_out_locked(struct kfifo *fifo,
-	void *to, unsigned int n, spinlock_t *lock)
-{
-	unsigned long flags;
-	unsigned int ret;
-
-	spin_lock_irqsave(lock, flags);
-
-	ret = kfifo_out(fifo, to, n);
-
-	spin_unlock_irqrestore(lock, flags);
-
-	return ret;
-}
-
-extern void kfifo_skip(struct kfifo *fifo, unsigned int len);
-
-extern __must_check int kfifo_from_user(struct kfifo *fifo,
-	const void __user *from, unsigned int n, unsigned *lenout);
-
-extern __must_check int kfifo_to_user(struct kfifo *fifo,
-	void __user *to, unsigned int n, unsigned *lenout);
-
-/*
- * __kfifo_add_out internal helper function for updating the out offset
+ * Note: The usage of kfifo_reset_out() is safe until it will be only called
+ * from the reader thread and there is only one concurrent reader. Otherwise
+ * it is dangerous and must be handled in the same way as kfifo_reset().
  */
-static inline void __kfifo_add_out(struct kfifo *fifo,
-				unsigned int off)
-{
-	smp_mb();
-	fifo->out += off;
-}
+#define kfifo_reset_out(fifo)	\
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	__tmp->kfifo.out = __tmp->kfifo.in; \
+})
 
-/*
- * __kfifo_add_in internal helper function for updating the in offset
+/**
+ * kfifo_len - returns the number of used elements in the fifo
+ * @fifo: address of the fifo to be used
  */
-static inline void __kfifo_add_in(struct kfifo *fifo,
-				unsigned int off)
-{
-	smp_wmb();
-	fifo->in += off;
-}
+#define kfifo_len(fifo) \
+({ \
+	typeof(fifo + 1) __tmpl = (fifo); \
+	__tmpl->kfifo.in - __tmpl->kfifo.out; \
+})
 
-/*
- * __kfifo_off internal helper function for calculating the index of a
- * given offeset
+/**
+ * kfifo_is_empty - returns true if the fifo is empty
+ * @fifo: address of the fifo to be used
  */
-static inline unsigned int __kfifo_off(struct kfifo *fifo, unsigned int off)
-{
-	return off & (fifo->size - 1);
-}
+#define	kfifo_is_empty(fifo) \
+({ \
+	typeof(fifo + 1) __tmpq = (fifo); \
+	__tmpq->kfifo.in == __tmpq->kfifo.out; \
+})
 
-/*
- * __kfifo_peek_n internal helper function for determinate the length of
- * the next record in the fifo
+/**
+ * kfifo_is_full - returns true if the fifo is full
+ * @fifo: address of the fifo to be used
  */
-static inline unsigned int __kfifo_peek_n(struct kfifo *fifo,
-				unsigned int recsize)
-{
-#define __KFIFO_GET(fifo, off, shift) \
-	((fifo)->buffer[__kfifo_off((fifo), (fifo)->out+(off))] << (shift))
+#define	kfifo_is_full(fifo) \
+({ \
+	typeof(fifo + 1) __tmpq = (fifo); \
+	kfifo_len(__tmpq) > __tmpq->kfifo.mask; \
+})
 
-	unsigned int l;
+/**
+ * kfifo_avail - returns the number of unused elements in the fifo
+ * @fifo: address of the fifo to be used
+ */
+#define	kfifo_avail(fifo) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmpq = (fifo); \
+	const size_t __recsize = sizeof(*__tmpq->rectype); \
+	unsigned int __avail = kfifo_size(__tmpq) - kfifo_len(__tmpq); \
+	(__recsize) ? ((__avail <= __recsize) ? 0 : \
+	__kfifo_max_r(__avail - __recsize, __recsize)) : \
+	__avail; \
+}) \
+)
 
-	l = __KFIFO_GET(fifo, 0, 0);
+/**
+ * kfifo_skip - skip output data
+ * @fifo: address of the fifo to be used
+ */
+#define	kfifo_skip(fifo) \
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (__recsize) \
+		__kfifo_skip_r(__kfifo, __recsize); \
+	else \
+		__kfifo->out++; \
+})
 
-	if (--recsize)
-		l |= __KFIFO_GET(fifo, 1, 8);
+/**
+ * kfifo_peek_len - gets the size of the next fifo record
+ * @fifo: address of the fifo to be used
+ *
+ * This function returns the size of the next fifo record in number of bytes.
+ */
+#define kfifo_peek_len(fifo) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(!__recsize) ? kfifo_len(__tmp) * sizeof(*__tmp->type) : \
+	__kfifo_len_r(__kfifo, __recsize); \
+}) \
+)
 
-	return l;
-#undef	__KFIFO_GET
-}
+/**
+ * kfifo_alloc - dynamically allocates a new fifo buffer
+ * @fifo: pointer to the fifo
+ * @size: the number of elements in the fifo, this must be a power of 2
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ *
+ * This macro dynamically allocates a new fifo buffer.
+ *
+ * The numer of elements will be rounded-up to a power of 2.
+ * The fifo will be release with kfifo_free().
+ * Return 0 if no error, otherwise an error code.
+ */
+#define kfifo_alloc(fifo, size, gfp_mask) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__is_kfifo_ptr(__tmp) ? \
+	__kfifo_alloc(__kfifo, size, sizeof(*__tmp->type), gfp_mask) : \
+	-EINVAL; \
+}) \
+)
 
-/*
- * __kfifo_poke_n internal helper function for storing the length of
- * the next record into the fifo
+/**
+ * kfifo_free - frees the fifo
+ * @fifo: the fifo to be freed
  */
-static inline void __kfifo_poke_n(struct kfifo *fifo,
-			unsigned int recsize, unsigned int n)
-{
-#define __KFIFO_PUT(fifo, off, val, shift) \
-		( \
-		(fifo)->buffer[__kfifo_off((fifo), (fifo)->in+(off))] = \
-		(unsigned char)((val) >> (shift)) \
-		)
+#define kfifo_free(fifo) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (__is_kfifo_ptr(__tmp)) \
+		__kfifo_free(__kfifo); \
+})
 
-	__KFIFO_PUT(fifo, 0, n, 0);
+/**
+ * kfifo_init - initialize a fifo using a preallocated buffer
+ * @fifo: the fifo to assign the buffer
+ * @buffer: the preallocated buffer to be used
+ * @size: the size of the internal buffer, this have to be a power of 2
+ *
+ * This macro initialize a fifo using a preallocated buffer.
+ *
+ * The numer of elements will be rounded-up to a power of 2.
+ * Return 0 if no error, otherwise an error code.
+ */
+#define kfifo_init(fifo, buffer, size) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__is_kfifo_ptr(__tmp) ? \
+	__kfifo_init(__kfifo, buffer, size, sizeof(*__tmp->type)) : \
+	-EINVAL; \
+})
 
-	if (--recsize)
-		__KFIFO_PUT(fifo, 1, n, 8);
-#undef	__KFIFO_PUT
-}
+/**
+ * kfifo_put - put data into the fifo
+ * @fifo: address of the fifo to be used
+ * @val: the data to be added
+ *
+ * This macro copies the given value into the fifo.
+ * It returns 0 if the fifo was full. Otherwise it returns the number
+ * processed elements.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_put(fifo, val) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(val + 1) __val = (val); \
+	unsigned int __ret; \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) { \
+		typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \
+		__dummy = (typeof(__val))NULL; \
+	} \
+	if (__recsize) \
+		__ret = __kfifo_in_r(__kfifo, __val, sizeof(*__val), \
+			__recsize); \
+	else { \
+		__ret = !kfifo_is_full(__tmp); \
+		if (__ret) { \
+			(__is_kfifo_ptr(__tmp) ? \
+			((typeof(__tmp->type))__kfifo->data) : \
+			(__tmp->buf) \
+			)[__kfifo->in & __tmp->kfifo.mask] = \
+				*(typeof(__tmp->type))__val; \
+			smp_wmb(); \
+			__kfifo->in++; \
+		} \
+	} \
+	__ret; \
+})
 
-/*
- * __kfifo_in_... internal functions for put date into the fifo
- * do not call it directly, use kfifo_in_rec() instead
- */
-extern unsigned int __kfifo_in_n(struct kfifo *fifo,
-	const void *from, unsigned int n, unsigned int recsize);
+/**
+ * kfifo_get - get data from the fifo
+ * @fifo: address of the fifo to be used
+ * @val: the var where to store the data to be added
+ *
+ * This macro reads the data from the fifo.
+ * It returns 0 if the fifo was empty. Otherwise it returns the number
+ * processed elements.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_get(fifo, val) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(val + 1) __val = (val); \
+	unsigned int __ret; \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) \
+		__val = (typeof(__tmp->ptr))0; \
+	if (__recsize) \
+		__ret = __kfifo_out_r(__kfifo, __val, sizeof(*__val), \
+			__recsize); \
+	else { \
+		__ret = !kfifo_is_empty(__tmp); \
+		if (__ret) { \
+			*(typeof(__tmp->type))__val = \
+				(__is_kfifo_ptr(__tmp) ? \
+				((typeof(__tmp->type))__kfifo->data) : \
+				(__tmp->buf) \
+				)[__kfifo->out & __tmp->kfifo.mask]; \
+			smp_wmb(); \
+			__kfifo->out++; \
+		} \
+	} \
+	__ret; \
+}) \
+)
 
-extern unsigned int __kfifo_in_generic(struct kfifo *fifo,
-	const void *from, unsigned int n, unsigned int recsize);
+/**
+ * kfifo_peek - get data from the fifo without removing
+ * @fifo: address of the fifo to be used
+ * @val: the var where to store the data to be added
+ *
+ * This reads the data from the fifo without removing it from the fifo.
+ * It returns 0 if the fifo was empty. Otherwise it returns the number
+ * processed elements.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_peek(fifo, val) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(val + 1) __val = (val); \
+	unsigned int __ret; \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) \
+		__val = (typeof(__tmp->ptr))NULL; \
+	if (__recsize) \
+		__ret = __kfifo_out_peek_r(__kfifo, __val, sizeof(*__val), \
+			__recsize); \
+	else { \
+		__ret = !kfifo_is_empty(__tmp); \
+		if (__ret) { \
+			*(typeof(__tmp->type))__val = \
+				(__is_kfifo_ptr(__tmp) ? \
+				((typeof(__tmp->type))__kfifo->data) : \
+				(__tmp->buf) \
+				)[__kfifo->out & __tmp->kfifo.mask]; \
+			smp_wmb(); \
+		} \
+	} \
+	__ret; \
+}) \
+)
 
-static inline unsigned int __kfifo_in_rec(struct kfifo *fifo,
-	const void *from, unsigned int n, unsigned int recsize)
-{
-	unsigned int ret;
+/**
+ * kfifo_in - put data into the fifo
+ * @fifo: address of the fifo to be used
+ * @buf: the data to be added
+ * @n: number of elements to be added
+ *
+ * This macro copies the given buffer into the fifo and returns the
+ * number of copied elements.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_in(fifo, buf, n) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(buf + 1) __buf = (buf); \
+	unsigned long __n = (n); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) { \
+		typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \
+		__dummy = (typeof(__buf))NULL; \
+	} \
+	(__recsize) ?\
+	__kfifo_in_r(__kfifo, __buf, __n, __recsize) : \
+	__kfifo_in(__kfifo, __buf, __n); \
+})
 
-	ret = __kfifo_in_n(fifo, from, n, recsize);
+/**
+ * kfifo_in_spinlocked - put data into the fifo using a spinlock for locking
+ * @fifo: address of the fifo to be used
+ * @buf: the data to be added
+ * @n: number of elements to be added
+ * @lock: pointer to the spinlock to use for locking
+ *
+ * This macro copies the given values buffer into the fifo and returns the
+ * number of copied elements.
+ */
+#define	kfifo_in_spinlocked(fifo, buf, n, lock) \
+({ \
+	unsigned long __flags; \
+	unsigned int __ret; \
+	spin_lock_irqsave(lock, __flags); \
+	__ret = kfifo_in(fifo, buf, n); \
+	spin_unlock_irqrestore(lock, __flags); \
+	__ret; \
+})
+
+/* alias for kfifo_in_spinlocked, will be removed in a future release */
+#define kfifo_in_locked(fifo, buf, n, lock) \
+		kfifo_in_spinlocked(fifo, buf, n, lock)
 
-	if (likely(ret == 0)) {
-		if (recsize)
-			__kfifo_poke_n(fifo, recsize, n);
-		__kfifo_add_in(fifo, n + recsize);
-	}
-	return ret;
-}
+/**
+ * kfifo_out - get data from the fifo
+ * @fifo: address of the fifo to be used
+ * @buf: pointer to the storage buffer
+ * @n: max. number of elements to get
+ *
+ * This macro get some data from the fifo and return the numbers of elements
+ * copied.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_out(fifo, buf, n) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(buf + 1) __buf = (buf); \
+	unsigned long __n = (n); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) { \
+		typeof(__tmp->ptr) __dummy = NULL; \
+		__buf = __dummy; \
+	} \
+	(__recsize) ?\
+	__kfifo_out_r(__kfifo, __buf, __n, __recsize) : \
+	__kfifo_out(__kfifo, __buf, __n); \
+}) \
+)
+
+/**
+ * kfifo_out_spinlocked - get data from the fifo using a spinlock for locking
+ * @fifo: address of the fifo to be used
+ * @buf: pointer to the storage buffer
+ * @n: max. number of elements to get
+ * @lock: pointer to the spinlock to use for locking
+ *
+ * This macro get the data from the fifo and return the numbers of elements
+ * copied.
+ */
+#define	kfifo_out_spinlocked(fifo, buf, n, lock) \
+__kfifo_must_check_helper( \
+({ \
+	unsigned long __flags; \
+	unsigned int __ret; \
+	spin_lock_irqsave(lock, __flags); \
+	__ret = kfifo_out(fifo, buf, n); \
+	spin_unlock_irqrestore(lock, __flags); \
+	__ret; \
+}) \
+)
+
+/* alias for kfifo_out_spinlocked, will be removed in a future release */
+#define kfifo_out_locked(fifo, buf, n, lock) \
+		kfifo_out_spinlocked(fifo, buf, n, lock)
 
 /**
- * kfifo_in_rec - puts some record data into the FIFO
- * @fifo: the fifo to be used.
- * @from: the data to be added.
- * @n: the length of the data to be added.
- * @recsize: size of record field
+ * kfifo_from_user - puts some data from user space into the fifo
+ * @fifo: address of the fifo to be used
+ * @from: pointer to the data to be added
+ * @len: the length of the data to be added
+ * @copied: pointer to output variable to store the number of copied bytes
  *
- * This function copies @n bytes from the @from into the FIFO and returns
- * the number of bytes which cannot be copied.
- * A returned value greater than the @n value means that the record doesn't
- * fit into the buffer.
+ * This macro copies at most @len bytes from the @from into the
+ * fifo, depending of the available space and returns -EFAULT/0.
  *
  * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-static inline __must_check unsigned int kfifo_in_rec(struct kfifo *fifo,
-	void *from, unsigned int n, unsigned int recsize)
-{
-	if (!__builtin_constant_p(recsize))
-		return __kfifo_in_generic(fifo, from, n, recsize);
-	return __kfifo_in_rec(fifo, from, n, recsize);
-}
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_from_user(fifo, from, len, copied) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	const void __user *__from = (from); \
+	unsigned int __len = (len); \
+	unsigned int *__copied = (copied); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_from_user_r(__kfifo, __from, __len,  __copied, __recsize) : \
+	__kfifo_from_user(__kfifo, __from, __len, __copied); \
+}) \
+)
 
-/*
- * __kfifo_out_... internal functions for get date from the fifo
- * do not call it directly, use kfifo_out_rec() instead
- */
-extern unsigned int __kfifo_out_n(struct kfifo *fifo,
-	void *to, unsigned int reclen, unsigned int recsize);
+/**
+ * kfifo_to_user - copies data from the fifo into user space
+ * @fifo: address of the fifo to be used
+ * @to: where the data must be copied
+ * @len: the size of the destination buffer
+ * @copied: pointer to output variable to store the number of copied bytes
+ *
+ * This macro copies at most @len bytes from the fifo into the
+ * @to buffer and returns -EFAULT/0.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define	kfifo_to_user(fifo, to, len, copied) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	void __user *__to = (to); \
+	unsigned int __len = (len); \
+	unsigned int *__copied = (copied); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_to_user_r(__kfifo, __to, __len, __copied, __recsize) : \
+	__kfifo_to_user(__kfifo, __to, __len, __copied); \
+}) \
+)
+
+/**
+ * kfifo_dma_in_prepare - setup a scatterlist for DMA input
+ * @fifo: address of the fifo to be used
+ * @sgl: pointer to the scatterlist array
+ * @nents: number of entries in the scatterlist array
+ * @len: number of elements to transfer
+ *
+ * This macro fills a scatterlist for DMA input.
+ * It returns the number entries in the scatterlist array.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macros.
+ */
+#define	kfifo_dma_in_prepare(fifo, sgl, nents, len) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct scatterlist *__sgl = (sgl); \
+	int __nents = (nents); \
+	unsigned int __len = (len); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_dma_in_prepare_r(__kfifo, __sgl, __nents, __len, __recsize) : \
+	__kfifo_dma_in_prepare(__kfifo, __sgl, __nents, __len); \
+})
 
-extern unsigned int __kfifo_out_generic(struct kfifo *fifo,
-	void *to, unsigned int n,
-	unsigned int recsize, unsigned int *total);
+/**
+ * kfifo_dma_in_finish - finish a DMA IN operation
+ * @fifo: address of the fifo to be used
+ * @len: number of bytes to received
+ *
+ * This macro finish a DMA IN operation. The in counter will be updated by
+ * the len parameter. No error checking will be done.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macros.
+ */
+#define kfifo_dma_in_finish(fifo, len) \
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	unsigned int __len = (len); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (__recsize) \
+		__kfifo_dma_in_finish_r(__kfifo, __len, __recsize); \
+	else \
+		__kfifo->in += __len / sizeof(*__tmp->type); \
+})
 
-static inline unsigned int __kfifo_out_rec(struct kfifo *fifo,
-	void *to, unsigned int n, unsigned int recsize,
-	unsigned int *total)
-{
-	unsigned int l;
-
-	if (!recsize) {
-		l = n;
-		if (total)
-			*total = l;
-	} else {
-		l = __kfifo_peek_n(fifo, recsize);
-		if (total)
-			*total = l;
-		if (n < l)
-			return l;
-	}
+/**
+ * kfifo_dma_out_prepare - setup a scatterlist for DMA output
+ * @fifo: address of the fifo to be used
+ * @sgl: pointer to the scatterlist array
+ * @nents: number of entries in the scatterlist array
+ * @len: number of elements to transfer
+ *
+ * This macro fills a scatterlist for DMA output which at most @len bytes
+ * to transfer.
+ * It returns the number entries in the scatterlist array.
+ * A zero means there is no space available and the scatterlist is not filled.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macros.
+ */
+#define	kfifo_dma_out_prepare(fifo, sgl, nents, len) \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	struct scatterlist *__sgl = (sgl); \
+	int __nents = (nents); \
+	unsigned int __len = (len); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_dma_out_prepare_r(__kfifo, __sgl, __nents, __len, __recsize) : \
+	__kfifo_dma_out_prepare(__kfifo, __sgl, __nents, __len); \
+})
 
-	return __kfifo_out_n(fifo, to, l, recsize);
-}
+/**
+ * kfifo_dma_out_finish - finish a DMA OUT operation
+ * @fifo: address of the fifo to be used
+ * @len: number of bytes transferd
+ *
+ * This macro finish a DMA OUT operation. The out counter will be updated by
+ * the len parameter. No error checking will be done.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macros.
+ */
+#define kfifo_dma_out_finish(fifo, len) \
+(void)({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	unsigned int __len = (len); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (__recsize) \
+		__kfifo_dma_out_finish_r(__kfifo, __recsize); \
+	else \
+		__kfifo->out += __len / sizeof(*__tmp->type); \
+})
 
 /**
- * kfifo_out_rec - gets some record data from the FIFO
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @n: the size of the destination buffer.
- * @recsize: size of record field
- * @total: pointer where the total number of to copied bytes should stored
+ * kfifo_out_peek - gets some data from the fifo
+ * @fifo: address of the fifo to be used
+ * @buf: pointer to the storage buffer
+ * @n: max. number of elements to get
  *
- * This function copies at most @n bytes from the FIFO to @to and returns the
- * number of bytes which cannot be copied.
- * A returned value greater than the @n value means that the record doesn't
- * fit into the @to buffer.
+ * This macro get the data from the fifo and return the numbers of elements
+ * copied. The data is not removed from the fifo.
  *
  * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
+ * writer, you don't need extra locking to use these macro.
  */
-static inline __must_check unsigned int kfifo_out_rec(struct kfifo *fifo,
-	void *to, unsigned int n, unsigned int recsize,
-	unsigned int *total)
+#define	kfifo_out_peek(fifo, buf, n) \
+__kfifo_must_check_helper( \
+({ \
+	typeof(fifo + 1) __tmp = (fifo); \
+	typeof(buf + 1) __buf = (buf); \
+	unsigned long __n = (n); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	if (0) { \
+		typeof(__tmp->ptr) __dummy __attribute__ ((unused)) = NULL; \
+		__buf = __dummy; \
+	} \
+	(__recsize) ? \
+	__kfifo_out_peek_r(__kfifo, __buf, __n, __recsize) : \
+	__kfifo_out_peek(__kfifo, __buf, __n); \
+}) \
+)
 
-{
-	if (!__builtin_constant_p(recsize))
-		return __kfifo_out_generic(fifo, to, n, recsize, total);
-	return __kfifo_out_rec(fifo, to, n, recsize, total);
-}
+extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
+	size_t esize, gfp_t gfp_mask);
 
-/*
- * __kfifo_from_user_... internal functions for transfer from user space into
- * the fifo. do not call it directly, use kfifo_from_user_rec() instead
- */
-extern unsigned int __kfifo_from_user_n(struct kfifo *fifo,
-	const void __user *from, unsigned int n, unsigned int recsize);
+extern void __kfifo_free(struct __kfifo *fifo);
 
-extern unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
-	const void __user *from, unsigned int n, unsigned int recsize);
+extern int __kfifo_init(struct __kfifo *fifo, void *buffer,
+	unsigned int size, size_t esize);
 
-static inline unsigned int __kfifo_from_user_rec(struct kfifo *fifo,
-	const void __user *from, unsigned int n, unsigned int recsize)
-{
-	unsigned int ret;
+extern unsigned int __kfifo_in(struct __kfifo *fifo,
+	const void *buf, unsigned int len);
 
-	ret = __kfifo_from_user_n(fifo, from, n, recsize);
+extern unsigned int __kfifo_out(struct __kfifo *fifo,
+	void *buf, unsigned int len);
 
-	if (likely(ret == 0)) {
-		if (recsize)
-			__kfifo_poke_n(fifo, recsize, n);
-		__kfifo_add_in(fifo, n + recsize);
-	}
-	return ret;
-}
+extern int __kfifo_from_user(struct __kfifo *fifo,
+	const void __user *from, unsigned long len, unsigned int *copied);
 
-/**
- * kfifo_from_user_rec - puts some data from user space into the FIFO
- * @fifo: the fifo to be used.
- * @from: pointer to the data to be added.
- * @n: the length of the data to be added.
- * @recsize: size of record field
- *
- * This function copies @n bytes from the @from into the
- * FIFO and returns the number of bytes which cannot be copied.
- *
- * If the returned value is equal or less the @n value, the copy_from_user()
- * functions has failed. Otherwise the record doesn't fit into the buffer.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-static inline __must_check unsigned int kfifo_from_user_rec(struct kfifo *fifo,
-	const void __user *from, unsigned int n, unsigned int recsize)
-{
-	if (!__builtin_constant_p(recsize))
-		return __kfifo_from_user_generic(fifo, from, n, recsize);
-	return __kfifo_from_user_rec(fifo, from, n, recsize);
-}
+extern int __kfifo_to_user(struct __kfifo *fifo,
+	void __user *to, unsigned long len, unsigned int *copied);
 
-/*
- * __kfifo_to_user_... internal functions for transfer fifo data into user space
- * do not call it directly, use kfifo_to_user_rec() instead
- */
-extern unsigned int __kfifo_to_user_n(struct kfifo *fifo,
-	void __user *to, unsigned int n, unsigned int reclen,
-	unsigned int recsize);
+extern unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len);
 
-extern unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
-	void __user *to, unsigned int n, unsigned int recsize,
-	unsigned int *total);
+extern unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len);
 
-static inline unsigned int __kfifo_to_user_rec(struct kfifo *fifo,
-	void __user *to, unsigned int n,
-	unsigned int recsize, unsigned int *total)
-{
-	unsigned int l;
-
-	if (!recsize) {
-		l = n;
-		if (total)
-			*total = l;
-	} else {
-		l = __kfifo_peek_n(fifo, recsize);
-		if (total)
-			*total = l;
-		if (n < l)
-			return l;
-	}
+extern unsigned int __kfifo_out_peek(struct __kfifo *fifo,
+	void *buf, unsigned int len);
 
-	return __kfifo_to_user_n(fifo, to, n, l, recsize);
-}
+extern unsigned int __kfifo_in_r(struct __kfifo *fifo,
+	const void *buf, unsigned int len, size_t recsize);
 
-/**
- * kfifo_to_user_rec - gets data from the FIFO and write it to user space
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @n: the size of the destination buffer.
- * @recsize: size of record field
- * @total: pointer where the total number of to copied bytes should stored
- *
- * This function copies at most @n bytes from the FIFO to the @to.
- * In case of an error, the function returns the number of bytes which cannot
- * be copied.
- * If the returned value is equal or less the @n value, the copy_to_user()
- * functions has failed. Otherwise the record doesn't fit into the @to buffer.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-static inline __must_check unsigned int kfifo_to_user_rec(struct kfifo *fifo,
-		void __user *to, unsigned int n, unsigned int recsize,
-		unsigned int *total)
-{
-	if (!__builtin_constant_p(recsize))
-		return __kfifo_to_user_generic(fifo, to, n, recsize, total);
-	return __kfifo_to_user_rec(fifo, to, n, recsize, total);
-}
+extern unsigned int __kfifo_out_r(struct __kfifo *fifo,
+	void *buf, unsigned int len, size_t recsize);
 
-/*
- * __kfifo_peek_... internal functions for peek into the next fifo record
- * do not call it directly, use kfifo_peek_rec() instead
- */
-extern unsigned int __kfifo_peek_generic(struct kfifo *fifo,
-				unsigned int recsize);
+extern int __kfifo_from_user_r(struct __kfifo *fifo,
+	const void __user *from, unsigned long len, unsigned int *copied,
+	size_t recsize);
 
-/**
- * kfifo_peek_rec - gets the size of the next FIFO record data
- * @fifo: the fifo to be used.
- * @recsize: size of record field
- *
- * This function returns the size of the next FIFO record in number of bytes
- */
-static inline __must_check unsigned int kfifo_peek_rec(struct kfifo *fifo,
-	unsigned int recsize)
-{
-	if (!__builtin_constant_p(recsize))
-		return __kfifo_peek_generic(fifo, recsize);
-	if (!recsize)
-		return kfifo_len(fifo);
-	return __kfifo_peek_n(fifo, recsize);
-}
+extern int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
+	unsigned long len, unsigned int *copied, size_t recsize);
 
-/*
- * __kfifo_skip_... internal functions for skip the next fifo record
- * do not call it directly, use kfifo_skip_rec() instead
- */
-extern void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize);
+extern unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
 
-static inline void __kfifo_skip_rec(struct kfifo *fifo,
-	unsigned int recsize)
-{
-	unsigned int l;
+extern void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
+	unsigned int len, size_t recsize);
 
-	if (recsize) {
-		l = __kfifo_peek_n(fifo, recsize);
+extern unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
 
-		if (l + recsize <= kfifo_len(fifo)) {
-			__kfifo_add_out(fifo, l + recsize);
-			return;
-		}
-	}
-	kfifo_reset_out(fifo);
-}
+extern void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize);
 
-/**
- * kfifo_skip_rec - skip the next fifo out record
- * @fifo: the fifo to be used.
- * @recsize: size of record field
- *
- * This function skips the next FIFO record
- */
-static inline void kfifo_skip_rec(struct kfifo *fifo,
-	unsigned int recsize)
-{
-	if (!__builtin_constant_p(recsize))
-		__kfifo_skip_generic(fifo, recsize);
-	else
-		__kfifo_skip_rec(fifo, recsize);
-}
+extern unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize);
 
-/**
- * kfifo_avail_rec - returns the number of bytes available in a record FIFO
- * @fifo: the fifo to be used.
- * @recsize: size of record field
- */
-static inline __must_check unsigned int kfifo_avail_rec(struct kfifo *fifo,
-	unsigned int recsize)
-{
-	unsigned int l = kfifo_size(fifo) - kfifo_len(fifo);
+extern unsigned int __kfifo_out_peek_r(struct __kfifo *fifo,
+	void *buf, unsigned int len, size_t recsize);
 
-	return (l > recsize) ? l - recsize : 0;
-}
+extern unsigned int __kfifo_max_r(unsigned int len, size_t recsize);
 
 #endif
diff --git a/kernel/kfifo-new.c b/kernel/kfifo-new.c
deleted file mode 100644
index 02192dd905cc..000000000000
--- a/kernel/kfifo-new.c
+++ /dev/null
@@ -1,602 +0,0 @@
-/*
- * A generic kernel FIFO implementation
- *
- * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/log2.h>
-#include <linux/uaccess.h>
-#include <linux/kfifo.h>
-
-/*
- * internal helper to calculate the unused elements in a fifo
- */
-static inline unsigned int kfifo_unused(struct __kfifo *fifo)
-{
-	return (fifo->mask + 1) - (fifo->in - fifo->out);
-}
-
-int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
-		size_t esize, gfp_t gfp_mask)
-{
-	/*
-	 * round down to the next power of 2, since our 'let the indices
-	 * wrap' technique works only in this case.
-	 */
-	if (!is_power_of_2(size))
-		size = rounddown_pow_of_two(size);
-
-	fifo->in = 0;
-	fifo->out = 0;
-	fifo->esize = esize;
-
-	if (size < 2) {
-		fifo->data = NULL;
-		fifo->mask = 0;
-		return -EINVAL;
-	}
-
-	fifo->data = kmalloc(size * esize, gfp_mask);
-
-	if (!fifo->data) {
-		fifo->mask = 0;
-		return -ENOMEM;
-	}
-	fifo->mask = size - 1;
-
-	return 0;
-}
-EXPORT_SYMBOL(__kfifo_alloc);
-
-void __kfifo_free(struct __kfifo *fifo)
-{
-	kfree(fifo->data);
-	fifo->in = 0;
-	fifo->out = 0;
-	fifo->esize = 0;
-	fifo->data = NULL;
-	fifo->mask = 0;
-}
-EXPORT_SYMBOL(__kfifo_free);
-
-int __kfifo_init(struct __kfifo *fifo, void *buffer,
-		unsigned int size, size_t esize)
-{
-	size /= esize;
-
-	if (!is_power_of_2(size))
-		size = rounddown_pow_of_two(size);
-
-	fifo->in = 0;
-	fifo->out = 0;
-	fifo->esize = esize;
-	fifo->data = buffer;
-
-	if (size < 2) {
-		fifo->mask = 0;
-		return -EINVAL;
-	}
-	fifo->mask = size - 1;
-
-	return 0;
-}
-EXPORT_SYMBOL(__kfifo_init);
-
-static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
-		unsigned int len, unsigned int off)
-{
-	unsigned int size = fifo->mask + 1;
-	unsigned int esize = fifo->esize;
-	unsigned int l;
-
-	off &= fifo->mask;
-	if (esize != 1) {
-		off *= esize;
-		size *= esize;
-		len *= esize;
-	}
-	l = min(len, size - off);
-
-	memcpy(fifo->data + off, src, l);
-	memcpy(fifo->data, src + l, len - l);
-	/*
-	 * make sure that the data in the fifo is up to date before
-	 * incrementing the fifo->in index counter
-	 */
-	smp_wmb();
-}
-
-unsigned int __kfifo_in(struct __kfifo *fifo,
-		const void *buf, unsigned int len)
-{
-	unsigned int l;
-
-	l = kfifo_unused(fifo);
-	if (len > l)
-		len = l;
-
-	kfifo_copy_in(fifo, buf, len, fifo->in);
-	fifo->in += len;
-	return len;
-}
-EXPORT_SYMBOL(__kfifo_in);
-
-static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
-		unsigned int len, unsigned int off)
-{
-	unsigned int size = fifo->mask + 1;
-	unsigned int esize = fifo->esize;
-	unsigned int l;
-
-	off &= fifo->mask;
-	if (esize != 1) {
-		off *= esize;
-		size *= esize;
-		len *= esize;
-	}
-	l = min(len, size - off);
-
-	memcpy(dst, fifo->data + off, l);
-	memcpy(dst + l, fifo->data, len - l);
-	/*
-	 * make sure that the data is copied before
-	 * incrementing the fifo->out index counter
-	 */
-	smp_wmb();
-}
-
-unsigned int __kfifo_out_peek(struct __kfifo *fifo,
-		void *buf, unsigned int len)
-{
-	unsigned int l;
-
-	l = fifo->in - fifo->out;
-	if (len > l)
-		len = l;
-
-	kfifo_copy_out(fifo, buf, len, fifo->out);
-	return len;
-}
-EXPORT_SYMBOL(__kfifo_out_peek);
-
-unsigned int __kfifo_out(struct __kfifo *fifo,
-		void *buf, unsigned int len)
-{
-	len = __kfifo_out_peek(fifo, buf, len);
-	fifo->out += len;
-	return len;
-}
-EXPORT_SYMBOL(__kfifo_out);
-
-static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
-	const void __user *from, unsigned int len, unsigned int off,
-	unsigned int *copied)
-{
-	unsigned int size = fifo->mask + 1;
-	unsigned int esize = fifo->esize;
-	unsigned int l;
-	unsigned long ret;
-
-	off &= fifo->mask;
-	if (esize != 1) {
-		off *= esize;
-		size *= esize;
-		len *= esize;
-	}
-	l = min(len, size - off);
-
-	ret = copy_from_user(fifo->data + off, from, l);
-	if (unlikely(ret))
-		ret = DIV_ROUND_UP(ret + len - l, esize);
-	else {
-		ret = copy_from_user(fifo->data, from + l, len - l);
-		if (unlikely(ret))
-			ret = DIV_ROUND_UP(ret, esize);
-	}
-	/*
-	 * make sure that the data in the fifo is up to date before
-	 * incrementing the fifo->in index counter
-	 */
-	smp_wmb();
-	*copied = len - ret;
-	/* return the number of elements which are not copied */
-	return ret;
-}
-
-int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
-		unsigned long len, unsigned int *copied)
-{
-	unsigned int l;
-	unsigned long ret;
-	unsigned int esize = fifo->esize;
-	int err;
-
-	if (esize != 1)
-		len /= esize;
-
-	l = kfifo_unused(fifo);
-	if (len > l)
-		len = l;
-
-	ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
-	if (unlikely(ret)) {
-		len -= ret;
-		err = -EFAULT;
-	} else
-		err = 0;
-	fifo->in += len;
-	return err;
-}
-EXPORT_SYMBOL(__kfifo_from_user);
-
-static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
-		unsigned int len, unsigned int off, unsigned int *copied)
-{
-	unsigned int l;
-	unsigned long ret;
-	unsigned int size = fifo->mask + 1;
-	unsigned int esize = fifo->esize;
-
-	off &= fifo->mask;
-	if (esize != 1) {
-		off *= esize;
-		size *= esize;
-		len *= esize;
-	}
-	l = min(len, size - off);
-
-	ret = copy_to_user(to, fifo->data + off, l);
-	if (unlikely(ret))
-		ret = DIV_ROUND_UP(ret + len - l, esize);
-	else {
-		ret = copy_to_user(to + l, fifo->data, len - l);
-		if (unlikely(ret))
-			ret = DIV_ROUND_UP(ret, esize);
-	}
-	/*
-	 * make sure that the data is copied before
-	 * incrementing the fifo->out index counter
-	 */
-	smp_wmb();
-	*copied = len - ret;
-	/* return the number of elements which are not copied */
-	return ret;
-}
-
-int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
-		unsigned long len, unsigned int *copied)
-{
-	unsigned int l;
-	unsigned long ret;
-	unsigned int esize = fifo->esize;
-	int err;
-
-	if (esize != 1)
-		len /= esize;
-
-	l = fifo->in - fifo->out;
-	if (len > l)
-		len = l;
-	ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
-	if (unlikely(ret)) {
-		len -= ret;
-		err = -EFAULT;
-	} else
-		err = 0;
-	fifo->out += len;
-	return err;
-}
-EXPORT_SYMBOL(__kfifo_to_user);
-
-static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
-		int nents, unsigned int len)
-{
-	int n;
-	unsigned int l;
-	unsigned int off;
-	struct page *page;
-
-	if (!nents)
-		return 0;
-
-	if (!len)
-		return 0;
-
-	n = 0;
-	page = virt_to_page(buf);
-	off = offset_in_page(buf);
-	l = 0;
-
-	while (len >= l + PAGE_SIZE - off) {
-		struct page *npage;
-
-		l += PAGE_SIZE;
-		buf += PAGE_SIZE;
-		npage = virt_to_page(buf);
-		if (page_to_phys(page) != page_to_phys(npage) - l) {
-			sgl->page_link = 0;
-			sg_set_page(sgl++, page, l - off, off);
-			if (++n == nents)
-				return n;
-			page = npage;
-			len -= l - off;
-			l = off = 0;
-		}
-	}
-	sgl->page_link = 0;
-	sg_set_page(sgl++, page, len, off);
-	return n + 1;
-}
-
-static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
-		int nents, unsigned int len, unsigned int off)
-{
-	unsigned int size = fifo->mask + 1;
-	unsigned int esize = fifo->esize;
-	unsigned int l;
-	unsigned int n;
-
-	off &= fifo->mask;
-	if (esize != 1) {
-		off *= esize;
-		size *= esize;
-		len *= esize;
-	}
-	l = min(len, size - off);
-
-	n  = setup_sgl_buf(sgl, fifo->data + off, nents, l);
-	n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
-
-	if (n)
-		sg_mark_end(sgl + n - 1);
-	return n;
-}
-
-unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
-		struct scatterlist *sgl, int nents, unsigned int len)
-{
-	unsigned int l;
-
-	l = kfifo_unused(fifo);
-	if (len > l)
-		len = l;
-
-	return setup_sgl(fifo, sgl, nents, len, fifo->in);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare);
-
-unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
-		struct scatterlist *sgl, int nents, unsigned int len)
-{
-	unsigned int l;
-
-	l = fifo->in - fifo->out;
-	if (len > l)
-		len = l;
-
-	return setup_sgl(fifo, sgl, nents, len, fifo->out);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare);
-
-unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
-{
-	unsigned int max = (1 << (recsize << 3)) - 1;
-
-	if (len > max)
-		return max;
-	return len;
-}
-
-#define	__KFIFO_PEEK(data, out, mask) \
-	((data)[(out) & (mask)])
-/*
- * __kfifo_peek_n internal helper function for determinate the length of
- * the next record in the fifo
- */
-static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
-{
-	unsigned int l;
-	unsigned int mask = fifo->mask;
-	unsigned char *data = fifo->data;
-
-	l = __KFIFO_PEEK(data, fifo->out, mask);
-
-	if (--recsize)
-		l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
-
-	return l;
-}
-
-#define	__KFIFO_POKE(data, in, mask, val) \
-	( \
-	(data)[(in) & (mask)] = (unsigned char)(val) \
-	)
-
-/*
- * __kfifo_poke_n internal helper function for storeing the length of
- * the record into the fifo
- */
-static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
-{
-	unsigned int mask = fifo->mask;
-	unsigned char *data = fifo->data;
-
-	__KFIFO_POKE(data, fifo->in, mask, n);
-
-	if (recsize > 1)
-		__KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
-}
-
-unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
-{
-	return __kfifo_peek_n(fifo, recsize);
-}
-EXPORT_SYMBOL(__kfifo_len_r);
-
-unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
-		unsigned int len, size_t recsize)
-{
-	if (len + recsize > kfifo_unused(fifo))
-		return 0;
-
-	__kfifo_poke_n(fifo, len, recsize);
-
-	kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
-	fifo->in += len + recsize;
-	return len;
-}
-EXPORT_SYMBOL(__kfifo_in_r);
-
-static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
-	void *buf, unsigned int len, size_t recsize, unsigned int *n)
-{
-	*n = __kfifo_peek_n(fifo, recsize);
-
-	if (len > *n)
-		len = *n;
-
-	kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
-	return len;
-}
-
-unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
-		unsigned int len, size_t recsize)
-{
-	unsigned int n;
-
-	if (fifo->in == fifo->out)
-		return 0;
-
-	return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
-}
-EXPORT_SYMBOL(__kfifo_out_peek_r);
-
-unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
-		unsigned int len, size_t recsize)
-{
-	unsigned int n;
-
-	if (fifo->in == fifo->out)
-		return 0;
-
-	len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
-	fifo->out += n + recsize;
-	return len;
-}
-EXPORT_SYMBOL(__kfifo_out_r);
-
-int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
-	unsigned long len, unsigned int *copied, size_t recsize)
-{
-	unsigned long ret;
-
-	len = __kfifo_max_r(len, recsize);
-
-	if (len + recsize > kfifo_unused(fifo)) {
-		*copied = 0;
-		return 0;
-	}
-
-	__kfifo_poke_n(fifo, len, recsize);
-
-	ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
-	if (unlikely(ret)) {
-		*copied = 0;
-		return -EFAULT;
-	}
-	fifo->in += len + recsize;
-	return 0;
-}
-EXPORT_SYMBOL(__kfifo_from_user_r);
-
-int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
-	unsigned long len, unsigned int *copied, size_t recsize)
-{
-	unsigned long ret;
-	unsigned int n;
-
-	if (fifo->in == fifo->out) {
-		*copied = 0;
-		return 0;
-	}
-
-	n = __kfifo_peek_n(fifo, recsize);
-	if (len > n)
-		len = n;
-
-	ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
-	if (unlikely(ret)) {
-		*copied = 0;
-		return -EFAULT;
-	}
-	fifo->out += n + recsize;
-	return 0;
-}
-EXPORT_SYMBOL(__kfifo_to_user_r);
-
-unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
-	if (!nents)
-		BUG();
-
-	len = __kfifo_max_r(len, recsize);
-
-	if (len + recsize > kfifo_unused(fifo))
-		return 0;
-
-	return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
-
-void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
-	unsigned int len, size_t recsize)
-{
-	len = __kfifo_max_r(len, recsize);
-	__kfifo_poke_n(fifo, len, recsize);
-	fifo->in += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
-
-unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
-	if (!nents)
-		BUG();
-
-	len = __kfifo_max_r(len, recsize);
-
-	if (len + recsize > fifo->in - fifo->out)
-		return 0;
-
-	return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
-
-void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
-{
-	unsigned int len;
-
-	len = __kfifo_peek_n(fifo, recsize);
-	fifo->out += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 35edbe22e9a9..02192dd905cc 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,8 +1,7 @@
 /*
- * A generic kernel FIFO implementation.
+ * A generic kernel FIFO implementation
  *
- * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
- * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
+ * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -24,422 +23,580 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/err.h>
-#include <linux/kfifo.h>
 #include <linux/log2.h>
 #include <linux/uaccess.h>
+#include <linux/kfifo.h>
 
-static void _kfifo_init(struct kfifo *fifo, void *buffer,
-		unsigned int size)
-{
-	fifo->buffer = buffer;
-	fifo->size = size;
-
-	kfifo_reset(fifo);
-}
-
-/**
- * kfifo_init - initialize a FIFO using a preallocated buffer
- * @fifo: the fifo to assign the buffer
- * @buffer: the preallocated buffer to be used.
- * @size: the size of the internal buffer, this has to be a power of 2.
- *
+/*
+ * internal helper to calculate the unused elements in a fifo
  */
-void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
+static inline unsigned int kfifo_unused(struct __kfifo *fifo)
 {
-	/* size must be a power of 2 */
-	BUG_ON(!is_power_of_2(size));
-
-	_kfifo_init(fifo, buffer, size);
+	return (fifo->mask + 1) - (fifo->in - fifo->out);
 }
-EXPORT_SYMBOL(kfifo_init);
 
-/**
- * kfifo_alloc - allocates a new FIFO internal buffer
- * @fifo: the fifo to assign then new buffer
- * @size: the size of the buffer to be allocated, this have to be a power of 2.
- * @gfp_mask: get_free_pages mask, passed to kmalloc()
- *
- * This function dynamically allocates a new fifo internal buffer
- *
- * The size will be rounded-up to a power of 2.
- * The buffer will be release with kfifo_free().
- * Return 0 if no error, otherwise the an error code
- */
-int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
+int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
+		size_t esize, gfp_t gfp_mask)
 {
-	unsigned char *buffer;
-
 	/*
-	 * round up to the next power of 2, since our 'let the indices
+	 * round down to the next power of 2, since our 'let the indices
 	 * wrap' technique works only in this case.
 	 */
-	if (!is_power_of_2(size)) {
-		BUG_ON(size > 0x80000000);
-		size = roundup_pow_of_two(size);
+	if (!is_power_of_2(size))
+		size = rounddown_pow_of_two(size);
+
+	fifo->in = 0;
+	fifo->out = 0;
+	fifo->esize = esize;
+
+	if (size < 2) {
+		fifo->data = NULL;
+		fifo->mask = 0;
+		return -EINVAL;
 	}
 
-	buffer = kmalloc(size, gfp_mask);
-	if (!buffer) {
-		_kfifo_init(fifo, NULL, 0);
+	fifo->data = kmalloc(size * esize, gfp_mask);
+
+	if (!fifo->data) {
+		fifo->mask = 0;
 		return -ENOMEM;
 	}
-
-	_kfifo_init(fifo, buffer, size);
+	fifo->mask = size - 1;
 
 	return 0;
 }
-EXPORT_SYMBOL(kfifo_alloc);
+EXPORT_SYMBOL(__kfifo_alloc);
 
-/**
- * kfifo_free - frees the FIFO internal buffer
- * @fifo: the fifo to be freed.
- */
-void kfifo_free(struct kfifo *fifo)
+void __kfifo_free(struct __kfifo *fifo)
 {
-	kfree(fifo->buffer);
-	_kfifo_init(fifo, NULL, 0);
+	kfree(fifo->data);
+	fifo->in = 0;
+	fifo->out = 0;
+	fifo->esize = 0;
+	fifo->data = NULL;
+	fifo->mask = 0;
 }
-EXPORT_SYMBOL(kfifo_free);
+EXPORT_SYMBOL(__kfifo_free);
 
-/**
- * kfifo_skip - skip output data
- * @fifo: the fifo to be used.
- * @len: number of bytes to skip
- */
-void kfifo_skip(struct kfifo *fifo, unsigned int len)
+int __kfifo_init(struct __kfifo *fifo, void *buffer,
+		unsigned int size, size_t esize)
 {
-	if (len < kfifo_len(fifo)) {
-		__kfifo_add_out(fifo, len);
-		return;
+	size /= esize;
+
+	if (!is_power_of_2(size))
+		size = rounddown_pow_of_two(size);
+
+	fifo->in = 0;
+	fifo->out = 0;
+	fifo->esize = esize;
+	fifo->data = buffer;
+
+	if (size < 2) {
+		fifo->mask = 0;
+		return -EINVAL;
 	}
-	kfifo_reset_out(fifo);
+	fifo->mask = size - 1;
+
+	return 0;
 }
-EXPORT_SYMBOL(kfifo_skip);
+EXPORT_SYMBOL(__kfifo_init);
 
-static inline void __kfifo_in_data(struct kfifo *fifo,
-		const void *from, unsigned int len, unsigned int off)
+static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
+		unsigned int len, unsigned int off)
 {
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
 	unsigned int l;
 
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	memcpy(fifo->data + off, src, l);
+	memcpy(fifo->data, src + l, len - l);
 	/*
-	 * Ensure that we sample the fifo->out index -before- we
-	 * start putting bytes into the kfifo.
+	 * make sure that the data in the fifo is up to date before
+	 * incrementing the fifo->in index counter
 	 */
+	smp_wmb();
+}
 
-	smp_mb();
-
-	off = __kfifo_off(fifo, fifo->in + off);
+unsigned int __kfifo_in(struct __kfifo *fifo,
+		const void *buf, unsigned int len)
+{
+	unsigned int l;
 
-	/* first put the data starting from fifo->in to buffer end */
-	l = min(len, fifo->size - off);
-	memcpy(fifo->buffer + off, from, l);
+	l = kfifo_unused(fifo);
+	if (len > l)
+		len = l;
 
-	/* then put the rest (if any) at the beginning of the buffer */
-	memcpy(fifo->buffer, from + l, len - l);
+	kfifo_copy_in(fifo, buf, len, fifo->in);
+	fifo->in += len;
+	return len;
 }
+EXPORT_SYMBOL(__kfifo_in);
 
-static inline void __kfifo_out_data(struct kfifo *fifo,
-		void *to, unsigned int len, unsigned int off)
+static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
+		unsigned int len, unsigned int off)
 {
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
 	unsigned int l;
 
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	memcpy(dst, fifo->data + off, l);
+	memcpy(dst + l, fifo->data, len - l);
 	/*
-	 * Ensure that we sample the fifo->in index -before- we
-	 * start removing bytes from the kfifo.
+	 * make sure that the data is copied before
+	 * incrementing the fifo->out index counter
 	 */
+	smp_wmb();
+}
 
-	smp_rmb();
+unsigned int __kfifo_out_peek(struct __kfifo *fifo,
+		void *buf, unsigned int len)
+{
+	unsigned int l;
 
-	off = __kfifo_off(fifo, fifo->out + off);
+	l = fifo->in - fifo->out;
+	if (len > l)
+		len = l;
 
-	/* first get the data from fifo->out until the end of the buffer */
-	l = min(len, fifo->size - off);
-	memcpy(to, fifo->buffer + off, l);
+	kfifo_copy_out(fifo, buf, len, fifo->out);
+	return len;
+}
+EXPORT_SYMBOL(__kfifo_out_peek);
 
-	/* then get the rest (if any) from the beginning of the buffer */
-	memcpy(to + l, fifo->buffer, len - l);
+unsigned int __kfifo_out(struct __kfifo *fifo,
+		void *buf, unsigned int len)
+{
+	len = __kfifo_out_peek(fifo, buf, len);
+	fifo->out += len;
+	return len;
 }
+EXPORT_SYMBOL(__kfifo_out);
 
-static inline int __kfifo_from_user_data(struct kfifo *fifo,
-	 const void __user *from, unsigned int len, unsigned int off,
-	 unsigned *lenout)
+static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
+	const void __user *from, unsigned int len, unsigned int off,
+	unsigned int *copied)
 {
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
 	unsigned int l;
-	int ret;
+	unsigned long ret;
 
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	ret = copy_from_user(fifo->data + off, from, l);
+	if (unlikely(ret))
+		ret = DIV_ROUND_UP(ret + len - l, esize);
+	else {
+		ret = copy_from_user(fifo->data, from + l, len - l);
+		if (unlikely(ret))
+			ret = DIV_ROUND_UP(ret, esize);
+	}
 	/*
-	 * Ensure that we sample the fifo->out index -before- we
-	 * start putting bytes into the kfifo.
+	 * make sure that the data in the fifo is up to date before
+	 * incrementing the fifo->in index counter
 	 */
+	smp_wmb();
+	*copied = len - ret;
+	/* return the number of elements which are not copied */
+	return ret;
+}
 
-	smp_mb();
+int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
+		unsigned long len, unsigned int *copied)
+{
+	unsigned int l;
+	unsigned long ret;
+	unsigned int esize = fifo->esize;
+	int err;
 
-	off = __kfifo_off(fifo, fifo->in + off);
+	if (esize != 1)
+		len /= esize;
 
-	/* first put the data starting from fifo->in to buffer end */
-	l = min(len, fifo->size - off);
-	ret = copy_from_user(fifo->buffer + off, from, l);
-	if (unlikely(ret)) {
-		*lenout = ret;
-		return -EFAULT;
-	}
-	*lenout = l;
+	l = kfifo_unused(fifo);
+	if (len > l)
+		len = l;
 
-	/* then put the rest (if any) at the beginning of the buffer */
-	ret = copy_from_user(fifo->buffer, from + l, len - l);
-	*lenout += ret ? ret : len - l;
-	return ret ? -EFAULT : 0;
+	ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
+	if (unlikely(ret)) {
+		len -= ret;
+		err = -EFAULT;
+	} else
+		err = 0;
+	fifo->in += len;
+	return err;
 }
+EXPORT_SYMBOL(__kfifo_from_user);
 
-static inline int __kfifo_to_user_data(struct kfifo *fifo,
-		void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
+static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
+		unsigned int len, unsigned int off, unsigned int *copied)
 {
 	unsigned int l;
-	int ret;
-
+	unsigned long ret;
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
+
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	ret = copy_to_user(to, fifo->data + off, l);
+	if (unlikely(ret))
+		ret = DIV_ROUND_UP(ret + len - l, esize);
+	else {
+		ret = copy_to_user(to + l, fifo->data, len - l);
+		if (unlikely(ret))
+			ret = DIV_ROUND_UP(ret, esize);
+	}
 	/*
-	 * Ensure that we sample the fifo->in index -before- we
-	 * start removing bytes from the kfifo.
+	 * make sure that the data is copied before
+	 * incrementing the fifo->out index counter
 	 */
+	smp_wmb();
+	*copied = len - ret;
+	/* return the number of elements which are not copied */
+	return ret;
+}
 
-	smp_rmb();
+int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
+		unsigned long len, unsigned int *copied)
+{
+	unsigned int l;
+	unsigned long ret;
+	unsigned int esize = fifo->esize;
+	int err;
 
-	off = __kfifo_off(fifo, fifo->out + off);
+	if (esize != 1)
+		len /= esize;
 
-	/* first get the data from fifo->out until the end of the buffer */
-	l = min(len, fifo->size - off);
-	ret = copy_to_user(to, fifo->buffer + off, l);
-	*lenout = l;
+	l = fifo->in - fifo->out;
+	if (len > l)
+		len = l;
+	ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
 	if (unlikely(ret)) {
-		*lenout -= ret;
-		return -EFAULT;
-	}
+		len -= ret;
+		err = -EFAULT;
+	} else
+		err = 0;
+	fifo->out += len;
+	return err;
+}
+EXPORT_SYMBOL(__kfifo_to_user);
 
-	/* then get the rest (if any) from the beginning of the buffer */
-	len -= l;
-	ret = copy_to_user(to + l, fifo->buffer, len);
-	if (unlikely(ret)) {
-		*lenout += len - ret;
-		return -EFAULT;
+static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
+		int nents, unsigned int len)
+{
+	int n;
+	unsigned int l;
+	unsigned int off;
+	struct page *page;
+
+	if (!nents)
+		return 0;
+
+	if (!len)
+		return 0;
+
+	n = 0;
+	page = virt_to_page(buf);
+	off = offset_in_page(buf);
+	l = 0;
+
+	while (len >= l + PAGE_SIZE - off) {
+		struct page *npage;
+
+		l += PAGE_SIZE;
+		buf += PAGE_SIZE;
+		npage = virt_to_page(buf);
+		if (page_to_phys(page) != page_to_phys(npage) - l) {
+			sgl->page_link = 0;
+			sg_set_page(sgl++, page, l - off, off);
+			if (++n == nents)
+				return n;
+			page = npage;
+			len -= l - off;
+			l = off = 0;
+		}
 	}
-	*lenout += len;
-	return 0;
+	sgl->page_link = 0;
+	sg_set_page(sgl++, page, len, off);
+	return n + 1;
 }
 
-unsigned int __kfifo_in_n(struct kfifo *fifo,
-	const void *from, unsigned int len, unsigned int recsize)
+static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
+		int nents, unsigned int len, unsigned int off)
 {
-	if (kfifo_avail(fifo) < len + recsize)
-		return len + 1;
+	unsigned int size = fifo->mask + 1;
+	unsigned int esize = fifo->esize;
+	unsigned int l;
+	unsigned int n;
 
-	__kfifo_in_data(fifo, from, len, recsize);
-	return 0;
+	off &= fifo->mask;
+	if (esize != 1) {
+		off *= esize;
+		size *= esize;
+		len *= esize;
+	}
+	l = min(len, size - off);
+
+	n  = setup_sgl_buf(sgl, fifo->data + off, nents, l);
+	n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
+
+	if (n)
+		sg_mark_end(sgl + n - 1);
+	return n;
 }
-EXPORT_SYMBOL(__kfifo_in_n);
 
-/**
- * kfifo_in - puts some data into the FIFO
- * @fifo: the fifo to be used.
- * @from: the data to be added.
- * @len: the length of the data to be added.
- *
- * This function copies at most @len bytes from the @from buffer into
- * the FIFO depending on the free space, and returns the number of
- * bytes copied.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-unsigned int kfifo_in(struct kfifo *fifo, const void *from,
-				unsigned int len)
+unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
+		struct scatterlist *sgl, int nents, unsigned int len)
 {
-	len = min(kfifo_avail(fifo), len);
+	unsigned int l;
 
-	__kfifo_in_data(fifo, from, len, 0);
-	__kfifo_add_in(fifo, len);
-	return len;
+	l = kfifo_unused(fifo);
+	if (len > l)
+		len = l;
+
+	return setup_sgl(fifo, sgl, nents, len, fifo->in);
 }
-EXPORT_SYMBOL(kfifo_in);
+EXPORT_SYMBOL(__kfifo_dma_in_prepare);
 
-unsigned int __kfifo_in_generic(struct kfifo *fifo,
-	const void *from, unsigned int len, unsigned int recsize)
+unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
+		struct scatterlist *sgl, int nents, unsigned int len)
 {
-	return __kfifo_in_rec(fifo, from, len, recsize);
+	unsigned int l;
+
+	l = fifo->in - fifo->out;
+	if (len > l)
+		len = l;
+
+	return setup_sgl(fifo, sgl, nents, len, fifo->out);
 }
-EXPORT_SYMBOL(__kfifo_in_generic);
+EXPORT_SYMBOL(__kfifo_dma_out_prepare);
 
-unsigned int __kfifo_out_n(struct kfifo *fifo,
-	void *to, unsigned int len, unsigned int recsize)
+unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
 {
-	if (kfifo_len(fifo) < len + recsize)
-		return len;
+	unsigned int max = (1 << (recsize << 3)) - 1;
 
-	__kfifo_out_data(fifo, to, len, recsize);
-	__kfifo_add_out(fifo, len + recsize);
-	return 0;
+	if (len > max)
+		return max;
+	return len;
 }
-EXPORT_SYMBOL(__kfifo_out_n);
 
-/**
- * kfifo_out - gets some data from the FIFO
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @len: the size of the destination buffer.
- *
- * This function copies at most @len bytes from the FIFO into the
- * @to buffer and returns the number of copied bytes.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
+#define	__KFIFO_PEEK(data, out, mask) \
+	((data)[(out) & (mask)])
+/*
+ * __kfifo_peek_n internal helper function for determinate the length of
+ * the next record in the fifo
  */
-unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
+static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
 {
-	len = min(kfifo_len(fifo), len);
+	unsigned int l;
+	unsigned int mask = fifo->mask;
+	unsigned char *data = fifo->data;
 
-	__kfifo_out_data(fifo, to, len, 0);
-	__kfifo_add_out(fifo, len);
+	l = __KFIFO_PEEK(data, fifo->out, mask);
 
-	return len;
+	if (--recsize)
+		l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
+
+	return l;
 }
-EXPORT_SYMBOL(kfifo_out);
-
-/**
- * kfifo_out_peek - copy some data from the FIFO, but do not remove it
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @len: the size of the destination buffer.
- * @offset: offset into the fifo
- *
- * This function copies at most @len bytes at @offset from the FIFO
- * into the @to buffer and returns the number of copied bytes.
- * The data is not removed from the FIFO.
+
+#define	__KFIFO_POKE(data, in, mask, val) \
+	( \
+	(data)[(in) & (mask)] = (unsigned char)(val) \
+	)
+
+/*
+ * __kfifo_poke_n internal helper function for storeing the length of
+ * the record into the fifo
  */
-unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
-			    unsigned offset)
+static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
 {
-	len = min(kfifo_len(fifo), len + offset);
+	unsigned int mask = fifo->mask;
+	unsigned char *data = fifo->data;
 
-	__kfifo_out_data(fifo, to, len, offset);
-	return len;
+	__KFIFO_POKE(data, fifo->in, mask, n);
+
+	if (recsize > 1)
+		__KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
 }
-EXPORT_SYMBOL(kfifo_out_peek);
 
-unsigned int __kfifo_out_generic(struct kfifo *fifo,
-	void *to, unsigned int len, unsigned int recsize,
-	unsigned int *total)
+unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
 {
-	return __kfifo_out_rec(fifo, to, len, recsize, total);
+	return __kfifo_peek_n(fifo, recsize);
 }
-EXPORT_SYMBOL(__kfifo_out_generic);
+EXPORT_SYMBOL(__kfifo_len_r);
 
-unsigned int __kfifo_from_user_n(struct kfifo *fifo,
-	const void __user *from, unsigned int len, unsigned int recsize)
+unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
+		unsigned int len, size_t recsize)
 {
-	unsigned total;
+	if (len + recsize > kfifo_unused(fifo))
+		return 0;
 
-	if (kfifo_avail(fifo) < len + recsize)
-		return len + 1;
+	__kfifo_poke_n(fifo, len, recsize);
 
-	__kfifo_from_user_data(fifo, from, len, recsize, &total);
-	return total;
+	kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
+	fifo->in += len + recsize;
+	return len;
 }
-EXPORT_SYMBOL(__kfifo_from_user_n);
-
-/**
- * kfifo_from_user - puts some data from user space into the FIFO
- * @fifo: the fifo to be used.
- * @from: pointer to the data to be added.
- * @len: the length of the data to be added.
- * @total: the actual returned data length.
- *
- * This function copies at most @len bytes from the @from into the
- * FIFO depending and returns -EFAULT/0.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-int kfifo_from_user(struct kfifo *fifo,
-        const void __user *from, unsigned int len, unsigned *total)
+EXPORT_SYMBOL(__kfifo_in_r);
+
+static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
+	void *buf, unsigned int len, size_t recsize, unsigned int *n)
 {
-	int ret;
-	len = min(kfifo_avail(fifo), len);
-	ret = __kfifo_from_user_data(fifo, from, len, 0, total);
-	if (ret)
-		return ret;
-	__kfifo_add_in(fifo, len);
-	return 0;
+	*n = __kfifo_peek_n(fifo, recsize);
+
+	if (len > *n)
+		len = *n;
+
+	kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
+	return len;
+}
+
+unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
+		unsigned int len, size_t recsize)
+{
+	unsigned int n;
+
+	if (fifo->in == fifo->out)
+		return 0;
+
+	return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
 }
-EXPORT_SYMBOL(kfifo_from_user);
+EXPORT_SYMBOL(__kfifo_out_peek_r);
 
-unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
-	const void __user *from, unsigned int len, unsigned int recsize)
+unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
+		unsigned int len, size_t recsize)
 {
-	return __kfifo_from_user_rec(fifo, from, len, recsize);
+	unsigned int n;
+
+	if (fifo->in == fifo->out)
+		return 0;
+
+	len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
+	fifo->out += n + recsize;
+	return len;
 }
-EXPORT_SYMBOL(__kfifo_from_user_generic);
+EXPORT_SYMBOL(__kfifo_out_r);
 
-unsigned int __kfifo_to_user_n(struct kfifo *fifo,
-	void __user *to, unsigned int len, unsigned int reclen,
-	unsigned int recsize)
+int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
+	unsigned long len, unsigned int *copied, size_t recsize)
 {
-	unsigned int ret, total;
+	unsigned long ret;
 
-	if (kfifo_len(fifo) < reclen + recsize)
-		return len;
+	len = __kfifo_max_r(len, recsize);
 
-	ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
+	if (len + recsize > kfifo_unused(fifo)) {
+		*copied = 0;
+		return 0;
+	}
 
-	if (likely(ret == 0))
-		__kfifo_add_out(fifo, reclen + recsize);
+	__kfifo_poke_n(fifo, len, recsize);
 
-	return total;
+	ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
+	if (unlikely(ret)) {
+		*copied = 0;
+		return -EFAULT;
+	}
+	fifo->in += len + recsize;
+	return 0;
 }
-EXPORT_SYMBOL(__kfifo_to_user_n);
-
-/**
- * kfifo_to_user - gets data from the FIFO and write it to user space
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @len: the size of the destination buffer.
- * @lenout: pointer to output variable with copied data
- *
- * This function copies at most @len bytes from the FIFO into the
- * @to buffer and 0 or -EFAULT.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-int kfifo_to_user(struct kfifo *fifo,
-	void __user *to, unsigned int len, unsigned *lenout)
+EXPORT_SYMBOL(__kfifo_from_user_r);
+
+int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
+	unsigned long len, unsigned int *copied, size_t recsize)
 {
-	int ret;
-	len = min(kfifo_len(fifo), len);
-	ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
-	__kfifo_add_out(fifo, *lenout);
-	return ret;
+	unsigned long ret;
+	unsigned int n;
+
+	if (fifo->in == fifo->out) {
+		*copied = 0;
+		return 0;
+	}
+
+	n = __kfifo_peek_n(fifo, recsize);
+	if (len > n)
+		len = n;
+
+	ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
+	if (unlikely(ret)) {
+		*copied = 0;
+		return -EFAULT;
+	}
+	fifo->out += n + recsize;
+	return 0;
 }
-EXPORT_SYMBOL(kfifo_to_user);
+EXPORT_SYMBOL(__kfifo_to_user_r);
 
-unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
-	void __user *to, unsigned int len, unsigned int recsize,
-	unsigned int *total)
+unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
 {
-	return __kfifo_to_user_rec(fifo, to, len, recsize, total);
+	if (!nents)
+		BUG();
+
+	len = __kfifo_max_r(len, recsize);
+
+	if (len + recsize > kfifo_unused(fifo))
+		return 0;
+
+	return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
 }
-EXPORT_SYMBOL(__kfifo_to_user_generic);
+EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
 
-unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
+void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
+	unsigned int len, size_t recsize)
 {
-	if (recsize == 0)
-		return kfifo_avail(fifo);
-
-	return __kfifo_peek_n(fifo, recsize);
+	len = __kfifo_max_r(len, recsize);
+	__kfifo_poke_n(fifo, len, recsize);
+	fifo->in += len + recsize;
 }
-EXPORT_SYMBOL(__kfifo_peek_generic);
+EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
 
-void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
+unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
 {
-	__kfifo_skip_rec(fifo, recsize);
+	if (!nents)
+		BUG();
+
+	len = __kfifo_max_r(len, recsize);
+
+	if (len + recsize > fifo->in - fifo->out)
+		return 0;
+
+	return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
 }
-EXPORT_SYMBOL(__kfifo_skip_generic);
+EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
+
+void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
+{
+	unsigned int len;
 
+	len = __kfifo_peek_n(fifo, recsize);
+	fifo->out += len + recsize;
+}
+EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
-- 
cgit v1.2.3-59-g8ed1b


From 9a94241afcc9a481691a9c29b7460217925b59b8 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 11 Aug 2010 18:20:56 +0200
Subject: i2c: Add support for custom probe function

The probe method used by i2c_new_probed_device() may not be suitable
for all cases. Let the caller provide its own, optional probe
function.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Acked-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 Documentation/i2c/instantiating-devices   |  2 +-
 drivers/i2c/i2c-core.c                    | 16 ++++++++++------
 drivers/macintosh/therm_windtunnel.c      |  4 ++--
 drivers/media/video/bt8xx/bttv-i2c.c      |  2 +-
 drivers/media/video/cx18/cx18-i2c.c       |  3 ++-
 drivers/media/video/em28xx/em28xx-cards.c |  2 +-
 drivers/media/video/ivtv/ivtv-i2c.c       |  9 +++++----
 drivers/media/video/v4l2-common.c         |  3 ++-
 drivers/usb/host/ohci-pnx4008.c           |  2 +-
 drivers/video/matrox/i2c-matroxfb.c       |  2 +-
 include/linux/i2c.h                       |  7 +++++--
 11 files changed, 31 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices
index e89490270aba..87da405a8597 100644
--- a/Documentation/i2c/instantiating-devices
+++ b/Documentation/i2c/instantiating-devices
@@ -102,7 +102,7 @@ static int __devinit usb_hcd_pnx4008_probe(struct platform_device *pdev)
 	memset(&i2c_info, 0, sizeof(struct i2c_board_info));
 	strlcpy(i2c_info.name, "isp1301_pnx", I2C_NAME_SIZE);
 	isp1301_i2c_client = i2c_new_probed_device(i2c_adap, &i2c_info,
-						   normal_i2c);
+						   normal_i2c, NULL);
 	i2c_put_adapter(i2c_adap);
 	(...)
 }
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index df937df845eb..cf14ca063181 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -1464,14 +1464,18 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver)
 struct i2c_client *
 i2c_new_probed_device(struct i2c_adapter *adap,
 		      struct i2c_board_info *info,
-		      unsigned short const *addr_list)
+		      unsigned short const *addr_list,
+		      int (*probe)(struct i2c_adapter *, unsigned short addr))
 {
 	int i;
 
-	/* Stop here if the bus doesn't support probing */
-	if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_BYTE)) {
-		dev_err(&adap->dev, "Probing not supported\n");
-		return NULL;
+	if (!probe) {
+		/* Stop here if the bus doesn't support probing */
+		if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_BYTE)) {
+			dev_err(&adap->dev, "Probing not supported\n");
+			return NULL;
+		}
+		probe = i2c_default_probe;
 	}
 
 	for (i = 0; addr_list[i] != I2C_CLIENT_END; i++) {
@@ -1490,7 +1494,7 @@ i2c_new_probed_device(struct i2c_adapter *adap,
 		}
 
 		/* Test address responsiveness */
-		if (i2c_default_probe(adap, addr_list[i]))
+		if (probe(adap, addr_list[i]))
 			break;
 	}
 
diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c
index 5c9367acf0cf..7d1bfeb6a82c 100644
--- a/drivers/macintosh/therm_windtunnel.c
+++ b/drivers/macintosh/therm_windtunnel.c
@@ -322,10 +322,10 @@ do_attach( struct i2c_adapter *adapter )
 
 		memset(&info, 0, sizeof(struct i2c_board_info));
 		strlcpy(info.type, "therm_ds1775", I2C_NAME_SIZE);
-		i2c_new_probed_device(adapter, &info, scan_ds1775);
+		i2c_new_probed_device(adapter, &info, scan_ds1775, NULL);
 
 		strlcpy(info.type, "therm_adm1030", I2C_NAME_SIZE);
-		i2c_new_probed_device(adapter, &info, scan_adm1030);
+		i2c_new_probed_device(adapter, &info, scan_adm1030, NULL);
 
 		if( x.thermostat && x.fan ) {
 			x.running = 1;
diff --git a/drivers/media/video/bt8xx/bttv-i2c.c b/drivers/media/video/bt8xx/bttv-i2c.c
index 407fa61e4cda..685d6597ee79 100644
--- a/drivers/media/video/bt8xx/bttv-i2c.c
+++ b/drivers/media/video/bt8xx/bttv-i2c.c
@@ -411,7 +411,7 @@ void __devinit init_bttv_i2c_ir(struct bttv *btv)
 
 		memset(&info, 0, sizeof(struct i2c_board_info));
 		strlcpy(info.type, "ir_video", I2C_NAME_SIZE);
-		i2c_new_probed_device(&btv->c.i2c_adap, &info, addr_list);
+		i2c_new_probed_device(&btv->c.i2c_adap, &info, addr_list, NULL);
 	}
 }
 
diff --git a/drivers/media/video/cx18/cx18-i2c.c b/drivers/media/video/cx18/cx18-i2c.c
index 809f7d37129c..73ce90c2f577 100644
--- a/drivers/media/video/cx18/cx18-i2c.c
+++ b/drivers/media/video/cx18/cx18-i2c.c
@@ -117,7 +117,8 @@ static int cx18_i2c_new_ir(struct cx18 *cx, struct i2c_adapter *adap, u32 hw,
 		break;
 	}
 
-	return i2c_new_probed_device(adap, &info, addr_list) == NULL ? -1 : 0;
+	return i2c_new_probed_device(adap, &info, addr_list, NULL) == NULL ?
+	       -1 : 0;
 }
 
 int cx18_i2c_register(struct cx18 *cx, unsigned idx)
diff --git a/drivers/media/video/em28xx/em28xx-cards.c b/drivers/media/video/em28xx/em28xx-cards.c
index ffbe544e30f4..e7efb4bffabd 100644
--- a/drivers/media/video/em28xx/em28xx-cards.c
+++ b/drivers/media/video/em28xx/em28xx-cards.c
@@ -2385,7 +2385,7 @@ void em28xx_register_i2c_ir(struct em28xx *dev)
 
 	if (dev->init_data.name)
 		info.platform_data = &dev->init_data;
-	i2c_new_probed_device(&dev->i2c_adap, &info, addr_list);
+	i2c_new_probed_device(&dev->i2c_adap, &info, addr_list, NULL);
 }
 
 void em28xx_card_setup(struct em28xx *dev)
diff --git a/drivers/media/video/ivtv/ivtv-i2c.c b/drivers/media/video/ivtv/ivtv-i2c.c
index d391bbdb0b8a..a74fa099c565 100644
--- a/drivers/media/video/ivtv/ivtv-i2c.c
+++ b/drivers/media/video/ivtv/ivtv-i2c.c
@@ -183,8 +183,8 @@ static int ivtv_i2c_new_ir(struct ivtv *itv, u32 hw, const char *type, u8 addr)
 			return -1;
 		memset(&info, 0, sizeof(struct i2c_board_info));
 		strlcpy(info.type, type, I2C_NAME_SIZE);
-		return i2c_new_probed_device(adap, &info, addr_list) == NULL
-								     ? -1 : 0;
+		return i2c_new_probed_device(adap, &info, addr_list, NULL)
+							   == NULL ? -1 : 0;
 	}
 
 	/* Only allow one IR receiver to be registered per board */
@@ -221,7 +221,8 @@ static int ivtv_i2c_new_ir(struct ivtv *itv, u32 hw, const char *type, u8 addr)
 	info.platform_data = init_data;
 	strlcpy(info.type, type, I2C_NAME_SIZE);
 
-	return i2c_new_probed_device(adap, &info, addr_list) == NULL ? -1 : 0;
+	return i2c_new_probed_device(adap, &info, addr_list, NULL) == NULL ?
+	       -1 : 0;
 }
 
 /* Instantiate the IR receiver device using probing -- undesirable */
@@ -249,7 +250,7 @@ struct i2c_client *ivtv_i2c_new_ir_legacy(struct ivtv *itv)
 
 	memset(&info, 0, sizeof(struct i2c_board_info));
 	strlcpy(info.type, "ir_video", I2C_NAME_SIZE);
-	return i2c_new_probed_device(&itv->i2c_adap, &info, addr_list);
+	return i2c_new_probed_device(&itv->i2c_adap, &info, addr_list, NULL);
 }
 
 int ivtv_i2c_register(struct ivtv *itv, unsigned idx)
diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index 3ce7c64e5789..8ee1179be926 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -381,7 +381,8 @@ struct v4l2_subdev *v4l2_i2c_new_subdev_board(struct v4l2_device *v4l2_dev,
 
 	/* Create the i2c client */
 	if (info->addr == 0 && probe_addrs)
-		client = i2c_new_probed_device(adapter, info, probe_addrs);
+		client = i2c_new_probed_device(adapter, info, probe_addrs,
+					       NULL);
 	else
 		client = i2c_new_device(adapter, info);
 
diff --git a/drivers/usb/host/ohci-pnx4008.c b/drivers/usb/host/ohci-pnx4008.c
index cd74bbdd007c..653d6a60edb5 100644
--- a/drivers/usb/host/ohci-pnx4008.c
+++ b/drivers/usb/host/ohci-pnx4008.c
@@ -329,7 +329,7 @@ static int __devinit usb_hcd_pnx4008_probe(struct platform_device *pdev)
 	memset(&i2c_info, 0, sizeof(struct i2c_board_info));
 	strlcpy(i2c_info.type, "isp1301_pnx", I2C_NAME_SIZE);
 	isp1301_i2c_client = i2c_new_probed_device(i2c_adap, &i2c_info,
-						   normal_i2c);
+						   normal_i2c, NULL);
 	i2c_put_adapter(i2c_adap);
 	if (!isp1301_i2c_client) {
 		err("failed to connect I2C to ISP1301 USB Transceiver");
diff --git a/drivers/video/matrox/i2c-matroxfb.c b/drivers/video/matrox/i2c-matroxfb.c
index 403b14445a78..0fb280ead3dc 100644
--- a/drivers/video/matrox/i2c-matroxfb.c
+++ b/drivers/video/matrox/i2c-matroxfb.c
@@ -191,7 +191,7 @@ static void* i2c_matroxfb_probe(struct matrox_fb_info* minfo) {
 			};
 
 			i2c_new_probed_device(&m2info->maven.adapter,
-					      &maven_info, addr_list);
+					      &maven_info, addr_list, NULL);
 		}
 	}
 	return m2info;
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 38dd4025aa4e..59a9f3cdc0b5 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -284,12 +284,15 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info);
 
 /* If you don't know the exact address of an I2C device, use this variant
  * instead, which can probe for device presence in a list of possible
- * addresses.
+ * addresses. The "probe" callback function is optional. If it is provided,
+ * it must return 1 on successful probe, 0 otherwise. If it is not provided,
+ * a default probing method is used.
  */
 extern struct i2c_client *
 i2c_new_probed_device(struct i2c_adapter *adap,
 		      struct i2c_board_info *info,
-		      unsigned short const *addr_list);
+		      unsigned short const *addr_list,
+		      int (*probe)(struct i2c_adapter *, unsigned short addr));
 
 /* For devices that use several addresses, use i2c_new_dummy() to make
  * client handles for the extra addresses.
-- 
cgit v1.2.3-59-g8ed1b


From d44f19d586b6113fb5db10e1a36457f0db3b01aa Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 11 Aug 2010 18:20:57 +0200
Subject: V4L/DVB: Use custom I2C probing function mechanism

Now that i2c-core offers the possibility to provide custom probing
function for I2C devices, let's make use of it.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Acked-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/i2c/i2c-core.c                    |  7 +++++++
 drivers/media/video/cx23885/cx23885-i2c.c | 15 ++++-----------
 drivers/media/video/cx88/cx88-i2c.c       | 19 ++++---------------
 include/linux/i2c.h                       |  3 +++
 4 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index cf14ca063181..6e1c2f54d9cf 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -1461,6 +1461,13 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver)
 	return err;
 }
 
+int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr)
+{
+	return i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0,
+			      I2C_SMBUS_QUICK, NULL) >= 0;
+}
+EXPORT_SYMBOL_GPL(i2c_probe_func_quick_read);
+
 struct i2c_client *
 i2c_new_probed_device(struct i2c_adapter *adap,
 		      struct i2c_board_info *info,
diff --git a/drivers/media/video/cx23885/cx23885-i2c.c b/drivers/media/video/cx23885/cx23885-i2c.c
index 1a391486e551..ed3d8f55029b 100644
--- a/drivers/media/video/cx23885/cx23885-i2c.c
+++ b/drivers/media/video/cx23885/cx23885-i2c.c
@@ -364,17 +364,10 @@ int cx23885_i2c_register(struct cx23885_i2c *bus)
 
 		memset(&info, 0, sizeof(struct i2c_board_info));
 		strlcpy(info.type, "ir_video", I2C_NAME_SIZE);
-		/*
-		 * We can't call i2c_new_probed_device() because it uses
-		 * quick writes for probing and the IR receiver device only
-		 * replies to reads.
-		 */
-		if (i2c_smbus_xfer(&bus->i2c_adap, addr_list[0], 0,
-				   I2C_SMBUS_READ, 0, I2C_SMBUS_QUICK,
-				   NULL) >= 0) {
-			info.addr = addr_list[0];
-			i2c_new_device(&bus->i2c_adap, &info);
-		}
+		/* Use quick read command for probe, some IR chips don't
+		 * support writes */
+		i2c_new_probed_device(&bus->i2c_adap, &info, addr_list,
+				      i2c_probe_func_quick_read);
 	}
 
 	return bus->i2c_rc;
diff --git a/drivers/media/video/cx88/cx88-i2c.c b/drivers/media/video/cx88/cx88-i2c.c
index 375ad53f7961..82db555b22dd 100644
--- a/drivers/media/video/cx88/cx88-i2c.c
+++ b/drivers/media/video/cx88/cx88-i2c.c
@@ -193,24 +193,13 @@ void cx88_i2c_init_ir(struct cx88_core *core)
 			0x18, 0x6b, 0x71,
 			I2C_CLIENT_END
 		};
-		const unsigned short *addrp;
 
 		memset(&info, 0, sizeof(struct i2c_board_info));
 		strlcpy(info.type, "ir_video", I2C_NAME_SIZE);
-		/*
-		 * We can't call i2c_new_probed_device() because it uses
-		 * quick writes for probing and at least some R receiver
-		 * devices only reply to reads.
-		 */
-		for (addrp = addr_list; *addrp != I2C_CLIENT_END; addrp++) {
-			if (i2c_smbus_xfer(&core->i2c_adap, *addrp, 0,
-					   I2C_SMBUS_READ, 0,
-					   I2C_SMBUS_QUICK, NULL) >= 0) {
-				info.addr = *addrp;
-				i2c_new_device(&core->i2c_adap, &info);
-				break;
-			}
-		}
+		/* Use quick read command for probe, some IR chips don't
+		 * support writes */
+		i2c_new_probed_device(&core->i2c_adap, &info, addr_list,
+				      i2c_probe_func_quick_read);
 	}
 }
 
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 59a9f3cdc0b5..c8627e453e97 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -294,6 +294,9 @@ i2c_new_probed_device(struct i2c_adapter *adap,
 		      unsigned short const *addr_list,
 		      int (*probe)(struct i2c_adapter *, unsigned short addr));
 
+/* Common custom probe functions */
+extern int i2c_probe_func_quick_read(struct i2c_adapter *, unsigned short addr);
+
 /* For devices that use several addresses, use i2c_new_dummy() to make
  * client handles for the extra addresses.
  */
-- 
cgit v1.2.3-59-g8ed1b


From fe61e07e9ebc890c70d97a1f72ddaad4bee2d848 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 11 Aug 2010 18:20:58 +0200
Subject: i2c: Move adapter locking helpers to i2c-core

Uninline i2c adapter locking helper functions, move them to i2c-core,
and use them in i2c-core itself. The functions are still exported for
external users. This makes future updates to the locking model (which
will be needed for multiplexing support) possible and transparent.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Michael Lawnick <ml.lawnick@gmx.de>
---
 drivers/i2c/i2c-core.c | 39 ++++++++++++++++++++++++++++++++++-----
 include/linux/i2c.h    | 20 +++-----------------
 2 files changed, 37 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 6e1c2f54d9cf..e09e143379b1 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -429,6 +429,35 @@ static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr)
 				     __i2c_check_addr_busy);
 }
 
+/**
+ * i2c_lock_adapter - Get exclusive access to an I2C bus segment
+ * @adapter: Target I2C bus segment
+ */
+void i2c_lock_adapter(struct i2c_adapter *adapter)
+{
+	rt_mutex_lock(&adapter->bus_lock);
+}
+EXPORT_SYMBOL_GPL(i2c_lock_adapter);
+
+/**
+ * i2c_trylock_adapter - Try to get exclusive access to an I2C bus segment
+ * @adapter: Target I2C bus segment
+ */
+static int i2c_trylock_adapter(struct i2c_adapter *adapter)
+{
+	return rt_mutex_trylock(&adapter->bus_lock);
+}
+
+/**
+ * i2c_unlock_adapter - Release exclusive access to an I2C bus segment
+ * @adapter: Target I2C bus segment
+ */
+void i2c_unlock_adapter(struct i2c_adapter *adapter)
+{
+	rt_mutex_unlock(&adapter->bus_lock);
+}
+EXPORT_SYMBOL_GPL(i2c_unlock_adapter);
+
 /**
  * i2c_new_device - instantiate an i2c device
  * @adap: the adapter managing the device
@@ -1238,12 +1267,12 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 #endif
 
 		if (in_atomic() || irqs_disabled()) {
-			ret = rt_mutex_trylock(&adap->bus_lock);
+			ret = i2c_trylock_adapter(adap);
 			if (!ret)
 				/* I2C activity is ongoing. */
 				return -EAGAIN;
 		} else {
-			rt_mutex_lock(&adap->bus_lock);
+			i2c_lock_adapter(adap);
 		}
 
 		/* Retry automatically on arbitration loss */
@@ -1255,7 +1284,7 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 			if (time_after(jiffies, orig_jiffies + adap->timeout))
 				break;
 		}
-		rt_mutex_unlock(&adap->bus_lock);
+		i2c_unlock_adapter(adap);
 
 		return ret;
 	} else {
@@ -2013,7 +2042,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 	flags &= I2C_M_TEN | I2C_CLIENT_PEC;
 
 	if (adapter->algo->smbus_xfer) {
-		rt_mutex_lock(&adapter->bus_lock);
+		i2c_lock_adapter(adapter);
 
 		/* Retry automatically on arbitration loss */
 		orig_jiffies = jiffies;
@@ -2027,7 +2056,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 				       orig_jiffies + adapter->timeout))
 				break;
 		}
-		rt_mutex_unlock(&adapter->bus_lock);
+		i2c_unlock_adapter(adapter);
 	} else
 		res = i2c_smbus_xfer_emulated(adapter, addr, flags, read_write,
 					      command, protocol, data);
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index c8627e453e97..5bf0f4beea31 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -382,23 +382,9 @@ static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data)
 	dev_set_drvdata(&dev->dev, data);
 }
 
-/**
- * i2c_lock_adapter - Prevent access to an I2C bus segment
- * @adapter: Target I2C bus segment
- */
-static inline void i2c_lock_adapter(struct i2c_adapter *adapter)
-{
-	rt_mutex_lock(&adapter->bus_lock);
-}
-
-/**
- * i2c_unlock_adapter - Reauthorize access to an I2C bus segment
- * @adapter: Target I2C bus segment
- */
-static inline void i2c_unlock_adapter(struct i2c_adapter *adapter)
-{
-	rt_mutex_unlock(&adapter->bus_lock);
-}
+/* Adapter locking functions, exported for shared pin cases */
+void i2c_lock_adapter(struct i2c_adapter *);
+void i2c_unlock_adapter(struct i2c_adapter *);
 
 /*flags for the client struct: */
 #define I2C_CLIENT_PEC	0x04		/* Use Packet Error Checking */
-- 
cgit v1.2.3-59-g8ed1b


From dafc50d141c27959dbd3a1cfe9857a86d23402a7 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 11 Aug 2010 18:21:01 +0200
Subject: i2c: Use a separate mutex for userspace client lists

Moving userspace-instantiated clients to separate lists wasn't nearly
enough to avoid deadlocks in multiplexed bus cases. We also want to
have a dedicated mutex to protect each list.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Michael Lawnick <ml.lawnick@gmx.de>
---
 drivers/i2c/i2c-core.c | 13 +++++++------
 include/linux/i2c.h    |  1 +
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index ba86af63f5cf..97f96b66653c 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -662,9 +662,9 @@ i2c_sysfs_new_device(struct device *dev, struct device_attribute *attr,
 		return -EINVAL;
 
 	/* Keep track of the added device */
-	i2c_lock_adapter(adap);
+	mutex_lock(&adap->userspace_clients_lock);
 	list_add_tail(&client->detected, &adap->userspace_clients);
-	i2c_unlock_adapter(adap);
+	mutex_unlock(&adap->userspace_clients_lock);
 	dev_info(dev, "%s: Instantiated device %s at 0x%02hx\n", "new_device",
 		 info.type, info.addr);
 
@@ -703,7 +703,7 @@ i2c_sysfs_delete_device(struct device *dev, struct device_attribute *attr,
 
 	/* Make sure the device was added through sysfs */
 	res = -ENOENT;
-	i2c_lock_adapter(adap);
+	mutex_lock(&adap->userspace_clients_lock);
 	list_for_each_entry_safe(client, next, &adap->userspace_clients,
 				 detected) {
 		if (client->addr == addr) {
@@ -716,7 +716,7 @@ i2c_sysfs_delete_device(struct device *dev, struct device_attribute *attr,
 			break;
 		}
 	}
-	i2c_unlock_adapter(adap);
+	mutex_unlock(&adap->userspace_clients_lock);
 
 	if (res < 0)
 		dev_err(dev, "%s: Can't find device in list\n",
@@ -798,6 +798,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 	}
 
 	rt_mutex_init(&adap->bus_lock);
+	mutex_init(&adap->userspace_clients_lock);
 	INIT_LIST_HEAD(&adap->userspace_clients);
 
 	/* Set default timeout to 1 second if not already set */
@@ -1003,7 +1004,7 @@ int i2c_del_adapter(struct i2c_adapter *adap)
 		return res;
 
 	/* Remove devices instantiated from sysfs */
-	i2c_lock_adapter(adap);
+	mutex_lock(&adap->userspace_clients_lock);
 	list_for_each_entry_safe(client, next, &adap->userspace_clients,
 				 detected) {
 		dev_dbg(&adap->dev, "Removing %s at 0x%x\n", client->name,
@@ -1011,7 +1012,7 @@ int i2c_del_adapter(struct i2c_adapter *adap)
 		list_del(&client->detected);
 		i2c_unregister_device(client);
 	}
-	i2c_unlock_adapter(adap);
+	mutex_unlock(&adap->userspace_clients_lock);
 
 	/* Detach any active clients. This can't fail, thus we do not
 	   checking the returned value. */
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 5bf0f4beea31..798bad8741e4 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -368,6 +368,7 @@ struct i2c_adapter {
 	char name[48];
 	struct completion dev_released;
 
+	struct mutex userspace_clients_lock;
 	struct list_head userspace_clients;
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
-- 
cgit v1.2.3-59-g8ed1b


From 0826374bff57411d239f2fcb15da3c35af0a93cd Mon Sep 17 00:00:00 2001
From: Michael Lawnick <ml.lawnick@gmx.de>
Date: Wed, 11 Aug 2010 18:21:02 +0200
Subject: i2c: Multiplexed I2C bus core support

Add multiplexed bus core support. I2C multiplexer and switches
like pca954x get instantiated as new adapters per port.

Signed-off-by: Michael Lawnick <ml.lawnick@gmx.de>
Acked-by: Rodolfo Giometti <giometti@linux.it>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/i2c/Kconfig     |  11 ++++
 drivers/i2c/Makefile    |   1 +
 drivers/i2c/i2c-core.c  |  64 +++++++++++++++++--
 drivers/i2c/i2c-dev.c   |  40 +++++++++++-
 drivers/i2c/i2c-mux.c   | 165 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c-mux.h |  46 ++++++++++++++
 include/linux/i2c.h     |   8 +++
 7 files changed, 327 insertions(+), 8 deletions(-)
 create mode 100644 drivers/i2c/i2c-mux.c
 create mode 100644 include/linux/i2c-mux.h

(limited to 'include/linux')

diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig
index d06083fdffbb..efb48ad1ba34 100644
--- a/drivers/i2c/Kconfig
+++ b/drivers/i2c/Kconfig
@@ -47,6 +47,17 @@ config I2C_CHARDEV
 	  This support is also available as a module.  If so, the module 
 	  will be called i2c-dev.
 
+config I2C_MUX
+	tristate "I2C bus multiplexing support"
+	depends on EXPERIMENTAL
+	help
+	  Say Y here if you want the I2C core to support the ability to
+	  handle multiplexed I2C bus topologies, by presenting each
+	  multiplexed segment as a I2C adapter.
+
+	  This support is also available as a module.  If so, the module
+	  will be called i2c-mux.
+
 config I2C_HELPER_AUTO
 	bool "Autoselect pertinent helper modules"
 	default y
diff --git a/drivers/i2c/Makefile b/drivers/i2c/Makefile
index a7d9b4be9bb3..f363258daa3d 100644
--- a/drivers/i2c/Makefile
+++ b/drivers/i2c/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_I2C_BOARDINFO)	+= i2c-boardinfo.o
 obj-$(CONFIG_I2C)		+= i2c-core.o
 obj-$(CONFIG_I2C_SMBUS)		+= i2c-smbus.o
 obj-$(CONFIG_I2C_CHARDEV)	+= i2c-dev.o
+obj-$(CONFIG_I2C_MUX)		+= i2c-mux.o
 obj-y				+= algos/ busses/
 
 ifeq ($(CONFIG_I2C_DEBUG_CORE),y)
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 97f96b66653c..6649176de940 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -20,7 +20,9 @@
 /* With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi>.
    All SMBus-related things are written by Frodo Looijaard <frodol@dds.nl>
    SMBus 2.0 support by Mark Studebaker <mdsxyz123@yahoo.com> and
-   Jean Delvare <khali@linux-fr.org> */
+   Jean Delvare <khali@linux-fr.org>
+   Mux support by Rodolfo Giometti <giometti@enneenne.com> and
+   Michael Lawnick <michael.lawnick.ext@nsn.com> */
 
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -423,10 +425,48 @@ static int __i2c_check_addr_busy(struct device *dev, void *addrp)
 	return 0;
 }
 
+/* walk up mux tree */
+static int i2c_check_mux_parents(struct i2c_adapter *adapter, int addr)
+{
+	int result;
+
+	result = device_for_each_child(&adapter->dev, &addr,
+					__i2c_check_addr_busy);
+
+	if (!result && i2c_parent_is_i2c_adapter(adapter))
+		result = i2c_check_mux_parents(
+				    to_i2c_adapter(adapter->dev.parent), addr);
+
+	return result;
+}
+
+/* recurse down mux tree */
+static int i2c_check_mux_children(struct device *dev, void *addrp)
+{
+	int result;
+
+	if (dev->type == &i2c_adapter_type)
+		result = device_for_each_child(dev, addrp,
+						i2c_check_mux_children);
+	else
+		result = __i2c_check_addr_busy(dev, addrp);
+
+	return result;
+}
+
 static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr)
 {
-	return device_for_each_child(&adapter->dev, &addr,
-				     __i2c_check_addr_busy);
+	int result = 0;
+
+	if (i2c_parent_is_i2c_adapter(adapter))
+		result = i2c_check_mux_parents(
+				    to_i2c_adapter(adapter->dev.parent), addr);
+
+	if (!result)
+		result = device_for_each_child(&adapter->dev, &addr,
+						i2c_check_mux_children);
+
+	return result;
 }
 
 /**
@@ -435,7 +475,10 @@ static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr)
  */
 void i2c_lock_adapter(struct i2c_adapter *adapter)
 {
-	rt_mutex_lock(&adapter->bus_lock);
+	if (i2c_parent_is_i2c_adapter(adapter))
+		i2c_lock_adapter(to_i2c_adapter(adapter->dev.parent));
+	else
+		rt_mutex_lock(&adapter->bus_lock);
 }
 EXPORT_SYMBOL_GPL(i2c_lock_adapter);
 
@@ -445,7 +488,10 @@ EXPORT_SYMBOL_GPL(i2c_lock_adapter);
  */
 static int i2c_trylock_adapter(struct i2c_adapter *adapter)
 {
-	return rt_mutex_trylock(&adapter->bus_lock);
+	if (i2c_parent_is_i2c_adapter(adapter))
+		return i2c_trylock_adapter(to_i2c_adapter(adapter->dev.parent));
+	else
+		return rt_mutex_trylock(&adapter->bus_lock);
 }
 
 /**
@@ -454,7 +500,10 @@ static int i2c_trylock_adapter(struct i2c_adapter *adapter)
  */
 void i2c_unlock_adapter(struct i2c_adapter *adapter)
 {
-	rt_mutex_unlock(&adapter->bus_lock);
+	if (i2c_parent_is_i2c_adapter(adapter))
+		i2c_unlock_adapter(to_i2c_adapter(adapter->dev.parent));
+	else
+		rt_mutex_unlock(&adapter->bus_lock);
 }
 EXPORT_SYMBOL_GPL(i2c_unlock_adapter);
 
@@ -743,10 +792,11 @@ static const struct attribute_group *i2c_adapter_attr_groups[] = {
 	NULL
 };
 
-static struct device_type i2c_adapter_type = {
+struct device_type i2c_adapter_type = {
 	.groups		= i2c_adapter_attr_groups,
 	.release	= i2c_adapter_dev_release,
 };
+EXPORT_SYMBOL_GPL(i2c_adapter_type);
 
 #ifdef CONFIG_I2C_COMPAT
 static struct class_compat *i2c_adapter_compat_class;
diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c
index 0b0427f7d348..5f3a52d517c3 100644
--- a/drivers/i2c/i2c-dev.c
+++ b/drivers/i2c/i2c-dev.c
@@ -189,12 +189,50 @@ static int i2cdev_check(struct device *dev, void *addrp)
 	return dev->driver ? -EBUSY : 0;
 }
 
+/* walk up mux tree */
+static int i2cdev_check_mux_parents(struct i2c_adapter *adapter, int addr)
+{
+	int result;
+
+	result = device_for_each_child(&adapter->dev, &addr, i2cdev_check);
+
+	if (!result && i2c_parent_is_i2c_adapter(adapter))
+		result = i2cdev_check_mux_parents(
+				    to_i2c_adapter(adapter->dev.parent), addr);
+
+	return result;
+}
+
+/* recurse down mux tree */
+static int i2cdev_check_mux_children(struct device *dev, void *addrp)
+{
+	int result;
+
+	if (dev->type == &i2c_adapter_type)
+		result = device_for_each_child(dev, addrp,
+						i2cdev_check_mux_children);
+	else
+		result = i2cdev_check(dev, addrp);
+
+	return result;
+}
+
 /* This address checking function differs from the one in i2c-core
    in that it considers an address with a registered device, but no
    driver bound to it, as NOT busy. */
 static int i2cdev_check_addr(struct i2c_adapter *adapter, unsigned int addr)
 {
-	return device_for_each_child(&adapter->dev, &addr, i2cdev_check);
+	int result = 0;
+
+	if (i2c_parent_is_i2c_adapter(adapter))
+		result = i2cdev_check_mux_parents(
+				    to_i2c_adapter(adapter->dev.parent), addr);
+
+	if (!result)
+		result = device_for_each_child(&adapter->dev, &addr,
+						i2cdev_check_mux_children);
+
+	return result;
 }
 
 static noinline int i2cdev_ioctl_rdrw(struct i2c_client *client,
diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c
new file mode 100644
index 000000000000..d32a4843fc3a
--- /dev/null
+++ b/drivers/i2c/i2c-mux.c
@@ -0,0 +1,165 @@
+/*
+ * Multiplexed I2C bus driver.
+ *
+ * Copyright (c) 2008-2009 Rodolfo Giometti <giometti@linux.it>
+ * Copyright (c) 2008-2009 Eurotech S.p.A. <info@eurotech.it>
+ * Copyright (c) 2009-2010 NSN GmbH & Co KG <michael.lawnick.ext@nsn.com>
+ *
+ * Simplifies access to complex multiplexed I2C bus topologies, by presenting
+ * each multiplexed bus segment as an additional I2C adapter.
+ * Supports multi-level mux'ing (mux behind a mux).
+ *
+ * Based on:
+ *	i2c-virt.c from Kumar Gala <galak@kernel.crashing.org>
+ *	i2c-virtual.c from Ken Harrenstien, Copyright (c) 2004 Google, Inc.
+ *	i2c-virtual.c from Brian Kuschak <bkuschak@yahoo.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+
+/* multiplexer per channel data */
+struct i2c_mux_priv {
+	struct i2c_adapter adap;
+	struct i2c_algorithm algo;
+
+	struct i2c_adapter *parent;
+	void *mux_dev;	/* the mux chip/device */
+	u32  chan_id;	/* the channel id */
+
+	int (*select)(struct i2c_adapter *, void *mux_dev, u32 chan_id);
+	int (*deselect)(struct i2c_adapter *, void *mux_dev, u32 chan_id);
+};
+
+static int i2c_mux_master_xfer(struct i2c_adapter *adap,
+			       struct i2c_msg msgs[], int num)
+{
+	struct i2c_mux_priv *priv = adap->algo_data;
+	struct i2c_adapter *parent = priv->parent;
+	int ret;
+
+	/* Switch to the right mux port and perform the transfer. */
+
+	ret = priv->select(parent, priv->mux_dev, priv->chan_id);
+	if (ret >= 0)
+		ret = parent->algo->master_xfer(parent, msgs, num);
+	if (priv->deselect)
+		priv->deselect(parent, priv->mux_dev, priv->chan_id);
+
+	return ret;
+}
+
+static int i2c_mux_smbus_xfer(struct i2c_adapter *adap,
+			      u16 addr, unsigned short flags,
+			      char read_write, u8 command,
+			      int size, union i2c_smbus_data *data)
+{
+	struct i2c_mux_priv *priv = adap->algo_data;
+	struct i2c_adapter *parent = priv->parent;
+	int ret;
+
+	/* Select the right mux port and perform the transfer. */
+
+	ret = priv->select(parent, priv->mux_dev, priv->chan_id);
+	if (ret >= 0)
+		ret = parent->algo->smbus_xfer(parent, addr, flags,
+					read_write, command, size, data);
+	if (priv->deselect)
+		priv->deselect(parent, priv->mux_dev, priv->chan_id);
+
+	return ret;
+}
+
+/* Return the parent's functionality */
+static u32 i2c_mux_functionality(struct i2c_adapter *adap)
+{
+	struct i2c_mux_priv *priv = adap->algo_data;
+	struct i2c_adapter *parent = priv->parent;
+
+	return parent->algo->functionality(parent);
+}
+
+struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
+				void *mux_dev, u32 force_nr, u32 chan_id,
+				int (*select) (struct i2c_adapter *,
+					       void *, u32),
+				int (*deselect) (struct i2c_adapter *,
+						 void *, u32))
+{
+	struct i2c_mux_priv *priv;
+	int ret;
+
+	priv = kzalloc(sizeof(struct i2c_mux_priv), GFP_KERNEL);
+	if (!priv)
+		return NULL;
+
+	/* Set up private adapter data */
+	priv->parent = parent;
+	priv->mux_dev = mux_dev;
+	priv->chan_id = chan_id;
+	priv->select = select;
+	priv->deselect = deselect;
+
+	/* Need to do algo dynamically because we don't know ahead
+	 * of time what sort of physical adapter we'll be dealing with.
+	 */
+	if (parent->algo->master_xfer)
+		priv->algo.master_xfer = i2c_mux_master_xfer;
+	if (parent->algo->smbus_xfer)
+		priv->algo.smbus_xfer = i2c_mux_smbus_xfer;
+	priv->algo.functionality = i2c_mux_functionality;
+
+	/* Now fill out new adapter structure */
+	snprintf(priv->adap.name, sizeof(priv->adap.name),
+		 "i2c-%d-mux (chan_id %d)", i2c_adapter_id(parent), chan_id);
+	priv->adap.owner = THIS_MODULE;
+	priv->adap.id = parent->id;
+	priv->adap.algo = &priv->algo;
+	priv->adap.algo_data = priv;
+	priv->adap.dev.parent = &parent->dev;
+
+	if (force_nr) {
+		priv->adap.nr = force_nr;
+		ret = i2c_add_numbered_adapter(&priv->adap);
+	} else {
+		ret = i2c_add_adapter(&priv->adap);
+	}
+	if (ret < 0) {
+		dev_err(&parent->dev,
+			"failed to add mux-adapter (error=%d)\n",
+			ret);
+		kfree(priv);
+		return NULL;
+	}
+
+	dev_info(&parent->dev, "Added multiplexed i2c bus %d\n",
+		 i2c_adapter_id(&priv->adap));
+
+	return &priv->adap;
+}
+EXPORT_SYMBOL_GPL(i2c_add_mux_adapter);
+
+int i2c_del_mux_adapter(struct i2c_adapter *adap)
+{
+	struct i2c_mux_priv *priv = adap->algo_data;
+	int ret;
+
+	ret = i2c_del_adapter(adap);
+	if (ret < 0)
+		return ret;
+	kfree(priv);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(i2c_del_mux_adapter);
+
+MODULE_AUTHOR("Rodolfo Giometti <giometti@linux.it>");
+MODULE_DESCRIPTION("I2C driver for multiplexed I2C busses");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h
new file mode 100644
index 000000000000..34536effd652
--- /dev/null
+++ b/include/linux/i2c-mux.h
@@ -0,0 +1,46 @@
+/*
+ *
+ * i2c-mux.h - functions for the i2c-bus mux support
+ *
+ * Copyright (c) 2008-2009 Rodolfo Giometti <giometti@linux.it>
+ * Copyright (c) 2008-2009 Eurotech S.p.A. <info@eurotech.it>
+ * Michael Lawnick <michael.lawnick.ext@nsn.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _LINUX_I2C_MUX_H
+#define _LINUX_I2C_MUX_H
+
+#ifdef __KERNEL__
+
+/*
+ * Called to create a i2c bus on a multiplexed bus segment.
+ * The mux_dev and chan_id parameters are passed to the select
+ * and deselect callback functions to perform hardware-specific
+ * mux control.
+ */
+struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
+				void *mux_dev, u32 force_nr, u32 chan_id,
+				int (*select) (struct i2c_adapter *,
+					       void *mux_dev, u32 chan_id),
+				int (*deselect) (struct i2c_adapter *,
+						 void *mux_dev, u32 chan_id));
+
+int i2c_del_mux_adapter(struct i2c_adapter *adap);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_I2C_MUX_H */
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 798bad8741e4..4bae0b72ed3c 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -37,6 +37,7 @@
 #include <linux/of.h>		/* for struct device_node */
 
 extern struct bus_type i2c_bus_type;
+extern struct device_type i2c_adapter_type;
 
 /* --- General options ------------------------------------------------	*/
 
@@ -383,6 +384,13 @@ static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data)
 	dev_set_drvdata(&dev->dev, data);
 }
 
+static inline int i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter)
+{
+	return adapter->dev.parent != NULL
+		&& adapter->dev.parent->bus == &i2c_bus_type
+		&& adapter->dev.parent->type == &i2c_adapter_type;
+}
+
 /* Adapter locking functions, exported for shared pin cases */
 void i2c_lock_adapter(struct i2c_adapter *);
 void i2c_unlock_adapter(struct i2c_adapter *);
-- 
cgit v1.2.3-59-g8ed1b


From 7f528135da9704d67db1f727162024b078e1cd8f Mon Sep 17 00:00:00 2001
From: Michael Lawnick <ml.lawnick@gmx.de>
Date: Wed, 11 Aug 2010 18:21:03 +0200
Subject: i2c: I2C bus multiplexer driver pca954x

I2C driver for PCA954x I2C multiplexer series.

Signed-off-by: Michael Lawnick <ml.lawnick@gmx.de>
Acked-by: Rodolfo Giometti <giometti@linux.it>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/i2c/Kconfig         |   2 +
 drivers/i2c/Makefile        |   2 +-
 drivers/i2c/muxes/Kconfig   |  18 +++
 drivers/i2c/muxes/Makefile  |   8 ++
 drivers/i2c/muxes/pca954x.c | 301 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c/pca954x.h |  47 +++++++
 6 files changed, 377 insertions(+), 1 deletion(-)
 create mode 100644 drivers/i2c/muxes/Kconfig
 create mode 100644 drivers/i2c/muxes/Makefile
 create mode 100644 drivers/i2c/muxes/pca954x.c
 create mode 100644 include/linux/i2c/pca954x.h

(limited to 'include/linux')

diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig
index efb48ad1ba34..30f06e956bfb 100644
--- a/drivers/i2c/Kconfig
+++ b/drivers/i2c/Kconfig
@@ -58,6 +58,8 @@ config I2C_MUX
 	  This support is also available as a module.  If so, the module
 	  will be called i2c-mux.
 
+source drivers/i2c/muxes/Kconfig
+
 config I2C_HELPER_AUTO
 	bool "Autoselect pertinent helper modules"
 	default y
diff --git a/drivers/i2c/Makefile b/drivers/i2c/Makefile
index f363258daa3d..c00fd66388f5 100644
--- a/drivers/i2c/Makefile
+++ b/drivers/i2c/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_I2C)		+= i2c-core.o
 obj-$(CONFIG_I2C_SMBUS)		+= i2c-smbus.o
 obj-$(CONFIG_I2C_CHARDEV)	+= i2c-dev.o
 obj-$(CONFIG_I2C_MUX)		+= i2c-mux.o
-obj-y				+= algos/ busses/
+obj-y				+= algos/ busses/ muxes/
 
 ifeq ($(CONFIG_I2C_DEBUG_CORE),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig
new file mode 100644
index 000000000000..4c9a99c4fcb0
--- /dev/null
+++ b/drivers/i2c/muxes/Kconfig
@@ -0,0 +1,18 @@
+#
+# Multiplexer I2C chip drivers configuration
+#
+
+menu "Multiplexer I2C Chip support"
+	depends on I2C_MUX
+
+config I2C_MUX_PCA954x
+	tristate "Philips PCA954x I2C Mux/switches"
+	depends on EXPERIMENTAL
+	help
+	  If you say yes here you get support for the Philips PCA954x
+	  I2C mux/switch devices.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called pca954x.
+
+endmenu
diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile
new file mode 100644
index 000000000000..bd83b5274815
--- /dev/null
+++ b/drivers/i2c/muxes/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for multiplexer I2C chip drivers.
+
+obj-$(CONFIG_I2C_MUX_PCA954x)	+= pca954x.o
+
+ifeq ($(CONFIG_I2C_DEBUG_BUS),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
diff --git a/drivers/i2c/muxes/pca954x.c b/drivers/i2c/muxes/pca954x.c
new file mode 100644
index 000000000000..6f9accf3189d
--- /dev/null
+++ b/drivers/i2c/muxes/pca954x.c
@@ -0,0 +1,301 @@
+/*
+ * I2C multiplexer
+ *
+ * Copyright (c) 2008-2009 Rodolfo Giometti <giometti@linux.it>
+ * Copyright (c) 2008-2009 Eurotech S.p.A. <info@eurotech.it>
+ *
+ * This module supports the PCA954x series of I2C multiplexer/switch chips
+ * made by Philips Semiconductors.
+ * This includes the:
+ *	 PCA9540, PCA9542, PCA9543, PCA9544, PCA9545, PCA9546, PCA9547
+ *	 and PCA9548.
+ *
+ * These chips are all controlled via the I2C bus itself, and all have a
+ * single 8-bit register. The upstream "parent" bus fans out to two,
+ * four, or eight downstream busses or channels; which of these
+ * are selected is determined by the chip type and register contents. A
+ * mux can select only one sub-bus at a time; a switch can select any
+ * combination simultaneously.
+ *
+ * Based on:
+ *	pca954x.c from Kumar Gala <galak@kernel.crashing.org>
+ * Copyright (C) 2006
+ *
+ * Based on:
+ *	pca954x.c from Ken Harrenstien
+ * Copyright (C) 2004 Google, Inc. (Ken Harrenstien)
+ *
+ * Based on:
+ *	i2c-virtual_cb.c from Brian Kuschak <bkuschak@yahoo.com>
+ * and
+ *	pca9540.c from Jean Delvare <khali@linux-fr.org>.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+
+#include <linux/i2c/pca954x.h>
+
+#define PCA954X_MAX_NCHANS 8
+
+enum pca_type {
+	pca_9540,
+	pca_9542,
+	pca_9543,
+	pca_9544,
+	pca_9545,
+	pca_9546,
+	pca_9547,
+	pca_9548,
+};
+
+struct pca954x {
+	enum pca_type type;
+	struct i2c_adapter *virt_adaps[PCA954X_MAX_NCHANS];
+
+	u8 last_chan;		/* last register value */
+};
+
+struct chip_desc {
+	u8 nchans;
+	u8 enable;	/* used for muxes only */
+	enum muxtype {
+		pca954x_ismux = 0,
+		pca954x_isswi
+	} muxtype;
+};
+
+/* Provide specs for the PCA954x types we know about */
+static const struct chip_desc chips[] = {
+	[pca_9540] = {
+		.nchans = 2,
+		.enable = 0x4,
+		.muxtype = pca954x_ismux,
+	},
+	[pca_9543] = {
+		.nchans = 2,
+		.muxtype = pca954x_isswi,
+	},
+	[pca_9544] = {
+		.nchans = 4,
+		.enable = 0x4,
+		.muxtype = pca954x_ismux,
+	},
+	[pca_9545] = {
+		.nchans = 4,
+		.muxtype = pca954x_isswi,
+	},
+	[pca_9547] = {
+		.nchans = 8,
+		.enable = 0x8,
+		.muxtype = pca954x_ismux,
+	},
+	[pca_9548] = {
+		.nchans = 8,
+		.muxtype = pca954x_isswi,
+	},
+};
+
+static const struct i2c_device_id pca954x_id[] = {
+	{ "pca9540", pca_9540 },
+	{ "pca9542", pca_9540 },
+	{ "pca9543", pca_9543 },
+	{ "pca9544", pca_9544 },
+	{ "pca9545", pca_9545 },
+	{ "pca9546", pca_9545 },
+	{ "pca9547", pca_9547 },
+	{ "pca9548", pca_9548 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, pca954x_id);
+
+/* Write to mux register. Don't use i2c_transfer()/i2c_smbus_xfer()
+   for this as they will try to lock adapter a second time */
+static int pca954x_reg_write(struct i2c_adapter *adap,
+			     struct i2c_client *client, u8 val)
+{
+	int ret = -ENODEV;
+
+	if (adap->algo->master_xfer) {
+		struct i2c_msg msg;
+		char buf[1];
+
+		msg.addr = client->addr;
+		msg.flags = 0;
+		msg.len = 1;
+		buf[0] = val;
+		msg.buf = buf;
+		ret = adap->algo->master_xfer(adap, &msg, 1);
+	} else {
+		union i2c_smbus_data data;
+		ret = adap->algo->smbus_xfer(adap, client->addr,
+					     client->flags,
+					     I2C_SMBUS_WRITE,
+					     val, I2C_SMBUS_BYTE, &data);
+	}
+
+	return ret;
+}
+
+static int pca954x_select_chan(struct i2c_adapter *adap,
+			       void *client, u32 chan)
+{
+	struct pca954x *data = i2c_get_clientdata(client);
+	const struct chip_desc *chip = &chips[data->type];
+	u8 regval;
+	int ret = 0;
+
+	/* we make switches look like muxes, not sure how to be smarter */
+	if (chip->muxtype == pca954x_ismux)
+		regval = chan | chip->enable;
+	else
+		regval = 1 << chan;
+
+	/* Only select the channel if its different from the last channel */
+	if (data->last_chan != regval) {
+		ret = pca954x_reg_write(adap, client, regval);
+		data->last_chan = regval;
+	}
+
+	return ret;
+}
+
+static int pca954x_deselect_mux(struct i2c_adapter *adap,
+				void *client, u32 chan)
+{
+	struct pca954x *data = i2c_get_clientdata(client);
+
+	/* Deselect active channel */
+	data->last_chan = 0;
+	return pca954x_reg_write(adap, client, data->last_chan);
+}
+
+/*
+ * I2C init/probing/exit functions
+ */
+static int __devinit pca954x_probe(struct i2c_client *client,
+				   const struct i2c_device_id *id)
+{
+	struct i2c_adapter *adap = to_i2c_adapter(client->dev.parent);
+	struct pca954x_platform_data *pdata = client->dev.platform_data;
+	int num, force;
+	struct pca954x *data;
+	int ret = -ENODEV;
+
+	if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_BYTE))
+		goto err;
+
+	data = kzalloc(sizeof(struct pca954x), GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	i2c_set_clientdata(client, data);
+
+	/* Read the mux register at addr to verify
+	 * that the mux is in fact present.
+	 */
+	if (i2c_smbus_read_byte(client) < 0) {
+		dev_warn(&client->dev, "probe failed\n");
+		goto exit_free;
+	}
+
+	data->type = id->driver_data;
+	data->last_chan = 0;		   /* force the first selection */
+
+	/* Now create an adapter for each channel */
+	for (num = 0; num < chips[data->type].nchans; num++) {
+		force = 0;			  /* dynamic adap number */
+		if (pdata) {
+			if (num < pdata->num_modes)
+				/* force static number */
+				force = pdata->modes[num].adap_id;
+			else
+				/* discard unconfigured channels */
+				break;
+		}
+
+		data->virt_adaps[num] =
+			i2c_add_mux_adapter(adap, client,
+				force, num, pca954x_select_chan,
+				(pdata && pdata->modes[num].deselect_on_exit)
+					? pca954x_deselect_mux : NULL);
+
+		if (data->virt_adaps[num] == NULL) {
+			ret = -ENODEV;
+			dev_err(&client->dev,
+				"failed to register multiplexed adapter"
+				" %d as bus %d\n", num, force);
+			goto virt_reg_failed;
+		}
+	}
+
+	dev_info(&client->dev,
+		 "registered %d multiplexed busses for I2C %s %s\n",
+		 num, chips[data->type].muxtype == pca954x_ismux
+				? "mux" : "switch", client->name);
+
+	return 0;
+
+virt_reg_failed:
+	for (num--; num >= 0; num--)
+		i2c_del_mux_adapter(data->virt_adaps[num]);
+exit_free:
+	kfree(data);
+err:
+	return ret;
+}
+
+static int __devexit pca954x_remove(struct i2c_client *client)
+{
+	struct pca954x *data = i2c_get_clientdata(client);
+	const struct chip_desc *chip = &chips[data->type];
+	int i, err;
+
+	for (i = 0; i < chip->nchans; ++i)
+		if (data->virt_adaps[i]) {
+			err = i2c_del_mux_adapter(data->virt_adaps[i]);
+			if (err)
+				return err;
+			data->virt_adaps[i] = NULL;
+		}
+
+	kfree(data);
+	return 0;
+}
+
+static struct i2c_driver pca954x_driver = {
+	.driver		= {
+		.name	= "pca954x",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= pca954x_probe,
+	.remove		= __devexit_p(pca954x_remove),
+	.id_table	= pca954x_id,
+};
+
+static int __init pca954x_init(void)
+{
+	return i2c_add_driver(&pca954x_driver);
+}
+
+static void __exit pca954x_exit(void)
+{
+	i2c_del_driver(&pca954x_driver);
+}
+
+module_init(pca954x_init);
+module_exit(pca954x_exit);
+
+MODULE_AUTHOR("Rodolfo Giometti <giometti@linux.it>");
+MODULE_DESCRIPTION("PCA954x I2C mux/switch driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/i2c/pca954x.h b/include/linux/i2c/pca954x.h
new file mode 100644
index 000000000000..28f1f8d5ab1f
--- /dev/null
+++ b/include/linux/i2c/pca954x.h
@@ -0,0 +1,47 @@
+/*
+ *
+ * pca954x.h - I2C multiplexer/switch support
+ *
+ * Copyright (c) 2008-2009 Rodolfo Giometti <giometti@linux.it>
+ * Copyright (c) 2008-2009 Eurotech S.p.A. <info@eurotech.it>
+ * Michael Lawnick <michael.lawnick.ext@nsn.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#ifndef _LINUX_I2C_PCA954X_H
+#define _LINUX_I2C_PCA954X_H
+
+/* Platform data for the PCA954x I2C multiplexers */
+
+/* Per channel initialisation data:
+ * @adap_id: bus number for the adapter. 0 = don't care
+ * @deselect_on_exit: set this entry to 1, if your H/W needs deselection
+ *                    of this channel after transaction.
+ *
+ */
+struct pca954x_platform_mode {
+	int		adap_id;
+	unsigned int	deselect_on_exit:1;
+};
+
+/* Per mux/switch data, used with i2c_register_board_info */
+struct pca954x_platform_data {
+	struct pca954x_platform_mode *modes;
+	int num_modes;
+};
+
+#endif /* _LINUX_I2C_PCA954X_H */
-- 
cgit v1.2.3-59-g8ed1b


From d5302fe41ffb28d0a48be6a71becba36d3453ae0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 12 Aug 2010 06:38:24 +1000
Subject: Make lib/raid6/test build correctly.

Some bit-rot needs to be cleaned out.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/raid/pq.h | 2 ++
 lib/raid6/algos.c       | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 1cbbd2c11aa9..2b59cc824395 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -62,7 +62,9 @@ extern const char raid6_empty_zero_page[PAGE_SIZE];
 #define disable_kernel_altivec()
 
 #define EXPORT_SYMBOL(sym)
+#define EXPORT_SYMBOL_GPL(sym)
 #define MODULE_LICENSE(licence)
+#define MODULE_DESCRIPTION(desc)
 #define subsys_initcall(x)
 #define module_exit(x)
 #endif /* __KERNEL__ */
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 1f8784bfd44d..df7ff72777b8 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -17,11 +17,11 @@
  */
 
 #include <linux/raid/pq.h>
-#include <linux/gfp.h>
 #ifndef __KERNEL__
 #include <sys/mman.h>
 #include <stdio.h>
 #else
+#include <linux/gfp.h>
 #if !RAID6_USE_EMPTY_ZERO_PAGE
 /* In .bss so it's zeroed */
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-- 
cgit v1.2.3-59-g8ed1b


From 71beefe8c0b9e12ed4ac9fe13b98a035ddeaf8eb Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 12 Aug 2010 11:44:50 +1000
Subject: vgaarb: drop vga.h include

We don't actually need this include on any platform.

built on powerpc + x86, reported on m68k.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/vgaarb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
index 814f294d4cd0..6228b5b77d35 100644
--- a/include/linux/vgaarb.h
+++ b/include/linux/vgaarb.h
@@ -31,7 +31,6 @@
 #ifndef LINUX_VGA_H
 #define LINUX_VGA_H
 
-#include <asm/vga.h>
 
 /* Legacy VGA regions */
 #define VGA_RSRC_NONE	       0x00
-- 
cgit v1.2.3-59-g8ed1b


From 31ce4bfdfd10bf5db9bf85c92bbe0cf2edbdcad8 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 12 Aug 2010 11:47:50 +1000
Subject: io-mapping: move asm include inside the config option

nouveau starting using these APIs, the first on non-x86 hw, and this
include isn't required on anything with real amounts of vmalloc space.

this fixes a build problem on powerpc.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/io-mapping.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index e0ea40f6c515..0a6b3d5c490c 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -22,7 +22,6 @@
 #include <linux/slab.h>
 #include <asm/io.h>
 #include <asm/page.h>
-#include <asm/iomap.h>
 
 /*
  * The io_mapping mechanism provides an abstraction for mapping
@@ -33,6 +32,8 @@
 
 #ifdef CONFIG_HAVE_ATOMIC_IOMAP
 
+#include <asm/iomap.h>
+
 struct io_mapping {
 	resource_size_t base;
 	unsigned long size;
-- 
cgit v1.2.3-59-g8ed1b


From a5664dad7e1a278d2915c2bf79cf42250e12d7db Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 12 Aug 2010 04:14:01 +0100
Subject: dm ioctl: make bio or request based device type immutable

Determine whether a mapped device is bio-based or request-based when
loading its first (inactive) table and don't allow that to be changed
later.

This patch performs different device initialisation in each of the two
cases.  (We don't think it's necessary to add code to support changing
between the two types.)

Allowed md->type transitions:
  DM_TYPE_NONE to DM_TYPE_BIO_BASED
  DM_TYPE_NONE to DM_TYPE_REQUEST_BASED

We now prevent table_load from replacing the inactive table with a
conflicting type of table even after an explicit table_clear.

Introduce 'type_lock' into the struct mapped_device to protect md->type
and to prepare for the next patch that will change the queue
initialization and allocate memory while md->type_lock is held.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

 drivers/md/dm-ioctl.c    |   15 +++++++++++++++
 drivers/md/dm.c          |   37 ++++++++++++++++++++++++++++++-------
 drivers/md/dm.h          |    5 +++++
 include/linux/dm-ioctl.h |    4 ++--
 4 files changed, 52 insertions(+), 9 deletions(-)
---
 drivers/md/dm-ioctl.c    | 15 +++++++++++++++
 drivers/md/dm.c          | 37 ++++++++++++++++++++++++++++++-------
 drivers/md/dm.h          |  5 +++++
 include/linux/dm-ioctl.h |  4 ++--
 4 files changed, 52 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 3fd8f0e169e7..4702f380cb45 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1189,6 +1189,21 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}
 
+	/* Protect md->type against concurrent table loads. */
+	dm_lock_md_type(md);
+	if (dm_get_md_type(md) == DM_TYPE_NONE)
+		/* Initial table load: acquire type of table. */
+		dm_set_md_type(md, dm_table_get_type(t));
+	else if (dm_get_md_type(md) != dm_table_get_type(t)) {
+		DMWARN("can't change device type after initial table load.");
+		dm_table_destroy(t);
+		dm_unlock_md_type(md);
+		r = -EINVAL;
+		goto out;
+	}
+	dm_unlock_md_type(md);
+
+	/* stage inactive table */
 	down_write(&_hash_lock);
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f3cc5d99fe8d..345e94c10c65 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -125,6 +125,10 @@ struct mapped_device {
 	unsigned long flags;
 
 	struct request_queue *queue;
+	unsigned type;
+	/* Protect type against concurrent access. */
+	struct mutex type_lock;
+
 	struct gendisk *disk;
 	char name[16];
 
@@ -1877,8 +1881,10 @@ static struct mapped_device *alloc_dev(int minor)
 	if (r < 0)
 		goto bad_minor;
 
+	md->type = DM_TYPE_NONE;
 	init_rwsem(&md->io_lock);
 	mutex_init(&md->suspend_lock);
+	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
 	spin_lock_init(&md->barrier_error_lock);
 	rwlock_init(&md->map_lock);
@@ -2130,6 +2136,30 @@ int dm_create(int minor, struct mapped_device **result)
 	return 0;
 }
 
+/*
+ * Functions to manage md->type.
+ * All are required to hold md->type_lock.
+ */
+void dm_lock_md_type(struct mapped_device *md)
+{
+	mutex_lock(&md->type_lock);
+}
+
+void dm_unlock_md_type(struct mapped_device *md)
+{
+	mutex_unlock(&md->type_lock);
+}
+
+void dm_set_md_type(struct mapped_device *md, unsigned type)
+{
+	md->type = type;
+}
+
+unsigned dm_get_md_type(struct mapped_device *md)
+{
+	return md->type;
+}
+
 static struct mapped_device *dm_find_md(dev_t dev)
 {
 	struct mapped_device *md;
@@ -2440,13 +2470,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 		goto out;
 	}
 
-	/* cannot change the device type, once a table is bound */
-	if (md->map &&
-	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
-		DMWARN("can't change the device type after a table is bound");
-		goto out;
-	}
-
 	map = __bind(md, table, &limits);
 
 out:
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 8223671e4901..1db782530ce6 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -66,6 +66,11 @@ int dm_table_alloc_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
+void dm_lock_md_type(struct mapped_device *md);
+void dm_unlock_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, unsigned type);
+unsigned dm_get_md_type(struct mapped_device *md);
+
 /*
  * To check the return value from dm_table_find_target().
  */
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 2c445e113790..43b2de17449b 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -266,9 +266,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	17
+#define DM_VERSION_MINOR	18
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2010-03-05)"
+#define DM_VERSION_EXTRA	"-ioctl (2010-06-29)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3-59-g8ed1b


From 57cba5d3658d9fdc019c6af14a2d80aefa651e56 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 12 Aug 2010 04:14:04 +0100
Subject: dm: rename map_info flush_request to target_request_nr

'target_request_nr' is a more generic name that reflects the fact that
it will be used for both flush and discard support.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c          |  2 +-
 drivers/md/dm-stripe.c        |  6 ++++--
 drivers/md/dm.c               | 18 +++++++++---------
 include/linux/device-mapper.h |  4 ++--
 4 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 96feada5e761..5974d3094d97 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1692,7 +1692,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 	chunk_t chunk;
 
 	if (unlikely(bio_empty_barrier(bio))) {
-		if (!map_context->flush_request)
+		if (!map_context->target_request_nr)
 			bio->bi_bdev = s->origin->bdev;
 		else
 			bio->bi_bdev = s->cow->bdev;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index d6e28d732b4d..22d5e2fdab8b 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -213,10 +213,12 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
 	struct stripe_c *sc = (struct stripe_c *) ti->private;
 	sector_t offset, chunk;
 	uint32_t stripe;
+	unsigned target_request_nr;
 
 	if (unlikely(bio_empty_barrier(bio))) {
-		BUG_ON(map_context->flush_request >= sc->stripes);
-		bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
+		target_request_nr = map_context->target_request_nr;
+		BUG_ON(target_request_nr >= sc->stripes);
+		bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5ae0a05b4811..0d4710175885 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1183,12 +1183,12 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
 }
 
 static void __flush_target(struct clone_info *ci, struct dm_target *ti,
-			  unsigned flush_nr)
+			  unsigned request_nr)
 {
 	struct dm_target_io *tio = alloc_tio(ci, ti);
 	struct bio *clone;
 
-	tio->info.flush_request = flush_nr;
+	tio->info.target_request_nr = request_nr;
 
 	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
 	__bio_clone(clone, ci->bio);
@@ -1199,13 +1199,13 @@ static void __flush_target(struct clone_info *ci, struct dm_target *ti,
 
 static int __clone_and_map_empty_barrier(struct clone_info *ci)
 {
-	unsigned target_nr = 0, flush_nr;
+	unsigned target_nr = 0, request_nr;
 	struct dm_target *ti;
 
 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
-		for (flush_nr = 0; flush_nr < ti->num_flush_requests;
-		     flush_nr++)
-			__flush_target(ci, ti, flush_nr);
+		for (request_nr = 0; request_nr < ti->num_flush_requests;
+		     request_nr++)
+			__flush_target(ci, ti, request_nr);
 
 	ci->sector_count = 0;
 
@@ -2424,11 +2424,11 @@ static void dm_queue_flush(struct mapped_device *md)
 	queue_work(md->wq, &md->work);
 }
 
-static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
+static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
-	tio->info.flush_request = flush_nr;
+	tio->info.target_request_nr = request_nr;
 }
 
 /* Issue barrier requests to targets and wait for their completion. */
@@ -2446,7 +2446,7 @@ static int dm_rq_barrier(struct mapped_device *md)
 		ti = dm_table_get_target(map, i);
 		for (j = 0; j < ti->num_flush_requests; j++) {
 			clone = clone_rq(md->flush_request, md, GFP_NOIO);
-			dm_rq_set_flush_nr(clone, j);
+			dm_rq_set_target_request_nr(clone, j);
 			atomic_inc(&md->pending[rq_data_dir(clone)]);
 			map_request(ti, clone, md);
 		}
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 1381cd97b4ed..531a6f2635ae 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -22,7 +22,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 union map_info {
 	void *ptr;
 	unsigned long long ll;
-	unsigned flush_request;
+	unsigned target_request_nr;
 };
 
 /*
@@ -174,7 +174,7 @@ struct dm_target {
 	 * A number of zero-length barrier requests that will be submitted
 	 * to the target for the purpose of flushing cache.
 	 *
-	 * The request number will be placed in union map_info->flush_request.
+	 * The request number will be placed in union map_info->target_request_nr.
 	 * It is a responsibility of the target driver to remap these requests
 	 * to the real underlying devices.
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From 7e507eb6432afdd798d4c6dccf949b8c43ef151c Mon Sep 17 00:00:00 2001
From: Peter Rajnoha <prajnoha@redhat.com>
Date: Thu, 12 Aug 2010 04:14:05 +0100
Subject: dm: allow autoloading of dm mod

Add devname:mapper/control and MAPPER_CTRL_MINOR module alias
to support dm-mod module autoloading.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Peter Rajnoha <prajnoha@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/devices.txt  | 1 +
 drivers/md/dm-ioctl.c      | 7 +++++--
 include/linux/dm-ioctl.h   | 1 +
 include/linux/miscdevice.h | 1 +
 4 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index f2da781705b2..d0d1df6cb5de 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -445,6 +445,7 @@ Your cooperation is appreciated.
 		233 = /dev/kmview	View-OS A process with a view
 		234 = /dev/btrfs-control	Btrfs control device
 		235 = /dev/autofs	Autofs control device
+		236 = /dev/mapper/control	Device-Mapper control device
 		240-254			Reserved for local use
 		255			Reserved for MISC_DYNAMIC_MINOR
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4d4ced8e4307..3e39193e5036 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1599,12 +1599,15 @@ static const struct file_operations _ctl_fops = {
 };
 
 static struct miscdevice _dm_misc = {
-	.minor 		= MISC_DYNAMIC_MINOR,
+	.minor		= MAPPER_CTRL_MINOR,
 	.name  		= DM_NAME,
-	.nodename	= "mapper/control",
+	.nodename	= DM_DIR "/" DM_CONTROL_NODE,
 	.fops  		= &_ctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR);
+MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE);
+
 /*
  * Create misc character device and link to DM_DIR/control.
  */
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 43b2de17449b..49eab360d5d4 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 
 #define DM_DIR "mapper"		/* Slashes not supported */
+#define DM_CONTROL_NODE "control"
 #define DM_MAX_TYPE_NAME 16
 #define DM_NAME_LEN 128
 #define DM_UUID_LEN 129
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index f6c9b7dcb9fd..bafffc737903 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -38,6 +38,7 @@
 #define KVM_MINOR		232
 #define BTRFS_MINOR		234
 #define AUTOFS_MINOR		235
+#define MAPPER_CTRL_MINOR	236
 #define MISC_DYNAMIC_MINOR	255
 
 struct device;
-- 
cgit v1.2.3-59-g8ed1b


From 5ae89a8720c28caf35c4e53711d77df2856c404e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 12 Aug 2010 04:14:08 +0100
Subject: dm: linear support discard

Allow discards to be passed through to linear mappings if at least one
underlying device supports it.  Discards will be forwarded only to
devices that support them.

A target that supports discards should set num_discard_requests to
indicate how many times each discard request must be submitted to it.

Verify table's underlying devices support discards prior to setting the
associated DM device as capable of discards (via QUEUE_FLAG_DISCARD).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Joe Thornber <thornber@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-linear.c        |  1 +
 drivers/md/dm-table.c         | 44 +++++++++++++++++++++++++++++
 drivers/md/dm.c               | 65 +++++++++++++++++++++++++++++++++++--------
 drivers/md/dm.h               |  1 +
 include/linux/device-mapper.h |  6 ++++
 5 files changed, 105 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9200dbf2391a..f043b5f433b2 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->num_flush_requests = 1;
+	ti->num_discard_requests = 1;
 	ti->private = lc;
 	return 0;
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bc60ef77a0d8..f9fc07d7a4b9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -54,6 +54,8 @@ struct dm_table {
 	sector_t *highs;
 	struct dm_target *targets;
 
+	unsigned discards_supported:1;
+
 	/*
 	 * Indicates the rw permissions for the new logical
 	 * device.  This should be a combination of FMODE_READ
@@ -203,6 +205,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	INIT_LIST_HEAD(&t->devices);
 	atomic_set(&t->holders, 0);
+	t->discards_supported = 1;
 
 	if (!num_targets)
 		num_targets = KEYS_PER_NODE;
@@ -770,6 +773,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
+	if (!tgt->num_discard_requests)
+		t->discards_supported = 0;
+
 	return 0;
 
  bad:
@@ -1135,6 +1141,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
 
+	if (!dm_table_supports_discards(t))
+		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+	else
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+
 	dm_table_set_integrity(t);
 
 	/*
@@ -1281,6 +1292,39 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
 	return t->md;
 }
 
+static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
+				  sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && blk_queue_discard(q);
+}
+
+bool dm_table_supports_discards(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	if (!t->discards_supported)
+		return 0;
+
+	/*
+	 * Ensure that at least one underlying device supports discards.
+	 * t->devices includes internal dm devices such as mirror logs
+	 * so we need to use iterate_devices here, which targets
+	 * supporting discard must provide.
+	 */
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (ti->type->iterate_devices &&
+		    ti->type->iterate_devices(ti, device_discard_capable, NULL))
+			return 1;
+	}
+
+	return 0;
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0d4710175885..44aba29154fc 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1212,6 +1212,53 @@ static int __clone_and_map_empty_barrier(struct clone_info *ci)
 	return 0;
 }
 
+/*
+ * Perform all io with a single clone.
+ */
+static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
+{
+	struct bio *clone, *bio = ci->bio;
+	struct dm_target_io *tio;
+
+	tio = alloc_tio(ci, ti);
+	clone = clone_bio(bio, ci->sector, ci->idx,
+			  bio->bi_vcnt - ci->idx, ci->sector_count,
+			  ci->md->bs);
+	__map_bio(ti, clone, tio);
+	ci->sector_count = 0;
+}
+
+static int __clone_and_map_discard(struct clone_info *ci)
+{
+	struct dm_target *ti;
+	sector_t max;
+
+	ti = dm_table_find_target(ci->map, ci->sector);
+	if (!dm_target_is_valid(ti))
+		return -EIO;
+
+	/*
+	 * Even though the device advertised discard support,
+	 * reconfiguration might have changed that since the
+	 * check was performed.
+	 */
+
+	if (!ti->num_discard_requests)
+		return -EOPNOTSUPP;
+
+	max = max_io_len(ci->md, ci->sector, ti);
+
+	if (ci->sector_count > max)
+		/*
+		 * FIXME: Handle a discard that spans two or more targets.
+		 */
+		return -EOPNOTSUPP;
+
+	__clone_and_map_simple(ci, ti);
+
+	return 0;
+}
+
 static int __clone_and_map(struct clone_info *ci)
 {
 	struct bio *clone, *bio = ci->bio;
@@ -1222,27 +1269,21 @@ static int __clone_and_map(struct clone_info *ci)
 	if (unlikely(bio_empty_barrier(bio)))
 		return __clone_and_map_empty_barrier(ci);
 
+	if (unlikely(bio->bi_rw & REQ_DISCARD))
+		return __clone_and_map_discard(ci);
+
 	ti = dm_table_find_target(ci->map, ci->sector);
 	if (!dm_target_is_valid(ti))
 		return -EIO;
 
 	max = max_io_len(ci->md, ci->sector, ti);
 
-	/*
-	 * Allocate a target io object.
-	 */
-	tio = alloc_tio(ci, ti);
-
 	if (ci->sector_count <= max) {
 		/*
 		 * Optimise for the simple case where we can do all of
 		 * the remaining io with a single clone.
 		 */
-		clone = clone_bio(bio, ci->sector, ci->idx,
-				  bio->bi_vcnt - ci->idx, ci->sector_count,
-				  ci->md->bs);
-		__map_bio(ti, clone, tio);
-		ci->sector_count = 0;
+		__clone_and_map_simple(ci, ti);
 
 	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
 		/*
@@ -1263,6 +1304,7 @@ static int __clone_and_map(struct clone_info *ci)
 			len += bv_len;
 		}
 
+		tio = alloc_tio(ci, ti);
 		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
 				  ci->md->bs);
 		__map_bio(ti, clone, tio);
@@ -1286,12 +1328,11 @@ static int __clone_and_map(struct clone_info *ci)
 					return -EIO;
 
 				max = max_io_len(ci->md, ci->sector, ti);
-
-				tio = alloc_tio(ci, ti);
 			}
 
 			len = min(remaining, max);
 
+			tio = alloc_tio(ci, ti);
 			clone = split_bvec(bio, ci->sector, ci->idx,
 					   bv->bv_offset + offset, len,
 					   ci->md->bs);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0d7b374c5dc2..0c2dd5f4af76 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -61,6 +61,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
+bool dm_table_supports_discards(struct dm_table *t);
 int dm_table_alloc_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 531a6f2635ae..751ce21dea7b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -180,6 +180,12 @@ struct dm_target {
 	 */
 	unsigned num_flush_requests;
 
+	/*
+	 * The number of discard requests that will be submitted to the
+	 * target.  map_info->request_nr is used just like num_flush_requests.
+	 */
+	unsigned num_discard_requests;
+
 	/* target specific data */
 	void *private;
 
-- 
cgit v1.2.3-59-g8ed1b


From 56a67df766039666f61fb15b079f713e44a735ae Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 12 Aug 2010 04:14:10 +0100
Subject: dm: factor out max_io_len_target_boundary

Split max_io_len_target_boundary out of max_io_len so that the discard
support can make use of it without duplicating max_io_len code.

Avoiding max_io_len's split_io logic enables DM's discard support to
submit the entire discard request to a target.  But discards must still
be split on target boundaries.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c               | 26 ++++++++++++++++++--------
 include/linux/device-mapper.h |  6 ++++++
 2 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3dd846e801fb..561313a7dac2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1030,17 +1030,27 @@ static void end_clone_request(struct request *clone, int error)
 	dm_complete_request(clone, error);
 }
 
-static sector_t max_io_len(struct mapped_device *md,
-			   sector_t sector, struct dm_target *ti)
+/*
+ * Return maximum size of I/O possible at the supplied sector up to the current
+ * target boundary.
+ */
+static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
+{
+	sector_t target_offset = dm_target_offset(ti, sector);
+
+	return ti->len - target_offset;
+}
+
+static sector_t max_io_len(sector_t sector, struct dm_target *ti)
 {
-	sector_t offset = sector - ti->begin;
-	sector_t len = ti->len - offset;
+	sector_t len = max_io_len_target_boundary(sector, ti);
 
 	/*
 	 * Does the target need to split even further ?
 	 */
 	if (ti->split_io) {
 		sector_t boundary;
+		sector_t offset = dm_target_offset(ti, sector);
 		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
 			   - offset;
 		if (len > boundary)
@@ -1258,7 +1268,7 @@ static int __clone_and_map_discard(struct clone_info *ci)
 	if (!ti->num_discard_requests)
 		return -EOPNOTSUPP;
 
-	max = max_io_len(ci->md, ci->sector, ti);
+	max = max_io_len(ci->sector, ti);
 
 	if (ci->sector_count > max)
 		/*
@@ -1290,7 +1300,7 @@ static int __clone_and_map(struct clone_info *ci)
 	if (!dm_target_is_valid(ti))
 		return -EIO;
 
-	max = max_io_len(ci->md, ci->sector, ti);
+	max = max_io_len(ci->sector, ti);
 
 	if (ci->sector_count <= max) {
 		/*
@@ -1341,7 +1351,7 @@ static int __clone_and_map(struct clone_info *ci)
 				if (!dm_target_is_valid(ti))
 					return -EIO;
 
-				max = max_io_len(ci->md, ci->sector, ti);
+				max = max_io_len(ci->sector, ti);
 			}
 
 			len = min(remaining, max);
@@ -1428,7 +1438,7 @@ static int dm_merge_bvec(struct request_queue *q,
 	/*
 	 * Find maximum amount of I/O that won't need splitting
 	 */
-	max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
+	max_sectors = min(max_io_len(bvm->bi_sector, ti),
 			  (sector_t) BIO_MAX_SECTORS);
 	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
 	if (max_size < 0)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 751ce21dea7b..2970022faa63 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -398,6 +398,12 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
 #define dm_array_too_big(fixed, obj, num) \
 	((num) > (UINT_MAX - (fixed)) / (obj))
 
+/*
+ * Sector offset taken relative to the start of the target instead of
+ * relative to the start of the device.
+ */
+#define dm_target_offset(ti, sector) ((sector) - (ti)->begin)
+
 static inline sector_t to_sector(unsigned long n)
 {
 	return (n >> SECTOR_SHIFT);
-- 
cgit v1.2.3-59-g8ed1b


From 27e34995e1a863c1e9beba30e51dfe2a083f918d Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin.vincent@stericsson.com>
Date: Fri, 2 Jul 2010 16:52:08 +0530
Subject: mfd: Add STMPE I/O Expander support

Add support for the STMPE family of I/O Expanders from
STMicroelectronics.  These devices include upto 24 gpios and a varying
selection of blocks, including PWM, keypad, and touchscreen controllers.
This patch adds the MFD core.

[l.fu@pengutronix.de: fix stmpe811 enable hook]
[l.fu@pengutronix.de: add touchscreen platform data]
Acked-by: Luotao Fu <l.fu@pengutronix.de>
Acked-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Rabin Vincent <rabin.vincent@stericsson.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/Kconfig       |  23 ++
 drivers/mfd/Makefile      |   1 +
 drivers/mfd/stmpe.c       | 915 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/mfd/stmpe.h       | 176 +++++++++
 include/linux/mfd/stmpe.h | 197 ++++++++++
 5 files changed, 1312 insertions(+)
 create mode 100644 drivers/mfd/stmpe.c
 create mode 100644 drivers/mfd/stmpe.h
 create mode 100644 include/linux/mfd/stmpe.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 29781ca1eb6f..8f5145b4b56f 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -186,6 +186,29 @@ config TWL4030_CODEC
 	select MFD_CORE
 	default n
 
+config MFD_STMPE
+	bool "Support STMicroelectronics STMPE"
+	depends on I2C=y && GENERIC_HARDIRQS
+	select MFD_CORE
+	help
+	  Support for the STMPE family of I/O Expanders from
+	  STMicroelectronics.
+
+	  Currently supported devices are:
+
+		STMPE811: GPIO, Touchscreen
+		STMPE1601: GPIO, Keypad
+		STMPE2401: GPIO, Keypad
+		STMPE2403: GPIO, Keypad
+
+	  This driver provides common support for accessing the device,
+	  additional drivers must be enabled in order to use the functionality
+	  of the device.  Currently available sub drivers are:
+
+		GPIO: stmpe-gpio
+		Keypad: stmpe-keypad
+		Touchscreen: stmpe-ts
+
 config MFD_TC35892
 	bool "Support Toshiba TC35892"
 	depends on I2C=y && GENERIC_HARDIRQS
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index fb503e77dc60..4410747f7b5b 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_HTC_I2CPLD)	+= htc-i2cpld.o
 obj-$(CONFIG_MFD_DAVINCI_VOICECODEC)	+= davinci_voicecodec.o
 obj-$(CONFIG_MFD_DM355EVM_MSP)	+= dm355evm_msp.o
 
+obj-$(CONFIG_MFD_STMPE)		+= stmpe.o
 obj-$(CONFIG_MFD_TC35892)	+= tc35892.o
 obj-$(CONFIG_MFD_T7L66XB)	+= t7l66xb.o tmio_core.o
 obj-$(CONFIG_MFD_TC6387XB)	+= tc6387xb.o tmio_core.o
diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c
new file mode 100644
index 000000000000..a7f3099fdcfd
--- /dev/null
+++ b/drivers/mfd/stmpe.c
@@ -0,0 +1,915 @@
+/*
+ * Copyright (C) ST-Ericsson SA 2010
+ *
+ * License Terms: GNU General Public License, version 2
+ * Author: Rabin Vincent <rabin.vincent@stericsson.com> for ST-Ericsson
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/stmpe.h>
+#include "stmpe.h"
+
+static int __stmpe_enable(struct stmpe *stmpe, unsigned int blocks)
+{
+	return stmpe->variant->enable(stmpe, blocks, true);
+}
+
+static int __stmpe_disable(struct stmpe *stmpe, unsigned int blocks)
+{
+	return stmpe->variant->enable(stmpe, blocks, false);
+}
+
+static int __stmpe_reg_read(struct stmpe *stmpe, u8 reg)
+{
+	int ret;
+
+	ret = i2c_smbus_read_byte_data(stmpe->i2c, reg);
+	if (ret < 0)
+		dev_err(stmpe->dev, "failed to read reg %#x: %d\n",
+			reg, ret);
+
+	dev_vdbg(stmpe->dev, "rd: reg %#x => data %#x\n", reg, ret);
+
+	return ret;
+}
+
+static int __stmpe_reg_write(struct stmpe *stmpe, u8 reg, u8 val)
+{
+	int ret;
+
+	dev_vdbg(stmpe->dev, "wr: reg %#x <= %#x\n", reg, val);
+
+	ret = i2c_smbus_write_byte_data(stmpe->i2c, reg, val);
+	if (ret < 0)
+		dev_err(stmpe->dev, "failed to write reg %#x: %d\n",
+			reg, ret);
+
+	return ret;
+}
+
+static int __stmpe_set_bits(struct stmpe *stmpe, u8 reg, u8 mask, u8 val)
+{
+	int ret;
+
+	ret = __stmpe_reg_read(stmpe, reg);
+	if (ret < 0)
+		return ret;
+
+	ret &= ~mask;
+	ret |= val;
+
+	return __stmpe_reg_write(stmpe, reg, ret);
+}
+
+static int __stmpe_block_read(struct stmpe *stmpe, u8 reg, u8 length,
+			      u8 *values)
+{
+	int ret;
+
+	ret = i2c_smbus_read_i2c_block_data(stmpe->i2c, reg, length, values);
+	if (ret < 0)
+		dev_err(stmpe->dev, "failed to read regs %#x: %d\n",
+			reg, ret);
+
+	dev_vdbg(stmpe->dev, "rd: reg %#x (%d) => ret %#x\n", reg, length, ret);
+	stmpe_dump_bytes("stmpe rd: ", values, length);
+
+	return ret;
+}
+
+static int __stmpe_block_write(struct stmpe *stmpe, u8 reg, u8 length,
+			const u8 *values)
+{
+	int ret;
+
+	dev_vdbg(stmpe->dev, "wr: regs %#x (%d)\n", reg, length);
+	stmpe_dump_bytes("stmpe wr: ", values, length);
+
+	ret = i2c_smbus_write_i2c_block_data(stmpe->i2c, reg, length,
+					     values);
+	if (ret < 0)
+		dev_err(stmpe->dev, "failed to write regs %#x: %d\n",
+			reg, ret);
+
+	return ret;
+}
+
+/**
+ * stmpe_enable - enable blocks on an STMPE device
+ * @stmpe:	Device to work on
+ * @blocks:	Mask of blocks (enum stmpe_block values) to enable
+ */
+int stmpe_enable(struct stmpe *stmpe, unsigned int blocks)
+{
+	int ret;
+
+	mutex_lock(&stmpe->lock);
+	ret = __stmpe_enable(stmpe, blocks);
+	mutex_unlock(&stmpe->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stmpe_enable);
+
+/**
+ * stmpe_disable - disable blocks on an STMPE device
+ * @stmpe:	Device to work on
+ * @blocks:	Mask of blocks (enum stmpe_block values) to enable
+ */
+int stmpe_disable(struct stmpe *stmpe, unsigned int blocks)
+{
+	int ret;
+
+	mutex_lock(&stmpe->lock);
+	ret = __stmpe_disable(stmpe, blocks);
+	mutex_unlock(&stmpe->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stmpe_disable);
+
+/**
+ * stmpe_reg_read() - read a single STMPE register
+ * @stmpe:	Device to read from
+ * @reg:	Register to read
+ */
+int stmpe_reg_read(struct stmpe *stmpe, u8 reg)
+{
+	int ret;
+
+	mutex_lock(&stmpe->lock);
+	ret = __stmpe_reg_read(stmpe, reg);
+	mutex_unlock(&stmpe->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stmpe_reg_read);
+
+/**
+ * stmpe_reg_write() - write a single STMPE register
+ * @stmpe:	Device to write to
+ * @reg:	Register to write
+ * @val:	Value to write
+ */
+int stmpe_reg_write(struct stmpe *stmpe, u8 reg, u8 val)
+{
+	int ret;
+
+	mutex_lock(&stmpe->lock);
+	ret = __stmpe_reg_write(stmpe, reg, val);
+	mutex_unlock(&stmpe->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stmpe_reg_write);
+
+/**
+ * stmpe_set_bits() - set the value of a bitfield in a STMPE register
+ * @stmpe:	Device to write to
+ * @reg:	Register to write
+ * @mask:	Mask of bits to set
+ * @val:	Value to set
+ */
+int stmpe_set_bits(struct stmpe *stmpe, u8 reg, u8 mask, u8 val)
+{
+	int ret;
+
+	mutex_lock(&stmpe->lock);
+	ret = __stmpe_set_bits(stmpe, reg, mask, val);
+	mutex_unlock(&stmpe->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stmpe_set_bits);
+
+/**
+ * stmpe_block_read() - read multiple STMPE registers
+ * @stmpe:	Device to read from
+ * @reg:	First register
+ * @length:	Number of registers
+ * @values:	Buffer to write to
+ */
+int stmpe_block_read(struct stmpe *stmpe, u8 reg, u8 length, u8 *values)
+{
+	int ret;
+
+	mutex_lock(&stmpe->lock);
+	ret = __stmpe_block_read(stmpe, reg, length, values);
+	mutex_unlock(&stmpe->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stmpe_block_read);
+
+/**
+ * stmpe_block_write() - write multiple STMPE registers
+ * @stmpe:	Device to write to
+ * @reg:	First register
+ * @length:	Number of registers
+ * @values:	Values to write
+ */
+int stmpe_block_write(struct stmpe *stmpe, u8 reg, u8 length,
+		      const u8 *values)
+{
+	int ret;
+
+	mutex_lock(&stmpe->lock);
+	ret = __stmpe_block_write(stmpe, reg, length, values);
+	mutex_unlock(&stmpe->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stmpe_block_write);
+
+/**
+ * stmpe_set_altfunc: set the alternate function for STMPE pins
+ * @stmpe:	Device to configure
+ * @pins:	Bitmask of pins to affect
+ * @block:	block to enable alternate functions for
+ *
+ * @pins is assumed to have a bit set for each of the bits whose alternate
+ * function is to be changed, numbered according to the GPIOXY numbers.
+ *
+ * If the GPIO module is not enabled, this function automatically enables it in
+ * order to perform the change.
+ */
+int stmpe_set_altfunc(struct stmpe *stmpe, u32 pins, enum stmpe_block block)
+{
+	struct stmpe_variant_info *variant = stmpe->variant;
+	u8 regaddr = stmpe->regs[STMPE_IDX_GPAFR_U_MSB];
+	int af_bits = variant->af_bits;
+	int numregs = DIV_ROUND_UP(stmpe->num_gpios * af_bits, 8);
+	int afperreg = 8 / af_bits;
+	int mask = (1 << af_bits) - 1;
+	u8 regs[numregs];
+	int af;
+	int ret;
+
+	mutex_lock(&stmpe->lock);
+
+	ret = __stmpe_enable(stmpe, STMPE_BLOCK_GPIO);
+	if (ret < 0)
+		goto out;
+
+	ret = __stmpe_block_read(stmpe, regaddr, numregs, regs);
+	if (ret < 0)
+		goto out;
+
+	af = variant->get_altfunc(stmpe, block);
+
+	while (pins) {
+		int pin = __ffs(pins);
+		int regoffset = numregs - (pin / afperreg) - 1;
+		int pos = (pin % afperreg) * (8 / afperreg);
+
+		regs[regoffset] &= ~(mask << pos);
+		regs[regoffset] |= af << pos;
+
+		pins &= ~(1 << pin);
+	}
+
+	ret = __stmpe_block_write(stmpe, regaddr, numregs, regs);
+
+out:
+	mutex_unlock(&stmpe->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(stmpe_set_altfunc);
+
+/*
+ * GPIO (all variants)
+ */
+
+static struct resource stmpe_gpio_resources[] = {
+	/* Start and end filled dynamically */
+	{
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct mfd_cell stmpe_gpio_cell = {
+	.name		= "stmpe-gpio",
+	.resources	= stmpe_gpio_resources,
+	.num_resources	= ARRAY_SIZE(stmpe_gpio_resources),
+};
+
+/*
+ * Keypad (1601, 2401, 2403)
+ */
+
+static struct resource stmpe_keypad_resources[] = {
+	{
+		.name	= "KEYPAD",
+		.start	= 0,
+		.end	= 0,
+		.flags	= IORESOURCE_IRQ,
+	},
+	{
+		.name	= "KEYPAD_OVER",
+		.start	= 1,
+		.end	= 1,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct mfd_cell stmpe_keypad_cell = {
+	.name		= "stmpe-keypad",
+	.resources	= stmpe_keypad_resources,
+	.num_resources	= ARRAY_SIZE(stmpe_keypad_resources),
+};
+
+/*
+ * Touchscreen (STMPE811)
+ */
+
+static struct resource stmpe_ts_resources[] = {
+	{
+		.name	= "TOUCH_DET",
+		.start	= 0,
+		.end	= 0,
+		.flags	= IORESOURCE_IRQ,
+	},
+	{
+		.name	= "FIFO_TH",
+		.start	= 1,
+		.end	= 1,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct mfd_cell stmpe_ts_cell = {
+	.name		= "stmpe-ts",
+	.resources	= stmpe_ts_resources,
+	.num_resources	= ARRAY_SIZE(stmpe_ts_resources),
+};
+
+/*
+ * STMPE811
+ */
+
+static const u8 stmpe811_regs[] = {
+	[STMPE_IDX_CHIP_ID]	= STMPE811_REG_CHIP_ID,
+	[STMPE_IDX_ICR_LSB]	= STMPE811_REG_INT_CTRL,
+	[STMPE_IDX_IER_LSB]	= STMPE811_REG_INT_EN,
+	[STMPE_IDX_ISR_MSB]	= STMPE811_REG_INT_STA,
+	[STMPE_IDX_GPMR_LSB]	= STMPE811_REG_GPIO_MP_STA,
+	[STMPE_IDX_GPSR_LSB]	= STMPE811_REG_GPIO_SET_PIN,
+	[STMPE_IDX_GPCR_LSB]	= STMPE811_REG_GPIO_CLR_PIN,
+	[STMPE_IDX_GPDR_LSB]	= STMPE811_REG_GPIO_DIR,
+	[STMPE_IDX_GPRER_LSB]	= STMPE811_REG_GPIO_RE,
+	[STMPE_IDX_GPFER_LSB]	= STMPE811_REG_GPIO_FE,
+	[STMPE_IDX_GPAFR_U_MSB]	= STMPE811_REG_GPIO_AF,
+	[STMPE_IDX_IEGPIOR_LSB]	= STMPE811_REG_GPIO_INT_EN,
+	[STMPE_IDX_ISGPIOR_MSB]	= STMPE811_REG_GPIO_INT_STA,
+	[STMPE_IDX_GPEDR_MSB]	= STMPE811_REG_GPIO_ED,
+};
+
+static struct stmpe_variant_block stmpe811_blocks[] = {
+	{
+		.cell	= &stmpe_gpio_cell,
+		.irq	= STMPE811_IRQ_GPIOC,
+		.block	= STMPE_BLOCK_GPIO,
+	},
+	{
+		.cell	= &stmpe_ts_cell,
+		.irq	= STMPE811_IRQ_TOUCH_DET,
+		.block	= STMPE_BLOCK_TOUCHSCREEN,
+	},
+};
+
+static int stmpe811_enable(struct stmpe *stmpe, unsigned int blocks,
+			   bool enable)
+{
+	unsigned int mask = 0;
+
+	if (blocks & STMPE_BLOCK_GPIO)
+		mask |= STMPE811_SYS_CTRL2_GPIO_OFF;
+
+	if (blocks & STMPE_BLOCK_ADC)
+		mask |= STMPE811_SYS_CTRL2_ADC_OFF;
+
+	if (blocks & STMPE_BLOCK_TOUCHSCREEN)
+		mask |= STMPE811_SYS_CTRL2_TSC_OFF;
+
+	return __stmpe_set_bits(stmpe, STMPE811_REG_SYS_CTRL2, mask,
+				enable ? 0 : mask);
+}
+
+static int stmpe811_get_altfunc(struct stmpe *stmpe, enum stmpe_block block)
+{
+	/* 0 for touchscreen, 1 for GPIO */
+	return block != STMPE_BLOCK_TOUCHSCREEN;
+}
+
+static struct stmpe_variant_info stmpe811 = {
+	.name		= "stmpe811",
+	.id_val		= 0x0811,
+	.id_mask	= 0xffff,
+	.num_gpios	= 8,
+	.af_bits	= 1,
+	.regs		= stmpe811_regs,
+	.blocks		= stmpe811_blocks,
+	.num_blocks	= ARRAY_SIZE(stmpe811_blocks),
+	.num_irqs	= STMPE811_NR_INTERNAL_IRQS,
+	.enable		= stmpe811_enable,
+	.get_altfunc	= stmpe811_get_altfunc,
+};
+
+/*
+ * STMPE1601
+ */
+
+static const u8 stmpe1601_regs[] = {
+	[STMPE_IDX_CHIP_ID]	= STMPE1601_REG_CHIP_ID,
+	[STMPE_IDX_ICR_LSB]	= STMPE1601_REG_ICR_LSB,
+	[STMPE_IDX_IER_LSB]	= STMPE1601_REG_IER_LSB,
+	[STMPE_IDX_ISR_MSB]	= STMPE1601_REG_ISR_MSB,
+	[STMPE_IDX_GPMR_LSB]	= STMPE1601_REG_GPIO_MP_LSB,
+	[STMPE_IDX_GPSR_LSB]	= STMPE1601_REG_GPIO_SET_LSB,
+	[STMPE_IDX_GPCR_LSB]	= STMPE1601_REG_GPIO_CLR_LSB,
+	[STMPE_IDX_GPDR_LSB]	= STMPE1601_REG_GPIO_SET_DIR_LSB,
+	[STMPE_IDX_GPRER_LSB]	= STMPE1601_REG_GPIO_RE_LSB,
+	[STMPE_IDX_GPFER_LSB]	= STMPE1601_REG_GPIO_FE_LSB,
+	[STMPE_IDX_GPAFR_U_MSB]	= STMPE1601_REG_GPIO_AF_U_MSB,
+	[STMPE_IDX_IEGPIOR_LSB]	= STMPE1601_REG_INT_EN_GPIO_MASK_LSB,
+	[STMPE_IDX_ISGPIOR_MSB]	= STMPE1601_REG_INT_STA_GPIO_MSB,
+	[STMPE_IDX_GPEDR_MSB]	= STMPE1601_REG_GPIO_ED_MSB,
+};
+
+static struct stmpe_variant_block stmpe1601_blocks[] = {
+	{
+		.cell	= &stmpe_gpio_cell,
+		.irq	= STMPE24XX_IRQ_GPIOC,
+		.block	= STMPE_BLOCK_GPIO,
+	},
+	{
+		.cell	= &stmpe_keypad_cell,
+		.irq	= STMPE24XX_IRQ_KEYPAD,
+		.block	= STMPE_BLOCK_KEYPAD,
+	},
+};
+
+static int stmpe1601_enable(struct stmpe *stmpe, unsigned int blocks,
+			    bool enable)
+{
+	unsigned int mask = 0;
+
+	if (blocks & STMPE_BLOCK_GPIO)
+		mask |= STMPE1601_SYS_CTRL_ENABLE_GPIO;
+
+	if (blocks & STMPE_BLOCK_KEYPAD)
+		mask |= STMPE1601_SYS_CTRL_ENABLE_KPC;
+
+	return __stmpe_set_bits(stmpe, STMPE1601_REG_SYS_CTRL, mask,
+				enable ? mask : 0);
+}
+
+static int stmpe1601_get_altfunc(struct stmpe *stmpe, enum stmpe_block block)
+{
+	switch (block) {
+	case STMPE_BLOCK_PWM:
+		return 2;
+
+	case STMPE_BLOCK_KEYPAD:
+		return 1;
+
+	case STMPE_BLOCK_GPIO:
+	default:
+		return 0;
+	}
+}
+
+static struct stmpe_variant_info stmpe1601 = {
+	.name		= "stmpe1601",
+	.id_val		= 0x0210,
+	.id_mask	= 0xfff0,	/* at least 0x0210 and 0x0212 */
+	.num_gpios	= 16,
+	.af_bits	= 2,
+	.regs		= stmpe1601_regs,
+	.blocks		= stmpe1601_blocks,
+	.num_blocks	= ARRAY_SIZE(stmpe1601_blocks),
+	.num_irqs	= STMPE1601_NR_INTERNAL_IRQS,
+	.enable		= stmpe1601_enable,
+	.get_altfunc	= stmpe1601_get_altfunc,
+};
+
+/*
+ * STMPE24XX
+ */
+
+static const u8 stmpe24xx_regs[] = {
+	[STMPE_IDX_CHIP_ID]	= STMPE24XX_REG_CHIP_ID,
+	[STMPE_IDX_ICR_LSB]	= STMPE24XX_REG_ICR_LSB,
+	[STMPE_IDX_IER_LSB]	= STMPE24XX_REG_IER_LSB,
+	[STMPE_IDX_ISR_MSB]	= STMPE24XX_REG_ISR_MSB,
+	[STMPE_IDX_GPMR_LSB]	= STMPE24XX_REG_GPMR_LSB,
+	[STMPE_IDX_GPSR_LSB]	= STMPE24XX_REG_GPSR_LSB,
+	[STMPE_IDX_GPCR_LSB]	= STMPE24XX_REG_GPCR_LSB,
+	[STMPE_IDX_GPDR_LSB]	= STMPE24XX_REG_GPDR_LSB,
+	[STMPE_IDX_GPRER_LSB]	= STMPE24XX_REG_GPRER_LSB,
+	[STMPE_IDX_GPFER_LSB]	= STMPE24XX_REG_GPFER_LSB,
+	[STMPE_IDX_GPAFR_U_MSB]	= STMPE24XX_REG_GPAFR_U_MSB,
+	[STMPE_IDX_IEGPIOR_LSB]	= STMPE24XX_REG_IEGPIOR_LSB,
+	[STMPE_IDX_ISGPIOR_MSB]	= STMPE24XX_REG_ISGPIOR_MSB,
+	[STMPE_IDX_GPEDR_MSB]	= STMPE24XX_REG_GPEDR_MSB,
+};
+
+static struct stmpe_variant_block stmpe24xx_blocks[] = {
+	{
+		.cell	= &stmpe_gpio_cell,
+		.irq	= STMPE24XX_IRQ_GPIOC,
+		.block	= STMPE_BLOCK_GPIO,
+	},
+	{
+		.cell	= &stmpe_keypad_cell,
+		.irq	= STMPE24XX_IRQ_KEYPAD,
+		.block	= STMPE_BLOCK_KEYPAD,
+	},
+};
+
+static int stmpe24xx_enable(struct stmpe *stmpe, unsigned int blocks,
+			    bool enable)
+{
+	unsigned int mask = 0;
+
+	if (blocks & STMPE_BLOCK_GPIO)
+		mask |= STMPE24XX_SYS_CTRL_ENABLE_GPIO;
+
+	if (blocks & STMPE_BLOCK_KEYPAD)
+		mask |= STMPE24XX_SYS_CTRL_ENABLE_KPC;
+
+	return __stmpe_set_bits(stmpe, STMPE24XX_REG_SYS_CTRL, mask,
+				enable ? mask : 0);
+}
+
+static int stmpe24xx_get_altfunc(struct stmpe *stmpe, enum stmpe_block block)
+{
+	switch (block) {
+	case STMPE_BLOCK_ROTATOR:
+		return 2;
+
+	case STMPE_BLOCK_KEYPAD:
+		return 1;
+
+	case STMPE_BLOCK_GPIO:
+	default:
+		return 0;
+	}
+}
+
+static struct stmpe_variant_info stmpe2401 = {
+	.name		= "stmpe2401",
+	.id_val		= 0x0101,
+	.id_mask	= 0xffff,
+	.num_gpios	= 24,
+	.af_bits	= 2,
+	.regs		= stmpe24xx_regs,
+	.blocks		= stmpe24xx_blocks,
+	.num_blocks	= ARRAY_SIZE(stmpe24xx_blocks),
+	.num_irqs	= STMPE24XX_NR_INTERNAL_IRQS,
+	.enable		= stmpe24xx_enable,
+	.get_altfunc	= stmpe24xx_get_altfunc,
+};
+
+static struct stmpe_variant_info stmpe2403 = {
+	.name		= "stmpe2403",
+	.id_val		= 0x0120,
+	.id_mask	= 0xffff,
+	.num_gpios	= 24,
+	.af_bits	= 2,
+	.regs		= stmpe24xx_regs,
+	.blocks		= stmpe24xx_blocks,
+	.num_blocks	= ARRAY_SIZE(stmpe24xx_blocks),
+	.num_irqs	= STMPE24XX_NR_INTERNAL_IRQS,
+	.enable		= stmpe24xx_enable,
+	.get_altfunc	= stmpe24xx_get_altfunc,
+};
+
+static struct stmpe_variant_info *stmpe_variant_info[] = {
+	[STMPE811]	= &stmpe811,
+	[STMPE1601]	= &stmpe1601,
+	[STMPE2401]	= &stmpe2401,
+	[STMPE2403]	= &stmpe2403,
+};
+
+static irqreturn_t stmpe_irq(int irq, void *data)
+{
+	struct stmpe *stmpe = data;
+	struct stmpe_variant_info *variant = stmpe->variant;
+	int num = DIV_ROUND_UP(variant->num_irqs, 8);
+	u8 israddr = stmpe->regs[STMPE_IDX_ISR_MSB];
+	u8 isr[num];
+	int ret;
+	int i;
+
+	ret = stmpe_block_read(stmpe, israddr, num, isr);
+	if (ret < 0)
+		return IRQ_NONE;
+
+	for (i = 0; i < num; i++) {
+		int bank = num - i - 1;
+		u8 status = isr[i];
+		u8 clear;
+
+		status &= stmpe->ier[bank];
+		if (!status)
+			continue;
+
+		clear = status;
+		while (status) {
+			int bit = __ffs(status);
+			int line = bank * 8 + bit;
+
+			handle_nested_irq(stmpe->irq_base + line);
+			status &= ~(1 << bit);
+		}
+
+		stmpe_reg_write(stmpe, israddr + i, clear);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void stmpe_irq_lock(unsigned int irq)
+{
+	struct stmpe *stmpe = get_irq_chip_data(irq);
+
+	mutex_lock(&stmpe->irq_lock);
+}
+
+static void stmpe_irq_sync_unlock(unsigned int irq)
+{
+	struct stmpe *stmpe = get_irq_chip_data(irq);
+	struct stmpe_variant_info *variant = stmpe->variant;
+	int num = DIV_ROUND_UP(variant->num_irqs, 8);
+	int i;
+
+	for (i = 0; i < num; i++) {
+		u8 new = stmpe->ier[i];
+		u8 old = stmpe->oldier[i];
+
+		if (new == old)
+			continue;
+
+		stmpe->oldier[i] = new;
+		stmpe_reg_write(stmpe, stmpe->regs[STMPE_IDX_IER_LSB] - i, new);
+	}
+
+	mutex_unlock(&stmpe->irq_lock);
+}
+
+static void stmpe_irq_mask(unsigned int irq)
+{
+	struct stmpe *stmpe = get_irq_chip_data(irq);
+	int offset = irq - stmpe->irq_base;
+	int regoffset = offset / 8;
+	int mask = 1 << (offset % 8);
+
+	stmpe->ier[regoffset] &= ~mask;
+}
+
+static void stmpe_irq_unmask(unsigned int irq)
+{
+	struct stmpe *stmpe = get_irq_chip_data(irq);
+	int offset = irq - stmpe->irq_base;
+	int regoffset = offset / 8;
+	int mask = 1 << (offset % 8);
+
+	stmpe->ier[regoffset] |= mask;
+}
+
+static struct irq_chip stmpe_irq_chip = {
+	.name			= "stmpe",
+	.bus_lock		= stmpe_irq_lock,
+	.bus_sync_unlock	= stmpe_irq_sync_unlock,
+	.mask			= stmpe_irq_mask,
+	.unmask			= stmpe_irq_unmask,
+};
+
+static int __devinit stmpe_irq_init(struct stmpe *stmpe)
+{
+	int num_irqs = stmpe->variant->num_irqs;
+	int base = stmpe->irq_base;
+	int irq;
+
+	for (irq = base; irq < base + num_irqs; irq++) {
+		set_irq_chip_data(irq, stmpe);
+		set_irq_chip_and_handler(irq, &stmpe_irq_chip,
+					 handle_edge_irq);
+		set_irq_nested_thread(irq, 1);
+#ifdef CONFIG_ARM
+		set_irq_flags(irq, IRQF_VALID);
+#else
+		set_irq_noprobe(irq);
+#endif
+	}
+
+	return 0;
+}
+
+static void stmpe_irq_remove(struct stmpe *stmpe)
+{
+	int num_irqs = stmpe->variant->num_irqs;
+	int base = stmpe->irq_base;
+	int irq;
+
+	for (irq = base; irq < base + num_irqs; irq++) {
+#ifdef CONFIG_ARM
+		set_irq_flags(irq, 0);
+#endif
+		set_irq_chip_and_handler(irq, NULL, NULL);
+		set_irq_chip_data(irq, NULL);
+	}
+}
+
+static int __devinit stmpe_chip_init(struct stmpe *stmpe)
+{
+	unsigned int irq_trigger = stmpe->pdata->irq_trigger;
+	struct stmpe_variant_info *variant = stmpe->variant;
+	u8 icr = STMPE_ICR_LSB_GIM;
+	unsigned int id;
+	u8 data[2];
+	int ret;
+
+	ret = stmpe_block_read(stmpe, stmpe->regs[STMPE_IDX_CHIP_ID],
+			       ARRAY_SIZE(data), data);
+	if (ret < 0)
+		return ret;
+
+	id = (data[0] << 8) | data[1];
+	if ((id & variant->id_mask) != variant->id_val) {
+		dev_err(stmpe->dev, "unknown chip id: %#x\n", id);
+		return -EINVAL;
+	}
+
+	dev_info(stmpe->dev, "%s detected, chip id: %#x\n", variant->name, id);
+
+	/* Disable all modules -- subdrivers should enable what they need. */
+	ret = stmpe_disable(stmpe, ~0);
+	if (ret)
+		return ret;
+
+	if (irq_trigger == IRQF_TRIGGER_FALLING ||
+	    irq_trigger == IRQF_TRIGGER_RISING)
+		icr |= STMPE_ICR_LSB_EDGE;
+
+	if (irq_trigger == IRQF_TRIGGER_RISING ||
+	    irq_trigger == IRQF_TRIGGER_HIGH)
+		icr |= STMPE_ICR_LSB_HIGH;
+
+	if (stmpe->pdata->irq_invert_polarity)
+		icr ^= STMPE_ICR_LSB_HIGH;
+
+	return stmpe_reg_write(stmpe, stmpe->regs[STMPE_IDX_ICR_LSB], icr);
+}
+
+static int __devinit stmpe_add_device(struct stmpe *stmpe,
+				      struct mfd_cell *cell, int irq)
+{
+	return mfd_add_devices(stmpe->dev, stmpe->pdata->id, cell, 1,
+			       NULL, stmpe->irq_base + irq);
+}
+
+static int __devinit stmpe_devices_init(struct stmpe *stmpe)
+{
+	struct stmpe_variant_info *variant = stmpe->variant;
+	unsigned int platform_blocks = stmpe->pdata->blocks;
+	int ret = -EINVAL;
+	int i;
+
+	for (i = 0; i < variant->num_blocks; i++) {
+		struct stmpe_variant_block *block = &variant->blocks[i];
+
+		if (!(platform_blocks & block->block))
+			continue;
+
+		platform_blocks &= ~block->block;
+		ret = stmpe_add_device(stmpe, block->cell, block->irq);
+		if (ret)
+			return ret;
+	}
+
+	if (platform_blocks)
+		dev_warn(stmpe->dev,
+			 "platform wants blocks (%#x) not present on variant",
+			 platform_blocks);
+
+	return ret;
+}
+
+static int __devinit stmpe_probe(struct i2c_client *i2c,
+				 const struct i2c_device_id *id)
+{
+	struct stmpe_platform_data *pdata = i2c->dev.platform_data;
+	struct stmpe *stmpe;
+	int ret;
+
+	if (!pdata)
+		return -EINVAL;
+
+	stmpe = kzalloc(sizeof(struct stmpe), GFP_KERNEL);
+	if (!stmpe)
+		return -ENOMEM;
+
+	mutex_init(&stmpe->irq_lock);
+	mutex_init(&stmpe->lock);
+
+	stmpe->dev = &i2c->dev;
+	stmpe->i2c = i2c;
+
+	stmpe->pdata = pdata;
+	stmpe->irq_base = pdata->irq_base;
+
+	stmpe->partnum = id->driver_data;
+	stmpe->variant = stmpe_variant_info[stmpe->partnum];
+	stmpe->regs = stmpe->variant->regs;
+	stmpe->num_gpios = stmpe->variant->num_gpios;
+
+	i2c_set_clientdata(i2c, stmpe);
+
+	ret = stmpe_chip_init(stmpe);
+	if (ret)
+		goto out_free;
+
+	ret = stmpe_irq_init(stmpe);
+	if (ret)
+		goto out_free;
+
+	ret = request_threaded_irq(stmpe->i2c->irq, NULL, stmpe_irq,
+				   pdata->irq_trigger | IRQF_ONESHOT,
+				   "stmpe", stmpe);
+	if (ret) {
+		dev_err(stmpe->dev, "failed to request IRQ: %d\n", ret);
+		goto out_removeirq;
+	}
+
+	ret = stmpe_devices_init(stmpe);
+	if (ret) {
+		dev_err(stmpe->dev, "failed to add children\n");
+		goto out_removedevs;
+	}
+
+	return 0;
+
+out_removedevs:
+	mfd_remove_devices(stmpe->dev);
+	free_irq(stmpe->i2c->irq, stmpe);
+out_removeirq:
+	stmpe_irq_remove(stmpe);
+out_free:
+	kfree(stmpe);
+	return ret;
+}
+
+static int __devexit stmpe_remove(struct i2c_client *client)
+{
+	struct stmpe *stmpe = i2c_get_clientdata(client);
+
+	mfd_remove_devices(stmpe->dev);
+
+	free_irq(stmpe->i2c->irq, stmpe);
+	stmpe_irq_remove(stmpe);
+
+	kfree(stmpe);
+
+	return 0;
+}
+
+static const struct i2c_device_id stmpe_id[] = {
+	{ "stmpe811", STMPE811 },
+	{ "stmpe1601", STMPE1601 },
+	{ "stmpe2401", STMPE2401 },
+	{ "stmpe2403", STMPE2403 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, stmpe_id);
+
+static struct i2c_driver stmpe_driver = {
+	.driver.name	= "stmpe",
+	.driver.owner	= THIS_MODULE,
+	.probe		= stmpe_probe,
+	.remove		= __devexit_p(stmpe_remove),
+	.id_table	= stmpe_id,
+};
+
+static int __init stmpe_init(void)
+{
+	return i2c_add_driver(&stmpe_driver);
+}
+subsys_initcall(stmpe_init);
+
+static void __exit stmpe_exit(void)
+{
+	i2c_del_driver(&stmpe_driver);
+}
+module_exit(stmpe_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("STMPE MFD core driver");
+MODULE_AUTHOR("Rabin Vincent <rabin.vincent@stericsson.com>");
diff --git a/drivers/mfd/stmpe.h b/drivers/mfd/stmpe.h
new file mode 100644
index 000000000000..991f0ecbeb34
--- /dev/null
+++ b/drivers/mfd/stmpe.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) ST-Ericsson SA 2010
+ *
+ * License Terms: GNU General Public License, version 2
+ * Author: Rabin Vincent <rabin.vincent@stericsson.com> for ST-Ericsson
+ */
+
+#ifndef __STMPE_H
+#define __STMPE_H
+
+#ifdef STMPE_DUMP_BYTES
+static inline void stmpe_dump_bytes(const char *str, const void *buf,
+				    size_t len)
+{
+	print_hex_dump_bytes(str, DUMP_PREFIX_OFFSET, buf, len);
+}
+#else
+static inline void stmpe_dump_bytes(const char *str, const void *buf,
+				    size_t len)
+{
+}
+#endif
+
+/**
+ * struct stmpe_variant_block - information about block
+ * @cell:	base mfd cell
+ * @irq:	interrupt number to be added to each IORESOURCE_IRQ
+ *		in the cell
+ * @block:	block id; used for identification with platform data and for
+ *		enable and altfunc callbacks
+ */
+struct stmpe_variant_block {
+	struct mfd_cell		*cell;
+	int			irq;
+	enum stmpe_block	block;
+};
+
+/**
+ * struct stmpe_variant_info - variant-specific information
+ * @name:	part name
+ * @id_val:	content of CHIPID register
+ * @id_mask:	bits valid in CHIPID register for comparison with id_val
+ * @num_gpios:	number of GPIOS
+ * @af_bits:	number of bits used to specify the alternate function
+ * @blocks:	list of blocks present on this device
+ * @num_blocks:	number of blocks present on this device
+ * @num_irqs:	number of internal IRQs available on this device
+ * @enable:	callback to enable the specified blocks.
+ *		Called with the I/O lock held.
+ * @get_altfunc: callback to get the alternate function number for the
+ *		 specific block
+ */
+struct stmpe_variant_info {
+	const char *name;
+	u16 id_val;
+	u16 id_mask;
+	int num_gpios;
+	int af_bits;
+	const u8 *regs;
+	struct stmpe_variant_block *blocks;
+	int num_blocks;
+	int num_irqs;
+	int (*enable)(struct stmpe *stmpe, unsigned int blocks, bool enable);
+	int (*get_altfunc)(struct stmpe *stmpe, enum stmpe_block block);
+};
+
+#define STMPE_ICR_LSB_HIGH	(1 << 2)
+#define STMPE_ICR_LSB_EDGE	(1 << 1)
+#define STMPE_ICR_LSB_GIM	(1 << 0)
+
+/*
+ * STMPE811
+ */
+
+#define STMPE811_IRQ_TOUCH_DET		0
+#define STMPE811_IRQ_FIFO_TH		1
+#define STMPE811_IRQ_FIFO_OFLOW		2
+#define STMPE811_IRQ_FIFO_FULL		3
+#define STMPE811_IRQ_FIFO_EMPTY		4
+#define STMPE811_IRQ_TEMP_SENS		5
+#define STMPE811_IRQ_ADC		6
+#define STMPE811_IRQ_GPIOC		7
+#define STMPE811_NR_INTERNAL_IRQS	8
+
+#define STMPE811_REG_CHIP_ID		0x00
+#define STMPE811_REG_SYS_CTRL2		0x04
+#define STMPE811_REG_INT_CTRL		0x09
+#define STMPE811_REG_INT_EN		0x0A
+#define STMPE811_REG_INT_STA		0x0B
+#define STMPE811_REG_GPIO_INT_EN	0x0C
+#define STMPE811_REG_GPIO_INT_STA	0x0D
+#define STMPE811_REG_GPIO_SET_PIN	0x10
+#define STMPE811_REG_GPIO_CLR_PIN	0x11
+#define STMPE811_REG_GPIO_MP_STA	0x12
+#define STMPE811_REG_GPIO_DIR		0x13
+#define STMPE811_REG_GPIO_ED		0x14
+#define STMPE811_REG_GPIO_RE		0x15
+#define STMPE811_REG_GPIO_FE		0x16
+#define STMPE811_REG_GPIO_AF		0x17
+
+#define STMPE811_SYS_CTRL2_ADC_OFF	(1 << 0)
+#define STMPE811_SYS_CTRL2_TSC_OFF	(1 << 1)
+#define STMPE811_SYS_CTRL2_GPIO_OFF	(1 << 2)
+#define STMPE811_SYS_CTRL2_TS_OFF	(1 << 3)
+
+/*
+ * STMPE1601
+ */
+
+#define STMPE1601_IRQ_GPIOC		8
+#define STMPE1601_IRQ_PWM3		7
+#define STMPE1601_IRQ_PWM2		6
+#define STMPE1601_IRQ_PWM1		5
+#define STMPE1601_IRQ_PWM0		4
+#define STMPE1601_IRQ_KEYPAD_OVER	2
+#define STMPE1601_IRQ_KEYPAD		1
+#define STMPE1601_IRQ_WAKEUP		0
+#define STMPE1601_NR_INTERNAL_IRQS	9
+
+#define STMPE1601_REG_SYS_CTRL			0x02
+#define STMPE1601_REG_ICR_LSB			0x11
+#define STMPE1601_REG_IER_LSB			0x13
+#define STMPE1601_REG_ISR_MSB			0x14
+#define STMPE1601_REG_CHIP_ID			0x80
+#define STMPE1601_REG_INT_EN_GPIO_MASK_LSB	0x17
+#define STMPE1601_REG_INT_STA_GPIO_MSB		0x18
+#define STMPE1601_REG_GPIO_MP_LSB		0x87
+#define STMPE1601_REG_GPIO_SET_LSB		0x83
+#define STMPE1601_REG_GPIO_CLR_LSB		0x85
+#define STMPE1601_REG_GPIO_SET_DIR_LSB		0x89
+#define STMPE1601_REG_GPIO_ED_MSB		0x8A
+#define STMPE1601_REG_GPIO_RE_LSB		0x8D
+#define STMPE1601_REG_GPIO_FE_LSB		0x8F
+#define STMPE1601_REG_GPIO_AF_U_MSB		0x92
+
+#define STMPE1601_SYS_CTRL_ENABLE_GPIO		(1 << 3)
+#define STMPE1601_SYS_CTRL_ENABLE_KPC		(1 << 1)
+#define STMPE1601_SYSCON_ENABLE_SPWM		(1 << 0)
+
+/*
+ * STMPE24xx
+ */
+
+#define STMPE24XX_IRQ_GPIOC		8
+#define STMPE24XX_IRQ_PWM2		7
+#define STMPE24XX_IRQ_PWM1		6
+#define STMPE24XX_IRQ_PWM0		5
+#define STMPE24XX_IRQ_ROT_OVER		4
+#define STMPE24XX_IRQ_ROT		3
+#define STMPE24XX_IRQ_KEYPAD_OVER	2
+#define STMPE24XX_IRQ_KEYPAD		1
+#define STMPE24XX_IRQ_WAKEUP		0
+#define STMPE24XX_NR_INTERNAL_IRQS	9
+
+#define STMPE24XX_REG_SYS_CTRL		0x02
+#define STMPE24XX_REG_ICR_LSB		0x11
+#define STMPE24XX_REG_IER_LSB		0x13
+#define STMPE24XX_REG_ISR_MSB		0x14
+#define STMPE24XX_REG_CHIP_ID		0x80
+#define STMPE24XX_REG_IEGPIOR_LSB	0x18
+#define STMPE24XX_REG_ISGPIOR_MSB	0x19
+#define STMPE24XX_REG_GPMR_LSB		0xA5
+#define STMPE24XX_REG_GPSR_LSB		0x85
+#define STMPE24XX_REG_GPCR_LSB		0x88
+#define STMPE24XX_REG_GPDR_LSB		0x8B
+#define STMPE24XX_REG_GPEDR_MSB		0x8C
+#define STMPE24XX_REG_GPRER_LSB		0x91
+#define STMPE24XX_REG_GPFER_LSB		0x94
+#define STMPE24XX_REG_GPAFR_U_MSB	0x9B
+
+#define STMPE24XX_SYS_CTRL_ENABLE_GPIO		(1 << 3)
+#define STMPE24XX_SYSCON_ENABLE_PWM		(1 << 2)
+#define STMPE24XX_SYS_CTRL_ENABLE_KPC		(1 << 1)
+#define STMPE24XX_SYSCON_ENABLE_ROT		(1 << 0)
+
+#endif
diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h
new file mode 100644
index 000000000000..90faa98b577f
--- /dev/null
+++ b/include/linux/mfd/stmpe.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (C) ST-Ericsson SA 2010
+ *
+ * License Terms: GNU General Public License, version 2
+ * Author: Rabin Vincent <rabin.vincent@stericsson.com> for ST-Ericsson
+ */
+
+#ifndef __LINUX_MFD_STMPE_H
+#define __LINUX_MFD_STMPE_H
+
+#include <linux/device.h>
+
+enum stmpe_block {
+	STMPE_BLOCK_GPIO	= 1 << 0,
+	STMPE_BLOCK_KEYPAD	= 1 << 1,
+	STMPE_BLOCK_TOUCHSCREEN	= 1 << 2,
+	STMPE_BLOCK_ADC		= 1 << 3,
+	STMPE_BLOCK_PWM		= 1 << 4,
+	STMPE_BLOCK_ROTATOR	= 1 << 5,
+};
+
+enum stmpe_partnum {
+	STMPE811,
+	STMPE1601,
+	STMPE2401,
+	STMPE2403,
+};
+
+/*
+ * For registers whose locations differ on variants,  the correct address is
+ * obtained by indexing stmpe->regs with one of the following.
+ */
+enum {
+	STMPE_IDX_CHIP_ID,
+	STMPE_IDX_ICR_LSB,
+	STMPE_IDX_IER_LSB,
+	STMPE_IDX_ISR_MSB,
+	STMPE_IDX_GPMR_LSB,
+	STMPE_IDX_GPSR_LSB,
+	STMPE_IDX_GPCR_LSB,
+	STMPE_IDX_GPDR_LSB,
+	STMPE_IDX_GPEDR_MSB,
+	STMPE_IDX_GPRER_LSB,
+	STMPE_IDX_GPFER_LSB,
+	STMPE_IDX_GPAFR_U_MSB,
+	STMPE_IDX_IEGPIOR_LSB,
+	STMPE_IDX_ISGPIOR_MSB,
+	STMPE_IDX_MAX,
+};
+
+
+struct stmpe_variant_info;
+
+/**
+ * struct stmpe - STMPE MFD structure
+ * @lock: lock protecting I/O operations
+ * @irq_lock: IRQ bus lock
+ * @dev: device, mostly for dev_dbg()
+ * @i2c: i2c client
+ * @variant: the detected STMPE model number
+ * @regs: list of addresses of registers which are at different addresses on
+ *	  different variants.  Indexed by one of STMPE_IDX_*.
+ * @irq_base: starting IRQ number for internal IRQs
+ * @num_gpios: number of gpios, differs for variants
+ * @ier: cache of IER registers for bus_lock
+ * @oldier: cache of IER registers for bus_lock
+ * @pdata: platform data
+ */
+struct stmpe {
+	struct mutex lock;
+	struct mutex irq_lock;
+	struct device *dev;
+	struct i2c_client *i2c;
+	enum stmpe_partnum partnum;
+	struct stmpe_variant_info *variant;
+	const u8 *regs;
+
+	int irq_base;
+	int num_gpios;
+	u8 ier[2];
+	u8 oldier[2];
+	struct stmpe_platform_data *pdata;
+};
+
+extern int stmpe_reg_write(struct stmpe *stmpe, u8 reg, u8 data);
+extern int stmpe_reg_read(struct stmpe *stmpe, u8 reg);
+extern int stmpe_block_read(struct stmpe *stmpe, u8 reg, u8 length,
+			    u8 *values);
+extern int stmpe_block_write(struct stmpe *stmpe, u8 reg, u8 length,
+			     const u8 *values);
+extern int stmpe_set_bits(struct stmpe *stmpe, u8 reg, u8 mask, u8 val);
+extern int stmpe_set_altfunc(struct stmpe *stmpe, u32 pins,
+			     enum stmpe_block block);
+extern int stmpe_enable(struct stmpe *stmpe, unsigned int blocks);
+extern int stmpe_disable(struct stmpe *stmpe, unsigned int blocks);
+
+struct matrix_keymap_data;
+
+/**
+ * struct stmpe_keypad_platform_data - STMPE keypad platform data
+ * @keymap_data: key map table and size
+ * @debounce_ms: debounce interval, in ms.  Maximum is
+ *		 %STMPE_KEYPAD_MAX_DEBOUNCE.
+ * @scan_count: number of key scanning cycles to confirm key data.
+ *		Maximum is %STMPE_KEYPAD_MAX_SCAN_COUNT.
+ * @no_autorepeat: disable key autorepeat
+ */
+struct stmpe_keypad_platform_data {
+	struct matrix_keymap_data *keymap_data;
+	unsigned int debounce_ms;
+	unsigned int scan_count;
+	bool no_autorepeat;
+};
+
+/**
+ * struct stmpe_gpio_platform_data - STMPE GPIO platform data
+ * @gpio_base: first gpio number assigned.  A maximum of
+ *	       %STMPE_NR_GPIOS GPIOs will be allocated.
+ */
+struct stmpe_gpio_platform_data {
+	int gpio_base;
+	void (*setup)(struct stmpe *stmpe, unsigned gpio_base);
+	void (*remove)(struct stmpe *stmpe, unsigned gpio_base);
+};
+
+/**
+ * struct stmpe_ts_platform_data - stmpe811 touch screen controller platform
+ * data
+ * @sample_time: ADC converstion time in number of clock.
+ * (0 -> 36 clocks, 1 -> 44 clocks, 2 -> 56 clocks, 3 -> 64 clocks,
+ * 4 -> 80 clocks, 5 -> 96 clocks, 6 -> 144 clocks),
+ * recommended is 4.
+ * @mod_12b: ADC Bit mode (0 -> 10bit ADC, 1 -> 12bit ADC)
+ * @ref_sel: ADC reference source
+ * (0 -> internal reference, 1 -> external reference)
+ * @adc_freq: ADC Clock speed
+ * (0 -> 1.625 MHz, 1 -> 3.25 MHz, 2 || 3 -> 6.5 MHz)
+ * @ave_ctrl: Sample average control
+ * (0 -> 1 sample, 1 -> 2 samples, 2 -> 4 samples, 3 -> 8 samples)
+ * @touch_det_delay: Touch detect interrupt delay
+ * (0 -> 10 us, 1 -> 50 us, 2 -> 100 us, 3 -> 500 us,
+ * 4-> 1 ms, 5 -> 5 ms, 6 -> 10 ms, 7 -> 50 ms)
+ * recommended is 3
+ * @settling: Panel driver settling time
+ * (0 -> 10 us, 1 -> 100 us, 2 -> 500 us, 3 -> 1 ms,
+ * 4 -> 5 ms, 5 -> 10 ms, 6 for 50 ms, 7 -> 100 ms)
+ * recommended is 2
+ * @fraction_z: Length of the fractional part in z
+ * (fraction_z ([0..7]) = Count of the fractional part)
+ * recommended is 7
+ * @i_drive: current limit value of the touchscreen drivers
+ * (0 -> 20 mA typical 35 mA max, 1 -> 50 mA typical 80 mA max)
+ *
+ * */
+struct stmpe_ts_platform_data {
+       u8 sample_time;
+       u8 mod_12b;
+       u8 ref_sel;
+       u8 adc_freq;
+       u8 ave_ctrl;
+       u8 touch_det_delay;
+       u8 settling;
+       u8 fraction_z;
+       u8 i_drive;
+};
+
+/**
+ * struct stmpe_platform_data - STMPE platform data
+ * @id: device id to distinguish between multiple STMPEs on the same board
+ * @blocks: bitmask of blocks to enable (use STMPE_BLOCK_*)
+ * @irq_trigger: IRQ trigger to use for the interrupt to the host
+ * @irq_invert_polarity: IRQ line is connected with reversed polarity
+ * @irq_base: base IRQ number.  %STMPE_NR_IRQS irqs will be used, or
+ *	      %STMPE_NR_INTERNAL_IRQS if the GPIO driver is not used.
+ * @gpio: GPIO-specific platform data
+ * @keypad: keypad-specific platform data
+ * @ts: touchscreen-specific platform data
+ */
+struct stmpe_platform_data {
+	int id;
+	unsigned int blocks;
+	int irq_base;
+	unsigned int irq_trigger;
+	bool irq_invert_polarity;
+
+	struct stmpe_gpio_platform_data *gpio;
+	struct stmpe_keypad_platform_data *keypad;
+	struct stmpe_ts_platform_data *ts;
+};
+
+#define STMPE_NR_INTERNAL_IRQS	9
+#define STMPE_INT_GPIO(x)	(STMPE_NR_INTERNAL_IRQS + (x))
+
+#define STMPE_NR_GPIOS		24
+#define STMPE_NR_IRQS		STMPE_INT_GPIO(STMPE_NR_GPIOS)
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 91f4debf5e2df904e7fade530bd1a6d182efd72c Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Mon, 12 Jul 2010 03:48:08 +0200
Subject: mfd: Add JZ4740 ADC driver

This patch adds a MFD driver for the JZ4740 ADC unit. The driver is used to
demultiplex IRQs and synchronize access to shared registers between the
battery, hwmon and (future) touchscreen driver.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/Kconfig        |   8 +
 drivers/mfd/Makefile       |   1 +
 drivers/mfd/jz4740-adc.c   | 384 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/jz4740-adc.h |  32 ++++
 4 files changed, 425 insertions(+)
 create mode 100644 drivers/mfd/jz4740-adc.c
 create mode 100644 include/linux/jz4740-adc.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 8f5145b4b56f..d59334f34bf6 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -514,6 +514,14 @@ config MFD_JANZ_CMODIO
 	  host many different types of MODULbus daughterboards, including
 	  CAN and GPIO controllers.
 
+config MFD_JZ4740_ADC
+	tristate "Support for the JZ4740 SoC ADC core"
+	select MFD_CORE
+	depends on MACH_JZ4740
+	help
+	  Say yes here if you want support for the ADC unit in the JZ4740 SoC.
+	  This driver is necessary for jz4740-battery and jz4740-hwmon driver.
+
 endif # MFD_SUPPORT
 
 menu "Multimedia Capabilities Port drivers"
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 4410747f7b5b..1f7077966994 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -72,3 +72,4 @@ obj-$(CONFIG_PMIC_ADP5520)	+= adp5520.o
 obj-$(CONFIG_LPC_SCH)		+= lpc_sch.o
 obj-$(CONFIG_MFD_RDC321X)	+= rdc321x-southbridge.o
 obj-$(CONFIG_MFD_JANZ_CMODIO)	+= janz-cmodio.o
+obj-$(CONFIG_MFD_JZ4740_ADC)	+= jz4740-adc.o
diff --git a/drivers/mfd/jz4740-adc.c b/drivers/mfd/jz4740-adc.c
new file mode 100644
index 000000000000..7a844aef5dcf
--- /dev/null
+++ b/drivers/mfd/jz4740-adc.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de>
+ * JZ4740 SoC ADC driver
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * This driver synchronizes access to the JZ4740 ADC core between the
+ * JZ4740 battery and hwmon drivers.
+ */
+
+#include <linux/err.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <linux/clk.h>
+#include <linux/mfd/core.h>
+
+#include <linux/jz4740-adc.h>
+
+
+#define JZ_REG_ADC_ENABLE	0x00
+#define JZ_REG_ADC_CFG		0x04
+#define JZ_REG_ADC_CTRL		0x08
+#define JZ_REG_ADC_STATUS	0x0c
+
+#define JZ_REG_ADC_TOUCHSCREEN_BASE	0x10
+#define JZ_REG_ADC_BATTERY_BASE	0x1c
+#define JZ_REG_ADC_HWMON_BASE	0x20
+
+#define JZ_ADC_ENABLE_TOUCH	BIT(2)
+#define JZ_ADC_ENABLE_BATTERY	BIT(1)
+#define JZ_ADC_ENABLE_ADCIN	BIT(0)
+
+enum {
+	JZ_ADC_IRQ_ADCIN = 0,
+	JZ_ADC_IRQ_BATTERY,
+	JZ_ADC_IRQ_TOUCH,
+	JZ_ADC_IRQ_PENUP,
+	JZ_ADC_IRQ_PENDOWN,
+};
+
+struct jz4740_adc {
+	struct resource *mem;
+	void __iomem *base;
+
+	int irq;
+	int irq_base;
+
+	struct clk *clk;
+	atomic_t clk_ref;
+
+	spinlock_t lock;
+};
+
+static inline void jz4740_adc_irq_set_masked(struct jz4740_adc *adc, int irq,
+	bool masked)
+{
+	unsigned long flags;
+	uint8_t val;
+
+	irq -= adc->irq_base;
+
+	spin_lock_irqsave(&adc->lock, flags);
+
+	val = readb(adc->base + JZ_REG_ADC_CTRL);
+	if (masked)
+		val |= BIT(irq);
+	else
+		val &= ~BIT(irq);
+	writeb(val, adc->base + JZ_REG_ADC_CTRL);
+
+	spin_unlock_irqrestore(&adc->lock, flags);
+}
+
+static void jz4740_adc_irq_mask(unsigned int irq)
+{
+	struct jz4740_adc *adc = get_irq_chip_data(irq);
+	jz4740_adc_irq_set_masked(adc, irq, true);
+}
+
+static void jz4740_adc_irq_unmask(unsigned int irq)
+{
+	struct jz4740_adc *adc = get_irq_chip_data(irq);
+	jz4740_adc_irq_set_masked(adc, irq, false);
+}
+
+static void jz4740_adc_irq_ack(unsigned int irq)
+{
+	struct jz4740_adc *adc = get_irq_chip_data(irq);
+
+	irq -= adc->irq_base;
+	writeb(BIT(irq), adc->base + JZ_REG_ADC_STATUS);
+}
+
+static struct irq_chip jz4740_adc_irq_chip = {
+	.name = "jz4740-adc",
+	.mask = jz4740_adc_irq_mask,
+	.unmask = jz4740_adc_irq_unmask,
+	.ack = jz4740_adc_irq_ack,
+};
+
+static void jz4740_adc_irq_demux(unsigned int irq, struct irq_desc *desc)
+{
+	struct jz4740_adc *adc = get_irq_desc_data(desc);
+	uint8_t status;
+	unsigned int i;
+
+	status = readb(adc->base + JZ_REG_ADC_STATUS);
+
+	for (i = 0; i < 5; ++i) {
+		if (status & BIT(i))
+			generic_handle_irq(adc->irq_base + i);
+	}
+}
+
+
+/* Refcounting for the ADC clock is done in here instead of in the clock
+ * framework, because it is the only clock which is shared between multiple
+ * devices and thus is the only clock which needs refcounting */
+static inline void jz4740_adc_clk_enable(struct jz4740_adc *adc)
+{
+	if (atomic_inc_return(&adc->clk_ref) == 1)
+		clk_enable(adc->clk);
+}
+
+static inline void jz4740_adc_clk_disable(struct jz4740_adc *adc)
+{
+	if (atomic_dec_return(&adc->clk_ref) == 0)
+		clk_disable(adc->clk);
+}
+
+static inline void jz4740_adc_set_enabled(struct jz4740_adc *adc, int engine,
+	bool enabled)
+{
+	unsigned long flags;
+	uint8_t val;
+
+	spin_lock_irqsave(&adc->lock, flags);
+
+	val = readb(adc->base + JZ_REG_ADC_ENABLE);
+	if (enabled)
+		val |= BIT(engine);
+	else
+		val &= BIT(engine);
+	writeb(val, adc->base + JZ_REG_ADC_ENABLE);
+
+	spin_unlock_irqrestore(&adc->lock, flags);
+}
+
+static int jz4740_adc_cell_enable(struct platform_device *pdev)
+{
+	struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent);
+
+	jz4740_adc_clk_enable(adc);
+	jz4740_adc_set_enabled(adc, pdev->id, true);
+
+	return 0;
+}
+
+static int jz4740_adc_cell_disable(struct platform_device *pdev)
+{
+	struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent);
+
+	jz4740_adc_set_enabled(adc, pdev->id, false);
+	jz4740_adc_clk_disable(adc);
+
+	return 0;
+}
+
+int jz4740_adc_set_config(struct device *dev, uint32_t mask, uint32_t val)
+{
+	struct jz4740_adc *adc = dev_get_drvdata(dev);
+	unsigned long flags;
+	uint32_t cfg;
+
+	if (!adc)
+		return -ENODEV;
+
+	spin_lock_irqsave(&adc->lock, flags);
+
+	cfg = readl(adc->base + JZ_REG_ADC_CFG);
+
+	cfg &= ~mask;
+	cfg |= val;
+
+	writel(cfg, adc->base + JZ_REG_ADC_CFG);
+
+	spin_unlock_irqrestore(&adc->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(jz4740_adc_set_config);
+
+static struct resource jz4740_hwmon_resources[] = {
+	{
+		.start = JZ_ADC_IRQ_ADCIN,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.start	= JZ_REG_ADC_HWMON_BASE,
+		.end	= JZ_REG_ADC_HWMON_BASE + 3,
+		.flags	= IORESOURCE_MEM,
+	},
+};
+
+static struct resource jz4740_battery_resources[] = {
+	{
+		.start = JZ_ADC_IRQ_BATTERY,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.start	= JZ_REG_ADC_BATTERY_BASE,
+		.end	= JZ_REG_ADC_BATTERY_BASE + 3,
+		.flags	= IORESOURCE_MEM,
+	},
+};
+
+const struct mfd_cell jz4740_adc_cells[] = {
+	{
+		.id = 0,
+		.name = "jz4740-hwmon",
+		.num_resources = ARRAY_SIZE(jz4740_hwmon_resources),
+		.resources = jz4740_hwmon_resources,
+		.platform_data = (void *)&jz4740_adc_cells[0],
+		.data_size = sizeof(struct mfd_cell),
+
+		.enable = jz4740_adc_cell_enable,
+		.disable = jz4740_adc_cell_disable,
+	},
+	{
+		.id = 1,
+		.name = "jz4740-battery",
+		.num_resources = ARRAY_SIZE(jz4740_battery_resources),
+		.resources = jz4740_battery_resources,
+		.platform_data = (void *)&jz4740_adc_cells[1],
+		.data_size = sizeof(struct mfd_cell),
+
+		.enable = jz4740_adc_cell_enable,
+		.disable = jz4740_adc_cell_disable,
+	},
+};
+
+static int __devinit jz4740_adc_probe(struct platform_device *pdev)
+{
+	int ret;
+	struct jz4740_adc *adc;
+	struct resource *mem_base;
+	int irq;
+
+	adc = kmalloc(sizeof(*adc), GFP_KERNEL);
+
+	adc->irq = platform_get_irq(pdev, 0);
+	if (adc->irq < 0) {
+		ret = adc->irq;
+		dev_err(&pdev->dev, "Failed to get platform irq: %d\n", ret);
+		goto err_free;
+	}
+
+	adc->irq_base = platform_get_irq(pdev, 1);
+	if (adc->irq_base < 0) {
+		ret = adc->irq_base;
+		dev_err(&pdev->dev, "Failed to get irq base: %d\n", ret);
+		goto err_free;
+	}
+
+	mem_base = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!mem_base) {
+		ret = -ENOENT;
+		dev_err(&pdev->dev, "Failed to get platform mmio resource\n");
+		goto err_free;
+	}
+
+	/* Only request the shared registers for the MFD driver */
+	adc->mem = request_mem_region(mem_base->start, JZ_REG_ADC_STATUS,
+					pdev->name);
+	if (!adc->mem) {
+		ret = -EBUSY;
+		dev_err(&pdev->dev, "Failed to request mmio memory region\n");
+		goto err_free;
+	}
+
+	adc->base = ioremap_nocache(adc->mem->start, resource_size(adc->mem));
+	if (!adc->base) {
+		ret = -EBUSY;
+		dev_err(&pdev->dev, "Failed to ioremap mmio memory\n");
+		goto err_release_mem_region;
+	}
+
+	adc->clk = clk_get(&pdev->dev, "adc");
+	if (IS_ERR(adc->clk)) {
+		ret = PTR_ERR(adc->clk);
+		dev_err(&pdev->dev, "Failed to get clock: %d\n", ret);
+		goto err_iounmap;
+	}
+
+	spin_lock_init(&adc->lock);
+	atomic_set(&adc->clk_ref, 0);
+
+	platform_set_drvdata(pdev, adc);
+
+	for (irq = adc->irq_base; irq < adc->irq_base + 5; ++irq) {
+		set_irq_chip_data(irq, adc);
+		set_irq_chip_and_handler(irq, &jz4740_adc_irq_chip,
+		    handle_level_irq);
+	}
+
+	set_irq_data(adc->irq, adc);
+	set_irq_chained_handler(adc->irq, jz4740_adc_irq_demux);
+
+	writeb(0x00, adc->base + JZ_REG_ADC_ENABLE);
+	writeb(0xff, adc->base + JZ_REG_ADC_CTRL);
+
+	return mfd_add_devices(&pdev->dev, 0, jz4740_adc_cells,
+		ARRAY_SIZE(jz4740_adc_cells), mem_base, adc->irq_base);
+
+err_iounmap:
+	platform_set_drvdata(pdev, NULL);
+	iounmap(adc->base);
+err_release_mem_region:
+	release_mem_region(adc->mem->start, resource_size(adc->mem));
+err_free:
+	kfree(adc);
+
+	return ret;
+}
+
+static int __devexit jz4740_adc_remove(struct platform_device *pdev)
+{
+	struct jz4740_adc *adc = platform_get_drvdata(pdev);
+
+	mfd_remove_devices(&pdev->dev);
+
+	set_irq_data(adc->irq, NULL);
+	set_irq_chained_handler(adc->irq, NULL);
+
+	iounmap(adc->base);
+	release_mem_region(adc->mem->start, resource_size(adc->mem));
+
+	clk_put(adc->clk);
+
+	platform_set_drvdata(pdev, NULL);
+
+	kfree(adc);
+
+	return 0;
+}
+
+struct platform_driver jz4740_adc_driver = {
+	.probe	= jz4740_adc_probe,
+	.remove = __devexit_p(jz4740_adc_remove),
+	.driver = {
+		.name = "jz4740-adc",
+		.owner = THIS_MODULE,
+	},
+};
+
+static int __init jz4740_adc_init(void)
+{
+	return platform_driver_register(&jz4740_adc_driver);
+}
+module_init(jz4740_adc_init);
+
+static void __exit jz4740_adc_exit(void)
+{
+	platform_driver_unregister(&jz4740_adc_driver);
+}
+module_exit(jz4740_adc_exit);
+
+MODULE_DESCRIPTION("JZ4740 SoC ADC driver");
+MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:jz4740-adc");
diff --git a/include/linux/jz4740-adc.h b/include/linux/jz4740-adc.h
new file mode 100644
index 000000000000..9053f95e9687
--- /dev/null
+++ b/include/linux/jz4740-adc.h
@@ -0,0 +1,32 @@
+
+#ifndef __LINUX_JZ4740_ADC
+#define __LINUX_JZ4740_ADC
+
+#include <linux/device.h>
+
+/*
+ * jz4740_adc_set_config - Configure a JZ4740 adc device
+ * @dev: Pointer to a jz4740-adc device
+ * @mask: Mask for the config value to be set
+ * @val: Value to be set
+ *
+ * This function can be used by the JZ4740 ADC mfd cells to configure their
+ * options in the shared config register.
+*/
+int jz4740_adc_set_config(struct device *dev, uint32_t mask, uint32_t val);
+
+#define JZ_ADC_CONFIG_SPZZ		BIT(31)
+#define JZ_ADC_CONFIG_EX_IN		BIT(30)
+#define JZ_ADC_CONFIG_DNUM_MASK		(0x7 << 16)
+#define JZ_ADC_CONFIG_DMA_ENABLE	BIT(15)
+#define JZ_ADC_CONFIG_XYZ_MASK		(0x2 << 13)
+#define JZ_ADC_CONFIG_SAMPLE_NUM_MASK	(0x7 << 10)
+#define JZ_ADC_CONFIG_CLKDIV_MASK	(0xf << 5)
+#define JZ_ADC_CONFIG_BAT_MB		BIT(4)
+
+#define JZ_ADC_CONFIG_DNUM(dnum)	((dnum) << 16)
+#define JZ_ADC_CONFIG_XYZ_OFFSET(dnum)	((xyz) << 13)
+#define JZ_ADC_CONFIG_SAMPLE_NUM(x)	((x) << 10)
+#define JZ_ADC_CONFIG_CLKDIV(div)	((div) << 5)
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 9accdc1bf239ef20c0fe12ceff2a7532374fd7cd Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Thu, 8 Jul 2010 21:09:51 +0900
Subject: mfd: Add additional WM8994 GPIO functions

Later revisions of the WM8994 add some more GPIO functions, define them
in the header file.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/linux/mfd/wm8994/gpio.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/wm8994/gpio.h b/include/linux/mfd/wm8994/gpio.h
index b4d4c22991e8..0c79b5ff4b5a 100644
--- a/include/linux/mfd/wm8994/gpio.h
+++ b/include/linux/mfd/wm8994/gpio.h
@@ -36,6 +36,10 @@
 #define WM8994_GP_FN_WSEQ_STATUS    16
 #define WM8994_GP_FN_FIFO_ERROR     17
 #define WM8994_GP_FN_OPCLK          18
+#define WM8994_GP_FN_THW	    19
+#define WM8994_GP_FN_DCS_DONE	    20
+#define WM8994_GP_FN_FLL1_OUT       21
+#define WM8994_GP_FN_FLL2_OUT       22
 
 #define WM8994_GPN_DIR                          0x8000  /* GPN_DIR */
 #define WM8994_GPN_DIR_MASK                     0x8000  /* GPN_DIR */
-- 
cgit v1.2.3-59-g8ed1b


From 5981f4e65cb455a820b3d07b8e4bac506233f3ea Mon Sep 17 00:00:00 2001
From: Sundar R Iyer <sundar.iyer@stericsson.com>
Date: Wed, 21 Jul 2010 11:41:07 +0530
Subject: mfd: Add stmpe auto sleep feature

Some STMPE devices support entering sleep mode automatically on a
specified timeout of inactivity on the I2C bus with the host system.

Acked-by: Linus Walleij <linus.walleij@stericsson.com>
Acked-by: Rabin Vincent <rabin.vincent@stericsson.com>
Signed-off-by: Sundar R Iyer <sundar.iyer@stericsson.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/stmpe.c       | 70 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/mfd/stmpe.h       |  7 +++++
 include/linux/mfd/stmpe.h |  4 +++
 3 files changed, 81 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c
index a7f3099fdcfd..0754c5e91995 100644
--- a/drivers/mfd/stmpe.c
+++ b/drivers/mfd/stmpe.c
@@ -455,6 +455,67 @@ static struct stmpe_variant_block stmpe1601_blocks[] = {
 	},
 };
 
+/* supported autosleep timeout delay (in msecs) */
+static const int stmpe_autosleep_delay[] = {
+	4, 16, 32, 64, 128, 256, 512, 1024,
+};
+
+static int stmpe_round_timeout(int timeout)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(stmpe_autosleep_delay); i++) {
+		if (stmpe_autosleep_delay[i] >= timeout)
+			return i;
+	}
+
+	/*
+	 * requests for delays longer than supported should not return the
+	 * longest supported delay
+	 */
+	return -EINVAL;
+}
+
+static int stmpe_autosleep(struct stmpe *stmpe, int autosleep_timeout)
+{
+	int ret;
+
+	if (!stmpe->variant->enable_autosleep)
+		return -ENOSYS;
+
+	mutex_lock(&stmpe->lock);
+	ret = stmpe->variant->enable_autosleep(stmpe, autosleep_timeout);
+	mutex_unlock(&stmpe->lock);
+
+	return ret;
+}
+
+/*
+ * Both stmpe 1601/2403 support same layout for autosleep
+ */
+static int stmpe1601_autosleep(struct stmpe *stmpe,
+		int autosleep_timeout)
+{
+	int ret, timeout;
+
+	/* choose the best available timeout */
+	timeout = stmpe_round_timeout(autosleep_timeout);
+	if (timeout < 0) {
+		dev_err(stmpe->dev, "invalid timeout\n");
+		return timeout;
+	}
+
+	ret = __stmpe_set_bits(stmpe, STMPE1601_REG_SYS_CTRL2,
+			STMPE1601_AUTOSLEEP_TIMEOUT_MASK,
+			timeout);
+	if (ret < 0)
+		return ret;
+
+	return __stmpe_set_bits(stmpe, STMPE1601_REG_SYS_CTRL2,
+			STPME1601_AUTOSLEEP_ENABLE,
+			STPME1601_AUTOSLEEP_ENABLE);
+}
+
 static int stmpe1601_enable(struct stmpe *stmpe, unsigned int blocks,
 			    bool enable)
 {
@@ -497,6 +558,7 @@ static struct stmpe_variant_info stmpe1601 = {
 	.num_irqs	= STMPE1601_NR_INTERNAL_IRQS,
 	.enable		= stmpe1601_enable,
 	.get_altfunc	= stmpe1601_get_altfunc,
+	.enable_autosleep	= stmpe1601_autosleep,
 };
 
 /*
@@ -589,6 +651,7 @@ static struct stmpe_variant_info stmpe2403 = {
 	.num_irqs	= STMPE24XX_NR_INTERNAL_IRQS,
 	.enable		= stmpe24xx_enable,
 	.get_altfunc	= stmpe24xx_get_altfunc,
+	.enable_autosleep	= stmpe1601_autosleep, /* same as stmpe1601 */
 };
 
 static struct stmpe_variant_info *stmpe_variant_info[] = {
@@ -731,6 +794,7 @@ static void stmpe_irq_remove(struct stmpe *stmpe)
 static int __devinit stmpe_chip_init(struct stmpe *stmpe)
 {
 	unsigned int irq_trigger = stmpe->pdata->irq_trigger;
+	int autosleep_timeout = stmpe->pdata->autosleep_timeout;
 	struct stmpe_variant_info *variant = stmpe->variant;
 	u8 icr = STMPE_ICR_LSB_GIM;
 	unsigned int id;
@@ -766,6 +830,12 @@ static int __devinit stmpe_chip_init(struct stmpe *stmpe)
 	if (stmpe->pdata->irq_invert_polarity)
 		icr ^= STMPE_ICR_LSB_HIGH;
 
+	if (stmpe->pdata->autosleep) {
+		ret = stmpe_autosleep(stmpe, autosleep_timeout);
+		if (ret)
+			return ret;
+	}
+
 	return stmpe_reg_write(stmpe, stmpe->regs[STMPE_IDX_ICR_LSB], icr);
 }
 
diff --git a/drivers/mfd/stmpe.h b/drivers/mfd/stmpe.h
index 991f0ecbeb34..0dbdc4e8cd77 100644
--- a/drivers/mfd/stmpe.h
+++ b/drivers/mfd/stmpe.h
@@ -49,6 +49,7 @@ struct stmpe_variant_block {
  *		Called with the I/O lock held.
  * @get_altfunc: callback to get the alternate function number for the
  *		 specific block
+ * @enable_autosleep: callback to configure autosleep with specified timeout
  */
 struct stmpe_variant_info {
 	const char *name;
@@ -62,6 +63,7 @@ struct stmpe_variant_info {
 	int num_irqs;
 	int (*enable)(struct stmpe *stmpe, unsigned int blocks, bool enable);
 	int (*get_altfunc)(struct stmpe *stmpe, enum stmpe_block block);
+	int (*enable_autosleep)(struct stmpe *stmpe, int autosleep_timeout);
 };
 
 #define STMPE_ICR_LSB_HIGH	(1 << 2)
@@ -118,6 +120,7 @@ struct stmpe_variant_info {
 #define STMPE1601_NR_INTERNAL_IRQS	9
 
 #define STMPE1601_REG_SYS_CTRL			0x02
+#define STMPE1601_REG_SYS_CTRL2			0x03
 #define STMPE1601_REG_ICR_LSB			0x11
 #define STMPE1601_REG_IER_LSB			0x13
 #define STMPE1601_REG_ISR_MSB			0x14
@@ -137,6 +140,10 @@ struct stmpe_variant_info {
 #define STMPE1601_SYS_CTRL_ENABLE_KPC		(1 << 1)
 #define STMPE1601_SYSCON_ENABLE_SPWM		(1 << 0)
 
+/* The 1601/2403 share the same masks */
+#define STMPE1601_AUTOSLEEP_TIMEOUT_MASK	(0x7)
+#define STPME1601_AUTOSLEEP_ENABLE		(1 << 3)
+
 /*
  * STMPE24xx
  */
diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h
index 90faa98b577f..39ca7588659b 100644
--- a/include/linux/mfd/stmpe.h
+++ b/include/linux/mfd/stmpe.h
@@ -170,6 +170,8 @@ struct stmpe_ts_platform_data {
  * @blocks: bitmask of blocks to enable (use STMPE_BLOCK_*)
  * @irq_trigger: IRQ trigger to use for the interrupt to the host
  * @irq_invert_polarity: IRQ line is connected with reversed polarity
+ * @autosleep: bool to enable/disable stmpe autosleep
+ * @autosleep_timeout: inactivity timeout in milliseconds for autosleep
  * @irq_base: base IRQ number.  %STMPE_NR_IRQS irqs will be used, or
  *	      %STMPE_NR_INTERNAL_IRQS if the GPIO driver is not used.
  * @gpio: GPIO-specific platform data
@@ -182,6 +184,8 @@ struct stmpe_platform_data {
 	int irq_base;
 	unsigned int irq_trigger;
 	bool irq_invert_polarity;
+	bool autosleep;
+	int autosleep_timeout;
 
 	struct stmpe_gpio_platform_data *gpio;
 	struct stmpe_keypad_platform_data *keypad;
-- 
cgit v1.2.3-59-g8ed1b


From 3b16bb539c558cd523ea380653d4bf24a8c9e833 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 2 Aug 2010 11:14:17 +0200
Subject: mfd: New mc13783 function exposing flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is needed for the mc13783-adc driver to decide if a touch screen is
connected.  If so some channels are not available as generic hwmon inputs.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/mc13783-core.c  | 6 ++++++
 include/linux/mfd/mc13783.h | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/mc13783-core.c b/drivers/mfd/mc13783-core.c
index fecf38a4f025..b0778dcca917 100644
--- a/drivers/mfd/mc13783-core.c
+++ b/drivers/mfd/mc13783-core.c
@@ -226,6 +226,12 @@ int mc13783_reg_rmw(struct mc13783 *mc13783, unsigned int offset,
 }
 EXPORT_SYMBOL(mc13783_reg_rmw);
 
+int mc13783_get_flags(struct mc13783 *mc13783)
+{
+	return mc13783->flags;
+}
+EXPORT_SYMBOL(mc13783_get_flags);
+
 int mc13783_irq_mask(struct mc13783 *mc13783, int irq)
 {
 	int ret;
diff --git a/include/linux/mfd/mc13783.h b/include/linux/mfd/mc13783.h
index 4a894f688549..0fa44fb8dd26 100644
--- a/include/linux/mfd/mc13783.h
+++ b/include/linux/mfd/mc13783.h
@@ -21,6 +21,8 @@ int mc13783_reg_write(struct mc13783 *mc13783, unsigned int offset, u32 val);
 int mc13783_reg_rmw(struct mc13783 *mc13783, unsigned int offset,
 		u32 mask, u32 val);
 
+int mc13783_get_flags(struct mc13783 *mc13783);
+
 int mc13783_irq_request(struct mc13783 *mc13783, int irq,
 		irq_handler_t handler, const char *name, void *dev);
 int mc13783_irq_request_nounmask(struct mc13783 *mc13783, int irq,
-- 
cgit v1.2.3-59-g8ed1b


From b6e6d54cab7633dd2216ede77ccd00cdaebd77ad Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 2 Aug 2010 15:48:04 +0200
Subject: mfd: Get rid of now unused mc13783 private header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds all remaining definitions that are used by the core driver
to the .c file.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/mc13783-core.c          |  24 +++-
 include/linux/mfd/mc13783-private.h | 220 ------------------------------------
 2 files changed, 23 insertions(+), 221 deletions(-)
 delete mode 100644 include/linux/mfd/mc13783-private.h

(limited to 'include/linux')

diff --git a/drivers/mfd/mc13783-core.c b/drivers/mfd/mc13783-core.c
index b0778dcca917..6df34989c1f6 100644
--- a/drivers/mfd/mc13783-core.c
+++ b/drivers/mfd/mc13783-core.c
@@ -11,9 +11,31 @@
  */
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/mutex.h>
+#include <linux/interrupt.h>
 #include <linux/spi/spi.h>
 #include <linux/mfd/core.h>
-#include <linux/mfd/mc13783-private.h>
+#include <linux/mfd/mc13783.h>
+
+struct mc13783 {
+	struct spi_device *spidev;
+	struct mutex lock;
+	int irq;
+	int flags;
+
+	irq_handler_t irqhandler[MC13783_NUM_IRQ];
+	void *irqdata[MC13783_NUM_IRQ];
+
+	/* XXX these should go as platformdata to the regulator subdevice */
+	struct mc13783_regulator_init_data *regulators;
+	int num_regulators;
+};
+
+#define MC13783_REG_REVISION			 7
+#define MC13783_REG_ADC_0			43
+#define MC13783_REG_ADC_1			44
+#define MC13783_REG_ADC_2			45
 
 #define MC13783_IRQSTAT0	0
 #define MC13783_IRQSTAT0_ADCDONEI	(1 << 0)
diff --git a/include/linux/mfd/mc13783-private.h b/include/linux/mfd/mc13783-private.h
deleted file mode 100644
index 95cf9360553f..000000000000
--- a/include/linux/mfd/mc13783-private.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright 2009 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de>
- *
- * Initial development of this code was funded by
- * Phytec Messtechnik GmbH, http://www.phytec.de
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef __LINUX_MFD_MC13783_PRIV_H
-#define __LINUX_MFD_MC13783_PRIV_H
-
-#include <linux/platform_device.h>
-#include <linux/mfd/mc13783.h>
-#include <linux/mutex.h>
-#include <linux/interrupt.h>
-
-struct mc13783 {
-	struct spi_device *spidev;
-	struct mutex lock;
-	int irq;
-	int flags;
-
-	irq_handler_t irqhandler[MC13783_NUM_IRQ];
-	void *irqdata[MC13783_NUM_IRQ];
-
-	/* XXX these should go as platformdata to the regulator subdevice */
-	struct mc13783_regulator_init_data *regulators;
-	int num_regulators;
-};
-
-#define MC13783_REG_INTERRUPT_STATUS_0		 0
-#define MC13783_REG_INTERRUPT_MASK_0		 1
-#define MC13783_REG_INTERRUPT_SENSE_0		 2
-#define MC13783_REG_INTERRUPT_STATUS_1		 3
-#define MC13783_REG_INTERRUPT_MASK_1		 4
-#define MC13783_REG_INTERRUPT_SENSE_1		 5
-#define MC13783_REG_POWER_UP_MODE_SENSE		 6
-#define MC13783_REG_REVISION			 7
-#define MC13783_REG_SEMAPHORE			 8
-#define MC13783_REG_ARBITRATION_PERIPHERAL_AUDIO 9
-#define MC13783_REG_ARBITRATION_SWITCHERS	10
-#define MC13783_REG_ARBITRATION_REGULATORS_0	11
-#define MC13783_REG_ARBITRATION_REGULATORS_1	12
-#define MC13783_REG_POWER_CONTROL_0		13
-#define MC13783_REG_POWER_CONTROL_1		14
-#define MC13783_REG_POWER_CONTROL_2		15
-#define MC13783_REG_REGEN_ASSIGNMENT		16
-#define MC13783_REG_CONTROL_SPARE		17
-#define MC13783_REG_MEMORY_A			18
-#define MC13783_REG_MEMORY_B			19
-#define MC13783_REG_RTC_TIME			20
-#define MC13783_REG_RTC_ALARM			21
-#define MC13783_REG_RTC_DAY			22
-#define MC13783_REG_RTC_DAY_ALARM		23
-#define MC13783_REG_SWITCHERS_0			24
-#define MC13783_REG_SWITCHERS_1			25
-#define MC13783_REG_SWITCHERS_2			26
-#define MC13783_REG_SWITCHERS_3			27
-#define MC13783_REG_SWITCHERS_4			28
-#define MC13783_REG_SWITCHERS_5			29
-#define MC13783_REG_REGULATOR_SETTING_0		30
-#define MC13783_REG_REGULATOR_SETTING_1		31
-#define MC13783_REG_REGULATOR_MODE_0		32
-#define MC13783_REG_REGULATOR_MODE_1		33
-#define MC13783_REG_POWER_MISCELLANEOUS		34
-#define MC13783_REG_POWER_SPARE			35
-#define MC13783_REG_AUDIO_RX_0			36
-#define MC13783_REG_AUDIO_RX_1			37
-#define MC13783_REG_AUDIO_TX			38
-#define MC13783_REG_AUDIO_SSI_NETWORK		39
-#define MC13783_REG_AUDIO_CODEC			40
-#define MC13783_REG_AUDIO_STEREO_DAC		41
-#define MC13783_REG_AUDIO_SPARE			42
-#define MC13783_REG_ADC_0			43
-#define MC13783_REG_ADC_1			44
-#define MC13783_REG_ADC_2			45
-#define MC13783_REG_ADC_3			46
-#define MC13783_REG_ADC_4			47
-#define MC13783_REG_CHARGER			48
-#define MC13783_REG_USB				49
-#define MC13783_REG_CHARGE_USB_SPARE		50
-#define MC13783_REG_LED_CONTROL_0		51
-#define MC13783_REG_LED_CONTROL_1		52
-#define MC13783_REG_LED_CONTROL_2		53
-#define MC13783_REG_LED_CONTROL_3		54
-#define MC13783_REG_LED_CONTROL_4		55
-#define MC13783_REG_LED_CONTROL_5		56
-#define MC13783_REG_SPARE			57
-#define MC13783_REG_TRIM_0			58
-#define MC13783_REG_TRIM_1			59
-#define MC13783_REG_TEST_0			60
-#define MC13783_REG_TEST_1			61
-#define MC13783_REG_TEST_2			62
-#define MC13783_REG_TEST_3			63
-#define MC13783_REG_NB				64
-
-/*
- * Reg Regulator Mode 0
- */
-#define MC13783_REGCTRL_VAUDIO_EN	(1 << 0)
-#define MC13783_REGCTRL_VAUDIO_STBY	(1 << 1)
-#define MC13783_REGCTRL_VAUDIO_MODE	(1 << 2)
-#define MC13783_REGCTRL_VIOHI_EN	(1 << 3)
-#define MC13783_REGCTRL_VIOHI_STBY	(1 << 4)
-#define MC13783_REGCTRL_VIOHI_MODE	(1 << 5)
-#define MC13783_REGCTRL_VIOLO_EN	(1 << 6)
-#define MC13783_REGCTRL_VIOLO_STBY 	(1 << 7)
-#define MC13783_REGCTRL_VIOLO_MODE	(1 << 8)
-#define MC13783_REGCTRL_VDIG_EN		(1 << 9)
-#define MC13783_REGCTRL_VDIG_STBY	(1 << 10)
-#define MC13783_REGCTRL_VDIG_MODE	(1 << 11)
-#define MC13783_REGCTRL_VGEN_EN		(1 << 12)
-#define MC13783_REGCTRL_VGEN_STBY	(1 << 13)
-#define MC13783_REGCTRL_VGEN_MODE	(1 << 14)
-#define MC13783_REGCTRL_VRFDIG_EN	(1 << 15)
-#define MC13783_REGCTRL_VRFDIG_STBY	(1 << 16)
-#define MC13783_REGCTRL_VRFDIG_MODE	(1 << 17)
-#define MC13783_REGCTRL_VRFREF_EN	(1 << 18)
-#define MC13783_REGCTRL_VRFREF_STBY	(1 << 19)
-#define MC13783_REGCTRL_VRFREF_MODE	(1 << 20)
-#define MC13783_REGCTRL_VRFCP_EN	(1 << 21)
-#define MC13783_REGCTRL_VRFCP_STBY	(1 << 22)
-#define MC13783_REGCTRL_VRFCP_MODE	(1 << 23)
-
-/*
- * Reg Regulator Mode 1
- */
-#define MC13783_REGCTRL_VSIM_EN		(1 << 0)
-#define MC13783_REGCTRL_VSIM_STBY	(1 << 1)
-#define MC13783_REGCTRL_VSIM_MODE	(1 << 2)
-#define MC13783_REGCTRL_VESIM_EN	(1 << 3)
-#define MC13783_REGCTRL_VESIM_STBY	(1 << 4)
-#define MC13783_REGCTRL_VESIM_MODE	(1 << 5)
-#define MC13783_REGCTRL_VCAM_EN		(1 << 6)
-#define MC13783_REGCTRL_VCAM_STBY	(1 << 7)
-#define MC13783_REGCTRL_VCAM_MODE	(1 << 8)
-#define	MC13783_REGCTRL_VRFBG_EN	(1 << 9)
-#define MC13783_REGCTRL_VRFBG_STBY	(1 << 10)
-#define MC13783_REGCTRL_VVIB_EN		(1 << 11)
-#define MC13783_REGCTRL_VRF1_EN		(1 << 12)
-#define MC13783_REGCTRL_VRF1_STBY	(1 << 13)
-#define MC13783_REGCTRL_VRF1_MODE	(1 << 14)
-#define MC13783_REGCTRL_VRF2_EN		(1 << 15)
-#define MC13783_REGCTRL_VRF2_STBY	(1 << 16)
-#define MC13783_REGCTRL_VRF2_MODE	(1 << 17)
-#define MC13783_REGCTRL_VMMC1_EN	(1 << 18)
-#define MC13783_REGCTRL_VMMC1_STBY	(1 << 19)
-#define MC13783_REGCTRL_VMMC1_MODE	(1 << 20)
-#define MC13783_REGCTRL_VMMC2_EN	(1 << 21)
-#define MC13783_REGCTRL_VMMC2_STBY	(1 << 22)
-#define MC13783_REGCTRL_VMMC2_MODE	(1 << 23)
-
-/*
- * Reg Regulator Misc.
- */
-#define MC13783_REGCTRL_GPO1_EN		(1 << 6)
-#define MC13783_REGCTRL_GPO2_EN		(1 << 8)
-#define MC13783_REGCTRL_GPO3_EN		(1 << 10)
-#define MC13783_REGCTRL_GPO4_EN		(1 << 12)
-#define MC13783_REGCTRL_VIBPINCTRL	(1 << 14)
-
-/*
- * Reg Switcher 4
- */
-#define MC13783_SWCTRL_SW1A_MODE	(1 << 0)
-#define MC13783_SWCTRL_SW1A_STBY_MODE	(1 << 2)
-#define MC13783_SWCTRL_SW1A_DVS_SPEED	(1 << 6)
-#define MC13783_SWCTRL_SW1A_PANIC_MODE	(1 << 8)
-#define MC13783_SWCTRL_SW1A_SOFTSTART	(1 << 9)
-#define MC13783_SWCTRL_SW1B_MODE	(1 << 10)
-#define MC13783_SWCTRL_SW1B_STBY_MODE	(1 << 12)
-#define MC13783_SWCTRL_SW1B_DVS_SPEED	(1 << 14)
-#define MC13783_SWCTRL_SW1B_PANIC_MODE	(1 << 16)
-#define MC13783_SWCTRL_SW1B_SOFTSTART	(1 << 17)
-#define MC13783_SWCTRL_PLL_EN		(1 << 18)
-#define MC13783_SWCTRL_PLL_FACTOR	(1 << 19)
-
-/*
- * Reg Switcher 5
- */
-#define MC13783_SWCTRL_SW2A_MODE	(1 << 0)
-#define MC13783_SWCTRL_SW2A_STBY_MODE	(1 << 2)
-#define MC13783_SWCTRL_SW2A_DVS_SPEED	(1 << 6)
-#define MC13783_SWCTRL_SW2A_PANIC_MODE	(1 << 8)
-#define MC13783_SWCTRL_SW2A_SOFTSTART	(1 << 9)
-#define MC13783_SWCTRL_SW2B_MODE	(1 << 10)
-#define MC13783_SWCTRL_SW2B_STBY_MODE	(1 << 12)
-#define MC13783_SWCTRL_SW2B_DVS_SPEED	(1 << 14)
-#define MC13783_SWCTRL_SW2B_PANIC_MODE	(1 << 16)
-#define MC13783_SWCTRL_SW2B_SOFTSTART	(1 << 17)
-#define MC13783_SWSET_SW3		(1 << 18)
-#define MC13783_SWCTRL_SW3_EN		(1 << 20)
-#define MC13783_SWCTRL_SW3_STBY		(1 << 21)
-#define MC13783_SWCTRL_SW3_MODE		(1 << 22)
-
-static inline int mc13783_set_bits(struct mc13783 *mc13783, unsigned int offset,
-		u32 mask, u32 val)
-{
-	int ret;
-	mc13783_lock(mc13783);
-	ret = mc13783_reg_rmw(mc13783, offset, mask, val);
-	mc13783_unlock(mc13783);
-
-	return ret;
-}
-
-#endif /* __LINUX_MFD_MC13783_PRIV_H */
-- 
cgit v1.2.3-59-g8ed1b


From c6c193326384aecfd668c8f271799a44dbc74c1a Mon Sep 17 00:00:00 2001
From: Mike Rapoport <mike@compulab.co.il>
Date: Wed, 11 Aug 2010 01:11:04 +0200
Subject: mfd: Add TPS6586x driver

Add mfd core driver for TPS6586x PMICs family.
The driver provides I/O access for the sub-device drivers and performs
regstration of the sub-devices based on the platform requirements.
In addition it implements GPIOlib interface for the chip GPIOs.

TODO:
        - add interrupt support
        - add platform data for PWM, backlight leds and charger

Signed-off-by: Mike Rapoport <mike@compulab.co.il>
Signed-off-by: Mike Rapoport <mike.rapoport@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/Kconfig          |  14 ++
 drivers/mfd/Makefile         |   1 +
 drivers/mfd/tps6586x.c       | 375 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/tps6586x.h |  47 ++++++
 4 files changed, 437 insertions(+)
 create mode 100644 drivers/mfd/tps6586x.c
 create mode 100644 include/linux/mfd/tps6586x.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 23a891f396e4..d75909e7cf2f 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -531,6 +531,20 @@ config MFD_JZ4740_ADC
 	  Say yes here if you want support for the ADC unit in the JZ4740 SoC.
 	  This driver is necessary for jz4740-battery and jz4740-hwmon driver.
 
+config MFD_TPS6586X
+	tristate "TPS6586x Power Management chips"
+	depends on I2C && GPIOLIB
+	select MFD_CORE
+	help
+	  If you say yes here you get support for the TPS6586X series of
+	  Power Management chips.
+	  This driver provides common support for accessing the device,
+	  additional drivers must be enabled in order to use the
+	  functionality of the device.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called tps6586x.
+
 endif # MFD_SUPPORT
 
 menu "Multimedia Capabilities Port drivers"
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index cc7cce0976f3..1e48d7e3e884 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -74,3 +74,4 @@ obj-$(CONFIG_LPC_SCH)		+= lpc_sch.o
 obj-$(CONFIG_MFD_RDC321X)	+= rdc321x-southbridge.o
 obj-$(CONFIG_MFD_JANZ_CMODIO)	+= janz-cmodio.o
 obj-$(CONFIG_MFD_JZ4740_ADC)	+= jz4740-adc.o
+obj-$(CONFIG_MFD_TPS6586X)	+= tps6586x.o
diff --git a/drivers/mfd/tps6586x.c b/drivers/mfd/tps6586x.c
new file mode 100644
index 000000000000..4cde31e6a252
--- /dev/null
+++ b/drivers/mfd/tps6586x.c
@@ -0,0 +1,375 @@
+/*
+ * Core driver for TI TPS6586x PMIC family
+ *
+ * Copyright (c) 2010 CompuLab Ltd.
+ * Mike Rapoport <mike@compulab.co.il>
+ *
+ * Based on da903x.c.
+ * Copyright (C) 2008 Compulab, Ltd.
+ * Mike Rapoport <mike@compulab.co.il>
+ * Copyright (C) 2006-2008 Marvell International Ltd.
+ * Eric Miao <eric.miao@marvell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+
+#include <linux/mfd/core.h>
+#include <linux/mfd/tps6586x.h>
+
+/* GPIO control registers */
+#define TPS6586X_GPIOSET1	0x5d
+#define TPS6586X_GPIOSET2	0x5e
+
+/* device id */
+#define TPS6586X_VERSIONCRC	0xcd
+#define TPS658621A_VERSIONCRC	0x15
+
+struct tps6586x {
+	struct mutex		lock;
+	struct device		*dev;
+	struct i2c_client	*client;
+
+	struct gpio_chip	gpio;
+};
+
+static inline int __tps6586x_read(struct i2c_client *client,
+				  int reg, uint8_t *val)
+{
+	int ret;
+
+	ret = i2c_smbus_read_byte_data(client, reg);
+	if (ret < 0) {
+		dev_err(&client->dev, "failed reading at 0x%02x\n", reg);
+		return ret;
+	}
+
+	*val = (uint8_t)ret;
+
+	return 0;
+}
+
+static inline int __tps6586x_reads(struct i2c_client *client, int reg,
+				   int len, uint8_t *val)
+{
+	int ret;
+
+	ret = i2c_smbus_read_i2c_block_data(client, reg, len, val);
+	if (ret < 0) {
+		dev_err(&client->dev, "failed reading from 0x%02x\n", reg);
+		return ret;
+	}
+
+	return 0;
+}
+
+static inline int __tps6586x_write(struct i2c_client *client,
+				 int reg, uint8_t val)
+{
+	int ret;
+
+	ret = i2c_smbus_write_byte_data(client, reg, val);
+	if (ret < 0) {
+		dev_err(&client->dev, "failed writing 0x%02x to 0x%02x\n",
+				val, reg);
+		return ret;
+	}
+
+	return 0;
+}
+
+static inline int __tps6586x_writes(struct i2c_client *client, int reg,
+				  int len, uint8_t *val)
+{
+	int ret;
+
+	ret = i2c_smbus_write_i2c_block_data(client, reg, len, val);
+	if (ret < 0) {
+		dev_err(&client->dev, "failed writings to 0x%02x\n", reg);
+		return ret;
+	}
+
+	return 0;
+}
+
+int tps6586x_write(struct device *dev, int reg, uint8_t val)
+{
+	return __tps6586x_write(to_i2c_client(dev), reg, val);
+}
+EXPORT_SYMBOL_GPL(tps6586x_write);
+
+int tps6586x_writes(struct device *dev, int reg, int len, uint8_t *val)
+{
+	return __tps6586x_writes(to_i2c_client(dev), reg, len, val);
+}
+EXPORT_SYMBOL_GPL(tps6586x_writes);
+
+int tps6586x_read(struct device *dev, int reg, uint8_t *val)
+{
+	return __tps6586x_read(to_i2c_client(dev), reg, val);
+}
+EXPORT_SYMBOL_GPL(tps6586x_read);
+
+int tps6586x_reads(struct device *dev, int reg, int len, uint8_t *val)
+{
+	return __tps6586x_reads(to_i2c_client(dev), reg, len, val);
+}
+EXPORT_SYMBOL_GPL(tps6586x_reads);
+
+int tps6586x_set_bits(struct device *dev, int reg, uint8_t bit_mask)
+{
+	struct tps6586x *tps6586x = dev_get_drvdata(dev);
+	uint8_t reg_val;
+	int ret = 0;
+
+	mutex_lock(&tps6586x->lock);
+
+	ret = __tps6586x_read(to_i2c_client(dev), reg, &reg_val);
+	if (ret)
+		goto out;
+
+	if ((reg_val & bit_mask) == 0) {
+		reg_val |= bit_mask;
+		ret = __tps6586x_write(to_i2c_client(dev), reg, reg_val);
+	}
+out:
+	mutex_unlock(&tps6586x->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tps6586x_set_bits);
+
+int tps6586x_clr_bits(struct device *dev, int reg, uint8_t bit_mask)
+{
+	struct tps6586x *tps6586x = dev_get_drvdata(dev);
+	uint8_t reg_val;
+	int ret = 0;
+
+	mutex_lock(&tps6586x->lock);
+
+	ret = __tps6586x_read(to_i2c_client(dev), reg, &reg_val);
+	if (ret)
+		goto out;
+
+	if (reg_val & bit_mask) {
+		reg_val &= ~bit_mask;
+		ret = __tps6586x_write(to_i2c_client(dev), reg, reg_val);
+	}
+out:
+	mutex_unlock(&tps6586x->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tps6586x_clr_bits);
+
+int tps6586x_update(struct device *dev, int reg, uint8_t val, uint8_t mask)
+{
+	struct tps6586x *tps6586x = dev_get_drvdata(dev);
+	uint8_t reg_val;
+	int ret = 0;
+
+	mutex_lock(&tps6586x->lock);
+
+	ret = __tps6586x_read(tps6586x->client, reg, &reg_val);
+	if (ret)
+		goto out;
+
+	if ((reg_val & mask) != val) {
+		reg_val = (reg_val & ~mask) | val;
+		ret = __tps6586x_write(tps6586x->client, reg, reg_val);
+	}
+out:
+	mutex_unlock(&tps6586x->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tps6586x_update);
+
+static int tps6586x_gpio_get(struct gpio_chip *gc, unsigned offset)
+{
+	struct tps6586x *tps6586x = container_of(gc, struct tps6586x, gpio);
+	uint8_t val;
+	int ret;
+
+	ret = __tps6586x_read(tps6586x->client, TPS6586X_GPIOSET2, &val);
+	if (ret)
+		return ret;
+
+	return !!(val & (1 << offset));
+}
+
+
+static void tps6586x_gpio_set(struct gpio_chip *chip, unsigned offset,
+			      int value)
+{
+	struct tps6586x *tps6586x = container_of(chip, struct tps6586x, gpio);
+
+	__tps6586x_write(tps6586x->client, TPS6586X_GPIOSET2,
+			 value << offset);
+}
+
+static int tps6586x_gpio_output(struct gpio_chip *gc, unsigned offset,
+				int value)
+{
+	struct tps6586x *tps6586x = container_of(gc, struct tps6586x, gpio);
+	uint8_t val, mask;
+
+	tps6586x_gpio_set(gc, offset, value);
+
+	val = 0x1 << (offset * 2);
+	mask = 0x3 << (offset * 2);
+
+	return tps6586x_update(tps6586x->dev, TPS6586X_GPIOSET1, val, mask);
+}
+
+static void tps6586x_gpio_init(struct tps6586x *tps6586x, int gpio_base)
+{
+	int ret;
+
+	if (!gpio_base)
+		return;
+
+	tps6586x->gpio.owner		= THIS_MODULE;
+	tps6586x->gpio.label		= tps6586x->client->name;
+	tps6586x->gpio.dev		= tps6586x->dev;
+	tps6586x->gpio.base		= gpio_base;
+	tps6586x->gpio.ngpio		= 4;
+	tps6586x->gpio.can_sleep	= 1;
+
+	/* FIXME: add handling of GPIOs as dedicated inputs */
+	tps6586x->gpio.direction_output	= tps6586x_gpio_output;
+	tps6586x->gpio.set		= tps6586x_gpio_set;
+	tps6586x->gpio.get		= tps6586x_gpio_get;
+
+	ret = gpiochip_add(&tps6586x->gpio);
+	if (ret)
+		dev_warn(tps6586x->dev, "GPIO registration failed: %d\n", ret);
+}
+
+static int __remove_subdev(struct device *dev, void *unused)
+{
+	platform_device_unregister(to_platform_device(dev));
+	return 0;
+}
+
+static int tps6586x_remove_subdevs(struct tps6586x *tps6586x)
+{
+	return device_for_each_child(tps6586x->dev, NULL, __remove_subdev);
+}
+
+static int __devinit tps6586x_add_subdevs(struct tps6586x *tps6586x,
+					  struct tps6586x_platform_data *pdata)
+{
+	struct tps6586x_subdev_info *subdev;
+	struct platform_device *pdev;
+	int i, ret = 0;
+
+	for (i = 0; i < pdata->num_subdevs; i++) {
+		subdev = &pdata->subdevs[i];
+
+		pdev = platform_device_alloc(subdev->name, subdev->id);
+
+		pdev->dev.parent = tps6586x->dev;
+		pdev->dev.platform_data = subdev->platform_data;
+
+		ret = platform_device_add(pdev);
+		if (ret)
+			goto failed;
+	}
+	return 0;
+
+failed:
+	tps6586x_remove_subdevs(tps6586x);
+	return ret;
+}
+
+static int __devinit tps6586x_i2c_probe(struct i2c_client *client,
+					const struct i2c_device_id *id)
+{
+	struct tps6586x_platform_data *pdata = client->dev.platform_data;
+	struct tps6586x *tps6586x;
+	int ret;
+
+	if (!pdata) {
+		dev_err(&client->dev, "tps6586x requires platform data\n");
+		return -ENOTSUPP;
+	}
+
+	ret = i2c_smbus_read_byte_data(client, TPS6586X_VERSIONCRC);
+	if (ret < 0) {
+		dev_err(&client->dev, "Chip ID read failed: %d\n", ret);
+		return -EIO;
+	}
+
+	if (ret != TPS658621A_VERSIONCRC) {
+		dev_err(&client->dev, "Unsupported chip ID: %x\n", ret);
+		return -ENODEV;
+	}
+
+	tps6586x = kzalloc(sizeof(struct tps6586x), GFP_KERNEL);
+	if (tps6586x == NULL)
+		return -ENOMEM;
+
+	tps6586x->client = client;
+	tps6586x->dev = &client->dev;
+	i2c_set_clientdata(client, tps6586x);
+
+	mutex_init(&tps6586x->lock);
+
+	ret = tps6586x_add_subdevs(tps6586x, pdata);
+	if (ret) {
+		dev_err(&client->dev, "add devices failed: %d\n", ret);
+		goto err_add_devs;
+	}
+
+	tps6586x_gpio_init(tps6586x, pdata->gpio_base);
+
+	return 0;
+
+err_add_devs:
+	kfree(tps6586x);
+	return ret;
+}
+
+static int __devexit tps6586x_i2c_remove(struct i2c_client *client)
+{
+	return 0;
+}
+
+static const struct i2c_device_id tps6586x_id_table[] = {
+	{ "tps6586x", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, tps6586x_id_table);
+
+static struct i2c_driver tps6586x_driver = {
+	.driver	= {
+		.name	= "tps6586x",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= tps6586x_i2c_probe,
+	.remove		= __devexit_p(tps6586x_i2c_remove),
+	.id_table	= tps6586x_id_table,
+};
+
+static int __init tps6586x_init(void)
+{
+	return i2c_add_driver(&tps6586x_driver);
+}
+subsys_initcall(tps6586x_init);
+
+static void __exit tps6586x_exit(void)
+{
+	i2c_del_driver(&tps6586x_driver);
+}
+module_exit(tps6586x_exit);
+
+MODULE_DESCRIPTION("TPS6586X core driver");
+MODULE_AUTHOR("Mike Rapoport <mike@compulab.co.il>");
+MODULE_LICENSE("GPL");
+
diff --git a/include/linux/mfd/tps6586x.h b/include/linux/mfd/tps6586x.h
new file mode 100644
index 000000000000..772b3ae640af
--- /dev/null
+++ b/include/linux/mfd/tps6586x.h
@@ -0,0 +1,47 @@
+#ifndef __LINUX_MFD_TPS6586X_H
+#define __LINUX_MFD_TPS6586X_H
+
+enum {
+	TPS6586X_ID_SM_0,
+	TPS6586X_ID_SM_1,
+	TPS6586X_ID_SM_2,
+	TPS6586X_ID_LDO_0,
+	TPS6586X_ID_LDO_1,
+	TPS6586X_ID_LDO_2,
+	TPS6586X_ID_LDO_3,
+	TPS6586X_ID_LDO_4,
+	TPS6586X_ID_LDO_5,
+	TPS6586X_ID_LDO_6,
+	TPS6586X_ID_LDO_7,
+	TPS6586X_ID_LDO_8,
+	TPS6586X_ID_LDO_9,
+	TPS6586X_ID_LDO_RTC,
+};
+
+struct tps6586x_subdev_info {
+	int		id;
+	const char	*name;
+	void		*platform_data;
+};
+
+struct tps6586x_platform_data {
+	int num_subdevs;
+	struct tps6586x_subdev_info *subdevs;
+
+	int gpio_base;
+};
+
+/*
+ * NOTE: the functions below are not intended for use outside
+ * of the TPS6586X sub-device drivers
+ */
+extern int tps6586x_write(struct device *dev, int reg, uint8_t val);
+extern int tps6586x_writes(struct device *dev, int reg, int len, uint8_t *val);
+extern int tps6586x_read(struct device *dev, int reg, uint8_t *val);
+extern int tps6586x_reads(struct device *dev, int reg, int len, uint8_t *val);
+extern int tps6586x_set_bits(struct device *dev, int reg, uint8_t bit_mask);
+extern int tps6586x_clr_bits(struct device *dev, int reg, uint8_t bit_mask);
+extern int tps6586x_update(struct device *dev, int reg, uint8_t val,
+			   uint8_t mask);
+
+#endif /*__LINUX_MFD_TPS6586X_H */
-- 
cgit v1.2.3-59-g8ed1b


From 16c4042f08919f447d6b2a55679546c9b97c7264 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 11 Aug 2010 14:17:39 -0700
Subject: writeback: avoid unnecessary calculation of bdi dirty thresholds

Split get_dirty_limits() into global_dirty_limits()+bdi_dirty_limit(), so
that the latter can be avoided when under global dirty background
threshold (which is the normal state for most systems).

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c         |  2 +-
 include/linux/writeback.h |  5 ++--
 mm/backing-dev.c          |  3 +-
 mm/page-writeback.c       | 75 ++++++++++++++++++++++++-----------------------
 4 files changed, 44 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2f76c4a081a2..fca43d4d7bf4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -590,7 +590,7 @@ static inline bool over_bground_thresh(void)
 {
 	unsigned long background_thresh, dirty_thresh;
 
-	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+	global_dirty_limits(&background_thresh, &dirty_thresh);
 
 	return (global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index c24eca71e80c..72a5d647a5f2 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -124,8 +124,9 @@ struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
 
-void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
-		      unsigned long *pbdi_dirty, struct backing_dev_info *bdi);
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
+			       unsigned long dirty);
 
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 08d357522e78..eaa4a5bbe063 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -81,7 +81,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		nr_more_io++;
 	spin_unlock(&inode_lock);
 
-	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	seq_printf(m,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2cf69a5e46e6..1ea13ef350a8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -267,10 +267,11 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
  *
  *   dirty -= (dirty/8) * p_{t}
  */
-static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
+static unsigned long task_dirty_limit(struct task_struct *tsk,
+				       unsigned long bdi_dirty)
 {
 	long numerator, denominator;
-	unsigned long dirty = *pdirty;
+	unsigned long dirty = bdi_dirty;
 	u64 inv = dirty >> 3;
 
 	task_dirties_fraction(tsk, &numerator, &denominator);
@@ -278,10 +279,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
 	do_div(inv, denominator);
 
 	dirty -= inv;
-	if (dirty < *pdirty/2)
-		dirty = *pdirty/2;
 
-	*pdirty = dirty;
+	return max(dirty, bdi_dirty/2);
 }
 
 /*
@@ -391,9 +390,7 @@ unsigned long determine_dirtyable_memory(void)
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
-void
-get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
-		 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 {
 	unsigned long background;
 	unsigned long dirty;
@@ -425,26 +422,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+}
 
-	if (bdi) {
-		u64 bdi_dirty;
-		long numerator, denominator;
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
+			       unsigned long dirty)
+{
+	u64 bdi_dirty;
+	long numerator, denominator;
 
-		/*
-		 * Calculate this BDI's share of the dirty ratio.
-		 */
-		bdi_writeout_fraction(bdi, &numerator, &denominator);
+	/*
+	 * Calculate this BDI's share of the dirty ratio.
+	 */
+	bdi_writeout_fraction(bdi, &numerator, &denominator);
 
-		bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-		bdi_dirty *= numerator;
-		do_div(bdi_dirty, denominator);
-		bdi_dirty += (dirty * bdi->min_ratio) / 100;
-		if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-			bdi_dirty = dirty * bdi->max_ratio / 100;
+	bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+	bdi_dirty *= numerator;
+	do_div(bdi_dirty, denominator);
 
-		*pbdi_dirty = bdi_dirty;
-		task_dirty_limit(current, pbdi_dirty);
-	}
+	bdi_dirty += (dirty * bdi->min_ratio) / 100;
+	if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+		bdi_dirty = dirty * bdi->max_ratio / 100;
+
+	return bdi_dirty;
 }
 
 /*
@@ -475,13 +474,24 @@ static void balance_dirty_pages(struct address_space *mapping,
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh,
-				&bdi_thresh, bdi);
-
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
 		nr_writeback = global_page_state(NR_WRITEBACK);
 
+		global_dirty_limits(&background_thresh, &dirty_thresh);
+
+		/*
+		 * Throttle it only when the background writeback cannot
+		 * catch-up. This avoids (excessively) small writeouts
+		 * when the bdi limits are ramping up.
+		 */
+		if (nr_reclaimable + nr_writeback <
+				(background_thresh + dirty_thresh) / 2)
+			break;
+
+		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+		bdi_thresh = task_dirty_limit(current, bdi_thresh);
+
 		/*
 		 * In order to avoid the stacked BDI deadlock we need
 		 * to ensure we accurately count the 'dirty' pages when
@@ -513,15 +523,6 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (!dirty_exceeded)
 			break;
 
-		/*
-		 * Throttle it only when the background writeback cannot
-		 * catch-up. This avoids (excessively) small writeouts
-		 * when the bdi limits are ramping up.
-		 */
-		if (nr_reclaimable + nr_writeback <
-				(background_thresh + dirty_thresh) / 2)
-			break;
-
 		if (!bdi->dirty_exceeded)
 			bdi->dirty_exceeded = 1;
 
@@ -634,7 +635,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	unsigned long dirty_thresh;
 
         for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+		global_dirty_limits(&background_thresh, &dirty_thresh);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
-- 
cgit v1.2.3-59-g8ed1b


From 81d73a32d775ae9674ea6edf0b5b721fc3bc57d9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 11 Aug 2010 14:17:44 -0700
Subject: mm: fix writeback_in_progress()

Commit 83ba7b071f3 ("writeback: simplify the write back thread queue")
broke writeback_in_progress() as in that commit we started to remove work
items from the list at the moment we start working on them and not at the
moment they are finished.  Thus if the flusher thread was doing some work
but there was no other work queued, writeback_in_progress() returned
false.  This could in particular cause unnecessary queueing of background
writeback from balance_dirty_pages() or writeout work from
writeback_sb_if_idle().

This patch fixes the problem by introducing a bit in the bdi state which
indicates that the flusher thread is processing some work and uses this
bit for writeback_in_progress() test.

NOTE: Both callsites of writeback_in_progress() (namely,
writeback_inodes_sb_if_idle() and balance_dirty_pages()) would actually
need a different information than what writeback_in_progress() provides.
They would need to know whether *the kind of writeback they are going to
submit* is already queued.  But this information isn't that simple to
provide so let's fix writeback_in_progress() for the time being.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c           | 4 +++-
 include/linux/backing-dev.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8a5807d2fb9d..7d9d06ba184b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -68,7 +68,7 @@ int nr_pdflush_threads;
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return !list_empty(&bdi->work_list);
+	return test_bit(BDI_writeback_running, &bdi->state);
 }
 
 static void bdi_queue_work(struct backing_dev_info *bdi,
@@ -740,6 +740,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 	struct wb_writeback_work *work;
 	long wrote = 0;
 
+	set_bit(BDI_writeback_running, &wb->bdi->state);
 	while ((work = get_next_work_item(bdi)) != NULL) {
 		/*
 		 * Override sync mode, in case we must wait for completion
@@ -766,6 +767,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
 }
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 7628219e5386..35b00746c712 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -31,6 +31,7 @@ enum bdi_state {
 	BDI_async_congested,	/* The async (write) queue is getting full */
 	BDI_sync_congested,	/* The sync queue is getting full */
 	BDI_registered,		/* bdi_register() was done */
+	BDI_writeback_running,	/* Writeback is in progress */
 	BDI_unused,		/* Available bits start here */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From dfe86cba7676d58db8de7e623f5e72f1b0d3ca35 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Wed, 11 Aug 2010 14:17:46 -0700
Subject: mmc: add erase, secure erase, trim and secure trim operations

SD/MMC cards tend to support an erase operation.  In addition, eMMC v4.4
cards can support secure erase, trim and secure trim operations that are
all variants of the basic erase command.

SD/MMC device attributes "erase_size" and "preferred_erase_size" have been
added.

"erase_size" is the minimum size, in bytes, of an erase operation.  For
MMC, "erase_size" is the erase group size reported by the card.  Note that
"erase_size" does not apply to trim or secure trim operations where the
minimum size is always one 512 byte sector.  For SD, "erase_size" is 512
if the card is block-addressed, 0 otherwise.

SD/MMC cards can erase an arbitrarily large area up to and
including the whole card.  When erasing a large area it may
be desirable to do it in smaller chunks for three reasons:

    1. A single erase command will make all other I/O on the card
       wait.  This is not a problem if the whole card is being erased, but
       erasing one partition will make I/O for another partition on the
       same card wait for the duration of the erase - which could be a
       several minutes.

    2. To be able to inform the user of erase progress.

    3. The erase timeout becomes too large to be very useful.
       Because the erase timeout contains a margin which is multiplied by
       the size of the erase area, the value can end up being several
       minutes for large areas.

"erase_size" is not the most efficient unit to erase (especially for SD
where it is just one sector), hence "preferred_erase_size" provides a good
chunk size for erasing large areas.

For MMC, "preferred_erase_size" is the high-capacity erase size if a card
specifies one, otherwise it is based on the capacity of the card.

For SD, "preferred_erase_size" is the allocation unit size specified by
the card.

"preferred_erase_size" is in bytes.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Kyungmin Park <kmpark@infradead.org>
Cc: Madhusudhan Chikkature <madhu.cr@ti.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ben Gardiner <bengardiner@nanometrics.ca>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/00-INDEX              |   2 +
 Documentation/mmc/00-INDEX          |   4 +
 Documentation/mmc/mmc-dev-attrs.txt |  56 ++++++
 drivers/mmc/core/core.c             | 346 ++++++++++++++++++++++++++++++++++++
 drivers/mmc/core/core.h             |   2 +
 drivers/mmc/core/mmc.c              |  47 ++++-
 drivers/mmc/core/sd.c               |  82 +++++++++
 drivers/mmc/core/sd_ops.c           |  48 +++++
 drivers/mmc/core/sd_ops.h           |   1 +
 include/linux/mmc/card.h            |  20 +++
 include/linux/mmc/core.h            |  19 ++
 include/linux/mmc/host.h            |   1 +
 include/linux/mmc/mmc.h             |  26 ++-
 include/linux/mmc/sd.h              |   5 +
 14 files changed, 651 insertions(+), 8 deletions(-)
 create mode 100644 Documentation/mmc/00-INDEX
 create mode 100644 Documentation/mmc/mmc-dev-attrs.txt

(limited to 'include/linux')

diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 9e642c5bf526..8dfc6708a257 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -232,6 +232,8 @@ memory.txt
 	- info on typical Linux memory problems.
 mips/
 	- directory with info about Linux on MIPS architecture.
+mmc/
+	- directory with info about the MMC subsystem
 mono.txt
 	- how to execute Mono-based .NET binaries with the help of BINFMT_MISC.
 mutex-design.txt
diff --git a/Documentation/mmc/00-INDEX b/Documentation/mmc/00-INDEX
new file mode 100644
index 000000000000..fca586f5b853
--- /dev/null
+++ b/Documentation/mmc/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+        - this file
+mmc-dev-attrs.txt
+        - info on SD and MMC device attributes
diff --git a/Documentation/mmc/mmc-dev-attrs.txt b/Documentation/mmc/mmc-dev-attrs.txt
new file mode 100644
index 000000000000..ff2bd685bced
--- /dev/null
+++ b/Documentation/mmc/mmc-dev-attrs.txt
@@ -0,0 +1,56 @@
+SD and MMC Device Attributes
+============================
+
+All attributes are read-only.
+
+	cid			Card Identifaction Register
+	csd			Card Specific Data Register
+	scr			SD Card Configuration Register (SD only)
+	date			Manufacturing Date (from CID Register)
+	fwrev			Firmware/Product Revision (from CID Register) (SD and MMCv1 only)
+	hwrev			Hardware/Product Revision (from CID Register) (SD and MMCv1 only)
+	manfid			Manufacturer ID (from CID Register)
+	name			Product Name (from CID Register)
+	oemid			OEM/Application ID (from CID Register)
+	serial			Product Serial Number (from CID Register)
+	erase_size		Erase group size
+	preferred_erase_size	Preferred erase size
+
+Note on Erase Size and Preferred Erase Size:
+
+	"erase_size" is the  minimum size, in bytes, of an erase
+	operation.  For MMC, "erase_size" is the erase group size
+	reported by the card.  Note that "erase_size" does not apply
+	to trim or secure trim operations where the minimum size is
+	always one 512 byte sector.  For SD, "erase_size" is 512
+	if the card is block-addressed, 0 otherwise.
+
+	SD/MMC cards can erase an arbitrarily large area up to and
+	including the whole card.  When erasing a large area it may
+	be desirable to do it in smaller chunks for three reasons:
+		1. A single erase command will make all other I/O on
+		the card wait.  This is not a problem if the whole card
+		is being erased, but erasing one partition will make
+		I/O for another partition on the same card wait for the
+		duration of the erase - which could be a several
+		minutes.
+		2. To be able to inform the user of erase progress.
+		3. The erase timeout becomes too large to be very
+		useful.  Because the erase timeout contains a margin
+		which is multiplied by the size of the erase area,
+		the value can end up being several minutes for large
+		areas.
+
+	"erase_size" is not the most efficient unit to erase
+	(especially for SD where it is just one sector),
+	hence "preferred_erase_size" provides a good chunk
+	size for erasing large areas.
+
+	For MMC, "preferred_erase_size" is the high-capacity
+	erase size if a card specifies one, otherwise it is
+	based on the capacity of the card.
+
+	For SD, "preferred_erase_size" is the allocation unit
+	size specified by the card.
+
+	"preferred_erase_size" is in bytes.
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 83240faa1dc8..5db49b124ffa 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1050,6 +1050,352 @@ void mmc_detect_change(struct mmc_host *host, unsigned long delay)
 
 EXPORT_SYMBOL(mmc_detect_change);
 
+void mmc_init_erase(struct mmc_card *card)
+{
+	unsigned int sz;
+
+	if (is_power_of_2(card->erase_size))
+		card->erase_shift = ffs(card->erase_size) - 1;
+	else
+		card->erase_shift = 0;
+
+	/*
+	 * It is possible to erase an arbitrarily large area of an SD or MMC
+	 * card.  That is not desirable because it can take a long time
+	 * (minutes) potentially delaying more important I/O, and also the
+	 * timeout calculations become increasingly hugely over-estimated.
+	 * Consequently, 'pref_erase' is defined as a guide to limit erases
+	 * to that size and alignment.
+	 *
+	 * For SD cards that define Allocation Unit size, limit erases to one
+	 * Allocation Unit at a time.  For MMC cards that define High Capacity
+	 * Erase Size, whether it is switched on or not, limit to that size.
+	 * Otherwise just have a stab at a good value.  For modern cards it
+	 * will end up being 4MiB.  Note that if the value is too small, it
+	 * can end up taking longer to erase.
+	 */
+	if (mmc_card_sd(card) && card->ssr.au) {
+		card->pref_erase = card->ssr.au;
+		card->erase_shift = ffs(card->ssr.au) - 1;
+	} else if (card->ext_csd.hc_erase_size) {
+		card->pref_erase = card->ext_csd.hc_erase_size;
+	} else {
+		sz = (card->csd.capacity << (card->csd.read_blkbits - 9)) >> 11;
+		if (sz < 128)
+			card->pref_erase = 512 * 1024 / 512;
+		else if (sz < 512)
+			card->pref_erase = 1024 * 1024 / 512;
+		else if (sz < 1024)
+			card->pref_erase = 2 * 1024 * 1024 / 512;
+		else
+			card->pref_erase = 4 * 1024 * 1024 / 512;
+		if (card->pref_erase < card->erase_size)
+			card->pref_erase = card->erase_size;
+		else {
+			sz = card->pref_erase % card->erase_size;
+			if (sz)
+				card->pref_erase += card->erase_size - sz;
+		}
+	}
+}
+
+static void mmc_set_mmc_erase_timeout(struct mmc_card *card,
+				      struct mmc_command *cmd,
+				      unsigned int arg, unsigned int qty)
+{
+	unsigned int erase_timeout;
+
+	if (card->ext_csd.erase_group_def & 1) {
+		/* High Capacity Erase Group Size uses HC timeouts */
+		if (arg == MMC_TRIM_ARG)
+			erase_timeout = card->ext_csd.trim_timeout;
+		else
+			erase_timeout = card->ext_csd.hc_erase_timeout;
+	} else {
+		/* CSD Erase Group Size uses write timeout */
+		unsigned int mult = (10 << card->csd.r2w_factor);
+		unsigned int timeout_clks = card->csd.tacc_clks * mult;
+		unsigned int timeout_us;
+
+		/* Avoid overflow: e.g. tacc_ns=80000000 mult=1280 */
+		if (card->csd.tacc_ns < 1000000)
+			timeout_us = (card->csd.tacc_ns * mult) / 1000;
+		else
+			timeout_us = (card->csd.tacc_ns / 1000) * mult;
+
+		/*
+		 * ios.clock is only a target.  The real clock rate might be
+		 * less but not that much less, so fudge it by multiplying by 2.
+		 */
+		timeout_clks <<= 1;
+		timeout_us += (timeout_clks * 1000) /
+			      (card->host->ios.clock / 1000);
+
+		erase_timeout = timeout_us / 1000;
+
+		/*
+		 * Theoretically, the calculation could underflow so round up
+		 * to 1ms in that case.
+		 */
+		if (!erase_timeout)
+			erase_timeout = 1;
+	}
+
+	/* Multiplier for secure operations */
+	if (arg & MMC_SECURE_ARGS) {
+		if (arg == MMC_SECURE_ERASE_ARG)
+			erase_timeout *= card->ext_csd.sec_erase_mult;
+		else
+			erase_timeout *= card->ext_csd.sec_trim_mult;
+	}
+
+	erase_timeout *= qty;
+
+	/*
+	 * Ensure at least a 1 second timeout for SPI as per
+	 * 'mmc_set_data_timeout()'
+	 */
+	if (mmc_host_is_spi(card->host) && erase_timeout < 1000)
+		erase_timeout = 1000;
+
+	cmd->erase_timeout = erase_timeout;
+}
+
+static void mmc_set_sd_erase_timeout(struct mmc_card *card,
+				     struct mmc_command *cmd, unsigned int arg,
+				     unsigned int qty)
+{
+	if (card->ssr.erase_timeout) {
+		/* Erase timeout specified in SD Status Register (SSR) */
+		cmd->erase_timeout = card->ssr.erase_timeout * qty +
+				     card->ssr.erase_offset;
+	} else {
+		/*
+		 * Erase timeout not specified in SD Status Register (SSR) so
+		 * use 250ms per write block.
+		 */
+		cmd->erase_timeout = 250 * qty;
+	}
+
+	/* Must not be less than 1 second */
+	if (cmd->erase_timeout < 1000)
+		cmd->erase_timeout = 1000;
+}
+
+static void mmc_set_erase_timeout(struct mmc_card *card,
+				  struct mmc_command *cmd, unsigned int arg,
+				  unsigned int qty)
+{
+	if (mmc_card_sd(card))
+		mmc_set_sd_erase_timeout(card, cmd, arg, qty);
+	else
+		mmc_set_mmc_erase_timeout(card, cmd, arg, qty);
+}
+
+static int mmc_do_erase(struct mmc_card *card, unsigned int from,
+			unsigned int to, unsigned int arg)
+{
+	struct mmc_command cmd;
+	unsigned int qty = 0;
+	int err;
+
+	/*
+	 * qty is used to calculate the erase timeout which depends on how many
+	 * erase groups (or allocation units in SD terminology) are affected.
+	 * We count erasing part of an erase group as one erase group.
+	 * For SD, the allocation units are always a power of 2.  For MMC, the
+	 * erase group size is almost certainly also power of 2, but it does not
+	 * seem to insist on that in the JEDEC standard, so we fall back to
+	 * division in that case.  SD may not specify an allocation unit size,
+	 * in which case the timeout is based on the number of write blocks.
+	 *
+	 * Note that the timeout for secure trim 2 will only be correct if the
+	 * number of erase groups specified is the same as the total of all
+	 * preceding secure trim 1 commands.  Since the power may have been
+	 * lost since the secure trim 1 commands occurred, it is generally
+	 * impossible to calculate the secure trim 2 timeout correctly.
+	 */
+	if (card->erase_shift)
+		qty += ((to >> card->erase_shift) -
+			(from >> card->erase_shift)) + 1;
+	else if (mmc_card_sd(card))
+		qty += to - from + 1;
+	else
+		qty += ((to / card->erase_size) -
+			(from / card->erase_size)) + 1;
+
+	if (!mmc_card_blockaddr(card)) {
+		from <<= 9;
+		to <<= 9;
+	}
+
+	memset(&cmd, 0, sizeof(struct mmc_command));
+	if (mmc_card_sd(card))
+		cmd.opcode = SD_ERASE_WR_BLK_START;
+	else
+		cmd.opcode = MMC_ERASE_GROUP_START;
+	cmd.arg = from;
+	cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
+	err = mmc_wait_for_cmd(card->host, &cmd, 0);
+	if (err) {
+		printk(KERN_ERR "mmc_erase: group start error %d, "
+		       "status %#x\n", err, cmd.resp[0]);
+		err = -EINVAL;
+		goto out;
+	}
+
+	memset(&cmd, 0, sizeof(struct mmc_command));
+	if (mmc_card_sd(card))
+		cmd.opcode = SD_ERASE_WR_BLK_END;
+	else
+		cmd.opcode = MMC_ERASE_GROUP_END;
+	cmd.arg = to;
+	cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
+	err = mmc_wait_for_cmd(card->host, &cmd, 0);
+	if (err) {
+		printk(KERN_ERR "mmc_erase: group end error %d, status %#x\n",
+		       err, cmd.resp[0]);
+		err = -EINVAL;
+		goto out;
+	}
+
+	memset(&cmd, 0, sizeof(struct mmc_command));
+	cmd.opcode = MMC_ERASE;
+	cmd.arg = arg;
+	cmd.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+	mmc_set_erase_timeout(card, &cmd, arg, qty);
+	err = mmc_wait_for_cmd(card->host, &cmd, 0);
+	if (err) {
+		printk(KERN_ERR "mmc_erase: erase error %d, status %#x\n",
+		       err, cmd.resp[0]);
+		err = -EIO;
+		goto out;
+	}
+
+	if (mmc_host_is_spi(card->host))
+		goto out;
+
+	do {
+		memset(&cmd, 0, sizeof(struct mmc_command));
+		cmd.opcode = MMC_SEND_STATUS;
+		cmd.arg = card->rca << 16;
+		cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
+		/* Do not retry else we can't see errors */
+		err = mmc_wait_for_cmd(card->host, &cmd, 0);
+		if (err || (cmd.resp[0] & 0xFDF92000)) {
+			printk(KERN_ERR "error %d requesting status %#x\n",
+				err, cmd.resp[0]);
+			err = -EIO;
+			goto out;
+		}
+	} while (!(cmd.resp[0] & R1_READY_FOR_DATA) ||
+		 R1_CURRENT_STATE(cmd.resp[0]) == 7);
+out:
+	return err;
+}
+
+/**
+ * mmc_erase - erase sectors.
+ * @card: card to erase
+ * @from: first sector to erase
+ * @nr: number of sectors to erase
+ * @arg: erase command argument (SD supports only %MMC_ERASE_ARG)
+ *
+ * Caller must claim host before calling this function.
+ */
+int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
+	      unsigned int arg)
+{
+	unsigned int rem, to = from + nr;
+
+	if (!(card->host->caps & MMC_CAP_ERASE) ||
+	    !(card->csd.cmdclass & CCC_ERASE))
+		return -EOPNOTSUPP;
+
+	if (!card->erase_size)
+		return -EOPNOTSUPP;
+
+	if (mmc_card_sd(card) && arg != MMC_ERASE_ARG)
+		return -EOPNOTSUPP;
+
+	if ((arg & MMC_SECURE_ARGS) &&
+	    !(card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN))
+		return -EOPNOTSUPP;
+
+	if ((arg & MMC_TRIM_ARGS) &&
+	    !(card->ext_csd.sec_feature_support & EXT_CSD_SEC_GB_CL_EN))
+		return -EOPNOTSUPP;
+
+	if (arg == MMC_SECURE_ERASE_ARG) {
+		if (from % card->erase_size || nr % card->erase_size)
+			return -EINVAL;
+	}
+
+	if (arg == MMC_ERASE_ARG) {
+		rem = from % card->erase_size;
+		if (rem) {
+			rem = card->erase_size - rem;
+			from += rem;
+			if (nr > rem)
+				nr -= rem;
+			else
+				return 0;
+		}
+		rem = nr % card->erase_size;
+		if (rem)
+			nr -= rem;
+	}
+
+	if (nr == 0)
+		return 0;
+
+	to = from + nr;
+
+	if (to <= from)
+		return -EINVAL;
+
+	/* 'from' and 'to' are inclusive */
+	to -= 1;
+
+	return mmc_do_erase(card, from, to, arg);
+}
+EXPORT_SYMBOL(mmc_erase);
+
+int mmc_can_erase(struct mmc_card *card)
+{
+	if ((card->host->caps & MMC_CAP_ERASE) &&
+	    (card->csd.cmdclass & CCC_ERASE) && card->erase_size)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(mmc_can_erase);
+
+int mmc_can_trim(struct mmc_card *card)
+{
+	if (card->ext_csd.sec_feature_support & EXT_CSD_SEC_GB_CL_EN)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(mmc_can_trim);
+
+int mmc_can_secure_erase_trim(struct mmc_card *card)
+{
+	if (card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN)
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(mmc_can_secure_erase_trim);
+
+int mmc_erase_group_aligned(struct mmc_card *card, unsigned int from,
+			    unsigned int nr)
+{
+	if (!card->erase_size)
+		return 0;
+	if (from % card->erase_size || nr % card->erase_size)
+		return 0;
+	return 1;
+}
+EXPORT_SYMBOL(mmc_erase_group_aligned);
 
 void mmc_rescan(struct work_struct *work)
 {
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index a811c52a1659..9d9eef50e5d1 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -29,6 +29,8 @@ struct mmc_bus_ops {
 void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops);
 void mmc_detach_bus(struct mmc_host *host);
 
+void mmc_init_erase(struct mmc_card *card);
+
 void mmc_set_chip_select(struct mmc_host *host, int mode);
 void mmc_set_clock(struct mmc_host *host, unsigned int hz);
 void mmc_set_bus_mode(struct mmc_host *host, unsigned int mode);
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index ccba3869c029..6909a54c39be 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -108,13 +108,23 @@ static int mmc_decode_cid(struct mmc_card *card)
 	return 0;
 }
 
+static void mmc_set_erase_size(struct mmc_card *card)
+{
+	if (card->ext_csd.erase_group_def & 1)
+		card->erase_size = card->ext_csd.hc_erase_size;
+	else
+		card->erase_size = card->csd.erase_size;
+
+	mmc_init_erase(card);
+}
+
 /*
  * Given a 128-bit response, decode to our card CSD structure.
  */
 static int mmc_decode_csd(struct mmc_card *card)
 {
 	struct mmc_csd *csd = &card->csd;
-	unsigned int e, m;
+	unsigned int e, m, a, b;
 	u32 *resp = card->raw_csd;
 
 	/*
@@ -152,6 +162,13 @@ static int mmc_decode_csd(struct mmc_card *card)
 	csd->write_blkbits = UNSTUFF_BITS(resp, 22, 4);
 	csd->write_partial = UNSTUFF_BITS(resp, 21, 1);
 
+	if (csd->write_blkbits >= 9) {
+		a = UNSTUFF_BITS(resp, 42, 5);
+		b = UNSTUFF_BITS(resp, 37, 5);
+		csd->erase_size = (a + 1) * (b + 1);
+		csd->erase_size <<= csd->write_blkbits - 9;
+	}
+
 	return 0;
 }
 
@@ -261,8 +278,30 @@ static int mmc_read_ext_csd(struct mmc_card *card)
 		if (sa_shift > 0 && sa_shift <= 0x17)
 			card->ext_csd.sa_timeout =
 					1 << ext_csd[EXT_CSD_S_A_TIMEOUT];
+		card->ext_csd.erase_group_def =
+			ext_csd[EXT_CSD_ERASE_GROUP_DEF];
+		card->ext_csd.hc_erase_timeout = 300 *
+			ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT];
+		card->ext_csd.hc_erase_size =
+			ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE] << 10;
+	}
+
+	if (card->ext_csd.rev >= 4) {
+		card->ext_csd.sec_trim_mult =
+			ext_csd[EXT_CSD_SEC_TRIM_MULT];
+		card->ext_csd.sec_erase_mult =
+			ext_csd[EXT_CSD_SEC_ERASE_MULT];
+		card->ext_csd.sec_feature_support =
+			ext_csd[EXT_CSD_SEC_FEATURE_SUPPORT];
+		card->ext_csd.trim_timeout = 300 *
+			ext_csd[EXT_CSD_TRIM_MULT];
 	}
 
+	if (ext_csd[EXT_CSD_ERASED_MEM_CONT])
+		card->erased_byte = 0xFF;
+	else
+		card->erased_byte = 0x0;
+
 out:
 	kfree(ext_csd);
 
@@ -274,6 +313,8 @@ MMC_DEV_ATTR(cid, "%08x%08x%08x%08x\n", card->raw_cid[0], card->raw_cid[1],
 MMC_DEV_ATTR(csd, "%08x%08x%08x%08x\n", card->raw_csd[0], card->raw_csd[1],
 	card->raw_csd[2], card->raw_csd[3]);
 MMC_DEV_ATTR(date, "%02d/%04d\n", card->cid.month, card->cid.year);
+MMC_DEV_ATTR(erase_size, "%u\n", card->erase_size << 9);
+MMC_DEV_ATTR(preferred_erase_size, "%u\n", card->pref_erase << 9);
 MMC_DEV_ATTR(fwrev, "0x%x\n", card->cid.fwrev);
 MMC_DEV_ATTR(hwrev, "0x%x\n", card->cid.hwrev);
 MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid);
@@ -285,6 +326,8 @@ static struct attribute *mmc_std_attrs[] = {
 	&dev_attr_cid.attr,
 	&dev_attr_csd.attr,
 	&dev_attr_date.attr,
+	&dev_attr_erase_size.attr,
+	&dev_attr_preferred_erase_size.attr,
 	&dev_attr_fwrev.attr,
 	&dev_attr_hwrev.attr,
 	&dev_attr_manfid.attr,
@@ -421,6 +464,8 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 		err = mmc_read_ext_csd(card);
 		if (err)
 			goto free_card;
+		/* Erase size depends on CSD and Extended CSD */
+		mmc_set_erase_size(card);
 	}
 
 	/*
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index e6d7d9fab446..0f5241085557 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -119,6 +119,13 @@ static int mmc_decode_csd(struct mmc_card *card)
 		csd->r2w_factor = UNSTUFF_BITS(resp, 26, 3);
 		csd->write_blkbits = UNSTUFF_BITS(resp, 22, 4);
 		csd->write_partial = UNSTUFF_BITS(resp, 21, 1);
+
+		if (UNSTUFF_BITS(resp, 46, 1)) {
+			csd->erase_size = 1;
+		} else if (csd->write_blkbits >= 9) {
+			csd->erase_size = UNSTUFF_BITS(resp, 39, 7) + 1;
+			csd->erase_size <<= csd->write_blkbits - 9;
+		}
 		break;
 	case 1:
 		/*
@@ -147,6 +154,7 @@ static int mmc_decode_csd(struct mmc_card *card)
 		csd->r2w_factor = 4; /* Unused */
 		csd->write_blkbits = 9;
 		csd->write_partial = 0;
+		csd->erase_size = 1;
 		break;
 	default:
 		printk(KERN_ERR "%s: unrecognised CSD structure version %d\n",
@@ -154,6 +162,8 @@ static int mmc_decode_csd(struct mmc_card *card)
 		return -EINVAL;
 	}
 
+	card->erase_size = csd->erase_size;
+
 	return 0;
 }
 
@@ -179,9 +189,67 @@ static int mmc_decode_scr(struct mmc_card *card)
 	scr->sda_vsn = UNSTUFF_BITS(resp, 56, 4);
 	scr->bus_widths = UNSTUFF_BITS(resp, 48, 4);
 
+	if (UNSTUFF_BITS(resp, 55, 1))
+		card->erased_byte = 0xFF;
+	else
+		card->erased_byte = 0x0;
+
 	return 0;
 }
 
+/*
+ * Fetch and process SD Status register.
+ */
+static int mmc_read_ssr(struct mmc_card *card)
+{
+	unsigned int au, es, et, eo;
+	int err, i;
+	u32 *ssr;
+
+	if (!(card->csd.cmdclass & CCC_APP_SPEC)) {
+		printk(KERN_WARNING "%s: card lacks mandatory SD Status "
+			"function.\n", mmc_hostname(card->host));
+		return 0;
+	}
+
+	ssr = kmalloc(64, GFP_KERNEL);
+	if (!ssr)
+		return -ENOMEM;
+
+	err = mmc_app_sd_status(card, ssr);
+	if (err) {
+		printk(KERN_WARNING "%s: problem reading SD Status "
+			"register.\n", mmc_hostname(card->host));
+		err = 0;
+		goto out;
+	}
+
+	for (i = 0; i < 16; i++)
+		ssr[i] = be32_to_cpu(ssr[i]);
+
+	/*
+	 * UNSTUFF_BITS only works with four u32s so we have to offset the
+	 * bitfield positions accordingly.
+	 */
+	au = UNSTUFF_BITS(ssr, 428 - 384, 4);
+	if (au > 0 || au <= 9) {
+		card->ssr.au = 1 << (au + 4);
+		es = UNSTUFF_BITS(ssr, 408 - 384, 16);
+		et = UNSTUFF_BITS(ssr, 402 - 384, 6);
+		eo = UNSTUFF_BITS(ssr, 400 - 384, 2);
+		if (es && et) {
+			card->ssr.erase_timeout = (et * 1000) / es;
+			card->ssr.erase_offset = eo * 1000;
+		}
+	} else {
+		printk(KERN_WARNING "%s: SD Status: Invalid Allocation Unit "
+			"size.\n", mmc_hostname(card->host));
+	}
+out:
+	kfree(ssr);
+	return err;
+}
+
 /*
  * Fetches and decodes switch information
  */
@@ -289,6 +357,8 @@ MMC_DEV_ATTR(csd, "%08x%08x%08x%08x\n", card->raw_csd[0], card->raw_csd[1],
 	card->raw_csd[2], card->raw_csd[3]);
 MMC_DEV_ATTR(scr, "%08x%08x\n", card->raw_scr[0], card->raw_scr[1]);
 MMC_DEV_ATTR(date, "%02d/%04d\n", card->cid.month, card->cid.year);
+MMC_DEV_ATTR(erase_size, "%u\n", card->erase_size << 9);
+MMC_DEV_ATTR(preferred_erase_size, "%u\n", card->pref_erase << 9);
 MMC_DEV_ATTR(fwrev, "0x%x\n", card->cid.fwrev);
 MMC_DEV_ATTR(hwrev, "0x%x\n", card->cid.hwrev);
 MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid);
@@ -302,6 +372,8 @@ static struct attribute *sd_std_attrs[] = {
 	&dev_attr_csd.attr,
 	&dev_attr_scr.attr,
 	&dev_attr_date.attr,
+	&dev_attr_erase_size.attr,
+	&dev_attr_preferred_erase_size.attr,
 	&dev_attr_fwrev.attr,
 	&dev_attr_hwrev.attr,
 	&dev_attr_manfid.attr,
@@ -396,6 +468,16 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
 		if (err)
 			return err;
 
+		/*
+		 * Fetch and process SD Status register.
+		 */
+		err = mmc_read_ssr(card);
+		if (err)
+			return err;
+
+		/* Erase init depends on CSD and SSR */
+		mmc_init_erase(card);
+
 		/*
 		 * Fetch switch information from card.
 		 */
diff --git a/drivers/mmc/core/sd_ops.c b/drivers/mmc/core/sd_ops.c
index 63772e7e7608..797cdb5887fd 100644
--- a/drivers/mmc/core/sd_ops.c
+++ b/drivers/mmc/core/sd_ops.c
@@ -346,3 +346,51 @@ int mmc_sd_switch(struct mmc_card *card, int mode, int group,
 	return 0;
 }
 
+int mmc_app_sd_status(struct mmc_card *card, void *ssr)
+{
+	int err;
+	struct mmc_request mrq;
+	struct mmc_command cmd;
+	struct mmc_data data;
+	struct scatterlist sg;
+
+	BUG_ON(!card);
+	BUG_ON(!card->host);
+	BUG_ON(!ssr);
+
+	/* NOTE: caller guarantees ssr is heap-allocated */
+
+	err = mmc_app_cmd(card->host, card);
+	if (err)
+		return err;
+
+	memset(&mrq, 0, sizeof(struct mmc_request));
+	memset(&cmd, 0, sizeof(struct mmc_command));
+	memset(&data, 0, sizeof(struct mmc_data));
+
+	mrq.cmd = &cmd;
+	mrq.data = &data;
+
+	cmd.opcode = SD_APP_SD_STATUS;
+	cmd.arg = 0;
+	cmd.flags = MMC_RSP_SPI_R2 | MMC_RSP_R1 | MMC_CMD_ADTC;
+
+	data.blksz = 64;
+	data.blocks = 1;
+	data.flags = MMC_DATA_READ;
+	data.sg = &sg;
+	data.sg_len = 1;
+
+	sg_init_one(&sg, ssr, 64);
+
+	mmc_set_data_timeout(&data, card);
+
+	mmc_wait_for_req(card->host, &mrq);
+
+	if (cmd.error)
+		return cmd.error;
+	if (data.error)
+		return data.error;
+
+	return 0;
+}
diff --git a/drivers/mmc/core/sd_ops.h b/drivers/mmc/core/sd_ops.h
index 9742d8a30664..ffc2305d905f 100644
--- a/drivers/mmc/core/sd_ops.h
+++ b/drivers/mmc/core/sd_ops.h
@@ -19,6 +19,7 @@ int mmc_send_relative_addr(struct mmc_host *host, unsigned int *rca);
 int mmc_app_send_scr(struct mmc_card *card, u32 *scr);
 int mmc_sd_switch(struct mmc_card *card, int mode, int group,
 	u8 value, u8 *resp);
+int mmc_app_sd_status(struct mmc_card *card, void *ssr);
 
 #endif
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 4d893eaf8174..6b7525099e56 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -31,6 +31,7 @@ struct mmc_csd {
 	unsigned int		tacc_ns;
 	unsigned int		r2w_factor;
 	unsigned int		max_dtr;
+	unsigned int		erase_size;		/* In sectors */
 	unsigned int		read_blkbits;
 	unsigned int		write_blkbits;
 	unsigned int		capacity;
@@ -42,9 +43,16 @@ struct mmc_csd {
 
 struct mmc_ext_csd {
 	u8			rev;
+	u8			erase_group_def;
+	u8			sec_feature_support;
 	unsigned int		sa_timeout;		/* Units: 100ns */
 	unsigned int		hs_max_dtr;
 	unsigned int		sectors;
+	unsigned int		hc_erase_size;		/* In sectors */
+	unsigned int		hc_erase_timeout;	/* In milliseconds */
+	unsigned int		sec_trim_mult;	/* Secure trim multiplier  */
+	unsigned int		sec_erase_mult;	/* Secure erase multiplier */
+	unsigned int		trim_timeout;		/* In milliseconds */
 };
 
 struct sd_scr {
@@ -54,6 +62,12 @@ struct sd_scr {
 #define SD_SCR_BUS_WIDTH_4	(1<<2)
 };
 
+struct sd_ssr {
+	unsigned int		au;			/* In sectors */
+	unsigned int		erase_timeout;		/* In milliseconds */
+	unsigned int		erase_offset;		/* In milliseconds */
+};
+
 struct sd_switch_caps {
 	unsigned int		hs_max_dtr;
 };
@@ -106,6 +120,11 @@ struct mmc_card {
 #define MMC_QUIRK_NONSTD_SDIO	(1<<2)		/* non-standard SDIO card attached */
 						/* (missing CIA registers) */
 
+	unsigned int		erase_size;	/* erase size in sectors */
+ 	unsigned int		erase_shift;	/* if erase unit is power 2 */
+ 	unsigned int		pref_erase;	/* in sectors */
+ 	u8			erased_byte;	/* value of erased bytes */
+
 	u32			raw_cid[4];	/* raw card CID */
 	u32			raw_csd[4];	/* raw card CSD */
 	u32			raw_scr[2];	/* raw card SCR */
@@ -113,6 +132,7 @@ struct mmc_card {
 	struct mmc_csd		csd;		/* card specific */
 	struct mmc_ext_csd	ext_csd;	/* mmc v4 extended card specific */
 	struct sd_scr		scr;		/* extra SD information */
+	struct sd_ssr		ssr;		/* yet more SD information */
 	struct sd_switch_caps	sw_caps;	/* switch (CMD6) caps */
 
 	unsigned int		sdio_funcs;	/* number of SDIO functions */
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index e4898e9eeb59..7429033acb66 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -92,6 +92,8 @@ struct mmc_command {
  *              actively failing requests
  */
 
+	unsigned int		erase_timeout;	/* in milliseconds */
+
 	struct mmc_data		*data;		/* data segment associated with cmd */
 	struct mmc_request	*mrq;		/* associated request */
 };
@@ -134,6 +136,23 @@ extern int mmc_wait_for_cmd(struct mmc_host *, struct mmc_command *, int);
 extern int mmc_wait_for_app_cmd(struct mmc_host *, struct mmc_card *,
 	struct mmc_command *, int);
 
+#define MMC_ERASE_ARG		0x00000000
+#define MMC_SECURE_ERASE_ARG	0x80000000
+#define MMC_TRIM_ARG		0x00000001
+#define MMC_SECURE_TRIM1_ARG	0x80000001
+#define MMC_SECURE_TRIM2_ARG	0x80008000
+
+#define MMC_SECURE_ARGS		0x80000000
+#define MMC_TRIM_ARGS		0x00008001
+
+extern int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
+		     unsigned int arg);
+extern int mmc_can_erase(struct mmc_card *card);
+extern int mmc_can_trim(struct mmc_card *card);
+extern int mmc_can_secure_erase_trim(struct mmc_card *card);
+extern int mmc_erase_group_aligned(struct mmc_card *card, unsigned int from,
+				   unsigned int nr);
+
 extern void mmc_set_data_timeout(struct mmc_data *, const struct mmc_card *);
 extern unsigned int mmc_align_data_size(struct mmc_card *, unsigned int);
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 513ff0376b09..1575b52c3bfa 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -156,6 +156,7 @@ struct mmc_host {
 #define MMC_CAP_DISABLE		(1 << 7)	/* Can the host be disabled */
 #define MMC_CAP_NONREMOVABLE	(1 << 8)	/* Nonremovable e.g. eMMC */
 #define MMC_CAP_WAIT_WHILE_BUSY	(1 << 9)	/* Waits while card is busy */
+#define MMC_CAP_ERASE		(1 << 10)	/* Allow erase/trim commands */
 
 	mmc_pm_flag_t		pm_caps;	/* supported pm features */
 
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 52ce98866287..dd11ae51fb68 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -251,13 +251,21 @@ struct _mmc_csd {
  * EXT_CSD fields
  */
 
-#define EXT_CSD_BUS_WIDTH	183	/* R/W */
-#define EXT_CSD_HS_TIMING	185	/* R/W */
-#define EXT_CSD_CARD_TYPE	196	/* RO */
-#define EXT_CSD_STRUCTURE	194	/* RO */
-#define EXT_CSD_REV		192	/* RO */
-#define EXT_CSD_SEC_CNT		212	/* RO, 4 bytes */
-#define EXT_CSD_S_A_TIMEOUT	217
+#define EXT_CSD_ERASE_GROUP_DEF		175	/* R/W */
+#define EXT_CSD_ERASED_MEM_CONT		181	/* RO */
+#define EXT_CSD_BUS_WIDTH		183	/* R/W */
+#define EXT_CSD_HS_TIMING		185	/* R/W */
+#define EXT_CSD_REV			192	/* RO */
+#define EXT_CSD_STRUCTURE		194	/* RO */
+#define EXT_CSD_CARD_TYPE		196	/* RO */
+#define EXT_CSD_SEC_CNT			212	/* RO, 4 bytes */
+#define EXT_CSD_S_A_TIMEOUT		217	/* RO */
+#define EXT_CSD_ERASE_TIMEOUT_MULT	223	/* RO */
+#define EXT_CSD_HC_ERASE_GRP_SIZE	224	/* RO */
+#define EXT_CSD_SEC_TRIM_MULT		229	/* RO */
+#define EXT_CSD_SEC_ERASE_MULT		230	/* RO */
+#define EXT_CSD_SEC_FEATURE_SUPPORT	231	/* RO */
+#define EXT_CSD_TRIM_MULT		232	/* RO */
 
 /*
  * EXT_CSD field definitions
@@ -275,6 +283,10 @@ struct _mmc_csd {
 #define EXT_CSD_BUS_WIDTH_4	1	/* Card is in 4 bit mode */
 #define EXT_CSD_BUS_WIDTH_8	2	/* Card is in 8 bit mode */
 
+#define EXT_CSD_SEC_ER_EN	BIT(0)
+#define EXT_CSD_SEC_BD_BLK_EN	BIT(2)
+#define EXT_CSD_SEC_GB_CL_EN	BIT(4)
+
 /*
  * MMC_SWITCH access modes
  */
diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h
index f310062cffb4..3fd85e088cc3 100644
--- a/include/linux/mmc/sd.h
+++ b/include/linux/mmc/sd.h
@@ -21,8 +21,13 @@
   /* class 10 */
 #define SD_SWITCH                 6   /* adtc [31:0] See below   R1  */
 
+  /* class 5 */
+#define SD_ERASE_WR_BLK_START    32   /* ac   [31:0] data addr   R1  */
+#define SD_ERASE_WR_BLK_END      33   /* ac   [31:0] data addr   R1  */
+
   /* Application commands */
 #define SD_APP_SET_BUS_WIDTH      6   /* ac   [1:0] bus width    R1  */
+#define SD_APP_SD_STATUS         13   /* adtc                    R1  */
 #define SD_APP_SEND_NUM_WR_BLKS  22   /* adtc                    R1  */
 #define SD_APP_OP_COND           41   /* bcr  [31:0] OCR         R3  */
 #define SD_APP_SEND_SCR          51   /* adtc                    R1  */
-- 
cgit v1.2.3-59-g8ed1b


From 8d57a98ccd0b4489003473979da8f5a1363ba7a3 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Wed, 11 Aug 2010 14:17:49 -0700
Subject: block: add secure discard

Secure discard is the same as discard except that all copies of the
discarded sectors (perhaps created by garbage collection) must also be
erased.

Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Kyungmin Park <kmpark@infradead.org>
Cc: Madhusudhan Chikkature <madhu.cr@ti.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ben Gardiner <bengardiner@nanometrics.ca>
Cc: <linux-mmc@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/blk-core.c          |  5 ++++-
 block/blk-lib.c           |  6 ++++++
 block/compat_ioctl.c      |  1 +
 block/elevator.c          |  6 ++++++
 block/ioctl.c             | 15 ++++++++++-----
 include/linux/blk_types.h |  2 ++
 include/linux/blkdev.h    |  7 ++++++-
 include/linux/fs.h        |  2 ++
 kernel/trace/blktrace.c   |  8 ++++++++
 9 files changed, 45 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 7da630e25ae7..ee1a1e7e63cc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1514,7 +1514,10 @@ static inline void __generic_make_request(struct bio *bio)
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
 
-		if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(q)) {
+		if ((bio->bi_rw & REQ_DISCARD) &&
+		    (!blk_queue_discard(q) ||
+		     ((bio->bi_rw & REQ_SECURE) &&
+		      !blk_queue_secdiscard(q)))) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index c1fc55a83ba1..c392029a104e 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -62,6 +62,12 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		max_discard_sectors &= ~(disc_sects - 1);
 	}
 
+	if (flags & BLKDEV_IFL_SECURE) {
+		if (!blk_queue_secdiscard(q))
+			return -EOPNOTSUPP;
+		type |= DISCARD_SECURE;
+	}
+
 	while (nr_sects && !ret) {
 		bio = bio_alloc(gfp_mask, 1);
 		if (!bio) {
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index d53085637731..119f07b74dc0 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -703,6 +703,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKFLSBUF:
 	case BLKROSET:
 	case BLKDISCARD:
+	case BLKSECDISCARD:
 	/*
 	 * the ones below are implemented in blkdev_locked_ioctl,
 	 * but we call blkdev_ioctl, which gets the lock for us
diff --git a/block/elevator.c b/block/elevator.c
index 816a7c8d6394..ec585c9554d3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -82,6 +82,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
 		return 0;
 
+	/*
+	 * Don't merge discard requests and secure discard requests
+	 */
+	if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
+		return 0;
+
 	/*
 	 * different data direction or already started, don't merge
 	 */
diff --git a/block/ioctl.c b/block/ioctl.c
index 09fd7f1ef23a..d8052f0dabd3 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -114,8 +114,10 @@ static int blkdev_reread_part(struct block_device *bdev)
 }
 
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
-			     uint64_t len)
+			     uint64_t len, int secure)
 {
+	unsigned long flags = BLKDEV_IFL_WAIT;
+
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
@@ -125,8 +127,9 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
-	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
-				    BLKDEV_IFL_WAIT);
+	if (secure)
+		flags |= BLKDEV_IFL_SECURE;
+	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
@@ -213,7 +216,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		set_device_ro(bdev, n);
 		return 0;
 
-	case BLKDISCARD: {
+	case BLKDISCARD:
+	case BLKSECDISCARD: {
 		uint64_t range[2];
 
 		if (!(mode & FMODE_WRITE))
@@ -222,7 +226,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 			return -EFAULT;
 
-		return blk_ioctl_discard(bdev, range[0], range[1]);
+		return blk_ioctl_discard(bdev, range[0], range[1],
+					 cmd == BLKSECDISCARD);
 	}
 
 	case HDIO_GETGEO: {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 53691774d34e..ca83a97c9715 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -150,6 +150,7 @@ enum rq_flag_bits {
 	__REQ_FLUSH,		/* request for cache flush */
 	__REQ_IO_STAT,		/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
+	__REQ_SECURE,		/* secure discard (used with __REQ_DISCARD) */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -190,5 +191,6 @@ enum rq_flag_bits {
 #define REQ_FLUSH		(1 << __REQ_FLUSH)
 #define REQ_IO_STAT		(1 << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
+#define REQ_SECURE		(1 << __REQ_SECURE)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 89c855c5655c..2c54906f678f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -389,6 +389,7 @@ struct request_queue
 #define QUEUE_FLAG_DISCARD     16	/* supports DISCARD */
 #define QUEUE_FLAG_NOXMERGES   17	/* No extended merges */
 #define QUEUE_FLAG_ADD_RANDOM  18	/* Contributes to random pool */
+#define QUEUE_FLAG_SECDISCARD  19	/* supports SECDISCARD */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
@@ -524,6 +525,8 @@ enum {
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
+#define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
+	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
@@ -918,10 +921,12 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 enum{
 	BLKDEV_WAIT,	/* wait for completion */
-	BLKDEV_BARRIER,	/*issue request with barrier */
+	BLKDEV_BARRIER,	/* issue request with barrier */
+	BLKDEV_SECURE,	/* secure discard */
 };
 #define BLKDEV_IFL_WAIT		(1 << BLKDEV_WAIT)
 #define BLKDEV_IFL_BARRIER	(1 << BLKDEV_BARRIER)
+#define BLKDEV_IFL_SECURE	(1 << BLKDEV_SECURE)
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
 			unsigned long);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 267d02630517..7a0625e26a39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -174,6 +174,7 @@ struct inodes_stat_t {
  */
 #define DISCARD_NOBARRIER	(WRITE | REQ_DISCARD)
 #define DISCARD_BARRIER		(WRITE | REQ_DISCARD | REQ_HARDBARRIER)
+#define DISCARD_SECURE		(DISCARD_NOBARRIER | REQ_SECURE)
 
 #define SEL_IN		1
 #define SEL_OUT		2
@@ -317,6 +318,7 @@ struct inodes_stat_t {
 #define BLKALIGNOFF _IO(0x12,122)
 #define BLKPBSZGET _IO(0x12,123)
 #define BLKDISCARDZEROES _IO(0x12,124)
+#define BLKSECDISCARD _IO(0x12,125)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 82499a5bdcb7..959f8d6c8cc1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -710,6 +710,9 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (rq->cmd_flags & REQ_DISCARD)
 		rw |= REQ_DISCARD;
 
+	if (rq->cmd_flags & REQ_SECURE)
+		rw |= REQ_SECURE;
+
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
 		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
@@ -1816,6 +1819,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 		rwbs[i++] = 'S';
 	if (rw & REQ_META)
 		rwbs[i++] = 'M';
+	if (rw & REQ_SECURE)
+		rwbs[i++] = 'E';
 
 	rwbs[i] = '\0';
 }
@@ -1828,6 +1833,9 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
 	if (rq->cmd_flags & REQ_DISCARD)
 		rw |= REQ_DISCARD;
 
+	if (rq->cmd_flags & REQ_SECURE)
+		rw |= REQ_SECURE;
+
 	bytes = blk_rq_bytes(rq);
 
 	blk_fill_rwbs(rwbs, rw, bytes);
-- 
cgit v1.2.3-59-g8ed1b


From 12fdff3fc2483f906ae6404a6e8dcf2550310b6f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 12 Aug 2010 16:54:57 +0100
Subject: Add a dummy printk function for the maintenance of unused printks

Add a dummy printk function for the maintenance of unused printks through gcc
format checking, and also so that side-effect checking is maintained too.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mn10300/kernel/mn10300-serial.c |  5 -----
 fs/afs/internal.h                    | 12 +++---------
 fs/cachefiles/internal.h             | 13 +++----------
 fs/fscache/internal.h                | 14 ++++----------
 include/linux/kernel.h               |  7 +++++++
 kernel/cred.c                        |  4 ----
 net/rxrpc/ar-internal.h              | 16 +++++-----------
 security/keys/internal.h             |  5 -----
 8 files changed, 22 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mn10300/kernel/mn10300-serial.c b/arch/mn10300/kernel/mn10300-serial.c
index ef34d5a0f8bd..9d49073e827a 100644
--- a/arch/mn10300/kernel/mn10300-serial.c
+++ b/arch/mn10300/kernel/mn10300-serial.c
@@ -44,11 +44,6 @@ static const char serial_revdate[] = "2007-11-06";
 #include <unit/timex.h>
 #include "mn10300-serial.h"
 
-static inline __attribute__((format(printf, 1, 2)))
-void no_printk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT, ...) \
 	printk(KERN_DEBUG "-->%s(" FMT ")\n", __func__, ##__VA_ARGS__)
 #define _enter(FMT, ...) \
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8679089ce9a1..c6c93f180707 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -752,12 +752,6 @@ extern unsigned afs_debug;
 #define dbgprintk(FMT,...) \
 	printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf,1,2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT,...)	dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
 #define kleave(FMT,...)	dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
 #define kdebug(FMT,...)	dbgprintk("    "FMT ,##__VA_ARGS__)
@@ -792,9 +786,9 @@ do {							\
 } while (0)
 
 #else
-#define _enter(FMT,...)	_dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
-#define _leave(FMT,...)	_dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
-#define _debug(FMT,...)	_dbprintk("    "FMT ,##__VA_ARGS__)
+#define _enter(FMT,...)	no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...)	no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...)	no_printk("    "FMT ,##__VA_ARGS__)
 #endif
 
 /*
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index a8cd821226da..bd6bc1bde2d7 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -267,13 +267,6 @@ do {									\
 #define dbgprintk(FMT, ...) \
 	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline void _dbprintk(const char *fmt, ...)
-	__attribute__((format(printf, 1, 2)));
-static inline void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
@@ -304,9 +297,9 @@ do {							\
 } while (0)
 
 #else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #endif
 
 #if 1 /* defined(__KDEBUGALL) */
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 6a026441c5a6..f6aad48d38a8 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -321,17 +321,11 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
 #define dbgprintk(FMT, ...) \
 	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf, 1, 2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
 
-#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 
 #ifdef __KDEBUG
 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
@@ -358,9 +352,9 @@ do {						\
 } while (0)
 
 #else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #endif
 
 /*
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d848cb854655..2b0a35e6bc69 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -306,6 +306,13 @@ static inline void log_buf_kexec_setup(void)
 }
 #endif
 
+/*
+ * Dummy printk for disabled debugging statements to use whilst maintaining
+ * gcc's format and side-effect checking.
+ */
+static inline __attribute__ ((format (printf, 1, 2)))
+int no_printk(const char *s, ...) { return 0; }
+
 extern int printk_needs_cpu(int cpu);
 extern void printk_tick(void);
 
diff --git a/kernel/cred.c b/kernel/cred.c
index 60bc8b1e32e6..9a3e22641fe7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -22,10 +22,6 @@
 #define kdebug(FMT, ...) \
 	printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
 #else
-static inline __attribute__((format(printf, 1, 2)))
-void no_printk(const char *fmt, ...)
-{
-}
 #define kdebug(FMT, ...) \
 	no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
 #endif
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 7043b294bb67..8e22bd345e71 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -597,12 +597,6 @@ extern unsigned rxrpc_debug;
 #define dbgprintk(FMT,...) \
 	printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf,1,2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT,...)	dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
 #define kleave(FMT,...)	dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
 #define kdebug(FMT,...)	dbgprintk("    "FMT ,##__VA_ARGS__)
@@ -655,11 +649,11 @@ do {							\
 } while (0)
 
 #else
-#define _enter(FMT,...)	_dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
-#define _leave(FMT,...)	_dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
-#define _debug(FMT,...)	_dbprintk("    "FMT ,##__VA_ARGS__)
-#define _proto(FMT,...)	_dbprintk("### "FMT ,##__VA_ARGS__)
-#define _net(FMT,...)	_dbprintk("@@@ "FMT ,##__VA_ARGS__)
+#define _enter(FMT,...)	no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...)	no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...)	no_printk("    "FMT ,##__VA_ARGS__)
+#define _proto(FMT,...)	no_printk("### "FMT ,##__VA_ARGS__)
+#define _net(FMT,...)	no_printk("@@@ "FMT ,##__VA_ARGS__)
 #endif
 
 /*
diff --git a/security/keys/internal.h b/security/keys/internal.h
index addb67b169f4..56a133d8f37d 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -15,11 +15,6 @@
 #include <linux/sched.h>
 #include <linux/key-type.h>
 
-static inline __attribute__((format(printf, 1, 2)))
-void no_printk(const char *fmt, ...)
-{
-}
-
 #ifdef __KDEBUG
 #define kenter(FMT, ...) \
 	printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
-- 
cgit v1.2.3-59-g8ed1b


From 4936a3b90d79dd8775c6ac23c2cf2dcebe29abde Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 9 Aug 2010 14:20:10 -0700
Subject: x86/hpet: Use the FSEC_PER_SEC constant for femto-second periods

The current computation, introduced with f12a15be63, of FSEC_PER_SEC using
the multiplication of (FSEC_PER_NSEC * NSEC_PER_SEC) is performed only
with 32bit integers on small machines, resulting in an overflow and a
*very* short intervals being programmed.  An interrupt storm follows.

Note that we also have to specify FSEC_PER_SEC as being long long to
overcome the same limitations.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/hpet.c | 4 ++--
 include/linux/time.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 33dbcc4ec5ff..351f9c0fea1f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -582,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 	 * scaled math multiplication factor for nanosecond to hpet tick
 	 * conversion.
 	 */
-	hpet_freq = 1000000000000000ULL;
+	hpet_freq = FSEC_PER_SEC;
 	do_div(hpet_freq, hpet_period);
 	evt->mult = div_sc((unsigned long) hpet_freq,
 				      NSEC_PER_SEC, evt->shift);
@@ -837,7 +837,7 @@ static int hpet_clocksource_register(void)
 	 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
 	 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
 	 */
-	hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC;
+	hpet_freq = FSEC_PER_SEC;
 	do_div(hpet_freq, hpet_period);
 	clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
 
diff --git a/include/linux/time.h b/include/linux/time.h
index cb34e35fabac..12612701b1ae 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -38,7 +38,7 @@ extern struct timezone sys_tz;
 #define NSEC_PER_MSEC	1000000L
 #define USEC_PER_SEC	1000000L
 #define NSEC_PER_SEC	1000000000L
-#define FSEC_PER_SEC	1000000000000000L
+#define FSEC_PER_SEC	1000000000000000LL
 
 #define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
 
-- 
cgit v1.2.3-59-g8ed1b


From 2069601b3f0ea38170d4b509b89f3ca0a373bdc1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 12 Aug 2010 14:23:04 -0700
Subject: Revert "fsnotify: store struct file not struct path"

This reverts commit 3bcf3860a4ff9bbc522820b4b765e65e4deceb3e (and the
accompanying commit c1e5c954020e "vfs/fsnotify: fsnotify_close can delay
the final work in fput" that was a horribly ugly hack to make it work at
all).

The 'struct file' approach not only causes that disgusting hack, it
somehow breaks pulseaudio, probably due to some other subtlety with
f_count handling.

Fix up various conflicts due to later fsnotify work.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file_table.c                      |  9 ---------
 fs/notify/fanotify/fanotify.c        |  8 ++++----
 fs/notify/fanotify/fanotify_user.c   |  6 +++---
 fs/notify/fsnotify.c                 | 12 ++++++------
 fs/notify/inotify/inotify_fsnotify.c | 12 ++++++------
 fs/notify/notification.c             | 33 +++++++++++---------------------
 include/linux/fsnotify.h             | 37 ++++++++++++++++++++----------------
 include/linux/fsnotify_backend.h     | 16 ++++++++--------
 kernel/audit_watch.c                 |  4 ++--
 9 files changed, 61 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index 2fc3b3c08911..edecd36fed9b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -230,15 +230,6 @@ static void __fput(struct file *file)
 	might_sleep();
 
 	fsnotify_close(file);
-
-	/*
-	 * fsnotify_create_event may have taken one or more references on this
-	 * file.  If it did so it left one reference for us to drop to make sure
-	 * its calls to fput could not prematurely destroy the file.
-	 */
-	if (atomic_long_read(&file->f_count))
-		return fput(file);
-
 	/*
 	 * The function eventpoll_release() should be the first called
 	 * in the file cleanup chain.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index eb8f73c9c131..756566fe8449 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -17,9 +17,9 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 	    old->data_type == new->data_type &&
 	    old->tgid == new->tgid) {
 		switch (old->data_type) {
-		case (FSNOTIFY_EVENT_FILE):
-			if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
-			    (old->file->f_path.dentry == new->file->f_path.dentry))
+		case (FSNOTIFY_EVENT_PATH):
+			if ((old->path.mnt == new->path.mnt) &&
+			    (old->path.dentry == new->path.dentry))
 				return true;
 		case (FSNOTIFY_EVENT_NONE):
 			return true;
@@ -174,7 +174,7 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
 		return false;
 
 	/* if we don't have enough info to send an event to userspace say no */
-	if (data_type != FSNOTIFY_EVENT_FILE)
+	if (data_type != FSNOTIFY_EVENT_PATH)
 		return false;
 
 	if (inode_mark && vfsmnt_mark) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 25a3b4dfcf61..032b837fcd11 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -65,7 +65,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
 	if (client_fd < 0)
 		return client_fd;
 
-	if (event->data_type != FSNOTIFY_EVENT_FILE) {
+	if (event->data_type != FSNOTIFY_EVENT_PATH) {
 		WARN_ON(1);
 		put_unused_fd(client_fd);
 		return -EINVAL;
@@ -75,8 +75,8 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
 	 * we need a new file handle for the userspace program so it can read even if it was
 	 * originally opened O_WRONLY.
 	 */
-	dentry = dget(event->file->f_path.dentry);
-	mnt = mntget(event->file->f_path.mnt);
+	dentry = dget(event->path.dentry);
+	mnt = mntget(event->path.mnt);
 	/* it's possible this event was an overflow event.  in that case dentry and mnt
 	 * are NULL;  That's fine, just don't call dentry open */
 	if (dentry && mnt)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4d2a82c1ceb1..3970392b2722 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -84,7 +84,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
+void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
@@ -92,7 +92,7 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 	bool should_update_children = false;
 
 	if (!dentry)
-		dentry = file->f_path.dentry;
+		dentry = path->dentry;
 
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 		return;
@@ -124,8 +124,8 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
 		 * specifies these are events which came from a child. */
 		mask |= FS_EVENT_ON_CHILD;
 
-		if (file)
-			fsnotify(p_inode, mask, file, FSNOTIFY_EVENT_FILE,
+		if (path)
+			fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
 				 dentry->d_name.name, 0);
 		else
 			fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
@@ -217,8 +217,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
-	if (data_is == FSNOTIFY_EVENT_FILE)
-		mnt = ((struct file *)data)->f_path.mnt;
+	if (data_is == FSNOTIFY_EVENT_PATH)
+		mnt = ((struct path *)data)->mnt;
 	else
 		mnt = NULL;
 
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 5e73eeb2c697..a91b69a6a291 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -52,9 +52,9 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 			    !strcmp(old->file_name, new->file_name))
 				return true;
 			break;
-		case (FSNOTIFY_EVENT_FILE):
-			if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
-			    (old->file->f_path.dentry == new->file->f_path.dentry))
+		case (FSNOTIFY_EVENT_PATH):
+			if ((old->path.mnt == new->path.mnt) &&
+			    (old->path.dentry == new->path.dentry))
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_NONE):
@@ -147,10 +147,10 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 				      __u32 mask, void *data, int data_type)
 {
 	if ((inode_mark->mask & FS_EXCL_UNLINK) &&
-	    (data_type == FSNOTIFY_EVENT_FILE)) {
-		struct file *file  = data;
+	    (data_type == FSNOTIFY_EVENT_PATH)) {
+		struct path *path = data;
 
-		if (d_unlinked(file->f_path.dentry))
+		if (d_unlinked(path->dentry))
 			return false;
 	}
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index d6c435adc7a2..f39260f8f865 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -31,7 +31,6 @@
  * allocated and used.
  */
 
-#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -90,8 +89,8 @@ void fsnotify_put_event(struct fsnotify_event *event)
 	if (atomic_dec_and_test(&event->refcnt)) {
 		pr_debug("%s: event=%p\n", __func__, event);
 
-		if (event->data_type == FSNOTIFY_EVENT_FILE)
-			fput(event->file);
+		if (event->data_type == FSNOTIFY_EVENT_PATH)
+			path_put(&event->path);
 
 		BUG_ON(!list_empty(&event->private_data_list));
 
@@ -376,8 +375,8 @@ struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
 		}
 	}
 	event->tgid = get_pid(old_event->tgid);
-	if (event->data_type == FSNOTIFY_EVENT_FILE)
-		get_file(event->file);
+	if (event->data_type == FSNOTIFY_EVENT_PATH)
+		path_get(&event->path);
 
 	return event;
 }
@@ -424,22 +423,11 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 	event->data_type = data_type;
 
 	switch (data_type) {
-	case FSNOTIFY_EVENT_FILE: {
-		event->file = data;
-		/*
-		 * if this file is about to disappear hold an extra reference
-		 * until we return to __fput so we don't have to worry about
-		 * future get/put destroying the file under us or generating
-		 * additional events.  Notice that we change f_mode without
-		 * holding f_lock.  This is safe since this is the only possible
-		 * reference to this object in the kernel (it was about to be
-		 * freed, remember?)
-		 */
-		if (!atomic_long_read(&event->file->f_count)) {
-			event->file->f_mode |= FMODE_NONOTIFY;
-			get_file(event->file);
-		}
-		get_file(event->file);
+	case FSNOTIFY_EVENT_PATH: {
+		struct path *path = data;
+		event->path.dentry = path->dentry;
+		event->path.mnt = path->mnt;
+		path_get(&event->path);
 		break;
 	}
 	case FSNOTIFY_EVENT_INODE:
@@ -447,7 +435,8 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		break;
 	case FSNOTIFY_EVENT_NONE:
 		event->inode = NULL;
-		event->file = NULL;
+		event->path.dentry = NULL;
+		event->path.mnt = NULL;
 		break;
 	default:
 		BUG();
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index e4e2204187ee..59d0df43ff9d 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -26,18 +26,19 @@ static inline void fsnotify_d_instantiate(struct dentry *dentry,
 }
 
 /* Notify this dentry's parent about a child's events. */
-static inline void fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
+static inline void fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
 	if (!dentry)
-		dentry = file->f_path.dentry;
+		dentry = path->dentry;
 
-	__fsnotify_parent(file, dentry, mask);
+	__fsnotify_parent(path, dentry, mask);
 }
 
 /* simple call site for access decisions */
 static inline int fsnotify_perm(struct file *file, int mask)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct path *path = &file->f_path;
+	struct inode *inode = path->dentry->d_inode;
 	__u32 fsnotify_mask = 0;
 
 	if (file->f_mode & FMODE_NONOTIFY)
@@ -51,7 +52,7 @@ static inline int fsnotify_perm(struct file *file, int mask)
 	else
 		BUG();
 
-	return fsnotify(inode, fsnotify_mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+	return fsnotify(inode, fsnotify_mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
 
 /*
@@ -186,15 +187,16 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
  */
 static inline void fsnotify_access(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct path *path = &file->f_path;
+	struct inode *inode = path->dentry->d_inode;
 	__u32 mask = FS_ACCESS;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
-		fsnotify_parent(file, NULL, mask);
-		fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+		fsnotify_parent(path, NULL, mask);
+		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 	}
 }
 
@@ -203,15 +205,16 @@ static inline void fsnotify_access(struct file *file)
  */
 static inline void fsnotify_modify(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct path *path = &file->f_path;
+	struct inode *inode = path->dentry->d_inode;
 	__u32 mask = FS_MODIFY;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
-		fsnotify_parent(file, NULL, mask);
-		fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+		fsnotify_parent(path, NULL, mask);
+		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 	}
 }
 
@@ -220,15 +223,16 @@ static inline void fsnotify_modify(struct file *file)
  */
 static inline void fsnotify_open(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct path *path = &file->f_path;
+	struct inode *inode = path->dentry->d_inode;
 	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
-		fsnotify_parent(file, NULL, mask);
-		fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+		fsnotify_parent(path, NULL, mask);
+		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 	}
 }
 
@@ -237,6 +241,7 @@ static inline void fsnotify_open(struct file *file)
  */
 static inline void fsnotify_close(struct file *file)
 {
+	struct path *path = &file->f_path;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	fmode_t mode = file->f_mode;
 	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
@@ -245,8 +250,8 @@ static inline void fsnotify_close(struct file *file)
 		mask |= FS_IN_ISDIR;
 
 	if (!(file->f_mode & FMODE_NONOTIFY)) {
-		fsnotify_parent(file, NULL, mask);
-		fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
+		fsnotify_parent(path, NULL, mask);
+		fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 9bbfd7204b04..ed36fb57c426 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -203,20 +203,20 @@ struct fsnotify_event {
 	/* to_tell may ONLY be dereferenced during handle_event(). */
 	struct inode *to_tell;	/* either the inode the event happened to or its parent */
 	/*
-	 * depending on the event type we should have either a file or inode
-	 * We hold a reference on file, but NOT on inode.  Since we have the ref on
-	 * the file, it may be dereferenced at any point during this object's
+	 * depending on the event type we should have either a path or inode
+	 * We hold a reference on path, but NOT on inode.  Since we have the ref on
+	 * the path, it may be dereferenced at any point during this object's
 	 * lifetime.  That reference is dropped when this object's refcnt hits
-	 * 0.  If this event contains an inode instead of a file, the inode may
+	 * 0.  If this event contains an inode instead of a path, the inode may
 	 * ONLY be used during handle_event().
 	 */
 	union {
-		struct file *file;
+		struct path path;
 		struct inode *inode;
 	};
 /* when calling fsnotify tell it if the data is a path or inode */
 #define FSNOTIFY_EVENT_NONE	0
-#define FSNOTIFY_EVENT_FILE	1
+#define FSNOTIFY_EVENT_PATH	1
 #define FSNOTIFY_EVENT_INODE	2
 	int data_type;		/* which of the above union we have */
 	atomic_t refcnt;	/* how many groups still are using/need to send this event */
@@ -293,7 +293,7 @@ struct fsnotify_mark {
 /* main fsnotify call to send events */
 extern int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		    const unsigned char *name, u32 cookie);
-extern void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask);
+extern void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
 extern u32 fsnotify_get_cookie(void);
@@ -422,7 +422,7 @@ static inline int fsnotify(struct inode *to_tell, __u32 mask, void *data, int da
 	return 0;
 }
 
-static inline void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
+static inline void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {}
 
 static inline void __fsnotify_inode_delete(struct inode *inode)
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6bf2306be7d6..f0c9b2e7542d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -526,8 +526,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 	BUG_ON(group != audit_watch_group);
 
 	switch (event->data_type) {
-	case (FSNOTIFY_EVENT_FILE):
-		inode = event->file->f_path.dentry->d_inode;
+	case (FSNOTIFY_EVENT_PATH):
+		inode = event->path.dentry->d_inode;
 		break;
 	case (FSNOTIFY_EVENT_INODE):
 		inode = event->inode;
-- 
cgit v1.2.3-59-g8ed1b


From e259f191f2244df04a7746fac1df8aa68ebd0106 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 13 Aug 2010 09:39:18 +0200
Subject: dma-mapping: fix build errors on !HAS_DMA architectures

commit 4565f0170dfc849b3629c27d769db800467baa62 "dma-mapping: unify
dma_get_cache_alignment implementations" causes build errors on
!HAS_DMA architectures/platforms like s390 and sun3:

include/linux/dma-mapping.h:145: error: static declaration of 'dma_get_cache_alignment' follows non-static declaration
include/asm-generic/dma-mapping-broken.h:73: error: previous declaration of 'dma_get_cache_alignment' was here

Fix this by adding an explicit ifdef.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dma-mapping.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index e0670a512056..ce29b8151198 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -142,6 +142,7 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
 		return -EIO;
 }
 
+#ifdef CONFIG_HAS_DMA
 static inline int dma_get_cache_alignment(void)
 {
 #ifdef ARCH_DMA_MINALIGN
@@ -149,6 +150,7 @@ static inline int dma_get_cache_alignment(void)
 #endif
 	return 1;
 }
+#endif
 
 /* flags for the coherent memory api */
 #define	DMA_MEMORY_MAP			0x01
-- 
cgit v1.2.3-59-g8ed1b


From b19dd42faf413b4705d4adb38521e82d73fa4249 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 4 Jul 2010 00:15:10 +0200
Subject: bkl: Remove locked .ioctl file operation

The last user is gone, so we can safely remove this

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/filesystems/Locking |  8 +-------
 Documentation/filesystems/vfs.txt |  6 +-----
 fs/bad_inode.c                    |  7 -------
 fs/compat_ioctl.c                 |  3 +--
 fs/ioctl.c                        | 18 ++++--------------
 fs/proc/inode.c                   | 17 ++++-------------
 include/linux/fs.h                |  5 ++---
 7 files changed, 13 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index bbcc15651a21..2db4283efa8d 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -374,8 +374,6 @@ prototypes:
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
-	int (*ioctl) (struct inode *, struct file *, unsigned int,
-			unsigned long);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -409,8 +407,7 @@ write:			no
 aio_write:		no
 readdir: 		no
 poll:			no
-ioctl:			yes	(see below)
-unlocked_ioctl:		no	(see below)
+unlocked_ioctl:		no
 compat_ioctl:		no
 mmap:			no
 open:			no
@@ -453,9 +450,6 @@ move ->readdir() to inode_operations and use a separate method for directory
 anything that resembles union-mount we won't have a struct file for all
 components. And there are other reasons why the current interface is a mess...
 
-->ioctl() on regular files is superceded by the ->unlocked_ioctl() that
-doesn't take the BKL.
-
 ->read on directories probably must go away - we should just enforce -EISDIR
 in sys_read() and friends.
 
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 94677e7dcb13..ed7e5efc06d8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -727,7 +727,6 @@ struct file_operations {
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
-	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -768,10 +767,7 @@ otherwise noted.
 	activity on this file and (optionally) go to sleep until there
 	is activity. Called by the select(2) and poll(2) system calls
 
-  ioctl: called by the ioctl(2) system call
-
-  unlocked_ioctl: called by the ioctl(2) system call. Filesystems that do not
-  	require the BKL should use this method instead of the ioctl() above.
+  unlocked_ioctl: called by the ioctl(2) system call.
 
   compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
  	 are used on 64 bit kernels.
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 52e59bf4aa5f..f024d8aaddef 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -55,12 +55,6 @@ static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
 	return POLLERR;
 }
 
-static int bad_file_ioctl (struct inode *inode, struct file *filp,
-			unsigned int cmd, unsigned long arg)
-{
-	return -EIO;
-}
-
 static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
 			unsigned long arg)
 {
@@ -159,7 +153,6 @@ static const struct file_operations bad_file_ops =
 	.aio_write	= bad_file_aio_write,
 	.readdir	= bad_file_readdir,
 	.poll		= bad_file_poll,
-	.ioctl		= bad_file_ioctl,
 	.unlocked_ioctl	= bad_file_unlocked_ioctl,
 	.compat_ioctl	= bad_file_compat_ioctl,
 	.mmap		= bad_file_mmap,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 70227e0dc01d..03e59aa318eb 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1699,8 +1699,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 				goto out_fput;
 		}
 
-		if (!filp->f_op ||
-		    (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl))
+		if (!filp->f_op || !filp->f_op->unlocked_ioctl)
 			goto do_ioctl;
 		break;
 	}
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 2d140a713861..f855ea4fc888 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -29,7 +29,6 @@
  * @arg:	command-specific argument for ioctl
  *
  * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise
- * invokes filesystem specific ->ioctl method.  If neither method exists,
  * returns -ENOTTY.
  *
  * Returns 0 on success, -errno on error.
@@ -39,21 +38,12 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
 {
 	int error = -ENOTTY;
 
-	if (!filp->f_op)
+	if (!filp->f_op || !filp->f_op->unlocked_ioctl)
 		goto out;
 
-	if (filp->f_op->unlocked_ioctl) {
-		error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
-		if (error == -ENOIOCTLCMD)
-			error = -EINVAL;
-		goto out;
-	} else if (filp->f_op->ioctl) {
-		lock_kernel();
-		error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
-					  filp, cmd, arg);
-		unlock_kernel();
-	}
-
+	error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
+	if (error == -ENOIOCTLCMD)
+		error = -EINVAL;
  out:
 	return error;
 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 23561cda7245..9c2b5f484879 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -214,8 +214,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 {
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	long rv = -ENOTTY;
-	long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
-	int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
+	long (*ioctl)(struct file *, unsigned int, unsigned long);
 
 	spin_lock(&pde->pde_unload_lock);
 	if (!pde->proc_fops) {
@@ -223,19 +222,11 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 		return rv;
 	}
 	pde->pde_users++;
-	unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
-	ioctl = pde->proc_fops->ioctl;
+	ioctl = pde->proc_fops->unlocked_ioctl;
 	spin_unlock(&pde->pde_unload_lock);
 
-	if (unlocked_ioctl) {
-		rv = unlocked_ioctl(file, cmd, arg);
-		if (rv == -ENOIOCTLCMD)
-			rv = -EINVAL;
-	} else if (ioctl) {
-		WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
-			  "%pf will be called without the Bkl held\n", ioctl);
-		rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
-	}
+	if (ioctl)
+		rv = ioctl(file, cmd, arg);
 
 	pde_users_dec(pde);
 	return rv;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7a0625e26a39..3c786fdaeab6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1483,8 +1483,8 @@ struct block_device_operations;
 
 /*
  * NOTE:
- * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl
- * can be called without the big kernel lock held in all filesystems.
+ * all file operations except setlease can be called without
+ * the big kernel lock held in all filesystems.
  */
 struct file_operations {
 	struct module *owner;
@@ -1495,7 +1495,6 @@ struct file_operations {
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
-	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
-- 
cgit v1.2.3-59-g8ed1b


From c7887325230aec47d47a32562a6e26014a0fafca Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 11 Aug 2010 11:26:22 +0100
Subject: Mark arguments to certain syscalls as being const

Mark arguments to certain system calls as being const where they should be but
aren't.  The list includes:

 (*) The filename arguments of various stat syscalls, execve(), various utimes
     syscalls and some mount syscalls.

 (*) The filename arguments of some syscall helpers relating to the above.

 (*) The buffer argument of various write syscalls.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/osf_sys.c             |  6 +++---
 arch/alpha/kernel/process.c             |  2 +-
 arch/arm/kernel/sys_arm.c               |  4 ++--
 arch/arm/kernel/sys_oabi-compat.c       |  6 +++---
 arch/avr32/include/asm/syscalls.h       |  2 +-
 arch/avr32/kernel/process.c             |  3 ++-
 arch/blackfin/kernel/process.c          |  2 +-
 arch/frv/kernel/process.c               |  3 ++-
 arch/h8300/kernel/process.c             |  2 +-
 arch/ia64/include/asm/unistd.h          |  2 +-
 arch/ia64/kernel/process.c              |  2 +-
 arch/m32r/kernel/process.c              |  3 ++-
 arch/m68k/kernel/process.c              |  2 +-
 arch/m68knommu/kernel/process.c         |  2 +-
 arch/microblaze/kernel/sys_microblaze.c |  2 +-
 arch/mips/kernel/syscall.c              |  2 +-
 arch/mn10300/kernel/process.c           |  2 +-
 arch/parisc/hpux/fs.c                   |  7 ++++---
 arch/powerpc/kernel/process.c           |  2 +-
 arch/powerpc/kernel/sys_ppc32.c         |  2 +-
 arch/s390/kernel/compat_linux.c         | 10 +++++-----
 arch/s390/kernel/compat_linux.h         | 10 +++++-----
 arch/s390/kernel/entry.h                |  2 +-
 arch/s390/kernel/process.c              |  2 +-
 arch/sh/include/asm/syscalls_32.h       |  2 +-
 arch/sh/include/asm/syscalls_64.h       |  2 +-
 arch/sh/kernel/process_64.c             |  2 +-
 arch/sparc/kernel/sys_sparc32.c         |  7 ++++---
 arch/um/kernel/exec.c                   |  6 +++---
 arch/um/kernel/internal.h               |  2 +-
 arch/um/kernel/syscall.c                |  2 +-
 arch/x86/ia32/sys_ia32.c                | 14 +++++++-------
 arch/x86/include/asm/sys_ia32.h         | 12 ++++++------
 arch/x86/include/asm/syscalls.h         |  2 +-
 arch/x86/kernel/entry_64.S              |  4 ++--
 arch/x86/kernel/process.c               |  2 +-
 arch/xtensa/kernel/process.c            |  2 +-
 fs/compat.c                             | 23 +++++++++++++----------
 fs/stat.c                               | 29 ++++++++++++++++++-----------
 fs/utimes.c                             |  7 ++++---
 include/linux/compat.h                  |  6 +++---
 include/linux/fs.h                      |  6 +++---
 include/linux/syscalls.h                | 20 ++++++++++----------
 include/linux/time.h                    |  2 +-
 44 files changed, 125 insertions(+), 109 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 88131c6e42e3..fb58150a7e8f 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -244,7 +244,7 @@ do_osf_statfs(struct path *path, struct osf_statfs __user *buffer,
 	return error;	
 }
 
-SYSCALL_DEFINE3(osf_statfs, char __user *, pathname,
+SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
 		struct osf_statfs __user *, buffer, unsigned long, bufsiz)
 {
 	struct path path;
@@ -358,7 +358,7 @@ osf_procfs_mount(char *dirname, struct procfs_args __user *args, int flags)
 	return do_mount("", dirname, "proc", flags, NULL);
 }
 
-SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path,
+SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path,
 		int, flag, void __user *, data)
 {
 	int retval;
@@ -932,7 +932,7 @@ SYSCALL_DEFINE3(osf_setitimer, int, which, struct itimerval32 __user *, in,
 
 }
 
-SYSCALL_DEFINE2(osf_utimes, char __user *, filename,
+SYSCALL_DEFINE2(osf_utimes, const char __user *, filename,
 		struct timeval32 __user *, tvs)
 {
 	struct timespec tv[2];
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 395a464353b8..88e608aebc8c 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -387,7 +387,7 @@ EXPORT_SYMBOL(dump_elf_task_fp);
  * sys_execve() executes a new program.
  */
 asmlinkage int
-do_sys_execve(char __user *ufilename, char __user * __user *argv,
+do_sys_execve(const char __user *ufilename, char __user * __user *argv,
 	      char __user * __user *envp, struct pt_regs *regs)
 {
 	int error;
diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c
index c23501842b98..5b7c541a4c63 100644
--- a/arch/arm/kernel/sys_arm.c
+++ b/arch/arm/kernel/sys_arm.c
@@ -62,7 +62,7 @@ asmlinkage int sys_vfork(struct pt_regs *regs)
 /* sys_execve() executes a new program.
  * This is called indirectly via a small wrapper
  */
-asmlinkage int sys_execve(char __user *filenamei, char __user * __user *argv,
+asmlinkage int sys_execve(const char __user *filenamei, char __user * __user *argv,
 			  char __user * __user *envp, struct pt_regs *regs)
 {
 	int error;
@@ -84,7 +84,7 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
 	int ret;
 
 	memset(&regs, 0, sizeof(struct pt_regs));
-	ret = do_execve((char *)filename, (char __user * __user *)argv,
+	ret = do_execve(filename, (char __user * __user *)argv,
 			(char __user * __user *)envp, &regs);
 	if (ret < 0)
 		goto out;
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 33ff678e32f2..4ad8da15ef2b 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -141,7 +141,7 @@ static long cp_oldabi_stat64(struct kstat *stat,
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_oabi_stat64(char __user * filename,
+asmlinkage long sys_oabi_stat64(const char __user * filename,
 				struct oldabi_stat64 __user * statbuf)
 {
 	struct kstat stat;
@@ -151,7 +151,7 @@ asmlinkage long sys_oabi_stat64(char __user * filename,
 	return error;
 }
 
-asmlinkage long sys_oabi_lstat64(char __user * filename,
+asmlinkage long sys_oabi_lstat64(const char __user * filename,
 				 struct oldabi_stat64 __user * statbuf)
 {
 	struct kstat stat;
@@ -172,7 +172,7 @@ asmlinkage long sys_oabi_fstat64(unsigned long fd,
 }
 
 asmlinkage long sys_oabi_fstatat64(int dfd,
-				   char __user *filename,
+				   const char __user *filename,
 				   struct oldabi_stat64  __user *statbuf,
 				   int flag)
 {
diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h
index 66a197266637..ab608b70b24d 100644
--- a/arch/avr32/include/asm/syscalls.h
+++ b/arch/avr32/include/asm/syscalls.h
@@ -21,7 +21,7 @@ asmlinkage int sys_clone(unsigned long, unsigned long,
 			 unsigned long, unsigned long,
 			 struct pt_regs *);
 asmlinkage int sys_vfork(struct pt_regs *);
-asmlinkage int sys_execve(char __user *, char __user *__user *,
+asmlinkage int sys_execve(const char __user *, char __user *__user *,
 			  char __user *__user *, struct pt_regs *);
 
 /* kernel/signal.c */
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 2d76515745a4..e5daddff397d 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -383,7 +383,8 @@ asmlinkage int sys_vfork(struct pt_regs *regs)
 		       0, NULL, NULL);
 }
 
-asmlinkage int sys_execve(char __user *ufilename, char __user *__user *uargv,
+asmlinkage int sys_execve(const char __user *ufilename,
+			  char __user *__user *uargv,
 			  char __user *__user *uenvp, struct pt_regs *regs)
 {
 	int error;
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 93ec07da2e51..a566f61c002a 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -209,7 +209,7 @@ copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp)
 {
 	int error;
 	char *filename;
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 21d0fd19276d..428931cf2f0c 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -250,7 +250,8 @@ int copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name, char __user * __user *argv,
+			  char __user * __user *envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 8c8b0ffa6ad7..8b7b78d77d5c 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -212,7 +212,7 @@ int copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char *name, char **argv, char **envp,int dummy,...)
+asmlinkage int sys_execve(const char *name, char **argv, char **envp,int dummy,...)
 {
 	int error;
 	char * filename;
diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h
index bb8b0fff32b3..46f36fc5125f 100644
--- a/arch/ia64/include/asm/unistd.h
+++ b/arch/ia64/include/asm/unistd.h
@@ -353,7 +353,7 @@ asmlinkage unsigned long sys_mmap2(
 				int fd, long pgoff);
 struct pt_regs;
 struct sigaction;
-long sys_execve(char __user *filename, char __user * __user *argv,
+long sys_execve(const char __user *filename, char __user * __user *argv,
 			   char __user * __user *envp, struct pt_regs *regs);
 asmlinkage long sys_ia64_pipe(void);
 asmlinkage long sys_rt_sigaction(int sig,
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 53f1648c8b81..a879c03b7f1c 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -633,7 +633,7 @@ dump_fpu (struct pt_regs *pt, elf_fpregset_t dst)
 }
 
 long
-sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp,
+sys_execve (const char __user *filename, char __user * __user *argv, char __user * __user *envp,
 	    struct pt_regs *regs)
 {
 	char *fname;
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index bc8c8c1511b2..8665a4d868ec 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -288,7 +288,8 @@ asmlinkage int sys_vfork(unsigned long r0, unsigned long r1, unsigned long r2,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv,
+asmlinkage int sys_execve(const char __user *ufilename,
+			  char __user * __user *uargv,
 			  char __user * __user *uenvp,
 			  unsigned long r3, unsigned long r4, unsigned long r5,
 			  unsigned long r6, struct pt_regs regs)
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 1a6be27cf165..221d0b71ce39 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -315,7 +315,7 @@ EXPORT_SYMBOL(dump_fpu);
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index 6aa66134b433..6350f68cd026 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -350,7 +350,7 @@ void dump(struct pt_regs *fp)
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char *name, char **argv, char **envp)
+asmlinkage int sys_execve(const char *name, char **argv, char **envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c
index f4e00b7f1259..6abab6ebedbe 100644
--- a/arch/microblaze/kernel/sys_microblaze.c
+++ b/arch/microblaze/kernel/sys_microblaze.c
@@ -47,7 +47,7 @@ asmlinkage long microblaze_clone(int flags, unsigned long stack, struct pt_regs
 	return do_fork(flags, stack, regs, 0, NULL, NULL);
 }
 
-asmlinkage long microblaze_execve(char __user *filenamei, char __user *__user *argv,
+asmlinkage long microblaze_execve(const char __user *filenamei, char __user *__user *argv,
 			char __user *__user *envp, struct pt_regs *regs)
 {
 	int error;
diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index 58bab2ef257f..bddce0bca195 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -254,7 +254,7 @@ asmlinkage int sys_execve(nabi_no_regargs struct pt_regs regs)
 	int error;
 	char * filename;
 
-	filename = getname((char __user *) (long)regs.regs[4]);
+	filename = getname((const char __user *) (long)regs.regs[4]);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 82b817c7f7b6..762eb325b949 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -268,7 +268,7 @@ asmlinkage long sys_vfork(void)
 		       0, NULL, NULL);
 }
 
-asmlinkage long sys_execve(char __user *name,
+asmlinkage long sys_execve(const char __user *name,
 			   char __user * __user *argv,
 			   char __user * __user *envp)
 {
diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c
index 6935123178eb..1444875a7611 100644
--- a/arch/parisc/hpux/fs.c
+++ b/arch/parisc/hpux/fs.c
@@ -36,7 +36,7 @@ int hpux_execve(struct pt_regs *regs)
 	int error;
 	char *filename;
 
-	filename = getname((char __user *) regs->gr[26]);
+	filename = getname((const char __user *) regs->gr[26]);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
@@ -169,7 +169,7 @@ static int cp_hpux_stat(struct kstat *stat, struct hpux_stat64 __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-long hpux_stat64(char __user *filename, struct hpux_stat64 __user *statbuf)
+long hpux_stat64(const char __user *filename, struct hpux_stat64 __user *statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -191,7 +191,8 @@ long hpux_fstat64(unsigned int fd, struct hpux_stat64 __user *statbuf)
 	return error;
 }
 
-long hpux_lstat64(char __user *filename, struct hpux_stat64 __user *statbuf)
+long hpux_lstat64(const char __user *filename,
+		  struct hpux_stat64 __user *statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat(filename, &stat);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e78a5add7f15..feacfb789686 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1027,7 +1027,7 @@ int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
 	int error;
 	char *filename;
 
-	filename = getname((char __user *) a0);
+	filename = getname((const char __user *) a0);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index 19471a1cef1a..20fd701a686a 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -546,7 +546,7 @@ compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_siz
 	return sys_pread64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
 }
 
-compat_ssize_t compat_sys_pwrite64(unsigned int fd, char __user *ubuf, compat_size_t count,
+compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count,
 			      u32 reg6, u32 poshi, u32 poslo)
 {
 	return sys_pwrite64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo);
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 73b624ed9cd8..1e6449c79ab6 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -436,7 +436,7 @@ sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
  * sys32_execve() executes a new program after the asm stub has set
  * things up for us.  This should basically do what I want it to.
  */
-asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
 			     compat_uptr_t __user *envp)
 {
 	struct pt_regs *regs = task_pt_regs(current);
@@ -570,7 +570,7 @@ static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat)
 	return copy_to_user(ubuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 
 }
 
-asmlinkage long sys32_stat64(char __user * filename, struct stat64_emu31 __user * statbuf)
+asmlinkage long sys32_stat64(const char __user * filename, struct stat64_emu31 __user * statbuf)
 {
 	struct kstat stat;
 	int ret = vfs_stat(filename, &stat);
@@ -579,7 +579,7 @@ asmlinkage long sys32_stat64(char __user * filename, struct stat64_emu31 __user
 	return ret;
 }
 
-asmlinkage long sys32_lstat64(char __user * filename, struct stat64_emu31 __user * statbuf)
+asmlinkage long sys32_lstat64(const char __user * filename, struct stat64_emu31 __user * statbuf)
 {
 	struct kstat stat;
 	int ret = vfs_lstat(filename, &stat);
@@ -597,7 +597,7 @@ asmlinkage long sys32_fstat64(unsigned long fd, struct stat64_emu31 __user * sta
 	return ret;
 }
 
-asmlinkage long sys32_fstatat64(unsigned int dfd, char __user *filename,
+asmlinkage long sys32_fstatat64(unsigned int dfd, const char __user *filename,
 				struct stat64_emu31 __user* statbuf, int flag)
 {
 	struct kstat stat;
@@ -655,7 +655,7 @@ asmlinkage long sys32_read(unsigned int fd, char __user * buf, size_t count)
 	return sys_read(fd, buf, count);
 }
 
-asmlinkage long sys32_write(unsigned int fd, char __user * buf, size_t count)
+asmlinkage long sys32_write(unsigned int fd, const char __user * buf, size_t count)
 {
 	if ((compat_ssize_t) count < 0)
 		return -EINVAL; 
diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h
index cb97afc85c94..9635d759c2b9 100644
--- a/arch/s390/kernel/compat_linux.h
+++ b/arch/s390/kernel/compat_linux.h
@@ -193,7 +193,7 @@ long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
 			  compat_sigset_t __user *oset, size_t sigsetsize);
 long sys32_rt_sigpending(compat_sigset_t __user *set, size_t sigsetsize);
 long sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo);
-long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
 		  compat_uptr_t __user *envp);
 long sys32_init_module(void __user *umod, unsigned long len,
 		       const char __user *uargs);
@@ -207,16 +207,16 @@ long sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset,
 		    size_t count);
 long sys32_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset,
 		      s32 count);
-long sys32_stat64(char __user * filename, struct stat64_emu31 __user * statbuf);
-long sys32_lstat64(char __user * filename,
+long sys32_stat64(const char __user * filename, struct stat64_emu31 __user * statbuf);
+long sys32_lstat64(const char __user * filename,
 		   struct stat64_emu31 __user * statbuf);
 long sys32_fstat64(unsigned long fd, struct stat64_emu31 __user * statbuf);
-long sys32_fstatat64(unsigned int dfd, char __user *filename,
+long sys32_fstatat64(unsigned int dfd, const char __user *filename,
 		     struct stat64_emu31 __user* statbuf, int flag);
 unsigned long old32_mmap(struct mmap_arg_struct_emu31 __user *arg);
 long sys32_mmap2(struct mmap_arg_struct_emu31 __user *arg);
 long sys32_read(unsigned int fd, char __user * buf, size_t count);
-long sys32_write(unsigned int fd, char __user * buf, size_t count);
+long sys32_write(unsigned int fd, const char __user * buf, size_t count);
 long sys32_fadvise64(int fd, loff_t offset, size_t len, int advise);
 long sys32_fadvise64_64(struct fadvise64_64_args __user *args);
 long sys32_sigaction(int sig, const struct old_sigaction32 __user *act,
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 5bb1e6b5db26..403fb430a896 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -42,7 +42,7 @@ long sys_clone(unsigned long newsp, unsigned long clone_flags,
 	       int __user *parent_tidptr, int __user *child_tidptr);
 long sys_vfork(void);
 void execve_tail(void);
-long sys_execve(char __user *name, char __user * __user *argv,
+long sys_execve(const char __user *name, char __user * __user *argv,
 		char __user * __user *envp);
 long sys_sigsuspend(int history0, int history1, old_sigset_t mask);
 long sys_sigaction(int sig, const struct old_sigaction __user *act,
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 1039fdea15b5..7eafaf2662b9 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -267,7 +267,7 @@ asmlinkage void execve_tail(void)
 /*
  * sys_execve() executes a new program.
  */
-SYSCALL_DEFINE3(execve, char __user *, name, char __user * __user *, argv,
+SYSCALL_DEFINE3(execve, const char __user *, name, char __user * __user *, argv,
 		char __user * __user *, envp)
 {
 	struct pt_regs *regs = task_pt_regs(current);
diff --git a/arch/sh/include/asm/syscalls_32.h b/arch/sh/include/asm/syscalls_32.h
index 8b30200305c3..be201fdc97aa 100644
--- a/arch/sh/include/asm/syscalls_32.h
+++ b/arch/sh/include/asm/syscalls_32.h
@@ -19,7 +19,7 @@ asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
 asmlinkage int sys_vfork(unsigned long r4, unsigned long r5,
 			 unsigned long r6, unsigned long r7,
 			 struct pt_regs __regs);
-asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv,
+asmlinkage int sys_execve(const char __user *ufilename, char __user * __user *uargv,
 			  char __user * __user *uenvp, unsigned long r7,
 			  struct pt_regs __regs);
 asmlinkage int sys_sigsuspend(old_sigset_t mask, unsigned long r5,
diff --git a/arch/sh/include/asm/syscalls_64.h b/arch/sh/include/asm/syscalls_64.h
index 751fd8811364..ee519f41d950 100644
--- a/arch/sh/include/asm/syscalls_64.h
+++ b/arch/sh/include/asm/syscalls_64.h
@@ -21,7 +21,7 @@ asmlinkage int sys_vfork(unsigned long r2, unsigned long r3,
 			 unsigned long r4, unsigned long r5,
 			 unsigned long r6, unsigned long r7,
 			 struct pt_regs *pregs);
-asmlinkage int sys_execve(char *ufilename, char **uargv,
+asmlinkage int sys_execve(const char *ufilename, char **uargv,
 			  char **uenvp, unsigned long r5,
 			  unsigned long r6, unsigned long r7,
 			  struct pt_regs *pregs);
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index d4ca6480e355..68d128d651b3 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -483,7 +483,7 @@ asmlinkage int sys_vfork(unsigned long r2, unsigned long r3,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char *ufilename, char **uargv,
+asmlinkage int sys_execve(const char *ufilename, char **uargv,
 			  char **uenvp, unsigned long r5,
 			  unsigned long r6, unsigned long r7,
 			  struct pt_regs *pregs)
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index c0ca87553e1c..e6375a750d9a 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -162,7 +162,7 @@ static int cp_compat_stat64(struct kstat *stat,
 	return err;
 }
 
-asmlinkage long compat_sys_stat64(char __user * filename,
+asmlinkage long compat_sys_stat64(const char __user * filename,
 		struct compat_stat64 __user *statbuf)
 {
 	struct kstat stat;
@@ -173,7 +173,7 @@ asmlinkage long compat_sys_stat64(char __user * filename,
 	return error;
 }
 
-asmlinkage long compat_sys_lstat64(char __user * filename,
+asmlinkage long compat_sys_lstat64(const char __user * filename,
 		struct compat_stat64 __user *statbuf)
 {
 	struct kstat stat;
@@ -195,7 +195,8 @@ asmlinkage long compat_sys_fstat64(unsigned int fd,
 	return error;
 }
 
-asmlinkage long compat_sys_fstatat64(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_fstatat64(unsigned int dfd,
+		const char __user *filename,
 		struct compat_stat64 __user * statbuf, int flag)
 {
 	struct kstat stat;
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 97974c1bdd12..59b20d93b6d4 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -44,7 +44,7 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
 	PT_REGS_SP(regs) = esp;
 }
 
-static long execve1(char *file, char __user * __user *argv,
+static long execve1(const char *file, char __user * __user *argv,
 		    char __user *__user *env)
 {
 	long error;
@@ -61,7 +61,7 @@ static long execve1(char *file, char __user * __user *argv,
 	return error;
 }
 
-long um_execve(char *file, char __user *__user *argv, char __user *__user *env)
+long um_execve(const char *file, char __user *__user *argv, char __user *__user *env)
 {
 	long err;
 
@@ -71,7 +71,7 @@ long um_execve(char *file, char __user *__user *argv, char __user *__user *env)
 	return err;
 }
 
-long sys_execve(char __user *file, char __user *__user *argv,
+long sys_execve(const char __user *file, char __user *__user *argv,
 		char __user *__user *env)
 {
 	long error;
diff --git a/arch/um/kernel/internal.h b/arch/um/kernel/internal.h
index 3bda43c7a786..1303a105fe91 100644
--- a/arch/um/kernel/internal.h
+++ b/arch/um/kernel/internal.h
@@ -1 +1 @@
-extern long um_execve(char *file, char __user *__user *argv, char __user *__user *env);
+extern long um_execve(const char *file, char __user *__user *argv, char __user *__user *env);
diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c
index 4393173923f5..7427c0b1930c 100644
--- a/arch/um/kernel/syscall.c
+++ b/arch/um/kernel/syscall.c
@@ -58,7 +58,7 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
-	ret = um_execve((char *)filename, (char __user *__user *)argv,
+	ret = um_execve(filename, (char __user *__user *)argv,
 			(char __user *__user *) envp);
 	set_fs(fs);
 
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 3d093311d5e2..849813f398e7 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -51,7 +51,7 @@
 #define AA(__x)		((unsigned long)(__x))
 
 
-asmlinkage long sys32_truncate64(char __user *filename,
+asmlinkage long sys32_truncate64(const char __user *filename,
 				 unsigned long offset_low,
 				 unsigned long offset_high)
 {
@@ -96,7 +96,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
 	return 0;
 }
 
-asmlinkage long sys32_stat64(char __user *filename,
+asmlinkage long sys32_stat64(const char __user *filename,
 			     struct stat64 __user *statbuf)
 {
 	struct kstat stat;
@@ -107,7 +107,7 @@ asmlinkage long sys32_stat64(char __user *filename,
 	return ret;
 }
 
-asmlinkage long sys32_lstat64(char __user *filename,
+asmlinkage long sys32_lstat64(const char __user *filename,
 			      struct stat64 __user *statbuf)
 {
 	struct kstat stat;
@@ -126,7 +126,7 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
 	return ret;
 }
 
-asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
+asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename,
 			      struct stat64 __user *statbuf, int flag)
 {
 	struct kstat stat;
@@ -408,8 +408,8 @@ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
 			 ((loff_t)AA(poshi) << 32) | AA(poslo));
 }
 
-asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count,
-			     u32 poslo, u32 poshi)
+asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf,
+			     u32 count, u32 poslo, u32 poshi)
 {
 	return sys_pwrite64(fd, ubuf, count,
 			  ((loff_t)AA(poshi) << 32) | AA(poslo));
@@ -449,7 +449,7 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
 	return ret;
 }
 
-asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
 			     compat_uptr_t __user *envp, struct pt_regs *regs)
 {
 	long error;
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index cf4e2e381cba..cb238526a9f1 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -18,13 +18,13 @@
 #include <asm/ia32.h>
 
 /* ia32/sys_ia32.c */
-asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
+asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long);
 asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
 
-asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
-asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *);
+asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *);
 asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
-asmlinkage long sys32_fstatat(unsigned int, char __user *,
+asmlinkage long sys32_fstatat(unsigned int, const char __user *,
 			      struct stat64 __user *, int);
 struct mmap_arg_struct32;
 asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
@@ -49,12 +49,12 @@ asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
 asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
 
 asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
-asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
+asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
 
 asmlinkage long sys32_personality(unsigned long);
 asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
 
-asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
+asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *,
 			     compat_uptr_t __user *, struct pt_regs *);
 asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
 
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 5c044b43e9a7..feb2ff9bfc2d 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -23,7 +23,7 @@ long sys_iopl(unsigned int, struct pt_regs *);
 /* kernel/process.c */
 int sys_fork(struct pt_regs *);
 int sys_vfork(struct pt_regs *);
-long sys_execve(char __user *, char __user * __user *,
+long sys_execve(const char __user *, char __user * __user *,
 		char __user * __user *, struct pt_regs *);
 long sys_clone(unsigned long, unsigned long, void __user *,
 	       void __user *, struct pt_regs *);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c5ea5cdbe7b3..17be5ec7cbba 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper)
  * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
  *
  * C extern interface:
- *	 extern long execve(char *name, char **argv, char **envp)
+ *	 extern long execve(const char *name, char **argv, char **envp)
  *
  * asm input arguments:
  *	rdi: name, rsi: argv, rdx: envp
  *
  * We want to fallback into:
- *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ *	extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
  *
  * do_sys_execve asm fallback arguments:
  *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d401f1d2d06e..64ecaf0af9af 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(kernel_thread);
 /*
  * sys_execve() executes a new program.
  */
-long sys_execve(char __user *name, char __user * __user *argv,
+long sys_execve(const char __user *name, char __user * __user *argv,
 		char __user * __user *envp, struct pt_regs *regs)
 {
 	long error;
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index f167e0f5e05e..7c2f38f68ebb 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -318,7 +318,7 @@ long xtensa_clone(unsigned long clone_flags, unsigned long newsp,
  */
 
 asmlinkage
-long xtensa_execve(char __user *name, char __user * __user *argv,
+long xtensa_execve(const char __user *name, char __user * __user *argv,
                    char __user * __user *envp,
                    long a3, long a4, long a5,
                    struct pt_regs *regs)
diff --git a/fs/compat.c b/fs/compat.c
index e6d5d70cf3cf..718c7062aec1 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -77,7 +77,8 @@ int compat_printk(const char *fmt, ...)
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
  */
-asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __user *t)
+asmlinkage long compat_sys_utime(const char __user *filename,
+				 struct compat_utimbuf __user *t)
 {
 	struct timespec tv[2];
 
@@ -91,7 +92,7 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags)
+asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
 {
 	struct timespec tv[2];
 
@@ -106,7 +107,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
 	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }
 
-asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
 {
 	struct timespec tv[2];
 
@@ -125,7 +126,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, st
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }
 
-asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
 {
 	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
@@ -169,7 +170,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 	return err;
 }
 
-asmlinkage long compat_sys_newstat(char __user * filename,
+asmlinkage long compat_sys_newstat(const char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
@@ -181,7 +182,7 @@ asmlinkage long compat_sys_newstat(char __user * filename,
 	return cp_compat_stat(&stat, statbuf);
 }
 
-asmlinkage long compat_sys_newlstat(char __user * filename,
+asmlinkage long compat_sys_newlstat(const char __user * filename,
 		struct compat_stat __user *statbuf)
 {
 	struct kstat stat;
@@ -194,7 +195,8 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
 }
 
 #ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd,
+		const char __user *filename,
 		struct compat_stat __user *statbuf, int flag)
 {
 	struct kstat stat;
@@ -837,9 +839,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
 #define NCPFS_NAME      "ncpfs"
 #define NFS4_NAME	"nfs4"
 
-asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
-				 char __user * type, unsigned long flags,
-				 void __user * data)
+asmlinkage long compat_sys_mount(const char __user * dev_name,
+				 const char __user * dir_name,
+				 const char __user * type, unsigned long flags,
+				 const void __user * data)
 {
 	char *kernel_type;
 	unsigned long data_page;
diff --git a/fs/stat.c b/fs/stat.c
index c4ecd52c5737..12e90e213900 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -68,7 +68,8 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 }
 EXPORT_SYMBOL(vfs_fstat);
 
-int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag)
+int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
+		int flag)
 {
 	struct path path;
 	int error = -EINVAL;
@@ -91,13 +92,13 @@ out:
 }
 EXPORT_SYMBOL(vfs_fstatat);
 
-int vfs_stat(char __user *name, struct kstat *stat)
+int vfs_stat(const char __user *name, struct kstat *stat)
 {
 	return vfs_fstatat(AT_FDCWD, name, stat, 0);
 }
 EXPORT_SYMBOL(vfs_stat);
 
-int vfs_lstat(char __user *name, struct kstat *stat)
+int vfs_lstat(const char __user *name, struct kstat *stat)
 {
 	return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
 }
@@ -147,7 +148,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(stat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -159,7 +161,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *
 	return cp_old_stat(&stat, statbuf);
 }
 
-SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
+SYSCALL_DEFINE2(lstat, const char __user *, filename,
+		struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -234,7 +237,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -244,7 +248,8 @@ SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 	return cp_new_stat(&stat, statbuf);
 }
 
-SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
+SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -257,7 +262,7 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
 		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
@@ -355,7 +360,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(stat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -366,7 +372,8 @@ SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf
 	return error;
 }
 
-SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
+SYSCALL_DEFINE2(lstat64, const char __user *, filename,
+		struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat(filename, &stat);
@@ -388,7 +395,7 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 	return error;
 }
 
-SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db5d373..179b58690657 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -126,7 +126,8 @@ out:
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
+long do_utimes(int dfd, const char __user *filename, struct timespec *times,
+	       int flags)
 {
 	int error = -EINVAL;
 
@@ -170,7 +171,7 @@ out:
 	return error;
 }
 
-SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
 		struct timespec __user *, utimes, int, flags)
 {
 	struct timespec tstimes[2];
@@ -188,7 +189,7 @@ SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
 	return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
 }
 
-SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename,
+SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
 		struct timeval __user *, utimes)
 {
 	struct timeval times[2];
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 168f7daa7bde..9ddc8780e8db 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -331,7 +331,7 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
 			const compat_sigset_t __user *sigmask,
 			compat_size_t sigsetsize);
 
-asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename,
 				struct compat_timespec __user *t, int flags);
 
 asmlinkage long compat_sys_signalfd(int ufd,
@@ -348,9 +348,9 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page,
 				      const int __user *nodes,
 				      int __user *status,
 				      int flags);
-asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename,
+asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename,
 				     struct compat_timeval __user *t);
-asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd, const char __user * filename,
 				      struct compat_stat __user *statbuf,
 				      int flag);
 asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7a0625e26a39..5f0ca2fbb2a0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2320,10 +2320,10 @@ void inode_set_bytes(struct inode *inode, loff_t bytes);
 
 extern int vfs_readdir(struct file *, filldir_t, void *);
 
-extern int vfs_stat(char __user *, struct kstat *);
-extern int vfs_lstat(char __user *, struct kstat *);
+extern int vfs_stat(const char __user *, struct kstat *);
+extern int vfs_lstat(const char __user *, struct kstat *);
 extern int vfs_fstat(unsigned int, struct kstat *);
-extern int vfs_fstatat(int , char __user *, struct kstat *, int);
+extern int vfs_fstatat(int , const char __user *, struct kstat *, int);
 
 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1b67bd333b5e..6e5d19788634 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -394,7 +394,7 @@ asmlinkage long sys_umount(char __user *name, int flags);
 asmlinkage long sys_oldumount(char __user *name);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
-asmlinkage long sys_stat(char __user *filename,
+asmlinkage long sys_stat(const char __user *filename,
 			struct __old_kernel_stat __user *statbuf);
 asmlinkage long sys_statfs(const char __user * path,
 				struct statfs __user *buf);
@@ -403,21 +403,21 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 				struct statfs64 __user *buf);
-asmlinkage long sys_lstat(char __user *filename,
+asmlinkage long sys_lstat(const char __user *filename,
 			struct __old_kernel_stat __user *statbuf);
 asmlinkage long sys_fstat(unsigned int fd,
 			struct __old_kernel_stat __user *statbuf);
-asmlinkage long sys_newstat(char __user *filename,
+asmlinkage long sys_newstat(const char __user *filename,
 				struct stat __user *statbuf);
-asmlinkage long sys_newlstat(char __user *filename,
+asmlinkage long sys_newlstat(const char __user *filename,
 				struct stat __user *statbuf);
 asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
 asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf);
 #if BITS_PER_LONG == 32
-asmlinkage long sys_stat64(char __user *filename,
+asmlinkage long sys_stat64(const char __user *filename,
 				struct stat64 __user *statbuf);
 asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
-asmlinkage long sys_lstat64(char __user *filename,
+asmlinkage long sys_lstat64(const char __user *filename,
 				struct stat64 __user *statbuf);
 asmlinkage long sys_truncate64(const char __user *path, loff_t length);
 asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length);
@@ -760,7 +760,7 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 			   int newdfd, const char __user *newname, int flags);
 asmlinkage long sys_renameat(int olddfd, const char __user * oldname,
 			     int newdfd, const char __user * newname);
-asmlinkage long sys_futimesat(int dfd, char __user *filename,
+asmlinkage long sys_futimesat(int dfd, const char __user *filename,
 			      struct timeval __user *utimes);
 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
 asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
@@ -769,13 +769,13 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
 			     gid_t group, int flag);
 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 			   int mode);
-asmlinkage long sys_newfstatat(int dfd, char __user *filename,
+asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
 			       struct stat __user *statbuf, int flag);
-asmlinkage long sys_fstatat64(int dfd, char __user *filename,
+asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
 			       struct stat64 __user *statbuf, int flag);
 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
 			       int bufsiz);
-asmlinkage long sys_utimensat(int dfd, char __user *filename,
+asmlinkage long sys_utimensat(int dfd, const char __user *filename,
 				struct timespec __user *utimes, int flags);
 asmlinkage long sys_unshare(unsigned long unshare_flags);
 
diff --git a/include/linux/time.h b/include/linux/time.h
index 12612701b1ae..9f15ac7ab92a 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -150,7 +150,7 @@ extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
 #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
-extern long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags);
+extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
-- 
cgit v1.2.3-59-g8ed1b